mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-28 04:52:55 +00:00
Compare commits
211 Commits
issue/997
...
fix-issue-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e5a87dd5bb | ||
|
|
ff4813529e | ||
|
|
470bc18e9b | ||
|
|
0b1add0ec6 | ||
|
|
1db76dd9cf | ||
|
|
467a9517db | ||
|
|
b361315a67 | ||
|
|
4e3771bffc | ||
|
|
8176b0335a | ||
|
|
811ac98f36 | ||
|
|
f4b2e71800 | ||
|
|
c431cfcf12 | ||
|
|
92f20bc5a2 | ||
|
|
57f931da3c | ||
|
|
9b662e6d03 | ||
|
|
18377d949c | ||
|
|
e6427b2588 | ||
|
|
0062fe705d | ||
|
|
9b3e508753 | ||
|
|
a1ac63ee1c | ||
|
|
e496ae0470 | ||
|
|
1e4df54ab3 | ||
|
|
2de249af74 | ||
|
|
10f056fbb4 | ||
|
|
074b09d0c0 | ||
|
|
86d0727659 | ||
|
|
be3e1b8718 | ||
|
|
8fdf59bdac | ||
|
|
ebebce2102 | ||
|
|
8044ec38da | ||
|
|
7413f87265 | ||
|
|
aea2e77665 | ||
|
|
a15845f9fd | ||
|
|
94ac44df4f | ||
|
|
f80d804a57 | ||
|
|
3b5c1d7817 | ||
|
|
24274edf81 | ||
|
|
d58497529b | ||
|
|
130495abab | ||
|
|
9b743d60fb | ||
|
|
5c9e2ef036 | ||
|
|
8526434b63 | ||
|
|
6ba302c481 | ||
|
|
de92f094aa | ||
|
|
c82cee66de | ||
|
|
6eed05b1ce | ||
|
|
bb488305c9 | ||
|
|
f05e84f964 | ||
|
|
65546ed22b | ||
|
|
57ae5b27dc | ||
|
|
f9531ec3c9 | ||
|
|
5b54a32563 | ||
|
|
cd049e28bc | ||
|
|
646e41bec4 | ||
|
|
36528c5e83 | ||
|
|
cd169dee23 | ||
|
|
b5cc60f80b | ||
|
|
060b83159a | ||
|
|
a40ff35453 | ||
|
|
268e6bfe6e | ||
|
|
f902440b8b | ||
|
|
77a0902605 | ||
|
|
c889ae10e4 | ||
|
|
0a534c6ee0 | ||
|
|
167d88b449 | ||
|
|
1071ed84f2 | ||
|
|
abb5624af2 | ||
|
|
1d41b96d32 | ||
|
|
ef4665945f | ||
|
|
294cd5fd0b | ||
|
|
f4d271177c | ||
|
|
451538fecf | ||
|
|
e78e0fec59 | ||
|
|
2e639cebf8 | ||
|
|
e296da7ade | ||
|
|
3b3e26c4b8 | ||
|
|
6a4883ac69 | ||
|
|
0ba05df545 | ||
|
|
aa3c4d4029 | ||
|
|
60df629725 | ||
|
|
2570b005ac | ||
|
|
d5212cd19d | ||
|
|
2193d85622 | ||
|
|
dfdbfe9eff | ||
|
|
b999e836b2 | ||
|
|
be2dd41e69 | ||
|
|
483fdb79cc | ||
|
|
aefd0fc907 | ||
|
|
3298d6cb71 | ||
|
|
c02c78ea73 | ||
|
|
6bf4fee1ba | ||
|
|
5209238c1b | ||
|
|
7ef25ec400 | ||
|
|
221e7cbb55 | ||
|
|
873ac1a3ac | ||
|
|
ebe55a7ae1 | ||
|
|
9f32d40b27 | ||
|
|
8ae10a930a | ||
|
|
473a346814 | ||
|
|
3a8a0fe79a | ||
|
|
511dc8f87f | ||
|
|
3901295329 | ||
|
|
f5918c6c74 | ||
|
|
abe6b4baec | ||
|
|
6e4b61154f | ||
|
|
2aad0ced77 | ||
|
|
41ea14840d | ||
|
|
dff0ffd38a | ||
|
|
8d32c3ba3a | ||
|
|
4afba005f9 | ||
|
|
85fb0cc20a | ||
|
|
5ef2d56ec2 | ||
|
|
fd8e5bdf57 | ||
|
|
4f8481a1e4 | ||
|
|
bcd72e5c14 | ||
|
|
249bc6cf72 | ||
|
|
1c0af5765d | ||
|
|
7ba771ed1b | ||
|
|
a4002622f8 | ||
|
|
8e21087ad7 | ||
|
|
d523543dc7 | ||
|
|
6ca27b6dd4 | ||
|
|
8d51e9cc91 | ||
|
|
2aced2d958 | ||
|
|
3fcba00a1f | ||
|
|
372d12766a | ||
|
|
dfed8896b9 | ||
|
|
d71aa57077 | ||
|
|
3e85fe57ac | ||
|
|
537021e12d | ||
|
|
ec4834cd73 | ||
|
|
712c01aa93 | ||
|
|
cde324d4b4 | ||
|
|
478571ebb4 | ||
|
|
fde9d27482 | ||
|
|
f38daab7f7 | ||
|
|
25b9429929 | ||
|
|
83cf638a2e | ||
|
|
a04e0bdaf1 | ||
|
|
c200d59d1e | ||
|
|
bbeac5888c | ||
|
|
daa53522b5 | ||
|
|
2c0f6e3319 | ||
|
|
27f587aa13 | ||
|
|
cfc27c9665 | ||
|
|
88a1a90c3c | ||
|
|
6d8581baae | ||
|
|
2b4b16ae90 | ||
|
|
075c23eb8c | ||
|
|
cbf805c3e6 | ||
|
|
46beb2a989 | ||
|
|
c01c175744 | ||
|
|
eca496ee24 | ||
|
|
083bb3ec3f | ||
|
|
2dc5403e7b | ||
|
|
aead5d4068 | ||
|
|
6fb3622abb | ||
|
|
39dd8cfe24 | ||
|
|
b9b9e9e518 | ||
|
|
e2c91aff33 | ||
|
|
96098fce20 | ||
|
|
18bfe131fe | ||
|
|
8dc3e7704c | ||
|
|
1ebfc71721 | ||
|
|
afd3dc7e81 | ||
|
|
cbaecb1ea4 | ||
|
|
8883e32dd8 | ||
|
|
4243780e0a | ||
|
|
5f3fd08509 | ||
|
|
98a225acd1 | ||
|
|
d8555cc8a1 | ||
|
|
3ccd93ac67 | ||
|
|
336428df8b | ||
|
|
d69aace9ec | ||
|
|
777debf5d7 | ||
|
|
7c20771d20 | ||
|
|
fef428a9c6 | ||
|
|
cc9972ad6c | ||
|
|
c2fdc60569 | ||
|
|
39320f953c | ||
|
|
be7c9cc9b8 | ||
|
|
f58345f0f0 | ||
|
|
f518012656 | ||
|
|
12fb9a95cb | ||
|
|
55e79e34af | ||
|
|
1649f31258 | ||
|
|
7849736d80 | ||
|
|
b7159dd48e | ||
|
|
38992251c5 | ||
|
|
a00049b879 | ||
|
|
ba4bc6d7c3 | ||
|
|
868f4fd174 | ||
|
|
e58401be78 | ||
|
|
5c1ce5b0e1 | ||
|
|
9af3aa0de0 | ||
|
|
71309c5528 | ||
|
|
54decc60bb | ||
|
|
50eea4376b | ||
|
|
443aa17329 | ||
|
|
be1d9e0db7 | ||
|
|
5743b46457 | ||
|
|
e67e5ebd46 | ||
|
|
a550c85369 | ||
|
|
b185df2b22 | ||
|
|
f82922b354 | ||
|
|
86b30d9d7f | ||
|
|
f1499d5b3e | ||
|
|
30b6828d71 | ||
|
|
e6b7b7da0a | ||
|
|
38a20ae269 | ||
|
|
a0ec6e1e9d |
13
.github/ISSUE_TEMPLATE/actions.md
vendored
Normal file
13
.github/ISSUE_TEMPLATE/actions.md
vendored
Normal file
@@ -0,0 +1,13 @@
|
||||
---
|
||||
name: Actions
|
||||
about: Actions not directly related to producing code.
|
||||
|
||||
---
|
||||
|
||||
# Actions title
|
||||
|
||||
Action description.
|
||||
e.g.
|
||||
- benchmark
|
||||
- investigate and report
|
||||
- etc.
|
||||
8
.github/dependabot.yml
vendored
Normal file
8
.github/dependabot.yml
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
version: 2
|
||||
updates:
|
||||
- package-ecosystem: cargo
|
||||
directory: "/"
|
||||
schedule:
|
||||
interval: daily
|
||||
time: "20:00"
|
||||
open-pull-requests-limit: 10
|
||||
27
.github/workflows/coverage.yml
vendored
Normal file
27
.github/workflows/coverage.yml
vendored
Normal file
@@ -0,0 +1,27 @@
|
||||
name: coverage
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ main ]
|
||||
pull_request:
|
||||
branches: [ main ]
|
||||
jobs:
|
||||
test:
|
||||
name: coverage
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: xd009642/tarpaulin:develop-nightly
|
||||
options: --security-opt seccomp=unconfined
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Generate code coverage
|
||||
run: |
|
||||
cargo +nightly tarpaulin --verbose --all-features --workspace --timeout 120 --out Xml
|
||||
|
||||
- name: Upload to codecov.io
|
||||
uses: codecov/codecov-action@v1
|
||||
with:
|
||||
# token: ${{secrets.CODECOV_TOKEN}} # not required for public repos
|
||||
fail_ci_if_error: true
|
||||
30
.github/workflows/test.yml
vendored
Normal file
30
.github/workflows/test.yml
vendored
Normal file
@@ -0,0 +1,30 @@
|
||||
name: Rust
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ main ]
|
||||
pull_request:
|
||||
branches: [ main ]
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
|
||||
jobs:
|
||||
build:
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Build
|
||||
run: cargo build --verbose --workspace
|
||||
- name: Install latest nightly to test also against unstable feature flag
|
||||
uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
toolchain: nightly
|
||||
override: true
|
||||
components: rustfmt
|
||||
- name: Run tests
|
||||
run: cargo test --all-features --verbose --workspace
|
||||
- name: Check Formatting
|
||||
run: cargo fmt --all -- --check
|
||||
@@ -2,51 +2,51 @@
|
||||
|
||||
## What is tantivy?
|
||||
|
||||
Tantivy is a library that is meant to build search engines. Although it is by no mean a port of Lucene, its architecture is strongly inspired by it. If you are familiar with Lucene, you may be struck by the overlapping vocabulary.
|
||||
Tantivy is a library that is meant to build search engines. Although it is by no means a port of Lucene, its architecture is strongly inspired by it. If you are familiar with Lucene, you may be struck by the overlapping vocabulary.
|
||||
This is not fortuitous.
|
||||
|
||||
Tantivy's bread and butter is to address the problem of full-text search :
|
||||
|
||||
Given a large set of textual documents, and a text query, return the K-most relevant documents in a very efficient way. In order to execute these queries rapidly, the tantivy need to build an index beforehand. The relevance score implemented in the tantivy is not configurable. Tantivy uses the same score as the default similarity used in Lucene / Elasticsearch, called [BM25](https://en.wikipedia.org/wiki/Okapi_BM25).
|
||||
Given a large set of textual documents, and a text query, return the K-most relevant documents in a very efficient way. To execute these queries rapidly, the tantivy needs to build an index beforehand. The relevance score implemented in the tantivy is not configurable. Tantivy uses the same score as the default similarity used in Lucene / Elasticsearch, called [BM25](https://en.wikipedia.org/wiki/Okapi_BM25).
|
||||
|
||||
But tantivy's scope does not stop there. Numerous features are required to power rich search applications. For instance, one may want to:
|
||||
But tantivy's scope does not stop there. Numerous features are required to power rich-search applications. For instance, one may want to:
|
||||
- compute the count of documents matching a query in the different section of an e-commerce website,
|
||||
- display an average price per meter square for a real estate search engine,
|
||||
- take in account historical user data to rank documents in a specific way,
|
||||
- take into account historical user data to rank documents in a specific way,
|
||||
- or even use tantivy to power an OLAP database.
|
||||
|
||||
A more abstract description of the problem space tantivy is trying to address is the following.
|
||||
|
||||
Ingest a large set of documents, create an index that makes it possible to
|
||||
rapidly select all documents matching a given predicate (also known as a query) and
|
||||
collect some information about them (See collector).
|
||||
collect some information about them ([See collector](#collector-define-what-to-do-with-matched-documents)).
|
||||
|
||||
Roughly speaking the design is following these guiding principles:
|
||||
- Search should be O(1) in memory.
|
||||
- Indexing should be O(1) in memory. (In practise it is just sublinear)
|
||||
- Indexing should be O(1) in memory. (In practice it is just sublinear)
|
||||
- Search should be as fast as possible
|
||||
|
||||
This comes at the cost of the dynamicity of the index : while it is possible to add, and delete documents from our corpus, the tantivy is designed to handle these updates in large batches.
|
||||
This comes at the cost of the dynamicity of the index: while it is possible to add, and delete documents from our corpus, the tantivy is designed to handle these updates in large batches.
|
||||
|
||||
## [core/](src/core): Index, segments, searchers.
|
||||
|
||||
Core contains all of the high level code to make it possible for to create an index, add documents, delete documents and commit.
|
||||
Core contains all of the high-level code to make it possible to create an index, add documents, delete documents and commit.
|
||||
|
||||
This is both the most high-level part of tantivy, the least performance sensitive one, the seemingly most mundane code... And paradoxically the most complicated part.
|
||||
This is both the most high-level part of tantivy, the least performance-sensitive one, the seemingly most mundane code... And paradoxically the most complicated part.
|
||||
|
||||
### Index and Segments...
|
||||
|
||||
A tantivy index is in fact a collection of smaller independent immutable segments.
|
||||
Each segment contains its own independent set of datastructures.
|
||||
A tantivy index is a collection of smaller independent immutable segments.
|
||||
Each segment contains its own independent set of data structures.
|
||||
|
||||
A segment is identified by a segment id that is in fact a UUID.
|
||||
The file of a segment has the format
|
||||
|
||||
```segment-id . ext ```
|
||||
|
||||
The extension signals which datastructure (or [`SegmentComponent`](src/core/segment_component.rs)) is stored in the file.
|
||||
The extension signals which data structure (or [`SegmentComponent`](src/core/segment_component.rs)) is stored in the file.
|
||||
|
||||
A small `meta.json` file is in charge keeping track of the list of segments, as well as the schema.
|
||||
A small `meta.json` file is in charge of keeping track of the list of segments, as well as the schema.
|
||||
|
||||
On commit, one segment per indexing thread is written to disk, and the `meta.json` is then updated atomically.
|
||||
|
||||
@@ -66,16 +66,16 @@ An opstamp is simply an incremental id that identifies any operation applied to
|
||||
### DocId
|
||||
|
||||
Within a segment, all documents are identified by a DocId that ranges within `[0, max_doc)`.
|
||||
where max doc is the number of documents in the segment, (deleted or not). Having such a compact `DocId` space is key to the compression of our datastructures.
|
||||
where `max_doc` is the number of documents in the segment, (deleted or not). Having such a compact `DocId` space is key to the compression of our data structures.
|
||||
|
||||
The DocIds are simply allocated in the order documents are added to the index.
|
||||
|
||||
### Merges
|
||||
|
||||
In separate threads, tantivy's index writer search for opportunities to merge segments.
|
||||
The point of segment merges is to:
|
||||
The point of segment merge is to:
|
||||
- eventually get rid of tombstoned documents
|
||||
- reduce the otherwise evergrowing number of segments.
|
||||
- reduce the otherwise ever-growing number of segments.
|
||||
|
||||
Indeed, while having several segments instead of one does not hurt search too much, having hundreds can have a measurable impact on the search performance.
|
||||
|
||||
@@ -99,7 +99,7 @@ but users can extend tantivy with their own implementation.
|
||||
|
||||
## [schema/](src/schema): What are documents?
|
||||
|
||||
Tantivy's document follow a very strict schema , decided before building any index.
|
||||
Tantivy's document follows a very strict schema, decided before building any index.
|
||||
|
||||
The schema defines all of the fields that the indexes [`Document`](src/schema/document.rs) may and should contain, their types (`text`, `i64`, `u64`, `Date`, ...) as well as how it should be indexed / represented in tantivy.
|
||||
|
||||
@@ -108,25 +108,25 @@ Depending on the type of the field, you can decide to
|
||||
- store it as a fast field
|
||||
- index it
|
||||
|
||||
Practically, tantivy will push values associated to this type to up to 3 respective
|
||||
datastructures.
|
||||
Practically, tantivy will push values associated with this type to up to 3 respective
|
||||
data structures.
|
||||
|
||||
*Limitations*
|
||||
|
||||
As of today, tantivy's schema impose a 1:1 relationship between a field that is being ingested and a field represented in the search index. In sophisticated search application, it is fairly common to want to index a field twice using different tokenizers, or to index the concatenation of several fields together into one field.
|
||||
As of today, tantivy's schema imposes a 1:1 relationship between a field that is being ingested and a field represented in the search index. In sophisticated search application, it is fairly common to want to index a field twice using different tokenizers, or to index the concatenation of several fields together into one field.
|
||||
|
||||
This is not something tantivy supports, and it is up to the user to duplicate field / concatenate fields before feeding them to tantivy.
|
||||
|
||||
## General information about these datastructures.
|
||||
## General information about these data structures.
|
||||
|
||||
All datastructures in tantivy, have:
|
||||
All data structures in tantivy, have:
|
||||
- a writer
|
||||
- a serializer
|
||||
- a reader
|
||||
|
||||
The writer builds a in-memory representation of a batch of documents. This representation is not searchable. It is just meant as intermediary mutable representation, to which we can sequentially add
|
||||
The writer builds an in-memory representation of a batch of documents. This representation is not searchable. It is just meant as an intermediary mutable representation, to which we can sequentially add
|
||||
the document of a batch. At the end of the batch (or if a memory limit is reached), this representation
|
||||
is then converted in an on-disk immutable representation, that is extremely compact.
|
||||
is then converted into an on-disk immutable representation, that is extremely compact.
|
||||
This conversion is done by the serializer.
|
||||
|
||||
Finally, the reader is in charge of offering an API to read on this on-disk read-only representation.
|
||||
@@ -134,8 +134,8 @@ In tantivy, readers are designed to require very little anonymous memory. The da
|
||||
|
||||
## [store/](src/store): Here is my DocId, Gimme my document!
|
||||
|
||||
The docstore is a row-oriented storage that, for each documents, stores a subset of the fields
|
||||
that are marked as stored in the schema. The docstore is compressed using a general purpose algorithm
|
||||
The docstore is a row-oriented storage that, for each document, stores a subset of the fields
|
||||
that are marked as stored in the schema. The docstore is compressed using a general-purpose algorithm
|
||||
like LZ4.
|
||||
|
||||
**Useful for**
|
||||
@@ -146,7 +146,7 @@ Once the top 10 documents have been identified, we fetch them from the store, an
|
||||
**Not useful for**
|
||||
|
||||
Fetching a document from the store is typically a "slow" operation. It usually consists in
|
||||
- searching into a compact tree-like datastructure to find the position of the right block.
|
||||
- searching into a compact tree-like data structure to find the position of the right block.
|
||||
- decompressing a small block
|
||||
- returning the document from this block.
|
||||
|
||||
@@ -180,10 +180,10 @@ all of the documents matching a query (aka [DocSet](src/docset.rs)).
|
||||
|
||||
For instance, one could define a function to combine upvotes with tantivy's internal relevancy score.
|
||||
This can be done by fetching a fast field during scoring.
|
||||
Once could also compute the mean price of the items matching a query in an e-commerce website.
|
||||
One could also compute the mean price of the items matching a query in an e-commerce website.
|
||||
This can be done by fetching a fast field in a collector.
|
||||
Finally one could decide to post filter a docset to remove docset with a price within a specific range.
|
||||
If the ratio of filtered out documents is not too low, an efficient way to do this is to fetch the price, and apply the filter on the collector side.
|
||||
Finally one could decide to post-filter a docset to remove docset with a price within a specific range.
|
||||
If the ratio of filtered out documents is not too low, an efficient way to do this is to fetch the price and apply the filter on the collector side.
|
||||
|
||||
Aside from integer values, it is also possible to store an actual byte payload.
|
||||
For advanced search engine, it is possible to store all of the features required for learning-to-rank in a byte payload, access it during search, and apply the learning-to-rank model.
|
||||
@@ -193,24 +193,22 @@ Finally facets are a specific kind of fast field, and the associated source code
|
||||
# The inverted search index.
|
||||
|
||||
The inverted index is the core part of full-text search.
|
||||
When presented a new document with the text field "Hello, happy tax payer!", tantivy breaks it into a list of so-called token. In addition to just splitting this strings into tokens, it might also do different kind of operations like dropping the punctuation, converting the character to lowercase, apply stemming etc. Tantivy makes it possible to configure the operations to be applied in the schema
|
||||
(tokenizer/ is the place where these operations are implemented).
|
||||
When presented a new document with the text field "Hello, happy tax payer!", tantivy breaks it into a list of so-called tokens. In addition to just splitting these strings into tokens, it might also do different kinds of operations like dropping the punctuation, converting the character to lowercase, apply stemming, etc. Tantivy makes it possible to configure the operations to be applied in the schema (tokenizer/ is the place where these operations are implemented).
|
||||
|
||||
For instance, the default tokenizer of tantivy would break our text into: `[hello, happy, tax, payer]`.
|
||||
The document will therefore be registered in the inverted index as containing the terms
|
||||
`[text:hello, text:happy, text:tax, text:payer]`.
|
||||
|
||||
The role of the inverted index is, when given a term, supply us with a very fast iterator over
|
||||
the sorted doc ids that match the term.
|
||||
The role of the inverted index is, when given a term, gives us in return a very fast iterator over the sorted doc ids that match the term.
|
||||
|
||||
Such an iterator is called a posting list. In addition to giving us `DocId`, they can also give us optionally to the number of occurrence of the term for each document, also called term frequency or TF.
|
||||
Such an iterator is called a posting list. In addition to giving us `DocId`, they can also give us optionally the number of occurrence of the term for each document, also called term frequency or TF.
|
||||
|
||||
These iterators being sorted by DocId, one can create an iterator over the document containing `text:tax AND text:payer`, `(text:tax AND text:payer) OR (text:contribuable)` or any boolean expression.
|
||||
|
||||
In order to represent the function
|
||||
```Term ⟶ Posting```
|
||||
|
||||
The inverted index actually consists of two datastructures chained together.
|
||||
The inverted index actually consists of two data structures chained together.
|
||||
|
||||
- [Term](src/schema/term.rs) ⟶ [TermInfo](src/postings/term_info.rs) is addressed by the term dictionary.
|
||||
- [TermInfo](src/postings/term_info.rs) ⟶ [Posting](src/postings/postings.rs) is addressed by the posting lists.
|
||||
@@ -224,7 +222,7 @@ Tantivy's term dictionary is mainly in charge of supplying the function
|
||||
|
||||
[Term](src/schema/term.rs) ⟶ [TermInfo](src/postings/term_info.rs)
|
||||
|
||||
It is itself is broken into two parts.
|
||||
It is itself broken into two parts.
|
||||
- [Term](src/schema/term.rs) ⟶ [TermOrdinal](src/termdict/mod.rs) is addressed by a finite state transducer, implemented by the fst crate.
|
||||
- [TermOrdinal](src/termdict/mod.rs) ⟶ [TermInfo](src/postings/term_info.rs) is addressed by the term info store.
|
||||
|
||||
@@ -234,7 +232,7 @@ It is itself is broken into two parts.
|
||||
A posting list makes it possible to store a sorted list of doc ids and for each doc store
|
||||
a term frequency as well.
|
||||
|
||||
The posting list are stored in a separate file. The [TermInfo](src/postings/term_info.rs) contains an offset into that file and a number of documents for the given posting list. Both are required and sufficient to read the posting list.
|
||||
The posting lists are stored in a separate file. The [TermInfo](src/postings/term_info.rs) contains an offset into that file and a number of documents for the given posting list. Both are required and sufficient to read the posting list.
|
||||
|
||||
The posting list is organized in block of 128 documents.
|
||||
One block of doc ids is followed by one block of term frequencies.
|
||||
@@ -242,11 +240,11 @@ One block of doc ids is followed by one block of term frequencies.
|
||||
The doc ids are delta encoded and bitpacked.
|
||||
The term frequencies are bitpacked.
|
||||
|
||||
Because the number of docs is rarely a multiple of 128, the last block my contain an arbitrary number of docs between 1 and 127 documents. We then use variable int encoding instead of bitpacking.
|
||||
Because the number of docs is rarely a multiple of 128, the last block may contain an arbitrary number of docs between 1 and 127 documents. We then use variable int encoding instead of bitpacking.
|
||||
|
||||
## [positions/](src/positions): Where are my terms within the documents?
|
||||
|
||||
Phrase queries make it possible to search for documents containing a specific sequence of document.
|
||||
Phrase queries make it possible to search for documents containing a specific sequence of terms.
|
||||
For instance, when the phrase query "the art of war" does not match "the war of art".
|
||||
To make it possible, it is possible to specify in the schema that a field should store positions in addition to being indexed.
|
||||
|
||||
@@ -257,7 +255,7 @@ we advance the position reader by the number of term frequencies of the current
|
||||
## [fieldnorms/](src/fieldnorms): Here is my doc, how many tokens in this field?
|
||||
|
||||
The [BM25](https://en.wikipedia.org/wiki/Okapi_BM25) formula also requires to know the number of tokens stored in a specific field for a given document. We store this information on one byte per document in the fieldnorm.
|
||||
The fieldnorm is therefore compressed. Values up to 40 are encoded unchanged. There is then a logarithmic mapping that
|
||||
The fieldnorm is therefore compressed. Values up to 40 are encoded unchanged.
|
||||
|
||||
|
||||
## [tokenizer/](src/tokenizer): How should we process text?
|
||||
@@ -274,24 +272,24 @@ Tantivy's comes with few tokenizers, but external crates are offering advanced t
|
||||
## [query/](src/query): Define and compose queries
|
||||
|
||||
The [Query](src/query/query.rs) trait defines what a query is.
|
||||
Due to the necessity for some query to compute some statistics over the entire index, and because the
|
||||
index is composed of several `SegmentReader`, the path from transforming a `Query` to a iterator over document is slightly convoluted, but fundamentally, this is what a Query is.
|
||||
Due to the necessity for some queries to compute some statistics over the entire index, and because the
|
||||
index is composed of several `SegmentReader`, the path from transforming a `Query` to an iterator over documents is slightly convoluted, but fundamentally, this is what a Query is.
|
||||
|
||||
The iterator over a document comes with some scoring function. The resulting trait is called a
|
||||
[Scorer](src/query/scorer.rs) and is specific to a segment.
|
||||
|
||||
Different queries can be combined using the [BooleanQuery](src/query/boolean_query/).
|
||||
Tantivy comes with different types of queries, and can be extended by implementing
|
||||
the Query`, `Weight` and `Scorer` traits.
|
||||
Tantivy comes with different types of queries and can be extended by implementing
|
||||
the `Query`, `Weight`, and `Scorer` traits.
|
||||
|
||||
## [collector](src/collector): Define what to do with matched documents
|
||||
|
||||
Collectors define how to aggregate the documents matching a query, in the broadest sense possible.
|
||||
The search will push matched document one by one, calling their
|
||||
The search will push matched documents one by one, calling their
|
||||
`fn collect(doc: DocId, score: Score);` method.
|
||||
|
||||
Users may implement their own collectors by implementing the [Collector](src/collector/mod.rs) trait.
|
||||
|
||||
## [query-grammar](query-grammar): Defines the grammar of the query parser
|
||||
|
||||
While the [QueryParser](src/query/query_parser/query_parser.rs) struct is located in the `query/` directory, the actual parser combinator used to convert user queries into an AST is in an external crate called `query-grammar`. This part was externalize to lighten the work of the compiler.
|
||||
While the [QueryParser](src/query/query_parser/query_parser.rs) struct is located in the `query/` directory, the actual parser combinator used to convert user queries into an AST is in an external crate called `query-grammar`. This part was externalized to lighten the work of the compiler.
|
||||
|
||||
32
CHANGELOG.md
32
CHANGELOG.md
@@ -1,9 +1,38 @@
|
||||
Tantivy 0.15.3
|
||||
=========================
|
||||
- Major bugfix. Deleting documents was broken when the index was sorted by a field. (@appaquet, @fulmicoton) #1101
|
||||
|
||||
|
||||
Tantivy 0.15.2
|
||||
========================
|
||||
- Major bugfix. DocStore still panics when a deleted doc is at the beginning of a block. (@appaquet) #1088
|
||||
|
||||
Tantivy 0.15.1
|
||||
=========================
|
||||
- Major bugfix. DocStore panics when first block is deleted. (@appaquet) #1077
|
||||
|
||||
Tantivy 0.15.0
|
||||
=========================
|
||||
- API Changes. Using Range instead of (start, end) in the API and internals (`FileSlice`, `OwnedBytes`, `Snippets`, ...)
|
||||
This change is breaking but migration is trivial.
|
||||
- Added an Histogram collector. (@fulmicoton) #994
|
||||
- Added support for Option<TCollector>. (@fulmicoton)
|
||||
- Added support for Option<TCollector>. (@fulmicoton)
|
||||
- DocAddress is now a struct (@scampi) #987
|
||||
- Bugfix consistent tie break handling in facet's topk (@hardikpnsp) #357
|
||||
- Date field support for range queries (@rihardsk) #516
|
||||
- Added lz4-flex as the default compression scheme in tantivy (@PSeitz) #1009
|
||||
- Renamed a lot of symbols to avoid all uppercasing on acronyms, as per new clippy recommendation. For instance, RAMDirectory -> RamDirectory. (@fulmicoton)
|
||||
- Simplified positions index format (@fulmicoton) #1022
|
||||
- Moved bitpacking to bitpacker subcrate and add BlockedBitpacker, which bitpacks blocks of 128 elements (@PSeitz) #1030
|
||||
- Added support for more-like-this query in tantivy (@evanxg852000) #1011
|
||||
- Added support for sorting an index, e.g presorting documents in an index by a timestamp field. This can heavily improve performance for certain scenarios, by utilizing the sorted data (Top-n optimizations)(@PSeitz). #1026
|
||||
- Add iterator over documents in doc store (@PSeitz). #1044
|
||||
- Fix log merge policy (@PSeitz). #1043
|
||||
- Add detection to avoid small doc store blocks on merge (@PSeitz). #1054
|
||||
- Make doc store compression dynamic (@PSeitz). #1060
|
||||
- Switch to json for footer version handling (@PSeitz). #1060
|
||||
- Updated TermMerger implementation to rely on the union feature of the FST (@scampi) #469
|
||||
- Add boolean marking whether position is required in the query_terms API call (@fulmicoton). #1070
|
||||
|
||||
|
||||
Tantivy 0.14.0
|
||||
@@ -20,6 +49,7 @@ Tantivy 0.14.0
|
||||
- Simplified the encoding of the skip reader struct. BlockWAND max tf is now encoded over a single byte. (@fulmicoton)
|
||||
- `FilterCollector` now supports all Fast Field value types (@barrotsteindev)
|
||||
- FastField are not all loaded when opening the segment reader. (@fulmicoton)
|
||||
- Added an API to merge segments, see `tantivy::merge_segments` #1005. (@evanxg852000)
|
||||
|
||||
This version breaks compatibility and requires users to reindex everything.
|
||||
|
||||
|
||||
83
Cargo.toml
83
Cargo.toml
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tantivy"
|
||||
version = "0.14.0"
|
||||
version = "0.16.0-dev"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = ["database-implementations", "data-structures"]
|
||||
@@ -14,51 +14,57 @@ edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
base64 = "0.13"
|
||||
byteorder = "1"
|
||||
crc32fast = "1"
|
||||
once_cell = "1"
|
||||
regex ={version = "1", default-features = false, features = ["std"]}
|
||||
byteorder = "1.4.3"
|
||||
crc32fast = "1.2.1"
|
||||
once_cell = "1.7.2"
|
||||
regex ={ version = "1.5.4", default-features = false, features = ["std"] }
|
||||
tantivy-fst = "0.3"
|
||||
memmap = {version = "0.7", optional=true}
|
||||
lz4 = {version="1", optional=true}
|
||||
brotli = {version="3.3.0", optional=true}
|
||||
snap = "1"
|
||||
tempfile = {version="3", optional=true}
|
||||
log = "0.4"
|
||||
serde = {version="1", features=["derive"]}
|
||||
serde_json = "1"
|
||||
num_cpus = "1"
|
||||
fs2={version="0.4", optional=true}
|
||||
memmap2 = {version = "0.3", optional=true}
|
||||
lz4_flex = { version = "0.8.0", default-features = false, features = ["checked-decode"], optional = true }
|
||||
brotli = { version = "3.3", optional = true }
|
||||
snap = { version = "1.0.5", optional = true }
|
||||
tempfile = { version = "3.2", optional = true }
|
||||
log = "0.4.14"
|
||||
serde = { version = "1.0.126", features = ["derive"] }
|
||||
serde_json = "1.0.64"
|
||||
num_cpus = "1.13"
|
||||
fs2={ version = "0.4.3", optional = true }
|
||||
levenshtein_automata = "0.2"
|
||||
uuid = { version = "0.8", features = ["v4", "serde"] }
|
||||
uuid = { version = "0.8.2", features = ["v4", "serde"] }
|
||||
crossbeam = "0.8"
|
||||
futures = {version = "0.3", features=["thread-pool"] }
|
||||
tantivy-query-grammar = { version="0.14.0", path="./query-grammar" }
|
||||
stable_deref_trait = "1"
|
||||
rust-stemmers = "1"
|
||||
downcast-rs = "1"
|
||||
bitpacking = {version="0.8", default-features = false, features=["bitpacker4x"]}
|
||||
futures = { version = "0.3.15", features = ["thread-pool"] }
|
||||
tantivy-query-grammar = { version="0.15.0", path="./query-grammar" }
|
||||
tantivy-bitpacker = { version="0.1", path="./bitpacker" }
|
||||
common = { version="0.1", path="./common" }
|
||||
fastfield_codecs = { version="0.1", path="./fastfield_codecs", default-features = false }
|
||||
ownedbytes = { version="0.1", path="./ownedbytes" }
|
||||
stable_deref_trait = "1.2"
|
||||
rust-stemmers = "1.2"
|
||||
downcast-rs = "1.2"
|
||||
bitpacking = { version = "0.8.4", default-features = false, features = ["bitpacker4x"] }
|
||||
census = "0.4"
|
||||
fnv = "1"
|
||||
thiserror = "1.0"
|
||||
htmlescape = "0.3"
|
||||
fnv = "1.0.7"
|
||||
thiserror = "1.0.24"
|
||||
htmlescape = "0.3.1"
|
||||
fail = "0.4"
|
||||
murmurhash32 = "0.2"
|
||||
chrono = "0.4"
|
||||
smallvec = "1"
|
||||
rayon = "1"
|
||||
lru = "0.6"
|
||||
chrono = "0.4.19"
|
||||
smallvec = "1.6.1"
|
||||
rayon = "1.5"
|
||||
lru = "0.6.5"
|
||||
fastdivide = "0.3"
|
||||
itertools = "0.10.0"
|
||||
measure_time = "0.7.0"
|
||||
|
||||
[target.'cfg(windows)'.dependencies]
|
||||
winapi = "0.3"
|
||||
winapi = "0.3.9"
|
||||
|
||||
[dev-dependencies]
|
||||
rand = "0.8"
|
||||
maplit = "1"
|
||||
rand = "0.8.3"
|
||||
maplit = "1.0.2"
|
||||
matches = "0.1.8"
|
||||
proptest = "1.0"
|
||||
criterion = "0.3"
|
||||
criterion = "0.3.4"
|
||||
|
||||
[dev-dependencies.fail]
|
||||
version = "0.4"
|
||||
@@ -74,16 +80,19 @@ debug-assertions = true
|
||||
overflow-checks = true
|
||||
|
||||
[features]
|
||||
default = ["mmap"]
|
||||
mmap = ["fs2", "tempfile", "memmap"]
|
||||
default = ["mmap", "lz4-compression" ]
|
||||
mmap = ["fs2", "tempfile", "memmap2"]
|
||||
|
||||
brotli-compression = ["brotli"]
|
||||
lz4-compression = ["lz4"]
|
||||
lz4-compression = ["lz4_flex"]
|
||||
snappy-compression = ["snap"]
|
||||
|
||||
failpoints = ["fail/failpoints"]
|
||||
unstable = [] # useful for benches.
|
||||
wasm-bindgen = ["uuid/wasm-bindgen"]
|
||||
|
||||
[workspace]
|
||||
members = ["query-grammar"]
|
||||
members = ["query-grammar", "bitpacker", "common", "fastfield_codecs", "ownedbytes"]
|
||||
|
||||
[badges]
|
||||
travis-ci = { repository = "tantivy-search/tantivy" }
|
||||
|
||||
@@ -38,7 +38,7 @@ Your mileage WILL vary depending on the nature of queries and their load.
|
||||
# Features
|
||||
|
||||
- Full-text search
|
||||
- Configurable tokenizer (stemming available for 17 Latin languages with third party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)), Japanese ([lindera](https://github.com/lindera-morphology/lindera-tantivy) and [tantivy-tokenizer-tiny-segmente](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter)) and Korean ([lindera](https://github.com/lindera-morphology/lindera-tantivy) + [lindera-ko-dic-builder](https://github.com/lindera-morphology/lindera-ko-dic-builder))
|
||||
- Configurable tokenizer (stemming available for 17 Latin languages with third party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)), Japanese ([lindera](https://github.com/lindera-morphology/lindera-tantivy) and [tantivy-tokenizer-tiny-segmenter](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter)) and Korean ([lindera](https://github.com/lindera-morphology/lindera-tantivy) + [lindera-ko-dic-builder](https://github.com/lindera-morphology/lindera-ko-dic-builder))
|
||||
- Fast (check out the :racehorse: :sparkles: [benchmark](https://tantivy-search.github.io/bench/) :sparkles: :racehorse:)
|
||||
- Tiny startup time (<10ms), perfect for command line tools
|
||||
- BM25 scoring (the same as Lucene)
|
||||
|
||||
@@ -18,5 +18,6 @@ install:
|
||||
build: false
|
||||
|
||||
test_script:
|
||||
- REM SET RUST_LOG=tantivy,test & cargo test --all --verbose --no-default-features --features mmap
|
||||
- REM SET RUST_LOG=tantivy,test & cargo test --all --verbose --no-default-features --features lz4-compression --features mmap
|
||||
- REM SET RUST_LOG=tantivy,test & cargo test test_store --verbose --no-default-features --features lz4-compression --features snappy-compression --features brotli-compression --features mmap
|
||||
- REM SET RUST_BACKTRACE=1 & cargo build --examples
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use criterion::{criterion_group, criterion_main, Criterion};
|
||||
use tantivy::tokenizer::TokenizerManager;
|
||||
|
||||
const ALICE_TXT: &'static str = include_str!("alice.txt");
|
||||
const ALICE_TXT: &str = include_str!("alice.txt");
|
||||
|
||||
pub fn criterion_benchmark(c: &mut Criterion) {
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
|
||||
15
bitpacker/Cargo.toml
Normal file
15
bitpacker/Cargo.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
[package]
|
||||
name = "tantivy-bitpacker"
|
||||
version = "0.1.0"
|
||||
edition = "2018"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = []
|
||||
description = """Tantivy-sub crate: bitpacking"""
|
||||
repository = "https://github.com/tantivy-search/tantivy"
|
||||
keywords = []
|
||||
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
33
bitpacker/benches/bench.rs
Normal file
33
bitpacker/benches/bench.rs
Normal file
@@ -0,0 +1,33 @@
|
||||
#![feature(test)]
|
||||
|
||||
extern crate test;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use tantivy_bitpacker::BlockedBitpacker;
|
||||
use test::Bencher;
|
||||
#[bench]
|
||||
fn bench_blockedbitp_read(b: &mut Bencher) {
|
||||
let mut blocked_bitpacker = BlockedBitpacker::new();
|
||||
for val in 0..=21500 {
|
||||
blocked_bitpacker.add(val * val);
|
||||
}
|
||||
b.iter(|| {
|
||||
let mut out = 0;
|
||||
for val in 0..=21500 {
|
||||
out = blocked_bitpacker.get(val);
|
||||
}
|
||||
out
|
||||
});
|
||||
}
|
||||
#[bench]
|
||||
fn bench_blockedbitp_create(b: &mut Bencher) {
|
||||
b.iter(|| {
|
||||
let mut blocked_bitpacker = BlockedBitpacker::new();
|
||||
for val in 0..=21500 {
|
||||
blocked_bitpacker.add(val * val);
|
||||
}
|
||||
blocked_bitpacker
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -1,13 +1,14 @@
|
||||
use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
|
||||
use std::io;
|
||||
use std::{convert::TryInto, io};
|
||||
|
||||
use crate::directory::OwnedBytes;
|
||||
|
||||
pub(crate) struct BitPacker {
|
||||
pub struct BitPacker {
|
||||
mini_buffer: u64,
|
||||
mini_buffer_written: usize,
|
||||
}
|
||||
|
||||
impl Default for BitPacker {
|
||||
fn default() -> Self {
|
||||
BitPacker::new()
|
||||
}
|
||||
}
|
||||
impl BitPacker {
|
||||
pub fn new() -> BitPacker {
|
||||
BitPacker {
|
||||
@@ -16,6 +17,7 @@ impl BitPacker {
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn write<TWrite: io::Write>(
|
||||
&mut self,
|
||||
val: u64,
|
||||
@@ -26,14 +28,14 @@ impl BitPacker {
|
||||
let num_bits = num_bits as usize;
|
||||
if self.mini_buffer_written + num_bits > 64 {
|
||||
self.mini_buffer |= val_u64.wrapping_shl(self.mini_buffer_written as u32);
|
||||
output.write_u64::<LittleEndian>(self.mini_buffer)?;
|
||||
output.write_all(self.mini_buffer.to_le_bytes().as_ref())?;
|
||||
self.mini_buffer = val_u64.wrapping_shr((64 - self.mini_buffer_written) as u32);
|
||||
self.mini_buffer_written = self.mini_buffer_written + num_bits - 64;
|
||||
} else {
|
||||
self.mini_buffer |= val_u64 << self.mini_buffer_written;
|
||||
self.mini_buffer_written += num_bits;
|
||||
if self.mini_buffer_written == 64 {
|
||||
output.write_u64::<LittleEndian>(self.mini_buffer)?;
|
||||
output.write_all(self.mini_buffer.to_le_bytes().as_ref())?;
|
||||
self.mini_buffer_written = 0;
|
||||
self.mini_buffer = 0u64;
|
||||
}
|
||||
@@ -44,10 +46,10 @@ impl BitPacker {
|
||||
pub fn flush<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
|
||||
if self.mini_buffer_written > 0 {
|
||||
let num_bytes = (self.mini_buffer_written + 7) / 8;
|
||||
let mut arr: [u8; 8] = [0u8; 8];
|
||||
LittleEndian::write_u64(&mut arr, self.mini_buffer);
|
||||
output.write_all(&arr[..num_bytes])?;
|
||||
let bytes = self.mini_buffer.to_le_bytes();
|
||||
output.write_all(&bytes[..num_bytes])?;
|
||||
self.mini_buffer_written = 0;
|
||||
self.mini_buffer = 0;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -60,15 +62,14 @@ impl BitPacker {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
#[derive(Clone, Debug, Default)]
|
||||
pub struct BitUnpacker {
|
||||
num_bits: u64,
|
||||
mask: u64,
|
||||
data: OwnedBytes,
|
||||
}
|
||||
|
||||
impl BitUnpacker {
|
||||
pub fn new(data: OwnedBytes, num_bits: u8) -> BitUnpacker {
|
||||
pub fn new(num_bits: u8) -> BitUnpacker {
|
||||
let mask: u64 = if num_bits == 64 {
|
||||
!0u64
|
||||
} else {
|
||||
@@ -77,15 +78,14 @@ impl BitUnpacker {
|
||||
BitUnpacker {
|
||||
num_bits: u64::from(num_bits),
|
||||
mask,
|
||||
data,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get(&self, idx: u64) -> u64 {
|
||||
#[inline]
|
||||
pub fn get(&self, idx: u64, data: &[u8]) -> u64 {
|
||||
if self.num_bits == 0 {
|
||||
return 0u64;
|
||||
}
|
||||
let data: &[u8] = self.data.as_slice();
|
||||
let num_bits = self.num_bits;
|
||||
let mask = self.mask;
|
||||
let addr_in_bits = idx * num_bits;
|
||||
@@ -95,7 +95,10 @@ impl BitUnpacker {
|
||||
addr + 8 <= data.len() as u64,
|
||||
"The fast field field should have been padded with 7 bytes."
|
||||
);
|
||||
let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[(addr as usize)..]);
|
||||
let bytes: [u8; 8] = (&data[(addr as usize)..(addr as usize) + 8])
|
||||
.try_into()
|
||||
.unwrap();
|
||||
let val_unshifted_unmasked: u64 = u64::from_le_bytes(bytes);
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
||||
val_shifted & mask
|
||||
}
|
||||
@@ -104,9 +107,8 @@ impl BitUnpacker {
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::{BitPacker, BitUnpacker};
|
||||
use crate::directory::OwnedBytes;
|
||||
|
||||
fn create_fastfield_bitpacker(len: usize, num_bits: u8) -> (BitUnpacker, Vec<u64>) {
|
||||
fn create_fastfield_bitpacker(len: usize, num_bits: u8) -> (BitUnpacker, Vec<u64>, Vec<u8>) {
|
||||
let mut data = Vec::new();
|
||||
let mut bitpacker = BitPacker::new();
|
||||
let max_val: u64 = (1u64 << num_bits as u64) - 1u64;
|
||||
@@ -118,14 +120,14 @@ mod test {
|
||||
}
|
||||
bitpacker.close(&mut data).unwrap();
|
||||
assert_eq!(data.len(), ((num_bits as usize) * len + 7) / 8 + 7);
|
||||
let bitunpacker = BitUnpacker::new(OwnedBytes::new(data), num_bits);
|
||||
(bitunpacker, vals)
|
||||
let bitunpacker = BitUnpacker::new(num_bits);
|
||||
(bitunpacker, vals, data)
|
||||
}
|
||||
|
||||
fn test_bitpacker_util(len: usize, num_bits: u8) {
|
||||
let (bitunpacker, vals) = create_fastfield_bitpacker(len, num_bits);
|
||||
let (bitunpacker, vals, data) = create_fastfield_bitpacker(len, num_bits);
|
||||
for (i, val) in vals.iter().enumerate() {
|
||||
assert_eq!(bitunpacker.get(i as u64), *val);
|
||||
assert_eq!(bitunpacker.get(i as u64, &data), *val);
|
||||
}
|
||||
}
|
||||
|
||||
178
bitpacker/src/blocked_bitpacker.rs
Normal file
178
bitpacker/src/blocked_bitpacker.rs
Normal file
@@ -0,0 +1,178 @@
|
||||
use crate::{minmax, BitUnpacker};
|
||||
|
||||
use super::{bitpacker::BitPacker, compute_num_bits};
|
||||
|
||||
const BLOCK_SIZE: usize = 128;
|
||||
|
||||
/// `BlockedBitpacker` compresses data in blocks of
|
||||
/// 128 elements, while keeping an index on it
|
||||
///
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct BlockedBitpacker {
|
||||
// bitpacked blocks
|
||||
compressed_blocks: Vec<u8>,
|
||||
// uncompressed data, collected until BLOCK_SIZE
|
||||
buffer: Vec<u64>,
|
||||
offset_and_bits: Vec<BlockedBitpackerEntryMetaData>,
|
||||
}
|
||||
impl Default for BlockedBitpacker {
|
||||
fn default() -> Self {
|
||||
BlockedBitpacker::new()
|
||||
}
|
||||
}
|
||||
|
||||
/// `BlockedBitpackerEntryMetaData` encodes the
|
||||
/// offset and bit_width into a u64 bit field
|
||||
///
|
||||
/// This saves some space, since 7byte is more
|
||||
/// than enough and also keeps the access fast
|
||||
/// because of alignment
|
||||
#[derive(Debug, Clone, Default)]
|
||||
struct BlockedBitpackerEntryMetaData {
|
||||
encoded: u64,
|
||||
base_value: u64,
|
||||
}
|
||||
|
||||
impl BlockedBitpackerEntryMetaData {
|
||||
fn new(offset: u64, num_bits: u8, base_value: u64) -> Self {
|
||||
let encoded = offset | (num_bits as u64) << (64 - 8);
|
||||
Self {
|
||||
encoded,
|
||||
base_value,
|
||||
}
|
||||
}
|
||||
fn offset(&self) -> u64 {
|
||||
(self.encoded << 8) >> 8
|
||||
}
|
||||
fn num_bits(&self) -> u8 {
|
||||
(self.encoded >> 56) as u8
|
||||
}
|
||||
fn base_value(&self) -> u64 {
|
||||
self.base_value
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn metadata_test() {
|
||||
let meta = BlockedBitpackerEntryMetaData::new(50000, 6, 40000);
|
||||
assert_eq!(meta.offset(), 50000);
|
||||
assert_eq!(meta.num_bits(), 6);
|
||||
}
|
||||
|
||||
impl BlockedBitpacker {
|
||||
pub fn new() -> Self {
|
||||
let mut compressed_blocks = vec![];
|
||||
compressed_blocks.resize(8, 0);
|
||||
Self {
|
||||
compressed_blocks,
|
||||
buffer: vec![],
|
||||
offset_and_bits: vec![],
|
||||
}
|
||||
}
|
||||
|
||||
/// The memory used (inclusive childs)
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
std::mem::size_of::<BlockedBitpacker>()
|
||||
+ self.compressed_blocks.capacity()
|
||||
+ self.offset_and_bits.capacity()
|
||||
* std::mem::size_of_val(&self.offset_and_bits.get(0).cloned().unwrap_or_default())
|
||||
+ self.buffer.capacity()
|
||||
* std::mem::size_of_val(&self.buffer.get(0).cloned().unwrap_or_default())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn add(&mut self, val: u64) {
|
||||
self.buffer.push(val);
|
||||
if self.buffer.len() == BLOCK_SIZE as usize {
|
||||
self.flush();
|
||||
}
|
||||
}
|
||||
|
||||
pub fn flush(&mut self) {
|
||||
if let Some((min_value, max_value)) = minmax(self.buffer.iter()) {
|
||||
let mut bit_packer = BitPacker::new();
|
||||
let num_bits_block = compute_num_bits(*max_value - min_value);
|
||||
// todo performance: the padding handling could be done better, e.g. use a slice and
|
||||
// return num_bytes written from bitpacker
|
||||
self.compressed_blocks
|
||||
.resize(self.compressed_blocks.len() - 8, 0); // remove padding for bitpacker
|
||||
let offset = self.compressed_blocks.len() as u64;
|
||||
// todo performance: for some bit_width we
|
||||
// can encode multiple vals into the
|
||||
// mini_buffer before checking to flush
|
||||
// (to be done in BitPacker)
|
||||
for val in self.buffer.iter() {
|
||||
bit_packer
|
||||
.write(
|
||||
*val - min_value,
|
||||
num_bits_block,
|
||||
&mut self.compressed_blocks,
|
||||
)
|
||||
.expect("cannot write bitpacking to output"); // write to in memory can't fail
|
||||
}
|
||||
bit_packer.flush(&mut self.compressed_blocks).unwrap();
|
||||
self.offset_and_bits
|
||||
.push(BlockedBitpackerEntryMetaData::new(
|
||||
offset,
|
||||
num_bits_block,
|
||||
*min_value,
|
||||
));
|
||||
|
||||
self.buffer.clear();
|
||||
self.compressed_blocks
|
||||
.resize(self.compressed_blocks.len() + 8, 0); // add padding for bitpacker
|
||||
}
|
||||
}
|
||||
#[inline]
|
||||
pub fn get(&self, idx: usize) -> u64 {
|
||||
let metadata_pos = idx / BLOCK_SIZE as usize;
|
||||
let pos_in_block = idx % BLOCK_SIZE as usize;
|
||||
if let Some(metadata) = self.offset_and_bits.get(metadata_pos) {
|
||||
let unpacked = BitUnpacker::new(metadata.num_bits()).get(
|
||||
pos_in_block as u64,
|
||||
&self.compressed_blocks[metadata.offset() as usize..],
|
||||
);
|
||||
unpacked + metadata.base_value()
|
||||
} else {
|
||||
self.buffer[pos_in_block]
|
||||
}
|
||||
}
|
||||
|
||||
pub fn iter(&self) -> impl Iterator<Item = u64> + '_ {
|
||||
// todo performance: we could decompress a whole block and cache it instead
|
||||
let bitpacked_elems = self.offset_and_bits.len() * BLOCK_SIZE;
|
||||
let iter = (0..bitpacked_elems)
|
||||
.map(move |idx| self.get(idx))
|
||||
.chain(self.buffer.iter().cloned());
|
||||
iter
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
#[test]
|
||||
fn blocked_bitpacker_empty() {
|
||||
let blocked_bitpacker = BlockedBitpacker::new();
|
||||
assert_eq!(blocked_bitpacker.iter().collect::<Vec<u64>>(), vec![]);
|
||||
}
|
||||
#[test]
|
||||
fn blocked_bitpacker_one() {
|
||||
let mut blocked_bitpacker = BlockedBitpacker::new();
|
||||
blocked_bitpacker.add(50000);
|
||||
assert_eq!(blocked_bitpacker.get(0), 50000);
|
||||
assert_eq!(blocked_bitpacker.iter().collect::<Vec<u64>>(), vec![50000]);
|
||||
}
|
||||
#[test]
|
||||
fn blocked_bitpacker_test() {
|
||||
let mut blocked_bitpacker = BlockedBitpacker::new();
|
||||
for val in 0..21500 {
|
||||
blocked_bitpacker.add(val);
|
||||
}
|
||||
for val in 0..21500 {
|
||||
assert_eq!(blocked_bitpacker.get(val as usize), val);
|
||||
}
|
||||
assert_eq!(blocked_bitpacker.iter().count(), 21500);
|
||||
assert_eq!(blocked_bitpacker.iter().last().unwrap(), 21499);
|
||||
}
|
||||
}
|
||||
52
bitpacker/src/lib.rs
Normal file
52
bitpacker/src/lib.rs
Normal file
@@ -0,0 +1,52 @@
|
||||
mod bitpacker;
|
||||
mod blocked_bitpacker;
|
||||
|
||||
pub use crate::bitpacker::BitPacker;
|
||||
pub use crate::bitpacker::BitUnpacker;
|
||||
pub use crate::blocked_bitpacker::BlockedBitpacker;
|
||||
|
||||
/// Computes the number of bits that will be used for bitpacking.
|
||||
///
|
||||
/// In general the target is the minimum number of bits
|
||||
/// required to express the amplitude given in argument.
|
||||
///
|
||||
/// e.g. If the amplitude is 10, we can store all ints on simply 4bits.
|
||||
///
|
||||
/// The logic is slightly more convoluted here as for optimization
|
||||
/// reasons, we want to ensure that a value spawns over at most 8 bytes
|
||||
/// of aligned bytes.
|
||||
///
|
||||
/// Spanning over 9 bytes is possible for instance, if we do
|
||||
/// bitpacking with an amplitude of 63 bits.
|
||||
/// In this case, the second int will start on bit
|
||||
/// 63 (which belongs to byte 7) and ends at byte 15;
|
||||
/// Hence 9 bytes (from byte 7 to byte 15 included).
|
||||
///
|
||||
/// To avoid this, we force the number of bits to 64bits
|
||||
/// when the result is greater than `64-8 = 56 bits`.
|
||||
///
|
||||
/// Note that this only affects rare use cases spawning over
|
||||
/// a very large range of values. Even in this case, it results
|
||||
/// in an extra cost of at most 12% compared to the optimal
|
||||
/// number of bits.
|
||||
pub fn compute_num_bits(n: u64) -> u8 {
|
||||
let amplitude = (64u32 - n.leading_zeros()) as u8;
|
||||
if amplitude <= 64 - 8 {
|
||||
amplitude
|
||||
} else {
|
||||
64
|
||||
}
|
||||
}
|
||||
|
||||
pub fn minmax<I, T>(mut vals: I) -> Option<(T, T)>
|
||||
where
|
||||
I: Iterator<Item = T>,
|
||||
T: Copy + Ord,
|
||||
{
|
||||
if let Some(first_el) = vals.next() {
|
||||
return Some(vals.fold((first_el, first_el), |(min_val, max_val), el| {
|
||||
(min_val.min(el), max_val.max(el))
|
||||
}));
|
||||
}
|
||||
None
|
||||
}
|
||||
12
common/Cargo.toml
Normal file
12
common/Cargo.toml
Normal file
@@ -0,0 +1,12 @@
|
||||
[package]
|
||||
name = "common"
|
||||
version = "0.1.0"
|
||||
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
|
||||
license = "MIT"
|
||||
edition = "2018"
|
||||
description = "common traits and utility functions used by multiple tantivy subcrates"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
byteorder = "1.4.3"
|
||||
9
common/src/lib.rs
Normal file
9
common/src/lib.rs
Normal file
@@ -0,0 +1,9 @@
|
||||
pub use byteorder::LittleEndian as Endianness;
|
||||
|
||||
mod serialize;
|
||||
mod vint;
|
||||
mod writer;
|
||||
|
||||
pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
|
||||
pub use vint::{read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt};
|
||||
pub use writer::{AntiCallToken, CountingWriter, TerminatingWrite};
|
||||
@@ -1,5 +1,5 @@
|
||||
use crate::common::Endianness;
|
||||
use crate::common::VInt;
|
||||
use crate::Endianness;
|
||||
use crate::VInt;
|
||||
use byteorder::{ReadBytesExt, WriteBytesExt};
|
||||
use std::fmt;
|
||||
use std::io;
|
||||
@@ -14,6 +14,20 @@ pub trait BinarySerializable: fmt::Debug + Sized {
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self>;
|
||||
}
|
||||
|
||||
pub trait DeserializeFrom<T: BinarySerializable> {
|
||||
fn deserialize(&mut self) -> io::Result<T>;
|
||||
}
|
||||
|
||||
/// Implement deserialize from &[u8] for all types which implement BinarySerializable.
|
||||
///
|
||||
/// TryFrom would actually be preferrable, but not possible because of the orphan
|
||||
/// rules (not completely sure if this could be resolved)
|
||||
impl<T: BinarySerializable> DeserializeFrom<T> for &[u8] {
|
||||
fn deserialize(&mut self) -> io::Result<T> {
|
||||
T::deserialize(self)
|
||||
}
|
||||
}
|
||||
|
||||
/// `FixedSize` marks a `BinarySerializable` as
|
||||
/// always serializing to the same size.
|
||||
pub trait FixedSize: BinarySerializable {
|
||||
@@ -61,6 +75,11 @@ impl<Left: BinarySerializable, Right: BinarySerializable> BinarySerializable for
|
||||
Ok((Left::deserialize(reader)?, Right::deserialize(reader)?))
|
||||
}
|
||||
}
|
||||
impl<Left: BinarySerializable + FixedSize, Right: BinarySerializable + FixedSize> FixedSize
|
||||
for (Left, Right)
|
||||
{
|
||||
const SIZE_IN_BYTES: usize = Left::SIZE_IN_BYTES + Right::SIZE_IN_BYTES;
|
||||
}
|
||||
|
||||
impl BinarySerializable for u32 {
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
@@ -141,6 +160,28 @@ impl FixedSize for u8 {
|
||||
const SIZE_IN_BYTES: usize = 1;
|
||||
}
|
||||
|
||||
impl BinarySerializable for bool {
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
let val = if *self { 1 } else { 0 };
|
||||
writer.write_u8(val)
|
||||
}
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<bool> {
|
||||
let val = reader.read_u8()?;
|
||||
match val {
|
||||
0 => Ok(false),
|
||||
1 => Ok(true),
|
||||
_ => Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
"invalid bool value on deserialization, data corrupted",
|
||||
)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FixedSize for bool {
|
||||
const SIZE_IN_BYTES: usize = 1;
|
||||
}
|
||||
|
||||
impl BinarySerializable for String {
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
let data: &[u8] = self.as_bytes();
|
||||
@@ -161,9 +202,9 @@ impl BinarySerializable for String {
|
||||
#[cfg(test)]
|
||||
pub mod test {
|
||||
|
||||
use super::VInt;
|
||||
use super::*;
|
||||
use crate::common::VInt;
|
||||
|
||||
use crate::serialize::BinarySerializable;
|
||||
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
|
||||
let mut buffer = Vec::new();
|
||||
O::default().serialize(&mut buffer).unwrap();
|
||||
@@ -106,7 +106,7 @@ pub fn read_u32_vint_no_advance(data: &[u8]) -> (u32, usize) {
|
||||
pub fn write_u32_vint<W: io::Write>(val: u32, writer: &mut W) -> io::Result<()> {
|
||||
let mut buf = [0u8; 8];
|
||||
let data = serialize_vint_u32(val, &mut buf);
|
||||
writer.write_all(&data)
|
||||
writer.write_all(data)
|
||||
}
|
||||
|
||||
impl VInt {
|
||||
@@ -175,14 +175,14 @@ impl BinarySerializable for VInt {
|
||||
mod tests {
|
||||
|
||||
use super::serialize_vint_u32;
|
||||
use super::BinarySerializable;
|
||||
use super::VInt;
|
||||
use crate::common::BinarySerializable;
|
||||
|
||||
fn aux_test_vint(val: u64) {
|
||||
let mut v = [14u8; 10];
|
||||
let num_bytes = VInt(val).serialize_into(&mut v);
|
||||
for i in num_bytes..10 {
|
||||
assert_eq!(v[i], 14u8);
|
||||
for el in &v[num_bytes..10] {
|
||||
assert_eq!(el, &14u8);
|
||||
}
|
||||
assert!(num_bytes > 0);
|
||||
if num_bytes < 10 {
|
||||
@@ -1,7 +1,4 @@
|
||||
use crate::directory::AntiCallToken;
|
||||
use crate::directory::TerminatingWrite;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::io::{self, BufWriter, Write};
|
||||
|
||||
pub struct CountingWriter<W> {
|
||||
underlying: W,
|
||||
@@ -16,41 +13,87 @@ impl<W: Write> CountingWriter<W> {
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn written_bytes(&self) -> u64 {
|
||||
self.written_bytes
|
||||
}
|
||||
|
||||
/// Returns the underlying write object.
|
||||
/// Note that this method does not trigger any flushing.
|
||||
#[inline]
|
||||
pub fn finish(self) -> W {
|
||||
self.underlying
|
||||
}
|
||||
}
|
||||
|
||||
impl<W: Write> Write for CountingWriter<W> {
|
||||
#[inline]
|
||||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
||||
let written_size = self.underlying.write(buf)?;
|
||||
self.written_bytes += written_size as u64;
|
||||
Ok(written_size)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
|
||||
self.underlying.write_all(buf)?;
|
||||
self.written_bytes += buf.len() as u64;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
self.underlying.flush()
|
||||
}
|
||||
}
|
||||
|
||||
impl<W: TerminatingWrite> TerminatingWrite for CountingWriter<W> {
|
||||
#[inline]
|
||||
fn terminate_ref(&mut self, token: AntiCallToken) -> io::Result<()> {
|
||||
self.underlying.terminate_ref(token)
|
||||
}
|
||||
}
|
||||
|
||||
/// Struct used to prevent from calling [`terminate_ref`](trait.TerminatingWrite#method.terminate_ref) directly
|
||||
///
|
||||
/// The point is that while the type is public, it cannot be built by anyone
|
||||
/// outside of this module.
|
||||
pub struct AntiCallToken(());
|
||||
|
||||
/// Trait used to indicate when no more write need to be done on a writer
|
||||
pub trait TerminatingWrite: Write {
|
||||
/// Indicate that the writer will no longer be used. Internally call terminate_ref.
|
||||
fn terminate(mut self) -> io::Result<()>
|
||||
where
|
||||
Self: Sized,
|
||||
{
|
||||
self.terminate_ref(AntiCallToken(()))
|
||||
}
|
||||
|
||||
/// You should implement this function to define custom behavior.
|
||||
/// This function should flush any buffer it may hold.
|
||||
fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()>;
|
||||
}
|
||||
|
||||
impl<W: TerminatingWrite + ?Sized> TerminatingWrite for Box<W> {
|
||||
fn terminate_ref(&mut self, token: AntiCallToken) -> io::Result<()> {
|
||||
self.as_mut().terminate_ref(token)
|
||||
}
|
||||
}
|
||||
|
||||
impl<W: TerminatingWrite> TerminatingWrite for BufWriter<W> {
|
||||
fn terminate_ref(&mut self, a: AntiCallToken) -> io::Result<()> {
|
||||
self.flush()?;
|
||||
self.get_mut().terminate_ref(a)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TerminatingWrite for &'a mut Vec<u8> {
|
||||
fn terminate_ref(&mut self, _a: AntiCallToken) -> io::Result<()> {
|
||||
self.flush()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
// ---
|
||||
// Importing tantivy...
|
||||
use tantivy::collector::{Collector, SegmentCollector};
|
||||
use tantivy::fastfield::FastFieldReader;
|
||||
use tantivy::fastfield::{DynamicFastFieldReader, FastFieldReader};
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::Field;
|
||||
use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
|
||||
@@ -98,7 +98,7 @@ impl Collector for StatsCollector {
|
||||
}
|
||||
|
||||
struct StatsSegmentCollector {
|
||||
fast_field_reader: FastFieldReader<u64>,
|
||||
fast_field_reader: DynamicFastFieldReader<u64>,
|
||||
stats: Stats,
|
||||
}
|
||||
|
||||
@@ -139,7 +139,7 @@ fn main() -> tantivy::Result<()> {
|
||||
//
|
||||
// Lets index a bunch of fake documents for the sake of
|
||||
// this example.
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let mut index_writer = index.writer(50_000_000)?;
|
||||
index_writer.add_document(doc!(
|
||||
|
||||
@@ -90,7 +90,7 @@ fn main() -> tantivy::Result<()> {
|
||||
|
||||
let frankenstein_isbn = Term::from_field_text(isbn, "978-9176370711");
|
||||
|
||||
// Oops our frankenstein doc seems mispelled
|
||||
// Oops our frankenstein doc seems misspelled
|
||||
let frankenstein_doc_misspelled = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
|
||||
assert_eq!(
|
||||
schema.to_json(&frankenstein_doc_misspelled),
|
||||
|
||||
@@ -92,7 +92,7 @@ fn main() -> tantivy::Result<()> {
|
||||
|
||||
// Check the reference doc for different ways to create a `Facet` object.
|
||||
{
|
||||
let facet = Facet::from_text("/Felidae/Pantherinae");
|
||||
let facet = Facet::from("/Felidae/Pantherinae");
|
||||
let facet_term = Term::from_facet(classification, &facet);
|
||||
let facet_term_query = TermQuery::new(facet_term, IndexRecordOption::Basic);
|
||||
let mut facet_collector = FacetCollector::for_field(classification);
|
||||
|
||||
@@ -12,7 +12,7 @@ fn main() -> tantivy::Result<()> {
|
||||
let ingredient = schema_builder.add_facet_field("ingredient", INDEXED);
|
||||
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let mut index_writer = index.writer(30_000_000)?;
|
||||
|
||||
@@ -51,7 +51,7 @@ fn main() -> tantivy::Result<()> {
|
||||
let query = BooleanQuery::new_multiterms_query(
|
||||
facets
|
||||
.iter()
|
||||
.map(|key| Term::from_facet(ingredient, &key))
|
||||
.map(|key| Term::from_facet(ingredient, key))
|
||||
.collect(),
|
||||
);
|
||||
let top_docs_by_custom_score =
|
||||
|
||||
@@ -22,7 +22,7 @@ fn main() -> tantivy::Result<()> {
|
||||
let title = schema_builder.add_text_field("title", TEXT | STORED);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
|
||||
index_writer.add_document(doc!(title => "The Old Man and the Sea"));
|
||||
|
||||
@@ -82,7 +82,7 @@ fn main() -> tantivy::Result<()> {
|
||||
}]
|
||||
}"#;
|
||||
|
||||
let short_man_doc = schema.parse_document(&short_man_json)?;
|
||||
let short_man_doc = schema.parse_document(short_man_json)?;
|
||||
|
||||
index_writer.add_document(short_man_doc);
|
||||
|
||||
|
||||
@@ -25,7 +25,7 @@ fn main() -> tantivy::Result<()> {
|
||||
let schema = schema_builder.build();
|
||||
|
||||
// # Indexing documents
|
||||
let index = Index::create_in_dir(&index_path, schema.clone())?;
|
||||
let index = Index::create_in_dir(&index_path, schema)?;
|
||||
|
||||
let mut index_writer = index.writer(50_000_000)?;
|
||||
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
use tantivy;
|
||||
use tantivy::schema::*;
|
||||
|
||||
// # Document from json
|
||||
@@ -22,7 +21,7 @@ fn main() -> tantivy::Result<()> {
|
||||
}"#;
|
||||
|
||||
// We can parse our document
|
||||
let _mice_and_men_doc = schema.parse_document(&mice_and_men_doc_json)?;
|
||||
let _mice_and_men_doc = schema.parse_document(mice_and_men_doc_json)?;
|
||||
|
||||
// Multi-valued field are allowed, they are
|
||||
// expressed in JSON by an array.
|
||||
@@ -31,7 +30,7 @@ fn main() -> tantivy::Result<()> {
|
||||
"title": ["Frankenstein", "The Modern Prometheus"],
|
||||
"year": 1818
|
||||
}"#;
|
||||
let _frankenstein_doc = schema.parse_document(&frankenstein_json)?;
|
||||
let _frankenstein_doc = schema.parse_document(frankenstein_json)?;
|
||||
|
||||
// Note that the schema is saved in your index directory.
|
||||
//
|
||||
|
||||
24
fastfield_codecs/Cargo.toml
Normal file
24
fastfield_codecs/Cargo.toml
Normal file
@@ -0,0 +1,24 @@
|
||||
[package]
|
||||
name = "fastfield_codecs"
|
||||
version = "0.1.0"
|
||||
authors = ["Pascal Seitz <pascal@quickwit.io>"]
|
||||
license = "MIT"
|
||||
edition = "2018"
|
||||
description = "Fast field codecs used by tantivy"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
common = { path = "../common/" }
|
||||
tantivy-bitpacker = { path = "../bitpacker/" }
|
||||
prettytable-rs = {version="0.8.0", optional= true}
|
||||
rand = {version="0.8.3", optional= true}
|
||||
|
||||
[dev-dependencies]
|
||||
more-asserts = "0.2.1"
|
||||
rand = "0.8.3"
|
||||
|
||||
[features]
|
||||
bin = ["prettytable-rs", "rand"]
|
||||
default = ["bin"]
|
||||
|
||||
68
fastfield_codecs/README.md
Normal file
68
fastfield_codecs/README.md
Normal file
@@ -0,0 +1,68 @@
|
||||
|
||||
|
||||
# Fast Field Codecs
|
||||
|
||||
This crate contains various fast field codecs, used to compress/decompress fast field data in tantivy.
|
||||
|
||||
## Contributing
|
||||
|
||||
Contributing is pretty straightforward. Since the bitpacking is the simplest compressor, you can check it for reference.
|
||||
|
||||
A codec needs to implement 2 traits:
|
||||
|
||||
- A reader implementing `FastFieldCodecReader` to read the codec.
|
||||
- A serializer implementing `FastFieldCodecSerializer` for compression estimation and codec name + id.
|
||||
|
||||
### Tests
|
||||
|
||||
Once the traits are implemented test and benchmark integration is pretty easy (see `test_with_codec_data_sets` and `bench.rs`).
|
||||
|
||||
Make sure to add the codec to the main.rs, which tests the compression ratio and estimation against different data sets. You can run it with:
|
||||
```
|
||||
cargo run --features bin
|
||||
```
|
||||
|
||||
### TODO
|
||||
- Add real world data sets in comparison
|
||||
- Add codec to cover sparse data sets
|
||||
|
||||
|
||||
### Codec Comparison
|
||||
```
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| | Compression Ratio | Compression Estimation |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Autoincrement | | |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| LinearInterpol | 0.000039572664 | 0.000004396963 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| MultiLinearInterpol | 0.1477348 | 0.17275847 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Bitpacked | 0.28126493 | 0.28125 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Monotonically increasing concave | | |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| LinearInterpol | 0.25003937 | 0.26562938 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| MultiLinearInterpol | 0.190665 | 0.1883836 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Bitpacked | 0.31251436 | 0.3125 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Monotonically increasing convex | | |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| LinearInterpol | 0.25003937 | 0.28125438 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| MultiLinearInterpol | 0.18676 | 0.2040086 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Bitpacked | 0.31251436 | 0.3125 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Almost monotonically increasing | | |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| LinearInterpol | 0.14066513 | 0.1562544 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| MultiLinearInterpol | 0.16335973 | 0.17275847 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Bitpacked | 0.28126493 | 0.28125 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
|
||||
```
|
||||
108
fastfield_codecs/benches/bench.rs
Normal file
108
fastfield_codecs/benches/bench.rs
Normal file
@@ -0,0 +1,108 @@
|
||||
#![feature(test)]
|
||||
|
||||
extern crate test;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use fastfield_codecs::{
|
||||
bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer},
|
||||
linearinterpol::{LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer},
|
||||
multilinearinterpol::{
|
||||
MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
|
||||
},
|
||||
*,
|
||||
};
|
||||
|
||||
fn get_data() -> Vec<u64> {
|
||||
let mut data: Vec<_> = (100..55000_u64)
|
||||
.map(|num| num + rand::random::<u8>() as u64)
|
||||
.collect();
|
||||
data.push(99_000);
|
||||
data.insert(1000, 2000);
|
||||
data.insert(2000, 100);
|
||||
data.insert(3000, 4100);
|
||||
data.insert(4000, 100);
|
||||
data.insert(5000, 800);
|
||||
data
|
||||
}
|
||||
|
||||
fn value_iter() -> impl Iterator<Item = u64> {
|
||||
0..20_000
|
||||
}
|
||||
fn bench_get<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
|
||||
b: &mut Bencher,
|
||||
data: &[u64],
|
||||
) {
|
||||
let mut bytes = vec![];
|
||||
S::serialize(
|
||||
&mut bytes,
|
||||
&data,
|
||||
stats_from_vec(data),
|
||||
data.iter().cloned(),
|
||||
data.iter().cloned(),
|
||||
)
|
||||
.unwrap();
|
||||
let reader = R::open_from_bytes(&bytes).unwrap();
|
||||
b.iter(|| {
|
||||
for pos in value_iter() {
|
||||
reader.get_u64(pos as u64, &bytes);
|
||||
}
|
||||
});
|
||||
}
|
||||
fn bench_create<S: FastFieldCodecSerializer>(b: &mut Bencher, data: &[u64]) {
|
||||
let mut bytes = vec![];
|
||||
b.iter(|| {
|
||||
S::serialize(
|
||||
&mut bytes,
|
||||
&data,
|
||||
stats_from_vec(data),
|
||||
data.iter().cloned(),
|
||||
data.iter().cloned(),
|
||||
)
|
||||
.unwrap();
|
||||
});
|
||||
}
|
||||
|
||||
use test::Bencher;
|
||||
#[bench]
|
||||
fn bench_fastfield_bitpack_create(b: &mut Bencher) {
|
||||
let data: Vec<_> = get_data();
|
||||
bench_create::<BitpackedFastFieldSerializer>(b, &data);
|
||||
}
|
||||
#[bench]
|
||||
fn bench_fastfield_linearinterpol_create(b: &mut Bencher) {
|
||||
let data: Vec<_> = get_data();
|
||||
bench_create::<LinearInterpolFastFieldSerializer>(b, &data);
|
||||
}
|
||||
#[bench]
|
||||
fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) {
|
||||
let data: Vec<_> = get_data();
|
||||
bench_create::<MultiLinearInterpolFastFieldSerializer>(b, &data);
|
||||
}
|
||||
#[bench]
|
||||
fn bench_fastfield_bitpack_get(b: &mut Bencher) {
|
||||
let data: Vec<_> = get_data();
|
||||
bench_get::<BitpackedFastFieldSerializer, BitpackedFastFieldReader>(b, &data);
|
||||
}
|
||||
#[bench]
|
||||
fn bench_fastfield_linearinterpol_get(b: &mut Bencher) {
|
||||
let data: Vec<_> = get_data();
|
||||
bench_get::<LinearInterpolFastFieldSerializer, LinearInterpolFastFieldReader>(b, &data);
|
||||
}
|
||||
#[bench]
|
||||
fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) {
|
||||
let data: Vec<_> = get_data();
|
||||
bench_get::<MultiLinearInterpolFastFieldSerializer, MultiLinearInterpolFastFieldReader>(
|
||||
b, &data,
|
||||
);
|
||||
}
|
||||
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
|
||||
let min_value = data.iter().cloned().min().unwrap_or(0);
|
||||
let max_value = data.iter().cloned().max().unwrap_or(0);
|
||||
FastFieldStats {
|
||||
min_value,
|
||||
max_value,
|
||||
num_vals: data.len() as u64,
|
||||
}
|
||||
}
|
||||
}
|
||||
176
fastfield_codecs/src/bitpacked.rs
Normal file
176
fastfield_codecs/src/bitpacked.rs
Normal file
@@ -0,0 +1,176 @@
|
||||
use crate::FastFieldCodecReader;
|
||||
use crate::FastFieldCodecSerializer;
|
||||
use crate::FastFieldDataAccess;
|
||||
use crate::FastFieldStats;
|
||||
use common::BinarySerializable;
|
||||
use std::io::{self, Write};
|
||||
use tantivy_bitpacker::compute_num_bits;
|
||||
use tantivy_bitpacker::BitPacker;
|
||||
|
||||
use tantivy_bitpacker::BitUnpacker;
|
||||
|
||||
/// Depending on the field type, a different
|
||||
/// fast field is required.
|
||||
#[derive(Clone)]
|
||||
pub struct BitpackedFastFieldReader {
|
||||
bit_unpacker: BitUnpacker,
|
||||
pub min_value_u64: u64,
|
||||
pub max_value_u64: u64,
|
||||
}
|
||||
|
||||
impl<'data> FastFieldCodecReader for BitpackedFastFieldReader {
|
||||
/// Opens a fast field given a file.
|
||||
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
|
||||
let (_data, mut footer) = bytes.split_at(bytes.len() - 16);
|
||||
let min_value = u64::deserialize(&mut footer)?;
|
||||
let amplitude = u64::deserialize(&mut footer)?;
|
||||
let max_value = min_value + amplitude;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let bit_unpacker = BitUnpacker::new(num_bits);
|
||||
Ok(BitpackedFastFieldReader {
|
||||
min_value_u64: min_value,
|
||||
max_value_u64: max_value,
|
||||
bit_unpacker,
|
||||
})
|
||||
}
|
||||
#[inline]
|
||||
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
|
||||
self.min_value_u64 + self.bit_unpacker.get(doc, data)
|
||||
}
|
||||
#[inline]
|
||||
fn min_value(&self) -> u64 {
|
||||
self.min_value_u64
|
||||
}
|
||||
#[inline]
|
||||
fn max_value(&self) -> u64 {
|
||||
self.max_value_u64
|
||||
}
|
||||
}
|
||||
pub struct BitpackedFastFieldSerializerLegacy<'a, W: 'a + Write> {
|
||||
bit_packer: BitPacker,
|
||||
write: &'a mut W,
|
||||
min_value: u64,
|
||||
amplitude: u64,
|
||||
num_bits: u8,
|
||||
}
|
||||
|
||||
impl<'a, W: Write> BitpackedFastFieldSerializerLegacy<'a, W> {
|
||||
/// Creates a new fast field serializer.
|
||||
///
|
||||
/// The serializer in fact encode the values by bitpacking
|
||||
/// `(val - min_value)`.
|
||||
///
|
||||
/// It requires a `min_value` and a `max_value` to compute
|
||||
/// compute the minimum number of bits required to encode
|
||||
/// values.
|
||||
pub fn open(
|
||||
write: &'a mut W,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
) -> io::Result<BitpackedFastFieldSerializerLegacy<'a, W>> {
|
||||
assert!(min_value <= max_value);
|
||||
let amplitude = max_value - min_value;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let bit_packer = BitPacker::new();
|
||||
Ok(BitpackedFastFieldSerializerLegacy {
|
||||
bit_packer,
|
||||
write,
|
||||
min_value,
|
||||
amplitude,
|
||||
num_bits,
|
||||
})
|
||||
}
|
||||
/// Pushes a new value to the currently open u64 fast field.
|
||||
#[inline]
|
||||
pub fn add_val(&mut self, val: u64) -> io::Result<()> {
|
||||
let val_to_write: u64 = val - self.min_value;
|
||||
self.bit_packer
|
||||
.write(val_to_write, self.num_bits, &mut self.write)?;
|
||||
Ok(())
|
||||
}
|
||||
pub fn close_field(mut self) -> io::Result<()> {
|
||||
self.bit_packer.close(&mut self.write)?;
|
||||
self.min_value.serialize(&mut self.write)?;
|
||||
self.amplitude.serialize(&mut self.write)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct BitpackedFastFieldSerializer {}
|
||||
|
||||
impl FastFieldCodecSerializer for BitpackedFastFieldSerializer {
|
||||
const NAME: &'static str = "Bitpacked";
|
||||
const ID: u8 = 1;
|
||||
/// Serializes data with the BitpackedFastFieldSerializer.
|
||||
///
|
||||
/// The serializer in fact encode the values by bitpacking
|
||||
/// `(val - min_value)`.
|
||||
///
|
||||
/// It requires a `min_value` and a `max_value` to compute
|
||||
/// compute the minimum number of bits required to encode
|
||||
/// values.
|
||||
fn serialize(
|
||||
write: &mut impl Write,
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
_data_iter1: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()> {
|
||||
let mut serializer =
|
||||
BitpackedFastFieldSerializerLegacy::open(write, stats.min_value, stats.max_value)?;
|
||||
|
||||
for val in data_iter {
|
||||
serializer.add_val(val)?;
|
||||
}
|
||||
serializer.close_field()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
fn is_applicable(
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
_stats: FastFieldStats,
|
||||
) -> bool {
|
||||
true
|
||||
}
|
||||
fn estimate(_fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
|
||||
let amplitude = stats.max_value - stats.min_value;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let num_bits_uncompressed = 64;
|
||||
num_bits as f32 / num_bits_uncompressed as f32
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tests::get_codec_test_data_sets;
|
||||
|
||||
fn create_and_validate(data: &[u64], name: &str) {
|
||||
crate::tests::create_and_validate::<BitpackedFastFieldSerializer, BitpackedFastFieldReader>(
|
||||
data, name,
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_with_codec_data_sets() {
|
||||
let data_sets = get_codec_test_data_sets();
|
||||
for (mut data, name) in data_sets {
|
||||
create_and_validate(&data, name);
|
||||
data.reverse();
|
||||
create_and_validate(&data, name);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bitpacked_fast_field_rand() {
|
||||
for _ in 0..500 {
|
||||
let mut data = (0..1 + rand::random::<u8>() as usize)
|
||||
.map(|_| rand::random::<i64>() as u64 / 2)
|
||||
.collect::<Vec<_>>();
|
||||
create_and_validate(&data, "rand");
|
||||
|
||||
data.reverse();
|
||||
create_and_validate(&data, "rand");
|
||||
}
|
||||
}
|
||||
}
|
||||
225
fastfield_codecs/src/lib.rs
Normal file
225
fastfield_codecs/src/lib.rs
Normal file
@@ -0,0 +1,225 @@
|
||||
#[cfg(test)]
|
||||
#[macro_use]
|
||||
extern crate more_asserts;
|
||||
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
|
||||
pub mod bitpacked;
|
||||
pub mod linearinterpol;
|
||||
pub mod multilinearinterpol;
|
||||
|
||||
pub trait FastFieldCodecReader: Sized {
|
||||
/// reads the metadata and returns the CodecReader
|
||||
fn open_from_bytes(bytes: &[u8]) -> std::io::Result<Self>;
|
||||
|
||||
fn get_u64(&self, doc: u64, data: &[u8]) -> u64;
|
||||
|
||||
fn min_value(&self) -> u64;
|
||||
fn max_value(&self) -> u64;
|
||||
}
|
||||
|
||||
/// The FastFieldSerializerEstimate trait is required on all variants
|
||||
/// of fast field compressions, to decide which one to choose.
|
||||
pub trait FastFieldCodecSerializer {
|
||||
/// A codex needs to provide a unique name and id, which is
|
||||
/// used for debugging and de/serialization.
|
||||
const NAME: &'static str;
|
||||
const ID: u8;
|
||||
|
||||
/// Check if the Codec is able to compress the data
|
||||
fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> bool;
|
||||
|
||||
/// Returns an estimate of the compression ratio.
|
||||
/// The baseline is uncompressed 64bit data.
|
||||
///
|
||||
/// It could make sense to also return a value representing
|
||||
/// computational complexity.
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32;
|
||||
|
||||
/// Serializes the data using the serializer into write.
|
||||
/// There are multiple iterators, in case the codec needs to read the data multiple times.
|
||||
/// The iterators should be preferred over using fastfield_accessor for performance reasons.
|
||||
fn serialize(
|
||||
write: &mut impl Write,
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
data_iter1: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()>;
|
||||
}
|
||||
|
||||
/// FastFieldDataAccess is the trait to access fast field data during serialization and estimation.
|
||||
pub trait FastFieldDataAccess {
|
||||
/// Return the value associated to the given position.
|
||||
///
|
||||
/// Whenever possible use the Iterator passed to the fastfield creation instead, for performance reasons.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// May panic if `position` is greater than the index.
|
||||
fn get_val(&self, position: u64) -> u64;
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct FastFieldStats {
|
||||
pub min_value: u64,
|
||||
pub max_value: u64,
|
||||
pub num_vals: u64,
|
||||
}
|
||||
|
||||
impl<'a> FastFieldDataAccess for &'a [u64] {
|
||||
fn get_val(&self, position: u64) -> u64 {
|
||||
self[position as usize]
|
||||
}
|
||||
}
|
||||
|
||||
impl FastFieldDataAccess for Vec<u64> {
|
||||
fn get_val(&self, position: u64) -> u64 {
|
||||
self[position as usize]
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::{
|
||||
bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer},
|
||||
linearinterpol::{LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer},
|
||||
multilinearinterpol::{
|
||||
MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
|
||||
},
|
||||
};
|
||||
|
||||
pub fn create_and_validate<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
|
||||
data: &[u64],
|
||||
name: &str,
|
||||
) -> (f32, f32) {
|
||||
if !S::is_applicable(&data, crate::tests::stats_from_vec(data)) {
|
||||
return (f32::MAX, 0.0);
|
||||
}
|
||||
let estimation = S::estimate(&data, crate::tests::stats_from_vec(data));
|
||||
let mut out = vec![];
|
||||
S::serialize(
|
||||
&mut out,
|
||||
&data,
|
||||
crate::tests::stats_from_vec(data),
|
||||
data.iter().cloned(),
|
||||
data.iter().cloned(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let reader = R::open_from_bytes(&out).unwrap();
|
||||
for (doc, orig_val) in data.iter().enumerate() {
|
||||
let val = reader.get_u64(doc as u64, &out);
|
||||
if val != *orig_val {
|
||||
panic!(
|
||||
"val {:?} does not match orig_val {:?}, in data set {}, data {:?}",
|
||||
val, orig_val, name, data
|
||||
);
|
||||
}
|
||||
}
|
||||
let actual_compression = data.len() as f32 / out.len() as f32;
|
||||
(estimation, actual_compression)
|
||||
}
|
||||
pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
|
||||
let mut data_and_names = vec![];
|
||||
|
||||
let data = (10..=20_u64).collect::<Vec<_>>();
|
||||
data_and_names.push((data, "simple monotonically increasing"));
|
||||
|
||||
data_and_names.push((
|
||||
vec![5, 6, 7, 8, 9, 10, 99, 100],
|
||||
"offset in linear interpol",
|
||||
));
|
||||
data_and_names.push((vec![5, 50, 3, 13, 1, 1000, 35], "rand small"));
|
||||
data_and_names.push((vec![10], "single value"));
|
||||
|
||||
data_and_names
|
||||
}
|
||||
|
||||
fn test_codec<S: FastFieldCodecSerializer, R: FastFieldCodecReader>() {
|
||||
let codec_name = S::NAME;
|
||||
for (data, data_set_name) in get_codec_test_data_sets() {
|
||||
let (estimate, actual) =
|
||||
crate::tests::create_and_validate::<S, R>(&data, data_set_name);
|
||||
let result = if estimate == f32::MAX {
|
||||
"Disabled".to_string()
|
||||
} else {
|
||||
format!("Estimate {:?} Actual {:?} ", estimate, actual)
|
||||
};
|
||||
println!(
|
||||
"Codec {}, DataSet {}, {}",
|
||||
codec_name, data_set_name, result
|
||||
);
|
||||
}
|
||||
}
|
||||
#[test]
|
||||
fn test_codec_bitpacking() {
|
||||
test_codec::<BitpackedFastFieldSerializer, BitpackedFastFieldReader>();
|
||||
}
|
||||
#[test]
|
||||
fn test_codec_interpolation() {
|
||||
test_codec::<LinearInterpolFastFieldSerializer, LinearInterpolFastFieldReader>();
|
||||
}
|
||||
#[test]
|
||||
fn test_codec_multi_interpolation() {
|
||||
test_codec::<MultiLinearInterpolFastFieldSerializer, MultiLinearInterpolFastFieldReader>();
|
||||
}
|
||||
|
||||
use super::*;
|
||||
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
|
||||
let min_value = data.iter().cloned().min().unwrap_or(0);
|
||||
let max_value = data.iter().cloned().max().unwrap_or(0);
|
||||
FastFieldStats {
|
||||
min_value,
|
||||
max_value,
|
||||
num_vals: data.len() as u64,
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn estimation_good_interpolation_case() {
|
||||
let data = (10..=20000_u64).collect::<Vec<_>>();
|
||||
|
||||
let linear_interpol_estimation =
|
||||
LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
assert_le!(linear_interpol_estimation, 0.01);
|
||||
|
||||
let multi_linear_interpol_estimation =
|
||||
MultiLinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
assert_le!(multi_linear_interpol_estimation, 0.2);
|
||||
assert_le!(linear_interpol_estimation, multi_linear_interpol_estimation);
|
||||
|
||||
let bitpacked_estimation =
|
||||
BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
assert_le!(linear_interpol_estimation, bitpacked_estimation);
|
||||
}
|
||||
#[test]
|
||||
fn estimation_test_bad_interpolation_case() {
|
||||
let data = vec![200, 10, 10, 10, 10, 1000, 20];
|
||||
|
||||
let linear_interpol_estimation =
|
||||
LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
assert_le!(linear_interpol_estimation, 0.32);
|
||||
|
||||
let bitpacked_estimation =
|
||||
BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
assert_le!(bitpacked_estimation, linear_interpol_estimation);
|
||||
}
|
||||
#[test]
|
||||
fn estimation_test_bad_interpolation_case_monotonically_increasing() {
|
||||
let mut data = (200..=20000_u64).collect::<Vec<_>>();
|
||||
data.push(1_000_000);
|
||||
|
||||
// in this case the linear interpolation can't in fact not be worse than bitpacking,
|
||||
// but the estimator adds some threshold, which leads to estimated worse behavior
|
||||
let linear_interpol_estimation =
|
||||
LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
assert_le!(linear_interpol_estimation, 0.35);
|
||||
|
||||
let bitpacked_estimation =
|
||||
BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
assert_le!(bitpacked_estimation, 0.32);
|
||||
assert_le!(bitpacked_estimation, linear_interpol_estimation);
|
||||
}
|
||||
}
|
||||
295
fastfield_codecs/src/linearinterpol.rs
Normal file
295
fastfield_codecs/src/linearinterpol.rs
Normal file
@@ -0,0 +1,295 @@
|
||||
use crate::FastFieldCodecReader;
|
||||
use crate::FastFieldCodecSerializer;
|
||||
use crate::FastFieldDataAccess;
|
||||
use crate::FastFieldStats;
|
||||
use std::io::{self, Read, Write};
|
||||
use std::ops::Sub;
|
||||
use tantivy_bitpacker::compute_num_bits;
|
||||
use tantivy_bitpacker::BitPacker;
|
||||
|
||||
use common::BinarySerializable;
|
||||
use common::FixedSize;
|
||||
use tantivy_bitpacker::BitUnpacker;
|
||||
|
||||
/// Depending on the field type, a different
|
||||
/// fast field is required.
|
||||
#[derive(Clone)]
|
||||
pub struct LinearInterpolFastFieldReader {
|
||||
bit_unpacker: BitUnpacker,
|
||||
pub footer: LinearInterpolFooter,
|
||||
pub slope: f32,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct LinearInterpolFooter {
|
||||
pub relative_max_value: u64,
|
||||
pub offset: u64,
|
||||
pub first_val: u64,
|
||||
pub last_val: u64,
|
||||
pub num_vals: u64,
|
||||
pub min_value: u64,
|
||||
pub max_value: u64,
|
||||
}
|
||||
|
||||
impl BinarySerializable for LinearInterpolFooter {
|
||||
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
|
||||
self.relative_max_value.serialize(write)?;
|
||||
self.offset.serialize(write)?;
|
||||
self.first_val.serialize(write)?;
|
||||
self.last_val.serialize(write)?;
|
||||
self.num_vals.serialize(write)?;
|
||||
self.min_value.serialize(write)?;
|
||||
self.max_value.serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<LinearInterpolFooter> {
|
||||
Ok(LinearInterpolFooter {
|
||||
relative_max_value: u64::deserialize(reader)?,
|
||||
offset: u64::deserialize(reader)?,
|
||||
first_val: u64::deserialize(reader)?,
|
||||
last_val: u64::deserialize(reader)?,
|
||||
num_vals: u64::deserialize(reader)?,
|
||||
min_value: u64::deserialize(reader)?,
|
||||
max_value: u64::deserialize(reader)?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl FixedSize for LinearInterpolFooter {
|
||||
const SIZE_IN_BYTES: usize = 56;
|
||||
}
|
||||
|
||||
impl FastFieldCodecReader for LinearInterpolFastFieldReader {
|
||||
/// Opens a fast field given a file.
|
||||
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
|
||||
let (_data, mut footer) = bytes.split_at(bytes.len() - LinearInterpolFooter::SIZE_IN_BYTES);
|
||||
let footer = LinearInterpolFooter::deserialize(&mut footer)?;
|
||||
let slope = get_slope(footer.first_val, footer.last_val, footer.num_vals);
|
||||
|
||||
let num_bits = compute_num_bits(footer.relative_max_value);
|
||||
let bit_unpacker = BitUnpacker::new(num_bits);
|
||||
Ok(LinearInterpolFastFieldReader {
|
||||
bit_unpacker,
|
||||
footer,
|
||||
slope,
|
||||
})
|
||||
}
|
||||
#[inline]
|
||||
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
|
||||
let calculated_value = get_calculated_value(self.footer.first_val, doc, self.slope);
|
||||
(calculated_value + self.bit_unpacker.get(doc, data)) - self.footer.offset
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn min_value(&self) -> u64 {
|
||||
self.footer.min_value
|
||||
}
|
||||
#[inline]
|
||||
fn max_value(&self) -> u64 {
|
||||
self.footer.max_value
|
||||
}
|
||||
}
|
||||
|
||||
/// Fastfield serializer, which tries to guess values by linear interpolation
|
||||
/// and stores the difference bitpacked.
|
||||
pub struct LinearInterpolFastFieldSerializer {}
|
||||
|
||||
#[inline]
|
||||
fn get_slope(first_val: u64, last_val: u64, num_vals: u64) -> f32 {
|
||||
if num_vals <= 1 {
|
||||
return 0.0;
|
||||
}
|
||||
// We calculate the slope with f64 high precision and use the result in lower precision f32
|
||||
// This is done in order to handle estimations for very large values like i64::MAX
|
||||
((last_val as f64 - first_val as f64) / (num_vals as u64 - 1) as f64) as f32
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
|
||||
first_val + (pos as f32 * slope) as u64
|
||||
}
|
||||
|
||||
impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
|
||||
const NAME: &'static str = "LinearInterpol";
|
||||
const ID: u8 = 2;
|
||||
/// Creates a new fast field serializer.
|
||||
fn serialize(
|
||||
write: &mut impl Write,
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
data_iter1: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()> {
|
||||
assert!(stats.min_value <= stats.max_value);
|
||||
|
||||
let first_val = fastfield_accessor.get_val(0);
|
||||
let last_val = fastfield_accessor.get_val(stats.num_vals as u64 - 1);
|
||||
let slope = get_slope(first_val, last_val, stats.num_vals);
|
||||
// calculate offset to ensure all values are positive
|
||||
let mut offset = 0;
|
||||
let mut rel_positive_max = 0;
|
||||
for (pos, actual_value) in data_iter1.enumerate() {
|
||||
let calculated_value = get_calculated_value(first_val, pos as u64, slope);
|
||||
if calculated_value > actual_value {
|
||||
// negative value we need to apply an offset
|
||||
// we ignore negative values in the max value calculation, because negative values
|
||||
// will be offset to 0
|
||||
offset = offset.max(calculated_value - actual_value);
|
||||
} else {
|
||||
//positive value no offset reuqired
|
||||
rel_positive_max = rel_positive_max.max(actual_value - calculated_value);
|
||||
}
|
||||
}
|
||||
|
||||
// rel_positive_max will be adjusted by offset
|
||||
let relative_max_value = rel_positive_max + offset;
|
||||
|
||||
let num_bits = compute_num_bits(relative_max_value);
|
||||
let mut bit_packer = BitPacker::new();
|
||||
for (pos, val) in data_iter.enumerate() {
|
||||
let calculated_value = get_calculated_value(first_val, pos as u64, slope);
|
||||
let diff = (val + offset) - calculated_value;
|
||||
bit_packer.write(diff, num_bits, write)?;
|
||||
}
|
||||
bit_packer.close(write)?;
|
||||
|
||||
let footer = LinearInterpolFooter {
|
||||
relative_max_value,
|
||||
offset,
|
||||
first_val,
|
||||
last_val,
|
||||
num_vals: stats.num_vals,
|
||||
min_value: stats.min_value,
|
||||
max_value: stats.max_value,
|
||||
};
|
||||
footer.serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
fn is_applicable(
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> bool {
|
||||
if stats.num_vals < 3 {
|
||||
return false; //disable compressor for this case
|
||||
}
|
||||
// On serialisation the offset is added to the actual value.
|
||||
// We need to make sure this won't run into overflow calculation issues.
|
||||
// For this we take the maximum theroretical offset and add this to the max value.
|
||||
// If this doesn't overflow the algortihm should be fine
|
||||
let theorethical_maximum_offset = stats.max_value - stats.min_value;
|
||||
if stats
|
||||
.max_value
|
||||
.checked_add(theorethical_maximum_offset)
|
||||
.is_none()
|
||||
{
|
||||
return false;
|
||||
}
|
||||
true
|
||||
}
|
||||
/// estimation for linear interpolation is hard because, you don't know
|
||||
/// where the local maxima for the deviation of the calculated value are and
|
||||
/// the offset to shift all values to >=0 is also unknown.
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
|
||||
let first_val = fastfield_accessor.get_val(0);
|
||||
let last_val = fastfield_accessor.get_val(stats.num_vals as u64 - 1);
|
||||
let slope = get_slope(first_val, last_val, stats.num_vals);
|
||||
|
||||
// let's sample at 0%, 5%, 10% .. 95%, 100%
|
||||
let num_vals = stats.num_vals as f32 / 100.0;
|
||||
let sample_positions = (0..20)
|
||||
.map(|pos| (num_vals * pos as f32 * 5.0) as usize)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let max_distance = sample_positions
|
||||
.iter()
|
||||
.map(|pos| {
|
||||
let calculated_value = get_calculated_value(first_val, *pos as u64, slope);
|
||||
let actual_value = fastfield_accessor.get_val(*pos as u64);
|
||||
distance(calculated_value, actual_value)
|
||||
})
|
||||
.max()
|
||||
.unwrap_or(0);
|
||||
|
||||
// the theory would be that we don't have the actual max_distance, but we are close within 50%
|
||||
// threshold.
|
||||
// It is multiplied by 2 because in a log case scenario the line would be as much above as
|
||||
// below. So the offset would = max_distance
|
||||
//
|
||||
let relative_max_value = (max_distance as f32 * 1.5) * 2.0;
|
||||
|
||||
let num_bits = compute_num_bits(relative_max_value as u64) as u64 * stats.num_vals as u64
|
||||
+ LinearInterpolFooter::SIZE_IN_BYTES as u64;
|
||||
let num_bits_uncompressed = 64 * stats.num_vals;
|
||||
num_bits as f32 / num_bits_uncompressed as f32
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
|
||||
if x < y {
|
||||
y - x
|
||||
} else {
|
||||
x - y
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tests::get_codec_test_data_sets;
|
||||
|
||||
fn create_and_validate(data: &[u64], name: &str) {
|
||||
crate::tests::create_and_validate::<
|
||||
LinearInterpolFastFieldSerializer,
|
||||
LinearInterpolFastFieldReader,
|
||||
>(data, name);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_with_codec_data_sets() {
|
||||
let data_sets = get_codec_test_data_sets();
|
||||
for (mut data, name) in data_sets {
|
||||
create_and_validate(&data, name);
|
||||
data.reverse();
|
||||
create_and_validate(&data, name);
|
||||
}
|
||||
}
|
||||
#[test]
|
||||
fn linear_interpol_fast_field_test_large_amplitude() {
|
||||
let data = vec![
|
||||
i64::MAX as u64 / 2,
|
||||
i64::MAX as u64 / 3,
|
||||
i64::MAX as u64 / 2,
|
||||
];
|
||||
|
||||
create_and_validate(&data, "large amplitude");
|
||||
}
|
||||
#[test]
|
||||
fn linear_interpol_fast_concave_data() {
|
||||
let data = vec![0, 1, 2, 5, 8, 10, 20, 50];
|
||||
create_and_validate(&data, "concave data");
|
||||
}
|
||||
#[test]
|
||||
fn linear_interpol_fast_convex_data() {
|
||||
let data = vec![0, 40, 60, 70, 75, 77];
|
||||
create_and_validate(&data, "convex data");
|
||||
}
|
||||
#[test]
|
||||
fn linear_interpol_fast_field_test_simple() {
|
||||
let data = (10..=20_u64).collect::<Vec<_>>();
|
||||
|
||||
create_and_validate(&data, "simple monotonically");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn linear_interpol_fast_field_rand() {
|
||||
for _ in 0..5000 {
|
||||
let mut data = (0..50).map(|_| rand::random::<u64>()).collect::<Vec<_>>();
|
||||
create_and_validate(&data, "random");
|
||||
|
||||
data.reverse();
|
||||
create_and_validate(&data, "random");
|
||||
}
|
||||
}
|
||||
}
|
||||
127
fastfield_codecs/src/main.rs
Normal file
127
fastfield_codecs/src/main.rs
Normal file
@@ -0,0 +1,127 @@
|
||||
#[macro_use]
|
||||
extern crate prettytable;
|
||||
use fastfield_codecs::{
|
||||
linearinterpol::LinearInterpolFastFieldSerializer,
|
||||
multilinearinterpol::MultiLinearInterpolFastFieldSerializer, FastFieldCodecSerializer,
|
||||
FastFieldStats,
|
||||
};
|
||||
use prettytable::{Cell, Row, Table};
|
||||
|
||||
fn main() {
|
||||
let mut table = Table::new();
|
||||
|
||||
// Add a row per time
|
||||
table.add_row(row!["", "Compression Ratio", "Compression Estimation"]);
|
||||
|
||||
for (data, data_set_name) in get_codec_test_data_sets() {
|
||||
let mut results = vec![];
|
||||
let res = serialize_with_codec::<LinearInterpolFastFieldSerializer>(&data);
|
||||
results.push(res);
|
||||
let res = serialize_with_codec::<MultiLinearInterpolFastFieldSerializer>(&data);
|
||||
results.push(res);
|
||||
let res = serialize_with_codec::<fastfield_codecs::bitpacked::BitpackedFastFieldSerializer>(
|
||||
&data,
|
||||
);
|
||||
results.push(res);
|
||||
|
||||
//let best_estimation_codec = results
|
||||
//.iter()
|
||||
//.min_by(|res1, res2| res1.partial_cmp(&res2).unwrap())
|
||||
//.unwrap();
|
||||
let best_compression_ratio_codec = results
|
||||
.iter()
|
||||
.min_by(|res1, res2| res1.partial_cmp(res2).unwrap())
|
||||
.cloned()
|
||||
.unwrap();
|
||||
|
||||
table.add_row(Row::new(vec![Cell::new(data_set_name).style_spec("Bbb")]));
|
||||
for (is_applicable, est, comp, name) in results {
|
||||
let (est_cell, ratio_cell) = if !is_applicable {
|
||||
("Codec Disabled".to_string(), "".to_string())
|
||||
} else {
|
||||
(est.to_string(), comp.to_string())
|
||||
};
|
||||
#[allow(clippy::all)]
|
||||
let style = if comp == best_compression_ratio_codec.1 {
|
||||
"Fb"
|
||||
} else {
|
||||
""
|
||||
};
|
||||
|
||||
table.add_row(Row::new(vec![
|
||||
Cell::new(&name.to_string()).style_spec("bFg"),
|
||||
Cell::new(&ratio_cell).style_spec(style),
|
||||
Cell::new(&est_cell).style_spec(""),
|
||||
]));
|
||||
}
|
||||
}
|
||||
|
||||
table.printstd();
|
||||
}
|
||||
|
||||
pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
|
||||
let mut data_and_names = vec![];
|
||||
|
||||
let data = (1000..=200_000_u64).collect::<Vec<_>>();
|
||||
data_and_names.push((data, "Autoincrement"));
|
||||
|
||||
let mut current_cumulative = 0;
|
||||
let data = (1..=200_000_u64)
|
||||
.map(|num| {
|
||||
let num = (num as f32 + num as f32).log10() as u64;
|
||||
current_cumulative += num;
|
||||
current_cumulative
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
//let data = (1..=200000_u64).map(|num| num + num).collect::<Vec<_>>();
|
||||
data_and_names.push((data, "Monotonically increasing concave"));
|
||||
|
||||
let mut current_cumulative = 0;
|
||||
let data = (1..=200_000_u64)
|
||||
.map(|num| {
|
||||
let num = (200_000.0 - num as f32).log10() as u64;
|
||||
current_cumulative += num;
|
||||
current_cumulative
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
data_and_names.push((data, "Monotonically increasing convex"));
|
||||
|
||||
let data = (1000..=200_000_u64)
|
||||
.map(|num| num + rand::random::<u8>() as u64)
|
||||
.collect::<Vec<_>>();
|
||||
data_and_names.push((data, "Almost monotonically increasing"));
|
||||
|
||||
data_and_names
|
||||
}
|
||||
|
||||
pub fn serialize_with_codec<S: FastFieldCodecSerializer>(
|
||||
data: &[u64],
|
||||
) -> (bool, f32, f32, &'static str) {
|
||||
let is_applicable = S::is_applicable(&data, stats_from_vec(data));
|
||||
if !is_applicable {
|
||||
return (false, 0.0, 0.0, S::NAME);
|
||||
}
|
||||
let estimation = S::estimate(&data, stats_from_vec(data));
|
||||
let mut out = vec![];
|
||||
S::serialize(
|
||||
&mut out,
|
||||
&data,
|
||||
stats_from_vec(data),
|
||||
data.iter().cloned(),
|
||||
data.iter().cloned(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let actual_compression = out.len() as f32 / (data.len() * 8) as f32;
|
||||
(true, estimation, actual_compression, S::NAME)
|
||||
}
|
||||
|
||||
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
|
||||
let min_value = data.iter().cloned().min().unwrap_or(0);
|
||||
let max_value = data.iter().cloned().max().unwrap_or(0);
|
||||
FastFieldStats {
|
||||
min_value,
|
||||
max_value,
|
||||
num_vals: data.len() as u64,
|
||||
}
|
||||
}
|
||||
425
fastfield_codecs/src/multilinearinterpol.rs
Normal file
425
fastfield_codecs/src/multilinearinterpol.rs
Normal file
@@ -0,0 +1,425 @@
|
||||
/*!
|
||||
|
||||
MultiLinearInterpol compressor uses linear interpolation to guess a values and stores the offset, but in blocks of 512.
|
||||
|
||||
With a CHUNK_SIZE of 512 and 29 byte metadata per block, we get a overhead for metadata of 232 / 512 = 0,45 bits per element.
|
||||
The additional space required per element in a block is the the maximum deviation of the linear interpolation estimation function.
|
||||
|
||||
E.g. if the maximum deviation of an element is 12, all elements cost 4bits.
|
||||
|
||||
Size per block:
|
||||
Num Elements * Maximum Deviation from Interpolation + 29 Byte Metadata
|
||||
|
||||
*/
|
||||
|
||||
use crate::FastFieldCodecReader;
|
||||
use crate::FastFieldCodecSerializer;
|
||||
use crate::FastFieldDataAccess;
|
||||
use crate::FastFieldStats;
|
||||
use common::CountingWriter;
|
||||
use std::io::{self, Read, Write};
|
||||
use std::ops::Sub;
|
||||
use tantivy_bitpacker::compute_num_bits;
|
||||
use tantivy_bitpacker::BitPacker;
|
||||
|
||||
use common::BinarySerializable;
|
||||
use common::DeserializeFrom;
|
||||
use tantivy_bitpacker::BitUnpacker;
|
||||
|
||||
const CHUNK_SIZE: u64 = 512;
|
||||
|
||||
/// Depending on the field type, a different
|
||||
/// fast field is required.
|
||||
#[derive(Clone)]
|
||||
pub struct MultiLinearInterpolFastFieldReader {
|
||||
pub footer: MultiLinearInterpolFooter,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default)]
|
||||
struct Function {
|
||||
// The offset in the data is required, because we have diffrent bit_widths per block
|
||||
data_start_offset: u64,
|
||||
// start_pos in the block will be CHUNK_SIZE * BLOCK_NUM
|
||||
start_pos: u64,
|
||||
// only used during serialization, 0 after deserialization
|
||||
end_pos: u64,
|
||||
// only used during serialization, 0 after deserialization
|
||||
value_start_pos: u64,
|
||||
// only used during serialization, 0 after deserialization
|
||||
value_end_pos: u64,
|
||||
slope: f32,
|
||||
// The offset so that all values are positive when writing them
|
||||
positive_val_offset: u64,
|
||||
num_bits: u8,
|
||||
bit_unpacker: BitUnpacker,
|
||||
}
|
||||
|
||||
impl Function {
|
||||
fn calc_slope(&mut self) {
|
||||
let num_vals = self.end_pos - self.start_pos;
|
||||
get_slope(self.value_start_pos, self.value_end_pos, num_vals);
|
||||
}
|
||||
// split the interpolation into two function, change self and return the second split
|
||||
fn split(&mut self, split_pos: u64, split_pos_value: u64) -> Function {
|
||||
let mut new_function = Function {
|
||||
start_pos: split_pos,
|
||||
end_pos: self.end_pos,
|
||||
value_start_pos: split_pos_value,
|
||||
value_end_pos: self.value_end_pos,
|
||||
..Default::default()
|
||||
};
|
||||
new_function.calc_slope();
|
||||
self.end_pos = split_pos;
|
||||
self.value_end_pos = split_pos_value;
|
||||
self.calc_slope();
|
||||
new_function
|
||||
}
|
||||
}
|
||||
|
||||
impl BinarySerializable for Function {
|
||||
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
|
||||
self.data_start_offset.serialize(write)?;
|
||||
self.value_start_pos.serialize(write)?;
|
||||
self.positive_val_offset.serialize(write)?;
|
||||
self.slope.serialize(write)?;
|
||||
self.num_bits.serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Function> {
|
||||
let data_start_offset = u64::deserialize(reader)?;
|
||||
let value_start_pos = u64::deserialize(reader)?;
|
||||
let offset = u64::deserialize(reader)?;
|
||||
let slope = f32::deserialize(reader)?;
|
||||
let num_bits = u8::deserialize(reader)?;
|
||||
let interpolation = Function {
|
||||
data_start_offset,
|
||||
value_start_pos,
|
||||
positive_val_offset: offset,
|
||||
num_bits,
|
||||
bit_unpacker: BitUnpacker::new(num_bits),
|
||||
slope,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
Ok(interpolation)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct MultiLinearInterpolFooter {
|
||||
pub num_vals: u64,
|
||||
pub min_value: u64,
|
||||
pub max_value: u64,
|
||||
interpolations: Vec<Function>,
|
||||
}
|
||||
|
||||
impl BinarySerializable for MultiLinearInterpolFooter {
|
||||
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
|
||||
let mut out = vec![];
|
||||
self.num_vals.serialize(&mut out)?;
|
||||
self.min_value.serialize(&mut out)?;
|
||||
self.max_value.serialize(&mut out)?;
|
||||
self.interpolations.serialize(&mut out)?;
|
||||
write.write_all(&out)?;
|
||||
(out.len() as u32).serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<MultiLinearInterpolFooter> {
|
||||
let mut footer = MultiLinearInterpolFooter {
|
||||
num_vals: u64::deserialize(reader)?,
|
||||
min_value: u64::deserialize(reader)?,
|
||||
max_value: u64::deserialize(reader)?,
|
||||
interpolations: Vec::<Function>::deserialize(reader)?,
|
||||
};
|
||||
for (num, interpol) in footer.interpolations.iter_mut().enumerate() {
|
||||
interpol.start_pos = CHUNK_SIZE * num as u64;
|
||||
}
|
||||
Ok(footer)
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_interpolation_position(doc: u64) -> usize {
|
||||
let index = doc / CHUNK_SIZE;
|
||||
index as usize
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_interpolation_function(doc: u64, interpolations: &[Function]) -> &Function {
|
||||
&interpolations[get_interpolation_position(doc)]
|
||||
}
|
||||
|
||||
impl FastFieldCodecReader for MultiLinearInterpolFastFieldReader {
|
||||
/// Opens a fast field given a file.
|
||||
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
|
||||
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
|
||||
|
||||
let (_data, mut footer) = bytes.split_at(bytes.len() - (4 + footer_len) as usize);
|
||||
let footer = MultiLinearInterpolFooter::deserialize(&mut footer)?;
|
||||
|
||||
Ok(MultiLinearInterpolFastFieldReader { footer })
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
|
||||
let interpolation = get_interpolation_function(doc, &self.footer.interpolations);
|
||||
let doc = doc - interpolation.start_pos;
|
||||
let calculated_value =
|
||||
get_calculated_value(interpolation.value_start_pos, doc, interpolation.slope);
|
||||
let diff = interpolation
|
||||
.bit_unpacker
|
||||
.get(doc, &data[interpolation.data_start_offset as usize..]);
|
||||
(calculated_value + diff) - interpolation.positive_val_offset
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn min_value(&self) -> u64 {
|
||||
self.footer.min_value
|
||||
}
|
||||
#[inline]
|
||||
fn max_value(&self) -> u64 {
|
||||
self.footer.max_value
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_slope(first_val: u64, last_val: u64, num_vals: u64) -> f32 {
|
||||
((last_val as f64 - first_val as f64) / (num_vals as u64 - 1) as f64) as f32
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
|
||||
(first_val as i64 + (pos as f32 * slope) as i64) as u64
|
||||
}
|
||||
|
||||
/// Same as LinearInterpolFastFieldSerializer, but working on chunks of CHUNK_SIZE elements.
|
||||
pub struct MultiLinearInterpolFastFieldSerializer {}
|
||||
|
||||
impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
|
||||
const NAME: &'static str = "MultiLinearInterpol";
|
||||
const ID: u8 = 3;
|
||||
/// Creates a new fast field serializer.
|
||||
fn serialize(
|
||||
write: &mut impl Write,
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
_data_iter1: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()> {
|
||||
assert!(stats.min_value <= stats.max_value);
|
||||
|
||||
let first_val = fastfield_accessor.get_val(0);
|
||||
let last_val = fastfield_accessor.get_val(stats.num_vals as u64 - 1);
|
||||
|
||||
let mut first_function = Function {
|
||||
end_pos: stats.num_vals,
|
||||
value_start_pos: first_val,
|
||||
value_end_pos: last_val,
|
||||
..Default::default()
|
||||
};
|
||||
first_function.calc_slope();
|
||||
let mut interpolations = vec![first_function];
|
||||
|
||||
// Since we potentially apply multiple passes over the data, the data is cached.
|
||||
// Multiple iteration can be expensive (merge with index sorting can add lot of overhead per
|
||||
// iteration)
|
||||
let data = data_iter.collect::<Vec<_>>();
|
||||
|
||||
//// let's split this into chunks of CHUNK_SIZE
|
||||
for data_pos in (0..data.len() as u64).step_by(CHUNK_SIZE as usize).skip(1) {
|
||||
let new_fun = {
|
||||
let current_interpolation = interpolations.last_mut().unwrap();
|
||||
current_interpolation.split(data_pos, data[data_pos as usize])
|
||||
};
|
||||
interpolations.push(new_fun);
|
||||
}
|
||||
// calculate offset and max (-> numbits) for each function
|
||||
for interpolation in &mut interpolations {
|
||||
let mut offset = 0;
|
||||
let mut rel_positive_max = 0;
|
||||
for (pos, actual_value) in data
|
||||
[interpolation.start_pos as usize..interpolation.end_pos as usize]
|
||||
.iter()
|
||||
.cloned()
|
||||
.enumerate()
|
||||
{
|
||||
let calculated_value = get_calculated_value(
|
||||
interpolation.value_start_pos,
|
||||
pos as u64,
|
||||
interpolation.slope,
|
||||
);
|
||||
if calculated_value > actual_value {
|
||||
// negative value we need to apply an offset
|
||||
// we ignore negative values in the max value calculation, because negative values
|
||||
// will be offset to 0
|
||||
offset = offset.max(calculated_value - actual_value);
|
||||
} else {
|
||||
//positive value no offset reuqired
|
||||
rel_positive_max = rel_positive_max.max(actual_value - calculated_value);
|
||||
}
|
||||
}
|
||||
|
||||
interpolation.positive_val_offset = offset;
|
||||
interpolation.num_bits = compute_num_bits(rel_positive_max + offset);
|
||||
}
|
||||
let mut bit_packer = BitPacker::new();
|
||||
|
||||
let write = &mut CountingWriter::wrap(write);
|
||||
for interpolation in &mut interpolations {
|
||||
interpolation.data_start_offset = write.written_bytes();
|
||||
let num_bits = interpolation.num_bits;
|
||||
for (pos, actual_value) in data
|
||||
[interpolation.start_pos as usize..interpolation.end_pos as usize]
|
||||
.iter()
|
||||
.cloned()
|
||||
.enumerate()
|
||||
{
|
||||
let calculated_value = get_calculated_value(
|
||||
interpolation.value_start_pos,
|
||||
pos as u64,
|
||||
interpolation.slope,
|
||||
);
|
||||
let diff = (actual_value + interpolation.positive_val_offset) - calculated_value;
|
||||
bit_packer.write(diff, num_bits, write)?;
|
||||
}
|
||||
bit_packer.flush(write)?;
|
||||
}
|
||||
bit_packer.close(write)?;
|
||||
|
||||
let footer = MultiLinearInterpolFooter {
|
||||
num_vals: stats.num_vals,
|
||||
min_value: stats.min_value,
|
||||
max_value: stats.max_value,
|
||||
interpolations,
|
||||
};
|
||||
footer.serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn is_applicable(
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> bool {
|
||||
if stats.num_vals < 5_000 {
|
||||
return false;
|
||||
}
|
||||
// On serialization the offset is added to the actual value.
|
||||
// We need to make sure this won't run into overflow calculation issues.
|
||||
// For this we take the maximum theroretical offset and add this to the max value.
|
||||
// If this doesn't overflow the algortihm should be fine
|
||||
let theorethical_maximum_offset = stats.max_value - stats.min_value;
|
||||
if stats
|
||||
.max_value
|
||||
.checked_add(theorethical_maximum_offset)
|
||||
.is_none()
|
||||
{
|
||||
return false;
|
||||
}
|
||||
true
|
||||
}
|
||||
/// estimation for linear interpolation is hard because, you don't know
|
||||
/// where the local maxima are for the deviation of the calculated value and
|
||||
/// the offset is also unknown.
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
|
||||
let first_val_in_first_block = fastfield_accessor.get_val(0);
|
||||
let last_elem_in_first_chunk = CHUNK_SIZE.min(stats.num_vals);
|
||||
let last_val_in_first_block =
|
||||
fastfield_accessor.get_val(last_elem_in_first_chunk as u64 - 1);
|
||||
let slope = get_slope(
|
||||
first_val_in_first_block,
|
||||
last_val_in_first_block,
|
||||
stats.num_vals,
|
||||
);
|
||||
|
||||
// let's sample at 0%, 5%, 10% .. 95%, 100%, but for the first block only
|
||||
let sample_positions = (0..20)
|
||||
.map(|pos| (last_elem_in_first_chunk as f32 / 100.0 * pos as f32 * 5.0) as usize)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let max_distance = sample_positions
|
||||
.iter()
|
||||
.map(|pos| {
|
||||
let calculated_value =
|
||||
get_calculated_value(first_val_in_first_block, *pos as u64, slope);
|
||||
let actual_value = fastfield_accessor.get_val(*pos as u64);
|
||||
distance(calculated_value, actual_value)
|
||||
})
|
||||
.max()
|
||||
.unwrap();
|
||||
|
||||
// Estimate one block and extrapolate the cost to all blocks.
|
||||
// the theory would be that we don't have the actual max_distance, but we are close within 50%
|
||||
// threshold.
|
||||
// It is multiplied by 2 because in a log case scenario the line would be as much above as
|
||||
// below. So the offset would = max_distance
|
||||
//
|
||||
let relative_max_value = (max_distance as f32 * 1.5) * 2.0;
|
||||
|
||||
let num_bits = compute_num_bits(relative_max_value as u64) as u64 * stats.num_vals as u64
|
||||
// function metadata per block
|
||||
+ 29 * (stats.num_vals / CHUNK_SIZE);
|
||||
let num_bits_uncompressed = 64 * stats.num_vals;
|
||||
num_bits as f32 / num_bits_uncompressed as f32
|
||||
}
|
||||
}
|
||||
|
||||
fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
|
||||
if x < y {
|
||||
y - x
|
||||
} else {
|
||||
x - y
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tests::get_codec_test_data_sets;
|
||||
|
||||
fn create_and_validate(data: &[u64], name: &str) {
|
||||
crate::tests::create_and_validate::<
|
||||
MultiLinearInterpolFastFieldSerializer,
|
||||
MultiLinearInterpolFastFieldReader,
|
||||
>(data, name);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_with_codec_data_sets() {
|
||||
let data_sets = get_codec_test_data_sets();
|
||||
for (mut data, name) in data_sets {
|
||||
create_and_validate(&data, name);
|
||||
data.reverse();
|
||||
create_and_validate(&data, name);
|
||||
}
|
||||
}
|
||||
#[test]
|
||||
fn test_simple() {
|
||||
let data = (10..=20_u64).collect::<Vec<_>>();
|
||||
create_and_validate(&data, "simple monotonically");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn border_cases_1() {
|
||||
let data = (0..1024).collect::<Vec<_>>();
|
||||
create_and_validate(&data, "border case");
|
||||
}
|
||||
#[test]
|
||||
fn border_case_2() {
|
||||
let data = (0..1025).collect::<Vec<_>>();
|
||||
create_and_validate(&data, "border case");
|
||||
}
|
||||
#[test]
|
||||
fn rand() {
|
||||
for _ in 0..10 {
|
||||
let mut data = (5_000..20_000)
|
||||
.map(|_| rand::random::<u64>() as u64)
|
||||
.collect::<Vec<_>>();
|
||||
create_and_validate(&data, "random");
|
||||
|
||||
data.reverse();
|
||||
create_and_validate(&data, "random");
|
||||
}
|
||||
}
|
||||
}
|
||||
10
ownedbytes/Cargo.toml
Normal file
10
ownedbytes/Cargo.toml
Normal file
@@ -0,0 +1,10 @@
|
||||
[package]
|
||||
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
|
||||
name = "ownedbytes"
|
||||
version = "0.1.0"
|
||||
edition = "2018"
|
||||
description = "Expose data as static slice"
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
stable_deref_trait = "1.2.0"
|
||||
290
ownedbytes/src/lib.rs
Normal file
290
ownedbytes/src/lib.rs
Normal file
@@ -0,0 +1,290 @@
|
||||
use stable_deref_trait::StableDeref;
|
||||
use std::convert::TryInto;
|
||||
use std::mem;
|
||||
use std::ops::{Deref, Range};
|
||||
use std::sync::Arc;
|
||||
use std::{fmt, io};
|
||||
|
||||
/// An OwnedBytes simply wraps an object that owns a slice of data and exposes
|
||||
/// this data as a static slice.
|
||||
///
|
||||
/// The backing object is required to be `StableDeref`.
|
||||
#[derive(Clone)]
|
||||
pub struct OwnedBytes {
|
||||
data: &'static [u8],
|
||||
box_stable_deref: Arc<dyn Deref<Target = [u8]> + Sync + Send>,
|
||||
}
|
||||
|
||||
impl OwnedBytes {
|
||||
/// Creates an empty `OwnedBytes`.
|
||||
pub fn empty() -> OwnedBytes {
|
||||
OwnedBytes::new(&[][..])
|
||||
}
|
||||
|
||||
/// Creates an `OwnedBytes` intance given a `StableDeref` object.
|
||||
pub fn new<T: StableDeref + Deref<Target = [u8]> + 'static + Send + Sync>(
|
||||
data_holder: T,
|
||||
) -> OwnedBytes {
|
||||
let box_stable_deref = Arc::new(data_holder);
|
||||
let bytes: &[u8] = box_stable_deref.as_ref();
|
||||
let data = unsafe { mem::transmute::<_, &'static [u8]>(bytes.deref()) };
|
||||
OwnedBytes {
|
||||
data,
|
||||
box_stable_deref,
|
||||
}
|
||||
}
|
||||
|
||||
/// creates a fileslice that is just a view over a slice of the data.
|
||||
pub fn slice(&self, range: Range<usize>) -> Self {
|
||||
OwnedBytes {
|
||||
data: &self.data[range],
|
||||
box_stable_deref: self.box_stable_deref.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the underlying slice of data.
|
||||
/// `Deref` and `AsRef` are also available.
|
||||
#[inline]
|
||||
pub fn as_slice(&self) -> &[u8] {
|
||||
self.data
|
||||
}
|
||||
|
||||
/// Returns the len of the slice.
|
||||
#[inline]
|
||||
pub fn len(&self) -> usize {
|
||||
self.data.len()
|
||||
}
|
||||
|
||||
/// Splits the OwnedBytes into two OwnedBytes `(left, right)`.
|
||||
///
|
||||
/// Left will hold `split_len` bytes.
|
||||
///
|
||||
/// This operation is cheap and does not require to copy any memory.
|
||||
/// On the other hand, both `left` and `right` retain a handle over
|
||||
/// the entire slice of memory. In other words, the memory will only
|
||||
/// be released when both left and right are dropped.
|
||||
pub fn split(self, split_len: usize) -> (OwnedBytes, OwnedBytes) {
|
||||
let right_box_stable_deref = self.box_stable_deref.clone();
|
||||
let left = OwnedBytes {
|
||||
data: &self.data[..split_len],
|
||||
box_stable_deref: self.box_stable_deref,
|
||||
};
|
||||
let right = OwnedBytes {
|
||||
data: &self.data[split_len..],
|
||||
box_stable_deref: right_box_stable_deref,
|
||||
};
|
||||
(left, right)
|
||||
}
|
||||
|
||||
/// Returns true iff this `OwnedBytes` is empty.
|
||||
#[inline]
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.as_slice().is_empty()
|
||||
}
|
||||
|
||||
/// Drops the left most `advance_len` bytes.
|
||||
///
|
||||
/// See also [.clip(clip_len: usize))](#method.clip).
|
||||
#[inline]
|
||||
pub fn advance(&mut self, advance_len: usize) {
|
||||
self.data = &self.data[advance_len..]
|
||||
}
|
||||
|
||||
/// Reads an `u8` from the `OwnedBytes` and advance by one byte.
|
||||
#[inline]
|
||||
pub fn read_u8(&mut self) -> u8 {
|
||||
assert!(!self.is_empty());
|
||||
|
||||
let byte = self.as_slice()[0];
|
||||
self.advance(1);
|
||||
byte
|
||||
}
|
||||
|
||||
/// Reads an `u64` encoded as little-endian from the `OwnedBytes` and advance by 8 bytes.
|
||||
#[inline]
|
||||
pub fn read_u64(&mut self) -> u64 {
|
||||
assert!(self.len() > 7);
|
||||
|
||||
let octlet: [u8; 8] = self.as_slice()[..8].try_into().unwrap();
|
||||
self.advance(8);
|
||||
u64::from_le_bytes(octlet)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for OwnedBytes {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
// We truncate the bytes in order to make sure the debug string
|
||||
// is not too long.
|
||||
let bytes_truncated: &[u8] = if self.len() > 8 {
|
||||
&self.as_slice()[..10]
|
||||
} else {
|
||||
self.as_slice()
|
||||
};
|
||||
write!(f, "OwnedBytes({:?}, len={})", bytes_truncated, self.len())
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for OwnedBytes {
|
||||
type Target = [u8];
|
||||
|
||||
#[inline]
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.as_slice()
|
||||
}
|
||||
}
|
||||
|
||||
impl io::Read for OwnedBytes {
|
||||
#[inline]
|
||||
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
||||
let read_len = {
|
||||
let data = self.as_slice();
|
||||
if data.len() >= buf.len() {
|
||||
let buf_len = buf.len();
|
||||
buf.copy_from_slice(&data[..buf_len]);
|
||||
buf.len()
|
||||
} else {
|
||||
let data_len = data.len();
|
||||
buf[..data_len].copy_from_slice(data);
|
||||
data_len
|
||||
}
|
||||
};
|
||||
self.advance(read_len);
|
||||
Ok(read_len)
|
||||
}
|
||||
#[inline]
|
||||
fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
|
||||
let read_len = {
|
||||
let data = self.as_slice();
|
||||
buf.extend(data);
|
||||
data.len()
|
||||
};
|
||||
self.advance(read_len);
|
||||
Ok(read_len)
|
||||
}
|
||||
#[inline]
|
||||
fn read_exact(&mut self, buf: &mut [u8]) -> io::Result<()> {
|
||||
let read_len = self.read(buf)?;
|
||||
if read_len != buf.len() {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::UnexpectedEof,
|
||||
"failed to fill whole buffer",
|
||||
));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl AsRef<[u8]> for OwnedBytes {
|
||||
#[inline]
|
||||
fn as_ref(&self) -> &[u8] {
|
||||
self.as_slice()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::io::{self, Read};
|
||||
|
||||
use super::OwnedBytes;
|
||||
|
||||
#[test]
|
||||
fn test_owned_bytes_debug() {
|
||||
let short_bytes = OwnedBytes::new(b"abcd".as_ref());
|
||||
assert_eq!(
|
||||
format!("{:?}", short_bytes),
|
||||
"OwnedBytes([97, 98, 99, 100], len=4)"
|
||||
);
|
||||
let long_bytes = OwnedBytes::new(b"abcdefghijklmnopq".as_ref());
|
||||
assert_eq!(
|
||||
format!("{:?}", long_bytes),
|
||||
"OwnedBytes([97, 98, 99, 100, 101, 102, 103, 104, 105, 106], len=17)"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_owned_bytes_read() -> io::Result<()> {
|
||||
let mut bytes = OwnedBytes::new(b"abcdefghiklmnopqrstuvwxyz".as_ref());
|
||||
{
|
||||
let mut buf = [0u8; 5];
|
||||
bytes.read_exact(&mut buf[..]).unwrap();
|
||||
assert_eq!(&buf, b"abcde");
|
||||
assert_eq!(bytes.as_slice(), b"fghiklmnopqrstuvwxyz")
|
||||
}
|
||||
{
|
||||
let mut buf = [0u8; 2];
|
||||
bytes.read_exact(&mut buf[..]).unwrap();
|
||||
assert_eq!(&buf, b"fg");
|
||||
assert_eq!(bytes.as_slice(), b"hiklmnopqrstuvwxyz")
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_owned_bytes_read_right_at_the_end() -> io::Result<()> {
|
||||
let mut bytes = OwnedBytes::new(b"abcde".as_ref());
|
||||
let mut buf = [0u8; 5];
|
||||
assert_eq!(bytes.read(&mut buf[..]).unwrap(), 5);
|
||||
assert_eq!(&buf, b"abcde");
|
||||
assert_eq!(bytes.as_slice(), b"");
|
||||
assert_eq!(bytes.read(&mut buf[..]).unwrap(), 0);
|
||||
assert_eq!(&buf, b"abcde");
|
||||
Ok(())
|
||||
}
|
||||
#[test]
|
||||
fn test_owned_bytes_read_incomplete() -> io::Result<()> {
|
||||
let mut bytes = OwnedBytes::new(b"abcde".as_ref());
|
||||
let mut buf = [0u8; 7];
|
||||
assert_eq!(bytes.read(&mut buf[..]).unwrap(), 5);
|
||||
assert_eq!(&buf[..5], b"abcde");
|
||||
assert_eq!(bytes.read(&mut buf[..]).unwrap(), 0);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_owned_bytes_read_to_end() -> io::Result<()> {
|
||||
let mut bytes = OwnedBytes::new(b"abcde".as_ref());
|
||||
let mut buf = Vec::new();
|
||||
bytes.read_to_end(&mut buf)?;
|
||||
assert_eq!(buf.as_slice(), b"abcde".as_ref());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_owned_bytes_read_u8() -> io::Result<()> {
|
||||
let mut bytes = OwnedBytes::new(b"\xFF".as_ref());
|
||||
assert_eq!(bytes.read_u8(), 255);
|
||||
assert_eq!(bytes.len(), 0);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_owned_bytes_read_u64() -> io::Result<()> {
|
||||
let mut bytes = OwnedBytes::new(b"\0\xFF\xFF\xFF\xFF\xFF\xFF\xFF".as_ref());
|
||||
assert_eq!(bytes.read_u64(), u64::MAX - 255);
|
||||
assert_eq!(bytes.len(), 0);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_owned_bytes_split() {
|
||||
let bytes = OwnedBytes::new(b"abcdefghi".as_ref());
|
||||
let (left, right) = bytes.split(3);
|
||||
assert_eq!(left.as_slice(), b"abc");
|
||||
assert_eq!(right.as_slice(), b"defghi");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_owned_bytes_split_boundary() {
|
||||
let bytes = OwnedBytes::new(b"abcdefghi".as_ref());
|
||||
{
|
||||
let (left, right) = bytes.clone().split(0);
|
||||
assert_eq!(left.as_slice(), b"");
|
||||
assert_eq!(right.as_slice(), b"abcdefghi");
|
||||
}
|
||||
{
|
||||
let (left, right) = bytes.split(9);
|
||||
assert_eq!(left.as_slice(), b"abcdefghi");
|
||||
assert_eq!(right.as_slice(), b"");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tantivy-query-grammar"
|
||||
version = "0.14.0"
|
||||
version = "0.15.0"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = ["database-implementations", "data-structures"]
|
||||
@@ -14,3 +14,5 @@ edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
combine = {version="4", default-features=false, features=[] }
|
||||
once_cell = "1.7.2"
|
||||
regex ={ version = "1.5.4", default-features = false, features = ["std"] }
|
||||
|
||||
@@ -5,11 +5,11 @@ use combine::parser::Parser;
|
||||
|
||||
pub use crate::occur::Occur;
|
||||
use crate::query_grammar::parse_to_ast;
|
||||
pub use crate::user_input_ast::{UserInputAST, UserInputBound, UserInputLeaf, UserInputLiteral};
|
||||
pub use crate::user_input_ast::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral};
|
||||
|
||||
pub struct Error;
|
||||
|
||||
pub fn parse_query(query: &str) -> Result<UserInputAST, Error> {
|
||||
pub fn parse_query(query: &str) -> Result<UserInputAst, Error> {
|
||||
let (user_input_ast, _remaining) = parse_to_ast().parse(query).map_err(|_| Error)?;
|
||||
Ok(user_input_ast)
|
||||
}
|
||||
|
||||
@@ -1,21 +1,44 @@
|
||||
use super::user_input_ast::{UserInputAST, UserInputBound, UserInputLeaf, UserInputLiteral};
|
||||
use super::user_input_ast::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral};
|
||||
use crate::Occur;
|
||||
use combine::error::StringStreamError;
|
||||
use combine::parser::char::{char, digit, letter, space, spaces, string};
|
||||
use combine::parser::char::{char, digit, space, spaces, string};
|
||||
use combine::parser::range::{take_while, take_while1};
|
||||
use combine::parser::repeat::escaped;
|
||||
use combine::parser::Parser;
|
||||
use combine::{
|
||||
attempt, choice, eof, many, many1, one_of, optional, parser, satisfy, skip_many1, value,
|
||||
};
|
||||
use combine::{error::StringStreamError, parser::combinator::recognize};
|
||||
use once_cell::sync::Lazy;
|
||||
use regex::Regex;
|
||||
|
||||
fn field<'a>() -> impl Parser<&'a str, Output = String> {
|
||||
(
|
||||
(letter().or(char('_'))),
|
||||
many(satisfy(|c: char| {
|
||||
c.is_alphanumeric() || c == '_' || c == '-'
|
||||
})),
|
||||
)
|
||||
.skip(char(':'))
|
||||
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
|
||||
// Note: '-' char is only forbidden at the beginning of a field name, would be clearer to add it to special characters.
|
||||
const SPECIAL_CHARS: &[char] = &[
|
||||
'+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '~', '!', '\\', '*', ' ',
|
||||
];
|
||||
const ESCAPED_SPECIAL_CHARS_PATTERN: &str = r#"\\(\+|\^|`|:|\{|\}|"|\[|\]|\(|\)|\~|!|\\|\*| )"#;
|
||||
|
||||
/// Parses a field_name
|
||||
/// A field name must have at least one character and be followed by a colon.
|
||||
/// All characters are allowed including special characters `SPECIAL_CHARS`, but these
|
||||
/// need to be escaped with a backslack character '\'.
|
||||
fn field_name<'a>() -> impl Parser<&'a str, Output = String> {
|
||||
static ESCAPED_SPECIAL_CHARS_RE: Lazy<Regex> =
|
||||
Lazy::new(|| Regex::new(ESCAPED_SPECIAL_CHARS_PATTERN).unwrap());
|
||||
|
||||
recognize::<String, _, _>(escaped(
|
||||
(
|
||||
take_while1(|c| !SPECIAL_CHARS.contains(&c) && c != '-'),
|
||||
take_while(|c| !SPECIAL_CHARS.contains(&c)),
|
||||
),
|
||||
'\\',
|
||||
satisfy(|c| SPECIAL_CHARS.contains(&c)),
|
||||
))
|
||||
.skip(char(':'))
|
||||
.map(|s| ESCAPED_SPECIAL_CHARS_RE.replace_all(&s, "$1").to_string())
|
||||
.and_then(|s: String| match s.is_empty() {
|
||||
true => Err(StringStreamError::UnexpectedParse),
|
||||
_ => Ok(s),
|
||||
})
|
||||
}
|
||||
|
||||
fn word<'a>() -> impl Parser<&'a str, Output = String> {
|
||||
@@ -35,6 +58,62 @@ fn word<'a>() -> impl Parser<&'a str, Output = String> {
|
||||
})
|
||||
}
|
||||
|
||||
/// Parses a date time according to rfc3339
|
||||
/// 2015-08-02T18:54:42+02
|
||||
/// 2021-04-13T19:46:26.266051969+00:00
|
||||
///
|
||||
/// NOTE: also accepts 999999-99-99T99:99:99.266051969+99:99
|
||||
/// We delegate rejecting such invalid dates to the logical AST compuation code
|
||||
/// which invokes chrono::DateTime::parse_from_rfc3339 on the value to actually parse
|
||||
/// it (instead of merely extracting the datetime value as string as done here).
|
||||
fn date_time<'a>() -> impl Parser<&'a str, Output = String> {
|
||||
let two_digits = || recognize::<String, _, _>((digit(), digit()));
|
||||
|
||||
// Parses a time zone
|
||||
// -06:30
|
||||
// Z
|
||||
let time_zone = {
|
||||
let utc = recognize::<String, _, _>(char('Z'));
|
||||
let offset = recognize((
|
||||
choice([char('-'), char('+')]),
|
||||
two_digits(),
|
||||
char(':'),
|
||||
two_digits(),
|
||||
));
|
||||
|
||||
utc.or(offset)
|
||||
};
|
||||
|
||||
// Parses a date
|
||||
// 2010-01-30
|
||||
let date = {
|
||||
recognize::<String, _, _>((
|
||||
many1::<String, _, _>(digit()),
|
||||
char('-'),
|
||||
two_digits(),
|
||||
char('-'),
|
||||
two_digits(),
|
||||
))
|
||||
};
|
||||
|
||||
// Parses a time
|
||||
// 12:30:02
|
||||
// 19:46:26.266051969
|
||||
let time = {
|
||||
recognize::<String, _, _>((
|
||||
two_digits(),
|
||||
char(':'),
|
||||
two_digits(),
|
||||
char(':'),
|
||||
two_digits(),
|
||||
optional((char('.'), many1::<String, _, _>(digit()))),
|
||||
time_zone,
|
||||
))
|
||||
};
|
||||
|
||||
recognize((date, char('T'), time))
|
||||
}
|
||||
|
||||
fn term_val<'a>() -> impl Parser<&'a str, Output = String> {
|
||||
let phrase = char('"').with(many1(satisfy(|c| c != '"'))).skip(char('"'));
|
||||
phrase.or(word())
|
||||
@@ -42,7 +121,7 @@ fn term_val<'a>() -> impl Parser<&'a str, Output = String> {
|
||||
|
||||
fn term_query<'a>() -> impl Parser<&'a str, Output = UserInputLiteral> {
|
||||
let term_val_with_field = negative_number().or(term_val());
|
||||
(field(), term_val_with_field).map(|(field_name, phrase)| UserInputLiteral {
|
||||
(field_name(), term_val_with_field).map(|(field_name, phrase)| UserInputLiteral {
|
||||
field_name: Some(field_name),
|
||||
phrase,
|
||||
})
|
||||
@@ -83,7 +162,8 @@ fn spaces1<'a>() -> impl Parser<&'a str, Output = ()> {
|
||||
/// [a TO *], [a TO c], [abc TO bcd}
|
||||
fn range<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
|
||||
let range_term_val = || {
|
||||
word()
|
||||
attempt(date_time())
|
||||
.or(word())
|
||||
.or(negative_number())
|
||||
.or(char('*').with(value("*".to_string())))
|
||||
};
|
||||
@@ -138,7 +218,7 @@ fn range<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
|
||||
);
|
||||
|
||||
(
|
||||
optional(field()).skip(spaces()),
|
||||
optional(field_name()).skip(spaces()),
|
||||
// try elastic first, if it matches, the range is unbounded
|
||||
attempt(elastic_unbounded_range).or(lower_to_upper),
|
||||
)
|
||||
@@ -152,21 +232,21 @@ fn range<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
|
||||
})
|
||||
}
|
||||
|
||||
fn negate(expr: UserInputAST) -> UserInputAST {
|
||||
fn negate(expr: UserInputAst) -> UserInputAst {
|
||||
expr.unary(Occur::MustNot)
|
||||
}
|
||||
|
||||
fn leaf<'a>() -> impl Parser<&'a str, Output = UserInputAST> {
|
||||
fn leaf<'a>() -> impl Parser<&'a str, Output = UserInputAst> {
|
||||
parser(|input| {
|
||||
char('(')
|
||||
.with(ast())
|
||||
.skip(char(')'))
|
||||
.or(char('*').map(|_| UserInputAST::from(UserInputLeaf::All)))
|
||||
.or(char('*').map(|_| UserInputAst::from(UserInputLeaf::All)))
|
||||
.or(attempt(
|
||||
string("NOT").skip(spaces1()).with(leaf()).map(negate),
|
||||
))
|
||||
.or(attempt(range().map(UserInputAST::from)))
|
||||
.or(literal().map(UserInputAST::from))
|
||||
.or(attempt(range().map(UserInputAst::from)))
|
||||
.or(literal().map(UserInputAst::from))
|
||||
.parse_stream(input)
|
||||
.into_result()
|
||||
})
|
||||
@@ -178,7 +258,7 @@ fn occur_symbol<'a>() -> impl Parser<&'a str, Output = Occur> {
|
||||
.or(char('+').map(|_| Occur::Must))
|
||||
}
|
||||
|
||||
fn occur_leaf<'a>() -> impl Parser<&'a str, Output = (Option<Occur>, UserInputAST)> {
|
||||
fn occur_leaf<'a>() -> impl Parser<&'a str, Output = (Option<Occur>, UserInputAst)> {
|
||||
(optional(occur_symbol()), boosted_leaf())
|
||||
}
|
||||
|
||||
@@ -199,10 +279,10 @@ fn boost<'a>() -> impl Parser<&'a str, Output = f64> {
|
||||
(char('^'), positive_float_number()).map(|(_, boost)| boost)
|
||||
}
|
||||
|
||||
fn boosted_leaf<'a>() -> impl Parser<&'a str, Output = UserInputAST> {
|
||||
fn boosted_leaf<'a>() -> impl Parser<&'a str, Output = UserInputAst> {
|
||||
(leaf(), optional(boost())).map(|(leaf, boost_opt)| match boost_opt {
|
||||
Some(boost) if (boost - 1.0).abs() > std::f64::EPSILON => {
|
||||
UserInputAST::Boost(Box::new(leaf), boost)
|
||||
UserInputAst::Boost(Box::new(leaf), boost)
|
||||
}
|
||||
_ => leaf,
|
||||
})
|
||||
@@ -221,10 +301,10 @@ fn binary_operand<'a>() -> impl Parser<&'a str, Output = BinaryOperand> {
|
||||
}
|
||||
|
||||
fn aggregate_binary_expressions(
|
||||
left: UserInputAST,
|
||||
others: Vec<(BinaryOperand, UserInputAST)>,
|
||||
) -> UserInputAST {
|
||||
let mut dnf: Vec<Vec<UserInputAST>> = vec![vec![left]];
|
||||
left: UserInputAst,
|
||||
others: Vec<(BinaryOperand, UserInputAst)>,
|
||||
) -> UserInputAst {
|
||||
let mut dnf: Vec<Vec<UserInputAst>> = vec![vec![left]];
|
||||
for (operator, operand_ast) in others {
|
||||
match operator {
|
||||
BinaryOperand::And => {
|
||||
@@ -238,33 +318,33 @@ fn aggregate_binary_expressions(
|
||||
}
|
||||
}
|
||||
if dnf.len() == 1 {
|
||||
UserInputAST::and(dnf.into_iter().next().unwrap()) //< safe
|
||||
UserInputAst::and(dnf.into_iter().next().unwrap()) //< safe
|
||||
} else {
|
||||
let conjunctions = dnf.into_iter().map(UserInputAST::and).collect();
|
||||
UserInputAST::or(conjunctions)
|
||||
let conjunctions = dnf.into_iter().map(UserInputAst::and).collect();
|
||||
UserInputAst::or(conjunctions)
|
||||
}
|
||||
}
|
||||
|
||||
fn operand_leaf<'a>() -> impl Parser<&'a str, Output = (BinaryOperand, UserInputAST)> {
|
||||
fn operand_leaf<'a>() -> impl Parser<&'a str, Output = (BinaryOperand, UserInputAst)> {
|
||||
(
|
||||
binary_operand().skip(spaces()),
|
||||
boosted_leaf().skip(spaces()),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn ast<'a>() -> impl Parser<&'a str, Output = UserInputAST> {
|
||||
pub fn ast<'a>() -> impl Parser<&'a str, Output = UserInputAst> {
|
||||
let boolean_expr = (boosted_leaf().skip(spaces()), many1(operand_leaf()))
|
||||
.map(|(left, right)| aggregate_binary_expressions(left, right));
|
||||
let whitespace_separated_leaves = many1(occur_leaf().skip(spaces().silent())).map(
|
||||
|subqueries: Vec<(Option<Occur>, UserInputAST)>| {
|
||||
|subqueries: Vec<(Option<Occur>, UserInputAst)>| {
|
||||
if subqueries.len() == 1 {
|
||||
let (occur_opt, ast) = subqueries.into_iter().next().unwrap();
|
||||
match occur_opt.unwrap_or(Occur::Should) {
|
||||
Occur::Must | Occur::Should => ast,
|
||||
Occur::MustNot => UserInputAST::Clause(vec![(Some(Occur::MustNot), ast)]),
|
||||
Occur::MustNot => UserInputAst::Clause(vec![(Some(Occur::MustNot), ast)]),
|
||||
}
|
||||
} else {
|
||||
UserInputAST::Clause(subqueries.into_iter().collect())
|
||||
UserInputAst::Clause(subqueries.into_iter().collect())
|
||||
}
|
||||
},
|
||||
);
|
||||
@@ -272,10 +352,10 @@ pub fn ast<'a>() -> impl Parser<&'a str, Output = UserInputAST> {
|
||||
spaces().with(expr).skip(spaces())
|
||||
}
|
||||
|
||||
pub fn parse_to_ast<'a>() -> impl Parser<&'a str, Output = UserInputAST> {
|
||||
pub fn parse_to_ast<'a>() -> impl Parser<&'a str, Output = UserInputAst> {
|
||||
spaces()
|
||||
.with(optional(ast()).skip(eof()))
|
||||
.map(|opt_ast| opt_ast.unwrap_or_else(UserInputAST::empty_query))
|
||||
.map(|opt_ast| opt_ast.unwrap_or_else(UserInputAst::empty_query))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -324,6 +404,22 @@ mod test {
|
||||
error_parse("-1.");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_date_time() {
|
||||
let (val, remaining) = date_time()
|
||||
.parse("2015-08-02T18:54:42+02:30")
|
||||
.expect("cannot parse date");
|
||||
assert_eq!(val, "2015-08-02T18:54:42+02:30");
|
||||
assert_eq!(remaining, "");
|
||||
assert!(date_time().parse("2015-08-02T18:54:42+02").is_err());
|
||||
|
||||
let (val, remaining) = date_time()
|
||||
.parse("2021-04-13T19:46:26.266051969+00:00")
|
||||
.expect("cannot parse fractional date");
|
||||
assert_eq!(val, "2021-04-13T19:46:26.266051969+00:00");
|
||||
assert_eq!(remaining, "");
|
||||
}
|
||||
|
||||
fn test_parse_query_to_ast_helper(query: &str, expected: &str) {
|
||||
let query = parse_to_ast().parse(query).unwrap().0;
|
||||
let query_str = format!("{:?}", query);
|
||||
@@ -391,21 +487,21 @@ mod test {
|
||||
|
||||
#[test]
|
||||
fn test_parse_elastic_query_ranges() {
|
||||
test_parse_query_to_ast_helper("title: >a", "title:{\"a\" TO \"*\"}");
|
||||
test_parse_query_to_ast_helper("title:>=a", "title:[\"a\" TO \"*\"}");
|
||||
test_parse_query_to_ast_helper("title: <a", "title:{\"*\" TO \"a\"}");
|
||||
test_parse_query_to_ast_helper("title:<=a", "title:{\"*\" TO \"a\"]");
|
||||
test_parse_query_to_ast_helper("title:<=bsd", "title:{\"*\" TO \"bsd\"]");
|
||||
test_parse_query_to_ast_helper("title: >a", "\"title\":{\"a\" TO \"*\"}");
|
||||
test_parse_query_to_ast_helper("title:>=a", "\"title\":[\"a\" TO \"*\"}");
|
||||
test_parse_query_to_ast_helper("title: <a", "\"title\":{\"*\" TO \"a\"}");
|
||||
test_parse_query_to_ast_helper("title:<=a", "\"title\":{\"*\" TO \"a\"]");
|
||||
test_parse_query_to_ast_helper("title:<=bsd", "\"title\":{\"*\" TO \"bsd\"]");
|
||||
|
||||
test_parse_query_to_ast_helper("weight: >70", "weight:{\"70\" TO \"*\"}");
|
||||
test_parse_query_to_ast_helper("weight:>=70", "weight:[\"70\" TO \"*\"}");
|
||||
test_parse_query_to_ast_helper("weight: <70", "weight:{\"*\" TO \"70\"}");
|
||||
test_parse_query_to_ast_helper("weight:<=70", "weight:{\"*\" TO \"70\"]");
|
||||
test_parse_query_to_ast_helper("weight: >60.7", "weight:{\"60.7\" TO \"*\"}");
|
||||
test_parse_query_to_ast_helper("weight: >70", "\"weight\":{\"70\" TO \"*\"}");
|
||||
test_parse_query_to_ast_helper("weight:>=70", "\"weight\":[\"70\" TO \"*\"}");
|
||||
test_parse_query_to_ast_helper("weight: <70", "\"weight\":{\"*\" TO \"70\"}");
|
||||
test_parse_query_to_ast_helper("weight:<=70", "\"weight\":{\"*\" TO \"70\"]");
|
||||
test_parse_query_to_ast_helper("weight: >60.7", "\"weight\":{\"60.7\" TO \"*\"}");
|
||||
|
||||
test_parse_query_to_ast_helper("weight: <= 70", "weight:{\"*\" TO \"70\"]");
|
||||
test_parse_query_to_ast_helper("weight: <= 70", "\"weight\":{\"*\" TO \"70\"]");
|
||||
|
||||
test_parse_query_to_ast_helper("weight: <= 70.5", "weight:{\"*\" TO \"70.5\"]");
|
||||
test_parse_query_to_ast_helper("weight: <= 70.5", "\"weight\":{\"*\" TO \"70.5\"]");
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -418,44 +514,100 @@ mod test {
|
||||
#[test]
|
||||
fn test_field_name() -> TestParseResult {
|
||||
assert_eq!(
|
||||
super::field().parse("my-field-name:a")?,
|
||||
("my-field-name".to_string(), "a")
|
||||
super::field_name().parse(".my.field.name:a"),
|
||||
Ok((".my.field.name".to_string(), "a"))
|
||||
);
|
||||
assert_eq!(
|
||||
super::field().parse("my_field_name:a")?,
|
||||
("my_field_name".to_string(), "a")
|
||||
super::field_name().parse("my\\ field\\ name:a"),
|
||||
Ok(("my field name".to_string(), "a"))
|
||||
);
|
||||
assert!(super::field().parse(":a").is_err());
|
||||
assert!(super::field().parse("-my_field:a").is_err());
|
||||
assert!(super::field_name().parse("my field:a").is_err());
|
||||
assert_eq!(
|
||||
super::field().parse("_my_field:a")?,
|
||||
super::field_name().parse("\\(1\\+1\\):2"),
|
||||
Ok(("(1+1)".to_string(), "2"))
|
||||
);
|
||||
assert_eq!(
|
||||
super::field_name().parse("my_field_name:a"),
|
||||
Ok(("my_field_name".to_string(), "a"))
|
||||
);
|
||||
assert!(super::field_name().parse("my_field_name").is_err());
|
||||
assert!(super::field_name().parse(":a").is_err());
|
||||
assert!(super::field_name().parse("-my_field:a").is_err());
|
||||
assert_eq!(
|
||||
super::field_name().parse("_my_field:a")?,
|
||||
("_my_field".to_string(), "a")
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_field_name_re() {
|
||||
let escaped_special_chars_re = Regex::new(ESCAPED_SPECIAL_CHARS_PATTERN).unwrap();
|
||||
for special_char in SPECIAL_CHARS.iter() {
|
||||
assert_eq!(
|
||||
escaped_special_chars_re.replace_all(&format!("\\{}", special_char), "$1"),
|
||||
special_char.to_string()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_range_parser() {
|
||||
// testing the range() parser separately
|
||||
let res = range().parse("title: <hello").unwrap().0;
|
||||
let res = range()
|
||||
.parse("title: <hello")
|
||||
.expect("Cannot parse felxible bound word")
|
||||
.0;
|
||||
let expected = UserInputLeaf::Range {
|
||||
field: Some("title".to_string()),
|
||||
lower: UserInputBound::Unbounded,
|
||||
upper: UserInputBound::Exclusive("hello".to_string()),
|
||||
};
|
||||
let res2 = range().parse("title:{* TO hello}").unwrap().0;
|
||||
let res2 = range()
|
||||
.parse("title:{* TO hello}")
|
||||
.expect("Cannot parse ununbounded to word")
|
||||
.0;
|
||||
assert_eq!(res, expected);
|
||||
assert_eq!(res2, expected);
|
||||
|
||||
let expected_weight = UserInputLeaf::Range {
|
||||
field: Some("weight".to_string()),
|
||||
lower: UserInputBound::Inclusive("71.2".to_string()),
|
||||
upper: UserInputBound::Unbounded,
|
||||
};
|
||||
|
||||
let res3 = range().parse("weight: >=71.2").unwrap().0;
|
||||
let res4 = range().parse("weight:[71.2 TO *}").unwrap().0;
|
||||
let res3 = range()
|
||||
.parse("weight: >=71.2")
|
||||
.expect("Cannot parse flexible bound float")
|
||||
.0;
|
||||
let res4 = range()
|
||||
.parse("weight:[71.2 TO *}")
|
||||
.expect("Cannot parse float to unbounded")
|
||||
.0;
|
||||
assert_eq!(res3, expected_weight);
|
||||
assert_eq!(res4, expected_weight);
|
||||
|
||||
let expected_dates = UserInputLeaf::Range {
|
||||
field: Some("date_field".to_string()),
|
||||
lower: UserInputBound::Exclusive("2015-08-02T18:54:42Z".to_string()),
|
||||
upper: UserInputBound::Inclusive("2021-08-02T18:54:42+02:30".to_string()),
|
||||
};
|
||||
let res5 = range()
|
||||
.parse("date_field:{2015-08-02T18:54:42Z TO 2021-08-02T18:54:42+02:30]")
|
||||
.expect("Cannot parse date range")
|
||||
.0;
|
||||
assert_eq!(res5, expected_dates);
|
||||
|
||||
let expected_flexible_dates = UserInputLeaf::Range {
|
||||
field: Some("date_field".to_string()),
|
||||
lower: UserInputBound::Unbounded,
|
||||
upper: UserInputBound::Inclusive("2021-08-02T18:54:42.12345+02:30".to_string()),
|
||||
};
|
||||
|
||||
let res6 = range()
|
||||
.parse("date_field: <=2021-08-02T18:54:42.12345+02:30")
|
||||
.expect("Cannot parse date range")
|
||||
.0;
|
||||
assert_eq!(res6, expected_flexible_dates);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -492,12 +644,14 @@ mod test {
|
||||
|
||||
#[test]
|
||||
fn test_single_term_with_field() {
|
||||
test_parse_query_to_ast_helper("abc:toto", "abc:\"toto\"");
|
||||
test_parse_query_to_ast_helper("abc:toto", "\"abc\":\"toto\"");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_single_term_with_float() {
|
||||
test_parse_query_to_ast_helper("abc:1.1", "abc:\"1.1\"");
|
||||
test_parse_query_to_ast_helper("abc:1.1", "\"abc\":\"1.1\"");
|
||||
test_parse_query_to_ast_helper("a.b.c:1.1", "\"a.b.c\":\"1.1\"");
|
||||
test_parse_query_to_ast_helper("a\\ b\\ c:1.1", "\"a b c\":\"1.1\"");
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -513,22 +667,27 @@ mod test {
|
||||
#[test]
|
||||
fn test_parse_test_query_other() {
|
||||
test_parse_query_to_ast_helper("(+a +b) d", "(*(+\"a\" +\"b\") *\"d\")");
|
||||
test_parse_query_to_ast_helper("+abc:toto", "abc:\"toto\"");
|
||||
test_parse_query_to_ast_helper("(+abc:toto -titi)", "(+abc:\"toto\" -\"titi\")");
|
||||
test_parse_query_to_ast_helper("-abc:toto", "(-abc:\"toto\")");
|
||||
test_parse_query_to_ast_helper("abc:a b", "(*abc:\"a\" *\"b\")");
|
||||
test_parse_query_to_ast_helper("abc:\"a b\"", "abc:\"a b\"");
|
||||
test_parse_query_to_ast_helper("foo:[1 TO 5]", "foo:[\"1\" TO \"5\"]");
|
||||
test_parse_query_to_ast_helper("+abc:toto", "\"abc\":\"toto\"");
|
||||
test_parse_query_to_ast_helper("+a\\+b\\+c:toto", "\"a+b+c\":\"toto\"");
|
||||
test_parse_query_to_ast_helper("(+abc:toto -titi)", "(+\"abc\":\"toto\" -\"titi\")");
|
||||
test_parse_query_to_ast_helper("-abc:toto", "(-\"abc\":\"toto\")");
|
||||
test_is_parse_err("--abc:toto");
|
||||
test_parse_query_to_ast_helper("abc:a b", "(*\"abc\":\"a\" *\"b\")");
|
||||
test_parse_query_to_ast_helper("abc:\"a b\"", "\"abc\":\"a b\"");
|
||||
test_parse_query_to_ast_helper("foo:[1 TO 5]", "\"foo\":[\"1\" TO \"5\"]");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_query_with_range() {
|
||||
test_parse_query_to_ast_helper("[1 TO 5]", "[\"1\" TO \"5\"]");
|
||||
test_parse_query_to_ast_helper("foo:{a TO z}", "foo:{\"a\" TO \"z\"}");
|
||||
test_parse_query_to_ast_helper("foo:[1 TO toto}", "foo:[\"1\" TO \"toto\"}");
|
||||
test_parse_query_to_ast_helper("foo:[* TO toto}", "foo:{\"*\" TO \"toto\"}");
|
||||
test_parse_query_to_ast_helper("foo:[1 TO *}", "foo:[\"1\" TO \"*\"}");
|
||||
test_parse_query_to_ast_helper("foo:[1.1 TO *}", "foo:[\"1.1\" TO \"*\"}");
|
||||
test_parse_query_to_ast_helper("foo:{a TO z}", "\"foo\":{\"a\" TO \"z\"}");
|
||||
test_parse_query_to_ast_helper("foo:[1 TO toto}", "\"foo\":[\"1\" TO \"toto\"}");
|
||||
test_parse_query_to_ast_helper("foo:[* TO toto}", "\"foo\":{\"*\" TO \"toto\"}");
|
||||
test_parse_query_to_ast_helper("foo:[1 TO *}", "\"foo\":[\"1\" TO \"*\"}");
|
||||
test_parse_query_to_ast_helper(
|
||||
"1.2.foo.bar:[1.1 TO *}",
|
||||
"\"1.2.foo.bar\":[\"1.1\" TO \"*\"}",
|
||||
);
|
||||
test_is_parse_err("abc + ");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -24,7 +24,7 @@ impl Debug for UserInputLeaf {
|
||||
ref upper,
|
||||
} => {
|
||||
if let Some(ref field) = field {
|
||||
write!(formatter, "{}:", field)?;
|
||||
write!(formatter, "\"{}\":", field)?;
|
||||
}
|
||||
lower.display_lower(formatter)?;
|
||||
write!(formatter, " TO ")?;
|
||||
@@ -45,7 +45,7 @@ pub struct UserInputLiteral {
|
||||
impl fmt::Debug for UserInputLiteral {
|
||||
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
|
||||
match self.field_name {
|
||||
Some(ref field_name) => write!(formatter, "{}:\"{}\"", field_name, self.phrase),
|
||||
Some(ref field_name) => write!(formatter, "\"{}\":\"{}\"", field_name, self.phrase),
|
||||
None => write!(formatter, "\"{}\"", self.phrase),
|
||||
}
|
||||
}
|
||||
@@ -79,46 +79,46 @@ impl UserInputBound {
|
||||
match *self {
|
||||
UserInputBound::Inclusive(ref contents) => contents,
|
||||
UserInputBound::Exclusive(ref contents) => contents,
|
||||
UserInputBound::Unbounded => &"*",
|
||||
UserInputBound::Unbounded => "*",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub enum UserInputAST {
|
||||
Clause(Vec<(Option<Occur>, UserInputAST)>),
|
||||
pub enum UserInputAst {
|
||||
Clause(Vec<(Option<Occur>, UserInputAst)>),
|
||||
Leaf(Box<UserInputLeaf>),
|
||||
Boost(Box<UserInputAST>, f64),
|
||||
Boost(Box<UserInputAst>, f64),
|
||||
}
|
||||
|
||||
impl UserInputAST {
|
||||
pub fn unary(self, occur: Occur) -> UserInputAST {
|
||||
UserInputAST::Clause(vec![(Some(occur), self)])
|
||||
impl UserInputAst {
|
||||
pub fn unary(self, occur: Occur) -> UserInputAst {
|
||||
UserInputAst::Clause(vec![(Some(occur), self)])
|
||||
}
|
||||
|
||||
fn compose(occur: Occur, asts: Vec<UserInputAST>) -> UserInputAST {
|
||||
fn compose(occur: Occur, asts: Vec<UserInputAst>) -> UserInputAst {
|
||||
assert_ne!(occur, Occur::MustNot);
|
||||
assert!(!asts.is_empty());
|
||||
if asts.len() == 1 {
|
||||
asts.into_iter().next().unwrap() //< safe
|
||||
} else {
|
||||
UserInputAST::Clause(
|
||||
UserInputAst::Clause(
|
||||
asts.into_iter()
|
||||
.map(|ast: UserInputAST| (Some(occur), ast))
|
||||
.map(|ast: UserInputAst| (Some(occur), ast))
|
||||
.collect::<Vec<_>>(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn empty_query() -> UserInputAST {
|
||||
UserInputAST::Clause(Vec::default())
|
||||
pub fn empty_query() -> UserInputAst {
|
||||
UserInputAst::Clause(Vec::default())
|
||||
}
|
||||
|
||||
pub fn and(asts: Vec<UserInputAST>) -> UserInputAST {
|
||||
UserInputAST::compose(Occur::Must, asts)
|
||||
pub fn and(asts: Vec<UserInputAst>) -> UserInputAst {
|
||||
UserInputAst::compose(Occur::Must, asts)
|
||||
}
|
||||
|
||||
pub fn or(asts: Vec<UserInputAST>) -> UserInputAST {
|
||||
UserInputAST::compose(Occur::Should, asts)
|
||||
pub fn or(asts: Vec<UserInputAst>) -> UserInputAst {
|
||||
UserInputAst::compose(Occur::Should, asts)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -128,15 +128,15 @@ impl From<UserInputLiteral> for UserInputLeaf {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<UserInputLeaf> for UserInputAST {
|
||||
fn from(leaf: UserInputLeaf) -> UserInputAST {
|
||||
UserInputAST::Leaf(Box::new(leaf))
|
||||
impl From<UserInputLeaf> for UserInputAst {
|
||||
fn from(leaf: UserInputLeaf) -> UserInputAst {
|
||||
UserInputAst::Leaf(Box::new(leaf))
|
||||
}
|
||||
}
|
||||
|
||||
fn print_occur_ast(
|
||||
occur_opt: Option<Occur>,
|
||||
ast: &UserInputAST,
|
||||
ast: &UserInputAst,
|
||||
formatter: &mut fmt::Formatter,
|
||||
) -> fmt::Result {
|
||||
if let Some(occur) = occur_opt {
|
||||
@@ -147,10 +147,10 @@ fn print_occur_ast(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl fmt::Debug for UserInputAST {
|
||||
impl fmt::Debug for UserInputAst {
|
||||
fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
|
||||
match *self {
|
||||
UserInputAST::Clause(ref subqueries) => {
|
||||
UserInputAst::Clause(ref subqueries) => {
|
||||
if subqueries.is_empty() {
|
||||
write!(formatter, "<emptyclause>")?;
|
||||
} else {
|
||||
@@ -164,8 +164,8 @@ impl fmt::Debug for UserInputAST {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
UserInputAST::Leaf(ref subquery) => write!(formatter, "{:?}", subquery),
|
||||
UserInputAST::Boost(ref leaf, boost) => write!(formatter, "({:?})^{}", leaf, boost),
|
||||
UserInputAst::Leaf(ref subquery) => write!(formatter, "{:?}", subquery),
|
||||
UserInputAst::Boost(ref leaf, boost) => write!(formatter, "({:?})^{}", leaf, boost),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@ use super::Collector;
|
||||
use crate::collector::SegmentCollector;
|
||||
use crate::DocId;
|
||||
use crate::Score;
|
||||
use crate::SegmentLocalId;
|
||||
use crate::SegmentOrdinal;
|
||||
use crate::SegmentReader;
|
||||
|
||||
/// `CountCollector` collector only counts how many
|
||||
@@ -45,7 +45,7 @@ impl Collector for Count {
|
||||
|
||||
fn for_segment(
|
||||
&self,
|
||||
_: SegmentLocalId,
|
||||
_: SegmentOrdinal,
|
||||
_: &SegmentReader,
|
||||
) -> crate::Result<SegmentCountCollector> {
|
||||
Ok(SegmentCountCollector::default())
|
||||
|
||||
@@ -15,7 +15,7 @@ impl Collector for DocSetCollector {
|
||||
|
||||
fn for_segment(
|
||||
&self,
|
||||
segment_local_id: crate::SegmentLocalId,
|
||||
segment_local_id: crate::SegmentOrdinal,
|
||||
_segment: &crate::SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
Ok(DocSetChildCollector {
|
||||
@@ -36,7 +36,7 @@ impl Collector for DocSetCollector {
|
||||
let mut result = HashSet::with_capacity(len);
|
||||
for (segment_local_id, docs) in segment_fruits {
|
||||
for doc in docs {
|
||||
result.insert(DocAddress(segment_local_id, doc));
|
||||
result.insert(DocAddress::new(segment_local_id, doc));
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
|
||||
@@ -5,7 +5,7 @@ use crate::schema::Facet;
|
||||
use crate::schema::Field;
|
||||
use crate::DocId;
|
||||
use crate::Score;
|
||||
use crate::SegmentLocalId;
|
||||
use crate::SegmentOrdinal;
|
||||
use crate::SegmentReader;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::btree_map;
|
||||
@@ -37,7 +37,10 @@ impl<'a> PartialOrd<Hit<'a>> for Hit<'a> {
|
||||
|
||||
impl<'a> Ord for Hit<'a> {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
other.count.cmp(&self.count)
|
||||
other
|
||||
.count
|
||||
.cmp(&self.count)
|
||||
.then(self.facet.cmp(other.facet))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -262,7 +265,7 @@ impl Collector for FacetCollector {
|
||||
|
||||
fn for_segment(
|
||||
&self,
|
||||
_: SegmentLocalId,
|
||||
_: SegmentOrdinal,
|
||||
reader: &SegmentReader,
|
||||
) -> crate::Result<FacetSegmentCollector> {
|
||||
let facet_reader = reader.facet_reader(self.field)?;
|
||||
@@ -294,10 +297,8 @@ impl Collector for FacetCollector {
|
||||
if depth == collapse_depth + 1 {
|
||||
collapsed_id = collapse_facet_ords.len();
|
||||
collapse_facet_ords.push(facet_streamer.term_ord());
|
||||
collapse_mapping.push(collapsed_id);
|
||||
} else {
|
||||
collapse_mapping.push(collapsed_id);
|
||||
}
|
||||
collapse_mapping.push(collapsed_id);
|
||||
}
|
||||
break;
|
||||
}
|
||||
@@ -536,10 +537,10 @@ mod tests {
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
index_writer.add_document(doc!(
|
||||
facet_field => Facet::from_text(&"/subjects/A/a"),
|
||||
facet_field => Facet::from_text(&"/subjects/B/a"),
|
||||
facet_field => Facet::from_text(&"/subjects/A/b"),
|
||||
facet_field => Facet::from_text(&"/subjects/B/b"),
|
||||
facet_field => Facet::from_text(&"/subjects/A/a").unwrap(),
|
||||
facet_field => Facet::from_text(&"/subjects/B/a").unwrap(),
|
||||
facet_field => Facet::from_text(&"/subjects/A/b").unwrap(),
|
||||
facet_field => Facet::from_text(&"/subjects/B/b").unwrap(),
|
||||
));
|
||||
index_writer.commit().unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
@@ -560,16 +561,16 @@ mod tests {
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(
|
||||
facet_field => Facet::from_text(&"/A/A"),
|
||||
facet_field => Facet::from_text(&"/A/A").unwrap(),
|
||||
));
|
||||
index_writer.add_document(doc!(
|
||||
facet_field => Facet::from_text(&"/A/B"),
|
||||
facet_field => Facet::from_text(&"/A/B").unwrap(),
|
||||
));
|
||||
index_writer.add_document(doc!(
|
||||
facet_field => Facet::from_text(&"/A/C/A"),
|
||||
facet_field => Facet::from_text(&"/A/C/A").unwrap(),
|
||||
));
|
||||
index_writer.add_document(doc!(
|
||||
facet_field => Facet::from_text(&"/D/C/A"),
|
||||
facet_field => Facet::from_text(&"/D/C/A").unwrap(),
|
||||
));
|
||||
index_writer.commit()?;
|
||||
let reader = index.reader()?;
|
||||
@@ -577,7 +578,7 @@ mod tests {
|
||||
assert_eq!(searcher.num_docs(), 4);
|
||||
|
||||
let count_facet = |facet_str: &str| {
|
||||
let term = Term::from_facet(facet_field, &Facet::from_text(facet_str));
|
||||
let term = Term::from_facet(facet_field, &Facet::from_text(facet_str).unwrap());
|
||||
searcher
|
||||
.search(&TermQuery::new(term, IndexRecordOption::Basic), &Count)
|
||||
.unwrap()
|
||||
@@ -657,6 +658,41 @@ mod tests {
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_facet_collector_topk_tie_break() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let facet_field = schema_builder.add_facet_field("facet", INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let docs: Vec<Document> = vec![("b", 2), ("a", 2), ("c", 4)]
|
||||
.into_iter()
|
||||
.flat_map(|(c, count)| {
|
||||
let facet = Facet::from(&format!("/facet/{}", c));
|
||||
let doc = doc!(facet_field => facet);
|
||||
iter::repeat(doc).take(count)
|
||||
})
|
||||
.collect();
|
||||
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
for doc in docs {
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
index_writer.commit()?;
|
||||
|
||||
let searcher = index.reader()?.searcher();
|
||||
let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||
facet_collector.add_facet("/facet");
|
||||
let counts: FacetCounts = searcher.search(&AllQuery, &facet_collector)?;
|
||||
|
||||
let facets: Vec<(&Facet, u64)> = counts.top_k("/facet", 2);
|
||||
assert_eq!(
|
||||
facets,
|
||||
vec![(&Facet::from("/facet/c"), 4), (&Facet::from("/facet/a"), 2)]
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
|
||||
@@ -12,11 +12,11 @@
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use crate::collector::{Collector, SegmentCollector};
|
||||
use crate::fastfield::{FastFieldReader, FastValue};
|
||||
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue};
|
||||
use crate::schema::Field;
|
||||
use crate::{Score, SegmentReader, TantivyError};
|
||||
|
||||
/// The `FilterCollector` collector filters docs using a u64 fast field value and a predicate.
|
||||
/// The `FilterCollector` collector filters docs using a fast field value and a predicate.
|
||||
/// Only the documents for which the predicate returned "true" will be passed on to the next collector.
|
||||
///
|
||||
/// ```rust
|
||||
@@ -47,7 +47,7 @@ use crate::{Score, SegmentReader, TantivyError};
|
||||
/// let top_docs = searcher.search(&query, &no_filter_collector).unwrap();
|
||||
///
|
||||
/// assert_eq!(top_docs.len(), 1);
|
||||
/// assert_eq!(top_docs[0].1, DocAddress(0, 1));
|
||||
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
|
||||
///
|
||||
/// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new(price, &|value| value < 5u64, TopDocs::with_limit(2));
|
||||
/// let filtered_top_docs = searcher.search(&query, &filter_all_collector).unwrap();
|
||||
@@ -155,7 +155,7 @@ where
|
||||
TPredicate: 'static,
|
||||
TPredicateValue: FastValue,
|
||||
{
|
||||
fast_field_reader: FastFieldReader<TPredicateValue>,
|
||||
fast_field_reader: DynamicFastFieldReader<TPredicateValue>,
|
||||
segment_collector: TSegmentCollector,
|
||||
predicate: TPredicate,
|
||||
t_predicate_value: PhantomData<TPredicateValue>,
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use crate::collector::{Collector, SegmentCollector};
|
||||
use crate::fastfield::{FastFieldReader, FastValue};
|
||||
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue};
|
||||
use crate::schema::{Field, Type};
|
||||
use crate::{DocId, Score};
|
||||
use fastdivide::DividerU64;
|
||||
@@ -84,7 +84,7 @@ impl HistogramComputer {
|
||||
}
|
||||
pub struct SegmentHistogramCollector {
|
||||
histogram_computer: HistogramComputer,
|
||||
ff_reader: FastFieldReader<u64>,
|
||||
ff_reader: DynamicFastFieldReader<u64>,
|
||||
}
|
||||
|
||||
impl SegmentCollector for SegmentHistogramCollector {
|
||||
@@ -106,7 +106,7 @@ impl Collector for HistogramCollector {
|
||||
|
||||
fn for_segment(
|
||||
&self,
|
||||
_segment_local_id: crate::SegmentLocalId,
|
||||
_segment_local_id: crate::SegmentOrdinal,
|
||||
segment: &crate::SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
let ff_reader = segment.fast_fields().u64_lenient(self.field)?;
|
||||
|
||||
@@ -86,7 +86,7 @@ See the `custom_collector` example.
|
||||
|
||||
use crate::DocId;
|
||||
use crate::Score;
|
||||
use crate::SegmentLocalId;
|
||||
use crate::SegmentOrdinal;
|
||||
use crate::SegmentReader;
|
||||
use downcast_rs::impl_downcast;
|
||||
|
||||
@@ -155,7 +155,7 @@ pub trait Collector: Sync + Send {
|
||||
/// on this segment.
|
||||
fn for_segment(
|
||||
&self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment_local_id: SegmentOrdinal,
|
||||
segment: &SegmentReader,
|
||||
) -> crate::Result<Self::Child>;
|
||||
|
||||
@@ -214,7 +214,7 @@ impl<TCollector: Collector> Collector for Option<TCollector> {
|
||||
|
||||
fn for_segment(
|
||||
&self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment_local_id: SegmentOrdinal,
|
||||
segment: &SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
Ok(if let Some(inner) = self {
|
||||
|
||||
@@ -3,7 +3,7 @@ use super::SegmentCollector;
|
||||
use crate::collector::Fruit;
|
||||
use crate::DocId;
|
||||
use crate::Score;
|
||||
use crate::SegmentLocalId;
|
||||
use crate::SegmentOrdinal;
|
||||
use crate::SegmentReader;
|
||||
use crate::TantivyError;
|
||||
use std::marker::PhantomData;
|
||||
@@ -175,7 +175,7 @@ impl<'a> Collector for MultiCollector<'a> {
|
||||
|
||||
fn for_segment(
|
||||
&self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment_local_id: SegmentOrdinal,
|
||||
segment: &SegmentReader,
|
||||
) -> crate::Result<MultiCollectorChild> {
|
||||
let children = self
|
||||
@@ -276,7 +276,7 @@ mod tests {
|
||||
let mut collectors = MultiCollector::new();
|
||||
let topdocs_handler = collectors.add_collector(TopDocs::with_limit(2));
|
||||
let count_handler = collectors.add_collector(Count);
|
||||
let mut multifruits = searcher.search(&query, &mut collectors).unwrap();
|
||||
let mut multifruits = searcher.search(&query, &collectors).unwrap();
|
||||
|
||||
assert_eq!(count_handler.extract(&mut multifruits), 5);
|
||||
assert_eq!(topdocs_handler.extract(&mut multifruits).len(), 2);
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
use super::*;
|
||||
use crate::core::SegmentReader;
|
||||
use crate::fastfield::BytesFastFieldReader;
|
||||
use crate::fastfield::DynamicFastFieldReader;
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::schema::Field;
|
||||
use crate::DocId;
|
||||
use crate::Score;
|
||||
use crate::SegmentLocalId;
|
||||
use crate::SegmentOrdinal;
|
||||
use crate::{DocAddress, Document, Searcher};
|
||||
|
||||
use crate::collector::{Count, FilterCollector, TopDocs};
|
||||
@@ -53,7 +54,7 @@ pub fn test_filter_collector() {
|
||||
let top_docs = searcher.search(&query, &filter_some_collector).unwrap();
|
||||
|
||||
assert_eq!(top_docs.len(), 1);
|
||||
assert_eq!(top_docs[0].1, DocAddress(0, 1));
|
||||
assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
|
||||
|
||||
let filter_all_collector: FilterCollector<_, _, u64> =
|
||||
FilterCollector::new(price, &|value| value < 5u64, TopDocs::with_limit(2));
|
||||
@@ -82,7 +83,7 @@ pub struct TestCollector {
|
||||
}
|
||||
|
||||
pub struct TestSegmentCollector {
|
||||
segment_id: SegmentLocalId,
|
||||
segment_id: SegmentOrdinal,
|
||||
fruit: TestFruit,
|
||||
}
|
||||
|
||||
@@ -108,7 +109,7 @@ impl Collector for TestCollector {
|
||||
|
||||
fn for_segment(
|
||||
&self,
|
||||
segment_id: SegmentLocalId,
|
||||
segment_id: SegmentOrdinal,
|
||||
_reader: &SegmentReader,
|
||||
) -> crate::Result<TestSegmentCollector> {
|
||||
Ok(TestSegmentCollector {
|
||||
@@ -126,7 +127,7 @@ impl Collector for TestCollector {
|
||||
if fruit.docs().is_empty() {
|
||||
0
|
||||
} else {
|
||||
fruit.docs()[0].segment_ord()
|
||||
fruit.docs()[0].segment_ord
|
||||
}
|
||||
});
|
||||
let mut docs = vec![];
|
||||
@@ -143,7 +144,7 @@ impl SegmentCollector for TestSegmentCollector {
|
||||
type Fruit = TestFruit;
|
||||
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
self.fruit.docs.push(DocAddress(self.segment_id, doc));
|
||||
self.fruit.docs.push(DocAddress::new(self.segment_id, doc));
|
||||
self.fruit.scores.push(score);
|
||||
}
|
||||
|
||||
@@ -162,7 +163,7 @@ pub struct FastFieldTestCollector {
|
||||
|
||||
pub struct FastFieldSegmentCollector {
|
||||
vals: Vec<u64>,
|
||||
reader: FastFieldReader<u64>,
|
||||
reader: DynamicFastFieldReader<u64>,
|
||||
}
|
||||
|
||||
impl FastFieldTestCollector {
|
||||
@@ -177,7 +178,7 @@ impl Collector for FastFieldTestCollector {
|
||||
|
||||
fn for_segment(
|
||||
&self,
|
||||
_: SegmentLocalId,
|
||||
_: SegmentOrdinal,
|
||||
segment_reader: &SegmentReader,
|
||||
) -> crate::Result<FastFieldSegmentCollector> {
|
||||
let reader = segment_reader
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use crate::DocAddress;
|
||||
use crate::DocId;
|
||||
use crate::SegmentLocalId;
|
||||
use crate::SegmentOrdinal;
|
||||
use crate::SegmentReader;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BinaryHeap;
|
||||
@@ -118,7 +118,7 @@ where
|
||||
|
||||
pub(crate) fn for_segment<F: PartialOrd>(
|
||||
&self,
|
||||
segment_id: SegmentLocalId,
|
||||
segment_id: SegmentOrdinal,
|
||||
_: &SegmentReader,
|
||||
) -> TopSegmentCollector<F> {
|
||||
TopSegmentCollector::new(segment_id, self.limit + self.offset)
|
||||
@@ -147,29 +147,32 @@ where
|
||||
pub(crate) struct TopSegmentCollector<T> {
|
||||
limit: usize,
|
||||
heap: BinaryHeap<ComparableDoc<T, DocId>>,
|
||||
segment_id: u32,
|
||||
segment_ord: u32,
|
||||
}
|
||||
|
||||
impl<T: PartialOrd> TopSegmentCollector<T> {
|
||||
fn new(segment_id: SegmentLocalId, limit: usize) -> TopSegmentCollector<T> {
|
||||
fn new(segment_ord: SegmentOrdinal, limit: usize) -> TopSegmentCollector<T> {
|
||||
TopSegmentCollector {
|
||||
limit,
|
||||
heap: BinaryHeap::with_capacity(limit),
|
||||
segment_id,
|
||||
segment_ord,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
|
||||
pub fn harvest(self) -> Vec<(T, DocAddress)> {
|
||||
let segment_id = self.segment_id;
|
||||
let segment_ord = self.segment_ord;
|
||||
self.heap
|
||||
.into_sorted_vec()
|
||||
.into_iter()
|
||||
.map(|comparable_doc| {
|
||||
(
|
||||
comparable_doc.feature,
|
||||
DocAddress(segment_id, comparable_doc.doc),
|
||||
DocAddress {
|
||||
segment_ord,
|
||||
doc_id: comparable_doc.doc,
|
||||
},
|
||||
)
|
||||
})
|
||||
.collect()
|
||||
@@ -177,7 +180,7 @@ impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
|
||||
|
||||
/// Return true iff at least K documents have gone through
|
||||
/// the collector.
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub(crate) fn at_capacity(&self) -> bool {
|
||||
self.heap.len() >= self.limit
|
||||
}
|
||||
@@ -186,7 +189,7 @@ impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
|
||||
///
|
||||
/// It collects documents until it has reached the max capacity. Once it reaches capacity, it
|
||||
/// will compare the lowest scoring item with the given one and keep whichever is greater.
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn collect(&mut self, doc: DocId, feature: T) {
|
||||
if self.at_capacity() {
|
||||
// It's ok to unwrap as long as a limit of 0 is forbidden.
|
||||
@@ -220,9 +223,9 @@ mod tests {
|
||||
assert_eq!(
|
||||
top_collector.harvest(),
|
||||
vec![
|
||||
(0.8, DocAddress(0, 1)),
|
||||
(0.3, DocAddress(0, 5)),
|
||||
(0.2, DocAddress(0, 3))
|
||||
(0.8, DocAddress::new(0, 1)),
|
||||
(0.3, DocAddress::new(0, 5)),
|
||||
(0.2, DocAddress::new(0, 3))
|
||||
]
|
||||
);
|
||||
}
|
||||
@@ -238,10 +241,10 @@ mod tests {
|
||||
assert_eq!(
|
||||
top_collector.harvest(),
|
||||
vec![
|
||||
(0.9, DocAddress(0, 7)),
|
||||
(0.8, DocAddress(0, 1)),
|
||||
(0.3, DocAddress(0, 5)),
|
||||
(0.2, DocAddress(0, 3))
|
||||
(0.9, DocAddress::new(0, 7)),
|
||||
(0.8, DocAddress::new(0, 1)),
|
||||
(0.3, DocAddress::new(0, 5)),
|
||||
(0.2, DocAddress::new(0, 3))
|
||||
]
|
||||
);
|
||||
}
|
||||
@@ -276,17 +279,17 @@ mod tests {
|
||||
|
||||
let results = collector
|
||||
.merge_fruits(vec![vec![
|
||||
(0.9, DocAddress(0, 1)),
|
||||
(0.8, DocAddress(0, 2)),
|
||||
(0.7, DocAddress(0, 3)),
|
||||
(0.6, DocAddress(0, 4)),
|
||||
(0.5, DocAddress(0, 5)),
|
||||
(0.9, DocAddress::new(0, 1)),
|
||||
(0.8, DocAddress::new(0, 2)),
|
||||
(0.7, DocAddress::new(0, 3)),
|
||||
(0.6, DocAddress::new(0, 4)),
|
||||
(0.5, DocAddress::new(0, 5)),
|
||||
]])
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
results,
|
||||
vec![(0.8, DocAddress(0, 2)), (0.7, DocAddress(0, 3)),]
|
||||
vec![(0.8, DocAddress::new(0, 2)), (0.7, DocAddress::new(0, 3)),]
|
||||
);
|
||||
}
|
||||
|
||||
@@ -295,10 +298,13 @@ mod tests {
|
||||
let collector = TopCollector::with_limit(2).and_offset(1);
|
||||
|
||||
let results = collector
|
||||
.merge_fruits(vec![vec![(0.9, DocAddress(0, 1)), (0.8, DocAddress(0, 2))]])
|
||||
.merge_fruits(vec![vec![
|
||||
(0.9, DocAddress::new(0, 1)),
|
||||
(0.8, DocAddress::new(0, 2)),
|
||||
]])
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(results, vec![(0.8, DocAddress(0, 2)),]);
|
||||
assert_eq!(results, vec![(0.8, DocAddress::new(0, 2)),]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -306,7 +312,10 @@ mod tests {
|
||||
let collector = TopCollector::with_limit(2).and_offset(20);
|
||||
|
||||
let results = collector
|
||||
.merge_fruits(vec![vec![(0.9, DocAddress(0, 1)), (0.8, DocAddress(0, 2))]])
|
||||
.merge_fruits(vec![vec![
|
||||
(0.9, DocAddress::new(0, 1)),
|
||||
(0.8, DocAddress::new(0, 2)),
|
||||
]])
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(results, vec![]);
|
||||
|
||||
@@ -4,13 +4,13 @@ use crate::collector::tweak_score_top_collector::TweakedScoreTopCollector;
|
||||
use crate::collector::{
|
||||
CustomScorer, CustomSegmentScorer, ScoreSegmentTweaker, ScoreTweaker, SegmentCollector,
|
||||
};
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader};
|
||||
use crate::query::Weight;
|
||||
use crate::schema::Field;
|
||||
use crate::DocAddress;
|
||||
use crate::DocId;
|
||||
use crate::Score;
|
||||
use crate::SegmentLocalId;
|
||||
use crate::SegmentOrdinal;
|
||||
use crate::SegmentReader;
|
||||
use crate::{collector::custom_score_top_collector::CustomScoreTopCollector, fastfield::FastValue};
|
||||
use crate::{collector::top_collector::TopSegmentCollector, TantivyError};
|
||||
@@ -37,7 +37,7 @@ where
|
||||
|
||||
fn for_segment(
|
||||
&self,
|
||||
segment_local_id: crate::SegmentLocalId,
|
||||
segment_local_id: crate::SegmentOrdinal,
|
||||
segment: &SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
let schema = segment.schema();
|
||||
@@ -113,8 +113,8 @@ where
|
||||
/// let query = query_parser.parse_query("diary").unwrap();
|
||||
/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2)).unwrap();
|
||||
///
|
||||
/// assert_eq!(top_docs[0].1, DocAddress(0, 1));
|
||||
/// assert_eq!(top_docs[1].1, DocAddress(0, 3));
|
||||
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
|
||||
/// assert_eq!(top_docs[1].1, DocAddress::new(0, 3));
|
||||
/// ```
|
||||
pub struct TopDocs(TopCollector<Score>);
|
||||
|
||||
@@ -129,7 +129,7 @@ impl fmt::Debug for TopDocs {
|
||||
}
|
||||
|
||||
struct ScorerByFastFieldReader {
|
||||
ff_reader: FastFieldReader<u64>,
|
||||
ff_reader: DynamicFastFieldReader<u64>,
|
||||
}
|
||||
|
||||
impl CustomSegmentScorer<u64> for ScorerByFastFieldReader {
|
||||
@@ -151,7 +151,7 @@ impl CustomScorer<u64> for ScorerByField {
|
||||
// mapping is monotonic, so it is sufficient to compute our top-K docs.
|
||||
//
|
||||
// The conversion will then happen only on the top-K docs.
|
||||
let ff_reader: FastFieldReader<u64> = segment_reader
|
||||
let ff_reader = segment_reader
|
||||
.fast_fields()
|
||||
.typed_fast_field_reader(self.field)?;
|
||||
Ok(ScorerByFastFieldReader { ff_reader })
|
||||
@@ -201,8 +201,8 @@ impl TopDocs {
|
||||
/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2).and_offset(1)).unwrap();
|
||||
///
|
||||
/// assert_eq!(top_docs.len(), 2);
|
||||
/// assert_eq!(top_docs[0].1, DocAddress(0, 4));
|
||||
/// assert_eq!(top_docs[1].1, DocAddress(0, 3));
|
||||
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 4));
|
||||
/// assert_eq!(top_docs[1].1, DocAddress::new(0, 3));
|
||||
/// ```
|
||||
pub fn and_offset(self, offset: usize) -> TopDocs {
|
||||
TopDocs(self.0.and_offset(offset))
|
||||
@@ -243,8 +243,8 @@ impl TopDocs {
|
||||
/// # let query = QueryParser::for_index(&index, vec![title]).parse_query("diary")?;
|
||||
/// # let top_docs = docs_sorted_by_rating(&reader.searcher(), &query, rating)?;
|
||||
/// # assert_eq!(top_docs,
|
||||
/// # vec![(97u64, DocAddress(0u32, 1)),
|
||||
/// # (80u64, DocAddress(0u32, 3))]);
|
||||
/// # vec![(97u64, DocAddress::new(0u32, 1)),
|
||||
/// # (80u64, DocAddress::new(0u32, 3))]);
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// /// Searches the document matching the given query, and
|
||||
@@ -323,8 +323,8 @@ impl TopDocs {
|
||||
/// # let reader = index.reader()?;
|
||||
/// # let top_docs = docs_sorted_by_revenue(&reader.searcher(), &AllQuery, rating)?;
|
||||
/// # assert_eq!(top_docs,
|
||||
/// # vec![(119_000_000i64, DocAddress(0, 1)),
|
||||
/// # (92_000_000i64, DocAddress(0, 0))]);
|
||||
/// # vec![(119_000_000i64, DocAddress::new(0, 1)),
|
||||
/// # (92_000_000i64, DocAddress::new(0, 0))]);
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// /// Searches the document matching the given query, and
|
||||
@@ -401,6 +401,7 @@ impl TopDocs {
|
||||
/// # use tantivy::query::QueryParser;
|
||||
/// use tantivy::SegmentReader;
|
||||
/// use tantivy::collector::TopDocs;
|
||||
/// use tantivy::fastfield::FastFieldReader;
|
||||
/// use tantivy::schema::Field;
|
||||
///
|
||||
/// fn create_schema() -> Schema {
|
||||
@@ -508,6 +509,7 @@ impl TopDocs {
|
||||
/// use tantivy::SegmentReader;
|
||||
/// use tantivy::collector::TopDocs;
|
||||
/// use tantivy::schema::Field;
|
||||
/// use tantivy::fastfield::FastFieldReader;
|
||||
///
|
||||
/// # fn create_schema() -> Schema {
|
||||
/// # let mut schema_builder = Schema::builder();
|
||||
@@ -600,7 +602,7 @@ impl Collector for TopDocs {
|
||||
|
||||
fn for_segment(
|
||||
&self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment_local_id: SegmentOrdinal,
|
||||
reader: &SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
let collector = self.0.for_segment(segment_local_id, reader);
|
||||
@@ -671,7 +673,15 @@ impl Collector for TopDocs {
|
||||
let fruit = heap
|
||||
.into_sorted_vec()
|
||||
.into_iter()
|
||||
.map(|cid| (cid.feature, DocAddress(segment_ord, cid.doc)))
|
||||
.map(|cid| {
|
||||
(
|
||||
cid.feature,
|
||||
DocAddress {
|
||||
segment_ord,
|
||||
doc_id: cid.doc,
|
||||
},
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
Ok(fruit)
|
||||
}
|
||||
@@ -741,9 +751,9 @@ mod tests {
|
||||
assert_results_equals(
|
||||
&score_docs,
|
||||
&[
|
||||
(0.81221175, DocAddress(0u32, 1)),
|
||||
(0.5376842, DocAddress(0u32, 2)),
|
||||
(0.48527452, DocAddress(0, 0)),
|
||||
(0.81221175, DocAddress::new(0u32, 1)),
|
||||
(0.5376842, DocAddress::new(0u32, 2)),
|
||||
(0.48527452, DocAddress::new(0, 0)),
|
||||
],
|
||||
);
|
||||
}
|
||||
@@ -760,7 +770,7 @@ mod tests {
|
||||
.searcher()
|
||||
.search(&text_query, &TopDocs::with_limit(4).and_offset(2))
|
||||
.unwrap();
|
||||
assert_results_equals(&score_docs[..], &[(0.48527452, DocAddress(0, 0))]);
|
||||
assert_results_equals(&score_docs[..], &[(0.48527452, DocAddress::new(0, 0))]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -778,8 +788,8 @@ mod tests {
|
||||
assert_results_equals(
|
||||
&score_docs,
|
||||
&[
|
||||
(0.81221175, DocAddress(0u32, 1)),
|
||||
(0.5376842, DocAddress(0u32, 2)),
|
||||
(0.81221175, DocAddress::new(0u32, 1)),
|
||||
(0.5376842, DocAddress::new(0u32, 2)),
|
||||
],
|
||||
);
|
||||
}
|
||||
@@ -799,8 +809,8 @@ mod tests {
|
||||
assert_results_equals(
|
||||
&score_docs[..],
|
||||
&[
|
||||
(0.5376842, DocAddress(0u32, 2)),
|
||||
(0.48527452, DocAddress(0, 0)),
|
||||
(0.5376842, DocAddress::new(0u32, 2)),
|
||||
(0.48527452, DocAddress::new(0, 0)),
|
||||
],
|
||||
);
|
||||
}
|
||||
@@ -864,9 +874,9 @@ mod tests {
|
||||
assert_eq!(
|
||||
&top_docs[..],
|
||||
&[
|
||||
(64, DocAddress(0, 1)),
|
||||
(16, DocAddress(0, 2)),
|
||||
(12, DocAddress(0, 0))
|
||||
(64, DocAddress::new(0, 1)),
|
||||
(16, DocAddress::new(0, 2)),
|
||||
(12, DocAddress::new(0, 0))
|
||||
]
|
||||
);
|
||||
}
|
||||
@@ -898,8 +908,8 @@ mod tests {
|
||||
assert_eq!(
|
||||
&top_docs[..],
|
||||
&[
|
||||
(mr_birthday, DocAddress(0, 1)),
|
||||
(pr_birthday, DocAddress(0, 0)),
|
||||
(mr_birthday, DocAddress::new(0, 1)),
|
||||
(pr_birthday, DocAddress::new(0, 0)),
|
||||
]
|
||||
);
|
||||
Ok(())
|
||||
@@ -927,7 +937,10 @@ mod tests {
|
||||
let top_docs: Vec<(i64, DocAddress)> = searcher.search(&AllQuery, &top_collector)?;
|
||||
assert_eq!(
|
||||
&top_docs[..],
|
||||
&[(40i64, DocAddress(0, 1)), (-1i64, DocAddress(0, 0)),]
|
||||
&[
|
||||
(40i64, DocAddress::new(0, 1)),
|
||||
(-1i64, DocAddress::new(0, 0)),
|
||||
]
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
@@ -954,7 +967,10 @@ mod tests {
|
||||
let top_docs: Vec<(f64, DocAddress)> = searcher.search(&AllQuery, &top_collector)?;
|
||||
assert_eq!(
|
||||
&top_docs[..],
|
||||
&[(40f64, DocAddress(0, 1)), (-1.0f64, DocAddress(0, 0)),]
|
||||
&[
|
||||
(40f64, DocAddress::new(0, 1)),
|
||||
(-1.0f64, DocAddress::new(0, 0)),
|
||||
]
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
@@ -1034,7 +1050,7 @@ mod tests {
|
||||
|
||||
assert_eq!(
|
||||
score_docs,
|
||||
vec![(1, DocAddress(0, 1)), (0, DocAddress(0, 0)),]
|
||||
vec![(1, DocAddress::new(0, 1)), (0, DocAddress::new(0, 0)),]
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1056,7 +1072,7 @@ mod tests {
|
||||
|
||||
assert_eq!(
|
||||
score_docs,
|
||||
vec![(1, DocAddress(0, 1)), (0, DocAddress(0, 0)),]
|
||||
vec![(1, DocAddress::new(0, 1)), (0, DocAddress::new(0, 0)),]
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1064,7 +1080,7 @@ mod tests {
|
||||
query: &str,
|
||||
query_field: Field,
|
||||
schema: Schema,
|
||||
mut doc_adder: impl FnMut(&mut IndexWriter) -> (),
|
||||
mut doc_adder: impl FnMut(&mut IndexWriter),
|
||||
) -> (Index, Box<dyn Query>) {
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
|
||||
|
||||
@@ -59,19 +59,19 @@ impl TinySet {
|
||||
|
||||
/// Creates a new `TinySet` containing only one element
|
||||
/// within `[0; 64[`
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn singleton(el: u32) -> TinySet {
|
||||
TinySet(1u64 << u64::from(el))
|
||||
}
|
||||
|
||||
/// Insert a new element within [0..64[
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn insert(self, el: u32) -> TinySet {
|
||||
self.union(TinySet::singleton(el))
|
||||
}
|
||||
|
||||
/// Insert a new element within [0..64[
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn insert_mut(&mut self, el: u32) -> bool {
|
||||
let old = *self;
|
||||
*self = old.insert(el);
|
||||
@@ -79,20 +79,20 @@ impl TinySet {
|
||||
}
|
||||
|
||||
/// Returns the union of two tinysets
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn union(self, other: TinySet) -> TinySet {
|
||||
TinySet(self.0 | other.0)
|
||||
}
|
||||
|
||||
/// Returns true iff the `TinySet` is empty.
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn is_empty(self) -> bool {
|
||||
self.0 == 0u64
|
||||
}
|
||||
|
||||
/// Returns the lowest element in the `TinySet`
|
||||
/// and removes it.
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn pop_lowest(&mut self) -> Option<u32> {
|
||||
if self.is_empty() {
|
||||
None
|
||||
|
||||
@@ -190,7 +190,7 @@ mod test {
|
||||
use super::{CompositeFile, CompositeWrite};
|
||||
use crate::common::BinarySerializable;
|
||||
use crate::common::VInt;
|
||||
use crate::directory::{Directory, RAMDirectory};
|
||||
use crate::directory::{Directory, RamDirectory};
|
||||
use crate::schema::Field;
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
@@ -198,7 +198,7 @@ mod test {
|
||||
#[test]
|
||||
fn test_composite_file() -> crate::Result<()> {
|
||||
let path = Path::new("test_path");
|
||||
let directory = RAMDirectory::create();
|
||||
let directory = RamDirectory::create();
|
||||
{
|
||||
let w = directory.open_write(path).unwrap();
|
||||
let mut composite_write = CompositeWrite::wrap(w);
|
||||
|
||||
@@ -1,71 +1,21 @@
|
||||
pub mod bitpacker;
|
||||
mod bitset;
|
||||
mod composite_file;
|
||||
mod counting_writer;
|
||||
mod serialize;
|
||||
mod vint;
|
||||
|
||||
pub use self::bitset::BitSet;
|
||||
pub(crate) use self::bitset::TinySet;
|
||||
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
|
||||
pub use self::counting_writer::CountingWriter;
|
||||
pub use self::serialize::{BinarySerializable, FixedSize};
|
||||
pub use self::vint::{
|
||||
pub use byteorder::LittleEndian as Endianness;
|
||||
pub use common::CountingWriter;
|
||||
pub use common::{
|
||||
read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt,
|
||||
};
|
||||
pub use byteorder::LittleEndian as Endianness;
|
||||
pub use common::{BinarySerializable, DeserializeFrom, FixedSize};
|
||||
|
||||
/// Segment's max doc must be `< MAX_DOC_LIMIT`.
|
||||
///
|
||||
/// We do not allow segments with more than
|
||||
pub const MAX_DOC_LIMIT: u32 = 1 << 31;
|
||||
|
||||
pub fn minmax<I, T>(mut vals: I) -> Option<(T, T)>
|
||||
where
|
||||
I: Iterator<Item = T>,
|
||||
T: Copy + Ord,
|
||||
{
|
||||
if let Some(first_el) = vals.next() {
|
||||
return Some(vals.fold((first_el, first_el), |(min_val, max_val), el| {
|
||||
(min_val.min(el), max_val.max(el))
|
||||
}));
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Computes the number of bits that will be used for bitpacking.
|
||||
///
|
||||
/// In general the target is the minimum number of bits
|
||||
/// required to express the amplitude given in argument.
|
||||
///
|
||||
/// e.g. If the amplitude is 10, we can store all ints on simply 4bits.
|
||||
///
|
||||
/// The logic is slightly more convoluted here as for optimization
|
||||
/// reasons, we want to ensure that a value spawns over at most 8 bytes
|
||||
/// of aligns bytes.
|
||||
///
|
||||
/// Spanning over 9 bytes is possible for instance, if we do
|
||||
/// bitpacking with an amplitude of 63 bits.
|
||||
/// In this case, the second int will start on bit
|
||||
/// 63 (which belongs to byte 7) and ends at byte 15;
|
||||
/// Hence 9 bytes (from byte 7 to byte 15 included).
|
||||
///
|
||||
/// To avoid this, we force the number of bits to 64bits
|
||||
/// when the result is greater than `64-8 = 56 bits`.
|
||||
///
|
||||
/// Note that this only affects rare use cases spawning over
|
||||
/// a very large range of values. Even in this case, it results
|
||||
/// in an extra cost of at most 12% compared to the optimal
|
||||
/// number of bits.
|
||||
pub(crate) fn compute_num_bits(n: u64) -> u8 {
|
||||
let amplitude = (64u32 - n.leading_zeros()) as u8;
|
||||
if amplitude <= 64 - 8 {
|
||||
amplitude
|
||||
} else {
|
||||
64
|
||||
}
|
||||
}
|
||||
|
||||
/// Has length trait
|
||||
pub trait HasLen {
|
||||
/// Return length
|
||||
@@ -99,13 +49,13 @@ const HIGHEST_BIT: u64 = 1 << 63;
|
||||
///
|
||||
/// # See also
|
||||
/// The [reverse mapping is `u64_to_i64`](./fn.u64_to_i64.html).
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn i64_to_u64(val: i64) -> u64 {
|
||||
(val as u64) ^ HIGHEST_BIT
|
||||
}
|
||||
|
||||
/// Reverse the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html).
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn u64_to_i64(val: u64) -> i64 {
|
||||
(val ^ HIGHEST_BIT) as i64
|
||||
}
|
||||
@@ -127,7 +77,7 @@ pub fn u64_to_i64(val: u64) -> i64 {
|
||||
///
|
||||
/// # See also
|
||||
/// The [reverse mapping is `u64_to_f64`](./fn.u64_to_f64.html).
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn f64_to_u64(val: f64) -> u64 {
|
||||
let bits = val.to_bits();
|
||||
if val.is_sign_positive() {
|
||||
@@ -138,7 +88,7 @@ pub fn f64_to_u64(val: f64) -> u64 {
|
||||
}
|
||||
|
||||
/// Reverse the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html).
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn u64_to_f64(val: u64) -> f64 {
|
||||
f64::from_bits(if val & HIGHEST_BIT != 0 {
|
||||
val ^ HIGHEST_BIT
|
||||
@@ -150,11 +100,12 @@ pub fn u64_to_f64(val: u64) -> f64 {
|
||||
#[cfg(test)]
|
||||
pub(crate) mod test {
|
||||
|
||||
pub use super::minmax;
|
||||
pub use super::serialize::test::fixed_size_test;
|
||||
use super::{compute_num_bits, f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
|
||||
use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
|
||||
use common::{BinarySerializable, FixedSize};
|
||||
use proptest::prelude::*;
|
||||
use std::f64;
|
||||
use tantivy_bitpacker::compute_num_bits;
|
||||
pub use tantivy_bitpacker::minmax;
|
||||
|
||||
fn test_i64_converter_helper(val: i64) {
|
||||
assert_eq!(u64_to_i64(i64_to_u64(val)), val);
|
||||
@@ -164,6 +115,12 @@ pub(crate) mod test {
|
||||
assert_eq!(u64_to_f64(f64_to_u64(val)), val);
|
||||
}
|
||||
|
||||
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
|
||||
let mut buffer = Vec::new();
|
||||
O::default().serialize(&mut buffer).unwrap();
|
||||
assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
|
||||
}
|
||||
|
||||
proptest! {
|
||||
#[test]
|
||||
fn test_f64_converter_monotonicity_proptest((left, right) in (proptest::num::f64::NORMAL, proptest::num::f64::NORMAL)) {
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use super::segment::Segment;
|
||||
use super::{segment::Segment, IndexSettings};
|
||||
use crate::core::Executor;
|
||||
use crate::core::IndexMeta;
|
||||
use crate::core::SegmentId;
|
||||
@@ -10,10 +10,10 @@ use crate::directory::ManagedDirectory;
|
||||
#[cfg(feature = "mmap")]
|
||||
use crate::directory::MmapDirectory;
|
||||
use crate::directory::INDEX_WRITER_LOCK;
|
||||
use crate::directory::{Directory, RAMDirectory};
|
||||
use crate::directory::{Directory, RamDirectory};
|
||||
use crate::error::DataCorruption;
|
||||
use crate::error::TantivyError;
|
||||
use crate::indexer::index_writer::HEAP_SIZE_MIN;
|
||||
use crate::indexer::index_writer::{HEAP_SIZE_MIN, MAX_NUM_THREAD};
|
||||
use crate::indexer::segment_updater::save_new_metas;
|
||||
use crate::reader::IndexReader;
|
||||
use crate::reader::IndexReaderBuilder;
|
||||
@@ -42,7 +42,7 @@ fn load_metas(
|
||||
"Meta file does not contain valid utf8 file.".to_string(),
|
||||
)
|
||||
})?;
|
||||
IndexMeta::deserialize(&meta_string, &inventory)
|
||||
IndexMeta::deserialize(&meta_string, inventory)
|
||||
.map_err(|e| {
|
||||
DataCorruption::new(
|
||||
META_FILEPATH.to_path_buf(),
|
||||
@@ -55,17 +55,146 @@ fn load_metas(
|
||||
.map_err(From::from)
|
||||
}
|
||||
|
||||
/// IndexBuilder can be used to create an index.
|
||||
///
|
||||
/// Use in conjunction with `SchemaBuilder`. Global index settings
|
||||
/// can be configured with `IndexSettings`
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```
|
||||
/// use tantivy::schema::*;
|
||||
/// use tantivy::{Index, IndexSettings, IndexSortByField, Order};
|
||||
///
|
||||
/// let mut schema_builder = Schema::builder();
|
||||
/// let id_field = schema_builder.add_text_field("id", STRING);
|
||||
/// let title_field = schema_builder.add_text_field("title", TEXT);
|
||||
/// let body_field = schema_builder.add_text_field("body", TEXT);
|
||||
/// let number_field = schema_builder.add_u64_field(
|
||||
/// "number",
|
||||
/// IntOptions::default().set_fast(Cardinality::SingleValue),
|
||||
/// );
|
||||
///
|
||||
/// let schema = schema_builder.build();
|
||||
/// let settings = IndexSettings{sort_by_field: Some(IndexSortByField{field:"number".to_string(), order:Order::Asc}), ..Default::default()};
|
||||
/// let index = Index::builder().schema(schema).settings(settings).create_in_ram();
|
||||
///
|
||||
/// ```
|
||||
pub struct IndexBuilder {
|
||||
schema: Option<Schema>,
|
||||
index_settings: IndexSettings,
|
||||
}
|
||||
impl Default for IndexBuilder {
|
||||
fn default() -> Self {
|
||||
IndexBuilder::new()
|
||||
}
|
||||
}
|
||||
impl IndexBuilder {
|
||||
/// Creates a new `IndexBuilder`
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
schema: None,
|
||||
index_settings: IndexSettings::default(),
|
||||
}
|
||||
}
|
||||
/// Set the settings
|
||||
pub fn settings(mut self, settings: IndexSettings) -> Self {
|
||||
self.index_settings = settings;
|
||||
self
|
||||
}
|
||||
/// Set the schema
|
||||
pub fn schema(mut self, schema: Schema) -> Self {
|
||||
self.schema = Some(schema);
|
||||
self
|
||||
}
|
||||
/// Creates a new index using the `RAMDirectory`.
|
||||
///
|
||||
/// The index will be allocated in anonymous memory.
|
||||
/// This should only be used for unit tests.
|
||||
pub fn create_in_ram(self) -> Result<Index, TantivyError> {
|
||||
let ram_directory = RamDirectory::create();
|
||||
Ok(self
|
||||
.create(ram_directory)
|
||||
.expect("Creating a RAMDirectory should never fail"))
|
||||
}
|
||||
/// Creates a new index in a given filepath.
|
||||
/// The index will use the `MMapDirectory`.
|
||||
///
|
||||
/// If a previous index was in this directory, then its meta file will be destroyed.
|
||||
#[cfg(feature = "mmap")]
|
||||
pub fn create_in_dir<P: AsRef<Path>>(self, directory_path: P) -> crate::Result<Index> {
|
||||
let mmap_directory = MmapDirectory::open(directory_path)?;
|
||||
if Index::exists(&mmap_directory)? {
|
||||
return Err(TantivyError::IndexAlreadyExists);
|
||||
}
|
||||
self.create(mmap_directory)
|
||||
}
|
||||
/// Creates a new index in a temp directory.
|
||||
///
|
||||
/// The index will use the `MMapDirectory` in a newly created directory.
|
||||
/// The temp directory will be destroyed automatically when the `Index` object
|
||||
/// is destroyed.
|
||||
///
|
||||
/// The temp directory is only used for testing the `MmapDirectory`.
|
||||
/// For other unit tests, prefer the `RAMDirectory`, see: `create_in_ram`.
|
||||
#[cfg(feature = "mmap")]
|
||||
pub fn create_from_tempdir(self) -> crate::Result<Index> {
|
||||
let mmap_directory = MmapDirectory::create_from_tempdir()?;
|
||||
self.create(mmap_directory)
|
||||
}
|
||||
fn get_expect_schema(&self) -> crate::Result<Schema> {
|
||||
self.schema
|
||||
.as_ref()
|
||||
.cloned()
|
||||
.ok_or(TantivyError::IndexBuilderMissingArgument("schema"))
|
||||
}
|
||||
/// Opens or creates a new index in the provided directory
|
||||
pub fn open_or_create<Dir: Directory>(self, dir: Dir) -> crate::Result<Index> {
|
||||
if !Index::exists(&dir)? {
|
||||
return self.create(dir);
|
||||
}
|
||||
let index = Index::open(dir)?;
|
||||
if index.schema() == self.get_expect_schema()? {
|
||||
Ok(index)
|
||||
} else {
|
||||
Err(TantivyError::SchemaError(
|
||||
"An index exists but the schema does not match.".to_string(),
|
||||
))
|
||||
}
|
||||
}
|
||||
/// Creates a new index given an implementation of the trait `Directory`.
|
||||
///
|
||||
/// If a directory previously existed, it will be erased.
|
||||
fn create<Dir: Directory>(self, dir: Dir) -> crate::Result<Index> {
|
||||
let directory = ManagedDirectory::wrap(dir)?;
|
||||
save_new_metas(
|
||||
self.get_expect_schema()?,
|
||||
self.index_settings.clone(),
|
||||
&directory,
|
||||
)?;
|
||||
let mut metas = IndexMeta::with_schema(self.get_expect_schema()?);
|
||||
metas.index_settings = self.index_settings;
|
||||
let index = Index::open_from_metas(directory, &metas, SegmentMetaInventory::default());
|
||||
Ok(index)
|
||||
}
|
||||
}
|
||||
|
||||
/// Search Index
|
||||
#[derive(Clone)]
|
||||
pub struct Index {
|
||||
directory: ManagedDirectory,
|
||||
schema: Schema,
|
||||
settings: IndexSettings,
|
||||
executor: Arc<Executor>,
|
||||
tokenizers: TokenizerManager,
|
||||
inventory: SegmentMetaInventory,
|
||||
}
|
||||
|
||||
impl Index {
|
||||
/// Creates a new builder.
|
||||
pub fn builder() -> IndexBuilder {
|
||||
IndexBuilder::new()
|
||||
}
|
||||
/// Examines the directory to see if it contains an index.
|
||||
///
|
||||
/// Effectively, it only checks for the presence of the `meta.json` file.
|
||||
@@ -97,13 +226,12 @@ impl Index {
|
||||
self.set_multithread_executor(default_num_threads)
|
||||
}
|
||||
|
||||
/// Creates a new index using the `RAMDirectory`.
|
||||
/// Creates a new index using the `RamDirectory`.
|
||||
///
|
||||
/// The index will be allocated in anonymous memory.
|
||||
/// This should only be used for unit tests.
|
||||
pub fn create_in_ram(schema: Schema) -> Index {
|
||||
let ram_directory = RAMDirectory::create();
|
||||
Index::create(ram_directory, schema).expect("Creating a RAMDirectory should never fail")
|
||||
IndexBuilder::new().schema(schema).create_in_ram().unwrap()
|
||||
}
|
||||
|
||||
/// Creates a new index in a given filepath.
|
||||
@@ -115,26 +243,14 @@ impl Index {
|
||||
directory_path: P,
|
||||
schema: Schema,
|
||||
) -> crate::Result<Index> {
|
||||
let mmap_directory = MmapDirectory::open(directory_path)?;
|
||||
if Index::exists(&mmap_directory)? {
|
||||
return Err(TantivyError::IndexAlreadyExists);
|
||||
}
|
||||
Index::create(mmap_directory, schema)
|
||||
IndexBuilder::new()
|
||||
.schema(schema)
|
||||
.create_in_dir(directory_path)
|
||||
}
|
||||
|
||||
/// Opens or creates a new index in the provided directory
|
||||
pub fn open_or_create<Dir: Directory>(dir: Dir, schema: Schema) -> crate::Result<Index> {
|
||||
if !Index::exists(&dir)? {
|
||||
return Index::create(dir, schema);
|
||||
}
|
||||
let index = Index::open(dir)?;
|
||||
if index.schema() == schema {
|
||||
Ok(index)
|
||||
} else {
|
||||
Err(TantivyError::SchemaError(
|
||||
"An index exists but the schema does not match.".to_string(),
|
||||
))
|
||||
}
|
||||
IndexBuilder::new().schema(schema).open_or_create(dir)
|
||||
}
|
||||
|
||||
/// Creates a new index in a temp directory.
|
||||
@@ -144,39 +260,34 @@ impl Index {
|
||||
/// is destroyed.
|
||||
///
|
||||
/// The temp directory is only used for testing the `MmapDirectory`.
|
||||
/// For other unit tests, prefer the `RAMDirectory`, see: `create_in_ram`.
|
||||
/// For other unit tests, prefer the `RamDirectory`, see: `create_in_ram`.
|
||||
#[cfg(feature = "mmap")]
|
||||
pub fn create_from_tempdir(schema: Schema) -> crate::Result<Index> {
|
||||
let mmap_directory = MmapDirectory::create_from_tempdir()?;
|
||||
Index::create(mmap_directory, schema)
|
||||
IndexBuilder::new().schema(schema).create_from_tempdir()
|
||||
}
|
||||
|
||||
/// Creates a new index given an implementation of the trait `Directory`.
|
||||
///
|
||||
/// If a directory previously existed, it will be erased.
|
||||
pub fn create<Dir: Directory>(dir: Dir, schema: Schema) -> crate::Result<Index> {
|
||||
let directory = ManagedDirectory::wrap(dir)?;
|
||||
Index::from_directory(directory, schema)
|
||||
}
|
||||
|
||||
/// Create a new index from a directory.
|
||||
///
|
||||
/// This will overwrite existing meta.json
|
||||
fn from_directory(directory: ManagedDirectory, schema: Schema) -> crate::Result<Index> {
|
||||
save_new_metas(schema.clone(), &directory)?;
|
||||
let metas = IndexMeta::with_schema(schema);
|
||||
let index = Index::create_from_metas(directory, &metas, SegmentMetaInventory::default());
|
||||
Ok(index)
|
||||
pub fn create<Dir: Directory>(
|
||||
dir: Dir,
|
||||
schema: Schema,
|
||||
settings: IndexSettings,
|
||||
) -> crate::Result<Index> {
|
||||
let mut builder = IndexBuilder::new().schema(schema);
|
||||
builder = builder.settings(settings);
|
||||
builder.create(dir)
|
||||
}
|
||||
|
||||
/// Creates a new index given a directory and an `IndexMeta`.
|
||||
fn create_from_metas(
|
||||
fn open_from_metas(
|
||||
directory: ManagedDirectory,
|
||||
metas: &IndexMeta,
|
||||
inventory: SegmentMetaInventory,
|
||||
) -> Index {
|
||||
let schema = metas.schema.clone();
|
||||
Index {
|
||||
settings: metas.index_settings.clone(),
|
||||
directory,
|
||||
schema,
|
||||
tokenizers: TokenizerManager::default(),
|
||||
@@ -239,7 +350,7 @@ impl Index {
|
||||
/// Such segments can of course be part of the index,
|
||||
/// but also they could be segments being currently built or in the middle of a merge
|
||||
/// operation.
|
||||
pub fn list_all_segment_metas(&self) -> Vec<SegmentMeta> {
|
||||
pub(crate) fn list_all_segment_metas(&self) -> Vec<SegmentMeta> {
|
||||
self.inventory.all()
|
||||
}
|
||||
|
||||
@@ -257,7 +368,7 @@ impl Index {
|
||||
let directory = ManagedDirectory::wrap(directory)?;
|
||||
let inventory = SegmentMetaInventory::default();
|
||||
let metas = load_metas(&directory, &inventory)?;
|
||||
let index = Index::create_from_metas(directory, &metas, inventory);
|
||||
let index = Index::open_from_metas(directory, &metas, inventory);
|
||||
Ok(index)
|
||||
}
|
||||
|
||||
@@ -282,7 +393,7 @@ impl Index {
|
||||
/// Each thread will receive a budget of `overall_heap_size_in_bytes / num_threads`.
|
||||
///
|
||||
/// # Errors
|
||||
/// If the lockfile already exists, returns `Error::DirectoryLockBusy` or an `Error::IOError`.
|
||||
/// If the lockfile already exists, returns `Error::DirectoryLockBusy` or an `Error::IoError`.
|
||||
///
|
||||
/// # Panics
|
||||
/// If the heap size per thread is too small, panics.
|
||||
@@ -317,7 +428,7 @@ impl Index {
|
||||
|
||||
/// Helper to create an index writer for tests.
|
||||
///
|
||||
/// That index writer only simply has a single thread and a heap of 5 MB.
|
||||
/// That index writer only simply has a single thread and a heap of 10 MB.
|
||||
/// Using a single thread gives us a deterministic allocation of DocId.
|
||||
#[cfg(test)]
|
||||
pub fn writer_for_tests(&self) -> crate::Result<IndexWriter> {
|
||||
@@ -326,7 +437,8 @@ impl Index {
|
||||
|
||||
/// Creates a multithreaded writer
|
||||
///
|
||||
/// Tantivy will automatically define the number of threads to use.
|
||||
/// Tantivy will automatically define the number of threads to use, but
|
||||
/// no more than [`MAX_NUM_THREAD`] threads.
|
||||
/// `overall_heap_size_in_bytes` is the total target memory usage that will be split
|
||||
/// between a given number of threads.
|
||||
///
|
||||
@@ -335,7 +447,7 @@ impl Index {
|
||||
/// # Panics
|
||||
/// If the heap size per thread is too small, panics.
|
||||
pub fn writer(&self, overall_heap_size_in_bytes: usize) -> crate::Result<IndexWriter> {
|
||||
let mut num_threads = num_cpus::get();
|
||||
let mut num_threads = std::cmp::min(num_cpus::get(), MAX_NUM_THREAD);
|
||||
let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
|
||||
if heap_size_in_bytes_per_thread < HEAP_SIZE_MIN {
|
||||
num_threads = (overall_heap_size_in_bytes / HEAP_SIZE_MIN).max(1);
|
||||
@@ -343,6 +455,18 @@ impl Index {
|
||||
self.writer_with_num_threads(num_threads, overall_heap_size_in_bytes)
|
||||
}
|
||||
|
||||
/// Accessor to the index settings
|
||||
///
|
||||
pub fn settings(&self) -> &IndexSettings {
|
||||
&self.settings
|
||||
}
|
||||
|
||||
/// Accessor to the index settings
|
||||
///
|
||||
pub fn settings_mut(&mut self) -> &mut IndexSettings {
|
||||
&mut self.settings
|
||||
}
|
||||
|
||||
/// Accessor to the index schema
|
||||
///
|
||||
/// The schema is actually cloned.
|
||||
@@ -411,11 +535,14 @@ impl fmt::Debug for Index {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::directory::{RAMDirectory, WatchCallback};
|
||||
use crate::schema::Field;
|
||||
use crate::schema::{Schema, INDEXED, TEXT};
|
||||
use crate::IndexReader;
|
||||
use crate::ReloadPolicy;
|
||||
use crate::{
|
||||
directory::{RamDirectory, WatchCallback},
|
||||
IndexSettings,
|
||||
};
|
||||
use crate::{Directory, Index};
|
||||
|
||||
#[test]
|
||||
@@ -434,15 +561,20 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_index_exists() {
|
||||
let directory = RAMDirectory::create();
|
||||
let directory = RamDirectory::create();
|
||||
assert!(!Index::exists(&directory).unwrap());
|
||||
assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
|
||||
assert!(Index::create(
|
||||
directory.clone(),
|
||||
throw_away_schema(),
|
||||
IndexSettings::default()
|
||||
)
|
||||
.is_ok());
|
||||
assert!(Index::exists(&directory).unwrap());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn open_or_create_should_create() {
|
||||
let directory = RAMDirectory::create();
|
||||
let directory = RamDirectory::create();
|
||||
assert!(!Index::exists(&directory).unwrap());
|
||||
assert!(Index::open_or_create(directory.clone(), throw_away_schema()).is_ok());
|
||||
assert!(Index::exists(&directory).unwrap());
|
||||
@@ -450,24 +582,44 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn open_or_create_should_open() {
|
||||
let directory = RAMDirectory::create();
|
||||
assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
|
||||
let directory = RamDirectory::create();
|
||||
assert!(Index::create(
|
||||
directory.clone(),
|
||||
throw_away_schema(),
|
||||
IndexSettings::default()
|
||||
)
|
||||
.is_ok());
|
||||
assert!(Index::exists(&directory).unwrap());
|
||||
assert!(Index::open_or_create(directory, throw_away_schema()).is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn create_should_wipeoff_existing() {
|
||||
let directory = RAMDirectory::create();
|
||||
assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
|
||||
let directory = RamDirectory::create();
|
||||
assert!(Index::create(
|
||||
directory.clone(),
|
||||
throw_away_schema(),
|
||||
IndexSettings::default()
|
||||
)
|
||||
.is_ok());
|
||||
assert!(Index::exists(&directory).unwrap());
|
||||
assert!(Index::create(directory.clone(), Schema::builder().build()).is_ok());
|
||||
assert!(Index::create(
|
||||
directory,
|
||||
Schema::builder().build(),
|
||||
IndexSettings::default()
|
||||
)
|
||||
.is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn open_or_create_exists_but_schema_does_not_match() {
|
||||
let directory = RAMDirectory::create();
|
||||
assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
|
||||
let directory = RamDirectory::create();
|
||||
assert!(Index::create(
|
||||
directory.clone(),
|
||||
throw_away_schema(),
|
||||
IndexSettings::default()
|
||||
)
|
||||
.is_ok());
|
||||
assert!(Index::exists(&directory).unwrap());
|
||||
assert!(Index::open_or_create(directory.clone(), throw_away_schema()).is_ok());
|
||||
let err = Index::open_or_create(directory, Schema::builder().build());
|
||||
@@ -599,10 +751,10 @@ mod tests {
|
||||
#[cfg(not(target_os = "windows"))]
|
||||
#[test]
|
||||
fn garbage_collect_works_as_intended() {
|
||||
let directory = RAMDirectory::create();
|
||||
let directory = RamDirectory::create();
|
||||
let schema = throw_away_schema();
|
||||
let field = schema.get_field("num_likes").unwrap();
|
||||
let index = Index::create(directory.clone(), schema).unwrap();
|
||||
let index = Index::create(directory.clone(), schema, IndexSettings::default()).unwrap();
|
||||
|
||||
let mut writer = index.writer_with_num_threads(8, 24_000_000).unwrap();
|
||||
for i in 0u64..8_000u64 {
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
use super::SegmentComponent;
|
||||
use crate::core::SegmentId;
|
||||
use crate::schema::Schema;
|
||||
use crate::Opstamp;
|
||||
use crate::{core::SegmentId, store::Compressor};
|
||||
use census::{Inventory, TrackedObject};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashSet;
|
||||
use std::fmt;
|
||||
use std::path::PathBuf;
|
||||
use std::{collections::HashSet, sync::atomic::AtomicBool};
|
||||
use std::{fmt, sync::Arc};
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
struct DeleteMeta {
|
||||
@@ -33,6 +33,7 @@ impl SegmentMetaInventory {
|
||||
let inner = InnerSegmentMeta {
|
||||
segment_id,
|
||||
max_doc,
|
||||
include_temp_doc_store: Arc::new(AtomicBool::new(true)),
|
||||
deletes: None,
|
||||
};
|
||||
SegmentMeta::from(self.inventory.track(inner))
|
||||
@@ -80,6 +81,15 @@ impl SegmentMeta {
|
||||
self.tracked.segment_id
|
||||
}
|
||||
|
||||
/// Removes the Component::TempStore from the alive list and
|
||||
/// therefore marks the temp docstore file to be deleted by
|
||||
/// the garbage collection.
|
||||
pub fn untrack_temp_docstore(&self) {
|
||||
self.tracked
|
||||
.include_temp_doc_store
|
||||
.store(false, std::sync::atomic::Ordering::Relaxed);
|
||||
}
|
||||
|
||||
/// Returns the number of deleted documents.
|
||||
pub fn num_deleted_docs(&self) -> u32 {
|
||||
self.tracked
|
||||
@@ -96,9 +106,20 @@ impl SegmentMeta {
|
||||
/// is by removing all files that have been created by tantivy
|
||||
/// and are not used by any segment anymore.
|
||||
pub fn list_files(&self) -> HashSet<PathBuf> {
|
||||
SegmentComponent::iterator()
|
||||
.map(|component| self.relative_path(*component))
|
||||
.collect::<HashSet<PathBuf>>()
|
||||
if self
|
||||
.tracked
|
||||
.include_temp_doc_store
|
||||
.load(std::sync::atomic::Ordering::Relaxed)
|
||||
{
|
||||
SegmentComponent::iterator()
|
||||
.map(|component| self.relative_path(*component))
|
||||
.collect::<HashSet<PathBuf>>()
|
||||
} else {
|
||||
SegmentComponent::iterator()
|
||||
.filter(|comp| *comp != &SegmentComponent::TempStore)
|
||||
.map(|component| self.relative_path(*component))
|
||||
.collect::<HashSet<PathBuf>>()
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the relative path of a component of our segment.
|
||||
@@ -108,14 +129,14 @@ impl SegmentMeta {
|
||||
pub fn relative_path(&self, component: SegmentComponent) -> PathBuf {
|
||||
let mut path = self.id().uuid_string();
|
||||
path.push_str(&*match component {
|
||||
SegmentComponent::POSTINGS => ".idx".to_string(),
|
||||
SegmentComponent::POSITIONS => ".pos".to_string(),
|
||||
SegmentComponent::POSITIONSSKIP => ".posidx".to_string(),
|
||||
SegmentComponent::TERMS => ".term".to_string(),
|
||||
SegmentComponent::STORE => ".store".to_string(),
|
||||
SegmentComponent::FASTFIELDS => ".fast".to_string(),
|
||||
SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(),
|
||||
SegmentComponent::DELETE => format!(".{}.del", self.delete_opstamp().unwrap_or(0)),
|
||||
SegmentComponent::Postings => ".idx".to_string(),
|
||||
SegmentComponent::Positions => ".pos".to_string(),
|
||||
SegmentComponent::Terms => ".term".to_string(),
|
||||
SegmentComponent::Store => ".store".to_string(),
|
||||
SegmentComponent::TempStore => ".store.temp".to_string(),
|
||||
SegmentComponent::FastFields => ".fast".to_string(),
|
||||
SegmentComponent::FieldNorms => ".fieldnorm".to_string(),
|
||||
SegmentComponent::Delete => format!(".{}.del", self.delete_opstamp().unwrap_or(0)),
|
||||
});
|
||||
PathBuf::from(path)
|
||||
}
|
||||
@@ -160,6 +181,7 @@ impl SegmentMeta {
|
||||
segment_id: inner_meta.segment_id,
|
||||
max_doc,
|
||||
deletes: None,
|
||||
include_temp_doc_store: Arc::new(AtomicBool::new(true)),
|
||||
});
|
||||
SegmentMeta { tracked }
|
||||
}
|
||||
@@ -173,6 +195,7 @@ impl SegmentMeta {
|
||||
let tracked = self.tracked.map(move |inner_meta| InnerSegmentMeta {
|
||||
segment_id: inner_meta.segment_id,
|
||||
max_doc: inner_meta.max_doc,
|
||||
include_temp_doc_store: Arc::new(AtomicBool::new(true)),
|
||||
deletes: Some(delete_meta),
|
||||
});
|
||||
SegmentMeta { tracked }
|
||||
@@ -184,6 +207,14 @@ struct InnerSegmentMeta {
|
||||
segment_id: SegmentId,
|
||||
max_doc: u32,
|
||||
deletes: Option<DeleteMeta>,
|
||||
/// If you want to avoid the SegmentComponent::TempStore file to be covered by
|
||||
/// garbage collection and deleted, set this to true. This is used during merge.
|
||||
#[serde(skip)]
|
||||
#[serde(default = "default_temp_store")]
|
||||
pub(crate) include_temp_doc_store: Arc<AtomicBool>,
|
||||
}
|
||||
fn default_temp_store() -> Arc<AtomicBool> {
|
||||
Arc::new(AtomicBool::new(false))
|
||||
}
|
||||
|
||||
impl InnerSegmentMeta {
|
||||
@@ -194,6 +225,51 @@ impl InnerSegmentMeta {
|
||||
}
|
||||
}
|
||||
|
||||
/// Search Index Settings.
|
||||
///
|
||||
/// Contains settings which are applied on the whole
|
||||
/// index, like presort documents.
|
||||
#[derive(Clone, Default, Serialize, Deserialize, Eq, PartialEq)]
|
||||
pub struct IndexSettings {
|
||||
/// Sorts the documents by information
|
||||
/// provided in `IndexSortByField`
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub sort_by_field: Option<IndexSortByField>,
|
||||
/// The `Compressor` used to compress the doc store.
|
||||
#[serde(default)]
|
||||
pub docstore_compression: Compressor,
|
||||
}
|
||||
/// Settings to presort the documents in an index
|
||||
///
|
||||
/// Presorting documents can greatly performance
|
||||
/// in some scenarios, by applying top n
|
||||
/// optimizations.
|
||||
#[derive(Clone, Serialize, Deserialize, Eq, PartialEq)]
|
||||
pub struct IndexSortByField {
|
||||
/// The field to sort the documents by
|
||||
pub field: String,
|
||||
/// The order to sort the documents by
|
||||
pub order: Order,
|
||||
}
|
||||
/// The order to sort by
|
||||
#[derive(Clone, Serialize, Deserialize, Eq, PartialEq)]
|
||||
pub enum Order {
|
||||
/// Ascending Order
|
||||
Asc,
|
||||
/// Descending Order
|
||||
Desc,
|
||||
}
|
||||
impl Order {
|
||||
/// return if the Order is ascending
|
||||
pub fn is_asc(&self) -> bool {
|
||||
self == &Order::Asc
|
||||
}
|
||||
/// return if the Order is descending
|
||||
pub fn is_desc(&self) -> bool {
|
||||
self == &Order::Desc
|
||||
}
|
||||
}
|
||||
|
||||
/// Meta information about the `Index`.
|
||||
///
|
||||
/// This object is serialized on disk in the `meta.json` file.
|
||||
@@ -204,6 +280,9 @@ impl InnerSegmentMeta {
|
||||
///
|
||||
#[derive(Clone, Serialize)]
|
||||
pub struct IndexMeta {
|
||||
/// `IndexSettings` to configure index options.
|
||||
#[serde(default)]
|
||||
pub index_settings: IndexSettings,
|
||||
/// List of `SegmentMeta` informations associated to each finalized segment of the index.
|
||||
pub segments: Vec<SegmentMeta>,
|
||||
/// Index `Schema`
|
||||
@@ -222,6 +301,8 @@ pub struct IndexMeta {
|
||||
#[derive(Deserialize)]
|
||||
struct UntrackedIndexMeta {
|
||||
pub segments: Vec<InnerSegmentMeta>,
|
||||
#[serde(default)]
|
||||
pub index_settings: IndexSettings,
|
||||
pub schema: Schema,
|
||||
pub opstamp: Opstamp,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
@@ -231,6 +312,7 @@ struct UntrackedIndexMeta {
|
||||
impl UntrackedIndexMeta {
|
||||
pub fn track(self, inventory: &SegmentMetaInventory) -> IndexMeta {
|
||||
IndexMeta {
|
||||
index_settings: self.index_settings,
|
||||
segments: self
|
||||
.segments
|
||||
.into_iter()
|
||||
@@ -251,6 +333,7 @@ impl IndexMeta {
|
||||
/// Opstamp will the value `0u64`.
|
||||
pub fn with_schema(schema: Schema) -> IndexMeta {
|
||||
IndexMeta {
|
||||
index_settings: IndexSettings::default(),
|
||||
segments: vec![],
|
||||
schema,
|
||||
opstamp: 0u64,
|
||||
@@ -282,8 +365,10 @@ impl fmt::Debug for IndexMeta {
|
||||
mod tests {
|
||||
|
||||
use super::IndexMeta;
|
||||
use crate::schema::{Schema, TEXT};
|
||||
use serde_json;
|
||||
use crate::{
|
||||
schema::{Schema, TEXT},
|
||||
IndexSettings, IndexSortByField, Order,
|
||||
};
|
||||
|
||||
#[test]
|
||||
fn test_serialize_metas() {
|
||||
@@ -293,6 +378,13 @@ mod tests {
|
||||
schema_builder.build()
|
||||
};
|
||||
let index_metas = IndexMeta {
|
||||
index_settings: IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "text".to_string(),
|
||||
order: Order::Asc,
|
||||
}),
|
||||
..Default::default()
|
||||
},
|
||||
segments: Vec::new(),
|
||||
schema,
|
||||
opstamp: 0u64,
|
||||
@@ -301,7 +393,7 @@ mod tests {
|
||||
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
|
||||
assert_eq!(
|
||||
json,
|
||||
r#"{"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","tokenizer":"default"},"stored":false}}],"opstamp":0}"#
|
||||
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","tokenizer":"default"},"stored":false}}],"opstamp":0}"#
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -26,7 +26,6 @@ pub struct InvertedIndexReader {
|
||||
termdict: TermDictionary,
|
||||
postings_file_slice: FileSlice,
|
||||
positions_file_slice: FileSlice,
|
||||
positions_idx_file_slice: FileSlice,
|
||||
record_option: IndexRecordOption,
|
||||
total_num_tokens: u64,
|
||||
}
|
||||
@@ -37,7 +36,6 @@ impl InvertedIndexReader {
|
||||
termdict: TermDictionary,
|
||||
postings_file_slice: FileSlice,
|
||||
positions_file_slice: FileSlice,
|
||||
positions_idx_file_slice: FileSlice,
|
||||
record_option: IndexRecordOption,
|
||||
) -> io::Result<InvertedIndexReader> {
|
||||
let (total_num_tokens_slice, postings_body) = postings_file_slice.split(8);
|
||||
@@ -46,7 +44,6 @@ impl InvertedIndexReader {
|
||||
termdict,
|
||||
postings_file_slice: postings_body,
|
||||
positions_file_slice,
|
||||
positions_idx_file_slice,
|
||||
record_option,
|
||||
total_num_tokens,
|
||||
})
|
||||
@@ -59,7 +56,6 @@ impl InvertedIndexReader {
|
||||
termdict: TermDictionary::empty(),
|
||||
postings_file_slice: FileSlice::empty(),
|
||||
positions_file_slice: FileSlice::empty(),
|
||||
positions_idx_file_slice: FileSlice::empty(),
|
||||
record_option,
|
||||
total_num_tokens: 0u64,
|
||||
}
|
||||
@@ -141,12 +137,12 @@ impl InvertedIndexReader {
|
||||
option: IndexRecordOption,
|
||||
) -> io::Result<SegmentPostings> {
|
||||
let block_postings = self.read_block_postings_from_terminfo(term_info, option)?;
|
||||
let position_stream = {
|
||||
let position_reader = {
|
||||
if option.has_positions() {
|
||||
let position_reader = self.positions_file_slice.clone();
|
||||
let skip_reader = self.positions_idx_file_slice.clone();
|
||||
let position_reader =
|
||||
PositionReader::new(position_reader, skip_reader, term_info.positions_idx)?;
|
||||
let positions_data = self
|
||||
.positions_file_slice
|
||||
.read_bytes_slice(term_info.positions_range.clone())?;
|
||||
let position_reader = PositionReader::open(positions_data)?;
|
||||
Some(position_reader)
|
||||
} else {
|
||||
None
|
||||
@@ -154,7 +150,7 @@ impl InvertedIndexReader {
|
||||
};
|
||||
Ok(SegmentPostings::from_block_postings(
|
||||
block_postings,
|
||||
position_stream,
|
||||
position_reader,
|
||||
))
|
||||
}
|
||||
|
||||
|
||||
@@ -9,12 +9,13 @@ mod segment_id;
|
||||
mod segment_reader;
|
||||
|
||||
pub use self::executor::Executor;
|
||||
pub use self::index::Index;
|
||||
pub use self::index_meta::{IndexMeta, SegmentMeta, SegmentMetaInventory};
|
||||
pub use self::index::{Index, IndexBuilder};
|
||||
pub use self::index_meta::{
|
||||
IndexMeta, IndexSettings, IndexSortByField, Order, SegmentMeta, SegmentMetaInventory,
|
||||
};
|
||||
pub use self::inverted_index_reader::InvertedIndexReader;
|
||||
pub use self::searcher::Searcher;
|
||||
pub use self::segment::Segment;
|
||||
pub use self::segment::SerializableSegment;
|
||||
pub use self::segment_component::SegmentComponent;
|
||||
pub use self::segment_id::SegmentId;
|
||||
pub use self::segment_reader::SegmentReader;
|
||||
|
||||
@@ -54,9 +54,8 @@ impl Searcher {
|
||||
/// The searcher uses the segment ordinal to route the
|
||||
/// the request to the right `Segment`.
|
||||
pub fn doc(&self, doc_address: DocAddress) -> crate::Result<Document> {
|
||||
let DocAddress(segment_local_id, doc_id) = doc_address;
|
||||
let store_reader = &self.store_readers[segment_local_id as usize];
|
||||
store_reader.get(doc_id)
|
||||
let store_reader = &self.store_readers[doc_address.segment_ord as usize];
|
||||
store_reader.get(doc_address.doc_id)
|
||||
}
|
||||
|
||||
/// Access the schema associated to the index of this searcher.
|
||||
|
||||
@@ -5,7 +5,6 @@ use crate::core::SegmentMeta;
|
||||
use crate::directory::error::{OpenReadError, OpenWriteError};
|
||||
use crate::directory::Directory;
|
||||
use crate::directory::{FileSlice, WritePtr};
|
||||
use crate::indexer::segment_serializer::SegmentSerializer;
|
||||
use crate::schema::Schema;
|
||||
use crate::Opstamp;
|
||||
use std::fmt;
|
||||
@@ -90,12 +89,3 @@ impl Segment {
|
||||
Ok(write)
|
||||
}
|
||||
}
|
||||
|
||||
pub trait SerializableSegment {
|
||||
/// Writes a view of a segment by pushing information
|
||||
/// to the `SegmentSerializer`.
|
||||
///
|
||||
/// # Returns
|
||||
/// The number of documents in the segment.
|
||||
fn write(&self, serializer: SegmentSerializer) -> crate::Result<u32>;
|
||||
}
|
||||
|
||||
@@ -4,42 +4,42 @@ use std::slice;
|
||||
/// Each component is stored in its own file,
|
||||
/// using the pattern `segment_uuid`.`component_extension`,
|
||||
/// except the delete component that takes an `segment_uuid`.`delete_opstamp`.`component_extension`
|
||||
#[derive(Copy, Clone)]
|
||||
#[derive(Copy, Clone, Eq, PartialEq)]
|
||||
pub enum SegmentComponent {
|
||||
/// Postings (or inverted list). Sorted lists of document ids, associated to terms
|
||||
POSTINGS,
|
||||
Postings,
|
||||
/// Positions of terms in each document.
|
||||
POSITIONS,
|
||||
/// Index to seek within the position file
|
||||
POSITIONSSKIP,
|
||||
Positions,
|
||||
/// Column-oriented random-access storage of fields.
|
||||
FASTFIELDS,
|
||||
FastFields,
|
||||
/// Stores the sum of the length (in terms) of each field for each document.
|
||||
/// Field norms are stored as a special u64 fast field.
|
||||
FIELDNORMS,
|
||||
FieldNorms,
|
||||
/// Dictionary associating `Term`s to `TermInfo`s which is
|
||||
/// simply an address into the `postings` file and the `positions` file.
|
||||
TERMS,
|
||||
Terms,
|
||||
/// Row-oriented, compressed storage of the documents.
|
||||
/// Accessing a document from the store is relatively slow, as it
|
||||
/// requires to decompress the entire block it belongs to.
|
||||
STORE,
|
||||
Store,
|
||||
/// Temporary storage of the documents, before streamed to `Store`.
|
||||
TempStore,
|
||||
/// Bitset describing which document of the segment is deleted.
|
||||
DELETE,
|
||||
Delete,
|
||||
}
|
||||
|
||||
impl SegmentComponent {
|
||||
/// Iterates through the components.
|
||||
pub fn iterator() -> slice::Iter<'static, SegmentComponent> {
|
||||
static SEGMENT_COMPONENTS: [SegmentComponent; 8] = [
|
||||
SegmentComponent::POSTINGS,
|
||||
SegmentComponent::POSITIONS,
|
||||
SegmentComponent::POSITIONSSKIP,
|
||||
SegmentComponent::FASTFIELDS,
|
||||
SegmentComponent::FIELDNORMS,
|
||||
SegmentComponent::TERMS,
|
||||
SegmentComponent::STORE,
|
||||
SegmentComponent::DELETE,
|
||||
SegmentComponent::Postings,
|
||||
SegmentComponent::Positions,
|
||||
SegmentComponent::FastFields,
|
||||
SegmentComponent::FieldNorms,
|
||||
SegmentComponent::Terms,
|
||||
SegmentComponent::Store,
|
||||
SegmentComponent::TempStore,
|
||||
SegmentComponent::Delete,
|
||||
];
|
||||
SEGMENT_COMPONENTS.iter()
|
||||
}
|
||||
|
||||
@@ -22,7 +22,7 @@ use std::sync::atomic;
|
||||
pub struct SegmentId(Uuid);
|
||||
|
||||
#[cfg(test)]
|
||||
static AUTO_INC_COUNTER: Lazy<atomic::AtomicUsize> = Lazy::new(|| atomic::AtomicUsize::default());
|
||||
static AUTO_INC_COUNTER: Lazy<atomic::AtomicUsize> = Lazy::new(atomic::AtomicUsize::default);
|
||||
|
||||
#[cfg(test)]
|
||||
const ZERO_ARRAY: [u8; 8] = [0u8; 8];
|
||||
@@ -108,6 +108,12 @@ impl fmt::Debug for SegmentId {
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for SegmentId {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "Seg({:?})", self.short_uuid_string())
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for SegmentId {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
use crate::common::HasLen;
|
||||
use crate::core::InvertedIndexReader;
|
||||
use crate::core::Segment;
|
||||
use crate::core::SegmentComponent;
|
||||
@@ -32,9 +31,6 @@ use std::{collections::HashMap, io};
|
||||
///
|
||||
/// The segment reader has a very low memory footprint,
|
||||
/// as close to all of the memory data is mmapped.
|
||||
///
|
||||
///
|
||||
/// TODO fix not decoding docfreq
|
||||
#[derive(Clone)]
|
||||
pub struct SegmentReader {
|
||||
inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<InvertedIndexReader>>>>,
|
||||
@@ -46,7 +42,6 @@ pub struct SegmentReader {
|
||||
termdict_composite: CompositeFile,
|
||||
postings_composite: CompositeFile,
|
||||
positions_composite: CompositeFile,
|
||||
positions_idx_composite: CompositeFile,
|
||||
fast_fields_readers: Arc<FastFieldReaders>,
|
||||
fieldnorm_readers: FieldNormReaders,
|
||||
|
||||
@@ -58,17 +53,12 @@ pub struct SegmentReader {
|
||||
impl SegmentReader {
|
||||
/// Returns the highest document id ever attributed in
|
||||
/// this segment + 1.
|
||||
/// Today, `tantivy` does not handle deletes, so it happens
|
||||
/// to also be the number of documents in the index.
|
||||
pub fn max_doc(&self) -> DocId {
|
||||
self.max_doc
|
||||
}
|
||||
|
||||
/// Returns the number of documents.
|
||||
/// Returns the number of alive documents.
|
||||
/// Deleted documents are not counted.
|
||||
///
|
||||
/// Today, `tantivy` does not handle deletes so max doc and
|
||||
/// num_docs are the same.
|
||||
pub fn num_docs(&self) -> DocId {
|
||||
self.num_docs
|
||||
}
|
||||
@@ -82,7 +72,7 @@ impl SegmentReader {
|
||||
/// deleted in the segment.
|
||||
pub fn num_deleted_docs(&self) -> DocId {
|
||||
self.delete_bitset()
|
||||
.map(|delete_set| delete_set.len() as DocId)
|
||||
.map(|delete_set| delete_set.num_deleted() as DocId)
|
||||
.unwrap_or(0u32)
|
||||
}
|
||||
|
||||
@@ -151,44 +141,36 @@ impl SegmentReader {
|
||||
|
||||
/// Open a new segment for reading.
|
||||
pub fn open(segment: &Segment) -> crate::Result<SegmentReader> {
|
||||
let termdict_file = segment.open_read(SegmentComponent::TERMS)?;
|
||||
let termdict_file = segment.open_read(SegmentComponent::Terms)?;
|
||||
let termdict_composite = CompositeFile::open(&termdict_file)?;
|
||||
|
||||
let store_file = segment.open_read(SegmentComponent::STORE)?;
|
||||
let store_file = segment.open_read(SegmentComponent::Store)?;
|
||||
|
||||
fail_point!("SegmentReader::open#middle");
|
||||
|
||||
let postings_file = segment.open_read(SegmentComponent::POSTINGS)?;
|
||||
let postings_file = segment.open_read(SegmentComponent::Postings)?;
|
||||
let postings_composite = CompositeFile::open(&postings_file)?;
|
||||
|
||||
let positions_composite = {
|
||||
if let Ok(positions_file) = segment.open_read(SegmentComponent::POSITIONS) {
|
||||
if let Ok(positions_file) = segment.open_read(SegmentComponent::Positions) {
|
||||
CompositeFile::open(&positions_file)?
|
||||
} else {
|
||||
CompositeFile::empty()
|
||||
}
|
||||
};
|
||||
|
||||
let positions_idx_composite = {
|
||||
if let Ok(positions_skip_file) = segment.open_read(SegmentComponent::POSITIONSSKIP) {
|
||||
CompositeFile::open(&positions_skip_file)?
|
||||
} else {
|
||||
CompositeFile::empty()
|
||||
}
|
||||
};
|
||||
|
||||
let schema = segment.schema();
|
||||
|
||||
let fast_fields_data = segment.open_read(SegmentComponent::FASTFIELDS)?;
|
||||
let fast_fields_data = segment.open_read(SegmentComponent::FastFields)?;
|
||||
let fast_fields_composite = CompositeFile::open(&fast_fields_data)?;
|
||||
let fast_field_readers =
|
||||
Arc::new(FastFieldReaders::new(schema.clone(), fast_fields_composite));
|
||||
|
||||
let fieldnorm_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
|
||||
let fieldnorm_data = segment.open_read(SegmentComponent::FieldNorms)?;
|
||||
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
|
||||
|
||||
let delete_bitset_opt = if segment.meta().has_deletes() {
|
||||
let delete_data = segment.open_read(SegmentComponent::DELETE)?;
|
||||
let delete_data = segment.open_read(SegmentComponent::Delete)?;
|
||||
let delete_bitset = DeleteBitSet::open(delete_data)?;
|
||||
Some(delete_bitset)
|
||||
} else {
|
||||
@@ -207,7 +189,6 @@ impl SegmentReader {
|
||||
store_file,
|
||||
delete_bitset_opt,
|
||||
positions_composite,
|
||||
positions_idx_composite,
|
||||
schema,
|
||||
})
|
||||
}
|
||||
@@ -263,18 +244,15 @@ impl SegmentReader {
|
||||
let positions_file = self
|
||||
.positions_composite
|
||||
.open_read(field)
|
||||
.expect("Index corrupted. Failed to open field positions in composite file.");
|
||||
|
||||
let positions_idx_file = self
|
||||
.positions_idx_composite
|
||||
.open_read(field)
|
||||
.expect("Index corrupted. Failed to open field positions in composite file.");
|
||||
.ok_or_else(|| {
|
||||
let error_msg = format!("Failed to open field {:?}'s positions in the composite file. Has the schema been modified?", field_entry.name());
|
||||
DataCorruption::comment_only(error_msg)
|
||||
})?;
|
||||
|
||||
let inv_idx_reader = Arc::new(InvertedIndexReader::new(
|
||||
TermDictionary::open(termdict_file)?,
|
||||
postings_file,
|
||||
positions_file,
|
||||
positions_idx_file,
|
||||
record_option,
|
||||
)?);
|
||||
|
||||
@@ -319,7 +297,6 @@ impl SegmentReader {
|
||||
self.termdict_composite.space_usage(),
|
||||
self.postings_composite.space_usage(),
|
||||
self.positions_composite.space_usage(),
|
||||
self.positions_idx_composite.space_usage(),
|
||||
self.fast_fields_readers.space_usage(),
|
||||
self.fieldnorm_readers.space_usage(),
|
||||
self.get_store_reader()?.space_usage(),
|
||||
@@ -343,6 +320,32 @@ mod test {
|
||||
use crate::schema::{Schema, Term, STORED, TEXT};
|
||||
use crate::DocId;
|
||||
|
||||
#[test]
|
||||
fn test_num_alive() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
schema_builder.add_text_field("name", TEXT | STORED);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
let name = schema.get_field("name").unwrap();
|
||||
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(name => "tantivy"));
|
||||
index_writer.add_document(doc!(name => "horse"));
|
||||
index_writer.add_document(doc!(name => "jockey"));
|
||||
index_writer.add_document(doc!(name => "cap"));
|
||||
// we should now have one segment with two docs
|
||||
index_writer.delete_term(Term::from_field_text(name, "horse"));
|
||||
index_writer.delete_term(Term::from_field_text(name, "cap"));
|
||||
|
||||
// ok, now we should have a deleted doc
|
||||
index_writer.commit()?;
|
||||
}
|
||||
let searcher = index.reader()?.searcher();
|
||||
assert_eq!(2, searcher.segment_reader(0).num_docs());
|
||||
assert_eq!(4, searcher.segment_reader(0).max_doc());
|
||||
Ok(())
|
||||
}
|
||||
#[test]
|
||||
fn test_alive_docs_iterator() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
@@ -70,7 +70,7 @@ impl Drop for DirectoryLockGuard {
|
||||
|
||||
enum TryAcquireLockError {
|
||||
FileExists,
|
||||
IOError(io::Error),
|
||||
IoError(io::Error),
|
||||
}
|
||||
|
||||
fn try_acquire_lock(
|
||||
@@ -79,9 +79,9 @@ fn try_acquire_lock(
|
||||
) -> Result<DirectoryLock, TryAcquireLockError> {
|
||||
let mut write = directory.open_write(filepath).map_err(|e| match e {
|
||||
OpenWriteError::FileAlreadyExists(_) => TryAcquireLockError::FileExists,
|
||||
OpenWriteError::IOError { io_error, .. } => TryAcquireLockError::IOError(io_error),
|
||||
OpenWriteError::IoError { io_error, .. } => TryAcquireLockError::IoError(io_error),
|
||||
})?;
|
||||
write.flush().map_err(TryAcquireLockError::IOError)?;
|
||||
write.flush().map_err(TryAcquireLockError::IoError)?;
|
||||
Ok(DirectoryLock::from(Box::new(DirectoryLockGuard {
|
||||
directory: directory.box_clone(),
|
||||
path: filepath.to_owned(),
|
||||
@@ -106,7 +106,7 @@ fn retry_policy(is_blocking: bool) -> RetryPolicy {
|
||||
///
|
||||
/// - The [`MMapDirectory`](struct.MmapDirectory.html), this
|
||||
/// should be your default choice.
|
||||
/// - The [`RAMDirectory`](struct.RAMDirectory.html), which
|
||||
/// - The [`RamDirectory`](struct.RamDirectory.html), which
|
||||
/// should be used mostly for tests.
|
||||
pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
|
||||
/// Opens a file and returns a boxed `FileHandle`.
|
||||
@@ -154,7 +154,7 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
|
||||
/// Flush operation should also be persistent.
|
||||
///
|
||||
/// The user shall not rely on `Drop` triggering `flush`.
|
||||
/// Note that `RAMDirectory` will panic! if `flush`
|
||||
/// Note that `RamDirectory` will panic! if `flush`
|
||||
/// was not called.
|
||||
///
|
||||
/// The file may not previously exist.
|
||||
@@ -192,8 +192,8 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
|
||||
return Err(LockError::LockBusy);
|
||||
}
|
||||
}
|
||||
Err(TryAcquireLockError::IOError(io_error)) => {
|
||||
return Err(LockError::IOError(io_error));
|
||||
Err(TryAcquireLockError::IoError(io_error)) => {
|
||||
return Err(LockError::IoError(io_error));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,9 +12,9 @@ pub enum LockError {
|
||||
/// - In the context of a non-blocking lock, this means the lock was busy at the moment of the call.
|
||||
#[error("Could not acquire lock as it is already held, possibly by a different process.")]
|
||||
LockBusy,
|
||||
/// Trying to acquire a lock failed with an `IOError`
|
||||
/// Trying to acquire a lock failed with an `IoError`
|
||||
#[error("Failed to acquire the lock due to an io:Error.")]
|
||||
IOError(io::Error),
|
||||
IoError(io::Error),
|
||||
}
|
||||
|
||||
/// Error that may occur when opening a directory
|
||||
@@ -30,7 +30,7 @@ pub enum OpenDirectoryError {
|
||||
#[error("Failed to create a temporary directory: '{0}'.")]
|
||||
FailedToCreateTempDir(io::Error),
|
||||
/// IoError
|
||||
#[error("IOError '{io_error:?}' while create directory in: '{directory_path:?}'.")]
|
||||
#[error("IoError '{io_error:?}' while create directory in: '{directory_path:?}'.")]
|
||||
IoError {
|
||||
/// underlying io Error.
|
||||
io_error: io::Error,
|
||||
@@ -48,8 +48,8 @@ pub enum OpenWriteError {
|
||||
FileAlreadyExists(PathBuf),
|
||||
/// Any kind of IO error that happens when
|
||||
/// writing in the underlying IO device.
|
||||
#[error("IOError '{io_error:?}' while opening file for write: '{filepath}'.")]
|
||||
IOError {
|
||||
#[error("IoError '{io_error:?}' while opening file for write: '{filepath}'.")]
|
||||
IoError {
|
||||
/// The underlying `io::Error`.
|
||||
io_error: io::Error,
|
||||
/// File path of the file that tantivy failed to open for write.
|
||||
@@ -60,7 +60,7 @@ pub enum OpenWriteError {
|
||||
impl OpenWriteError {
|
||||
/// Wraps an io error.
|
||||
pub fn wrap_io_error(io_error: io::Error, filepath: PathBuf) -> Self {
|
||||
Self::IOError { io_error, filepath }
|
||||
Self::IoError { io_error, filepath }
|
||||
}
|
||||
}
|
||||
/// Type of index incompatibility between the library and the index found on disk
|
||||
@@ -130,9 +130,9 @@ pub enum OpenReadError {
|
||||
FileDoesNotExist(PathBuf),
|
||||
/// Any kind of io::Error.
|
||||
#[error(
|
||||
"IOError: '{io_error:?}' happened while opening the following file for Read: {filepath}."
|
||||
"IoError: '{io_error:?}' happened while opening the following file for Read: {filepath}."
|
||||
)]
|
||||
IOError {
|
||||
IoError {
|
||||
/// The underlying `io::Error`.
|
||||
io_error: io::Error,
|
||||
/// File path of the file that tantivy failed to open for read.
|
||||
@@ -146,7 +146,7 @@ pub enum OpenReadError {
|
||||
impl OpenReadError {
|
||||
/// Wraps an io error.
|
||||
pub fn wrap_io_error(io_error: io::Error, filepath: PathBuf) -> Self {
|
||||
Self::IOError { io_error, filepath }
|
||||
Self::IoError { io_error, filepath }
|
||||
}
|
||||
}
|
||||
/// Error that may occur when trying to delete a file
|
||||
@@ -158,7 +158,7 @@ pub enum DeleteError {
|
||||
/// Any kind of IO error that happens when
|
||||
/// interacting with the underlying IO device.
|
||||
#[error("The following IO error happened while deleting file '{filepath}': '{io_error:?}'.")]
|
||||
IOError {
|
||||
IoError {
|
||||
/// The underlying `io::Error`.
|
||||
io_error: io::Error,
|
||||
/// File path of the file that tantivy failed to delete.
|
||||
|
||||
@@ -147,6 +147,13 @@ impl FileSlice {
|
||||
self.slice(from_offset..self.len())
|
||||
}
|
||||
|
||||
/// Returns a slice from the end.
|
||||
///
|
||||
/// Equivalent to `.slice(self.len() - from_offset, self.len())`
|
||||
pub fn slice_from_end(&self, from_offset: usize) -> FileSlice {
|
||||
self.slice(self.len() - from_offset..self.len())
|
||||
}
|
||||
|
||||
/// Like `.slice(...)` but enforcing only the `to`
|
||||
/// boundary.
|
||||
///
|
||||
@@ -204,7 +211,7 @@ mod tests {
|
||||
assert_eq!(right.read_bytes()?.as_slice(), b"");
|
||||
}
|
||||
{
|
||||
let (left, right) = file_slice.clone().split_from_end(2);
|
||||
let (left, right) = file_slice.split_from_end(2);
|
||||
assert_eq!(left.read_bytes()?.as_slice(), b"abcd");
|
||||
assert_eq!(right.read_bytes()?.as_slice(), b"ef");
|
||||
}
|
||||
|
||||
@@ -1,69 +1,45 @@
|
||||
use crate::common::{BinarySerializable, CountingWriter, FixedSize, HasLen, VInt};
|
||||
use crate::directory::error::Incompatibility;
|
||||
use crate::directory::FileSlice;
|
||||
use crate::directory::{AntiCallToken, TerminatingWrite};
|
||||
use crate::Version;
|
||||
use crate::{
|
||||
common::{BinarySerializable, CountingWriter, DeserializeFrom, FixedSize, HasLen},
|
||||
directory::{AntiCallToken, TerminatingWrite},
|
||||
Version, INDEX_FORMAT_VERSION,
|
||||
};
|
||||
use crc32fast::Hasher;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
|
||||
const FOOTER_MAX_LEN: usize = 10_000;
|
||||
const FOOTER_MAX_LEN: u32 = 50_000;
|
||||
|
||||
/// The magic byte of the footer to identify corruption
|
||||
/// or an old version of the footer.
|
||||
const FOOTER_MAGIC_NUMBER: u32 = 1337;
|
||||
|
||||
type CrcHashU32 = u32;
|
||||
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
/// A Footer is appended to every file
|
||||
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
|
||||
pub struct Footer {
|
||||
pub version: Version,
|
||||
pub meta: String,
|
||||
pub versioned_footer: VersionedFooter,
|
||||
}
|
||||
|
||||
/// Serialises the footer to a byte-array
|
||||
/// - versioned_footer_len : 4 bytes
|
||||
///- versioned_footer: variable bytes
|
||||
/// - meta_len: 4 bytes
|
||||
/// - meta: variable bytes
|
||||
/// - version_len: 4 bytes
|
||||
/// - version json: variable bytes
|
||||
impl BinarySerializable for Footer {
|
||||
fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
BinarySerializable::serialize(&self.versioned_footer, writer)?;
|
||||
BinarySerializable::serialize(&self.meta, writer)?;
|
||||
let version_string =
|
||||
serde_json::to_string(&self.version).map_err(|_err| io::ErrorKind::InvalidInput)?;
|
||||
BinarySerializable::serialize(&version_string, writer)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let versioned_footer = VersionedFooter::deserialize(reader)?;
|
||||
let meta = String::deserialize(reader)?;
|
||||
let version_json = String::deserialize(reader)?;
|
||||
let version = serde_json::from_str(&version_json)?;
|
||||
Ok(Footer {
|
||||
version,
|
||||
meta,
|
||||
versioned_footer,
|
||||
})
|
||||
}
|
||||
pub crc: CrcHashU32,
|
||||
}
|
||||
|
||||
impl Footer {
|
||||
pub fn new(versioned_footer: VersionedFooter) -> Self {
|
||||
pub fn new(crc: CrcHashU32) -> Self {
|
||||
let version = crate::VERSION.clone();
|
||||
let meta = version.to_string();
|
||||
Footer {
|
||||
version,
|
||||
meta,
|
||||
versioned_footer,
|
||||
}
|
||||
Footer { version, crc }
|
||||
}
|
||||
|
||||
pub fn crc(&self) -> CrcHashU32 {
|
||||
self.crc
|
||||
}
|
||||
pub fn append_footer<W: io::Write>(&self, mut write: &mut W) -> io::Result<()> {
|
||||
let mut counting_write = CountingWriter::wrap(&mut write);
|
||||
self.serialize(&mut counting_write)?;
|
||||
let written_len = counting_write.written_bytes();
|
||||
(written_len as u32).serialize(write)?;
|
||||
counting_write.write_all(serde_json::to_string(&self)?.as_ref())?;
|
||||
let footer_payload_len = counting_write.written_bytes();
|
||||
BinarySerializable::serialize(&(footer_payload_len as u32), write)?;
|
||||
BinarySerializable::serialize(&(FOOTER_MAGIC_NUMBER as u32), write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -77,12 +53,47 @@ impl Footer {
|
||||
),
|
||||
));
|
||||
}
|
||||
let (body_footer, footer_len_file) = file.split_from_end(u32::SIZE_IN_BYTES);
|
||||
let mut footer_len_bytes = footer_len_file.read_bytes()?;
|
||||
let footer_len = u32::deserialize(&mut footer_len_bytes)? as usize;
|
||||
let (body, footer) = body_footer.split_from_end(footer_len);
|
||||
let mut footer_bytes = footer.read_bytes()?;
|
||||
let footer = Footer::deserialize(&mut footer_bytes)?;
|
||||
|
||||
let footer_metadata_len = <(u32, u32)>::SIZE_IN_BYTES;
|
||||
let (footer_len, footer_magic_byte): (u32, u32) = file
|
||||
.slice_from_end(footer_metadata_len)
|
||||
.read_bytes()?
|
||||
.as_ref()
|
||||
.deserialize()?;
|
||||
|
||||
if footer_magic_byte != FOOTER_MAGIC_NUMBER {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
"Footer magic byte mismatch. File corrupted or index was created using old an tantivy version which is not supported anymore. Please use tantivy 0.15 or above to recreate the index.",
|
||||
));
|
||||
}
|
||||
|
||||
if footer_len > FOOTER_MAX_LEN {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
format!(
|
||||
"Footer seems invalid as it suggests a footer len of {}. File is corrupted, \
|
||||
or the index was created with a different & old version of tantivy.",
|
||||
footer_len
|
||||
),
|
||||
));
|
||||
}
|
||||
let total_footer_size = footer_len as usize + footer_metadata_len;
|
||||
if file.len() < total_footer_size {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::UnexpectedEof,
|
||||
format!(
|
||||
"File corrupted. The file is smaller than it's footer bytes (len={}).",
|
||||
total_footer_size
|
||||
),
|
||||
));
|
||||
}
|
||||
|
||||
let footer: Footer = serde_json::from_slice(&file.read_bytes_slice(
|
||||
file.len() - total_footer_size..file.len() - footer_metadata_len as usize,
|
||||
)?)?;
|
||||
|
||||
let body = file.slice_to(file.len() - total_footer_size);
|
||||
Ok((footer, body))
|
||||
}
|
||||
|
||||
@@ -90,151 +101,16 @@ impl Footer {
|
||||
/// Has to be called after `extract_footer` to make sure it's not accessing uninitialised memory
|
||||
pub fn is_compatible(&self) -> Result<(), Incompatibility> {
|
||||
let library_version = crate::version();
|
||||
match &self.versioned_footer {
|
||||
VersionedFooter::V1 {
|
||||
crc32: _crc,
|
||||
store_compression,
|
||||
} => {
|
||||
if &library_version.store_compression != store_compression {
|
||||
return Err(Incompatibility::CompressionMismatch {
|
||||
library_compression_format: library_version.store_compression.to_string(),
|
||||
index_compression_format: store_compression.to_string(),
|
||||
});
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
VersionedFooter::V2 {
|
||||
crc32: _crc,
|
||||
store_compression,
|
||||
} => {
|
||||
if &library_version.store_compression != store_compression {
|
||||
return Err(Incompatibility::CompressionMismatch {
|
||||
library_compression_format: library_version.store_compression.to_string(),
|
||||
index_compression_format: store_compression.to_string(),
|
||||
});
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
VersionedFooter::V3 {
|
||||
crc32: _crc,
|
||||
store_compression,
|
||||
} => {
|
||||
if &library_version.store_compression != store_compression {
|
||||
return Err(Incompatibility::CompressionMismatch {
|
||||
library_compression_format: library_version.store_compression.to_string(),
|
||||
index_compression_format: store_compression.to_string(),
|
||||
});
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
VersionedFooter::UnknownVersion => Err(Incompatibility::IndexMismatch {
|
||||
if self.version.index_format_version < 4
|
||||
|| self.version.index_format_version > INDEX_FORMAT_VERSION
|
||||
{
|
||||
return Err(Incompatibility::IndexMismatch {
|
||||
library_version: library_version.clone(),
|
||||
index_version: self.version.clone(),
|
||||
}),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Footer that includes a crc32 hash that enables us to checksum files in the index
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
pub enum VersionedFooter {
|
||||
UnknownVersion,
|
||||
V1 {
|
||||
crc32: CrcHashU32,
|
||||
store_compression: String,
|
||||
},
|
||||
// Introduction of the Block WAND information.
|
||||
V2 {
|
||||
crc32: CrcHashU32,
|
||||
store_compression: String,
|
||||
},
|
||||
// Block wand max termfred on 1 byte
|
||||
V3 {
|
||||
crc32: CrcHashU32,
|
||||
store_compression: String,
|
||||
},
|
||||
}
|
||||
|
||||
impl BinarySerializable for VersionedFooter {
|
||||
fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
let mut buf = Vec::new();
|
||||
match self {
|
||||
VersionedFooter::V3 {
|
||||
crc32,
|
||||
store_compression: compression,
|
||||
} => {
|
||||
// Serializes a valid `VersionedFooter` or panics if the version is unknown
|
||||
// [ version | crc_hash | compression_mode ]
|
||||
// [ 0..4 | 4..8 | variable ]
|
||||
BinarySerializable::serialize(&3u32, &mut buf)?;
|
||||
BinarySerializable::serialize(crc32, &mut buf)?;
|
||||
BinarySerializable::serialize(compression, &mut buf)?;
|
||||
}
|
||||
VersionedFooter::V2 { .. }
|
||||
| VersionedFooter::V1 { .. }
|
||||
| VersionedFooter::UnknownVersion => {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"Cannot serialize an unknown versioned footer ",
|
||||
));
|
||||
}
|
||||
}
|
||||
BinarySerializable::serialize(&VInt(buf.len() as u64), writer)?;
|
||||
assert!(buf.len() <= FOOTER_MAX_LEN);
|
||||
writer.write_all(&buf[..])?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let len = VInt::deserialize(reader)?.0 as usize;
|
||||
if len > FOOTER_MAX_LEN {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
format!(
|
||||
"Footer seems invalid as it suggests a footer len of {}. File is corrupted, \
|
||||
or the index was created with a different & old version of tantivy.",
|
||||
len
|
||||
),
|
||||
));
|
||||
}
|
||||
let mut buf = vec![0u8; len];
|
||||
reader.read_exact(&mut buf[..])?;
|
||||
let mut cursor = &buf[..];
|
||||
let version = u32::deserialize(&mut cursor)?;
|
||||
if version > 3 {
|
||||
return Ok(VersionedFooter::UnknownVersion);
|
||||
}
|
||||
let crc32 = u32::deserialize(&mut cursor)?;
|
||||
let store_compression = String::deserialize(&mut cursor)?;
|
||||
Ok(if version == 1 {
|
||||
VersionedFooter::V1 {
|
||||
crc32,
|
||||
store_compression,
|
||||
}
|
||||
} else if version == 2 {
|
||||
VersionedFooter::V2 {
|
||||
crc32,
|
||||
store_compression,
|
||||
}
|
||||
} else {
|
||||
assert_eq!(version, 3);
|
||||
VersionedFooter::V3 {
|
||||
crc32,
|
||||
store_compression,
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl VersionedFooter {
|
||||
pub fn crc(&self) -> Option<CrcHashU32> {
|
||||
match self {
|
||||
VersionedFooter::V3 { crc32, .. } => Some(*crc32),
|
||||
VersionedFooter::V2 { crc32, .. } => Some(*crc32),
|
||||
VersionedFooter::V1 { crc32, .. } => Some(*crc32),
|
||||
VersionedFooter::UnknownVersion { .. } => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct FooterProxy<W: TerminatingWrite> {
|
||||
@@ -268,10 +144,7 @@ impl<W: TerminatingWrite> Write for FooterProxy<W> {
|
||||
impl<W: TerminatingWrite> TerminatingWrite for FooterProxy<W> {
|
||||
fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()> {
|
||||
let crc32 = self.hasher.take().unwrap().finalize();
|
||||
let footer = Footer::new(VersionedFooter::V3 {
|
||||
crc32,
|
||||
store_compression: crate::store::COMPRESSION.to_string(),
|
||||
});
|
||||
let footer = Footer::new(crc32);
|
||||
let mut writer = self.writer.take().unwrap();
|
||||
footer.append_footer(&mut writer)?;
|
||||
writer.terminate()
|
||||
@@ -281,138 +154,75 @@ impl<W: TerminatingWrite> TerminatingWrite for FooterProxy<W> {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::CrcHashU32;
|
||||
use super::FooterProxy;
|
||||
use crate::common::{BinarySerializable, VInt};
|
||||
use crate::directory::footer::{Footer, VersionedFooter};
|
||||
use crate::directory::TerminatingWrite;
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use regex::Regex;
|
||||
use crate::directory::footer::Footer;
|
||||
use crate::directory::OwnedBytes;
|
||||
use crate::{
|
||||
common::BinarySerializable,
|
||||
directory::{footer::FOOTER_MAGIC_NUMBER, FileSlice},
|
||||
};
|
||||
use std::io;
|
||||
|
||||
#[test]
|
||||
fn test_versioned_footer() {
|
||||
let mut vec = Vec::new();
|
||||
let footer_proxy = FooterProxy::new(&mut vec);
|
||||
assert!(footer_proxy.terminate().is_ok());
|
||||
if crate::store::COMPRESSION == "lz4" {
|
||||
assert_eq!(vec.len(), 158);
|
||||
} else {
|
||||
assert_eq!(vec.len(), 167);
|
||||
}
|
||||
let footer = Footer::deserialize(&mut &vec[..]).unwrap();
|
||||
assert!(matches!(
|
||||
footer.versioned_footer,
|
||||
VersionedFooter::V3 { store_compression, .. }
|
||||
if store_compression == crate::store::COMPRESSION
|
||||
));
|
||||
assert_eq!(&footer.version, crate::version());
|
||||
fn test_deserialize_footer() {
|
||||
let mut buf: Vec<u8> = vec![];
|
||||
let footer = Footer::new(123);
|
||||
footer.append_footer(&mut buf).unwrap();
|
||||
let owned_bytes = OwnedBytes::new(buf);
|
||||
let fileslice = FileSlice::new(Box::new(owned_bytes));
|
||||
let (footer_deser, _body) = Footer::extract_footer(fileslice).unwrap();
|
||||
assert_eq!(footer_deser.crc(), footer.crc());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_serialize_deserialize_footer() {
|
||||
let mut buffer = Vec::new();
|
||||
let crc32 = 123456u32;
|
||||
let footer: Footer = Footer::new(VersionedFooter::V3 {
|
||||
crc32,
|
||||
store_compression: "lz4".to_string(),
|
||||
});
|
||||
footer.serialize(&mut buffer).unwrap();
|
||||
let footer_deser = Footer::deserialize(&mut &buffer[..]).unwrap();
|
||||
assert_eq!(footer_deser, footer);
|
||||
fn test_deserialize_footer_missing_magic_byte() {
|
||||
let mut buf: Vec<u8> = vec![];
|
||||
BinarySerializable::serialize(&0_u32, &mut buf).unwrap();
|
||||
let wrong_magic_byte: u32 = 5555;
|
||||
BinarySerializable::serialize(&wrong_magic_byte, &mut buf).unwrap();
|
||||
|
||||
let owned_bytes = OwnedBytes::new(buf);
|
||||
|
||||
let fileslice = FileSlice::new(Box::new(owned_bytes));
|
||||
let err = Footer::extract_footer(fileslice).unwrap_err();
|
||||
assert_eq!(
|
||||
err.to_string(),
|
||||
"Footer magic byte mismatch. File corrupted or index was created using old an tantivy version which \
|
||||
is not supported anymore. Please use tantivy 0.15 or above to recreate the index."
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn footer_length() {
|
||||
let crc32 = 1111111u32;
|
||||
let versioned_footer = VersionedFooter::V3 {
|
||||
crc32,
|
||||
store_compression: "lz4".to_string(),
|
||||
};
|
||||
let mut buf = Vec::new();
|
||||
versioned_footer.serialize(&mut buf).unwrap();
|
||||
assert_eq!(buf.len(), 13);
|
||||
let footer = Footer::new(versioned_footer);
|
||||
let regex_ptn = Regex::new(
|
||||
"tantivy v[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.{0,10}, index_format v[0-9]{1,5}",
|
||||
)
|
||||
.unwrap();
|
||||
assert!(regex_ptn.is_match(&footer.meta));
|
||||
}
|
||||
fn test_deserialize_footer_wrong_filesize() {
|
||||
let mut buf: Vec<u8> = vec![];
|
||||
BinarySerializable::serialize(&100_u32, &mut buf).unwrap();
|
||||
BinarySerializable::serialize(&FOOTER_MAGIC_NUMBER, &mut buf).unwrap();
|
||||
|
||||
#[test]
|
||||
fn versioned_footer_from_bytes() {
|
||||
let v_footer_bytes = vec![
|
||||
// versionned footer length
|
||||
12 | 128,
|
||||
// index format version
|
||||
3,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
// crc 32
|
||||
12,
|
||||
35,
|
||||
89,
|
||||
18,
|
||||
// compression format
|
||||
3 | 128,
|
||||
b'l',
|
||||
b'z',
|
||||
b'4',
|
||||
];
|
||||
let mut cursor = &v_footer_bytes[..];
|
||||
let versioned_footer = VersionedFooter::deserialize(&mut cursor).unwrap();
|
||||
assert!(cursor.is_empty());
|
||||
let expected_crc: u32 = LittleEndian::read_u32(&v_footer_bytes[5..9]) as CrcHashU32;
|
||||
let expected_versioned_footer: VersionedFooter = VersionedFooter::V3 {
|
||||
crc32: expected_crc,
|
||||
store_compression: "lz4".to_string(),
|
||||
};
|
||||
assert_eq!(versioned_footer, expected_versioned_footer);
|
||||
let mut buffer = Vec::new();
|
||||
assert!(versioned_footer.serialize(&mut buffer).is_ok());
|
||||
assert_eq!(&v_footer_bytes[..], &buffer[..]);
|
||||
}
|
||||
let owned_bytes = OwnedBytes::new(buf);
|
||||
|
||||
#[test]
|
||||
fn versioned_footer_panic() {
|
||||
let v_footer_bytes = vec![6u8 | 128u8, 3u8, 0u8, 0u8, 1u8, 0u8, 0u8];
|
||||
let mut b = &v_footer_bytes[..];
|
||||
let versioned_footer = VersionedFooter::deserialize(&mut b).unwrap();
|
||||
assert!(b.is_empty());
|
||||
let expected_versioned_footer = VersionedFooter::UnknownVersion;
|
||||
assert_eq!(versioned_footer, expected_versioned_footer);
|
||||
let mut buf = Vec::new();
|
||||
assert!(versioned_footer.serialize(&mut buf).is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(not(feature = "lz4"))]
|
||||
fn compression_mismatch() {
|
||||
let crc32 = 1111111u32;
|
||||
let versioned_footer = VersionedFooter::V1 {
|
||||
crc32,
|
||||
store_compression: "lz4".to_string(),
|
||||
};
|
||||
let footer = Footer::new(versioned_footer);
|
||||
let res = footer.is_compatible();
|
||||
assert!(res.is_err());
|
||||
let fileslice = FileSlice::new(Box::new(owned_bytes));
|
||||
let err = Footer::extract_footer(fileslice).unwrap_err();
|
||||
assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof);
|
||||
assert_eq!(
|
||||
err.to_string(),
|
||||
"File corrupted. The file is smaller than it\'s footer bytes (len=108)."
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_deserialize_too_large_footer() {
|
||||
let mut buf = vec![];
|
||||
assert!(FooterProxy::new(&mut buf).terminate().is_ok());
|
||||
let mut long_len_buf = [0u8; 10];
|
||||
let num_bytes = VInt(super::FOOTER_MAX_LEN as u64 + 1u64).serialize_into(&mut long_len_buf);
|
||||
buf[0..num_bytes].copy_from_slice(&long_len_buf[..num_bytes]);
|
||||
let err = Footer::deserialize(&mut &buf[..]).unwrap_err();
|
||||
let mut buf: Vec<u8> = vec![];
|
||||
|
||||
let footer_length = super::FOOTER_MAX_LEN + 1;
|
||||
BinarySerializable::serialize(&footer_length, &mut buf).unwrap();
|
||||
BinarySerializable::serialize(&FOOTER_MAGIC_NUMBER, &mut buf).unwrap();
|
||||
|
||||
let owned_bytes = OwnedBytes::new(buf);
|
||||
|
||||
let fileslice = FileSlice::new(Box::new(owned_bytes));
|
||||
let err = Footer::extract_footer(fileslice).unwrap_err();
|
||||
assert_eq!(err.kind(), io::ErrorKind::InvalidData);
|
||||
assert_eq!(
|
||||
err.to_string(),
|
||||
"Footer seems invalid as it suggests a footer len of 10001. File is corrupted, \
|
||||
or the index was created with a different & old version of tantivy."
|
||||
"Footer seems invalid as it suggests a footer len of 50001. File is corrupted, \
|
||||
or the index was created with a different & old version of tantivy."
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -86,7 +86,7 @@ impl ManagedDirectory {
|
||||
directory: Box::new(directory),
|
||||
meta_informations: Arc::default(),
|
||||
}),
|
||||
io_err @ Err(OpenReadError::IOError { .. }) => Err(io_err.err().unwrap().into()),
|
||||
io_err @ Err(OpenReadError::IoError { .. }) => Err(io_err.err().unwrap().into()),
|
||||
Err(OpenReadError::IncompatibleIndex(incompatibility)) => {
|
||||
// For the moment, this should never happen `meta.json`
|
||||
// do not have any footer and cannot detect incompatibility.
|
||||
@@ -168,7 +168,7 @@ impl ManagedDirectory {
|
||||
DeleteError::FileDoesNotExist(_) => {
|
||||
deleted_files.push(file_to_delete.clone());
|
||||
}
|
||||
DeleteError::IOError { .. } => {
|
||||
DeleteError::IoError { .. } => {
|
||||
failed_to_delete_files.push(file_to_delete.clone());
|
||||
if !cfg!(target_os = "windows") {
|
||||
// On windows, delete is expected to fail if the file
|
||||
@@ -232,24 +232,20 @@ impl ManagedDirectory {
|
||||
pub fn validate_checksum(&self, path: &Path) -> result::Result<bool, OpenReadError> {
|
||||
let reader = self.directory.open_read(path)?;
|
||||
let (footer, data) =
|
||||
Footer::extract_footer(reader).map_err(|io_error| OpenReadError::IOError {
|
||||
Footer::extract_footer(reader).map_err(|io_error| OpenReadError::IoError {
|
||||
io_error,
|
||||
filepath: path.to_path_buf(),
|
||||
})?;
|
||||
let bytes = data
|
||||
.read_bytes()
|
||||
.map_err(|io_error| OpenReadError::IOError {
|
||||
.map_err(|io_error| OpenReadError::IoError {
|
||||
filepath: path.to_path_buf(),
|
||||
io_error,
|
||||
})?;
|
||||
let mut hasher = Hasher::new();
|
||||
hasher.update(bytes.as_slice());
|
||||
let crc = hasher.finalize();
|
||||
Ok(footer
|
||||
.versioned_footer
|
||||
.crc()
|
||||
.map(|v| v == crc)
|
||||
.unwrap_or(false))
|
||||
Ok(footer.crc() == crc)
|
||||
}
|
||||
|
||||
/// List files for which checksum does not match content
|
||||
@@ -406,10 +402,8 @@ mod tests_mmap_specific {
|
||||
// The file should still be in the list of managed file and
|
||||
// eventually be deleted once mmap is released.
|
||||
assert!(managed_directory.garbage_collect(|| living_files).is_ok());
|
||||
assert!(!managed_directory.exists(test_path1).unwrap());
|
||||
} else {
|
||||
assert!(!managed_directory.exists(test_path1).unwrap());
|
||||
}
|
||||
assert!(!managed_directory.exists(test_path1).unwrap());
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -434,7 +428,7 @@ mod tests_mmap_specific {
|
||||
assert_eq!(read_file.as_slice(), &[3u8, 4u8, 5u8]);
|
||||
assert!(managed_directory.list_damaged().unwrap().is_empty());
|
||||
|
||||
let mut corrupted_path = tempdir_path.clone();
|
||||
let mut corrupted_path = tempdir_path;
|
||||
corrupted_path.push(test_path2);
|
||||
let mut file = OpenOptions::new().write(true).open(&corrupted_path)?;
|
||||
file.write_all(&[255u8])?;
|
||||
|
||||
@@ -11,7 +11,7 @@ use crate::directory::{AntiCallToken, FileHandle, OwnedBytes};
|
||||
use crate::directory::{ArcBytes, WeakArcBytes};
|
||||
use crate::directory::{TerminatingWrite, WritePtr};
|
||||
use fs2::FileExt;
|
||||
use memmap::Mmap;
|
||||
use memmap2::Mmap;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use stable_deref_trait::StableDeref;
|
||||
use std::convert::From;
|
||||
@@ -53,7 +53,7 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> {
|
||||
return Ok(None);
|
||||
}
|
||||
unsafe {
|
||||
memmap::Mmap::map(&file)
|
||||
memmap2::Mmap::map(&file)
|
||||
.map(Some)
|
||||
.map_err(|io_err| OpenReadError::wrap_io_error(io_err, full_path.to_path_buf()))
|
||||
}
|
||||
@@ -185,7 +185,7 @@ impl MmapDirectory {
|
||||
/// Creates a new MmapDirectory in a temporary directory.
|
||||
///
|
||||
/// This is mostly useful to test the MmapDirectory itself.
|
||||
/// For your unit tests, prefer the RAMDirectory.
|
||||
/// For your unit tests, prefer the RamDirectory.
|
||||
pub fn create_from_tempdir() -> Result<MmapDirectory, OpenDirectoryError> {
|
||||
let tempdir = TempDir::new().map_err(OpenDirectoryError::FailedToCreateTempDir)?;
|
||||
Ok(MmapDirectory::new(
|
||||
@@ -374,7 +374,7 @@ impl Directory for MmapDirectory {
|
||||
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
|
||||
let full_path = self.resolve_path(path);
|
||||
match fs::remove_file(&full_path) {
|
||||
Ok(_) => self.sync_directory().map_err(|e| DeleteError::IOError {
|
||||
Ok(_) => self.sync_directory().map_err(|e| DeleteError::IoError {
|
||||
io_error: e,
|
||||
filepath: path.to_path_buf(),
|
||||
}),
|
||||
@@ -382,7 +382,7 @@ impl Directory for MmapDirectory {
|
||||
if e.kind() == io::ErrorKind::NotFound {
|
||||
Err(DeleteError::FileDoesNotExist(path.to_owned()))
|
||||
} else {
|
||||
Err(DeleteError::IOError {
|
||||
Err(DeleteError::IoError {
|
||||
io_error: e,
|
||||
filepath: path.to_path_buf(),
|
||||
})
|
||||
@@ -460,9 +460,9 @@ impl Directory for MmapDirectory {
|
||||
.write(true)
|
||||
.create(true) //< if the file does not exist yet, create it.
|
||||
.open(&full_path)
|
||||
.map_err(LockError::IOError)?;
|
||||
.map_err(LockError::IoError)?;
|
||||
if lock.is_blocking {
|
||||
file.lock_exclusive().map_err(LockError::IOError)?;
|
||||
file.lock_exclusive().map_err(LockError::IoError)?;
|
||||
} else {
|
||||
file.try_lock_exclusive().map_err(|_| LockError::LockBusy)?
|
||||
}
|
||||
@@ -485,10 +485,13 @@ mod tests {
|
||||
// The following tests are specific to the MmapDirectory
|
||||
|
||||
use super::*;
|
||||
use crate::schema::{Schema, SchemaBuilder, TEXT};
|
||||
use crate::Index;
|
||||
use crate::ReloadPolicy;
|
||||
use crate::{common::HasLen, indexer::LogMergePolicy};
|
||||
use crate::{
|
||||
schema::{Schema, SchemaBuilder, TEXT},
|
||||
IndexSettings,
|
||||
};
|
||||
|
||||
#[test]
|
||||
fn test_open_non_existent_path() {
|
||||
@@ -585,11 +588,12 @@ mod tests {
|
||||
let schema = schema_builder.build();
|
||||
|
||||
{
|
||||
let index = Index::create(mmap_directory.clone(), schema).unwrap();
|
||||
let index =
|
||||
Index::create(mmap_directory.clone(), schema, IndexSettings::default()).unwrap();
|
||||
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
let mut log_merge_policy = LogMergePolicy::default();
|
||||
log_merge_policy.set_min_merge_size(3);
|
||||
log_merge_policy.set_min_num_segments(3);
|
||||
index_writer.set_merge_policy(Box::new(log_merge_policy));
|
||||
for _num_commits in 0..10 {
|
||||
for _ in 0..10 {
|
||||
@@ -614,8 +618,10 @@ mod tests {
|
||||
reader.reload().unwrap();
|
||||
let num_segments = reader.searcher().segment_readers().len();
|
||||
assert!(num_segments <= 4);
|
||||
let num_components_except_deletes_and_tempstore =
|
||||
crate::core::SegmentComponent::iterator().len() - 2;
|
||||
assert_eq!(
|
||||
num_segments * 7,
|
||||
num_segments * num_components_except_deletes_and_tempstore,
|
||||
mmap_directory.get_cache_info().mmapped.len()
|
||||
);
|
||||
}
|
||||
|
||||
@@ -26,9 +26,11 @@ pub use self::directory_lock::{Lock, INDEX_WRITER_LOCK, META_LOCK};
|
||||
pub(crate) use self::file_slice::{ArcBytes, WeakArcBytes};
|
||||
pub use self::file_slice::{FileHandle, FileSlice};
|
||||
pub use self::owned_bytes::OwnedBytes;
|
||||
pub use self::ram_directory::RAMDirectory;
|
||||
pub use self::ram_directory::RamDirectory;
|
||||
pub use self::watch_event_router::{WatchCallback, WatchCallbackList, WatchHandle};
|
||||
use std::io::{self, BufWriter, Write};
|
||||
pub use common::AntiCallToken;
|
||||
pub use common::TerminatingWrite;
|
||||
use std::io::BufWriter;
|
||||
use std::path::PathBuf;
|
||||
|
||||
/// Outcome of the Garbage collection
|
||||
@@ -50,47 +52,6 @@ pub use self::mmap_directory::MmapDirectory;
|
||||
|
||||
pub use self::managed_directory::ManagedDirectory;
|
||||
|
||||
/// Struct used to prevent from calling [`terminate_ref`](trait.TerminatingWrite#method.terminate_ref) directly
|
||||
///
|
||||
/// The point is that while the type is public, it cannot be built by anyone
|
||||
/// outside of this module.
|
||||
pub struct AntiCallToken(());
|
||||
|
||||
/// Trait used to indicate when no more write need to be done on a writer
|
||||
pub trait TerminatingWrite: Write {
|
||||
/// Indicate that the writer will no longer be used. Internally call terminate_ref.
|
||||
fn terminate(mut self) -> io::Result<()>
|
||||
where
|
||||
Self: Sized,
|
||||
{
|
||||
self.terminate_ref(AntiCallToken(()))
|
||||
}
|
||||
|
||||
/// You should implement this function to define custom behavior.
|
||||
/// This function should flush any buffer it may hold.
|
||||
fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()>;
|
||||
}
|
||||
|
||||
impl<W: TerminatingWrite + ?Sized> TerminatingWrite for Box<W> {
|
||||
fn terminate_ref(&mut self, token: AntiCallToken) -> io::Result<()> {
|
||||
self.as_mut().terminate_ref(token)
|
||||
}
|
||||
}
|
||||
|
||||
impl<W: TerminatingWrite> TerminatingWrite for BufWriter<W> {
|
||||
fn terminate_ref(&mut self, a: AntiCallToken) -> io::Result<()> {
|
||||
self.flush()?;
|
||||
self.get_mut().terminate_ref(a)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl<'a> TerminatingWrite for &'a mut Vec<u8> {
|
||||
fn terminate_ref(&mut self, _a: AntiCallToken) -> io::Result<()> {
|
||||
self.flush()
|
||||
}
|
||||
}
|
||||
|
||||
/// Write object for Directory.
|
||||
///
|
||||
/// `WritePtr` are required to implement both Write
|
||||
|
||||
@@ -1,290 +1,11 @@
|
||||
use crate::directory::FileHandle;
|
||||
use stable_deref_trait::StableDeref;
|
||||
use std::convert::TryInto;
|
||||
use std::mem;
|
||||
use std::ops::{Deref, Range};
|
||||
use std::sync::Arc;
|
||||
use std::{fmt, io};
|
||||
use std::io;
|
||||
use std::ops::Range;
|
||||
|
||||
/// An OwnedBytes simply wraps an object that owns a slice of data and exposes
|
||||
/// this data as a static slice.
|
||||
///
|
||||
/// The backing object is required to be `StableDeref`.
|
||||
#[derive(Clone)]
|
||||
pub struct OwnedBytes {
|
||||
data: &'static [u8],
|
||||
box_stable_deref: Arc<dyn Deref<Target = [u8]> + Sync + Send>,
|
||||
}
|
||||
pub use ownedbytes::OwnedBytes;
|
||||
|
||||
impl FileHandle for OwnedBytes {
|
||||
fn read_bytes(&self, range: Range<usize>) -> io::Result<OwnedBytes> {
|
||||
Ok(self.slice(range))
|
||||
}
|
||||
}
|
||||
|
||||
impl OwnedBytes {
|
||||
/// Creates an empty `OwnedBytes`.
|
||||
pub fn empty() -> OwnedBytes {
|
||||
OwnedBytes::new(&[][..])
|
||||
}
|
||||
|
||||
/// Creates an `OwnedBytes` intance given a `StableDeref` object.
|
||||
pub fn new<T: StableDeref + Deref<Target = [u8]> + 'static + Send + Sync>(
|
||||
data_holder: T,
|
||||
) -> OwnedBytes {
|
||||
let box_stable_deref = Arc::new(data_holder);
|
||||
let bytes: &[u8] = box_stable_deref.as_ref();
|
||||
let data = unsafe { mem::transmute::<_, &'static [u8]>(bytes.deref()) };
|
||||
OwnedBytes {
|
||||
box_stable_deref,
|
||||
data,
|
||||
}
|
||||
}
|
||||
|
||||
/// creates a fileslice that is just a view over a slice of the data.
|
||||
pub fn slice(&self, range: Range<usize>) -> Self {
|
||||
OwnedBytes {
|
||||
data: &self.data[range],
|
||||
box_stable_deref: self.box_stable_deref.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the underlying slice of data.
|
||||
/// `Deref` and `AsRef` are also available.
|
||||
#[inline(always)]
|
||||
pub fn as_slice(&self) -> &[u8] {
|
||||
self.data
|
||||
}
|
||||
|
||||
/// Returns the len of the slice.
|
||||
#[inline(always)]
|
||||
pub fn len(&self) -> usize {
|
||||
self.data.len()
|
||||
}
|
||||
|
||||
/// Splits the OwnedBytes into two OwnedBytes `(left, right)`.
|
||||
///
|
||||
/// Left will hold `split_len` bytes.
|
||||
///
|
||||
/// This operation is cheap and does not require to copy any memory.
|
||||
/// On the other hand, both `left` and `right` retain a handle over
|
||||
/// the entire slice of memory. In other words, the memory will only
|
||||
/// be released when both left and right are dropped.
|
||||
pub fn split(self, split_len: usize) -> (OwnedBytes, OwnedBytes) {
|
||||
let right_box_stable_deref = self.box_stable_deref.clone();
|
||||
let left = OwnedBytes {
|
||||
data: &self.data[..split_len],
|
||||
box_stable_deref: self.box_stable_deref,
|
||||
};
|
||||
let right = OwnedBytes {
|
||||
data: &self.data[split_len..],
|
||||
box_stable_deref: right_box_stable_deref,
|
||||
};
|
||||
(left, right)
|
||||
}
|
||||
|
||||
/// Returns true iff this `OwnedBytes` is empty.
|
||||
#[inline(always)]
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.as_slice().is_empty()
|
||||
}
|
||||
|
||||
/// Drops the left most `advance_len` bytes.
|
||||
///
|
||||
/// See also [.clip(clip_len: usize))](#method.clip).
|
||||
#[inline(always)]
|
||||
pub fn advance(&mut self, advance_len: usize) {
|
||||
self.data = &self.data[advance_len..]
|
||||
}
|
||||
|
||||
/// Reads an `u8` from the `OwnedBytes` and advance by one byte.
|
||||
pub fn read_u8(&mut self) -> u8 {
|
||||
assert!(!self.is_empty());
|
||||
|
||||
let byte = self.as_slice()[0];
|
||||
self.advance(1);
|
||||
byte
|
||||
}
|
||||
|
||||
/// Reads an `u64` encoded as little-endian from the `OwnedBytes` and advance by 8 bytes.
|
||||
pub fn read_u64(&mut self) -> u64 {
|
||||
assert!(self.len() > 7);
|
||||
|
||||
let octlet: [u8; 8] = self.as_slice()[..8].try_into().unwrap();
|
||||
self.advance(8);
|
||||
u64::from_le_bytes(octlet)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for OwnedBytes {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
// We truncate the bytes in order to make sure the debug string
|
||||
// is not too long.
|
||||
let bytes_truncated: &[u8] = if self.len() > 8 {
|
||||
&self.as_slice()[..10]
|
||||
} else {
|
||||
self.as_slice()
|
||||
};
|
||||
write!(f, "OwnedBytes({:?}, len={})", bytes_truncated, self.len())
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for OwnedBytes {
|
||||
type Target = [u8];
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.as_slice()
|
||||
}
|
||||
}
|
||||
|
||||
impl io::Read for OwnedBytes {
|
||||
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
||||
let read_len = {
|
||||
let data = self.as_slice();
|
||||
if data.len() >= buf.len() {
|
||||
let buf_len = buf.len();
|
||||
buf.copy_from_slice(&data[..buf_len]);
|
||||
buf.len()
|
||||
} else {
|
||||
let data_len = data.len();
|
||||
buf[..data_len].copy_from_slice(data);
|
||||
data_len
|
||||
}
|
||||
};
|
||||
self.advance(read_len);
|
||||
Ok(read_len)
|
||||
}
|
||||
fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
|
||||
let read_len = {
|
||||
let data = self.as_slice();
|
||||
buf.extend(data);
|
||||
data.len()
|
||||
};
|
||||
self.advance(read_len);
|
||||
Ok(read_len)
|
||||
}
|
||||
fn read_exact(&mut self, buf: &mut [u8]) -> io::Result<()> {
|
||||
let read_len = self.read(buf)?;
|
||||
if read_len != buf.len() {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::UnexpectedEof,
|
||||
"failed to fill whole buffer",
|
||||
));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl AsRef<[u8]> for OwnedBytes {
|
||||
fn as_ref(&self) -> &[u8] {
|
||||
self.as_slice()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::io::{self, Read};
|
||||
|
||||
use super::OwnedBytes;
|
||||
|
||||
#[test]
|
||||
fn test_owned_bytes_debug() {
|
||||
let short_bytes = OwnedBytes::new(b"abcd".as_ref());
|
||||
assert_eq!(
|
||||
format!("{:?}", short_bytes),
|
||||
"OwnedBytes([97, 98, 99, 100], len=4)"
|
||||
);
|
||||
let long_bytes = OwnedBytes::new(b"abcdefghijklmnopq".as_ref());
|
||||
assert_eq!(
|
||||
format!("{:?}", long_bytes),
|
||||
"OwnedBytes([97, 98, 99, 100, 101, 102, 103, 104, 105, 106], len=17)"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_owned_bytes_read() -> io::Result<()> {
|
||||
let mut bytes = OwnedBytes::new(b"abcdefghiklmnopqrstuvwxyz".as_ref());
|
||||
{
|
||||
let mut buf = [0u8; 5];
|
||||
bytes.read_exact(&mut buf[..]).unwrap();
|
||||
assert_eq!(&buf, b"abcde");
|
||||
assert_eq!(bytes.as_slice(), b"fghiklmnopqrstuvwxyz")
|
||||
}
|
||||
{
|
||||
let mut buf = [0u8; 2];
|
||||
bytes.read_exact(&mut buf[..]).unwrap();
|
||||
assert_eq!(&buf, b"fg");
|
||||
assert_eq!(bytes.as_slice(), b"hiklmnopqrstuvwxyz")
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_owned_bytes_read_right_at_the_end() -> io::Result<()> {
|
||||
let mut bytes = OwnedBytes::new(b"abcde".as_ref());
|
||||
let mut buf = [0u8; 5];
|
||||
assert_eq!(bytes.read(&mut buf[..]).unwrap(), 5);
|
||||
assert_eq!(&buf, b"abcde");
|
||||
assert_eq!(bytes.as_slice(), b"");
|
||||
assert_eq!(bytes.read(&mut buf[..]).unwrap(), 0);
|
||||
assert_eq!(&buf, b"abcde");
|
||||
Ok(())
|
||||
}
|
||||
#[test]
|
||||
fn test_owned_bytes_read_incomplete() -> io::Result<()> {
|
||||
let mut bytes = OwnedBytes::new(b"abcde".as_ref());
|
||||
let mut buf = [0u8; 7];
|
||||
assert_eq!(bytes.read(&mut buf[..]).unwrap(), 5);
|
||||
assert_eq!(&buf[..5], b"abcde");
|
||||
assert_eq!(bytes.read(&mut buf[..]).unwrap(), 0);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_owned_bytes_read_to_end() -> io::Result<()> {
|
||||
let mut bytes = OwnedBytes::new(b"abcde".as_ref());
|
||||
let mut buf = Vec::new();
|
||||
bytes.read_to_end(&mut buf)?;
|
||||
assert_eq!(buf.as_slice(), b"abcde".as_ref());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_owned_bytes_read_u8() -> io::Result<()> {
|
||||
let mut bytes = OwnedBytes::new(b"\xFF".as_ref());
|
||||
assert_eq!(bytes.read_u8(), 255);
|
||||
assert_eq!(bytes.len(), 0);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_owned_bytes_read_u64() -> io::Result<()> {
|
||||
let mut bytes = OwnedBytes::new(b"\0\xFF\xFF\xFF\xFF\xFF\xFF\xFF".as_ref());
|
||||
assert_eq!(bytes.read_u64(), u64::MAX - 255);
|
||||
assert_eq!(bytes.len(), 0);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_owned_bytes_split() {
|
||||
let bytes = OwnedBytes::new(b"abcdefghi".as_ref());
|
||||
let (left, right) = bytes.split(3);
|
||||
assert_eq!(left.as_slice(), b"abc");
|
||||
assert_eq!(right.as_slice(), b"defghi");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_owned_bytes_split_boundary() {
|
||||
let bytes = OwnedBytes::new(b"abcdefghi".as_ref());
|
||||
{
|
||||
let (left, right) = bytes.clone().split(0);
|
||||
assert_eq!(left.as_slice(), b"");
|
||||
assert_eq!(right.as_slice(), b"abcdefghi");
|
||||
}
|
||||
{
|
||||
let (left, right) = bytes.split(9);
|
||||
assert_eq!(left.as_slice(), b"abcdefghi");
|
||||
assert_eq!(right.as_slice(), b"");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,7 +14,7 @@ use std::sync::{Arc, RwLock};
|
||||
|
||||
use super::FileHandle;
|
||||
|
||||
/// Writer associated with the `RAMDirectory`
|
||||
/// Writer associated with the `RamDirectory`
|
||||
///
|
||||
/// The Writer just writes a buffer.
|
||||
///
|
||||
@@ -26,13 +26,13 @@ use super::FileHandle;
|
||||
///
|
||||
struct VecWriter {
|
||||
path: PathBuf,
|
||||
shared_directory: RAMDirectory,
|
||||
shared_directory: RamDirectory,
|
||||
data: Cursor<Vec<u8>>,
|
||||
is_flushed: bool,
|
||||
}
|
||||
|
||||
impl VecWriter {
|
||||
fn new(path_buf: PathBuf, shared_directory: RAMDirectory) -> VecWriter {
|
||||
fn new(path_buf: PathBuf, shared_directory: RamDirectory) -> VecWriter {
|
||||
VecWriter {
|
||||
path: path_buf,
|
||||
data: Cursor::new(Vec::new()),
|
||||
@@ -46,7 +46,7 @@ impl Drop for VecWriter {
|
||||
fn drop(&mut self) {
|
||||
if !self.is_flushed {
|
||||
panic!(
|
||||
"You forgot to flush {:?} before its writter got Drop. Do not rely on drop.",
|
||||
"You forgot to flush {:?} before its writter got Drop. Do not rely on drop. This also occurs when the indexer crashed, so you may want to check the logs for the root cause.",
|
||||
self.path
|
||||
)
|
||||
}
|
||||
@@ -119,9 +119,9 @@ impl InnerDirectory {
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for RAMDirectory {
|
||||
impl fmt::Debug for RamDirectory {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "RAMDirectory")
|
||||
write!(f, "RamDirectory")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -131,23 +131,23 @@ impl fmt::Debug for RAMDirectory {
|
||||
/// Writes are only made visible upon flushing.
|
||||
///
|
||||
#[derive(Clone, Default)]
|
||||
pub struct RAMDirectory {
|
||||
pub struct RamDirectory {
|
||||
fs: Arc<RwLock<InnerDirectory>>,
|
||||
}
|
||||
|
||||
impl RAMDirectory {
|
||||
impl RamDirectory {
|
||||
/// Constructor
|
||||
pub fn create() -> RAMDirectory {
|
||||
pub fn create() -> RamDirectory {
|
||||
Self::default()
|
||||
}
|
||||
|
||||
/// Returns the sum of the size of the different files
|
||||
/// in the RAMDirectory.
|
||||
/// in the RamDirectory.
|
||||
pub fn total_mem_usage(&self) -> usize {
|
||||
self.fs.read().unwrap().total_mem_usage()
|
||||
}
|
||||
|
||||
/// Write a copy of all of the files saved in the RAMDirectory in the target `Directory`.
|
||||
/// Write a copy of all of the files saved in the RamDirectory in the target `Directory`.
|
||||
///
|
||||
/// Files are all written using the `Directory::write` meaning, even if they were
|
||||
/// written using the `atomic_write` api.
|
||||
@@ -164,7 +164,7 @@ impl RAMDirectory {
|
||||
}
|
||||
}
|
||||
|
||||
impl Directory for RAMDirectory {
|
||||
impl Directory for RamDirectory {
|
||||
fn get_file_handle(&self, path: &Path) -> Result<Box<dyn FileHandle>, OpenReadError> {
|
||||
let file_slice = self.open_read(path)?;
|
||||
Ok(Box::new(file_slice))
|
||||
@@ -175,8 +175,8 @@ impl Directory for RAMDirectory {
|
||||
}
|
||||
|
||||
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
|
||||
fail_point!("RAMDirectory::delete", |_| {
|
||||
Err(DeleteError::IOError {
|
||||
fail_point!("RamDirectory::delete", |_| {
|
||||
Err(DeleteError::IoError {
|
||||
io_error: io::Error::from(io::ErrorKind::Other),
|
||||
filepath: path.to_path_buf(),
|
||||
})
|
||||
@@ -188,7 +188,7 @@ impl Directory for RAMDirectory {
|
||||
Ok(self
|
||||
.fs
|
||||
.read()
|
||||
.map_err(|e| OpenReadError::IOError {
|
||||
.map_err(|e| OpenReadError::IoError {
|
||||
io_error: io::Error::new(io::ErrorKind::Other, e.to_string()),
|
||||
filepath: path.to_path_buf(),
|
||||
})?
|
||||
@@ -212,7 +212,7 @@ impl Directory for RAMDirectory {
|
||||
let bytes =
|
||||
self.open_read(path)?
|
||||
.read_bytes()
|
||||
.map_err(|io_error| OpenReadError::IOError {
|
||||
.map_err(|io_error| OpenReadError::IoError {
|
||||
io_error,
|
||||
filepath: path.to_path_buf(),
|
||||
})?;
|
||||
@@ -220,7 +220,7 @@ impl Directory for RAMDirectory {
|
||||
}
|
||||
|
||||
fn atomic_write(&self, path: &Path, data: &[u8]) -> io::Result<()> {
|
||||
fail_point!("RAMDirectory::atomic_write", |msg| Err(io::Error::new(
|
||||
fail_point!("RamDirectory::atomic_write", |msg| Err(io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
msg.unwrap_or_else(|| "Undefined".to_string())
|
||||
)));
|
||||
@@ -241,7 +241,7 @@ impl Directory for RAMDirectory {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::RAMDirectory;
|
||||
use super::RamDirectory;
|
||||
use crate::Directory;
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
@@ -252,12 +252,12 @@ mod tests {
|
||||
let msg_seq: &'static [u8] = b"sequential is the way";
|
||||
let path_atomic: &'static Path = Path::new("atomic");
|
||||
let path_seq: &'static Path = Path::new("seq");
|
||||
let directory = RAMDirectory::create();
|
||||
let directory = RamDirectory::create();
|
||||
assert!(directory.atomic_write(path_atomic, msg_atomic).is_ok());
|
||||
let mut wrt = directory.open_write(path_seq).unwrap();
|
||||
assert!(wrt.write_all(msg_seq).is_ok());
|
||||
assert!(wrt.flush().is_ok());
|
||||
let directory_copy = RAMDirectory::create();
|
||||
let directory_copy = RamDirectory::create();
|
||||
assert!(directory.persist(&directory_copy).is_ok());
|
||||
assert_eq!(directory_copy.atomic_read(path_atomic).unwrap(), msg_atomic);
|
||||
assert_eq!(directory_copy.atomic_read(path_seq).unwrap(), msg_seq);
|
||||
|
||||
@@ -65,12 +65,12 @@ mod mmap_directory_tests {
|
||||
}
|
||||
|
||||
mod ram_directory_tests {
|
||||
use crate::directory::RAMDirectory;
|
||||
use crate::directory::RamDirectory;
|
||||
|
||||
type DirectoryImpl = RAMDirectory;
|
||||
type DirectoryImpl = RamDirectory;
|
||||
|
||||
fn make_directory() -> DirectoryImpl {
|
||||
RAMDirectory::default()
|
||||
RamDirectory::default()
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -122,7 +122,7 @@ mod ram_directory_tests {
|
||||
#[should_panic]
|
||||
fn ram_directory_panics_if_flush_forgotten() {
|
||||
let test_path: &'static Path = Path::new("some_path_for_test");
|
||||
let ram_directory = RAMDirectory::create();
|
||||
let ram_directory = RamDirectory::create();
|
||||
let mut write_file = ram_directory.open_write(test_path).unwrap();
|
||||
assert!(write_file.write_all(&[4]).is_ok());
|
||||
}
|
||||
@@ -166,26 +166,26 @@ fn test_write_create_the_file(directory: &dyn Directory) {
|
||||
fn test_directory_delete(directory: &dyn Directory) -> crate::Result<()> {
|
||||
let test_path: &'static Path = Path::new("some_path_for_test");
|
||||
assert!(directory.open_read(test_path).is_err());
|
||||
let mut write_file = directory.open_write(&test_path)?;
|
||||
let mut write_file = directory.open_write(test_path)?;
|
||||
write_file.write_all(&[1, 2, 3, 4])?;
|
||||
write_file.flush()?;
|
||||
{
|
||||
let read_handle = directory.open_read(&test_path)?.read_bytes()?;
|
||||
let read_handle = directory.open_read(test_path)?.read_bytes()?;
|
||||
assert_eq!(read_handle.as_slice(), &[1u8, 2u8, 3u8, 4u8]);
|
||||
// Mapped files can't be deleted on Windows
|
||||
if !cfg!(windows) {
|
||||
assert!(directory.delete(&test_path).is_ok());
|
||||
assert!(directory.delete(test_path).is_ok());
|
||||
assert_eq!(read_handle.as_slice(), &[1u8, 2u8, 3u8, 4u8]);
|
||||
}
|
||||
assert!(directory.delete(Path::new("SomeOtherPath")).is_err());
|
||||
}
|
||||
|
||||
if cfg!(windows) {
|
||||
assert!(directory.delete(&test_path).is_ok());
|
||||
assert!(directory.delete(test_path).is_ok());
|
||||
}
|
||||
|
||||
assert!(directory.open_read(&test_path).is_err());
|
||||
assert!(directory.delete(&test_path).is_err());
|
||||
assert!(directory.open_read(test_path).is_err());
|
||||
assert!(directory.delete(test_path).is_err());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -70,7 +70,7 @@ pub enum TantivyError {
|
||||
LockFailure(LockError, Option<String>),
|
||||
/// IO Error.
|
||||
#[error("An IO error occurred: '{0}'")]
|
||||
IOError(#[from] io::Error),
|
||||
IoError(#[from] io::Error),
|
||||
/// Data corruption.
|
||||
#[error("Data corrupted: '{0:?}'")]
|
||||
DataCorruption(DataCorruption),
|
||||
@@ -83,6 +83,9 @@ pub enum TantivyError {
|
||||
/// An Error happened in one of the thread.
|
||||
#[error("An error occurred in a thread: '{0}'")]
|
||||
ErrorInThread(String),
|
||||
/// An Error appeared related to opening or creating a index.
|
||||
#[error("Missing required index builder argument when open/create index: '{0}'")]
|
||||
IndexBuilderMissingArgument(&'static str),
|
||||
/// An Error appeared related to the schema.
|
||||
#[error("Schema error: '{0}'")]
|
||||
SchemaError(String),
|
||||
@@ -136,7 +139,7 @@ impl From<schema::DocParsingError> for TantivyError {
|
||||
|
||||
impl From<serde_json::Error> for TantivyError {
|
||||
fn from(error: serde_json::Error) -> TantivyError {
|
||||
TantivyError::IOError(error.into())
|
||||
TantivyError::IoError(error.into())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -56,7 +56,7 @@ mod tests {
|
||||
fn test_stored_bytes() -> crate::Result<()> {
|
||||
let searcher = create_index_for_test(STORED)?;
|
||||
assert_eq!(searcher.num_docs(), 1);
|
||||
let retrieved_doc = searcher.doc(DocAddress(0u32, 0u32))?;
|
||||
let retrieved_doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
|
||||
let field = searcher.schema().get_field("string_bytes").unwrap();
|
||||
let values: Vec<&Value> = retrieved_doc.get_all(field).collect();
|
||||
assert_eq!(values.len(), 2);
|
||||
@@ -72,7 +72,7 @@ mod tests {
|
||||
fn test_non_stored_bytes() -> crate::Result<()> {
|
||||
let searcher = create_index_for_test(INDEXED)?;
|
||||
assert_eq!(searcher.num_docs(), 1);
|
||||
let retrieved_doc = searcher.doc(DocAddress(0u32, 0u32))?;
|
||||
let retrieved_doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
|
||||
let field = searcher.schema().get_field("string_bytes").unwrap();
|
||||
assert!(retrieved_doc.get_first(field).is_none());
|
||||
Ok(())
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use crate::directory::FileSlice;
|
||||
use crate::directory::OwnedBytes;
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::fastfield::{BitpackedFastFieldReader, FastFieldReader, MultiValueLength};
|
||||
use crate::DocId;
|
||||
|
||||
/// Reader for byte array fast fields
|
||||
@@ -15,13 +15,13 @@ use crate::DocId;
|
||||
/// and the start index for the next document, and keeping the bytes in between.
|
||||
#[derive(Clone)]
|
||||
pub struct BytesFastFieldReader {
|
||||
idx_reader: FastFieldReader<u64>,
|
||||
idx_reader: BitpackedFastFieldReader<u64>,
|
||||
values: OwnedBytes,
|
||||
}
|
||||
|
||||
impl BytesFastFieldReader {
|
||||
pub(crate) fn open(
|
||||
idx_reader: FastFieldReader<u64>,
|
||||
idx_reader: BitpackedFastFieldReader<u64>,
|
||||
values_file: FileSlice,
|
||||
) -> crate::Result<BytesFastFieldReader> {
|
||||
let values = values_file.read_bytes()?;
|
||||
@@ -40,8 +40,23 @@ impl BytesFastFieldReader {
|
||||
&self.values.as_slice()[start..stop]
|
||||
}
|
||||
|
||||
/// Returns the length of the bytes associated to the given `doc`
|
||||
pub fn num_bytes(&self, doc: DocId) -> usize {
|
||||
let (start, stop) = self.range(doc);
|
||||
stop - start
|
||||
}
|
||||
|
||||
/// Returns the overall number of bytes in this bytes fast field.
|
||||
pub fn total_num_bytes(&self) -> usize {
|
||||
self.values.len()
|
||||
}
|
||||
}
|
||||
|
||||
impl MultiValueLength for BytesFastFieldReader {
|
||||
fn get_len(&self, doc_id: DocId) -> u64 {
|
||||
self.num_bytes(doc_id) as u64
|
||||
}
|
||||
fn get_total_len(&self) -> u64 {
|
||||
self.total_num_bytes() as u64
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
use std::io;
|
||||
|
||||
use crate::fastfield::serializer::FastFieldSerializer;
|
||||
use crate::schema::{Document, Field, Value};
|
||||
use crate::DocId;
|
||||
use crate::{
|
||||
fastfield::serializer::CompositeFastFieldSerializer, indexer::doc_id_mapping::DocIdMapping,
|
||||
};
|
||||
|
||||
/// Writer for byte array (as in, any number of bytes per document) fast fields
|
||||
///
|
||||
@@ -35,6 +37,10 @@ impl BytesFastFieldWriter {
|
||||
}
|
||||
}
|
||||
|
||||
/// The memory used (inclusive childs)
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.vals.capacity() + self.doc_index.capacity() * std::mem::size_of::<u64>()
|
||||
}
|
||||
/// Access the field associated to the `BytesFastFieldWriter`
|
||||
pub fn field(&self) -> Field {
|
||||
self.field
|
||||
@@ -68,20 +74,62 @@ impl BytesFastFieldWriter {
|
||||
doc
|
||||
}
|
||||
|
||||
/// Returns an iterator over values per doc_id in ascending doc_id order.
|
||||
///
|
||||
/// Normally the order is simply iterating self.doc_id_index.
|
||||
/// With doc_id_map it accounts for the new mapping, returning values in the order of the
|
||||
/// new doc_ids.
|
||||
fn get_ordered_values<'a: 'b, 'b>(
|
||||
&'a self,
|
||||
doc_id_map: Option<&'b DocIdMapping>,
|
||||
) -> impl Iterator<Item = &'b [u8]> {
|
||||
let doc_id_iter: Box<dyn Iterator<Item = u32>> = if let Some(doc_id_map) = doc_id_map {
|
||||
Box::new(doc_id_map.iter_old_doc_ids())
|
||||
} else {
|
||||
let max_doc = self.doc_index.len() as u32;
|
||||
Box::new(0..max_doc)
|
||||
};
|
||||
doc_id_iter.map(move |doc_id| self.get_values_for_doc_id(doc_id))
|
||||
}
|
||||
|
||||
/// returns all values for a doc_ids
|
||||
fn get_values_for_doc_id(&self, doc_id: u32) -> &[u8] {
|
||||
let start_pos = self.doc_index[doc_id as usize] as usize;
|
||||
let end_pos = self
|
||||
.doc_index
|
||||
.get(doc_id as usize + 1)
|
||||
.cloned()
|
||||
.unwrap_or(self.vals.len() as u64) as usize; // special case, last doc_id has no offset information
|
||||
&self.vals[start_pos..end_pos]
|
||||
}
|
||||
|
||||
/// Serializes the fast field values by pushing them to the `FastFieldSerializer`.
|
||||
pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> {
|
||||
pub fn serialize(
|
||||
&self,
|
||||
serializer: &mut CompositeFastFieldSerializer,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
) -> io::Result<()> {
|
||||
// writing the offset index
|
||||
let mut doc_index_serializer =
|
||||
serializer.new_u64_fast_field_with_idx(self.field, 0, self.vals.len() as u64, 0)?;
|
||||
for &offset in &self.doc_index {
|
||||
let mut offset = 0;
|
||||
for vals in self.get_ordered_values(doc_id_map) {
|
||||
doc_index_serializer.add_val(offset)?;
|
||||
offset += vals.len() as u64;
|
||||
}
|
||||
doc_index_serializer.add_val(self.vals.len() as u64)?;
|
||||
doc_index_serializer.close_field()?;
|
||||
// writing the values themselves
|
||||
serializer
|
||||
.new_bytes_fast_field_with_idx(self.field, 1)
|
||||
.write_all(&self.vals)?;
|
||||
let mut value_serializer = serializer.new_bytes_fast_field_with_idx(self.field, 1);
|
||||
// the else could be removed, but this is faster (difference not benchmarked)
|
||||
if let Some(doc_id_map) = doc_id_map {
|
||||
for vals in self.get_ordered_values(Some(doc_id_map)) {
|
||||
// sort values in case of remapped doc_ids?
|
||||
value_serializer.write_all(vals)?;
|
||||
}
|
||||
} else {
|
||||
value_serializer.write_all(&self.vals)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -41,20 +41,20 @@ pub fn write_delete_bitset(
|
||||
#[derive(Clone)]
|
||||
pub struct DeleteBitSet {
|
||||
data: OwnedBytes,
|
||||
len: usize,
|
||||
num_deleted: usize,
|
||||
}
|
||||
|
||||
impl DeleteBitSet {
|
||||
#[cfg(test)]
|
||||
pub(crate) fn for_test(docs: &[DocId], max_doc: u32) -> DeleteBitSet {
|
||||
use crate::directory::{Directory, RAMDirectory, TerminatingWrite};
|
||||
use crate::directory::{Directory, RamDirectory, TerminatingWrite};
|
||||
use std::path::Path;
|
||||
assert!(docs.iter().all(|&doc| doc < max_doc));
|
||||
let mut bitset = BitSet::with_max_value(max_doc);
|
||||
for &doc in docs {
|
||||
bitset.insert(doc);
|
||||
}
|
||||
let directory = RAMDirectory::create();
|
||||
let directory = RamDirectory::create();
|
||||
let path = Path::new("dummydeletebitset");
|
||||
let mut wrt = directory.open_write(path).unwrap();
|
||||
write_delete_bitset(&bitset, max_doc, &mut wrt).unwrap();
|
||||
@@ -73,7 +73,7 @@ impl DeleteBitSet {
|
||||
.sum();
|
||||
Ok(DeleteBitSet {
|
||||
data: bytes,
|
||||
len: num_deleted,
|
||||
num_deleted,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -83,7 +83,7 @@ impl DeleteBitSet {
|
||||
}
|
||||
|
||||
/// Returns true iff the document has been marked as deleted.
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn is_deleted(&self, doc: DocId) -> bool {
|
||||
let byte_offset = doc / 8u32;
|
||||
let b: u8 = self.data.as_slice()[byte_offset as usize];
|
||||
@@ -91,6 +91,10 @@ impl DeleteBitSet {
|
||||
b & (1u8 << shift) != 0
|
||||
}
|
||||
|
||||
/// The number of deleted docs
|
||||
pub fn num_deleted(&self) -> usize {
|
||||
self.num_deleted
|
||||
}
|
||||
/// Summarize total space usage of this bitset.
|
||||
pub fn space_usage(&self) -> ByteCount {
|
||||
self.data.len()
|
||||
@@ -99,7 +103,7 @@ impl DeleteBitSet {
|
||||
|
||||
impl HasLen for DeleteBitSet {
|
||||
fn len(&self) -> usize {
|
||||
self.len
|
||||
self.num_deleted
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -95,7 +95,7 @@ mod tests {
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b")));
|
||||
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let facet_reader = searcher
|
||||
@@ -105,7 +105,7 @@ mod tests {
|
||||
let mut facet_ords = Vec::new();
|
||||
facet_reader.facet_ords(0u32, &mut facet_ords);
|
||||
assert_eq!(&facet_ords, &[2u64]);
|
||||
let doc = searcher.doc(DocAddress(0u32, 0u32))?;
|
||||
let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
|
||||
let value = doc.get_first(facet_field).and_then(Value::path);
|
||||
assert_eq!(value, None);
|
||||
Ok(())
|
||||
@@ -118,7 +118,7 @@ mod tests {
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b")));
|
||||
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let facet_reader = searcher
|
||||
@@ -128,7 +128,7 @@ mod tests {
|
||||
let mut facet_ords = Vec::new();
|
||||
facet_reader.facet_ords(0u32, &mut facet_ords);
|
||||
assert!(facet_ords.is_empty());
|
||||
let doc = searcher.doc(DocAddress(0u32, 0u32))?;
|
||||
let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
|
||||
let value = doc.get_first(facet_field).and_then(Value::path);
|
||||
assert_eq!(value, Some("/a/b".to_string()));
|
||||
Ok(())
|
||||
@@ -141,7 +141,7 @@ mod tests {
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b")));
|
||||
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let facet_reader = searcher
|
||||
@@ -151,7 +151,7 @@ mod tests {
|
||||
let mut facet_ords = Vec::new();
|
||||
facet_reader.facet_ords(0u32, &mut facet_ords);
|
||||
assert_eq!(&facet_ords, &[2u64]);
|
||||
let doc = searcher.doc(DocAddress(0u32, 0u32))?;
|
||||
let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
|
||||
let value = doc.get_first(facet_field).and_then(Value::path);
|
||||
assert_eq!(value, Some("/a/b".to_string()));
|
||||
Ok(())
|
||||
@@ -164,7 +164,7 @@ mod tests {
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b")));
|
||||
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let facet_reader = searcher
|
||||
@@ -174,7 +174,7 @@ mod tests {
|
||||
let mut facet_ords = Vec::new();
|
||||
facet_reader.facet_ords(0u32, &mut facet_ords);
|
||||
assert!(facet_ords.is_empty());
|
||||
let doc = searcher.doc(DocAddress(0u32, 0u32))?;
|
||||
let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
|
||||
let value = doc.get_first(facet_field).and_then(Value::path);
|
||||
assert_eq!(value, None);
|
||||
Ok(())
|
||||
@@ -187,7 +187,7 @@ mod tests {
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b")));
|
||||
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
|
||||
index_writer.add_document(Document::default());
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
|
||||
@@ -29,11 +29,14 @@ pub use self::delete::DeleteBitSet;
|
||||
pub use self::error::{FastFieldNotAvailableError, Result};
|
||||
pub use self::facet_reader::FacetReader;
|
||||
pub use self::multivalued::{MultiValuedFastFieldReader, MultiValuedFastFieldWriter};
|
||||
pub(crate) use self::reader::BitpackedFastFieldReader;
|
||||
pub use self::reader::DynamicFastFieldReader;
|
||||
pub use self::reader::FastFieldReader;
|
||||
pub use self::readers::FastFieldReaders;
|
||||
pub use self::serializer::FastFieldSerializer;
|
||||
pub use self::serializer::CompositeFastFieldSerializer;
|
||||
pub use self::serializer::FastFieldDataAccess;
|
||||
pub use self::serializer::FastFieldStats;
|
||||
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
|
||||
use crate::common;
|
||||
use crate::schema::Cardinality;
|
||||
use crate::schema::FieldType;
|
||||
use crate::schema::Value;
|
||||
@@ -41,6 +44,7 @@ use crate::{
|
||||
chrono::{NaiveDateTime, Utc},
|
||||
schema::Type,
|
||||
};
|
||||
use crate::{common, DocId};
|
||||
|
||||
mod bytes;
|
||||
mod delete;
|
||||
@@ -52,6 +56,15 @@ mod readers;
|
||||
mod serializer;
|
||||
mod writer;
|
||||
|
||||
/// Trait for `BytesFastFieldReader` and `MultiValuedFastFieldReader` to return the length of data
|
||||
/// for a doc_id
|
||||
pub trait MultiValueLength {
|
||||
/// returns the num of values associated to a doc_id
|
||||
fn get_len(&self, doc_id: DocId) -> u64;
|
||||
/// returns the sum of num values for all doc_ids
|
||||
fn get_total_len(&self) -> u64;
|
||||
}
|
||||
|
||||
/// Trait for types that are allowed for fast fields: (u64, i64 and f64).
|
||||
pub trait FastValue: Clone + Copy + Send + Sync + PartialOrd + 'static {
|
||||
/// Converts a value from u64
|
||||
@@ -201,15 +214,14 @@ mod tests {
|
||||
|
||||
use super::*;
|
||||
use crate::common::CompositeFile;
|
||||
use crate::directory::{Directory, RAMDirectory, WritePtr};
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::common::HasLen;
|
||||
use crate::directory::{Directory, RamDirectory, WritePtr};
|
||||
use crate::merge_policy::NoMergePolicy;
|
||||
use crate::schema::Field;
|
||||
use crate::schema::Schema;
|
||||
use crate::schema::FAST;
|
||||
use crate::schema::{Document, IntOptions};
|
||||
use crate::{Index, SegmentId, SegmentReader};
|
||||
use common::HasLen;
|
||||
use once_cell::sync::Lazy;
|
||||
use rand::prelude::SliceRandom;
|
||||
use rand::rngs::StdRng;
|
||||
@@ -227,7 +239,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
pub fn test_fastfield() {
|
||||
let test_fastfield = FastFieldReader::<u64>::from(vec![100, 200, 300]);
|
||||
let test_fastfield = DynamicFastFieldReader::<u64>::from(vec![100, 200, 300]);
|
||||
assert_eq!(test_fastfield.get(0), 100);
|
||||
assert_eq!(test_fastfield.get(1), 200);
|
||||
assert_eq!(test_fastfield.get(2), 300);
|
||||
@@ -242,24 +254,24 @@ mod tests {
|
||||
#[test]
|
||||
fn test_intfastfield_small() -> crate::Result<()> {
|
||||
let path = Path::new("test");
|
||||
let directory: RAMDirectory = RAMDirectory::create();
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>13u64));
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>14u64));
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>2u64));
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new())
|
||||
.serialize(&mut serializer, &HashMap::new(), None)
|
||||
.unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let file = directory.open_read(&path).unwrap();
|
||||
assert_eq!(file.len(), 36 as usize);
|
||||
let file = directory.open_read(path).unwrap();
|
||||
assert_eq!(file.len(), 37);
|
||||
let composite_file = CompositeFile::open(&file)?;
|
||||
let file = composite_file.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<u64>::open(file)?;
|
||||
let fast_field_reader = DynamicFastFieldReader::<u64>::open(file)?;
|
||||
assert_eq!(fast_field_reader.get(0), 13u64);
|
||||
assert_eq!(fast_field_reader.get(1), 14u64);
|
||||
assert_eq!(fast_field_reader.get(2), 2u64);
|
||||
@@ -269,10 +281,10 @@ mod tests {
|
||||
#[test]
|
||||
fn test_intfastfield_large() -> crate::Result<()> {
|
||||
let path = Path::new("test");
|
||||
let directory: RAMDirectory = RAMDirectory::create();
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test"))?;
|
||||
let mut serializer = FastFieldSerializer::from_write(write)?;
|
||||
let mut serializer = CompositeFastFieldSerializer::from_write(write)?;
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>4u64));
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>14_082_001u64));
|
||||
@@ -283,15 +295,15 @@ mod tests {
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>1_002u64));
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>1_501u64));
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>215u64));
|
||||
fast_field_writers.serialize(&mut serializer, &HashMap::new())?;
|
||||
fast_field_writers.serialize(&mut serializer, &HashMap::new(), None)?;
|
||||
serializer.close()?;
|
||||
}
|
||||
let file = directory.open_read(&path)?;
|
||||
assert_eq!(file.len(), 61 as usize);
|
||||
let file = directory.open_read(path)?;
|
||||
assert_eq!(file.len(), 62);
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file)?;
|
||||
let data = fast_fields_composite.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<u64>::open(data)?;
|
||||
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;
|
||||
assert_eq!(fast_field_reader.get(0), 4u64);
|
||||
assert_eq!(fast_field_reader.get(1), 14_082_001u64);
|
||||
assert_eq!(fast_field_reader.get(2), 3_052u64);
|
||||
@@ -308,26 +320,26 @@ mod tests {
|
||||
#[test]
|
||||
fn test_intfastfield_null_amplitude() -> crate::Result<()> {
|
||||
let path = Path::new("test");
|
||||
let directory: RAMDirectory = RAMDirectory::create();
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
for _ in 0..10_000 {
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>100_000u64));
|
||||
}
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new())
|
||||
.serialize(&mut serializer, &HashMap::new(), None)
|
||||
.unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let file = directory.open_read(&path).unwrap();
|
||||
assert_eq!(file.len(), 34 as usize);
|
||||
let file = directory.open_read(path).unwrap();
|
||||
assert_eq!(file.len(), 35);
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file).unwrap();
|
||||
let data = fast_fields_composite.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<u64>::open(data)?;
|
||||
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;
|
||||
for doc in 0..10_000 {
|
||||
assert_eq!(fast_field_reader.get(doc), 100_000u64);
|
||||
}
|
||||
@@ -338,11 +350,11 @@ mod tests {
|
||||
#[test]
|
||||
fn test_intfastfield_large_numbers() -> crate::Result<()> {
|
||||
let path = Path::new("test");
|
||||
let directory: RAMDirectory = RAMDirectory::create();
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
// forcing the amplitude to be high
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>0u64));
|
||||
@@ -350,16 +362,16 @@ mod tests {
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>5_000_000_000_000_000_000u64 + i));
|
||||
}
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new())
|
||||
.serialize(&mut serializer, &HashMap::new(), None)
|
||||
.unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let file = directory.open_read(&path).unwrap();
|
||||
assert_eq!(file.len(), 80042 as usize);
|
||||
let file = directory.open_read(path).unwrap();
|
||||
assert_eq!(file.len(), 80043);
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file)?;
|
||||
let data = fast_fields_composite.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<u64>::open(data)?;
|
||||
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;
|
||||
assert_eq!(fast_field_reader.get(0), 0u64);
|
||||
for doc in 1..10_001 {
|
||||
assert_eq!(
|
||||
@@ -374,14 +386,14 @@ mod tests {
|
||||
#[test]
|
||||
fn test_signed_intfastfield() -> crate::Result<()> {
|
||||
let path = Path::new("test");
|
||||
let directory: RAMDirectory = RAMDirectory::create();
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
let i64_field = schema_builder.add_i64_field("field", FAST);
|
||||
let schema = schema_builder.build();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
|
||||
for i in -100i64..10_000i64 {
|
||||
let mut doc = Document::default();
|
||||
@@ -389,16 +401,17 @@ mod tests {
|
||||
fast_field_writers.add_document(&doc);
|
||||
}
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new())
|
||||
.serialize(&mut serializer, &HashMap::new(), None)
|
||||
.unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let file = directory.open_read(&path).unwrap();
|
||||
assert_eq!(file.len(), 17709 as usize);
|
||||
let file = directory.open_read(path).unwrap();
|
||||
//assert_eq!(file.len(), 17710 as usize); //bitpacked size
|
||||
assert_eq!(file.len(), 10175_usize); // linear interpol size
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file)?;
|
||||
let data = fast_fields_composite.open_read(i64_field).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<i64>::open(data)?;
|
||||
let fast_field_reader = DynamicFastFieldReader::<i64>::open(data)?;
|
||||
|
||||
assert_eq!(fast_field_reader.min_value(), -100i64);
|
||||
assert_eq!(fast_field_reader.max_value(), 9_999i64);
|
||||
@@ -417,28 +430,28 @@ mod tests {
|
||||
#[test]
|
||||
fn test_signed_intfastfield_default_val() -> crate::Result<()> {
|
||||
let path = Path::new("test");
|
||||
let directory: RAMDirectory = RAMDirectory::create();
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
let mut schema_builder = Schema::builder();
|
||||
let i64_field = schema_builder.add_i64_field("field", FAST);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
|
||||
let doc = Document::default();
|
||||
fast_field_writers.add_document(&doc);
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new())
|
||||
.serialize(&mut serializer, &HashMap::new(), None)
|
||||
.unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
|
||||
let file = directory.open_read(&path).unwrap();
|
||||
let file = directory.open_read(path).unwrap();
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file).unwrap();
|
||||
let data = fast_fields_composite.open_read(i64_field).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<i64>::open(data)?;
|
||||
let fast_field_reader = DynamicFastFieldReader::<i64>::open(data)?;
|
||||
assert_eq!(fast_field_reader.get(0u32), 0i64);
|
||||
}
|
||||
Ok(())
|
||||
@@ -456,22 +469,22 @@ mod tests {
|
||||
let path = Path::new("test");
|
||||
let permutation = generate_permutation();
|
||||
let n = permutation.len();
|
||||
let directory = RAMDirectory::create();
|
||||
let directory = RamDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test"))?;
|
||||
let mut serializer = FastFieldSerializer::from_write(write)?;
|
||||
let mut serializer = CompositeFastFieldSerializer::from_write(write)?;
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
for &x in &permutation {
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>x));
|
||||
}
|
||||
fast_field_writers.serialize(&mut serializer, &HashMap::new())?;
|
||||
fast_field_writers.serialize(&mut serializer, &HashMap::new(), None)?;
|
||||
serializer.close()?;
|
||||
}
|
||||
let file = directory.open_read(&path)?;
|
||||
let file = directory.open_read(path)?;
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file)?;
|
||||
let data = fast_fields_composite.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<u64>::open(data)?;
|
||||
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;
|
||||
|
||||
let mut a = 0u64;
|
||||
for _ in 0..n {
|
||||
@@ -576,7 +589,7 @@ mod bench {
|
||||
use super::tests::{generate_permutation, SCHEMA};
|
||||
use super::*;
|
||||
use crate::common::CompositeFile;
|
||||
use crate::directory::{Directory, RAMDirectory, WritePtr};
|
||||
use crate::directory::{Directory, RamDirectory, WritePtr};
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use std::collections::HashMap;
|
||||
use std::path::Path;
|
||||
@@ -612,16 +625,16 @@ mod bench {
|
||||
fn bench_intfastfield_linear_fflookup(b: &mut Bencher) {
|
||||
let path = Path::new("test");
|
||||
let permutation = generate_permutation();
|
||||
let directory: RAMDirectory = RAMDirectory::create();
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
for &x in &permutation {
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>x));
|
||||
}
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new())
|
||||
.serialize(&mut serializer, &HashMap::new(), None)
|
||||
.unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
@@ -629,7 +642,7 @@ mod bench {
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file).unwrap();
|
||||
let data = fast_fields_composite.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<u64>::open(data).unwrap();
|
||||
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let n = test::black_box(7000u32);
|
||||
@@ -646,16 +659,16 @@ mod bench {
|
||||
fn bench_intfastfield_fflookup(b: &mut Bencher) {
|
||||
let path = Path::new("test");
|
||||
let permutation = generate_permutation();
|
||||
let directory: RAMDirectory = RAMDirectory::create();
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
for &x in &permutation {
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>x));
|
||||
}
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new())
|
||||
.serialize(&mut serializer, &HashMap::new(), None)
|
||||
.unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
@@ -663,7 +676,7 @@ mod bench {
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file).unwrap();
|
||||
let data = fast_fields_composite.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<u64>::open(data).unwrap();
|
||||
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let n = test::black_box(1000u32);
|
||||
|
||||
@@ -77,17 +77,20 @@ mod tests {
|
||||
// add another second
|
||||
let two_secs_ahead = first_time_stamp + Duration::seconds(2);
|
||||
index_writer.add_document(doc!(date_field=>two_secs_ahead, date_field=>two_secs_ahead,date_field=>two_secs_ahead, time_i=>3i64));
|
||||
// add three seconds
|
||||
index_writer
|
||||
.add_document(doc!(date_field=>first_time_stamp + Duration::seconds(3), time_i=>4i64));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
assert_eq!(reader.num_docs(), 4);
|
||||
assert_eq!(reader.num_docs(), 5);
|
||||
|
||||
{
|
||||
let parser = QueryParser::for_index(&index, vec![date_field]);
|
||||
let query = parser
|
||||
.parse_query(&format!("\"{}\"", first_time_stamp.to_rfc3339()).to_string())
|
||||
.parse_query(&format!("\"{}\"", first_time_stamp.to_rfc3339()))
|
||||
.expect("could not parse query");
|
||||
let results = searcher
|
||||
.search(&query, &TopDocs::with_limit(5))
|
||||
@@ -118,7 +121,7 @@ mod tests {
|
||||
{
|
||||
let parser = QueryParser::for_index(&index, vec![date_field]);
|
||||
let query = parser
|
||||
.parse_query(&format!("\"{}\"", two_secs_ahead.to_rfc3339()).to_string())
|
||||
.parse_query(&format!("\"{}\"", two_secs_ahead.to_rfc3339()))
|
||||
.expect("could not parse query");
|
||||
let results = searcher
|
||||
.search(&query, &TopDocs::with_limit(5))
|
||||
@@ -147,37 +150,50 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: support Date range queries
|
||||
// {
|
||||
// let parser = QueryParser::for_index(&index, vec![date_field]);
|
||||
// let range_q = format!("\"{}\"..\"{}\"",
|
||||
// (first_time_stamp + Duration::seconds(1)).to_rfc3339(),
|
||||
// (first_time_stamp + Duration::seconds(3)).to_rfc3339()
|
||||
// );
|
||||
// let query = parser.parse_query(&range_q)
|
||||
// .expect("could not parse query");
|
||||
// let results = searcher.search(&query, &TopDocs::with_limit(5))
|
||||
// .expect("could not query index");
|
||||
//
|
||||
//
|
||||
// assert_eq!(results.len(), 2);
|
||||
// for (i, doc_pair) in results.iter().enumerate() {
|
||||
// let retrieved_doc = searcher.doc(doc_pair.1).expect("cannot fetch doc");
|
||||
// let offset_sec = match i {
|
||||
// 0 => 1,
|
||||
// 1 => 3,
|
||||
// _ => panic!("should not have more than 2 docs")
|
||||
// };
|
||||
// let time_i_val = match i {
|
||||
// 0 => 2,
|
||||
// 1 => 3,
|
||||
// _ => panic!("should not have more than 2 docs")
|
||||
// };
|
||||
// assert_eq!(retrieved_doc.get_first(date_field).expect("cannot find value").date_value().timestamp(),
|
||||
// (first_time_stamp + Duration::seconds(offset_sec)).timestamp());
|
||||
// assert_eq!(retrieved_doc.get_first(time_i).expect("cannot find value").i64_value(), time_i_val);
|
||||
// }
|
||||
// }
|
||||
{
|
||||
let parser = QueryParser::for_index(&index, vec![date_field]);
|
||||
let range_q = format!(
|
||||
"[{} TO {}}}",
|
||||
(first_time_stamp + Duration::seconds(1)).to_rfc3339(),
|
||||
(first_time_stamp + Duration::seconds(3)).to_rfc3339()
|
||||
);
|
||||
let query = parser.parse_query(&range_q).expect("could not parse query");
|
||||
let results = searcher
|
||||
.search(&query, &TopDocs::with_limit(5))
|
||||
.expect("could not query index");
|
||||
|
||||
assert_eq!(results.len(), 2);
|
||||
for (i, doc_pair) in results.iter().enumerate() {
|
||||
let retrieved_doc = searcher.doc(doc_pair.1).expect("cannot fetch doc");
|
||||
let offset_sec = match i {
|
||||
0 => 1,
|
||||
1 => 2,
|
||||
_ => panic!("should not have more than 2 docs"),
|
||||
};
|
||||
let time_i_val = match i {
|
||||
0 => 2,
|
||||
1 => 3,
|
||||
_ => panic!("should not have more than 2 docs"),
|
||||
};
|
||||
assert_eq!(
|
||||
retrieved_doc
|
||||
.get_first(date_field)
|
||||
.expect("cannot find value")
|
||||
.date_value()
|
||||
.expect("value not of Date type")
|
||||
.timestamp(),
|
||||
(first_time_stamp + Duration::seconds(offset_sec)).timestamp()
|
||||
);
|
||||
assert_eq!(
|
||||
retrieved_doc
|
||||
.get_first(time_i)
|
||||
.expect("cannot find value")
|
||||
.i64_value()
|
||||
.expect("value not of i64 type"),
|
||||
time_i_val
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::ops::Range;
|
||||
|
||||
use crate::fastfield::{FastFieldReader, FastValue};
|
||||
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue, MultiValueLength};
|
||||
use crate::DocId;
|
||||
|
||||
/// Reader for a multivalued `u64` fast field.
|
||||
@@ -13,14 +13,14 @@ use crate::DocId;
|
||||
///
|
||||
#[derive(Clone)]
|
||||
pub struct MultiValuedFastFieldReader<Item: FastValue> {
|
||||
idx_reader: FastFieldReader<u64>,
|
||||
vals_reader: FastFieldReader<Item>,
|
||||
idx_reader: DynamicFastFieldReader<u64>,
|
||||
vals_reader: DynamicFastFieldReader<Item>,
|
||||
}
|
||||
|
||||
impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
|
||||
pub(crate) fn open(
|
||||
idx_reader: FastFieldReader<u64>,
|
||||
vals_reader: FastFieldReader<Item>,
|
||||
idx_reader: DynamicFastFieldReader<u64>,
|
||||
vals_reader: DynamicFastFieldReader<Item>,
|
||||
) -> MultiValuedFastFieldReader<Item> {
|
||||
MultiValuedFastFieldReader {
|
||||
idx_reader,
|
||||
@@ -30,6 +30,7 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
|
||||
|
||||
/// Returns `(start, stop)`, such that the values associated
|
||||
/// to the given document are `start..stop`.
|
||||
#[inline]
|
||||
fn range(&self, doc: DocId) -> Range<u64> {
|
||||
let start = self.idx_reader.get(doc);
|
||||
let stop = self.idx_reader.get(doc + 1);
|
||||
@@ -37,30 +38,60 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
|
||||
}
|
||||
|
||||
/// Returns the array of values associated to the given `doc`.
|
||||
#[inline]
|
||||
pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
|
||||
let range = self.range(doc);
|
||||
let len = (range.end - range.start) as usize;
|
||||
vals.resize(len, Item::make_zero());
|
||||
self.vals_reader.get_range_u64(range.start, &mut vals[..]);
|
||||
self.vals_reader.get_range(range.start, &mut vals[..]);
|
||||
}
|
||||
|
||||
/// Returns the minimum value for this fast field.
|
||||
///
|
||||
/// The min value does not take in account of possible
|
||||
/// deleted document, and should be considered as a lower bound
|
||||
/// of the actual mimimum value.
|
||||
pub fn min_value(&self) -> Item {
|
||||
self.vals_reader.min_value()
|
||||
}
|
||||
|
||||
/// Returns the maximum value for this fast field.
|
||||
///
|
||||
/// The max value does not take in account of possible
|
||||
/// deleted document, and should be considered as an upper bound
|
||||
/// of the actual maximum value.
|
||||
pub fn max_value(&self) -> Item {
|
||||
self.vals_reader.max_value()
|
||||
}
|
||||
|
||||
/// Returns the number of values associated with the document `DocId`.
|
||||
#[inline]
|
||||
pub fn num_vals(&self, doc: DocId) -> usize {
|
||||
let range = self.range(doc);
|
||||
(range.end - range.start) as usize
|
||||
}
|
||||
|
||||
/// Returns the overall number of values in this field .
|
||||
#[inline]
|
||||
pub fn total_num_vals(&self) -> u64 {
|
||||
self.idx_reader.max_value()
|
||||
}
|
||||
}
|
||||
|
||||
impl<Item: FastValue> MultiValueLength for MultiValuedFastFieldReader<Item> {
|
||||
fn get_len(&self, doc_id: DocId) -> u64 {
|
||||
self.num_vals(doc_id) as u64
|
||||
}
|
||||
|
||||
fn get_total_len(&self) -> u64 {
|
||||
self.total_num_vals() as u64
|
||||
}
|
||||
}
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use crate::core::Index;
|
||||
use crate::schema::{Facet, Schema, INDEXED};
|
||||
use crate::schema::{Cardinality, Facet, IntOptions, Schema, INDEXED};
|
||||
|
||||
#[test]
|
||||
fn test_multifastfield_reader() {
|
||||
@@ -115,4 +146,32 @@ mod tests {
|
||||
assert_eq!(&vals[..], &[4]);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_multifastfield_reader_min_max() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field_options = IntOptions::default()
|
||||
.set_indexed()
|
||||
.set_fast(Cardinality::MultiValues);
|
||||
let item_field = schema_builder.add_i64_field("items", field_options);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index
|
||||
.writer_for_tests()
|
||||
.expect("Failed to create index writer.");
|
||||
index_writer.add_document(doc!(
|
||||
item_field => 2i64,
|
||||
item_field => 3i64,
|
||||
item_field => -2i64,
|
||||
));
|
||||
index_writer.add_document(doc!(item_field => 6i64, item_field => 3i64));
|
||||
index_writer.add_document(doc!(item_field => 4i64));
|
||||
index_writer.commit().expect("Commit failed");
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let field_reader = segment_reader.fast_fields().i64s(item_field).unwrap();
|
||||
|
||||
assert_eq!(field_reader.min_value(), -2);
|
||||
assert_eq!(field_reader.max_value(), 6);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
use crate::fastfield::serializer::FastSingleFieldSerializer;
|
||||
use crate::fastfield::value_to_u64;
|
||||
use crate::fastfield::FastFieldSerializer;
|
||||
use crate::fastfield::serializer::BitpackedFastFieldSerializerLegacy;
|
||||
use crate::fastfield::CompositeFastFieldSerializer;
|
||||
use crate::postings::UnorderedTermId;
|
||||
use crate::schema::{Document, Field};
|
||||
use crate::termdict::TermOrdinal;
|
||||
use crate::DocId;
|
||||
use crate::{fastfield::value_to_u64, indexer::doc_id_mapping::DocIdMapping};
|
||||
use fnv::FnvHashMap;
|
||||
use std::io;
|
||||
use std::iter::once;
|
||||
use tantivy_bitpacker::minmax;
|
||||
|
||||
/// Writer for multi-valued (as in, more than one value per document)
|
||||
/// int fast field.
|
||||
@@ -48,6 +48,12 @@ impl MultiValuedFastFieldWriter {
|
||||
}
|
||||
}
|
||||
|
||||
/// The memory used (inclusive childs)
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.vals.capacity() * std::mem::size_of::<UnorderedTermId>()
|
||||
+ self.doc_index.capacity() * std::mem::size_of::<u64>()
|
||||
}
|
||||
|
||||
/// Access the field associated to the `MultiValuedFastFieldWriter`
|
||||
pub fn field(&self) -> Field {
|
||||
self.field
|
||||
@@ -87,7 +93,34 @@ impl MultiValuedFastFieldWriter {
|
||||
self.vals.extend_from_slice(vals);
|
||||
doc
|
||||
}
|
||||
/// Returns an iterator over values per doc_id in ascending doc_id order.
|
||||
///
|
||||
/// Normally the order is simply iterating self.doc_id_index.
|
||||
/// With doc_id_map it accounts for the new mapping, returning values in the order of the
|
||||
/// new doc_ids.
|
||||
fn get_ordered_values<'a: 'b, 'b>(
|
||||
&'a self,
|
||||
doc_id_map: Option<&'b DocIdMapping>,
|
||||
) -> impl Iterator<Item = &'b [u64]> {
|
||||
let doc_id_iter: Box<dyn Iterator<Item = u32>> = if let Some(doc_id_map) = doc_id_map {
|
||||
Box::new(doc_id_map.iter_old_doc_ids())
|
||||
} else {
|
||||
let max_doc = self.doc_index.len() as DocId;
|
||||
Box::new(0..max_doc)
|
||||
};
|
||||
doc_id_iter.map(move |doc_id| self.get_values_for_doc_id(doc_id))
|
||||
}
|
||||
|
||||
/// returns all values for a doc_ids
|
||||
fn get_values_for_doc_id(&self, doc_id: u32) -> &[u64] {
|
||||
let start_pos = self.doc_index[doc_id as usize] as usize;
|
||||
let end_pos = self
|
||||
.doc_index
|
||||
.get(doc_id as usize + 1)
|
||||
.cloned()
|
||||
.unwrap_or(self.vals.len() as u64) as usize; // special case, last doc_id has no offset information
|
||||
&self.vals[start_pos..end_pos]
|
||||
}
|
||||
/// Serializes fast field values by pushing them to the `FastFieldSerializer`.
|
||||
///
|
||||
/// If a mapping is given, the values are remapped *and sorted* before serialization.
|
||||
@@ -101,22 +134,27 @@ impl MultiValuedFastFieldWriter {
|
||||
///
|
||||
pub fn serialize(
|
||||
&self,
|
||||
serializer: &mut FastFieldSerializer,
|
||||
serializer: &mut CompositeFastFieldSerializer,
|
||||
mapping_opt: Option<&FnvHashMap<UnorderedTermId, TermOrdinal>>,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
) -> io::Result<()> {
|
||||
{
|
||||
// writing the offset index
|
||||
let mut doc_index_serializer =
|
||||
serializer.new_u64_fast_field_with_idx(self.field, 0, self.vals.len() as u64, 0)?;
|
||||
for &offset in &self.doc_index {
|
||||
|
||||
let mut offset = 0;
|
||||
for vals in self.get_ordered_values(doc_id_map) {
|
||||
doc_index_serializer.add_val(offset)?;
|
||||
offset += vals.len() as u64;
|
||||
}
|
||||
doc_index_serializer.add_val(self.vals.len() as u64)?;
|
||||
|
||||
doc_index_serializer.close_field()?;
|
||||
}
|
||||
{
|
||||
// writing the values themselves.
|
||||
let mut value_serializer: FastSingleFieldSerializer<'_, _>;
|
||||
let mut value_serializer: BitpackedFastFieldSerializerLegacy<'_, _>;
|
||||
match mapping_opt {
|
||||
Some(mapping) => {
|
||||
value_serializer = serializer.new_u64_fast_field_with_idx(
|
||||
@@ -126,18 +164,10 @@ impl MultiValuedFastFieldWriter {
|
||||
1,
|
||||
)?;
|
||||
|
||||
let last_interval =
|
||||
self.doc_index.last().cloned().unwrap() as usize..self.vals.len();
|
||||
|
||||
let mut doc_vals: Vec<u64> = Vec::with_capacity(100);
|
||||
for range in self
|
||||
.doc_index
|
||||
.windows(2)
|
||||
.map(|interval| interval[0] as usize..interval[1] as usize)
|
||||
.chain(once(last_interval))
|
||||
{
|
||||
for vals in self.get_ordered_values(doc_id_map) {
|
||||
doc_vals.clear();
|
||||
let remapped_vals = self.vals[range]
|
||||
let remapped_vals = vals
|
||||
.iter()
|
||||
.map(|val| *mapping.get(val).expect("Missing term ordinal"));
|
||||
doc_vals.extend(remapped_vals);
|
||||
@@ -148,12 +178,15 @@ impl MultiValuedFastFieldWriter {
|
||||
}
|
||||
}
|
||||
None => {
|
||||
let val_min_max = crate::common::minmax(self.vals.iter().cloned());
|
||||
let val_min_max = minmax(self.vals.iter().cloned());
|
||||
let (val_min, val_max) = val_min_max.unwrap_or((0u64, 0u64));
|
||||
value_serializer =
|
||||
serializer.new_u64_fast_field_with_idx(self.field, val_min, val_max, 1)?;
|
||||
for &val in &self.vals {
|
||||
value_serializer.add_val(val)?;
|
||||
for vals in self.get_ordered_values(doc_id_map) {
|
||||
// sort values in case of remapped doc_ids?
|
||||
for &val in vals {
|
||||
value_serializer.add_val(val)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,47 +1,27 @@
|
||||
use super::FastValue;
|
||||
use crate::common::bitpacker::BitUnpacker;
|
||||
use crate::common::compute_num_bits;
|
||||
use crate::common::BinarySerializable;
|
||||
use crate::common::CompositeFile;
|
||||
use crate::directory::FileSlice;
|
||||
use crate::directory::{Directory, RAMDirectory, WritePtr};
|
||||
use crate::fastfield::{FastFieldSerializer, FastFieldsWriter};
|
||||
use crate::directory::OwnedBytes;
|
||||
use crate::directory::{Directory, RamDirectory, WritePtr};
|
||||
use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter};
|
||||
use crate::schema::Schema;
|
||||
use crate::schema::FAST;
|
||||
use crate::DocId;
|
||||
use fastfield_codecs::bitpacked::BitpackedFastFieldReader as BitpackedReader;
|
||||
use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer;
|
||||
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldReader;
|
||||
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
|
||||
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldReader;
|
||||
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer;
|
||||
use fastfield_codecs::FastFieldCodecReader;
|
||||
use fastfield_codecs::FastFieldCodecSerializer;
|
||||
use std::collections::HashMap;
|
||||
use std::marker::PhantomData;
|
||||
use std::path::Path;
|
||||
|
||||
/// Trait for accessing a fastfield.
|
||||
///
|
||||
/// Depending on the field type, a different
|
||||
/// fast field is required.
|
||||
#[derive(Clone)]
|
||||
pub struct FastFieldReader<Item: FastValue> {
|
||||
bit_unpacker: BitUnpacker,
|
||||
min_value_u64: u64,
|
||||
max_value_u64: u64,
|
||||
_phantom: PhantomData<Item>,
|
||||
}
|
||||
|
||||
impl<Item: FastValue> FastFieldReader<Item> {
|
||||
/// Opens a fast field given a file.
|
||||
pub fn open(file: FileSlice) -> crate::Result<Self> {
|
||||
let mut bytes = file.read_bytes()?;
|
||||
let min_value = u64::deserialize(&mut bytes)?;
|
||||
let amplitude = u64::deserialize(&mut bytes)?;
|
||||
let max_value = min_value + amplitude;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let bit_unpacker = BitUnpacker::new(bytes, num_bits);
|
||||
Ok(FastFieldReader {
|
||||
min_value_u64: min_value,
|
||||
max_value_u64: max_value,
|
||||
bit_unpacker,
|
||||
_phantom: PhantomData,
|
||||
})
|
||||
}
|
||||
|
||||
/// FastFieldReader is the trait to access fast field data.
|
||||
pub trait FastFieldReader<Item: FastValue>: Clone {
|
||||
/// Return the value associated to the given document.
|
||||
///
|
||||
/// This accessor should return as fast as possible.
|
||||
@@ -49,13 +29,154 @@ impl<Item: FastValue> FastFieldReader<Item> {
|
||||
/// # Panics
|
||||
///
|
||||
/// May panic if `doc` is greater than the segment
|
||||
// `maxdoc`.
|
||||
pub fn get(&self, doc: DocId) -> Item {
|
||||
self.get_u64(u64::from(doc))
|
||||
}
|
||||
fn get(&self, doc: DocId) -> Item;
|
||||
|
||||
/// Fills an output buffer with the fast field values
|
||||
/// associated with the `DocId` going from
|
||||
/// `start` to `start + output.len()`.
|
||||
///
|
||||
/// Regardless of the type of `Item`, this method works
|
||||
/// - transmuting the output array
|
||||
/// - extracting the `Item`s as if they were `u64`
|
||||
/// - possibly converting the `u64` value to the right type.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// May panic if `start + output.len()` is greater than
|
||||
/// the segment's `maxdoc`.
|
||||
fn get_range(&self, start: u64, output: &mut [Item]);
|
||||
|
||||
/// Returns the minimum value for this fast field.
|
||||
///
|
||||
/// The min value does not take in account of possible
|
||||
/// deleted document, and should be considered as a lower bound
|
||||
/// of the actual mimimum value.
|
||||
fn min_value(&self) -> Item;
|
||||
|
||||
/// Returns the maximum value for this fast field.
|
||||
///
|
||||
/// The max value does not take in account of possible
|
||||
/// deleted document, and should be considered as an upper bound
|
||||
/// of the actual maximum value.
|
||||
fn max_value(&self) -> Item;
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
/// DynamicFastFieldReader wraps different readers to access
|
||||
/// the various encoded fastfield data
|
||||
///
|
||||
pub enum DynamicFastFieldReader<Item: FastValue> {
|
||||
/// Bitpacked compressed fastfield data.
|
||||
Bitpacked(FastFieldReaderCodecWrapper<Item, BitpackedReader>),
|
||||
/// Linear interpolated values + bitpacked
|
||||
LinearInterpol(FastFieldReaderCodecWrapper<Item, LinearInterpolFastFieldReader>),
|
||||
/// Blockwise linear interpolated values + bitpacked
|
||||
MultiLinearInterpol(FastFieldReaderCodecWrapper<Item, MultiLinearInterpolFastFieldReader>),
|
||||
}
|
||||
|
||||
impl<Item: FastValue> DynamicFastFieldReader<Item> {
|
||||
/// Returns correct the reader wrapped in the `DynamicFastFieldReader` enum for the data.
|
||||
pub fn open(file: FileSlice) -> crate::Result<DynamicFastFieldReader<Item>> {
|
||||
let mut bytes = file.read_bytes()?;
|
||||
let id = bytes.read_u8();
|
||||
|
||||
let reader = match id {
|
||||
BitpackedFastFieldSerializer::ID => {
|
||||
DynamicFastFieldReader::Bitpacked(FastFieldReaderCodecWrapper::<
|
||||
Item,
|
||||
BitpackedReader,
|
||||
>::open_from_bytes(bytes)?)
|
||||
}
|
||||
LinearInterpolFastFieldSerializer::ID => {
|
||||
DynamicFastFieldReader::LinearInterpol(FastFieldReaderCodecWrapper::<
|
||||
Item,
|
||||
LinearInterpolFastFieldReader,
|
||||
>::open_from_bytes(bytes)?)
|
||||
}
|
||||
MultiLinearInterpolFastFieldSerializer::ID => {
|
||||
DynamicFastFieldReader::MultiLinearInterpol(FastFieldReaderCodecWrapper::<
|
||||
Item,
|
||||
MultiLinearInterpolFastFieldReader,
|
||||
>::open_from_bytes(
|
||||
bytes
|
||||
)?)
|
||||
}
|
||||
_ => {
|
||||
panic!(
|
||||
"unknown fastfield id {:?}. Data corrupted or using old tantivy version.",
|
||||
id
|
||||
)
|
||||
}
|
||||
};
|
||||
Ok(reader)
|
||||
}
|
||||
}
|
||||
|
||||
impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
|
||||
fn get(&self, doc: DocId) -> Item {
|
||||
match self {
|
||||
Self::Bitpacked(reader) => reader.get(doc),
|
||||
Self::LinearInterpol(reader) => reader.get(doc),
|
||||
Self::MultiLinearInterpol(reader) => reader.get(doc),
|
||||
}
|
||||
}
|
||||
fn get_range(&self, start: u64, output: &mut [Item]) {
|
||||
match self {
|
||||
Self::Bitpacked(reader) => reader.get_range(start, output),
|
||||
Self::LinearInterpol(reader) => reader.get_range(start, output),
|
||||
Self::MultiLinearInterpol(reader) => reader.get_range(start, output),
|
||||
}
|
||||
}
|
||||
fn min_value(&self) -> Item {
|
||||
match self {
|
||||
Self::Bitpacked(reader) => reader.min_value(),
|
||||
Self::LinearInterpol(reader) => reader.min_value(),
|
||||
Self::MultiLinearInterpol(reader) => reader.min_value(),
|
||||
}
|
||||
}
|
||||
fn max_value(&self) -> Item {
|
||||
match self {
|
||||
Self::Bitpacked(reader) => reader.max_value(),
|
||||
Self::LinearInterpol(reader) => reader.max_value(),
|
||||
Self::MultiLinearInterpol(reader) => reader.max_value(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Wrapper for accessing a fastfield.
|
||||
///
|
||||
/// Holds the data and the codec to the read the data.
|
||||
///
|
||||
#[derive(Clone)]
|
||||
pub struct FastFieldReaderCodecWrapper<Item: FastValue, CodecReader> {
|
||||
reader: CodecReader,
|
||||
bytes: OwnedBytes,
|
||||
_phantom: PhantomData<Item>,
|
||||
}
|
||||
|
||||
impl<Item: FastValue, C: FastFieldCodecReader> FastFieldReaderCodecWrapper<Item, C> {
|
||||
/// Opens a fast field given a file.
|
||||
pub fn open(file: FileSlice) -> crate::Result<Self> {
|
||||
let mut bytes = file.read_bytes()?;
|
||||
let id = u8::deserialize(&mut bytes)?;
|
||||
assert_eq!(
|
||||
BitpackedFastFieldSerializer::ID,
|
||||
id,
|
||||
"Tried to open fast field as bitpacked encoded (id=1), but got serializer with different id"
|
||||
);
|
||||
Self::open_from_bytes(bytes)
|
||||
}
|
||||
/// Opens a fast field given the bytes.
|
||||
pub fn open_from_bytes(bytes: OwnedBytes) -> crate::Result<Self> {
|
||||
let reader = C::open_from_bytes(bytes.as_slice())?;
|
||||
Ok(FastFieldReaderCodecWrapper {
|
||||
reader,
|
||||
bytes,
|
||||
_phantom: PhantomData,
|
||||
})
|
||||
}
|
||||
pub(crate) fn get_u64(&self, doc: u64) -> Item {
|
||||
Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc))
|
||||
Item::from_u64(self.reader.get_u64(doc, self.bytes.as_slice()))
|
||||
}
|
||||
|
||||
/// Internally `multivalued` also use SingleValue Fast fields.
|
||||
@@ -75,6 +196,22 @@ impl<Item: FastValue> FastFieldReader<Item> {
|
||||
*out = self.get_u64(start + (i as u64));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<Item: FastValue, C: FastFieldCodecReader + Clone> FastFieldReader<Item>
|
||||
for FastFieldReaderCodecWrapper<Item, C>
|
||||
{
|
||||
/// Return the value associated to the given document.
|
||||
///
|
||||
/// This accessor should return as fast as possible.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// May panic if `doc` is greater than the segment
|
||||
// `maxdoc`.
|
||||
fn get(&self, doc: DocId) -> Item {
|
||||
self.get_u64(u64::from(doc))
|
||||
}
|
||||
|
||||
/// Fills an output buffer with the fast field values
|
||||
/// associated with the `DocId` going from
|
||||
@@ -89,8 +226,8 @@ impl<Item: FastValue> FastFieldReader<Item> {
|
||||
///
|
||||
/// May panic if `start + output.len()` is greater than
|
||||
/// the segment's `maxdoc`.
|
||||
pub fn get_range(&self, start: DocId, output: &mut [Item]) {
|
||||
self.get_range_u64(u64::from(start), output);
|
||||
fn get_range(&self, start: u64, output: &mut [Item]) {
|
||||
self.get_range_u64(start, output);
|
||||
}
|
||||
|
||||
/// Returns the minimum value for this fast field.
|
||||
@@ -98,8 +235,8 @@ impl<Item: FastValue> FastFieldReader<Item> {
|
||||
/// The max value does not take in account of possible
|
||||
/// deleted document, and should be considered as an upper bound
|
||||
/// of the actual maximum value.
|
||||
pub fn min_value(&self) -> Item {
|
||||
Item::from_u64(self.min_value_u64)
|
||||
fn min_value(&self) -> Item {
|
||||
Item::from_u64(self.reader.min_value())
|
||||
}
|
||||
|
||||
/// Returns the maximum value for this fast field.
|
||||
@@ -107,35 +244,37 @@ impl<Item: FastValue> FastFieldReader<Item> {
|
||||
/// The max value does not take in account of possible
|
||||
/// deleted document, and should be considered as an upper bound
|
||||
/// of the actual maximum value.
|
||||
pub fn max_value(&self) -> Item {
|
||||
Item::from_u64(self.max_value_u64)
|
||||
fn max_value(&self) -> Item {
|
||||
Item::from_u64(self.reader.max_value())
|
||||
}
|
||||
}
|
||||
|
||||
impl<Item: FastValue> From<Vec<Item>> for FastFieldReader<Item> {
|
||||
fn from(vals: Vec<Item>) -> FastFieldReader<Item> {
|
||||
pub(crate) type BitpackedFastFieldReader<Item> = FastFieldReaderCodecWrapper<Item, BitpackedReader>;
|
||||
|
||||
impl<Item: FastValue> From<Vec<Item>> for DynamicFastFieldReader<Item> {
|
||||
fn from(vals: Vec<Item>) -> DynamicFastFieldReader<Item> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field = schema_builder.add_u64_field("field", FAST);
|
||||
let schema = schema_builder.build();
|
||||
let path = Path::new("__dummy__");
|
||||
let directory: RAMDirectory = RAMDirectory::create();
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory
|
||||
.open_write(path)
|
||||
.expect("With a RAMDirectory, this should never fail.");
|
||||
let mut serializer = FastFieldSerializer::from_write(write)
|
||||
.expect("With a RAMDirectory, this should never fail.");
|
||||
.expect("With a RamDirectory, this should never fail.");
|
||||
let mut serializer = CompositeFastFieldSerializer::from_write(write)
|
||||
.expect("With a RamDirectory, this should never fail.");
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
|
||||
{
|
||||
let fast_field_writer = fast_field_writers
|
||||
.get_field_writer(field)
|
||||
.expect("With a RAMDirectory, this should never fail.");
|
||||
.get_field_writer_mut(field)
|
||||
.expect("With a RamDirectory, this should never fail.");
|
||||
for val in vals {
|
||||
fast_field_writer.add_val(val.to_u64());
|
||||
}
|
||||
}
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new())
|
||||
.serialize(&mut serializer, &HashMap::new(), None)
|
||||
.unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
@@ -145,6 +284,6 @@ impl<Item: FastValue> From<Vec<Item>> for FastFieldReader<Item> {
|
||||
let field_file = composite_file
|
||||
.open_read(field)
|
||||
.expect("File component not found");
|
||||
FastFieldReader::open(field_file).unwrap()
|
||||
DynamicFastFieldReader::open(field_file).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,13 +1,15 @@
|
||||
use crate::common::CompositeFile;
|
||||
use crate::directory::FileSlice;
|
||||
use crate::fastfield::MultiValuedFastFieldReader;
|
||||
use crate::fastfield::{BitpackedFastFieldReader, FastFieldNotAvailableError};
|
||||
use crate::fastfield::{BytesFastFieldReader, FastValue};
|
||||
use crate::fastfield::{FastFieldNotAvailableError, FastFieldReader};
|
||||
use crate::schema::{Cardinality, Field, FieldType, Schema};
|
||||
use crate::space_usage::PerFieldSpaceUsage;
|
||||
use crate::TantivyError;
|
||||
|
||||
/// Provides access to all of the FastFieldReader.
|
||||
use super::reader::DynamicFastFieldReader;
|
||||
|
||||
/// Provides access to all of the BitpackedFastFieldReader.
|
||||
///
|
||||
/// Internally, `FastFieldReaders` have preloaded fast field readers,
|
||||
/// and just wraps several `HashMap`.
|
||||
@@ -46,8 +48,8 @@ fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType, Cardinality
|
||||
impl FastFieldReaders {
|
||||
pub(crate) fn new(schema: Schema, fast_fields_composite: CompositeFile) -> FastFieldReaders {
|
||||
FastFieldReaders {
|
||||
fast_fields_composite,
|
||||
schema,
|
||||
fast_fields_composite,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -97,30 +99,34 @@ impl FastFieldReaders {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn typed_fast_field_reader_with_idx<TFastValue: FastValue>(
|
||||
&self,
|
||||
field: Field,
|
||||
index: usize,
|
||||
) -> crate::Result<DynamicFastFieldReader<TFastValue>> {
|
||||
let fast_field_slice = self.fast_field_data(field, index)?;
|
||||
DynamicFastFieldReader::open(fast_field_slice)
|
||||
}
|
||||
pub(crate) fn typed_fast_field_reader<TFastValue: FastValue>(
|
||||
&self,
|
||||
field: Field,
|
||||
) -> crate::Result<FastFieldReader<TFastValue>> {
|
||||
let fast_field_slice = self.fast_field_data(field, 0)?;
|
||||
FastFieldReader::open(fast_field_slice)
|
||||
) -> crate::Result<DynamicFastFieldReader<TFastValue>> {
|
||||
self.typed_fast_field_reader_with_idx(field, 0)
|
||||
}
|
||||
|
||||
pub(crate) fn typed_fast_field_multi_reader<TFastValue: FastValue>(
|
||||
&self,
|
||||
field: Field,
|
||||
) -> crate::Result<MultiValuedFastFieldReader<TFastValue>> {
|
||||
let fast_field_slice_idx = self.fast_field_data(field, 0)?;
|
||||
let fast_field_slice_vals = self.fast_field_data(field, 1)?;
|
||||
let idx_reader = FastFieldReader::open(fast_field_slice_idx)?;
|
||||
let vals_reader: FastFieldReader<TFastValue> =
|
||||
FastFieldReader::open(fast_field_slice_vals)?;
|
||||
let idx_reader = self.typed_fast_field_reader(field)?;
|
||||
let vals_reader = self.typed_fast_field_reader_with_idx(field, 1)?;
|
||||
Ok(MultiValuedFastFieldReader::open(idx_reader, vals_reader))
|
||||
}
|
||||
|
||||
/// Returns the `u64` fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a u64 fast field, this method returns `None`.
|
||||
pub fn u64(&self, field: Field) -> crate::Result<FastFieldReader<u64>> {
|
||||
/// If `field` is not a u64 fast field, this method returns an Error.
|
||||
pub fn u64(&self, field: Field) -> crate::Result<DynamicFastFieldReader<u64>> {
|
||||
self.check_type(field, FastType::U64, Cardinality::SingleValue)?;
|
||||
self.typed_fast_field_reader(field)
|
||||
}
|
||||
@@ -129,45 +135,53 @@ impl FastFieldReaders {
|
||||
/// field is effectively of type `u64` or not.
|
||||
///
|
||||
/// If not, the fastfield reader will returns the u64-value associated to the original FastValue.
|
||||
pub fn u64_lenient(&self, field: Field) -> crate::Result<FastFieldReader<u64>> {
|
||||
pub fn u64_lenient(&self, field: Field) -> crate::Result<DynamicFastFieldReader<u64>> {
|
||||
self.typed_fast_field_reader(field)
|
||||
}
|
||||
|
||||
/// Returns the `i64` fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a i64 fast field, this method returns `None`.
|
||||
pub fn i64(&self, field: Field) -> crate::Result<FastFieldReader<i64>> {
|
||||
/// If `field` is not a i64 fast field, this method returns an Error.
|
||||
pub fn i64(&self, field: Field) -> crate::Result<DynamicFastFieldReader<i64>> {
|
||||
self.check_type(field, FastType::I64, Cardinality::SingleValue)?;
|
||||
self.typed_fast_field_reader(field)
|
||||
}
|
||||
|
||||
/// Returns the `i64` fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a i64 fast field, this method returns `None`.
|
||||
pub fn date(&self, field: Field) -> crate::Result<FastFieldReader<crate::DateTime>> {
|
||||
/// If `field` is not a i64 fast field, this method returns an Error.
|
||||
pub fn date(&self, field: Field) -> crate::Result<DynamicFastFieldReader<crate::DateTime>> {
|
||||
self.check_type(field, FastType::Date, Cardinality::SingleValue)?;
|
||||
self.typed_fast_field_reader(field)
|
||||
}
|
||||
|
||||
/// Returns the `f64` fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a f64 fast field, this method returns `None`.
|
||||
pub fn f64(&self, field: Field) -> crate::Result<FastFieldReader<f64>> {
|
||||
/// If `field` is not a f64 fast field, this method returns an Error.
|
||||
pub fn f64(&self, field: Field) -> crate::Result<DynamicFastFieldReader<f64>> {
|
||||
self.check_type(field, FastType::F64, Cardinality::SingleValue)?;
|
||||
self.typed_fast_field_reader(field)
|
||||
}
|
||||
|
||||
/// Returns a `u64s` multi-valued fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a u64 multi-valued fast field, this method returns `None`.
|
||||
/// If `field` is not a u64 multi-valued fast field, this method returns an Error.
|
||||
pub fn u64s(&self, field: Field) -> crate::Result<MultiValuedFastFieldReader<u64>> {
|
||||
self.check_type(field, FastType::U64, Cardinality::MultiValues)?;
|
||||
self.typed_fast_field_multi_reader(field)
|
||||
}
|
||||
|
||||
/// Returns a `u64s` multi-valued fast field reader reader associated to `field`, regardless of whether the given
|
||||
/// field is effectively of type `u64` or not.
|
||||
///
|
||||
/// If `field` is not a u64 multi-valued fast field, this method returns an Error.
|
||||
pub fn u64s_lenient(&self, field: Field) -> crate::Result<MultiValuedFastFieldReader<u64>> {
|
||||
self.typed_fast_field_multi_reader(field)
|
||||
}
|
||||
|
||||
/// Returns a `i64s` multi-valued fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a i64 multi-valued fast field, this method returns `None`.
|
||||
/// If `field` is not a i64 multi-valued fast field, this method returns an Error.
|
||||
pub fn i64s(&self, field: Field) -> crate::Result<MultiValuedFastFieldReader<i64>> {
|
||||
self.check_type(field, FastType::I64, Cardinality::MultiValues)?;
|
||||
self.typed_fast_field_multi_reader(field)
|
||||
@@ -175,7 +189,7 @@ impl FastFieldReaders {
|
||||
|
||||
/// Returns a `f64s` multi-valued fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a f64 multi-valued fast field, this method returns `None`.
|
||||
/// If `field` is not a f64 multi-valued fast field, this method returns an Error.
|
||||
pub fn f64s(&self, field: Field) -> crate::Result<MultiValuedFastFieldReader<f64>> {
|
||||
self.check_type(field, FastType::F64, Cardinality::MultiValues)?;
|
||||
self.typed_fast_field_multi_reader(field)
|
||||
@@ -183,7 +197,7 @@ impl FastFieldReaders {
|
||||
|
||||
/// Returns a `crate::DateTime` multi-valued fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a `crate::DateTime` multi-valued fast field, this method returns `None`.
|
||||
/// If `field` is not a `crate::DateTime` multi-valued fast field, this method returns an Error.
|
||||
pub fn dates(
|
||||
&self,
|
||||
field: Field,
|
||||
@@ -194,7 +208,7 @@ impl FastFieldReaders {
|
||||
|
||||
/// Returns the `bytes` fast field reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a bytes fast field, returns `None`.
|
||||
/// If `field` is not a bytes fast field, returns an Error.
|
||||
pub fn bytes(&self, field: Field) -> crate::Result<BytesFastFieldReader> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
if let FieldType::Bytes(bytes_option) = field_entry.field_type() {
|
||||
@@ -205,7 +219,7 @@ impl FastFieldReaders {
|
||||
)));
|
||||
}
|
||||
let fast_field_idx_file = self.fast_field_data(field, 0)?;
|
||||
let idx_reader = FastFieldReader::open(fast_field_idx_file)?;
|
||||
let idx_reader = BitpackedFastFieldReader::open(fast_field_idx_file)?;
|
||||
let data = self.fast_field_data(field, 1)?;
|
||||
BytesFastFieldReader::open(idx_reader, data)
|
||||
} else {
|
||||
|
||||
@@ -1,142 +0,0 @@
|
||||
use crate::common::bitpacker::BitPacker;
|
||||
use crate::common::compute_num_bits;
|
||||
use crate::common::BinarySerializable;
|
||||
use crate::common::CompositeWrite;
|
||||
use crate::common::CountingWriter;
|
||||
use crate::directory::WritePtr;
|
||||
use crate::schema::Field;
|
||||
use std::io::{self, Write};
|
||||
|
||||
/// `FastFieldSerializer` is in charge of serializing
|
||||
/// fastfields on disk.
|
||||
///
|
||||
/// Fast fields are encoded using bit-packing.
|
||||
///
|
||||
/// `FastFieldWriter`s are in charge of pushing the data to
|
||||
/// the serializer.
|
||||
/// The serializer expects to receive the following calls.
|
||||
///
|
||||
/// * `new_u64_fast_field(...)`
|
||||
/// * `add_val(...)`
|
||||
/// * `add_val(...)`
|
||||
/// * `add_val(...)`
|
||||
/// * ...
|
||||
/// * `close_field()`
|
||||
/// * `new_u64_fast_field(...)`
|
||||
/// * `add_val(...)`
|
||||
/// * ...
|
||||
/// * `close_field()`
|
||||
/// * `close()`
|
||||
pub struct FastFieldSerializer {
|
||||
composite_write: CompositeWrite<WritePtr>,
|
||||
}
|
||||
|
||||
impl FastFieldSerializer {
|
||||
/// Constructor
|
||||
pub fn from_write(write: WritePtr) -> io::Result<FastFieldSerializer> {
|
||||
// just making room for the pointer to header.
|
||||
let composite_write = CompositeWrite::wrap(write);
|
||||
Ok(FastFieldSerializer { composite_write })
|
||||
}
|
||||
|
||||
/// Start serializing a new u64 fast field
|
||||
pub fn new_u64_fast_field(
|
||||
&mut self,
|
||||
field: Field,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
) -> io::Result<FastSingleFieldSerializer<'_, CountingWriter<WritePtr>>> {
|
||||
self.new_u64_fast_field_with_idx(field, min_value, max_value, 0)
|
||||
}
|
||||
|
||||
/// Start serializing a new u64 fast field
|
||||
pub fn new_u64_fast_field_with_idx(
|
||||
&mut self,
|
||||
field: Field,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
idx: usize,
|
||||
) -> io::Result<FastSingleFieldSerializer<'_, CountingWriter<WritePtr>>> {
|
||||
let field_write = self.composite_write.for_field_with_idx(field, idx);
|
||||
FastSingleFieldSerializer::open(field_write, min_value, max_value)
|
||||
}
|
||||
|
||||
/// Start serializing a new [u8] fast field
|
||||
pub fn new_bytes_fast_field_with_idx(
|
||||
&mut self,
|
||||
field: Field,
|
||||
idx: usize,
|
||||
) -> FastBytesFieldSerializer<'_, CountingWriter<WritePtr>> {
|
||||
let field_write = self.composite_write.for_field_with_idx(field, idx);
|
||||
FastBytesFieldSerializer { write: field_write }
|
||||
}
|
||||
|
||||
/// Closes the serializer
|
||||
///
|
||||
/// After this call the data must be persistently save on disk.
|
||||
pub fn close(self) -> io::Result<()> {
|
||||
self.composite_write.close()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FastSingleFieldSerializer<'a, W: Write> {
|
||||
bit_packer: BitPacker,
|
||||
write: &'a mut W,
|
||||
min_value: u64,
|
||||
num_bits: u8,
|
||||
}
|
||||
|
||||
impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
|
||||
/// Creates a new fast field serializer.
|
||||
///
|
||||
/// The serializer in fact encode the values by bitpacking
|
||||
/// `(val - min_value)`.
|
||||
///
|
||||
/// It requires a `min_value` and a `max_value` to compute
|
||||
/// compute the minimum number of bits required to encode
|
||||
/// values.
|
||||
fn open(
|
||||
write: &'a mut W,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
) -> io::Result<FastSingleFieldSerializer<'a, W>> {
|
||||
assert!(min_value <= max_value);
|
||||
min_value.serialize(write)?;
|
||||
let amplitude = max_value - min_value;
|
||||
amplitude.serialize(write)?;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let bit_packer = BitPacker::new();
|
||||
Ok(FastSingleFieldSerializer {
|
||||
write,
|
||||
bit_packer,
|
||||
min_value,
|
||||
num_bits,
|
||||
})
|
||||
}
|
||||
|
||||
/// Pushes a new value to the currently open u64 fast field.
|
||||
pub fn add_val(&mut self, val: u64) -> io::Result<()> {
|
||||
let val_to_write: u64 = val - self.min_value;
|
||||
self.bit_packer
|
||||
.write(val_to_write, self.num_bits, &mut self.write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn close_field(mut self) -> io::Result<()> {
|
||||
self.bit_packer.close(&mut self.write)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FastBytesFieldSerializer<'a, W: Write> {
|
||||
write: &'a mut W,
|
||||
}
|
||||
|
||||
impl<'a, W: Write> FastBytesFieldSerializer<'a, W> {
|
||||
pub fn write_all(&mut self, vals: &[u8]) -> io::Result<()> {
|
||||
self.write.write_all(vals)
|
||||
}
|
||||
|
||||
pub fn flush(&mut self) -> io::Result<()> {
|
||||
self.write.flush()
|
||||
}
|
||||
}
|
||||
218
src/fastfield/serializer/mod.rs
Normal file
218
src/fastfield/serializer/mod.rs
Normal file
@@ -0,0 +1,218 @@
|
||||
use crate::common::BinarySerializable;
|
||||
use crate::common::CompositeWrite;
|
||||
use crate::common::CountingWriter;
|
||||
use crate::directory::WritePtr;
|
||||
use crate::schema::Field;
|
||||
pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer;
|
||||
pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializerLegacy;
|
||||
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
|
||||
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer;
|
||||
pub use fastfield_codecs::FastFieldCodecSerializer;
|
||||
pub use fastfield_codecs::FastFieldDataAccess;
|
||||
pub use fastfield_codecs::FastFieldStats;
|
||||
use std::io::{self, Write};
|
||||
|
||||
/// `CompositeFastFieldSerializer` is in charge of serializing
|
||||
/// fastfields on disk.
|
||||
///
|
||||
/// Fast fields have different encodings like bit-packing.
|
||||
///
|
||||
/// `FastFieldWriter`s are in charge of pushing the data to
|
||||
/// the serializer.
|
||||
/// The serializer expects to receive the following calls.
|
||||
///
|
||||
/// * `new_u64_fast_field(...)`
|
||||
/// * `add_val(...)`
|
||||
/// * `add_val(...)`
|
||||
/// * `add_val(...)`
|
||||
/// * ...
|
||||
/// * `close_field()`
|
||||
/// * `new_u64_fast_field(...)`
|
||||
/// * `add_val(...)`
|
||||
/// * ...
|
||||
/// * `close_field()`
|
||||
/// * `close()`
|
||||
pub struct CompositeFastFieldSerializer {
|
||||
composite_write: CompositeWrite<WritePtr>,
|
||||
}
|
||||
|
||||
// use this, when this is merged and stabilized explicit_generic_args_with_impl_trait
|
||||
// https://github.com/rust-lang/rust/pull/86176
|
||||
fn codec_estimation<T: FastFieldCodecSerializer, A: FastFieldDataAccess>(
|
||||
stats: FastFieldStats,
|
||||
fastfield_accessor: &A,
|
||||
estimations: &mut Vec<(f32, &str, u8)>,
|
||||
) {
|
||||
if !T::is_applicable(fastfield_accessor, stats.clone()) {
|
||||
return;
|
||||
}
|
||||
let (ratio, name, id) = (T::estimate(fastfield_accessor, stats), T::NAME, T::ID);
|
||||
estimations.push((ratio, name, id));
|
||||
}
|
||||
|
||||
impl CompositeFastFieldSerializer {
|
||||
/// Constructor
|
||||
pub fn from_write(write: WritePtr) -> io::Result<CompositeFastFieldSerializer> {
|
||||
// just making room for the pointer to header.
|
||||
let composite_write = CompositeWrite::wrap(write);
|
||||
Ok(CompositeFastFieldSerializer { composite_write })
|
||||
}
|
||||
|
||||
/// Serialize data into a new u64 fast field. The best compression codec will be chosen automatically.
|
||||
pub fn create_auto_detect_u64_fast_field(
|
||||
&mut self,
|
||||
field: Field,
|
||||
stats: FastFieldStats,
|
||||
fastfield_accessor: impl FastFieldDataAccess,
|
||||
data_iter_1: impl Iterator<Item = u64>,
|
||||
data_iter_2: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()> {
|
||||
self.create_auto_detect_u64_fast_field_with_idx(
|
||||
field,
|
||||
stats,
|
||||
fastfield_accessor,
|
||||
data_iter_1,
|
||||
data_iter_2,
|
||||
0,
|
||||
)
|
||||
}
|
||||
/// Serialize data into a new u64 fast field. The best compression codec will be chosen automatically.
|
||||
pub fn create_auto_detect_u64_fast_field_with_idx(
|
||||
&mut self,
|
||||
field: Field,
|
||||
stats: FastFieldStats,
|
||||
fastfield_accessor: impl FastFieldDataAccess,
|
||||
data_iter_1: impl Iterator<Item = u64>,
|
||||
data_iter_2: impl Iterator<Item = u64>,
|
||||
idx: usize,
|
||||
) -> io::Result<()> {
|
||||
let field_write = self.composite_write.for_field_with_idx(field, idx);
|
||||
|
||||
let mut estimations = vec![];
|
||||
|
||||
codec_estimation::<BitpackedFastFieldSerializer, _>(
|
||||
stats.clone(),
|
||||
&fastfield_accessor,
|
||||
&mut estimations,
|
||||
);
|
||||
codec_estimation::<LinearInterpolFastFieldSerializer, _>(
|
||||
stats.clone(),
|
||||
&fastfield_accessor,
|
||||
&mut estimations,
|
||||
);
|
||||
codec_estimation::<MultiLinearInterpolFastFieldSerializer, _>(
|
||||
stats.clone(),
|
||||
&fastfield_accessor,
|
||||
&mut estimations,
|
||||
);
|
||||
if let Some(broken_estimation) = estimations
|
||||
.iter()
|
||||
.find(|estimation| estimation.0 == f32::NAN)
|
||||
{
|
||||
warn!(
|
||||
"broken estimation for fast field codec {}",
|
||||
broken_estimation.1
|
||||
);
|
||||
}
|
||||
// removing nan values for codecs with broken calculations, and max values which disables codecs
|
||||
estimations.retain(|estimation| !estimation.0.is_nan() && estimation.0 != f32::MAX);
|
||||
estimations.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
|
||||
let (_ratio, name, id) = estimations[0];
|
||||
debug!(
|
||||
"choosing fast field codec {} for field_id {:?}",
|
||||
name, field
|
||||
); // todo print actual field name
|
||||
id.serialize(field_write)?;
|
||||
match name {
|
||||
BitpackedFastFieldSerializer::NAME => {
|
||||
BitpackedFastFieldSerializer::serialize(
|
||||
field_write,
|
||||
&fastfield_accessor,
|
||||
stats,
|
||||
data_iter_1,
|
||||
data_iter_2,
|
||||
)?;
|
||||
}
|
||||
LinearInterpolFastFieldSerializer::NAME => {
|
||||
LinearInterpolFastFieldSerializer::serialize(
|
||||
field_write,
|
||||
&fastfield_accessor,
|
||||
stats,
|
||||
data_iter_1,
|
||||
data_iter_2,
|
||||
)?;
|
||||
}
|
||||
MultiLinearInterpolFastFieldSerializer::NAME => {
|
||||
MultiLinearInterpolFastFieldSerializer::serialize(
|
||||
field_write,
|
||||
&fastfield_accessor,
|
||||
stats,
|
||||
data_iter_1,
|
||||
data_iter_2,
|
||||
)?;
|
||||
}
|
||||
_ => {
|
||||
panic!("unknown fastfield serializer {}", name)
|
||||
}
|
||||
};
|
||||
field_write.flush()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Start serializing a new u64 fast field
|
||||
pub fn new_u64_fast_field(
|
||||
&mut self,
|
||||
field: Field,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
) -> io::Result<BitpackedFastFieldSerializerLegacy<'_, CountingWriter<WritePtr>>> {
|
||||
self.new_u64_fast_field_with_idx(field, min_value, max_value, 0)
|
||||
}
|
||||
|
||||
/// Start serializing a new u64 fast field
|
||||
pub fn new_u64_fast_field_with_idx(
|
||||
&mut self,
|
||||
field: Field,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
idx: usize,
|
||||
) -> io::Result<BitpackedFastFieldSerializerLegacy<'_, CountingWriter<WritePtr>>> {
|
||||
let field_write = self.composite_write.for_field_with_idx(field, idx);
|
||||
// Prepend codec id to field data for compatibility with DynamicFastFieldReader.
|
||||
let id = BitpackedFastFieldSerializer::ID;
|
||||
id.serialize(field_write)?;
|
||||
BitpackedFastFieldSerializerLegacy::open(field_write, min_value, max_value)
|
||||
}
|
||||
|
||||
/// Start serializing a new [u8] fast field
|
||||
pub fn new_bytes_fast_field_with_idx(
|
||||
&mut self,
|
||||
field: Field,
|
||||
idx: usize,
|
||||
) -> FastBytesFieldSerializer<'_, CountingWriter<WritePtr>> {
|
||||
let field_write = self.composite_write.for_field_with_idx(field, idx);
|
||||
FastBytesFieldSerializer { write: field_write }
|
||||
}
|
||||
|
||||
/// Closes the serializer
|
||||
///
|
||||
/// After this call the data must be persistently save on disk.
|
||||
pub fn close(self) -> io::Result<()> {
|
||||
self.composite_write.close()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FastBytesFieldSerializer<'a, W: Write> {
|
||||
write: &'a mut W,
|
||||
}
|
||||
|
||||
impl<'a, W: Write> FastBytesFieldSerializer<'a, W> {
|
||||
pub fn write_all(&mut self, vals: &[u8]) -> io::Result<()> {
|
||||
self.write.write_all(vals)
|
||||
}
|
||||
|
||||
pub fn flush(&mut self) -> io::Result<()> {
|
||||
self.write.flush()
|
||||
}
|
||||
}
|
||||
@@ -1,14 +1,16 @@
|
||||
use super::multivalued::MultiValuedFastFieldWriter;
|
||||
use super::serializer::FastFieldStats;
|
||||
use super::FastFieldDataAccess;
|
||||
use crate::common;
|
||||
use crate::common::BinarySerializable;
|
||||
use crate::common::VInt;
|
||||
use crate::fastfield::{BytesFastFieldWriter, FastFieldSerializer};
|
||||
use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer};
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::postings::UnorderedTermId;
|
||||
use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema};
|
||||
use crate::termdict::TermOrdinal;
|
||||
use fnv::FnvHashMap;
|
||||
use std::collections::HashMap;
|
||||
use std::io;
|
||||
use tantivy_bitpacker::BlockedBitpacker;
|
||||
|
||||
/// The fastfieldswriter regroup all of the fast field writers.
|
||||
pub struct FastFieldsWriter {
|
||||
@@ -72,8 +74,34 @@ impl FastFieldsWriter {
|
||||
}
|
||||
}
|
||||
|
||||
/// The memory used (inclusive childs)
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.single_value_writers
|
||||
.iter()
|
||||
.map(|w| w.mem_usage())
|
||||
.sum::<usize>()
|
||||
+ self
|
||||
.multi_values_writers
|
||||
.iter()
|
||||
.map(|w| w.mem_usage())
|
||||
.sum::<usize>()
|
||||
+ self
|
||||
.bytes_value_writers
|
||||
.iter()
|
||||
.map(|w| w.mem_usage())
|
||||
.sum::<usize>()
|
||||
}
|
||||
|
||||
/// Get the `FastFieldWriter` associated to a field.
|
||||
pub fn get_field_writer(&mut self, field: Field) -> Option<&mut IntFastFieldWriter> {
|
||||
pub fn get_field_writer(&self, field: Field) -> Option<&IntFastFieldWriter> {
|
||||
// TODO optimize
|
||||
self.single_value_writers
|
||||
.iter()
|
||||
.find(|field_writer| field_writer.field() == field)
|
||||
}
|
||||
|
||||
/// Get the `FastFieldWriter` associated to a field.
|
||||
pub fn get_field_writer_mut(&mut self, field: Field) -> Option<&mut IntFastFieldWriter> {
|
||||
// TODO optimize
|
||||
self.single_value_writers
|
||||
.iter_mut()
|
||||
@@ -84,7 +112,7 @@ impl FastFieldsWriter {
|
||||
///
|
||||
/// Returns None if the field does not exist, or is not
|
||||
/// configured as a multivalued fastfield in the schema.
|
||||
pub fn get_multivalue_writer(
|
||||
pub fn get_multivalue_writer_mut(
|
||||
&mut self,
|
||||
field: Field,
|
||||
) -> Option<&mut MultiValuedFastFieldWriter> {
|
||||
@@ -98,7 +126,7 @@ impl FastFieldsWriter {
|
||||
///
|
||||
/// Returns None if the field does not exist, or is not
|
||||
/// configured as a bytes fastfield in the schema.
|
||||
pub fn get_bytes_writer(&mut self, field: Field) -> Option<&mut BytesFastFieldWriter> {
|
||||
pub fn get_bytes_writer_mut(&mut self, field: Field) -> Option<&mut BytesFastFieldWriter> {
|
||||
// TODO optimize
|
||||
self.bytes_value_writers
|
||||
.iter_mut()
|
||||
@@ -122,19 +150,20 @@ impl FastFieldsWriter {
|
||||
/// order to the fast field serializer.
|
||||
pub fn serialize(
|
||||
&self,
|
||||
serializer: &mut FastFieldSerializer,
|
||||
serializer: &mut CompositeFastFieldSerializer,
|
||||
mapping: &HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
) -> io::Result<()> {
|
||||
for field_writer in &self.single_value_writers {
|
||||
field_writer.serialize(serializer)?;
|
||||
field_writer.serialize(serializer, doc_id_map)?;
|
||||
}
|
||||
|
||||
for field_writer in &self.multi_values_writers {
|
||||
let field = field_writer.field();
|
||||
field_writer.serialize(serializer, mapping.get(&field))?;
|
||||
field_writer.serialize(serializer, mapping.get(&field), doc_id_map)?;
|
||||
}
|
||||
for field_writer in &self.bytes_value_writers {
|
||||
field_writer.serialize(serializer)?;
|
||||
field_writer.serialize(serializer, doc_id_map)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -157,7 +186,7 @@ impl FastFieldsWriter {
|
||||
/// using `common::i64_to_u64` and `common::f64_to_u64`.
|
||||
pub struct IntFastFieldWriter {
|
||||
field: Field,
|
||||
vals: Vec<u8>,
|
||||
vals: BlockedBitpacker,
|
||||
val_count: usize,
|
||||
val_if_missing: u64,
|
||||
val_min: u64,
|
||||
@@ -169,7 +198,7 @@ impl IntFastFieldWriter {
|
||||
pub fn new(field: Field) -> IntFastFieldWriter {
|
||||
IntFastFieldWriter {
|
||||
field,
|
||||
vals: Vec::new(),
|
||||
vals: BlockedBitpacker::new(),
|
||||
val_count: 0,
|
||||
val_if_missing: 0u64,
|
||||
val_min: u64::max_value(),
|
||||
@@ -177,6 +206,11 @@ impl IntFastFieldWriter {
|
||||
}
|
||||
}
|
||||
|
||||
/// The memory used (inclusive childs)
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.vals.mem_usage()
|
||||
}
|
||||
|
||||
/// Returns the field that this writer is targetting.
|
||||
pub fn field(&self) -> Field {
|
||||
self.field
|
||||
@@ -196,9 +230,7 @@ impl IntFastFieldWriter {
|
||||
/// associated to the document with the `DocId` n.
|
||||
/// (Well, `n-1` actually because of 0-indexing)
|
||||
pub fn add_val(&mut self, val: u64) {
|
||||
VInt(val)
|
||||
.serialize(&mut self.vals)
|
||||
.expect("unable to serialize VInt to Vec");
|
||||
self.vals.add(val);
|
||||
|
||||
if val > self.val_max {
|
||||
self.val_max = val;
|
||||
@@ -234,21 +266,75 @@ impl IntFastFieldWriter {
|
||||
self.add_val(val);
|
||||
}
|
||||
|
||||
/// get iterator over the data
|
||||
pub(crate) fn iter(&self) -> impl Iterator<Item = u64> + '_ {
|
||||
self.vals.iter()
|
||||
}
|
||||
|
||||
/// Push the fast fields value to the `FastFieldWriter`.
|
||||
pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> {
|
||||
pub fn serialize(
|
||||
&self,
|
||||
serializer: &mut CompositeFastFieldSerializer,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
) -> io::Result<()> {
|
||||
let (min, max) = if self.val_min > self.val_max {
|
||||
(0, 0)
|
||||
} else {
|
||||
(self.val_min, self.val_max)
|
||||
};
|
||||
let fastfield_accessor = WriterFastFieldAccessProvider {
|
||||
doc_id_map,
|
||||
vals: &self.vals,
|
||||
};
|
||||
let stats = FastFieldStats {
|
||||
min_value: min,
|
||||
max_value: max,
|
||||
num_vals: self.val_count as u64,
|
||||
};
|
||||
|
||||
let mut single_field_serializer = serializer.new_u64_fast_field(self.field, min, max)?;
|
||||
|
||||
let mut cursor = self.vals.as_slice();
|
||||
while let Ok(VInt(val)) = VInt::deserialize(&mut cursor) {
|
||||
single_field_serializer.add_val(val)?;
|
||||
}
|
||||
|
||||
single_field_serializer.close_field()
|
||||
if let Some(doc_id_map) = doc_id_map {
|
||||
let iter = doc_id_map
|
||||
.iter_old_doc_ids()
|
||||
.map(|doc_id| self.vals.get(doc_id as usize));
|
||||
serializer.create_auto_detect_u64_fast_field(
|
||||
self.field,
|
||||
stats,
|
||||
fastfield_accessor,
|
||||
iter.clone(),
|
||||
iter,
|
||||
)?;
|
||||
} else {
|
||||
serializer.create_auto_detect_u64_fast_field(
|
||||
self.field,
|
||||
stats,
|
||||
fastfield_accessor,
|
||||
self.vals.iter(),
|
||||
self.vals.iter(),
|
||||
)?;
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct WriterFastFieldAccessProvider<'map, 'bitp> {
|
||||
doc_id_map: Option<&'map DocIdMapping>,
|
||||
vals: &'bitp BlockedBitpacker,
|
||||
}
|
||||
impl<'map, 'bitp> FastFieldDataAccess for WriterFastFieldAccessProvider<'map, 'bitp> {
|
||||
/// Return the value associated to the given doc.
|
||||
///
|
||||
/// Whenever possible use the Iterator passed to the fastfield creation instead, for performance reasons.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// May panic if `doc` is greater than the index.
|
||||
fn get_val(&self, doc: u64) -> u64 {
|
||||
if let Some(doc_id_map) = self.doc_id_map {
|
||||
self.vals
|
||||
.get(doc_id_map.get_old_doc_id(doc as u32) as usize) // consider extra FastFieldReader wrapper for non doc_id_map
|
||||
} else {
|
||||
self.vals.get(doc as usize)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn id_to_fieldnorm(id: u8) -> u32 {
|
||||
FIELD_NORMS_TABLE[id as usize]
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
|
||||
FIELD_NORMS_TABLE
|
||||
.binary_search(&fieldnorm)
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
//! precompute computationally expensive functions of the fieldnorm
|
||||
//! in a very short array.
|
||||
//!
|
||||
//! This trick is used by the BM25 similarity.
|
||||
//! This trick is used by the Bm25 similarity.
|
||||
mod code;
|
||||
mod reader;
|
||||
mod serializer;
|
||||
|
||||
@@ -117,8 +117,8 @@ impl FieldNormReader {
|
||||
/// The fieldnorm is a value approximating the number
|
||||
/// of tokens in a given field of the `doc_id`.
|
||||
///
|
||||
/// It is imprecise, and always lower than the actual
|
||||
/// number of tokens.
|
||||
/// It is imprecise, and equal or lower than
|
||||
/// the actual number of tokens.
|
||||
///
|
||||
/// The fieldnorm is effectively decoded from the
|
||||
/// `fieldnorm_id` by doing a simple table lookup.
|
||||
@@ -133,7 +133,7 @@ impl FieldNormReader {
|
||||
}
|
||||
|
||||
/// Returns the `fieldnorm_id` associated to a document.
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn fieldnorm_id(&self, doc_id: DocId) -> u8 {
|
||||
match &self.0 {
|
||||
ReaderImplEnum::FromData(data) => {
|
||||
@@ -145,14 +145,14 @@ impl FieldNormReader {
|
||||
}
|
||||
|
||||
/// Converts a `fieldnorm_id` into a fieldnorm.
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn id_to_fieldnorm(id: u8) -> u32 {
|
||||
id_to_fieldnorm(id)
|
||||
}
|
||||
|
||||
/// Converts a `fieldnorm` into a `fieldnorm_id`.
|
||||
/// (This function is not injective).
|
||||
#[inline(always)]
|
||||
#[inline]
|
||||
pub fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
|
||||
fieldnorm_to_id(fieldnorm)
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use crate::DocId;
|
||||
use crate::{indexer::doc_id_mapping::DocIdMapping, DocId};
|
||||
|
||||
use super::fieldnorm_to_id;
|
||||
use super::FieldNormsSerializer;
|
||||
@@ -50,6 +50,13 @@ impl FieldNormsWriter {
|
||||
}
|
||||
}
|
||||
|
||||
/// The memory used inclusive childs
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.fieldnorms_buffer
|
||||
.iter()
|
||||
.map(|buf| buf.capacity())
|
||||
.sum()
|
||||
}
|
||||
/// Ensure that all documents in 0..max_doc have a byte associated with them
|
||||
/// in each of the fieldnorm vectors.
|
||||
///
|
||||
@@ -80,10 +87,23 @@ impl FieldNormsWriter {
|
||||
}
|
||||
|
||||
/// Serialize the seen fieldnorm values to the serializer for all fields.
|
||||
pub fn serialize(&self, mut fieldnorms_serializer: FieldNormsSerializer) -> io::Result<()> {
|
||||
pub fn serialize(
|
||||
&self,
|
||||
mut fieldnorms_serializer: FieldNormsSerializer,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
) -> io::Result<()> {
|
||||
for &field in self.fields.iter() {
|
||||
let fieldnorm_values: &[u8] = &self.fieldnorms_buffer[field.field_id() as usize][..];
|
||||
fieldnorms_serializer.serialize_field(field, fieldnorm_values)?;
|
||||
if let Some(doc_id_map) = doc_id_map {
|
||||
let mut mapped_fieldnorm_values = vec![];
|
||||
mapped_fieldnorm_values.resize(fieldnorm_values.len(), 0u8);
|
||||
for (new_doc_id, old_doc_id) in doc_id_map.iter_old_doc_ids().enumerate() {
|
||||
mapped_fieldnorm_values[new_doc_id] = fieldnorm_values[old_doc_id as usize];
|
||||
}
|
||||
fieldnorms_serializer.serialize_field(field, &mapped_fieldnorm_values)?;
|
||||
} else {
|
||||
fieldnorms_serializer.serialize_field(field, fieldnorm_values)?;
|
||||
}
|
||||
}
|
||||
fieldnorms_serializer.close()?;
|
||||
Ok(())
|
||||
|
||||
@@ -38,7 +38,7 @@ fn test_functional_store() -> crate::Result<()> {
|
||||
for iteration in 0..500 {
|
||||
dbg!(iteration);
|
||||
let num_docs: usize = rng.gen_range(0..4);
|
||||
if doc_set.len() >= 1 {
|
||||
if !doc_set.is_empty() {
|
||||
let doc_to_remove_id = rng.gen_range(0..doc_set.len());
|
||||
let removed_doc_id = doc_set.swap_remove(doc_to_remove_id);
|
||||
index_writer.delete_term(Term::from_field_u64(id_field, removed_doc_id));
|
||||
@@ -88,19 +88,17 @@ fn test_functional_indexing() -> crate::Result<()> {
|
||||
&searcher,
|
||||
&committed_docs.iter().cloned().collect::<Vec<u64>>(),
|
||||
)?;
|
||||
} else if committed_docs.remove(&random_val) || uncommitted_docs.remove(&random_val) {
|
||||
let doc_id_term = Term::from_field_u64(id_field, random_val);
|
||||
index_writer.delete_term(doc_id_term);
|
||||
} else {
|
||||
if committed_docs.remove(&random_val) || uncommitted_docs.remove(&random_val) {
|
||||
let doc_id_term = Term::from_field_u64(id_field, random_val);
|
||||
index_writer.delete_term(doc_id_term);
|
||||
} else {
|
||||
uncommitted_docs.insert(random_val);
|
||||
let mut doc = Document::new();
|
||||
doc.add_u64(id_field, random_val);
|
||||
for i in 1u64..10u64 {
|
||||
doc.add_u64(multiples_field, random_val * i);
|
||||
}
|
||||
index_writer.add_document(doc);
|
||||
uncommitted_docs.insert(random_val);
|
||||
let mut doc = Document::new();
|
||||
doc.add_u64(id_field, random_val);
|
||||
for i in 1u64..10u64 {
|
||||
doc.add_u64(multiples_field, random_val * i);
|
||||
}
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use super::operation::DeleteOperation;
|
||||
use crate::Opstamp;
|
||||
use std::mem;
|
||||
|
||||
use std::ops::DerefMut;
|
||||
use std::sync::{Arc, RwLock, Weak};
|
||||
|
||||
@@ -105,7 +105,7 @@ impl DeleteQueue {
|
||||
return None;
|
||||
}
|
||||
|
||||
let delete_operations = mem::replace(&mut self_wlock.writer, vec![]);
|
||||
let delete_operations = std::mem::take(&mut self_wlock.writer);
|
||||
|
||||
let new_block = Arc::new(Block {
|
||||
operations: Arc::from(delete_operations.into_boxed_slice()),
|
||||
@@ -286,7 +286,7 @@ mod tests {
|
||||
operations_it.advance();
|
||||
}
|
||||
{
|
||||
let mut operations_it = snapshot.clone();
|
||||
let mut operations_it = snapshot;
|
||||
assert_eq!(operations_it.get().unwrap().opstamp, 1);
|
||||
operations_it.advance();
|
||||
assert_eq!(operations_it.get().unwrap().opstamp, 2);
|
||||
|
||||
477
src/indexer/doc_id_mapping.rs
Normal file
477
src/indexer/doc_id_mapping.rs
Normal file
@@ -0,0 +1,477 @@
|
||||
//! This module is used when sorting the index by a property, e.g.
|
||||
//! to get mappings from old doc_id to new doc_id and vice versa, after sorting
|
||||
//!
|
||||
|
||||
use super::{merger::SegmentReaderWithOrdinal, SegmentWriter};
|
||||
use crate::{
|
||||
schema::{Field, Schema},
|
||||
DocId, IndexSortByField, Order, TantivyError,
|
||||
};
|
||||
use std::{cmp::Reverse, ops::Index};
|
||||
|
||||
/// Struct to provide mapping from new doc_id to old doc_id and segment.
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct SegmentDocidMapping<'a> {
|
||||
new_doc_id_to_old_and_segment: Vec<(DocId, SegmentReaderWithOrdinal<'a>)>,
|
||||
is_trivial: bool,
|
||||
}
|
||||
|
||||
impl<'a> SegmentDocidMapping<'a> {
|
||||
pub(crate) fn new(
|
||||
new_doc_id_to_old_and_segment: Vec<(DocId, SegmentReaderWithOrdinal<'a>)>,
|
||||
is_trivial: bool,
|
||||
) -> Self {
|
||||
Self {
|
||||
new_doc_id_to_old_and_segment,
|
||||
is_trivial,
|
||||
}
|
||||
}
|
||||
pub(crate) fn iter(&self) -> impl Iterator<Item = &(DocId, SegmentReaderWithOrdinal)> {
|
||||
self.new_doc_id_to_old_and_segment.iter()
|
||||
}
|
||||
pub(crate) fn len(&self) -> usize {
|
||||
self.new_doc_id_to_old_and_segment.len()
|
||||
}
|
||||
/// This flags means the segments are simply stacked in the order of their ordinal.
|
||||
/// e.g. [(0, 1), .. (n, 1), (0, 2)..., (m, 2)]
|
||||
///
|
||||
/// This allows for some optimization.
|
||||
pub(crate) fn is_trivial(&self) -> bool {
|
||||
self.is_trivial
|
||||
}
|
||||
}
|
||||
impl<'a> Index<usize> for SegmentDocidMapping<'a> {
|
||||
type Output = (DocId, SegmentReaderWithOrdinal<'a>);
|
||||
|
||||
fn index(&self, idx: usize) -> &Self::Output {
|
||||
&self.new_doc_id_to_old_and_segment[idx]
|
||||
}
|
||||
}
|
||||
impl<'a> IntoIterator for SegmentDocidMapping<'a> {
|
||||
type Item = (DocId, SegmentReaderWithOrdinal<'a>);
|
||||
type IntoIter = std::vec::IntoIter<Self::Item>;
|
||||
|
||||
fn into_iter(self) -> Self::IntoIter {
|
||||
self.new_doc_id_to_old_and_segment.into_iter()
|
||||
}
|
||||
}
|
||||
|
||||
/// Struct to provide mapping from old doc_id to new doc_id and vice versa within a segment.
|
||||
pub struct DocIdMapping {
|
||||
new_doc_id_to_old: Vec<DocId>,
|
||||
old_doc_id_to_new: Vec<DocId>,
|
||||
}
|
||||
|
||||
impl DocIdMapping {
|
||||
/// returns the new doc_id for the old doc_id
|
||||
pub fn get_new_doc_id(&self, doc_id: DocId) -> DocId {
|
||||
self.old_doc_id_to_new[doc_id as usize]
|
||||
}
|
||||
/// returns the old doc_id for the new doc_id
|
||||
pub fn get_old_doc_id(&self, doc_id: DocId) -> DocId {
|
||||
self.new_doc_id_to_old[doc_id as usize]
|
||||
}
|
||||
/// iterate over old doc_ids in order of the new doc_ids
|
||||
pub fn iter_old_doc_ids(&self) -> impl Iterator<Item = DocId> + Clone + '_ {
|
||||
self.new_doc_id_to_old.iter().cloned()
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn expect_field_id_for_sort_field(
|
||||
schema: &Schema,
|
||||
sort_by_field: &IndexSortByField,
|
||||
) -> crate::Result<Field> {
|
||||
schema.get_field(&sort_by_field.field).ok_or_else(|| {
|
||||
TantivyError::InvalidArgument(format!(
|
||||
"field to sort index by not found: {:?}",
|
||||
sort_by_field.field
|
||||
))
|
||||
})
|
||||
}
|
||||
|
||||
// Generates a document mapping in the form of [index new doc_id] -> old doc_id
|
||||
// TODO detect if field is already sorted and discard mapping
|
||||
pub(crate) fn get_doc_id_mapping_from_field(
|
||||
sort_by_field: IndexSortByField,
|
||||
segment_writer: &SegmentWriter,
|
||||
) -> crate::Result<DocIdMapping> {
|
||||
let schema = segment_writer.segment_serializer.segment().schema();
|
||||
let field_id = expect_field_id_for_sort_field(&schema, &sort_by_field)?; // for now expect fastfield, but not strictly required
|
||||
let fast_field = segment_writer
|
||||
.fast_field_writers
|
||||
.get_field_writer(field_id)
|
||||
.ok_or_else(|| {
|
||||
TantivyError::InvalidArgument(format!(
|
||||
"sort index by field is required to be a fast field {:?}",
|
||||
sort_by_field.field
|
||||
))
|
||||
})?;
|
||||
|
||||
// create new doc_id to old doc_id index (used in fast_field_writers)
|
||||
let mut doc_id_and_data = fast_field
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|el| (el.0 as DocId, el.1))
|
||||
.collect::<Vec<_>>();
|
||||
if sort_by_field.order == Order::Desc {
|
||||
doc_id_and_data.sort_by_key(|k| Reverse(k.1));
|
||||
} else {
|
||||
doc_id_and_data.sort_by_key(|k| k.1);
|
||||
}
|
||||
let new_doc_id_to_old = doc_id_and_data
|
||||
.into_iter()
|
||||
.map(|el| el.0)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
// create old doc_id to new doc_id index (used in posting recorder)
|
||||
let max_doc = new_doc_id_to_old.len();
|
||||
let mut old_doc_id_to_new = vec![0; max_doc];
|
||||
for i in 0..max_doc {
|
||||
old_doc_id_to_new[new_doc_id_to_old[i] as usize] = i as DocId;
|
||||
}
|
||||
let doc_id_map = DocIdMapping {
|
||||
new_doc_id_to_old,
|
||||
old_doc_id_to_new,
|
||||
};
|
||||
Ok(doc_id_map)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests_indexsorting {
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::{collector::TopDocs, query::QueryParser, schema::*};
|
||||
use crate::{schema::Schema, DocAddress};
|
||||
use crate::{Index, IndexSettings, IndexSortByField, Order};
|
||||
|
||||
fn create_test_index(
|
||||
index_settings: Option<IndexSettings>,
|
||||
text_field_options: TextOptions,
|
||||
) -> Index {
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
let my_text_field = schema_builder.add_text_field("text_field", text_field_options);
|
||||
let my_string_field = schema_builder.add_text_field("string_field", STRING | STORED);
|
||||
let my_number = schema_builder.add_u64_field(
|
||||
"my_number",
|
||||
IntOptions::default().set_fast(Cardinality::SingleValue),
|
||||
);
|
||||
|
||||
let multi_numbers = schema_builder.add_u64_field(
|
||||
"multi_numbers",
|
||||
IntOptions::default().set_fast(Cardinality::MultiValues),
|
||||
);
|
||||
|
||||
let schema = schema_builder.build();
|
||||
let mut index_builder = Index::builder().schema(schema);
|
||||
if let Some(settings) = index_settings {
|
||||
index_builder = index_builder.settings(settings);
|
||||
}
|
||||
let index = index_builder.create_in_ram().unwrap();
|
||||
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
index_writer.add_document(doc!(my_number=>40_u64));
|
||||
index_writer
|
||||
.add_document(doc!(my_number=>20_u64, multi_numbers => 5_u64, multi_numbers => 6_u64));
|
||||
index_writer.add_document(doc!(my_number=>100_u64));
|
||||
index_writer.add_document(
|
||||
doc!(my_number=>10_u64, my_string_field=> "blublub", my_text_field => "some text"),
|
||||
);
|
||||
index_writer.add_document(doc!(my_number=>30_u64, multi_numbers => 3_u64 ));
|
||||
index_writer.commit().unwrap();
|
||||
index
|
||||
}
|
||||
fn get_text_options() -> TextOptions {
|
||||
TextOptions::default().set_indexing_options(
|
||||
TextFieldIndexing::default().set_index_option(IndexRecordOption::Basic),
|
||||
)
|
||||
}
|
||||
#[test]
|
||||
fn test_sort_index_test_text_field() -> crate::Result<()> {
|
||||
// there are different serializers for different settings in postings/recorder.rs
|
||||
// test remapping for all of them
|
||||
let options = vec![
|
||||
get_text_options(),
|
||||
get_text_options().set_indexing_options(
|
||||
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
|
||||
),
|
||||
get_text_options().set_indexing_options(
|
||||
TextFieldIndexing::default()
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions),
|
||||
),
|
||||
];
|
||||
|
||||
for option in options {
|
||||
//let options = get_text_options();
|
||||
// no index_sort
|
||||
let index = create_test_index(None, option.clone());
|
||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
||||
let searcher = index.reader()?.searcher();
|
||||
|
||||
let query = QueryParser::for_index(&index, vec![my_text_field]).parse_query("text")?;
|
||||
let top_docs: Vec<(f32, DocAddress)> =
|
||||
searcher.search(&query, &TopDocs::with_limit(3))?;
|
||||
assert_eq!(
|
||||
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>(),
|
||||
vec![3]
|
||||
);
|
||||
|
||||
// sort by field asc
|
||||
let index = create_test_index(
|
||||
Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "my_number".to_string(),
|
||||
order: Order::Asc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
option.clone(),
|
||||
);
|
||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
|
||||
let query = QueryParser::for_index(&index, vec![my_text_field]).parse_query("text")?;
|
||||
let top_docs: Vec<(f32, DocAddress)> =
|
||||
searcher.search(&query, &TopDocs::with_limit(3))?;
|
||||
assert_eq!(
|
||||
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>(),
|
||||
vec![0]
|
||||
);
|
||||
|
||||
// test new field norm mapping
|
||||
{
|
||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
||||
let fieldnorm_reader = searcher
|
||||
.segment_reader(0)
|
||||
.get_fieldnorms_reader(my_text_field)?;
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(0), 2); // some text
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
|
||||
}
|
||||
// sort by field desc
|
||||
let index = create_test_index(
|
||||
Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "my_number".to_string(),
|
||||
order: Order::Desc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
option.clone(),
|
||||
);
|
||||
let my_string_field = index.schema().get_field("text_field").unwrap();
|
||||
let searcher = index.reader()?.searcher();
|
||||
|
||||
let query =
|
||||
QueryParser::for_index(&index, vec![my_string_field]).parse_query("text")?;
|
||||
let top_docs: Vec<(f32, DocAddress)> =
|
||||
searcher.search(&query, &TopDocs::with_limit(3))?;
|
||||
assert_eq!(
|
||||
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>(),
|
||||
vec![4]
|
||||
);
|
||||
// test new field norm mapping
|
||||
{
|
||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
||||
let fieldnorm_reader = searcher
|
||||
.segment_reader(0)
|
||||
.get_fieldnorms_reader(my_text_field)?;
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(0), 0);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(2), 0);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(3), 0);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(4), 2); // some text
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
#[test]
|
||||
fn test_sort_index_get_documents() -> crate::Result<()> {
|
||||
// default baseline
|
||||
let index = create_test_index(None, get_text_options());
|
||||
let my_string_field = index.schema().get_field("string_field").unwrap();
|
||||
let searcher = index.reader()?.searcher();
|
||||
{
|
||||
assert_eq!(
|
||||
searcher
|
||||
.doc(DocAddress::new(0, 0))?
|
||||
.get_first(my_string_field),
|
||||
None
|
||||
);
|
||||
assert_eq!(
|
||||
searcher
|
||||
.doc(DocAddress::new(0, 3))?
|
||||
.get_first(my_string_field)
|
||||
.unwrap()
|
||||
.text(),
|
||||
Some("blublub")
|
||||
);
|
||||
}
|
||||
// sort by field asc
|
||||
let index = create_test_index(
|
||||
Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "my_number".to_string(),
|
||||
order: Order::Asc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
get_text_options(),
|
||||
);
|
||||
let my_string_field = index.schema().get_field("string_field").unwrap();
|
||||
let searcher = index.reader()?.searcher();
|
||||
{
|
||||
assert_eq!(
|
||||
searcher
|
||||
.doc(DocAddress::new(0, 0))?
|
||||
.get_first(my_string_field)
|
||||
.unwrap()
|
||||
.text(),
|
||||
Some("blublub")
|
||||
);
|
||||
let doc = searcher.doc(DocAddress::new(0, 4))?;
|
||||
assert_eq!(doc.get_first(my_string_field), None);
|
||||
}
|
||||
// sort by field desc
|
||||
let index = create_test_index(
|
||||
Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "my_number".to_string(),
|
||||
order: Order::Desc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
get_text_options(),
|
||||
);
|
||||
let my_string_field = index.schema().get_field("string_field").unwrap();
|
||||
let searcher = index.reader()?.searcher();
|
||||
{
|
||||
let doc = searcher.doc(DocAddress::new(0, 4))?;
|
||||
assert_eq!(
|
||||
doc.get_first(my_string_field).unwrap().text(),
|
||||
Some("blublub")
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_sort_index_test_string_field() -> crate::Result<()> {
|
||||
let index = create_test_index(None, get_text_options());
|
||||
let my_string_field = index.schema().get_field("string_field").unwrap();
|
||||
let searcher = index.reader()?.searcher();
|
||||
|
||||
let query = QueryParser::for_index(&index, vec![my_string_field]).parse_query("blublub")?;
|
||||
let top_docs: Vec<(f32, DocAddress)> = searcher.search(&query, &TopDocs::with_limit(3))?;
|
||||
assert_eq!(
|
||||
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>(),
|
||||
vec![3]
|
||||
);
|
||||
|
||||
let index = create_test_index(
|
||||
Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "my_number".to_string(),
|
||||
order: Order::Asc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
get_text_options(),
|
||||
);
|
||||
let my_string_field = index.schema().get_field("string_field").unwrap();
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
|
||||
let query = QueryParser::for_index(&index, vec![my_string_field]).parse_query("blublub")?;
|
||||
let top_docs: Vec<(f32, DocAddress)> = searcher.search(&query, &TopDocs::with_limit(3))?;
|
||||
assert_eq!(
|
||||
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>(),
|
||||
vec![0]
|
||||
);
|
||||
|
||||
// test new field norm mapping
|
||||
{
|
||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
||||
let fieldnorm_reader = searcher
|
||||
.segment_reader(0)
|
||||
.get_fieldnorms_reader(my_text_field)?;
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(0), 2); // some text
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
|
||||
}
|
||||
// sort by field desc
|
||||
let index = create_test_index(
|
||||
Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "my_number".to_string(),
|
||||
order: Order::Desc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
get_text_options(),
|
||||
);
|
||||
let my_string_field = index.schema().get_field("string_field").unwrap();
|
||||
let searcher = index.reader()?.searcher();
|
||||
|
||||
let query = QueryParser::for_index(&index, vec![my_string_field]).parse_query("blublub")?;
|
||||
let top_docs: Vec<(f32, DocAddress)> = searcher.search(&query, &TopDocs::with_limit(3))?;
|
||||
assert_eq!(
|
||||
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>(),
|
||||
vec![4]
|
||||
);
|
||||
// test new field norm mapping
|
||||
{
|
||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
||||
let fieldnorm_reader = searcher
|
||||
.segment_reader(0)
|
||||
.get_fieldnorms_reader(my_text_field)?;
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(0), 0);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(2), 0);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(3), 0);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(4), 2); // some text
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_sort_index_fast_field() -> crate::Result<()> {
|
||||
let index = create_test_index(
|
||||
Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "my_number".to_string(),
|
||||
order: Order::Asc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
get_text_options(),
|
||||
);
|
||||
assert_eq!(
|
||||
index.settings().sort_by_field.as_ref().unwrap().field,
|
||||
"my_number".to_string()
|
||||
);
|
||||
|
||||
let searcher = index.reader()?.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let fast_fields = segment_reader.fast_fields();
|
||||
let my_number = index.schema().get_field("my_number").unwrap();
|
||||
|
||||
let fast_field = fast_fields.u64(my_number).unwrap();
|
||||
assert_eq!(fast_field.get(0u32), 10u64);
|
||||
assert_eq!(fast_field.get(1u32), 20u64);
|
||||
assert_eq!(fast_field.get(2u32), 30u64);
|
||||
|
||||
let multi_numbers = index.schema().get_field("multi_numbers").unwrap();
|
||||
let multifield = fast_fields.u64s(multi_numbers).unwrap();
|
||||
let mut vals = vec![];
|
||||
multifield.get_vals(0u32, &mut vals);
|
||||
assert_eq!(vals, &[] as &[u64]);
|
||||
let mut vals = vec![];
|
||||
multifield.get_vals(1u32, &mut vals);
|
||||
assert_eq!(vals, &[5, 6]);
|
||||
|
||||
let mut vals = vec![];
|
||||
multifield.get_vals(2u32, &mut vals);
|
||||
assert_eq!(vals, &[3]);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -14,35 +14,27 @@ use crate::Opstamp;
|
||||
// The doc to opstamp mapping stores precisely an array
|
||||
// indexed by doc id and storing the opstamp of the document.
|
||||
//
|
||||
// This mapping is (for the moment) stricly increasing
|
||||
// because of the way document id are allocated.
|
||||
// This mapping is NOT necessarily increasing, because
|
||||
// we might be sorting documents according to a fast field.
|
||||
#[derive(Clone)]
|
||||
pub enum DocToOpstampMapping<'a> {
|
||||
WithMap(&'a [Opstamp]),
|
||||
None,
|
||||
}
|
||||
|
||||
impl<'a> From<&'a [u64]> for DocToOpstampMapping<'a> {
|
||||
fn from(opstamps: &[Opstamp]) -> DocToOpstampMapping {
|
||||
DocToOpstampMapping::WithMap(opstamps)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> DocToOpstampMapping<'a> {
|
||||
/// Given an opstamp return the limit doc id L
|
||||
/// such that all doc id D such that
|
||||
// D >= L iff opstamp(D) >= than `target_opstamp`.
|
||||
//
|
||||
// The edge case opstamp = some doc opstamp is in practise
|
||||
// never called.
|
||||
pub fn compute_doc_limit(&self, target_opstamp: Opstamp) -> DocId {
|
||||
match *self {
|
||||
DocToOpstampMapping::WithMap(ref doc_opstamps) => {
|
||||
match doc_opstamps.binary_search(&target_opstamp) {
|
||||
Ok(doc_id) | Err(doc_id) => doc_id as DocId,
|
||||
}
|
||||
/// Assess whether a document should be considered deleted given that it contains
|
||||
/// a deleted term that was deleted at the opstamp: `delete_opstamp`.
|
||||
///
|
||||
/// This function returns true if the `DocToOpstamp` mapping is none or if
|
||||
/// the `doc_opstamp` is anterior to the delete opstamp.
|
||||
pub fn is_deleted(&self, doc_id: DocId, delete_opstamp: Opstamp) -> bool {
|
||||
match self {
|
||||
Self::WithMap(doc_opstamps) => {
|
||||
let doc_opstamp = doc_opstamps[doc_id as usize];
|
||||
doc_opstamp < delete_opstamp
|
||||
}
|
||||
DocToOpstampMapping::None => DocId::max_value(),
|
||||
Self::None => true,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -55,40 +47,17 @@ mod tests {
|
||||
#[test]
|
||||
fn test_doc_to_opstamp_mapping_none() {
|
||||
let doc_to_opstamp_mapping = DocToOpstampMapping::None;
|
||||
assert_eq!(
|
||||
doc_to_opstamp_mapping.compute_doc_limit(1),
|
||||
u32::max_value()
|
||||
);
|
||||
assert!(doc_to_opstamp_mapping.is_deleted(1u32, 0u64));
|
||||
assert!(doc_to_opstamp_mapping.is_deleted(1u32, 2u64));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_doc_to_opstamp_mapping_complex() {
|
||||
{
|
||||
let doc_to_opstamp_mapping = DocToOpstampMapping::from(&[][..]);
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0);
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(2u64), 0);
|
||||
}
|
||||
{
|
||||
let doc_to_opstamp_mapping = DocToOpstampMapping::from(&[1u64][..]);
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0);
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(2u64), 1);
|
||||
}
|
||||
{
|
||||
let doc_to_opstamp_mapping =
|
||||
DocToOpstampMapping::from(&[1u64, 12u64, 17u64, 23u64][..]);
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0);
|
||||
for i in 2u64..13u64 {
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(i), 1);
|
||||
}
|
||||
for i in 13u64..18u64 {
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(i), 2);
|
||||
}
|
||||
for i in 18u64..24u64 {
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(i), 3);
|
||||
}
|
||||
for i in 24u64..30u64 {
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(i), 4);
|
||||
}
|
||||
}
|
||||
fn test_doc_to_opstamp_mapping_with_map() {
|
||||
let doc_to_opstamp_mapping = DocToOpstampMapping::WithMap(&[5u64, 1u64, 0u64, 4u64, 3u64]);
|
||||
assert_eq!(doc_to_opstamp_mapping.is_deleted(0u32, 2u64), false);
|
||||
assert_eq!(doc_to_opstamp_mapping.is_deleted(1u32, 2u64), true);
|
||||
assert_eq!(doc_to_opstamp_mapping.is_deleted(2u32, 2u64), true);
|
||||
assert_eq!(doc_to_opstamp_mapping.is_deleted(3u32, 2u64), false);
|
||||
assert_eq!(doc_to_opstamp_mapping.is_deleted(4u32, 2u64), false);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -43,6 +43,9 @@ pub const MARGIN_IN_BYTES: usize = 1_000_000;
|
||||
pub const HEAP_SIZE_MIN: usize = ((MARGIN_IN_BYTES as u32) * 3u32) as usize;
|
||||
pub const HEAP_SIZE_MAX: usize = u32::max_value() as usize - MARGIN_IN_BYTES;
|
||||
|
||||
// We impose the number of index writter thread to be at most this.
|
||||
pub const MAX_NUM_THREAD: usize = 8;
|
||||
|
||||
// Add document will block if the number of docs waiting in the queue to be indexed
|
||||
// reaches `PIPELINE_MAX_SIZE_IN_DOCS`
|
||||
const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000;
|
||||
@@ -50,7 +53,7 @@ const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000;
|
||||
// Group of operations.
|
||||
// Most of the time, users will send operation one-by-one, but it can be useful to
|
||||
// send them as a small block to ensure that
|
||||
// - all docs in the operation will happen on the same segment and continuous docids.
|
||||
// - all docs in the operation will happen on the same segment and continuous doc_ids.
|
||||
// - all operations in the group are committed at the same time, making the group
|
||||
// atomic.
|
||||
type OperationGroup = SmallVec<[AddOperation; 4]>;
|
||||
@@ -103,22 +106,18 @@ fn compute_deleted_bitset(
|
||||
}
|
||||
|
||||
// A delete operation should only affect
|
||||
// document that were inserted after it.
|
||||
//
|
||||
// Limit doc helps identify the first document
|
||||
// that may be affected by the delete operation.
|
||||
let limit_doc = doc_opstamps.compute_doc_limit(delete_op.opstamp);
|
||||
// document that were inserted before it.
|
||||
let inverted_index = segment_reader.inverted_index(delete_op.term.field())?;
|
||||
if let Some(mut docset) =
|
||||
inverted_index.read_postings(&delete_op.term, IndexRecordOption::Basic)?
|
||||
{
|
||||
let mut deleted_doc = docset.doc();
|
||||
while deleted_doc != TERMINATED {
|
||||
if deleted_doc < limit_doc {
|
||||
delete_bitset.insert(deleted_doc);
|
||||
let mut doc_matching_deleted_term = docset.doc();
|
||||
while doc_matching_deleted_term != TERMINATED {
|
||||
if doc_opstamps.is_deleted(doc_matching_deleted_term, delete_op.opstamp) {
|
||||
delete_bitset.insert(doc_matching_deleted_term);
|
||||
might_have_changed = true;
|
||||
}
|
||||
deleted_doc = docset.advance();
|
||||
doc_matching_deleted_term = docset.advance();
|
||||
}
|
||||
}
|
||||
delete_cursor.advance();
|
||||
@@ -180,7 +179,7 @@ pub(crate) fn advance_deletes(
|
||||
if num_deleted_docs > num_deleted_docs_before {
|
||||
// There are new deletes. We need to write a new delete file.
|
||||
segment = segment.with_delete_meta(num_deleted_docs as u32, target_opstamp);
|
||||
let mut delete_file = segment.open_write(SegmentComponent::DELETE)?;
|
||||
let mut delete_file = segment.open_write(SegmentComponent::Delete)?;
|
||||
write_delete_bitset(&delete_bitset, max_doc, &mut delete_file)?;
|
||||
delete_file.terminate()?;
|
||||
}
|
||||
@@ -227,37 +226,37 @@ fn index_documents(
|
||||
|
||||
let segment_with_max_doc = segment.with_max_doc(max_doc);
|
||||
|
||||
let last_docstamp: Opstamp = *(doc_opstamps.last().unwrap());
|
||||
let delete_bitset_opt =
|
||||
apply_deletes(&segment_with_max_doc, &mut delete_cursor, &doc_opstamps)?;
|
||||
|
||||
let delete_bitset_opt = apply_deletes(
|
||||
&segment_with_max_doc,
|
||||
&mut delete_cursor,
|
||||
&doc_opstamps,
|
||||
last_docstamp,
|
||||
)?;
|
||||
|
||||
let segment_entry = SegmentEntry::new(
|
||||
segment_with_max_doc.meta().clone(),
|
||||
delete_cursor,
|
||||
delete_bitset_opt,
|
||||
);
|
||||
let meta = segment_with_max_doc.meta().clone();
|
||||
meta.untrack_temp_docstore();
|
||||
// update segment_updater inventory to remove tempstore
|
||||
let segment_entry = SegmentEntry::new(meta, delete_cursor, delete_bitset_opt);
|
||||
block_on(segment_updater.schedule_add_segment(segment_entry))?;
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
/// `doc_opstamps` is required to be non-empty.
|
||||
fn apply_deletes(
|
||||
segment: &Segment,
|
||||
mut delete_cursor: &mut DeleteCursor,
|
||||
doc_opstamps: &[Opstamp],
|
||||
last_docstamp: Opstamp,
|
||||
) -> crate::Result<Option<BitSet>> {
|
||||
if delete_cursor.get().is_none() {
|
||||
// if there are no delete operation in the queue, no need
|
||||
// to even open the segment.
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let max_doc_opstamp: Opstamp = doc_opstamps
|
||||
.iter()
|
||||
.cloned()
|
||||
.max()
|
||||
.expect("Empty DocOpstamp is forbidden");
|
||||
|
||||
let segment_reader = SegmentReader::open(segment)?;
|
||||
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
|
||||
let doc_to_opstamps = DocToOpstampMapping::WithMap(doc_opstamps);
|
||||
|
||||
let max_doc = segment.meta().max_doc();
|
||||
let mut deleted_bitset = BitSet::with_max_value(max_doc);
|
||||
@@ -266,7 +265,7 @@ fn apply_deletes(
|
||||
&segment_reader,
|
||||
&mut delete_cursor,
|
||||
&doc_to_opstamps,
|
||||
last_docstamp,
|
||||
max_doc_opstamp,
|
||||
)?;
|
||||
Ok(if may_have_deletes {
|
||||
Some(deleted_bitset)
|
||||
@@ -356,7 +355,7 @@ impl IndexWriter {
|
||||
// dropping the last reference to the segment_updater.
|
||||
self.drop_sender();
|
||||
|
||||
let former_workers_handles = mem::replace(&mut self.workers_join_handle, vec![]);
|
||||
let former_workers_handles = std::mem::take(&mut self.workers_join_handle);
|
||||
for join_handle in former_workers_handles {
|
||||
join_handle
|
||||
.join()
|
||||
@@ -626,7 +625,7 @@ impl IndexWriter {
|
||||
// and recreate a new one.
|
||||
self.recreate_document_channel();
|
||||
|
||||
let former_workers_join_handle = mem::replace(&mut self.workers_join_handle, Vec::new());
|
||||
let former_workers_join_handle = std::mem::take(&mut self.workers_join_handle);
|
||||
|
||||
for worker_handle in former_workers_join_handle {
|
||||
let indexing_worker_result = worker_handle
|
||||
@@ -782,17 +781,44 @@ impl Drop for IndexWriter {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::HashMap;
|
||||
use std::collections::HashSet;
|
||||
|
||||
use futures::executor::block_on;
|
||||
use proptest::prelude::*;
|
||||
use proptest::prop_oneof;
|
||||
use proptest::strategy::Strategy;
|
||||
|
||||
use super::super::operation::UserOperation;
|
||||
use crate::collector::TopDocs;
|
||||
use crate::directory::error::LockError;
|
||||
use crate::error::*;
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::indexer::NoMergePolicy;
|
||||
use crate::query::QueryParser;
|
||||
use crate::query::TermQuery;
|
||||
use crate::schema::{self, IndexRecordOption, STRING};
|
||||
use crate::schema::Cardinality;
|
||||
use crate::schema::Facet;
|
||||
use crate::schema::IntOptions;
|
||||
use crate::schema::TextFieldIndexing;
|
||||
use crate::schema::TextOptions;
|
||||
use crate::schema::STORED;
|
||||
use crate::schema::TEXT;
|
||||
use crate::schema::{self, IndexRecordOption, FAST, INDEXED, STRING};
|
||||
use crate::DocAddress;
|
||||
use crate::Index;
|
||||
use crate::ReloadPolicy;
|
||||
use crate::Term;
|
||||
use crate::{IndexSettings, IndexSortByField, Order};
|
||||
|
||||
const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \
|
||||
do eiusmod tempor incididunt ut labore et dolore magna aliqua. \
|
||||
Ut enim ad minim veniam, quis nostrud exercitation ullamco \
|
||||
laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure \
|
||||
dolor in reprehenderit in voluptate velit esse cillum dolore eu \
|
||||
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non \
|
||||
proident, sunt in culpa qui officia deserunt mollit anim id est \
|
||||
laborum.";
|
||||
|
||||
#[test]
|
||||
fn test_operations_group() {
|
||||
@@ -943,7 +969,7 @@ mod tests {
|
||||
let index_writer = index.writer(3_000_000).unwrap();
|
||||
assert_eq!(
|
||||
format!("{:?}", index_writer.get_merge_policy()),
|
||||
"LogMergePolicy { min_merge_size: 8, max_merge_size: 10000000, min_layer_size: 10000, \
|
||||
"LogMergePolicy { min_num_segments: 8, max_docs_before_merge: 10000000, min_layer_size: 10000, \
|
||||
level_log_size: 0.75 }"
|
||||
);
|
||||
let merge_policy = Box::new(NoMergePolicy::default());
|
||||
@@ -1280,6 +1306,330 @@ mod tests {
|
||||
assert!(commit_again.is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_delete_with_sort_by_field() -> crate::Result<()> {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let id_field =
|
||||
schema_builder.add_u64_field("id", schema::INDEXED | schema::STORED | schema::FAST);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let settings = IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "id".to_string(),
|
||||
order: Order::Desc,
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let index = Index::builder()
|
||||
.schema(schema)
|
||||
.settings(settings)
|
||||
.create_in_ram()?;
|
||||
let index_reader = index.reader()?;
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
|
||||
// create and delete docs in same commit
|
||||
for id in 0u64..5u64 {
|
||||
index_writer.add_document(doc!(id_field => id));
|
||||
}
|
||||
for id in 2u64..4u64 {
|
||||
index_writer.delete_term(Term::from_field_u64(id_field, id));
|
||||
}
|
||||
for id in 5u64..10u64 {
|
||||
index_writer.add_document(doc!(id_field => id));
|
||||
}
|
||||
index_writer.commit()?;
|
||||
index_reader.reload()?;
|
||||
|
||||
let searcher = index_reader.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
assert_eq!(segment_reader.num_docs(), 8);
|
||||
assert_eq!(segment_reader.max_doc(), 10);
|
||||
let fast_field_reader = segment_reader.fast_fields().u64(id_field)?;
|
||||
let in_order_alive_ids: Vec<u64> = segment_reader
|
||||
.doc_ids_alive()
|
||||
.map(|doc| fast_field_reader.get(doc))
|
||||
.collect();
|
||||
assert_eq!(&in_order_alive_ids[..], &[9, 8, 7, 6, 5, 4, 1, 0]);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
enum IndexingOp {
|
||||
AddDoc { id: u64 },
|
||||
DeleteDoc { id: u64 },
|
||||
Commit,
|
||||
}
|
||||
|
||||
fn operation_strategy() -> impl Strategy<Value = IndexingOp> {
|
||||
prop_oneof![
|
||||
(0u64..10u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
|
||||
(0u64..10u64).prop_map(|id| IndexingOp::AddDoc { id }),
|
||||
(0u64..2u64).prop_map(|_| IndexingOp::Commit),
|
||||
]
|
||||
}
|
||||
|
||||
fn expected_ids(ops: &[IndexingOp]) -> (HashMap<u64, u64>, HashSet<u64>) {
|
||||
let mut existing_ids = HashMap::new();
|
||||
let mut deleted_ids = HashSet::new();
|
||||
for &op in ops {
|
||||
match op {
|
||||
IndexingOp::AddDoc { id } => {
|
||||
*existing_ids.entry(id).or_insert(0) += 1;
|
||||
deleted_ids.remove(&id);
|
||||
}
|
||||
IndexingOp::DeleteDoc { id } => {
|
||||
existing_ids.remove(&id);
|
||||
deleted_ids.insert(id);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
(existing_ids, deleted_ids)
|
||||
}
|
||||
|
||||
fn test_operation_strategy(
|
||||
ops: &[IndexingOp],
|
||||
sort_index: bool,
|
||||
force_merge: bool,
|
||||
) -> crate::Result<()> {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let id_field = schema_builder.add_u64_field("id", FAST | INDEXED | STORED);
|
||||
let text_field = schema_builder.add_text_field(
|
||||
"text_field",
|
||||
TextOptions::default()
|
||||
.set_indexing_options(
|
||||
TextFieldIndexing::default()
|
||||
.set_index_option(schema::IndexRecordOption::WithFreqsAndPositions),
|
||||
)
|
||||
.set_stored(),
|
||||
);
|
||||
|
||||
let large_text_field = schema_builder.add_text_field("large_text_field", TEXT | STORED);
|
||||
|
||||
let multi_numbers = schema_builder.add_u64_field(
|
||||
"multi_numbers",
|
||||
IntOptions::default()
|
||||
.set_fast(Cardinality::MultiValues)
|
||||
.set_stored(),
|
||||
);
|
||||
let facet_field = schema_builder.add_facet_field("facet", INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
let settings = if sort_index {
|
||||
IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "id".to_string(),
|
||||
order: Order::Asc,
|
||||
}),
|
||||
..Default::default()
|
||||
}
|
||||
} else {
|
||||
IndexSettings {
|
||||
..Default::default()
|
||||
}
|
||||
};
|
||||
let index = Index::builder()
|
||||
.schema(schema)
|
||||
.settings(settings)
|
||||
.create_in_ram()?;
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
for &op in ops {
|
||||
match op {
|
||||
IndexingOp::AddDoc { id } => {
|
||||
let facet = Facet::from(&("/cola/".to_string() + &id.to_string()));
|
||||
index_writer
|
||||
.add_document(doc!(id_field=>id, multi_numbers=> id, multi_numbers => id, text_field => id.to_string(), facet_field => facet, large_text_field=> LOREM));
|
||||
}
|
||||
IndexingOp::DeleteDoc { id } => {
|
||||
index_writer.delete_term(Term::from_field_u64(id_field, id));
|
||||
}
|
||||
IndexingOp::Commit => {
|
||||
index_writer.commit()?;
|
||||
}
|
||||
}
|
||||
}
|
||||
index_writer.commit()?;
|
||||
|
||||
let searcher = index.reader()?.searcher();
|
||||
if force_merge {
|
||||
index_writer.wait_merging_threads()?;
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
let segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
if segment_ids.len() >= 2 {
|
||||
block_on(index_writer.merge(&segment_ids)).unwrap();
|
||||
assert!(index_writer.wait_merging_threads().is_ok());
|
||||
}
|
||||
}
|
||||
let ids: HashSet<u64> = searcher
|
||||
.segment_readers()
|
||||
.iter()
|
||||
.flat_map(|segment_reader| {
|
||||
let ff_reader = segment_reader.fast_fields().u64(id_field).unwrap();
|
||||
segment_reader
|
||||
.doc_ids_alive()
|
||||
.map(move |doc| ff_reader.get(doc))
|
||||
})
|
||||
.collect();
|
||||
|
||||
let (expected_ids_and_num_occurences, deleted_ids) = expected_ids(ops);
|
||||
assert_eq!(
|
||||
ids,
|
||||
expected_ids_and_num_occurences
|
||||
.keys()
|
||||
.cloned()
|
||||
.collect::<HashSet<_>>()
|
||||
);
|
||||
|
||||
// multivalue fast field tests
|
||||
for segment_reader in searcher.segment_readers().iter() {
|
||||
let ff_reader = segment_reader.fast_fields().u64s(multi_numbers).unwrap();
|
||||
for doc in segment_reader.doc_ids_alive() {
|
||||
let mut vals = vec![];
|
||||
ff_reader.get_vals(doc, &mut vals);
|
||||
assert_eq!(vals.len(), 2);
|
||||
assert_eq!(vals[0], vals[1]);
|
||||
assert!(expected_ids_and_num_occurences.contains_key(&vals[0]));
|
||||
}
|
||||
}
|
||||
|
||||
// doc store tests
|
||||
for segment_reader in searcher.segment_readers().iter() {
|
||||
let store_reader = segment_reader.get_store_reader().unwrap();
|
||||
// test store iterator
|
||||
for doc in store_reader.iter(segment_reader.delete_bitset()) {
|
||||
let id = doc
|
||||
.unwrap()
|
||||
.get_first(id_field)
|
||||
.unwrap()
|
||||
.u64_value()
|
||||
.unwrap();
|
||||
assert!(expected_ids_and_num_occurences.contains_key(&id));
|
||||
}
|
||||
// test store random access
|
||||
for doc_id in segment_reader.doc_ids_alive() {
|
||||
let id = store_reader
|
||||
.get(doc_id)
|
||||
.unwrap()
|
||||
.get_first(id_field)
|
||||
.unwrap()
|
||||
.u64_value()
|
||||
.unwrap();
|
||||
assert!(expected_ids_and_num_occurences.contains_key(&id));
|
||||
let id2 = store_reader
|
||||
.get(doc_id)
|
||||
.unwrap()
|
||||
.get_first(multi_numbers)
|
||||
.unwrap()
|
||||
.u64_value()
|
||||
.unwrap();
|
||||
assert_eq!(id, id2);
|
||||
}
|
||||
}
|
||||
// test search
|
||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
||||
|
||||
let do_search = |term: &str| {
|
||||
let query = QueryParser::for_index(&index, vec![my_text_field])
|
||||
.parse_query(term)
|
||||
.unwrap();
|
||||
let top_docs: Vec<(f32, DocAddress)> =
|
||||
searcher.search(&query, &TopDocs::with_limit(1000)).unwrap();
|
||||
|
||||
top_docs.iter().map(|el| el.1).collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
for (existing_id, count) in expected_ids_and_num_occurences {
|
||||
assert_eq!(do_search(&existing_id.to_string()).len() as u64, count);
|
||||
}
|
||||
for existing_id in deleted_ids {
|
||||
assert_eq!(do_search(&existing_id.to_string()).len(), 0);
|
||||
}
|
||||
// test facets
|
||||
for segment_reader in searcher.segment_readers().iter() {
|
||||
let mut facet_reader = segment_reader.facet_reader(facet_field).unwrap();
|
||||
let ff_reader = segment_reader.fast_fields().u64(id_field).unwrap();
|
||||
for doc_id in segment_reader.doc_ids_alive() {
|
||||
let mut facet_ords = Vec::new();
|
||||
facet_reader.facet_ords(doc_id, &mut facet_ords);
|
||||
assert_eq!(facet_ords.len(), 1);
|
||||
let mut facet = Facet::default();
|
||||
facet_reader
|
||||
.facet_from_ord(facet_ords[0], &mut facet)
|
||||
.unwrap();
|
||||
let id = ff_reader.get(doc_id);
|
||||
let facet_expected = Facet::from(&("/cola/".to_string() + &id.to_string()));
|
||||
|
||||
assert_eq!(facet, facet_expected);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
proptest! {
|
||||
#[test]
|
||||
fn test_delete_with_sort_proptest(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
|
||||
assert!(test_operation_strategy(&ops[..], true, false).is_ok());
|
||||
}
|
||||
#[test]
|
||||
fn test_delete_without_sort_proptest(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
|
||||
assert!(test_operation_strategy(&ops[..], false, false).is_ok());
|
||||
}
|
||||
#[test]
|
||||
fn test_delete_with_sort_proptest_with_merge(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
|
||||
assert!(test_operation_strategy(&ops[..], true, true).is_ok());
|
||||
}
|
||||
#[test]
|
||||
fn test_delete_without_sort_proptest_with_merge(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
|
||||
assert!(test_operation_strategy(&ops[..], false, true).is_ok());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_delete_with_sort_by_field_last_opstamp_is_not_max() -> crate::Result<()> {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let sort_by_field = schema_builder.add_u64_field("sort_by", FAST);
|
||||
let id_field = schema_builder.add_u64_field("id", INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let settings = IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "sort_by".to_string(),
|
||||
order: Order::Asc,
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let index = Index::builder()
|
||||
.schema(schema)
|
||||
.settings(settings)
|
||||
.create_in_ram()?;
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
|
||||
// We add a doc...
|
||||
index_writer.add_document(doc!(sort_by_field => 2u64, id_field => 0u64));
|
||||
// And remove it.
|
||||
index_writer.delete_term(Term::from_field_u64(id_field, 0u64));
|
||||
// We add another doc.
|
||||
index_writer.add_document(doc!(sort_by_field=>1u64, id_field => 0u64));
|
||||
|
||||
// The expected result is a segment with
|
||||
// maxdoc = 2
|
||||
// numdoc = 1.
|
||||
index_writer.commit()?;
|
||||
|
||||
let searcher = index.reader()?.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
assert_eq!(segment_reader.max_doc(), 2);
|
||||
assert_eq!(segment_reader.num_deleted_docs(), 1);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_index_doc_missing_field() {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
|
||||
@@ -1,19 +1,20 @@
|
||||
use super::merge_policy::{MergeCandidate, MergePolicy};
|
||||
use crate::core::SegmentMeta;
|
||||
use itertools::Itertools;
|
||||
use std::cmp;
|
||||
use std::f64;
|
||||
|
||||
const DEFAULT_LEVEL_LOG_SIZE: f64 = 0.75;
|
||||
const DEFAULT_MIN_LAYER_SIZE: u32 = 10_000;
|
||||
const DEFAULT_MIN_MERGE_SIZE: usize = 8;
|
||||
const DEFAULT_MAX_MERGE_SIZE: usize = 10_000_000;
|
||||
const DEFAULT_MIN_NUM_SEGMENTS_IN_MERGE: usize = 8;
|
||||
const DEFAULT_MAX_DOCS_BEFORE_MERGE: usize = 10_000_000;
|
||||
|
||||
/// `LogMergePolicy` tries to merge segments that have a similar number of
|
||||
/// documents.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct LogMergePolicy {
|
||||
min_merge_size: usize,
|
||||
max_merge_size: usize,
|
||||
min_num_segments: usize,
|
||||
max_docs_before_merge: usize,
|
||||
min_layer_size: u32,
|
||||
level_log_size: f64,
|
||||
}
|
||||
@@ -23,15 +24,16 @@ impl LogMergePolicy {
|
||||
cmp::max(self.min_layer_size, size)
|
||||
}
|
||||
|
||||
/// Set the minimum number of segment that may be merge together.
|
||||
pub fn set_min_merge_size(&mut self, min_merge_size: usize) {
|
||||
self.min_merge_size = min_merge_size;
|
||||
/// Set the minimum number of segments that may be merged together.
|
||||
pub fn set_min_num_segments(&mut self, min_num_segments: usize) {
|
||||
self.min_num_segments = min_num_segments;
|
||||
}
|
||||
|
||||
/// Set the maximum number docs in a segment for it to be considered for
|
||||
/// merging.
|
||||
pub fn set_max_merge_size(&mut self, max_merge_size: usize) {
|
||||
self.max_merge_size = max_merge_size;
|
||||
/// merging. A segment can still reach more than max_docs, by merging many
|
||||
/// smaller ones.
|
||||
pub fn set_max_docs_before_merge(&mut self, max_docs_merge_size: usize) {
|
||||
self.max_docs_before_merge = max_docs_merge_size;
|
||||
}
|
||||
|
||||
/// Set the minimum segment size under which all segment belong
|
||||
@@ -42,7 +44,7 @@ impl LogMergePolicy {
|
||||
|
||||
/// Set the ratio between two consecutive levels.
|
||||
///
|
||||
/// Segment are group in levels according to their sizes.
|
||||
/// Segments are grouped in levels according to their sizes.
|
||||
/// These levels are defined as intervals of exponentially growing sizes.
|
||||
/// level_log_size define the factor by which one should multiply the limit
|
||||
/// to reach a level, in order to get the limit to reach the following
|
||||
@@ -54,52 +56,43 @@ impl LogMergePolicy {
|
||||
|
||||
impl MergePolicy for LogMergePolicy {
|
||||
fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate> {
|
||||
let mut size_sorted_tuples = segments
|
||||
let mut size_sorted_segments = segments
|
||||
.iter()
|
||||
.map(SegmentMeta::num_docs)
|
||||
.filter(|s| s <= &(self.max_merge_size as u32))
|
||||
.enumerate()
|
||||
.collect::<Vec<(usize, u32)>>();
|
||||
.filter(|segment_meta| segment_meta.num_docs() <= (self.max_docs_before_merge as u32))
|
||||
.collect::<Vec<&SegmentMeta>>();
|
||||
|
||||
size_sorted_tuples.sort_by(|x, y| y.1.cmp(&(x.1)));
|
||||
|
||||
if size_sorted_tuples.len() <= 1 {
|
||||
return Vec::new();
|
||||
}
|
||||
|
||||
let size_sorted_log_tuples: Vec<_> = size_sorted_tuples
|
||||
.into_iter()
|
||||
.map(|(ind, num_docs)| (ind, f64::from(self.clip_min_size(num_docs)).log2()))
|
||||
.collect();
|
||||
|
||||
if let Some(&(first_ind, first_score)) = size_sorted_log_tuples.first() {
|
||||
let mut current_max_log_size = first_score;
|
||||
let mut levels = vec![vec![first_ind]];
|
||||
for &(ind, score) in (&size_sorted_log_tuples).iter().skip(1) {
|
||||
if score < (current_max_log_size - self.level_log_size) {
|
||||
current_max_log_size = score;
|
||||
levels.push(Vec::new());
|
||||
}
|
||||
levels.last_mut().unwrap().push(ind);
|
||||
}
|
||||
levels
|
||||
.iter()
|
||||
.filter(|level| level.len() >= self.min_merge_size)
|
||||
.map(|ind_vec| {
|
||||
MergeCandidate(ind_vec.iter().map(|&ind| segments[ind].id()).collect())
|
||||
})
|
||||
.collect()
|
||||
} else {
|
||||
if size_sorted_segments.len() <= 1 {
|
||||
return vec![];
|
||||
}
|
||||
size_sorted_segments.sort_by_key(|seg| std::cmp::Reverse(seg.num_docs()));
|
||||
|
||||
let mut current_max_log_size = f64::MAX;
|
||||
let mut levels = vec![];
|
||||
for (_, merge_group) in &size_sorted_segments.into_iter().group_by(|segment| {
|
||||
let segment_log_size = f64::from(self.clip_min_size(segment.num_docs())).log2();
|
||||
if segment_log_size < (current_max_log_size - self.level_log_size) {
|
||||
// update current_max_log_size to create a new group
|
||||
current_max_log_size = segment_log_size;
|
||||
}
|
||||
// return current_max_log_size to be grouped to the current group
|
||||
current_max_log_size
|
||||
}) {
|
||||
levels.push(merge_group.collect::<Vec<&SegmentMeta>>());
|
||||
}
|
||||
|
||||
levels
|
||||
.iter()
|
||||
.filter(|level| level.len() >= self.min_num_segments)
|
||||
.map(|segments| MergeCandidate(segments.iter().map(|&seg| seg.id()).collect()))
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for LogMergePolicy {
|
||||
fn default() -> LogMergePolicy {
|
||||
LogMergePolicy {
|
||||
min_merge_size: DEFAULT_MIN_MERGE_SIZE,
|
||||
max_merge_size: DEFAULT_MAX_MERGE_SIZE,
|
||||
min_num_segments: DEFAULT_MIN_NUM_SEGMENTS_IN_MERGE,
|
||||
max_docs_before_merge: DEFAULT_MAX_DOCS_BEFORE_MERGE,
|
||||
min_layer_size: DEFAULT_MIN_LAYER_SIZE,
|
||||
level_log_size: DEFAULT_LEVEL_LOG_SIZE,
|
||||
}
|
||||
@@ -109,16 +102,79 @@ impl Default for LogMergePolicy {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::core::{SegmentId, SegmentMeta, SegmentMetaInventory};
|
||||
use crate::indexer::merge_policy::MergePolicy;
|
||||
use crate::{
|
||||
core::{SegmentId, SegmentMeta, SegmentMetaInventory},
|
||||
schema,
|
||||
};
|
||||
use crate::{indexer::merge_policy::MergePolicy, schema::INDEXED};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
static INVENTORY: Lazy<SegmentMetaInventory> = Lazy::new(SegmentMetaInventory::default);
|
||||
|
||||
use crate::Index;
|
||||
|
||||
#[test]
|
||||
fn create_index_test_max_merge_issue_1035() {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let int_field = schema_builder.add_u64_field("intval", INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
{
|
||||
let mut log_merge_policy = LogMergePolicy::default();
|
||||
log_merge_policy.set_min_num_segments(1);
|
||||
log_merge_policy.set_max_docs_before_merge(1);
|
||||
log_merge_policy.set_min_layer_size(0);
|
||||
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
index_writer.set_merge_policy(Box::new(log_merge_policy));
|
||||
|
||||
// after every commit the merge checker is started, it will merge only segments with 1
|
||||
// element in it because of the max_merge_size.
|
||||
index_writer.add_document(doc!(int_field=>1_u64));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
|
||||
index_writer.add_document(doc!(int_field=>2_u64));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
|
||||
index_writer.add_document(doc!(int_field=>3_u64));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
|
||||
index_writer.add_document(doc!(int_field=>4_u64));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
|
||||
index_writer.add_document(doc!(int_field=>5_u64));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
|
||||
index_writer.add_document(doc!(int_field=>6_u64));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
|
||||
index_writer.add_document(doc!(int_field=>7_u64));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
|
||||
index_writer.add_document(doc!(int_field=>8_u64));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
|
||||
let _segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let segment_readers = searcher.segment_readers();
|
||||
for segment in segment_readers {
|
||||
if segment.num_docs() > 2 {
|
||||
panic!("segment can't have more than two segments");
|
||||
} // don't know how to wait for the merge, then it could be a simple eq
|
||||
}
|
||||
}
|
||||
|
||||
fn test_merge_policy() -> LogMergePolicy {
|
||||
let mut log_merge_policy = LogMergePolicy::default();
|
||||
log_merge_policy.set_min_merge_size(3);
|
||||
log_merge_policy.set_max_merge_size(100_000);
|
||||
log_merge_policy.set_min_num_segments(3);
|
||||
log_merge_policy.set_max_docs_before_merge(100_000);
|
||||
log_merge_policy.set_min_layer_size(2);
|
||||
log_merge_policy
|
||||
}
|
||||
@@ -216,12 +272,19 @@ mod tests {
|
||||
create_random_segment_meta(1_000_000),
|
||||
create_random_segment_meta(100_001),
|
||||
create_random_segment_meta(100_000),
|
||||
create_random_segment_meta(1_000_001),
|
||||
create_random_segment_meta(100_000),
|
||||
create_random_segment_meta(100_000),
|
||||
create_random_segment_meta(1_500_000),
|
||||
];
|
||||
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
|
||||
// Do not include large segments
|
||||
assert_eq!(result_list.len(), 1);
|
||||
assert_eq!(result_list[0].0.len(), 3)
|
||||
assert_eq!(result_list[0].0.len(), 3);
|
||||
|
||||
// Making sure merge policy points to the correct index of the original input
|
||||
assert_eq!(result_list[0].0[0], test_input[2].id());
|
||||
assert_eq!(result_list[0].0[1], test_input[4].id());
|
||||
assert_eq!(result_list[0].0[2], test_input[5].id());
|
||||
}
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user