Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2026-01-03 07:42:54 +00:00)

Compare commits: removed-ex ... default_fa (51 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | 61422d7cd5 |  |
|  | 820f126075 |  |
|  | 7e6c4a1856 |  |
|  | 5fafe4b1ab |  |
|  | 1e7cd48cfa |  |
|  | 7f51d85bbd |  |
|  | ad76e32398 |  |
|  | 7575f9bf1c |  |
|  | 67bdf3f5f6 |  |
|  | 3c300666ad |  |
|  | b91d3f6be4 |  |
|  | a8e76513bb |  |
|  | 0a23201338 |  |
|  | 81330aaf89 |  |
|  | 98a3b01992 |  |
|  | d341520938 |  |
|  | 5c9af73e41 |  |
|  | ad4c940fa3 |  |
|  | 910b0b0c61 |  |
|  | 3fef052bf1 |  |
|  | 040554f2f9 |  |
|  | 17186ca9c9 |  |
|  | 212d59c9ab |  |
|  | 1a1f252a3f |  |
|  | d73706dede |  |
|  | 44850e1036 |  |
|  | 3b0cbf8102 |  |
|  | 4aa131c3db |  |
|  | 59962097d0 |  |
|  | ebc78127f3 |  |
|  | 8199aa7de7 |  |
|  | 657f0cd3bd |  |
|  | 3a82ef2560 |  |
|  | 3546e7fc63 |  |
|  | 862f367f9e |  |
|  | 14137d91c4 |  |
|  | 924fc70cb5 |  |
|  | 07023948aa |  |
|  | 0cb53207ec |  |
|  | 17c783b4db |  |
|  | 7220df8a09 |  |
|  | e3eacb4388 |  |
|  | fdecb79273 |  |
|  | 27f202083c |  |
|  | ccb09aaa83 |  |
|  | 4b7c485a08 |  |
|  | 3942fc6d2b |  |
|  | b325d569ad |  |
|  | 7ee78bda52 |  |
|  | 184a9daa8a |  |
|  | 47e01b345b |  |
5  .github/workflows/coverage.yml (vendored)
@@ -6,6 +6,11 @@ on:
  pull_request:
    branches: [main]

# Ensures that we cancel running jobs for the same PR / same workflow.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  coverage:
    runs-on: ubuntu-latest
5  .github/workflows/long_running.yml (vendored)
@@ -8,6 +8,11 @@ env:
  CARGO_TERM_COLOR: always
  NUM_FUNCTIONAL_TEST_ITERATIONS: 20000

# Ensures that we cancel running jobs for the same PR / same workflow.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  test:
7  .github/workflows/test.yml (vendored)
@@ -9,6 +9,11 @@ on:
env:
  CARGO_TERM_COLOR: always

# Ensures that we cancel running jobs for the same PR / same workflow.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  check:

@@ -48,7 +53,7 @@ jobs:
    strategy:
      matrix:
        features: [
          { label: "all", flags: "mmap,stopwords,brotli-compression,lz4-compression,snappy-compression,zstd-compression,failpoints" },
          { label: "all", flags: "mmap,stopwords,lz4-compression,zstd-compression,failpoints" },
          { label: "quickwit", flags: "mmap,quickwit,failpoints" }
        ]
89  CHANGELOG.md
@@ -1,3 +1,92 @@
Tantivy 0.20.2
================================
- Align numerical type priority order on the search side. [#2088](https://github.com/quickwit-oss/tantivy/issues/2088) (@fmassot)
- Fix is_child_of function not considering the root facet. [#2086](https://github.com/quickwit-oss/tantivy/issues/2086) (@adamreichhold)

Tantivy 0.20.1
================================
- Fix building on windows with mmap [#2070](https://github.com/quickwit-oss/tantivy/issues/2070) (@ChillFish8)

Tantivy 0.20
================================
#### Bugfixes
- Fix phrase queries with slop (slop now supports transpositions; the algorithm carries the slop so far for num terms > 2) [#2031](https://github.com/quickwit-oss/tantivy/issues/2031) [#2020](https://github.com/quickwit-oss/tantivy/issues/2020) (@PSeitz)
- Handle error for exists on MMapDirectory [#1988](https://github.com/quickwit-oss/tantivy/issues/1988) (@PSeitz)
- Aggregation
  - Fix min doc_count empty merge bug [#2057](https://github.com/quickwit-oss/tantivy/issues/2057) (@PSeitz)
  - Fix: Sort order for term aggregations (sort order on key was inverted) [#1858](https://github.com/quickwit-oss/tantivy/issues/1858) (@PSeitz)

#### Features/Improvements
- Add PhrasePrefixQuery [#1842](https://github.com/quickwit-oss/tantivy/issues/1842) (@trinity-1686a)
- Add `coerce` option for text and number types (convert the value instead of returning an error during indexing) [#1904](https://github.com/quickwit-oss/tantivy/issues/1904) (@PSeitz)
- Add regex tokenizer [#1759](https://github.com/quickwit-oss/tantivy/issues/1759) (@mkleen)
- Move tokenizer API to a separate crate. Having a separate crate with a stable API will allow us to use tokenizers with different tantivy versions. [#1767](https://github.com/quickwit-oss/tantivy/issues/1767) (@PSeitz)
- **Columnar crate**: New fast field handling (@fulmicoton @PSeitz) [#1806](https://github.com/quickwit-oss/tantivy/issues/1806) [#1809](https://github.com/quickwit-oss/tantivy/issues/1809)
  - Support for fast fields with optional values. Previously tantivy supported only single-valued and multi-valued fast fields. The encoding of optional fast fields is now very compact.
  - Fast field support for JSON (schemaless fast fields). Support multiple types on the same column. [#1876](https://github.com/quickwit-oss/tantivy/issues/1876) (@fulmicoton)
  - Unified access for fast fields over different cardinalities.
  - Unified storage for typed and untyped fields.
  - Move fastfield codecs into columnar. [#1782](https://github.com/quickwit-oss/tantivy/issues/1782) (@fulmicoton)
  - Sparse dense index for optional values [#1716](https://github.com/quickwit-oss/tantivy/issues/1716) (@PSeitz)
  - Switch to nanosecond precision in DateTime fastfield [#2016](https://github.com/quickwit-oss/tantivy/issues/2016) (@PSeitz)
- **Aggregation**
  - Add `date_histogram` aggregation (only `fixed_interval` for now) [#1900](https://github.com/quickwit-oss/tantivy/issues/1900) (@PSeitz)
  - Add `percentiles` aggregations [#1984](https://github.com/quickwit-oss/tantivy/issues/1984) (@PSeitz)
  - [**breaking**] Drop JSON support on intermediate agg result (we use postcard as format in `quickwit` to send intermediate results) [#1992](https://github.com/quickwit-oss/tantivy/issues/1992) (@PSeitz)
  - Set a memory limit in bytes for aggregations after which they abort (previously there was only the bucket limit) [#1942](https://github.com/quickwit-oss/tantivy/issues/1942) [#1957](https://github.com/quickwit-oss/tantivy/issues/1957) (@PSeitz)
  - Add support for u64, i64, f64 fields in term aggregation [#1883](https://github.com/quickwit-oss/tantivy/issues/1883) (@PSeitz)
  - Allow histogram bounds to be passed as Rfc3339 [#2076](https://github.com/quickwit-oss/tantivy/issues/2076) (@PSeitz)
  - Add count, min, max, and sum aggregations [#1794](https://github.com/quickwit-oss/tantivy/issues/1794) (@guilload)
  - Switch to Aggregation without serde_untagged => better deserialization errors. [#2003](https://github.com/quickwit-oss/tantivy/issues/2003) (@PSeitz)
  - Switch to ms in histogram for date type (ES compatibility) [#2045](https://github.com/quickwit-oss/tantivy/issues/2045) (@PSeitz)
  - Reduce term aggregation memory consumption [#2013](https://github.com/quickwit-oss/tantivy/issues/2013) (@PSeitz)
  - Reduce agg memory consumption: Replace the generic aggregation collector (which has a high memory requirement per instance) in the aggregation tree with optimized versions behind a trait.
  - Split term collection count and sub_agg (faster term agg with less memory consumption for cases without sub-aggs) [#1921](https://github.com/quickwit-oss/tantivy/issues/1921) (@PSeitz)
  - Schemaless aggregations: In combination with stacker, tantivy now supports schemaless aggregations via the JSON type.
    - Add aggregation support for JSON type [#1888](https://github.com/quickwit-oss/tantivy/issues/1888) (@PSeitz)
    - Mixed types support on JSON fields in aggs [#1971](https://github.com/quickwit-oss/tantivy/issues/1971) (@PSeitz)
  - Perf: Fetch blocks of vals in aggregation for all cardinalities [#1950](https://github.com/quickwit-oss/tantivy/issues/1950) (@PSeitz)
  - Allow histogram bounds to be passed as Rfc3339 [#2076](https://github.com/quickwit-oss/tantivy/issues/2076) (@PSeitz)
- `Searcher` with disabled scoring via `EnableScoring::Disabled` [#1780](https://github.com/quickwit-oss/tantivy/issues/1780) (@shikhar)
- Enable tokenizer on json fields [#2053](https://github.com/quickwit-oss/tantivy/issues/2053) (@PSeitz)
- Enforcing "NOT" and "-" queries consistency in UserInputAst [#1609](https://github.com/quickwit-oss/tantivy/issues/1609) (@bazhenov)
- Faster indexing
  - Refactor tokenization pipeline to use GATs [#1924](https://github.com/quickwit-oss/tantivy/issues/1924) (@trinity-1686a)
  - Faster term hash map [#2058](https://github.com/quickwit-oss/tantivy/issues/2058) [#1940](https://github.com/quickwit-oss/tantivy/issues/1940) (@PSeitz)
  - tokenizer-api: reduce Tokenizer allocation overhead [#2062](https://github.com/quickwit-oss/tantivy/issues/2062) (@PSeitz)
  - Refactor vint [#2010](https://github.com/quickwit-oss/tantivy/issues/2010) (@PSeitz)
- Faster search
  - Work in batches of docs on the SegmentCollector (only for cases without score for now) [#1937](https://github.com/quickwit-oss/tantivy/issues/1937) (@PSeitz)
  - Faster fast field range queries using SIMD [#1954](https://github.com/quickwit-oss/tantivy/issues/1954) (@fulmicoton)
  - Improve fast field range query performance [#1864](https://github.com/quickwit-oss/tantivy/issues/1864) (@PSeitz)
- Make BM25 scoring more flexible [#1855](https://github.com/quickwit-oss/tantivy/issues/1855) (@alexcole)
- Switch fs2 to fs4 as it is now unmaintained and does not support illumos [#1944](https://github.com/quickwit-oss/tantivy/issues/1944) (@Toasterson)
- Made BooleanWeight and BoostWeight public [#1991](https://github.com/quickwit-oss/tantivy/issues/1991) (@fulmicoton)
- Make index compatible with virtual drives on Windows [#1843](https://github.com/quickwit-oss/tantivy/issues/1843) (@gyk)
- Add stop words for Hungarian language [#2069](https://github.com/quickwit-oss/tantivy/issues/2069) (@tnxbutno)
- Auto downgrade index record option, instead of vint error [#1857](https://github.com/quickwit-oss/tantivy/issues/1857) (@PSeitz)
- Enable range query on fast field for u64 compatible types [#1762](https://github.com/quickwit-oss/tantivy/issues/1762) (@PSeitz) [#1876]
- sstable
  - Isolating sstable and stacker in independent crates. [#1718](https://github.com/quickwit-oss/tantivy/issues/1718) (@fulmicoton)
  - New sstable format [#1943](https://github.com/quickwit-oss/tantivy/issues/1943) [#1953](https://github.com/quickwit-oss/tantivy/issues/1953) (@trinity-1686a)
  - Use DeltaReader directly to implement Dictionnary::ord_to_term [#1928](https://github.com/quickwit-oss/tantivy/issues/1928) (@trinity-1686a)
  - Use DeltaReader directly to implement Dictionnary::term_ord [#1925](https://github.com/quickwit-oss/tantivy/issues/1925) (@trinity-1686a)
- Add separate tokenizer manager for fast fields [#2019](https://github.com/quickwit-oss/tantivy/issues/2019) (@PSeitz)
- Make construction of LevenshteinAutomatonBuilder for FuzzyTermQuery instances lazy. [#1756](https://github.com/quickwit-oss/tantivy/issues/1756) (@adamreichold)
- Added support for madvise when opening an mmapped Index [#2036](https://github.com/quickwit-oss/tantivy/issues/2036) (@fulmicoton)
- Rename `DatePrecision` to `DateTimePrecision` [#2051](https://github.com/quickwit-oss/tantivy/issues/2051) (@guilload)
- Query Parser
  - Quotation mark can now be used for phrase queries. [#2050](https://github.com/quickwit-oss/tantivy/issues/2050) (@fulmicoton)
  - PhrasePrefixQuery is supported in the query parser via: `field:"phrase ter"*` [#2044](https://github.com/quickwit-oss/tantivy/issues/2044) (@adamreichold)
- Docs
  - Update examples for literate docs [#1880](https://github.com/quickwit-oss/tantivy/issues/1880) (@PSeitz)
  - Add ip field example [#1775](https://github.com/quickwit-oss/tantivy/issues/1775) (@PSeitz)
  - Fix doc store cache documentation [#1821](https://github.com/quickwit-oss/tantivy/issues/1821) (@PSeitz)
  - Fix BooleanQuery documentation [#1999](https://github.com/quickwit-oss/tantivy/issues/1999) (@RT_Enzyme)
  - Update comments in the faceted search example [#1737](https://github.com/quickwit-oss/tantivy/issues/1737) (@DawChihLiou)

Tantivy 0.19
================================
#### Bugfixes
33  Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.19.0"
version = "0.20.2"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]

@@ -12,6 +12,7 @@ readme = "README.md"
keywords = ["search", "information", "retrieval"]
edition = "2021"
rust-version = "1.62"
exclude = ["benches/*.json", "benches/*.txt"]

[dependencies]
oneshot = "0.1.5"

@@ -22,11 +23,9 @@ once_cell = "1.10.0"
regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
aho-corasick = "1.0"
tantivy-fst = "0.4.0"
memmap2 = { version = "0.6.0", optional = true }
lz4_flex = { version = "0.10", default-features = false, features = ["checked-decode"], optional = true }
brotli = { version = "3.3.4", optional = true }
memmap2 = { version = "0.7.1", optional = true }
lz4_flex = { version = "0.11", default-features = false, optional = true }
zstd = { version = "0.12", optional = true, default-features = false }
snap = { version = "1.0.5", optional = true }
tempfile = { version = "3.3.0", optional = true }
log = "0.4.16"
serde = { version = "1.0.136", features = ["derive"] }

@@ -43,25 +42,25 @@ census = "0.4.0"
rustc-hash = "1.1.0"
thiserror = "1.0.30"
htmlescape = "0.3.1"
fail = "0.5.0"
fail = { version = "0.5.0", optional = true }
murmurhash32 = "0.3.0"
time = { version = "0.3.10", features = ["serde-well-known"] }
smallvec = "1.8.0"
rayon = "1.5.2"
lru = "0.10.0"
lru = "0.11.0"
fastdivide = "0.4.0"
itertools = "0.10.3"
itertools = "0.11.0"
measure_time = "0.8.2"
async-trait = "0.1.53"
arc-swap = "1.5.0"

columnar = { version="0.1", path="./columnar", package ="tantivy-columnar" }
sstable = { version="0.1", path="./sstable", package ="tantivy-sstable", optional = true }
stacker = { version="0.1", path="./stacker", package ="tantivy-stacker" }
query-grammar = { version= "0.19.0", path="./query-grammar", package = "tantivy-query-grammar" }
tantivy-bitpacker = { version= "0.3", path="./bitpacker" }
columnar = { version= "0.1", path="./columnar", package ="tantivy-columnar" }
sstable = { version= "0.1", path="./sstable", package ="tantivy-sstable", optional = true }
stacker = { version= "0.1", path="./stacker", package ="tantivy-stacker" }
query-grammar = { version= "0.20.0", path="./query-grammar", package = "tantivy-query-grammar" }
tantivy-bitpacker = { version= "0.4", path="./bitpacker" }
common = { version= "0.5", path = "./common/", package = "tantivy-common" }
tokenizer-api = { version="0.1", path="./tokenizer-api", package="tantivy-tokenizer-api" }
tokenizer-api = { version= "0.1", path="./tokenizer-api", package="tantivy-tokenizer-api" }
sketches-ddsketch = { version = "0.2.1", features = ["use_serde"] }
futures-util = { version = "0.3.28", optional = true }

@@ -77,7 +76,7 @@ proptest = "1.0.0"
criterion = "0.5"
test-log = "0.2.10"
env_logger = "0.10.0"
pprof = { version = "0.11.0", features = ["flamegraph", "criterion"] }
pprof = { git = "https://github.com/PSeitz/pprof-rs/", rev = "53af24b", features = ["flamegraph", "criterion"] } # temp fork that works with criterion 0.5
futures = "0.3.21"
paste = "1.0.11"
more-asserts = "0.3.1"

@@ -106,12 +105,10 @@ default = ["mmap", "stopwords", "lz4-compression"]
mmap = ["fs4", "tempfile", "memmap2"]
stopwords = []

brotli-compression = ["brotli"]
lz4-compression = ["lz4_flex"]
snappy-compression = ["snap"]
zstd-compression = ["zstd"]

failpoints = ["fail/failpoints"]
failpoints = ["fail", "fail/failpoints"]
unstable = [] # useful for benches.

quickwit = ["sstable", "futures-util"]
@@ -44,7 +44,7 @@ Details about the benchmark can be found at this [repository](https://github.com
- Single valued and multivalued u64, i64, and f64 fast fields (equivalent of doc values in Lucene)
- `&[u8]` fast fields
- Text, i64, u64, f64, dates, ip, bool, and hierarchical facet fields
- Compressed document store (LZ4, Zstd, None, Brotli, Snap)
- Compressed document store (LZ4, Zstd, None)
- Range queries
- Faceted search
- Configurable indexing (optional term frequency and position indexing)
21  RELEASE.md (new file)
@@ -0,0 +1,21 @@
# Release a new Tantivy Version

## Steps

1. Identify new packages in the workspace since the last release
2. Identify changed packages in the workspace since the last release
3. Bump the version in `Cargo.toml` and their dependents for all changed packages
4. Update the version of the root `Cargo.toml`
5. Publish versions starting with leaf nodes
6. Set a git tag with the new version

In conjunction with `cargo-release`, steps 1-4 can be handled like this (I'm not sure if the change detection works):
Set new packages to version 0.0.0.

Replace prev-tag-name:
```bash
cargo release --workspace --no-publish -v --prev-tag-name 0.19 --push-remote origin minor --no-tag --execute
```

`--no-tag` is required, or it will create tags for all the subpackages.
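For steps 5 and 6, a minimal manual sketch is shown below. The publish order and crate list are assumptions inferred from the workspace members visible in this compare (leaf crates first, the root `tantivy` crate last); they are not part of the documented procedure, so adjust them to the actual dependency graph before use.

```bash
# Hypothetical publish sequence: leaf crates first, then dependents, root crate last.
for crate in tantivy-common tantivy-bitpacker tantivy-tokenizer-api \
             tantivy-stacker tantivy-sstable tantivy-columnar \
             tantivy-query-grammar; do
    cargo publish -p "$crate"   # publish one workspace member at a time
done
cargo publish -p tantivy        # finally, the root crate

# Step 6: tag only the root crate version (tags look like bare versions, e.g. "0.19").
git tag 0.20.2
git push origin 0.20.2
```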
23  appveyor.yml (deleted)
@@ -1,23 +0,0 @@
# Appveyor configuration template for Rust using rustup for Rust installation
# https://github.com/starkat99/appveyor-rust

os: Visual Studio 2015
environment:
  matrix:
    - channel: stable
      target: x86_64-pc-windows-msvc

install:
  - appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe
  - rustup-init -yv --default-toolchain %channel% --default-host %target%
  - set PATH=%PATH%;%USERPROFILE%\.cargo\bin
  - if defined msys_bits set PATH=%PATH%;C:\msys64\mingw%msys_bits%\bin
  - rustc -vV
  - cargo -vV

build: false

test_script:
  - REM SET RUST_LOG=tantivy,test & cargo test --all --verbose --no-default-features --features lz4-compression --features mmap
  - REM SET RUST_LOG=tantivy,test & cargo test test_store --verbose --no-default-features --features lz4-compression --features snappy-compression --features brotli-compression --features mmap
  - REM SET RUST_BACKTRACE=1 & cargo build --examples
@@ -1,11 +1,13 @@
use criterion::{criterion_group, criterion_main, Criterion};
use tantivy::tokenizer::TokenizerManager;
use tantivy::tokenizer::{
    LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer, TokenizerManager,
};

const ALICE_TXT: &str = include_str!("alice.txt");

pub fn criterion_benchmark(c: &mut Criterion) {
    let tokenizer_manager = TokenizerManager::default();
    let tokenizer = tokenizer_manager.get("default").unwrap();
    let mut tokenizer = tokenizer_manager.get("default").unwrap();
    c.bench_function("default-tokenize-alice", |b| {
        b.iter(|| {
            let mut word_count = 0;

@@ -16,7 +18,26 @@ pub fn criterion_benchmark(c: &mut Criterion) {
            assert_eq!(word_count, 30_731);
        })
    });
    let mut dynamic_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .dynamic()
        .filter_dynamic(RemoveLongFilter::limit(40))
        .filter_dynamic(LowerCaser)
        .build();
    c.bench_function("dynamic-tokenize-alice", |b| {
        b.iter(|| {
            let mut word_count = 0;
            let mut token_stream = dynamic_analyzer.token_stream(ALICE_TXT);
            while token_stream.advance() {
                word_count += 1;
            }
            assert_eq!(word_count, 30_731);
        })
    });
}

criterion_group!(benches, criterion_benchmark);
criterion_group! {
    name = benches;
    config = Criterion::default().sample_size(200);
    targets = criterion_benchmark
}
criterion_main!(benches);
@@ -1,6 +1,6 @@
[package]
name = "tantivy-bitpacker"
version = "0.3.0"
version = "0.4.0"
edition = "2021"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"

@@ -1,6 +1,6 @@
use std::ops::RangeInclusive;

#[cfg(any(target_arch = "x86_64"))]
#[cfg(target_arch = "x86_64")]
mod avx2;

mod scalar;
89  cliff.toml (new file)
@@ -0,0 +1,89 @@
# configuration file for git-cliff{ pattern = "foo", replace = "bar"}
# see https://github.com/orhun/git-cliff#configuration-file

[changelog]
# changelog header
header = """
"""
# template for the changelog body
# https://tera.netlify.app/docs/#introduction
body = """
{% if version %}\
{{ version | trim_start_matches(pat="v") }} ({{ timestamp | date(format="%Y-%m-%d") }})
==================
{% else %}\
## [unreleased]
{% endif %}\
{% for commit in commits %}
- {% if commit.breaking %}[**breaking**] {% endif %}{{ commit.message | split(pat="\n") | first | trim | upper_first }}(@{{ commit.author.name }})\
{% endfor %}
"""
# remove the leading and trailing whitespace from the template
trim = true
# changelog footer
footer = """
"""

postprocessors = [
    { pattern = 'Paul Masurel', replace = "fulmicoton"}, # replace with github user
    { pattern = 'PSeitz', replace = "PSeitz"}, # replace with github user
    { pattern = 'Adam Reichold', replace = "adamreichold"}, # replace with github user
    { pattern = 'trinity-1686a', replace = "trinity-1686a"}, # replace with github user
    { pattern = 'Michael Kleen', replace = "mkleen"}, # replace with github user
    { pattern = 'Adrien Guillo', replace = "guilload"}, # replace with github user
    { pattern = 'François Massot', replace = "fmassot"}, # replace with github user
    { pattern = '', replace = ""}, # replace with github user
]

[git]
# parse the commits based on https://www.conventionalcommits.org
# This is required or commit.message contains the whole commit message and not just the title
conventional_commits = true
# filter out the commits that are not conventional
filter_unconventional = false
# process each line of a commit as an individual commit
split_commits = false
# regex for preprocessing the commit messages
commit_preprocessors = [
    { pattern = '\((\w+\s)?#([0-9]+)\)', replace = "[#${2}](https://github.com/quickwit-oss/tantivy/issues/${2})"}, # replace issue numbers
]
#link_parsers = [
#{ pattern = "#(\\d+)", href = "https://github.com/quickwit-oss/tantivy/pulls/$1"},
#]
# regex for parsing and grouping commits
commit_parsers = [
    { message = "^feat", group = "Features"},
    { message = "^fix", group = "Bug Fixes"},
    { message = "^doc", group = "Documentation"},
    { message = "^perf", group = "Performance"},
    { message = "^refactor", group = "Refactor"},
    { message = "^style", group = "Styling"},
    { message = "^test", group = "Testing"},
    { message = "^chore\\(release\\): prepare for", skip = true},
    { message = "(?i)clippy", skip = true},
    { message = "(?i)dependabot", skip = true},
    { message = "(?i)fmt", skip = true},
    { message = "(?i)bump", skip = true},
    { message = "(?i)readme", skip = true},
    { message = "(?i)comment", skip = true},
    { message = "(?i)spelling", skip = true},
    { message = "^chore", group = "Miscellaneous Tasks"},
    { body = ".*security", group = "Security"},
    { message = ".*", group = "Other", default_scope = "other"},
]
# protect breaking changes from being skipped due to matching a skipping commit_parser
protect_breaking_commits = false
# filter out the commits that are not matched by commit parsers
filter_commits = false
# glob pattern for matching git tags
tag_pattern = "v[0-9]*"
# regex for skipping tags
skip_tags = "v0.1.0-beta.1"
# regex for ignoring tags
ignore_tags = ""
# sort the tags topologically
topo_order = false
# sort the commits inside sections by oldest/newest order
sort_commits = "newest"
# limit the number of commits included in the changelog.
# limit_commits = 42
@@ -3,16 +3,20 @@ name = "tantivy-columnar"
version = "0.1.0"
edition = "2021"
license = "MIT"
homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"
description = "column oriented storage for tantivy"
categories = ["database-implementations", "data-structures", "compression"]

[dependencies]
itertools = "0.10.5"
itertools = "0.11.0"
fnv = "1.0.7"
fastdivide = "0.4.0"

stacker = { path = "../stacker", package="tantivy-stacker"}
sstable = { path = "../sstable", package = "tantivy-sstable" }
common = { path = "../common", package = "tantivy-common" }
tantivy-bitpacker = { version= "0.3", path = "../bitpacker/" }
stacker = { version= "0.1", path = "../stacker", package="tantivy-stacker"}
sstable = { version= "0.1", path = "../sstable", package = "tantivy-sstable" }
common = { version= "0.5", path = "../common", package = "tantivy-common" }
tantivy-bitpacker = { version= "0.4", path = "../bitpacker/" }
serde = "1.0.152"

[dev-dependencies]
@@ -168,8 +168,9 @@ mod tests {
    )
    .into();
    let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
    let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index
    else { panic!("Excpected a multivalued index") };
    let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
        panic!("Excpected a multivalued index")
    };
    let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
    assert_eq!(&start_indexes, &[0, 3, 5]);
}

@@ -200,8 +201,9 @@ mod tests {
    )
    .into();
    let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
    let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index
    else { panic!("Excpected a multivalued index") };
    let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
        panic!("Excpected a multivalued index")
    };
    let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
    assert_eq!(&start_indexes, &[0, 3, 5, 6]);
}

@@ -157,7 +157,13 @@ mod tests {
        Cardinality::Optional,
        &shuffle_merge_order,
    );
    let SerializableColumnIndex::Optional { non_null_row_ids, num_rows } = serializable_index else { panic!() };
    let SerializableColumnIndex::Optional {
        non_null_row_ids,
        num_rows,
    } = serializable_index
    else {
        panic!()
    };
    assert_eq!(num_rows, 2);
    let non_null_rows: Vec<RowId> = non_null_row_ids.boxed_iter().collect();
    assert_eq!(&non_null_rows, &[1]);
@@ -2,7 +2,7 @@

//! # `fastfield_codecs`
//!
//! - Columnar storage of data for tantivy [`Column`].
//! - Columnar storage of data for tantivy [`crate::Column`].
//! - Encode data in different codecs.
//! - Monotonically map values to u64/u128
@@ -83,7 +83,8 @@ impl ColumnValues for BitpackedReader {
        doc_id_range: Range<u32>,
        positions: &mut Vec<u32>,
    ) {
        let Some(transformed_range) = transform_range_before_linear_transformation(&self.stats, range)
        let Some(transformed_range) =
            transform_range_before_linear_transformation(&self.stats, range)
        else {
            positions.clear();
            return;
@@ -52,8 +52,8 @@ pub enum MergeRowOrder {
    /// Columnar tables are simply stacked one above the other.
    /// If the i-th columnar_readers has n_rows_i rows, then
    /// in the resulting columnar,
    /// rows [r0..n_row_0) contains the row of columnar_readers[0], in ordder
    /// rows [n_row_0..n_row_0 + n_row_1 contains the row of columnar_readers[1], in order.
    /// rows [r0..n_row_0) contains the row of `columnar_readers[0]`, in ordder
    /// rows [n_row_0..n_row_0 + n_row_1 contains the row of `columnar_readers[1]`, in order.
    /// ..
    /// No documents is deleted.
    Stack(StackMergeOrder),
@@ -244,7 +244,9 @@ fn test_merge_columnar_numbers() {
    assert_eq!(columnar_reader.num_columns(), 1);
    let cols = columnar_reader.read_columns("numbers").unwrap();
    let dynamic_column = cols[0].open().unwrap();
    let DynamicColumn::F64(vals) = dynamic_column else { panic!() };
    let DynamicColumn::F64(vals) = dynamic_column else {
        panic!()
    };
    assert_eq!(vals.get_cardinality(), Cardinality::Optional);
    assert_eq!(vals.first(0u32), Some(-1f64));
    assert_eq!(vals.first(1u32), None);

@@ -270,7 +272,9 @@ fn test_merge_columnar_texts() {
    assert_eq!(columnar_reader.num_columns(), 1);
    let cols = columnar_reader.read_columns("texts").unwrap();
    let dynamic_column = cols[0].open().unwrap();
    let DynamicColumn::Str(vals) = dynamic_column else { panic!() };
    let DynamicColumn::Str(vals) = dynamic_column else {
        panic!()
    };
    assert_eq!(vals.ords().get_cardinality(), Cardinality::Optional);

    let get_str_for_ord = |ord| {

@@ -317,7 +321,9 @@ fn test_merge_columnar_byte() {
    assert_eq!(columnar_reader.num_columns(), 1);
    let cols = columnar_reader.read_columns("bytes").unwrap();
    let dynamic_column = cols[0].open().unwrap();
    let DynamicColumn::Bytes(vals) = dynamic_column else { panic!() };
    let DynamicColumn::Bytes(vals) = dynamic_column else {
        panic!()
    };
    let get_bytes_for_ord = |ord| {
        let mut out = Vec::new();
        vals.ord_to_bytes(ord, &mut out).unwrap();

@@ -371,7 +377,9 @@ fn test_merge_columnar_byte_with_missing() {
    assert_eq!(columnar_reader.num_columns(), 2);
    let cols = columnar_reader.read_columns("col").unwrap();
    let dynamic_column = cols[0].open().unwrap();
    let DynamicColumn::Bytes(vals) = dynamic_column else { panic!() };
    let DynamicColumn::Bytes(vals) = dynamic_column else {
        panic!()
    };
    let get_bytes_for_ord = |ord| {
        let mut out = Vec::new();
        vals.ord_to_bytes(ord, &mut out).unwrap();

@@ -423,7 +431,9 @@ fn test_merge_columnar_different_types() {

    // numeric column
    let dynamic_column = cols[0].open().unwrap();
    let DynamicColumn::I64(vals) = dynamic_column else { panic!() };
    let DynamicColumn::I64(vals) = dynamic_column else {
        panic!()
    };
    assert_eq!(vals.get_cardinality(), Cardinality::Optional);
    assert_eq!(vals.values_for_doc(0).collect_vec(), vec![]);
    assert_eq!(vals.values_for_doc(1).collect_vec(), vec![]);

@@ -433,7 +443,9 @@ fn test_merge_columnar_different_types() {

    // text column
    let dynamic_column = cols[1].open().unwrap();
    let DynamicColumn::Str(vals) = dynamic_column else { panic!() };
    let DynamicColumn::Str(vals) = dynamic_column else {
        panic!()
    };
    assert_eq!(vals.ords().get_cardinality(), Cardinality::Optional);
    let get_str_for_ord = |ord| {
        let mut out = String::new();
@@ -98,9 +98,11 @@ impl ColumnarWriter {
    ///
    /// The sort applied is stable.
    pub fn sort_order(&self, sort_field: &str, num_docs: RowId, reversed: bool) -> Vec<u32> {
        let Some(numerical_col_writer) =
            self.numerical_field_hash_map.get::<NumericalColumnWriter>(sort_field.as_bytes()) else {
            return Vec::new();
        let Some(numerical_col_writer) = self
            .numerical_field_hash_map
            .get::<NumericalColumnWriter>(sort_field.as_bytes())
        else {
            return Vec::new();
        };
        let mut symbols_buffer = Vec::new();
        let mut values = Vec::new();
@@ -57,7 +57,9 @@ fn test_dataframe_writer_bool() {
    assert_eq!(cols[0].num_bytes(), 22);
    assert_eq!(cols[0].column_type(), ColumnType::Bool);
    let dyn_bool_col = cols[0].open().unwrap();
    let DynamicColumn::Bool(bool_col) = dyn_bool_col else { panic!(); };
    let DynamicColumn::Bool(bool_col) = dyn_bool_col else {
        panic!();
    };
    let vals: Vec<Option<bool>> = (0..5).map(|row_id| bool_col.first(row_id)).collect();
    assert_eq!(&vals, &[None, Some(false), None, Some(true), None,]);
}

@@ -79,7 +81,9 @@ fn test_dataframe_writer_u64_multivalued() {
    assert_eq!(cols.len(), 1);
    assert_eq!(cols[0].num_bytes(), 29);
    let dyn_i64_col = cols[0].open().unwrap();
    let DynamicColumn::I64(divisor_col) = dyn_i64_col else { panic!(); };
    let DynamicColumn::I64(divisor_col) = dyn_i64_col else {
        panic!();
    };
    assert_eq!(
        divisor_col.get_cardinality(),
        crate::Cardinality::Multivalued

@@ -101,7 +105,9 @@ fn test_dataframe_writer_ip_addr() {
    assert_eq!(cols[0].num_bytes(), 42);
    assert_eq!(cols[0].column_type(), ColumnType::IpAddr);
    let dyn_bool_col = cols[0].open().unwrap();
    let DynamicColumn::IpAddr(ip_col) = dyn_bool_col else { panic!(); };
    let DynamicColumn::IpAddr(ip_col) = dyn_bool_col else {
        panic!();
    };
    let vals: Vec<Option<Ipv6Addr>> = (0..5).map(|row_id| ip_col.first(row_id)).collect();
    assert_eq!(
        &vals,

@@ -134,7 +140,9 @@ fn test_dataframe_writer_numerical() {
    // - null footer 6 bytes
    assert_eq!(cols[0].num_bytes(), 33);
    let column = cols[0].open().unwrap();
    let DynamicColumn::I64(column_i64) = column else { panic!(); };
    let DynamicColumn::I64(column_i64) = column else {
        panic!();
    };
    assert_eq!(column_i64.index.get_cardinality(), Cardinality::Optional);
    assert_eq!(column_i64.first(0), None);
    assert_eq!(column_i64.first(1), Some(12i64));

@@ -198,7 +206,9 @@ fn test_dictionary_encoded_str() {
    assert_eq!(columnar_reader.num_columns(), 2);
    let col_handles = columnar_reader.read_columns("my.column").unwrap();
    assert_eq!(col_handles.len(), 1);
    let DynamicColumn::Str(str_col) = col_handles[0].open().unwrap() else { panic!(); };
    let DynamicColumn::Str(str_col) = col_handles[0].open().unwrap() else {
        panic!();
    };
    let index: Vec<Option<u64>> = (0..5).map(|row_id| str_col.ords().first(row_id)).collect();
    assert_eq!(index, &[None, Some(0), None, Some(2), Some(1)]);
    assert_eq!(str_col.num_rows(), 5);

@@ -230,7 +240,9 @@ fn test_dictionary_encoded_bytes() {
    assert_eq!(columnar_reader.num_columns(), 2);
    let col_handles = columnar_reader.read_columns("my.column").unwrap();
    assert_eq!(col_handles.len(), 1);
    let DynamicColumn::Bytes(bytes_col) = col_handles[0].open().unwrap() else { panic!(); };
    let DynamicColumn::Bytes(bytes_col) = col_handles[0].open().unwrap() else {
        panic!();
    };
    let index: Vec<Option<u64>> = (0..5)
        .map(|row_id| bytes_col.ords().first(row_id))
        .collect();

@@ -533,28 +545,36 @@ trait AssertEqualToColumnValue {

impl AssertEqualToColumnValue for bool {
    fn assert_equal_to_column_value(&self, column_value: &ColumnValue) {
        let ColumnValue::Bool(val) = column_value else { panic!() };
        let ColumnValue::Bool(val) = column_value else {
            panic!()
        };
        assert_eq!(self, val);
    }
}

impl AssertEqualToColumnValue for Ipv6Addr {
    fn assert_equal_to_column_value(&self, column_value: &ColumnValue) {
        let ColumnValue::IpAddr(val) = column_value else { panic!() };
        let ColumnValue::IpAddr(val) = column_value else {
            panic!()
        };
        assert_eq!(self, val);
    }
}

impl<T: Coerce + PartialEq + Debug + Into<NumericalValue>> AssertEqualToColumnValue for T {
    fn assert_equal_to_column_value(&self, column_value: &ColumnValue) {
        let ColumnValue::Numerical(num) = column_value else { panic!() };
        let ColumnValue::Numerical(num) = column_value else {
            panic!()
        };
        assert_eq!(self, &T::coerce(*num));
    }
}

impl AssertEqualToColumnValue for DateTime {
    fn assert_equal_to_column_value(&self, column_value: &ColumnValue) {
        let ColumnValue::DateTime(dt) = column_value else { panic!() };
        let ColumnValue::DateTime(dt) = column_value else {
            panic!()
        };
        assert_eq!(self, dt);
    }
}
@@ -15,21 +15,12 @@ use time::{OffsetDateTime, PrimitiveDateTime, UtcOffset};
pub enum DateTimePrecision {
    /// Second precision.
    #[default]
    Second,
    /// Millisecond precision.
    Millisecond,
    /// Microsecond precision.
    Microsecond,
    /// Nanosecond precision.
    Nanosecond,
    // TODO: Remove deprecated variants after 2 releases.
    #[deprecated(since = "0.20.0", note = "Use `Second` instead")]
    Seconds,
    #[deprecated(since = "0.20.0", note = "Use `Millisecond` instead")]
    /// Millisecond precision.
    Milliseconds,
    #[deprecated(since = "0.20.0", note = "Use `Microsecond` instead")]
    /// Microsecond precision.
    Microseconds,
    #[deprecated(since = "0.20.0", note = "Use `Nanosecond` instead")]
    /// Nanosecond precision.
    Nanoseconds,
}

@@ -156,16 +147,10 @@ impl DateTime {
    /// Truncates the microseconds value to the corresponding precision.
    pub fn truncate(self, precision: DateTimePrecision) -> Self {
        let truncated_timestamp_micros = match precision {
            DateTimePrecision::Second | DateTimePrecision::Seconds => {
                (self.timestamp_nanos / 1_000_000_000) * 1_000_000_000
            }
            DateTimePrecision::Millisecond | DateTimePrecision::Milliseconds => {
                (self.timestamp_nanos / 1_000_000) * 1_000_000
            }
            DateTimePrecision::Microsecond | DateTimePrecision::Microseconds => {
                (self.timestamp_nanos / 1_000) * 1_000
            }
            DateTimePrecision::Nanosecond | DateTimePrecision::Nanoseconds => self.timestamp_nanos,
            DateTimePrecision::Seconds => (self.timestamp_nanos / 1_000_000_000) * 1_000_000_000,
            DateTimePrecision::Milliseconds => (self.timestamp_nanos / 1_000_000) * 1_000_000,
            DateTimePrecision::Microseconds => (self.timestamp_nanos / 1_000) * 1_000,
            DateTimePrecision::Nanoseconds => self.timestamp_nanos,
        };
        Self {
            timestamp_nanos: truncated_timestamp_micros,

@@ -174,7 +159,7 @@
}

impl fmt::Debug for DateTime {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let utc_rfc3339 = self.into_utc().format(&Rfc3339).map_err(|_| fmt::Error)?;
        f.write_str(&utc_rfc3339)
    }
@@ -37,7 +37,7 @@ fn main() -> tantivy::Result<()> {
            .set_index_option(IndexRecordOption::WithFreqs)
            .set_tokenizer("raw"),
    )
    .set_fast(None)
    .set_fast("default")
    .set_stored();
    schema_builder.add_text_field("category", text_fieldtype);
    schema_builder.add_f64_field("stock", FAST);

@@ -53,7 +53,7 @@ fn main() -> tantivy::Result<()> {
    // this will store tokens of 3 characters each
    index
        .tokenizers()
        .register("ngram3", NgramTokenizer::new(3, 3, false));
        .register("ngram3", NgramTokenizer::new(3, 3, false).unwrap());

    // To insert document we need an index writer.
    // There must be only one writer at a time.

@@ -13,7 +13,7 @@ fn main() -> tantivy::Result<()> {
    let opts = DateOptions::from(INDEXED)
        .set_stored()
        .set_fast()
        .set_precision(tantivy::DateTimePrecision::Second);
        .set_precision(tantivy::DateTimePrecision::Seconds);
    // Add `occurred_at` date field type
    let occurred_at = schema_builder.add_date_field("occurred_at", opts);
    let event_type = schema_builder.add_text_field("event", STRING | STORED);
79  examples/phrase_prefix_search.rs (new file)
@@ -0,0 +1,79 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::{doc, Index, ReloadPolicy, Result};
use tempfile::TempDir;

fn main() -> Result<()> {
    let index_path = TempDir::new()?;

    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("title", TEXT | STORED);
    schema_builder.add_text_field("body", TEXT);
    let schema = schema_builder.build();

    let title = schema.get_field("title").unwrap();
    let body = schema.get_field("body").unwrap();

    let index = Index::create_in_dir(&index_path, schema)?;

    let mut index_writer = index.writer(50_000_000)?;

    index_writer.add_document(doc!(
        title => "The Old Man and the Sea",
        body => "He was an old man who fished alone in a skiff in the Gulf Stream and he had gone \
                 eighty-four days now without taking a fish.",
    ))?;

    index_writer.add_document(doc!(
        title => "Of Mice and Men",
        body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
                 bank and runs deep and green. The water is warm too, for it has slipped twinkling \
                 over the yellow sands in the sunlight before reaching the narrow pool. On one \
                 side of the river the golden foothill slopes curve up to the strong and rocky \
                 Gabilan Mountains, but on the valley side the water is lined with trees—willows \
                 fresh and green with every spring, carrying in their lower leaf junctures the \
                 debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
                 limbs and branches that arch over the pool"
    ))?;

    // Multivalued field just need to be repeated.
    index_writer.add_document(doc!(
        title => "Frankenstein",
        title => "The Modern Prometheus",
        body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
                 enterprise which you have regarded with such evil forebodings. I arrived here \
                 yesterday, and my first task is to assure my dear sister of my welfare and \
                 increasing confidence in the success of my undertaking."
    ))?;

    index_writer.commit()?;

    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::OnCommit)
        .try_into()?;

    let searcher = reader.searcher();

    let query_parser = QueryParser::for_index(&index, vec![title, body]);
    // This will match documents containing the phrase "in the"
    // followed by some word starting with "su",
    // i.e. it will match "in the sunlight" and "in the success",
    // but not "in the Gulf Stream".
    let query = query_parser.parse_query("\"in the su\"*")?;

    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
    let mut titles = top_docs
        .into_iter()
        .map(|(_score, doc_address)| {
            let doc = searcher.doc(doc_address)?;
            let title = doc.get_first(title).unwrap().as_text().unwrap().to_owned();
            Ok(title)
        })
        .collect::<Result<Vec<_>>>()?;
    titles.sort_unstable();
    assert_eq!(titles, ["Frankenstein", "Of Mice and Men"]);

    Ok(())
}
@@ -17,7 +17,8 @@ use tantivy::{doc, Index, ReloadPolicy};
use tempfile::TempDir;

fn pre_tokenize_text(text: &str) -> Vec<Token> {
    let mut token_stream = SimpleTokenizer.token_stream(text);
    let mut tokenizer = SimpleTokenizer::default();
    let mut token_stream = tokenizer.token_stream(text);
    let mut tokens = vec![];
    while token_stream.advance() {
        tokens.push(token_stream.token().clone());

@@ -50,7 +50,7 @@ fn main() -> tantivy::Result<()> {

    // This tokenizer lowers all of the text (to help with stop word matching)
    // then removes all instances of `the` and `and` from the corpus
    let tokenizer = TextAnalyzer::builder(SimpleTokenizer)
    let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(LowerCaser)
        .filter(StopWordFilter::remove(vec![
            "the".to_string(),
@@ -6,12 +6,14 @@ use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, TEXT};
use tantivy::{
    doc, DocAddress, DocId, Index, IndexReader, Opstamp, Searcher, SearcherGeneration, SegmentId,
    SegmentReader, Warmer,
    doc, DocAddress, DocId, Index, Opstamp, Searcher, SearcherGeneration, SegmentId, SegmentReader,
    Warmer,
};

// This example shows how warmers can be used to
// load a values from an external sources using the Warmer API.
// load values from an external sources and
// tie their lifecycle to that of the index segments
// using the Warmer API.
//
// In this example, we assume an e-commerce search engine.

@@ -23,9 +25,11 @@ pub trait PriceFetcher: Send + Sync + 'static {
    fn fetch_prices(&self, product_ids: &[ProductId]) -> Vec<Price>;
}

type SegmentKey = (SegmentId, Option<Opstamp>);

struct DynamicPriceColumn {
    field: String,
    price_cache: RwLock<HashMap<(SegmentId, Option<Opstamp>), Arc<Vec<Price>>>>,
    price_cache: RwLock<HashMap<SegmentKey, Arc<Vec<Price>>>>,
    price_fetcher: Box<dyn PriceFetcher>,
}

@@ -46,7 +50,6 @@ impl DynamicPriceColumn {
impl Warmer for DynamicPriceColumn {
    fn warm(&self, searcher: &Searcher) -> tantivy::Result<()> {
        for segment in searcher.segment_readers() {
            let key = (segment.segment_id(), segment.delete_opstamp());
            let product_id_reader = segment
                .fast_fields()
                .u64(&self.field)?

@@ -55,37 +58,40 @@ impl Warmer for DynamicPriceColumn {
                .doc_ids_alive()
                .map(|doc| product_id_reader.get_val(doc))
                .collect();
            let mut prices_it = self.price_fetcher.fetch_prices(&product_ids).into_iter();
            let mut price_vals: Vec<Price> = Vec::new();
            for doc in 0..segment.max_doc() {
                if segment.is_deleted(doc) {
                    price_vals.push(0);
                } else {
                    price_vals.push(prices_it.next().unwrap())
                }
            }

            let mut prices = self.price_fetcher.fetch_prices(&product_ids).into_iter();

            let prices: Vec<Price> = (0..segment.max_doc())
                .map(|doc| {
                    if !segment.is_deleted(doc) {
                        prices.next().unwrap()
                    } else {
                        0
                    }
                })
                .collect();

            let key = (segment.segment_id(), segment.delete_opstamp());
            self.price_cache
                .write()
                .unwrap()
                .insert(key, Arc::new(price_vals));
                .insert(key, Arc::new(prices));
        }

        Ok(())
    }

    fn garbage_collect(&self, live_generations: &[&SearcherGeneration]) {
        let live_segment_id_and_delete_ops: HashSet<(SegmentId, Option<Opstamp>)> =
            live_generations
                .iter()
                .flat_map(|gen| gen.segments())
                .map(|(&segment_id, &opstamp)| (segment_id, opstamp))
                .collect();
        let mut price_cache_wrt = self.price_cache.write().unwrap();
        // let price_cache = std::mem::take(&mut *price_cache_wrt);
        // Drain would be nicer here.
        *price_cache_wrt = std::mem::take(&mut *price_cache_wrt)
            .into_iter()
            .filter(|(seg_id_and_op, _)| !live_segment_id_and_delete_ops.contains(seg_id_and_op))
        let live_keys: HashSet<SegmentKey> = live_generations
            .iter()
            .flat_map(|gen| gen.segments())
            .map(|(&segment_id, &opstamp)| (segment_id, opstamp))
            .collect();

        self.price_cache
            .write()
            .unwrap()
            .retain(|key, _| live_keys.contains(key));
    }
}

@@ -100,17 +106,17 @@ pub struct ExternalPriceTable {

impl ExternalPriceTable {
    pub fn update_price(&self, product_id: ProductId, price: Price) {
        let mut prices_wrt = self.prices.write().unwrap();
        prices_wrt.insert(product_id, price);
        self.prices.write().unwrap().insert(product_id, price);
    }
}

impl PriceFetcher for ExternalPriceTable {
    fn fetch_prices(&self, product_ids: &[ProductId]) -> Vec<Price> {
        let prices_read = self.prices.read().unwrap();
        let prices = self.prices.read().unwrap();

        product_ids
            .iter()
            .map(|product_id| prices_read.get(product_id).cloned().unwrap_or(0))
            .map(|product_id| prices.get(product_id).cloned().unwrap_or(0))
            .collect()
    }
}

@@ -143,11 +149,8 @@ fn main() -> tantivy::Result<()> {
    writer.add_document(doc!(product_id=>SNEAKERS, text=>"uber sweet sneakers"))?;
    writer.commit()?;

    let warmers: Vec<Weak<dyn Warmer>> = vec![Arc::downgrade(
        &(price_dynamic_column.clone() as Arc<dyn Warmer>),
    )];
    let reader: IndexReader = index.reader_builder().warmers(warmers).try_into()?;
    reader.reload()?;
    let warmers = vec![Arc::downgrade(&price_dynamic_column) as Weak<dyn Warmer>];
    let reader = index.reader_builder().warmers(warmers).try_into()?;

    let query_parser = QueryParser::for_index(&index, vec![text]);
    let query = query_parser.parse_query("cooking")?;
@@ -1,6 +1,6 @@
[package]
name = "tantivy-query-grammar"
version = "0.19.0"
version = "0.20.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]

@@ -162,14 +162,22 @@ fn term_val<'a>() -> impl Parser<&'a str, Output = (Delimiter, String)> {
}

fn term_query<'a>() -> impl Parser<&'a str, Output = UserInputLiteral> {
    (field_name(), term_val(), slop_val()).map(|(field_name, (delimiter, phrase), slop)| {
        UserInputLiteral {
    (field_name(), term_val(), slop_or_prefix_val()).map(
        |(field_name, (delimiter, phrase), (slop, prefix))| UserInputLiteral {
            field_name: Some(field_name),
            phrase,
            delimiter,
            slop,
        }
    })
            prefix,
        },
    )
}

fn slop_or_prefix_val<'a>() -> impl Parser<&'a str, Output = (u32, bool)> {
    let prefix_val = char('*').map(|_ast| (0, true));
    let slop_val = slop_val().map(|slop| (slop, false));

    prefix_val.or(slop_val)
}

fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> {

@@ -186,11 +194,14 @@ fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> {

fn literal<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
    let term_default_field =
        (term_val(), slop_val()).map(|((delimiter, phrase), slop)| UserInputLiteral {
            field_name: None,
            phrase,
            delimiter,
            slop,
        (term_val(), slop_or_prefix_val()).map(|((delimiter, phrase), (slop, prefix))| {
            UserInputLiteral {
                field_name: None,
                phrase,
                delimiter,
                slop,
                prefix,
            }
        });

    attempt(term_query())

@@ -872,6 +883,16 @@ mod test {
        test_parse_query_to_ast_helper("\"a b\"~300^2", "(\"a b\"~300)^2");
    }

    #[test]
    fn test_phrase_prefix() {
        test_parse_query_to_ast_helper("\"a b\"*", "\"a b\"*");
        test_parse_query_to_ast_helper("\"a\"*", "\"a\"*");
        test_parse_query_to_ast_helper("\"\"*", "\"\"*");
        test_parse_query_to_ast_helper("foo:\"a b\"*", "\"foo\":\"a b\"*");
        test_parse_query_to_ast_helper("foo:\"a\"*", "\"foo\":\"a\"*");
        test_parse_query_to_ast_helper("foo:\"\"*", "\"foo\":\"\"*");
    }

    #[test]
    fn test_not_queries_are_consistent() {
        test_parse_query_to_ast_helper("tata -toto", "(*tata -toto)");

@@ -66,6 +66,7 @@ pub struct UserInputLiteral {
    pub phrase: String,
    pub delimiter: Delimiter,
    pub slop: u32,
    pub prefix: bool,
}

impl fmt::Debug for UserInputLiteral {

@@ -86,6 +87,8 @@ impl fmt::Debug for UserInputLiteral {
        }
        if self.slop > 0 {
            write!(formatter, "~{}", self.slop)?;
        } else if self.prefix {
            write!(formatter, "*")?;
        }
        Ok(())
    }
@@ -60,6 +60,8 @@ impl AggregationLimits {
    /// *bucket_limit*
    /// Limits the maximum number of buckets returned from an aggregation request.
    /// bucket_limit will default to `DEFAULT_BUCKET_LIMIT` (65000)
    ///
    /// Note: The returned instance contains a Arc shared counter to track memory consumption.
    pub fn new(memory_limit: Option<u64>, bucket_limit: Option<u32>) -> Self {
        Self {
            memory_consumption: Default::default(),

@@ -74,14 +74,14 @@ impl AggregationWithAccessor {
            ColumnType::I64,
            ColumnType::U64,
            ColumnType::F64,
            ColumnType::Bytes,
            ColumnType::Str,
            // ColumnType::Bytes Unsupported
            // ColumnType::Bool Unsupported
            // ColumnType::IpAddr Unsupported
            // ColumnType::DateTime Unsupported
        ];
        let mut columns =
            get_all_ff_reader(reader, field_name, Some(&allowed_column_types))?;
            get_all_ff_reader_or_empty(reader, field_name, Some(&allowed_column_types))?;
        let first = columns.pop().unwrap();
        accessor2 = columns.pop();
        first

@@ -177,7 +177,7 @@ fn get_ff_reader(
/// Get all fast field reader or empty as default.
///
/// Is guaranteed to return at least one column.
fn get_all_ff_reader(
fn get_all_ff_reader_or_empty(
    reader: &SegmentReader,
    field_name: &str,
    allowed_column_types: Option<&[ColumnType]>,
@@ -604,6 +604,42 @@ mod tests {
        });
        assert_eq!(res, expected_res);
    }

    {
        // 1day + hard_bounds as Rfc3339
        let elasticsearch_compatible_json = json!(
        {
            "sales_over_time": {
                "date_histogram": {
                    "field": "date",
                    "fixed_interval": "1d",
                    "hard_bounds": {
                        "min": "2015-01-02T00:00:00Z",
                        "max": "2015-01-02T12:00:00Z"
                    }
                }
            }
        }
        );

        let agg_req: Aggregations = serde_json::from_str(
            &serde_json::to_string(&elasticsearch_compatible_json).unwrap(),
        )
        .unwrap();
        let res = exec_request(agg_req, &index).unwrap();
        let expected_res = json!({
            "sales_over_time" : {
                "buckets": [
                    {
                        "doc_count": 1,
                        "key": 1420156800000.0,
                        "key_as_string": "2015-01-02T00:00:00Z"
                    }
                ]
            }
        });
        assert_eq!(res, expected_res);
    }
}
#[test]
fn histogram_test_invalid_req() {
@@ -177,11 +177,38 @@ impl HistogramAggregation {
#[derive(Clone, Copy, Debug, PartialEq, Serialize, Deserialize)]
pub struct HistogramBounds {
    /// The lower bounds.
    #[serde(deserialize_with = "deserialize_date_or_num")]
    pub min: f64,
    /// The upper bounds.
    #[serde(deserialize_with = "deserialize_date_or_num")]
    pub max: f64,
}

fn deserialize_date_or_num<'de, D>(deserializer: D) -> Result<f64, D::Error>
where D: serde::Deserializer<'de> {
    let value: serde_json::Value = Deserialize::deserialize(deserializer)?;

    // Check if the value is a string representing an Rfc3339 formatted date
    if let serde_json::Value::String(date_str) = value {
        // Parse the Rfc3339 formatted date string into a DateTime<Utc>
        let date =
            time::OffsetDateTime::parse(&date_str, &time::format_description::well_known::Rfc3339)
                .map_err(|_| serde::de::Error::custom("Invalid Rfc3339 formatted date"))?;

        let milliseconds: i64 = (date.unix_timestamp_nanos() / 1_000_000)
            .try_into()
            .map_err(|_| serde::de::Error::custom("{date_str} out of allowed range"))?;

        // Return the milliseconds as f64
        Ok(milliseconds as f64)
    } else {
        // The value is not a string, so assume it's a regular f64 number
        value
            .as_f64()
            .ok_or_else(|| serde::de::Error::custom("Invalid number format"))
    }
}

impl Display for HistogramBounds {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_fmt(format_args!("[{},{}]", self.min, self.max))
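The `deserialize_date_or_num` helper above lets histogram bounds (e.g. `hard_bounds`) accept either a plain number of milliseconds or an RFC 3339 date string. A self-contained sketch of the same pattern, with a stand-in struct and made-up values (assumes serde, serde_json and the time crate with parsing enabled):

// Stand-in for HistogramBounds, illustrating the "date string or number" pattern.
use serde::{Deserialize, Deserializer};

#[derive(Deserialize)]
struct Bounds {
    #[serde(deserialize_with = "date_or_num")]
    min: f64,
    #[serde(deserialize_with = "date_or_num")]
    max: f64,
}

fn date_or_num<'de, D: Deserializer<'de>>(deserializer: D) -> Result<f64, D::Error> {
    let value: serde_json::Value = Deserialize::deserialize(deserializer)?;
    if let serde_json::Value::String(date_str) = value {
        // RFC 3339 date string -> milliseconds since the unix epoch.
        let date = time::OffsetDateTime::parse(
            &date_str,
            &time::format_description::well_known::Rfc3339,
        )
        .map_err(serde::de::Error::custom)?;
        Ok((date.unix_timestamp_nanos() / 1_000_000) as f64)
    } else {
        value
            .as_f64()
            .ok_or_else(|| serde::de::Error::custom("expected a number or an RFC 3339 string"))
    }
}

fn main() {
    // Both spellings deserialize to the same unit: milliseconds since the epoch.
    let bounds: Bounds =
        serde_json::from_str(r#"{"min": "2015-01-02T00:00:00Z", "max": 1420200000000.0}"#).unwrap();
    assert_eq!(bounds.min, 1420156800000.0);
    assert_eq!(bounds.max, 1420200000000.0);
}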
@@ -15,6 +15,12 @@
 //! Results of final buckets are [`BucketResult`](super::agg_result::BucketResult).
 //! Results of intermediate buckets are
 //! [`IntermediateBucketResult`](super::intermediate_agg_result::IntermediateBucketResult)
+//!
+//! ## Supported Bucket Aggregations
+//! - [Histogram](HistogramAggregation)
+//! - [DateHistogram](DateHistogramAggregationReq)
+//! - [Range](RangeAggregation)
+//! - [Terms](TermsAggregation)
 
 mod histogram;
 mod range;
@@ -428,6 +428,12 @@ impl SegmentTermCollector {
|
||||
field_type: ColumnType,
|
||||
accessor_idx: usize,
|
||||
) -> crate::Result<Self> {
|
||||
if field_type == ColumnType::Bytes || field_type == ColumnType::Bool {
|
||||
return Err(TantivyError::InvalidArgument(format!(
|
||||
"terms aggregation is not supported for column type {:?}",
|
||||
field_type
|
||||
)));
|
||||
}
|
||||
let term_buckets = TermBuckets::default();
|
||||
|
||||
if let Some(custom_order) = req.order.as_ref() {
|
||||
@@ -1287,13 +1293,13 @@ mod tests {
|
||||
// searching for terma, but min_doc_count will return all terms
|
||||
let res = exec_request_with_query(agg_req, &index, Some(("string2", "hit")))?;
|
||||
|
||||
assert_eq!(res["my_texts"]["buckets"][0]["key"], "A");
|
||||
assert_eq!(res["my_texts"]["buckets"][0]["key"], "a");
|
||||
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 2);
|
||||
assert_eq!(
|
||||
res["my_texts"]["buckets"][0]["elhistogram"]["buckets"],
|
||||
json!([{ "doc_count": 1, "key": 1.0 }, { "doc_count": 1, "key": 2.0 } ])
|
||||
);
|
||||
assert_eq!(res["my_texts"]["buckets"][1]["key"], "B");
|
||||
assert_eq!(res["my_texts"]["buckets"][1]["key"], "b");
|
||||
assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 1);
|
||||
assert_eq!(
|
||||
res["my_texts"]["buckets"][1]["elhistogram"]["buckets"],
|
||||
@@ -1415,10 +1421,10 @@ mod tests {
|
||||
let res = exec_request_with_query(agg_req, &index, None).unwrap();
|
||||
println!("{}", serde_json::to_string_pretty(&res).unwrap());
|
||||
|
||||
assert_eq!(res["my_texts"]["buckets"][0]["key"], "Hallo Hallo");
|
||||
assert_eq!(res["my_texts"]["buckets"][0]["key"], "hallo hallo");
|
||||
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 1);
|
||||
|
||||
assert_eq!(res["my_texts"]["buckets"][1]["key"], "Hello Hello");
|
||||
assert_eq!(res["my_texts"]["buckets"][1]["key"], "hello hello");
|
||||
assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 1);
|
||||
|
||||
Ok(())
|
||||
@@ -1500,4 +1506,41 @@ mod tests {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn terms_aggregation_bytes() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let bytes_field = schema_builder.add_bytes_field("bytes", FAST);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
|
||||
index_writer.set_merge_policy(Box::new(NoMergePolicy));
|
||||
index_writer.add_document(doc!(
|
||||
bytes_field => vec![1,2,3],
|
||||
))?;
|
||||
index_writer.commit()?;
|
||||
}
|
||||
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"my_texts": {
|
||||
"terms": {
|
||||
"field": "bytes"
|
||||
},
|
||||
}
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let res = exec_request_with_query(agg_req, &index, None)?;
|
||||
|
||||
// TODO: Returning an error would be better instead of an empty result, since this is not a
|
||||
// JSON field
|
||||
assert_eq!(
|
||||
res["my_texts"]["buckets"][0]["key"],
|
||||
serde_json::Value::Null
|
||||
);
|
||||
assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
|
||||
assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,6 +6,15 @@
 //! Some aggregations output a single numeric metric (e.g. Average) and are called
 //! single-value numeric metrics aggregation, others generate multiple metrics (e.g. Stats) and are
 //! called multi-value numeric metrics aggregation.
+//!
+//! ## Supported Metric Aggregations
+//! - [Average](AverageAggregation)
+//! - [Stats](StatsAggregation)
+//! - [Min](MinAggregation)
+//! - [Max](MaxAggregation)
+//! - [Sum](SumAggregation)
+//! - [Count](CountAggregation)
+//! - [Percentiles](PercentilesAggregationReq)
 
 mod average;
 mod count;
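For reference, the metric aggregations listed above are requested with the usual Elasticsearch-compatible JSON. A small sketch combining two of them (the "price" field name is an illustrative assumption):

// Sketch: build a metric aggregation request the same way the tests in this diff do.
use tantivy::aggregation::agg_req::Aggregations;

fn example_metric_request() -> Aggregations {
    serde_json::from_value(serde_json::json!({
        "avg_price": { "avg": { "field": "price" } },
        "price_percentiles": { "percentiles": { "field": "price" } }
    }))
    .expect("valid aggregation request")
}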
@@ -411,7 +411,7 @@ mod tests {
|
||||
.set_index_option(IndexRecordOption::Basic)
|
||||
.set_fieldnorms(false),
|
||||
)
|
||||
.set_fast(None)
|
||||
.set_fast("default")
|
||||
.set_stored();
|
||||
let text_field = schema_builder.add_text_field("text", text_fieldtype.clone());
|
||||
let text_field_id = schema_builder.add_text_field("text_id", text_fieldtype);
|
||||
@@ -466,7 +466,7 @@ mod tests {
|
||||
.set_indexing_options(
|
||||
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
|
||||
)
|
||||
.set_fast(None)
|
||||
.set_fast("default")
|
||||
.set_stored();
|
||||
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
||||
let date_field = schema_builder.add_date_field("date", FAST);
|
||||
|
||||
@@ -161,6 +161,21 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
||||
/// ]);
|
||||
/// }
|
||||
///
|
||||
/// {
|
||||
/// let mut facet_collector = FacetCollector::for_field("facet");
|
||||
/// facet_collector.add_facet("/");
|
||||
/// let facet_counts = searcher.search(&AllQuery, &facet_collector)?;
|
||||
///
|
||||
/// // This lists all of the facet counts
|
||||
/// let facets: Vec<(&Facet, u64)> = facet_counts
|
||||
/// .get("/")
|
||||
/// .collect();
|
||||
/// assert_eq!(facets, vec![
|
||||
/// (&Facet::from("/category"), 4),
|
||||
/// (&Facet::from("/lang"), 4)
|
||||
/// ]);
|
||||
/// }
|
||||
///
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// # assert!(example().is_ok());
|
||||
@@ -285,6 +300,9 @@ fn is_child_facet(parent_facet: &[u8], possible_child_facet: &[u8]) -> bool {
|
||||
if !possible_child_facet.starts_with(parent_facet) {
|
||||
return false;
|
||||
}
|
||||
if parent_facet.is_empty() {
|
||||
return true;
|
||||
}
|
||||
possible_child_facet.get(parent_facet.len()).copied() == Some(0u8)
|
||||
}
|
||||
|
||||
@@ -789,6 +807,15 @@ mod tests {
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn is_child_facet() {
|
||||
assert!(super::is_child_facet(&b"foo"[..], &b"foo\0bar"[..]));
|
||||
assert!(super::is_child_facet(&b""[..], &b"foo\0bar"[..]));
|
||||
assert!(super::is_child_facet(&b""[..], &b"foo"[..]));
|
||||
assert!(!super::is_child_facet(&b"foo\0bar"[..], &b"foo"[..]));
|
||||
assert!(!super::is_child_facet(&b"foo"[..], &b"foobar\0baz"[..]));
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
|
||||
@@ -6,32 +6,35 @@
|
||||
//
|
||||
// Of course, you can have a look at the tantivy's built-in collectors
|
||||
// such as the `CountCollector` for more examples.
|
||||
|
||||
// ---
|
||||
// Importing tantivy...
|
||||
use std::fmt::Debug;
|
||||
use std::marker::PhantomData;
|
||||
use std::sync::Arc;
|
||||
|
||||
use columnar::{ColumnValues, DynamicColumn, HasAssociatedColumnType};
|
||||
use columnar::{BytesColumn, Column, DynamicColumn, HasAssociatedColumnType};
|
||||
|
||||
use crate::collector::{Collector, SegmentCollector};
|
||||
use crate::schema::Field;
|
||||
use crate::{Score, SegmentReader, TantivyError};
|
||||
use crate::{DocId, Score, SegmentReader, TantivyError};
|
||||
|
||||
/// The `FilterCollector` filters docs using a fast field value and a predicate.
|
||||
/// Only the documents for which the predicate returned "true" will be passed on to the next
|
||||
/// collector.
|
||||
///
|
||||
/// Only the documents containing at least one value for which the predicate returns `true`
|
||||
/// will be passed on to the next collector.
|
||||
///
|
||||
/// In other words,
|
||||
/// - documents with no values are filtered out.
|
||||
/// - documents with several values are accepted if at least one value matches the predicate.
|
||||
///
|
||||
///
|
||||
/// ```rust
|
||||
/// use tantivy::collector::{TopDocs, FilterCollector};
|
||||
/// use tantivy::query::QueryParser;
|
||||
/// use tantivy::schema::{Schema, TEXT, INDEXED, FAST};
|
||||
/// use tantivy::schema::{Schema, TEXT, FAST};
|
||||
/// use tantivy::{doc, DocAddress, Index};
|
||||
///
|
||||
/// # fn main() -> tantivy::Result<()> {
|
||||
/// let mut schema_builder = Schema::builder();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let price = schema_builder.add_u64_field("price", INDEXED | FAST);
|
||||
/// let price = schema_builder.add_u64_field("price", FAST);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
///
|
||||
@@ -47,20 +50,24 @@ use crate::{Score, SegmentReader, TantivyError};
|
||||
///
|
||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
/// let query = query_parser.parse_query("diary")?;
|
||||
/// let no_filter_collector = FilterCollector::new(price, &|value: u64| value > 20_120u64, TopDocs::with_limit(2));
|
||||
/// let no_filter_collector = FilterCollector::new(price, |value: u64| value > 20_120u64, TopDocs::with_limit(2));
|
||||
/// let top_docs = searcher.search(&query, &no_filter_collector)?;
|
||||
///
|
||||
/// assert_eq!(top_docs.len(), 1);
|
||||
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
|
||||
///
|
||||
/// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new(price, &|value| value < 5u64, TopDocs::with_limit(2));
|
||||
/// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new(price, |value| value < 5u64, TopDocs::with_limit(2));
|
||||
/// let filtered_top_docs = searcher.search(&query, &filter_all_collector)?;
|
||||
///
|
||||
/// assert_eq!(filtered_top_docs.len(), 0);
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// ```
|
||||
pub struct FilterCollector<TCollector, TPredicate, TPredicateValue: Default>
|
||||
///
|
||||
/// Note that this is limited to fast fields which implement the
|
||||
/// [`FastValue`][crate::fastfield::FastValue] trait, e.g. `u64` but not `&[u8]`.
|
||||
/// To filter based on a bytes fast field, use a [`BytesFilterCollector`] instead.
|
||||
pub struct FilterCollector<TCollector, TPredicate, TPredicateValue>
|
||||
where TPredicate: 'static + Clone
|
||||
{
|
||||
field: Field,
|
||||
@@ -69,19 +76,15 @@ where TPredicate: 'static + Clone
|
||||
t_predicate_value: PhantomData<TPredicateValue>,
|
||||
}
|
||||
|
||||
impl<TCollector, TPredicate, TPredicateValue: Default>
|
||||
impl<TCollector, TPredicate, TPredicateValue>
|
||||
FilterCollector<TCollector, TPredicate, TPredicateValue>
|
||||
where
|
||||
TCollector: Collector + Send + Sync,
|
||||
TPredicate: Fn(TPredicateValue) -> bool + Send + Sync + Clone,
|
||||
{
|
||||
/// Create a new FilterCollector.
|
||||
pub fn new(
|
||||
field: Field,
|
||||
predicate: TPredicate,
|
||||
collector: TCollector,
|
||||
) -> FilterCollector<TCollector, TPredicate, TPredicateValue> {
|
||||
FilterCollector {
|
||||
/// Create a new `FilterCollector`.
|
||||
pub fn new(field: Field, predicate: TPredicate, collector: TCollector) -> Self {
|
||||
Self {
|
||||
field,
|
||||
predicate,
|
||||
collector,
|
||||
@@ -90,7 +93,7 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
impl<TCollector, TPredicate, TPredicateValue: Default> Collector
|
||||
impl<TCollector, TPredicate, TPredicateValue> Collector
|
||||
for FilterCollector<TCollector, TPredicate, TPredicateValue>
|
||||
where
|
||||
TCollector: Collector + Send + Sync,
|
||||
@@ -98,8 +101,6 @@ where
|
||||
TPredicateValue: HasAssociatedColumnType,
|
||||
DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>>,
|
||||
{
|
||||
// That's the type of our result.
|
||||
// Our standard deviation will be a float.
|
||||
type Fruit = TCollector::Fruit;
|
||||
|
||||
type Child = FilterSegmentCollector<TCollector::Child, TPredicate, TPredicateValue>;
|
||||
@@ -108,7 +109,7 @@ where
|
||||
&self,
|
||||
segment_local_id: u32,
|
||||
segment_reader: &SegmentReader,
|
||||
) -> crate::Result<FilterSegmentCollector<TCollector::Child, TPredicate, TPredicateValue>> {
|
||||
) -> crate::Result<Self::Child> {
|
||||
let schema = segment_reader.schema();
|
||||
let field_entry = schema.get_field_entry(self.field);
|
||||
if !field_entry.is_fast() {
|
||||
@@ -118,16 +119,16 @@ where
|
||||
)));
|
||||
}
|
||||
|
||||
let fast_field_reader = segment_reader
|
||||
let column_opt = segment_reader
|
||||
.fast_fields()
|
||||
.column_first_or_default(schema.get_field_name(self.field))?;
|
||||
.column_opt(field_entry.name())?;
|
||||
|
||||
let segment_collector = self
|
||||
.collector
|
||||
.for_segment(segment_local_id, segment_reader)?;
|
||||
|
||||
Ok(FilterSegmentCollector {
|
||||
fast_field_reader,
|
||||
column_opt,
|
||||
segment_collector,
|
||||
predicate: self.predicate.clone(),
|
||||
t_predicate_value: PhantomData,
|
||||
@@ -146,35 +147,208 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue>
|
||||
where
|
||||
TPredicate: 'static,
|
||||
DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>>,
|
||||
{
|
||||
fast_field_reader: Arc<dyn ColumnValues<TPredicateValue>>,
|
||||
pub struct FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue> {
|
||||
column_opt: Option<Column<TPredicateValue>>,
|
||||
segment_collector: TSegmentCollector,
|
||||
predicate: TPredicate,
|
||||
t_predicate_value: PhantomData<TPredicateValue>,
|
||||
}
|
||||
|
||||
impl<TSegmentCollector, TPredicate, TPredicateValue>
|
||||
FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue>
|
||||
where
|
||||
TPredicateValue: PartialOrd + Copy + Debug + Send + Sync + 'static,
|
||||
TPredicate: 'static + Fn(TPredicateValue) -> bool + Send + Sync,
|
||||
{
|
||||
#[inline]
|
||||
fn accept_document(&self, doc_id: DocId) -> bool {
|
||||
if let Some(column) = &self.column_opt {
|
||||
for val in column.values_for_doc(doc_id) {
|
||||
if (self.predicate)(val) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
impl<TSegmentCollector, TPredicate, TPredicateValue> SegmentCollector
|
||||
for FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue>
|
||||
where
|
||||
TSegmentCollector: SegmentCollector,
|
||||
TPredicateValue: HasAssociatedColumnType,
|
||||
TPredicate: 'static + Fn(TPredicateValue) -> bool + Send + Sync,
|
||||
DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>>,
|
||||
TPredicate: 'static + Fn(TPredicateValue) -> bool + Send + Sync, /* DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>> */
|
||||
{
|
||||
type Fruit = TSegmentCollector::Fruit;
|
||||
|
||||
fn collect(&mut self, doc: u32, score: Score) {
|
||||
let value = self.fast_field_reader.get_val(doc);
|
||||
if (self.predicate)(value) {
|
||||
self.segment_collector.collect(doc, score)
|
||||
if self.accept_document(doc) {
|
||||
self.segment_collector.collect(doc, score);
|
||||
}
|
||||
}
|
||||
|
||||
fn harvest(self) -> <TSegmentCollector as SegmentCollector>::Fruit {
|
||||
fn harvest(self) -> TSegmentCollector::Fruit {
|
||||
self.segment_collector.harvest()
|
||||
}
|
||||
}
|
||||
|
||||
/// A variant of the [`FilterCollector`] specialized for bytes fast fields, i.e.
|
||||
/// it transparently wraps an inner [`Collector`] but filters documents
|
||||
/// based on the result of applying the predicate to the bytes fast field.
|
||||
///
|
||||
/// A document is accepted if and only if the predicate returns `true` for at least one value.
|
||||
///
|
||||
/// In other words,
|
||||
/// - documents with no values are filtered out.
|
||||
/// - documents with several values are accepted if at least one value matches the predicate.
|
||||
///
|
||||
/// ```rust
|
||||
/// use tantivy::collector::{TopDocs, BytesFilterCollector};
|
||||
/// use tantivy::query::QueryParser;
|
||||
/// use tantivy::schema::{Schema, TEXT, FAST};
|
||||
/// use tantivy::{doc, DocAddress, Index};
|
||||
///
|
||||
/// # fn main() -> tantivy::Result<()> {
|
||||
/// let mut schema_builder = Schema::builder();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let barcode = schema_builder.add_bytes_field("barcode", FAST);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
///
|
||||
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
|
||||
/// index_writer.add_document(doc!(title => "The Name of the Wind", barcode => &b"010101"[..]))?;
|
||||
/// index_writer.add_document(doc!(title => "The Diary of Muadib", barcode => &b"110011"[..]))?;
|
||||
/// index_writer.add_document(doc!(title => "A Dairy Cow", barcode => &b"110111"[..]))?;
|
||||
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl", barcode => &b"011101"[..]))?;
|
||||
/// index_writer.add_document(doc!(title => "Bridget Jones's Diary"))?;
|
||||
/// index_writer.commit()?;
|
||||
///
|
||||
/// let reader = index.reader()?;
|
||||
/// let searcher = reader.searcher();
|
||||
///
|
||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
/// let query = query_parser.parse_query("diary")?;
|
||||
/// let filter_collector = BytesFilterCollector::new(barcode, |bytes: &[u8]| bytes.starts_with(b"01"), TopDocs::with_limit(2));
|
||||
/// let top_docs = searcher.search(&query, &filter_collector)?;
|
||||
///
|
||||
/// assert_eq!(top_docs.len(), 1);
|
||||
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 3));
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// ```
|
||||
pub struct BytesFilterCollector<TCollector, TPredicate>
|
||||
where TPredicate: 'static + Clone
|
||||
{
|
||||
field: Field,
|
||||
collector: TCollector,
|
||||
predicate: TPredicate,
|
||||
}
|
||||
|
||||
impl<TCollector, TPredicate> BytesFilterCollector<TCollector, TPredicate>
|
||||
where
|
||||
TCollector: Collector + Send + Sync,
|
||||
TPredicate: Fn(&[u8]) -> bool + Send + Sync + Clone,
|
||||
{
|
||||
/// Create a new `BytesFilterCollector`.
|
||||
pub fn new(field: Field, predicate: TPredicate, collector: TCollector) -> Self {
|
||||
Self {
|
||||
field,
|
||||
predicate,
|
||||
collector,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<TCollector, TPredicate> Collector for BytesFilterCollector<TCollector, TPredicate>
|
||||
where
|
||||
TCollector: Collector + Send + Sync,
|
||||
TPredicate: 'static + Fn(&[u8]) -> bool + Send + Sync + Clone,
|
||||
{
|
||||
type Fruit = TCollector::Fruit;
|
||||
|
||||
type Child = BytesFilterSegmentCollector<TCollector::Child, TPredicate>;
|
||||
|
||||
fn for_segment(
|
||||
&self,
|
||||
segment_local_id: u32,
|
||||
segment_reader: &SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
let schema = segment_reader.schema();
|
||||
let field_name = schema.get_field_name(self.field);
|
||||
|
||||
let column_opt = segment_reader.fast_fields().bytes(field_name)?;
|
||||
|
||||
let segment_collector = self
|
||||
.collector
|
||||
.for_segment(segment_local_id, segment_reader)?;
|
||||
|
||||
Ok(BytesFilterSegmentCollector {
|
||||
column_opt,
|
||||
segment_collector,
|
||||
predicate: self.predicate.clone(),
|
||||
buffer: Vec::new(),
|
||||
})
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
self.collector.requires_scoring()
|
||||
}
|
||||
|
||||
fn merge_fruits(
|
||||
&self,
|
||||
segment_fruits: Vec<<TCollector::Child as SegmentCollector>::Fruit>,
|
||||
) -> crate::Result<TCollector::Fruit> {
|
||||
self.collector.merge_fruits(segment_fruits)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct BytesFilterSegmentCollector<TSegmentCollector, TPredicate>
|
||||
where TPredicate: 'static
|
||||
{
|
||||
column_opt: Option<BytesColumn>,
|
||||
segment_collector: TSegmentCollector,
|
||||
predicate: TPredicate,
|
||||
buffer: Vec<u8>,
|
||||
}
|
||||
|
||||
impl<TSegmentCollector, TPredicate> BytesFilterSegmentCollector<TSegmentCollector, TPredicate>
|
||||
where
|
||||
TSegmentCollector: SegmentCollector,
|
||||
TPredicate: 'static + Fn(&[u8]) -> bool + Send + Sync,
|
||||
{
|
||||
#[inline]
|
||||
fn accept_document(&mut self, doc_id: DocId) -> bool {
|
||||
if let Some(column) = &self.column_opt {
|
||||
for ord in column.term_ords(doc_id) {
|
||||
self.buffer.clear();
|
||||
|
||||
let found = column.ord_to_bytes(ord, &mut self.buffer).unwrap_or(false);
|
||||
|
||||
if found && (self.predicate)(&self.buffer) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
impl<TSegmentCollector, TPredicate> SegmentCollector
|
||||
for BytesFilterSegmentCollector<TSegmentCollector, TPredicate>
|
||||
where
|
||||
TSegmentCollector: SegmentCollector,
|
||||
TPredicate: 'static + Fn(&[u8]) -> bool + Send + Sync,
|
||||
{
|
||||
type Fruit = TSegmentCollector::Fruit;
|
||||
|
||||
fn collect(&mut self, doc: u32, score: Score) {
|
||||
if self.accept_document(doc) {
|
||||
self.segment_collector.collect(doc, score);
|
||||
}
|
||||
}
|
||||
|
||||
fn harvest(self) -> TSegmentCollector::Fruit {
|
||||
self.segment_collector.harvest()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -112,7 +112,7 @@ mod docset_collector;
 pub use self::docset_collector::DocSetCollector;
 
 mod filter_collector_wrapper;
-pub use self::filter_collector_wrapper::FilterCollector;
+pub use self::filter_collector_wrapper::{BytesFilterCollector, FilterCollector};
 
 /// `Fruit` is the type for the result of our collection.
 /// e.g. `usize` for the `Count` collector.
@@ -14,7 +14,7 @@ use crate::collector::{
|
||||
};
|
||||
use crate::fastfield::{FastFieldNotAvailableError, FastValue};
|
||||
use crate::query::Weight;
|
||||
use crate::{DocAddress, DocId, Score, SegmentOrdinal, SegmentReader, TantivyError};
|
||||
use crate::{DocAddress, DocId, Order, Score, SegmentOrdinal, SegmentReader, TantivyError};
|
||||
|
||||
struct FastFieldConvertCollector<
|
||||
TCollector: Collector<Fruit = Vec<(u64, DocAddress)>>,
|
||||
@@ -23,6 +23,7 @@ struct FastFieldConvertCollector<
|
||||
pub collector: TCollector,
|
||||
pub field: String,
|
||||
pub fast_value: std::marker::PhantomData<TFastValue>,
|
||||
order: Order,
|
||||
}
|
||||
|
||||
impl<TCollector, TFastValue> Collector for FastFieldConvertCollector<TCollector, TFastValue>
|
||||
@@ -70,7 +71,13 @@ where
|
||||
let raw_result = self.collector.merge_fruits(segment_fruits)?;
|
||||
let transformed_result = raw_result
|
||||
.into_iter()
|
||||
.map(|(score, doc_address)| (TFastValue::from_u64(score), doc_address))
|
||||
.map(|(score, doc_address)| {
|
||||
if self.order.is_desc() {
|
||||
(TFastValue::from_u64(score), doc_address)
|
||||
} else {
|
||||
(TFastValue::from_u64(u64::MAX - score), doc_address)
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
Ok(transformed_result)
|
||||
}
|
||||
@@ -131,16 +138,23 @@ impl fmt::Debug for TopDocs {
 
 struct ScorerByFastFieldReader {
     sort_column: Arc<dyn ColumnValues<u64>>,
+    order: Order,
 }
 
 impl CustomSegmentScorer<u64> for ScorerByFastFieldReader {
     fn score(&mut self, doc: DocId) -> u64 {
-        self.sort_column.get_val(doc)
+        let value = self.sort_column.get_val(doc);
+        if self.order.is_desc() {
+            value
+        } else {
+            u64::MAX - value
+        }
     }
 }
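The ascending case above works by negating the sort key: the top collector always keeps the largest u64 keys, so mapping every value v to u64::MAX - v makes the smallest original values win, and `FastFieldConvertCollector::merge_fruits` (earlier in this diff) maps them back before returning results. A tiny standalone check of that reasoning in plain Rust:

// Minimal sketch of the order-inversion trick; no tantivy types involved.
fn main() {
    let values: Vec<u64> = vec![12, 64, 16];

    // Descending: keep the largest keys as-is.
    let mut desc = values.clone();
    desc.sort_by_key(|&v| std::cmp::Reverse(v));
    assert_eq!(desc, vec![64, 16, 12]);

    // Ascending: invert the keys, keep the "largest" inverted keys, then invert back.
    let mut inverted: Vec<u64> = values.iter().map(|&v| u64::MAX - v).collect();
    inverted.sort_by_key(|&v| std::cmp::Reverse(v));
    let asc: Vec<u64> = inverted.into_iter().map(|v| u64::MAX - v).collect();
    assert_eq!(asc, vec![12, 16, 64]);

    // A document with no value defaults to u64::MAX in ascending order,
    // so it sorts last (see the 18446744073709551615 entry in the test further below).
}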
struct ScorerByField {
|
||||
field: String,
|
||||
order: Order,
|
||||
}
|
||||
|
||||
impl CustomScorer<u64> for ScorerByField {
|
||||
@@ -157,8 +171,13 @@ impl CustomScorer<u64> for ScorerByField {
|
||||
sort_column_opt.ok_or_else(|| FastFieldNotAvailableError {
|
||||
field_name: self.field.clone(),
|
||||
})?;
|
||||
let mut default_value = 0u64;
|
||||
if self.order.is_asc() {
|
||||
default_value = u64::MAX;
|
||||
}
|
||||
Ok(ScorerByFastFieldReader {
|
||||
sort_column: sort_column.first_or_default_col(0u64),
|
||||
sort_column: sort_column.first_or_default_col(default_value),
|
||||
order: self.order.clone(),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -230,7 +249,7 @@ impl TopDocs {
|
||||
///
|
||||
/// ```rust
|
||||
/// # use tantivy::schema::{Schema, FAST, TEXT};
|
||||
/// # use tantivy::{doc, Index, DocAddress};
|
||||
/// # use tantivy::{doc, Index, DocAddress, Order};
|
||||
/// # use tantivy::query::{Query, QueryParser};
|
||||
/// use tantivy::Searcher;
|
||||
/// use tantivy::collector::TopDocs;
|
||||
@@ -268,7 +287,7 @@ impl TopDocs {
|
||||
/// // Note the `rating_field` needs to be a FAST field here.
|
||||
/// let top_books_by_rating = TopDocs
|
||||
/// ::with_limit(10)
|
||||
/// .order_by_u64_field("rating");
|
||||
/// .order_by_fast_field("rating", Order::Desc);
|
||||
///
|
||||
/// // ... and here are our documents. Note this is a simple vec.
|
||||
/// // The `u64` in the pair is the value of our fast field for
|
||||
@@ -288,13 +307,15 @@ impl TopDocs {
|
||||
///
|
||||
/// To comfortably work with `u64`s, `i64`s, `f64`s, or `date`s, please refer to
|
||||
/// the [.order_by_fast_field(...)](TopDocs::order_by_fast_field) method.
|
||||
pub fn order_by_u64_field(
|
||||
fn order_by_u64_field(
|
||||
self,
|
||||
field: impl ToString,
|
||||
order: Order,
|
||||
) -> impl Collector<Fruit = Vec<(u64, DocAddress)>> {
|
||||
CustomScoreTopCollector::new(
|
||||
ScorerByField {
|
||||
field: field.to_string(),
|
||||
order,
|
||||
},
|
||||
self.0.into_tscore(),
|
||||
)
|
||||
@@ -316,7 +337,7 @@ impl TopDocs {
|
||||
///
|
||||
/// ```rust
|
||||
/// # use tantivy::schema::{Schema, FAST, TEXT};
|
||||
/// # use tantivy::{doc, Index, DocAddress};
|
||||
/// # use tantivy::{doc, Index, DocAddress,Order};
|
||||
/// # use tantivy::query::{Query, AllQuery};
|
||||
/// use tantivy::Searcher;
|
||||
/// use tantivy::collector::TopDocs;
|
||||
@@ -354,7 +375,7 @@ impl TopDocs {
|
||||
/// // type `sort_by_field`. revenue_field here is a FAST i64 field.
|
||||
/// let top_company_by_revenue = TopDocs
|
||||
/// ::with_limit(2)
|
||||
/// .order_by_fast_field("revenue");
|
||||
/// .order_by_fast_field("revenue", Order::Desc);
|
||||
///
|
||||
/// // ... and here are our documents. Note this is a simple vec.
|
||||
/// // The `i64` in the pair is the value of our fast field for
|
||||
@@ -372,15 +393,17 @@ impl TopDocs {
|
||||
pub fn order_by_fast_field<TFastValue>(
|
||||
self,
|
||||
fast_field: impl ToString,
|
||||
order: Order,
|
||||
) -> impl Collector<Fruit = Vec<(TFastValue, DocAddress)>>
|
||||
where
|
||||
TFastValue: FastValue,
|
||||
{
|
||||
let u64_collector = self.order_by_u64_field(fast_field.to_string());
|
||||
let u64_collector = self.order_by_u64_field(fast_field.to_string(), order.clone());
|
||||
FastFieldConvertCollector {
|
||||
collector: u64_collector,
|
||||
field: fast_field.to_string(),
|
||||
fast_value: PhantomData,
|
||||
order,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -721,7 +744,7 @@ mod tests {
|
||||
use crate::schema::{Field, Schema, FAST, STORED, TEXT};
|
||||
use crate::time::format_description::well_known::Rfc3339;
|
||||
use crate::time::OffsetDateTime;
|
||||
use crate::{DateTime, DocAddress, DocId, Index, IndexWriter, Score, SegmentReader};
|
||||
use crate::{DateTime, DocAddress, DocId, Index, IndexWriter, Order, Score, SegmentReader};
|
||||
|
||||
fn make_index() -> crate::Result<Index> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
@@ -882,7 +905,7 @@ mod tests {
|
||||
});
|
||||
let searcher = index.reader()?.searcher();
|
||||
|
||||
let top_collector = TopDocs::with_limit(4).order_by_u64_field(SIZE);
|
||||
let top_collector = TopDocs::with_limit(4).order_by_u64_field(SIZE, Order::Desc);
|
||||
let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector)?;
|
||||
assert_eq!(
|
||||
&top_docs[..],
|
||||
@@ -921,7 +944,7 @@ mod tests {
|
||||
))?;
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let top_collector = TopDocs::with_limit(3).order_by_fast_field("birthday");
|
||||
let top_collector = TopDocs::with_limit(3).order_by_fast_field("birthday", Order::Desc);
|
||||
let top_docs: Vec<(DateTime, DocAddress)> = searcher.search(&AllQuery, &top_collector)?;
|
||||
assert_eq!(
|
||||
&top_docs[..],
|
||||
@@ -951,7 +974,7 @@ mod tests {
|
||||
))?;
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let top_collector = TopDocs::with_limit(3).order_by_fast_field("altitude");
|
||||
let top_collector = TopDocs::with_limit(3).order_by_fast_field("altitude", Order::Desc);
|
||||
let top_docs: Vec<(i64, DocAddress)> = searcher.search(&AllQuery, &top_collector)?;
|
||||
assert_eq!(
|
||||
&top_docs[..],
|
||||
@@ -981,7 +1004,7 @@ mod tests {
|
||||
))?;
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let top_collector = TopDocs::with_limit(3).order_by_fast_field("altitude");
|
||||
let top_collector = TopDocs::with_limit(3).order_by_fast_field("altitude", Order::Desc);
|
||||
let top_docs: Vec<(f64, DocAddress)> = searcher.search(&AllQuery, &top_collector)?;
|
||||
assert_eq!(
|
||||
&top_docs[..],
|
||||
@@ -1009,7 +1032,7 @@ mod tests {
|
||||
.unwrap();
|
||||
});
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let top_collector = TopDocs::with_limit(4).order_by_u64_field("missing_field");
|
||||
let top_collector = TopDocs::with_limit(4).order_by_u64_field("missing_field", Order::Desc);
|
||||
let segment_reader = searcher.segment_reader(0u32);
|
||||
top_collector
|
||||
.for_segment(0, segment_reader)
|
||||
@@ -1027,7 +1050,7 @@ mod tests {
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let segment = searcher.segment_reader(0);
|
||||
let top_collector = TopDocs::with_limit(4).order_by_u64_field(SIZE);
|
||||
let top_collector = TopDocs::with_limit(4).order_by_u64_field(SIZE, Order::Desc);
|
||||
let err = top_collector.for_segment(0, segment).err().unwrap();
|
||||
assert!(matches!(err, crate::TantivyError::InvalidArgument(_)));
|
||||
Ok(())
|
||||
@@ -1044,7 +1067,7 @@ mod tests {
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let segment = searcher.segment_reader(0);
|
||||
let top_collector = TopDocs::with_limit(4).order_by_fast_field::<i64>(SIZE);
|
||||
let top_collector = TopDocs::with_limit(4).order_by_fast_field::<i64>(SIZE, Order::Desc);
|
||||
let err = top_collector.for_segment(0, segment).err().unwrap();
|
||||
assert!(
|
||||
matches!(err, crate::TantivyError::SchemaError(msg) if msg == "Field \"size\" is not a fast field.")
|
||||
@@ -1106,4 +1129,50 @@ mod tests {
|
||||
let query = query_parser.parse_query(query).unwrap();
|
||||
(index, query)
|
||||
}
|
||||
#[test]
|
||||
fn test_fast_field_ascending_order() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let title = schema_builder.add_text_field(TITLE, TEXT);
|
||||
let size = schema_builder.add_u64_field(SIZE, FAST);
|
||||
let schema = schema_builder.build();
|
||||
let (index, query) = index("beer", title, schema, |index_writer| {
|
||||
index_writer
|
||||
.add_document(doc!(
|
||||
title => "bottle of beer",
|
||||
size => 12u64,
|
||||
))
|
||||
.unwrap();
|
||||
index_writer
|
||||
.add_document(doc!(
|
||||
title => "growler of beer",
|
||||
size => 64u64,
|
||||
))
|
||||
.unwrap();
|
||||
index_writer
|
||||
.add_document(doc!(
|
||||
title => "pint of beer",
|
||||
size => 16u64,
|
||||
))
|
||||
.unwrap();
|
||||
index_writer
|
||||
.add_document(doc!(
|
||||
title => "empty beer",
|
||||
))
|
||||
.unwrap();
|
||||
});
|
||||
let searcher = index.reader()?.searcher();
|
||||
|
||||
let top_collector = TopDocs::with_limit(4).order_by_fast_field(SIZE, Order::Asc);
|
||||
let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector)?;
|
||||
assert_eq!(
|
||||
&top_docs[..],
|
||||
&[
|
||||
(12, DocAddress::new(0, 0)),
|
||||
(16, DocAddress::new(0, 2)),
|
||||
(64, DocAddress::new(0, 1)),
|
||||
(18446744073709551615, DocAddress::new(0, 3)),
|
||||
]
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -120,8 +120,8 @@ impl IndexBuilder {
|
||||
Self {
|
||||
schema: None,
|
||||
index_settings: IndexSettings::default(),
|
||||
tokenizer_manager: TokenizerManager::default(),
|
||||
fast_field_tokenizer_manager: TokenizerManager::default(),
|
||||
tokenizer_manager: TokenizerManager::default_for_indexing(),
|
||||
fast_field_tokenizer_manager: TokenizerManager::default_for_fast_fields(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -400,8 +400,8 @@ impl Index {
|
||||
settings: metas.index_settings.clone(),
|
||||
directory,
|
||||
schema,
|
||||
tokenizers: TokenizerManager::default(),
|
||||
fast_field_tokenizers: TokenizerManager::default(),
|
||||
tokenizers: TokenizerManager::default_for_indexing(),
|
||||
fast_field_tokenizers: TokenizerManager::default_for_fast_fields(),
|
||||
executor: Arc::new(Executor::single_thread()),
|
||||
inventory,
|
||||
}
|
||||
|
||||
@@ -410,7 +410,9 @@ mod tests {
|
||||
use super::IndexMeta;
|
||||
use crate::core::index_meta::UntrackedIndexMeta;
|
||||
use crate::schema::{Schema, TEXT};
|
||||
use crate::store::{Compressor, ZstdCompressor};
|
||||
use crate::store::Compressor;
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
use crate::store::ZstdCompressor;
|
||||
use crate::{IndexSettings, IndexSortByField, Order};
|
||||
|
||||
#[test]
|
||||
@@ -446,6 +448,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
fn test_serialize_metas_zstd_compressor() {
|
||||
let schema = {
|
||||
let mut schema_builder = Schema::builder();
|
||||
@@ -482,13 +485,14 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(all(feature = "lz4-compression", feature = "zstd-compression"))]
|
||||
fn test_serialize_metas_invalid_comp() {
|
||||
let json = r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zsstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
|
||||
|
||||
let err = serde_json::from_str::<UntrackedIndexMeta>(json).unwrap_err();
|
||||
assert_eq!(
|
||||
err.to_string(),
|
||||
"unknown variant `zsstd`, expected one of `none`, `lz4`, `brotli`, `snappy`, `zstd`, \
|
||||
"unknown variant `zsstd`, expected one of `none`, `lz4`, `zstd`, \
|
||||
`zstd(compression_level=5)` at line 1 column 96"
|
||||
.to_string()
|
||||
);
|
||||
@@ -502,6 +506,20 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(not(feature = "zstd-compression"))]
|
||||
fn test_serialize_metas_unsupported_comp() {
|
||||
let json = r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
|
||||
|
||||
let err = serde_json::from_str::<UntrackedIndexMeta>(json).unwrap_err();
|
||||
assert_eq!(
|
||||
err.to_string(),
|
||||
"unsupported variant `zstd`, please enable Tantivy's `zstd-compression` feature at \
|
||||
line 1 column 95"
|
||||
.to_string()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
fn test_index_settings_default() {
|
||||
|
||||
@@ -67,7 +67,7 @@ impl IndexingPositionsPerPath {
|
||||
pub(crate) fn index_json_values<'a>(
|
||||
doc: DocId,
|
||||
json_values: impl Iterator<Item = crate::Result<&'a serde_json::Map<String, serde_json::Value>>>,
|
||||
text_analyzer: &TextAnalyzer,
|
||||
text_analyzer: &mut TextAnalyzer,
|
||||
expand_dots_enabled: bool,
|
||||
term_buffer: &mut Term,
|
||||
postings_writer: &mut dyn PostingsWriter,
|
||||
@@ -93,7 +93,7 @@ pub(crate) fn index_json_values<'a>(
|
||||
fn index_json_object(
|
||||
doc: DocId,
|
||||
json_value: &serde_json::Map<String, serde_json::Value>,
|
||||
text_analyzer: &TextAnalyzer,
|
||||
text_analyzer: &mut TextAnalyzer,
|
||||
json_term_writer: &mut JsonTermWriter,
|
||||
postings_writer: &mut dyn PostingsWriter,
|
||||
ctx: &mut IndexingContext,
|
||||
@@ -117,7 +117,7 @@ fn index_json_object(
|
||||
fn index_json_value(
|
||||
doc: DocId,
|
||||
json_value: &serde_json::Value,
|
||||
text_analyzer: &TextAnalyzer,
|
||||
text_analyzer: &mut TextAnalyzer,
|
||||
json_term_writer: &mut JsonTermWriter,
|
||||
postings_writer: &mut dyn PostingsWriter,
|
||||
ctx: &mut IndexingContext,
|
||||
@@ -212,12 +212,12 @@ pub fn convert_to_fast_value_and_get_term(
|
||||
DateTime::from_utc(dt_utc),
|
||||
));
|
||||
}
|
||||
if let Ok(u64_val) = str::parse::<u64>(phrase) {
|
||||
return Some(set_fastvalue_and_get_term(json_term_writer, u64_val));
|
||||
}
|
||||
if let Ok(i64_val) = str::parse::<i64>(phrase) {
|
||||
return Some(set_fastvalue_and_get_term(json_term_writer, i64_val));
|
||||
}
|
||||
if let Ok(u64_val) = str::parse::<u64>(phrase) {
|
||||
return Some(set_fastvalue_and_get_term(json_term_writer, u64_val));
|
||||
}
|
||||
if let Ok(f64_val) = str::parse::<f64>(phrase) {
|
||||
return Some(set_fastvalue_and_get_term(json_term_writer, f64_val));
|
||||
}
|
||||
@@ -239,7 +239,7 @@ pub(crate) fn set_fastvalue_and_get_term<T: FastValue>(
|
||||
pub(crate) fn set_string_and_get_terms(
|
||||
json_term_writer: &mut JsonTermWriter,
|
||||
value: &str,
|
||||
text_analyzer: &TextAnalyzer,
|
||||
text_analyzer: &mut TextAnalyzer,
|
||||
) -> Vec<(usize, Term)> {
|
||||
let mut positions_and_terms = Vec::<(usize, Term)>::new();
|
||||
json_term_writer.close_path_and_set_type(Type::Str);
|
||||
@@ -259,7 +259,7 @@ pub(crate) fn set_string_and_get_terms(
|
||||
|
||||
/// Writes a value of a JSON field to a `Term`.
|
||||
/// The Term format is as follows:
|
||||
/// [JSON_TYPE][JSON_PATH][JSON_END_OF_PATH][VALUE_BYTES]
|
||||
/// `[JSON_TYPE][JSON_PATH][JSON_END_OF_PATH][VALUE_BYTES]`
|
||||
pub struct JsonTermWriter<'a> {
|
||||
term_buffer: &'a mut Term,
|
||||
path_stack: Vec<usize>,
|
||||
|
||||
@@ -2,8 +2,6 @@ use std::collections::HashMap;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::{fmt, io};
|
||||
|
||||
use fail::fail_point;
|
||||
|
||||
use crate::core::{InvertedIndexReader, Segment, SegmentComponent, SegmentId};
|
||||
use crate::directory::{CompositeFile, FileSlice};
|
||||
use crate::error::DataCorruption;
|
||||
@@ -151,7 +149,7 @@ impl SegmentReader {
|
||||
|
||||
let store_file = segment.open_read(SegmentComponent::Store)?;
|
||||
|
||||
fail_point!("SegmentReader::open#middle");
|
||||
crate::fail_point!("SegmentReader::open#middle");
|
||||
|
||||
let postings_file = segment.open_read(SegmentComponent::Postings)?;
|
||||
let postings_composite = CompositeFile::open(&postings_file)?;
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::fs::{self, File, OpenOptions};
|
||||
use std::io::{self, BufWriter, Read, Seek, Write};
|
||||
use std::ops::Deref;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{Arc, RwLock, Weak};
|
||||
use std::{fmt, result};
|
||||
|
||||
use common::StableDeref;
|
||||
use fs4::FileExt;
|
||||
@@ -21,6 +21,7 @@ use crate::directory::{
|
||||
AntiCallToken, Directory, DirectoryLock, FileHandle, Lock, OwnedBytes, TerminatingWrite,
|
||||
WatchCallback, WatchHandle, WritePtr,
|
||||
};
|
||||
#[cfg(unix)]
|
||||
use crate::Advice;
|
||||
|
||||
pub type ArcBytes = Arc<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
|
||||
@@ -33,10 +34,7 @@ pub(crate) fn make_io_err(msg: String) -> io::Error {
|
||||
|
||||
/// Returns `None` iff the file exists, can be read, but is empty (and hence
|
||||
/// cannot be mmapped)
|
||||
fn open_mmap(
|
||||
full_path: &Path,
|
||||
madvice_opt: Option<Advice>,
|
||||
) -> result::Result<Option<Mmap>, OpenReadError> {
|
||||
fn open_mmap(full_path: &Path) -> Result<Option<Mmap>, OpenReadError> {
|
||||
let file = File::open(full_path).map_err(|io_err| {
|
||||
if io_err.kind() == io::ErrorKind::NotFound {
|
||||
OpenReadError::FileDoesNotExist(full_path.to_path_buf())
|
||||
@@ -59,9 +57,7 @@ fn open_mmap(
|
||||
.map(Some)
|
||||
.map_err(|io_err| OpenReadError::wrap_io_error(io_err, full_path.to_path_buf()))
|
||||
}?;
|
||||
if let (Some(mmap), Some(madvice)) = (&mmap_opt, madvice_opt) {
|
||||
let _ = mmap.advise(madvice);
|
||||
}
|
||||
|
||||
Ok(mmap_opt)
|
||||
}
|
||||
|
||||
@@ -83,18 +79,25 @@ pub struct CacheInfo {
|
||||
struct MmapCache {
|
||||
counters: CacheCounters,
|
||||
cache: HashMap<PathBuf, WeakArcBytes>,
|
||||
#[cfg(unix)]
|
||||
madvice_opt: Option<Advice>,
|
||||
}
|
||||
|
||||
impl MmapCache {
|
||||
fn new(madvice_opt: Option<Advice>) -> MmapCache {
|
||||
fn new() -> MmapCache {
|
||||
MmapCache {
|
||||
counters: CacheCounters::default(),
|
||||
cache: HashMap::default(),
|
||||
madvice_opt,
|
||||
#[cfg(unix)]
|
||||
madvice_opt: None,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(unix)]
|
||||
fn set_advice(&mut self, madvice: Advice) {
|
||||
self.madvice_opt = Some(madvice);
|
||||
}
|
||||
|
||||
fn get_info(&self) -> CacheInfo {
|
||||
let paths: Vec<PathBuf> = self.cache.keys().cloned().collect();
|
||||
CacheInfo {
|
||||
@@ -115,6 +118,16 @@ impl MmapCache {
|
||||
}
|
||||
}
|
||||
|
||||
fn open_mmap_impl(&self, full_path: &Path) -> Result<Option<Mmap>, OpenReadError> {
|
||||
let mmap_opt = open_mmap(full_path)?;
|
||||
#[cfg(unix)]
|
||||
if let (Some(mmap), Some(madvice)) = (mmap_opt.as_ref(), self.madvice_opt) {
|
||||
// We ignore madvise errors.
|
||||
let _ = mmap.advise(madvice);
|
||||
}
|
||||
Ok(mmap_opt)
|
||||
}
|
||||
|
||||
// Returns None if the file exists but as a len of 0 (and hence is not mmappable).
|
||||
fn get_mmap(&mut self, full_path: &Path) -> Result<Option<ArcBytes>, OpenReadError> {
|
||||
if let Some(mmap_weak) = self.cache.get(full_path) {
|
||||
@@ -125,7 +138,7 @@ impl MmapCache {
|
||||
}
|
||||
self.cache.remove(full_path);
|
||||
self.counters.miss += 1;
|
||||
let mmap_opt = open_mmap(full_path, self.madvice_opt)?;
|
||||
let mmap_opt = self.open_mmap_impl(full_path)?;
|
||||
Ok(mmap_opt.map(|mmap| {
|
||||
let mmap_arc: ArcBytes = Arc::new(mmap);
|
||||
let mmap_weak = Arc::downgrade(&mmap_arc);
|
||||
@@ -160,13 +173,9 @@ struct MmapDirectoryInner {
|
||||
}
|
||||
|
||||
impl MmapDirectoryInner {
|
||||
fn new(
|
||||
root_path: PathBuf,
|
||||
temp_directory: Option<TempDir>,
|
||||
madvice_opt: Option<Advice>,
|
||||
) -> MmapDirectoryInner {
|
||||
fn new(root_path: PathBuf, temp_directory: Option<TempDir>) -> MmapDirectoryInner {
|
||||
MmapDirectoryInner {
|
||||
mmap_cache: RwLock::new(MmapCache::new(madvice_opt)),
|
||||
mmap_cache: RwLock::new(MmapCache::new()),
|
||||
_temp_directory: temp_directory,
|
||||
watcher: FileWatcher::new(&root_path.join(*META_FILEPATH)),
|
||||
root_path,
|
||||
@@ -185,12 +194,8 @@ impl fmt::Debug for MmapDirectory {
|
||||
}
|
||||
|
||||
impl MmapDirectory {
|
||||
fn new(
|
||||
root_path: PathBuf,
|
||||
temp_directory: Option<TempDir>,
|
||||
madvice_opt: Option<Advice>,
|
||||
) -> MmapDirectory {
|
||||
let inner = MmapDirectoryInner::new(root_path, temp_directory, madvice_opt);
|
||||
fn new(root_path: PathBuf, temp_directory: Option<TempDir>) -> MmapDirectory {
|
||||
let inner = MmapDirectoryInner::new(root_path, temp_directory);
|
||||
MmapDirectory {
|
||||
inner: Arc::new(inner),
|
||||
}
|
||||
@@ -206,29 +211,33 @@ impl MmapDirectory {
         Ok(MmapDirectory::new(
             tempdir.path().to_path_buf(),
             Some(tempdir),
-            None,
         ))
     }
 
+    /// Opens a MmapDirectory in a directory, with a given access pattern.
+    ///
+    /// This is only supported on unix platforms.
+    #[cfg(unix)]
+    pub fn open_with_madvice(
+        directory_path: impl AsRef<Path>,
+        madvice: Advice,
+    ) -> Result<MmapDirectory, OpenDirectoryError> {
+        let dir = Self::open_impl_to_avoid_monomorphization(directory_path.as_ref())?;
+        dir.inner.mmap_cache.write().unwrap().set_advice(madvice);
+        Ok(dir)
+    }
+
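A usage sketch for the new constructor, assuming `Advice` is re-exported at the crate root (as the `use crate::Advice` import elsewhere in this diff suggests); the path and the `Advice::Random` choice are illustrative:

// Sketch: open an on-disk index while hinting a random access pattern to the OS.
// Unix-only, mirroring the #[cfg(unix)] gate above.
#[cfg(unix)]
fn open_index_with_random_access() -> tantivy::Result<tantivy::Index> {
    use tantivy::directory::MmapDirectory;
    use tantivy::Advice;

    // "/path/to/index" is a placeholder for an existing index directory.
    let dir = MmapDirectory::open_with_madvice("/path/to/index", Advice::Random)?;
    tantivy::Index::open(dir)
}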
/// Opens a MmapDirectory in a directory.
|
||||
///
|
||||
/// Returns an error if the `directory_path` does not
|
||||
/// exist or if it is not a directory.
|
||||
pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<MmapDirectory, OpenDirectoryError> {
|
||||
Self::open_with_access_pattern_impl(directory_path.as_ref(), None)
|
||||
pub fn open(directory_path: impl AsRef<Path>) -> Result<MmapDirectory, OpenDirectoryError> {
|
||||
Self::open_impl_to_avoid_monomorphization(directory_path.as_ref())
|
||||
}
|
||||
|
||||
/// Opens a MmapDirectory in a directory, with a given access pattern.
|
||||
pub fn open_with_madvice<P: AsRef<Path>>(
|
||||
directory_path: P,
|
||||
madvice: Advice,
|
||||
) -> Result<MmapDirectory, OpenDirectoryError> {
|
||||
Self::open_with_access_pattern_impl(directory_path.as_ref(), Some(madvice))
|
||||
}
|
||||
|
||||
fn open_with_access_pattern_impl(
|
||||
#[inline(never)]
|
||||
fn open_impl_to_avoid_monomorphization(
|
||||
directory_path: &Path,
|
||||
madvice_opt: Option<Advice>,
|
||||
) -> Result<MmapDirectory, OpenDirectoryError> {
|
||||
if !directory_path.exists() {
|
||||
return Err(OpenDirectoryError::DoesNotExist(PathBuf::from(
|
||||
@@ -256,7 +265,7 @@ impl MmapDirectory {
|
||||
directory_path,
|
||||
)));
|
||||
}
|
||||
Ok(MmapDirectory::new(canonical_path, None, madvice_opt))
|
||||
Ok(MmapDirectory::new(canonical_path, None))
|
||||
}
|
||||
|
||||
/// Joins a relative_path to the directory `root_path`
|
||||
@@ -365,7 +374,7 @@ pub(crate) fn atomic_write(path: &Path, content: &[u8]) -> io::Result<()> {
|
||||
}
|
||||
|
||||
impl Directory for MmapDirectory {
|
||||
fn get_file_handle(&self, path: &Path) -> result::Result<Arc<dyn FileHandle>, OpenReadError> {
|
||||
fn get_file_handle(&self, path: &Path) -> Result<Arc<dyn FileHandle>, OpenReadError> {
|
||||
debug!("Open Read {:?}", path);
|
||||
let full_path = self.resolve_path(path);
|
||||
|
||||
@@ -388,7 +397,7 @@ impl Directory for MmapDirectory {
|
||||
|
||||
/// Any entry associated with the path in the mmap will be
|
||||
/// removed before the file is deleted.
|
||||
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
|
||||
fn delete(&self, path: &Path) -> Result<(), DeleteError> {
|
||||
let full_path = self.resolve_path(path);
|
||||
fs::remove_file(full_path).map_err(|e| {
|
||||
if e.kind() == io::ErrorKind::NotFound {
|
||||
|
||||
@@ -5,7 +5,6 @@ use std::sync::{Arc, RwLock};
|
||||
use std::{fmt, result};
|
||||
|
||||
use common::HasLen;
|
||||
use fail::fail_point;
|
||||
|
||||
use super::FileHandle;
|
||||
use crate::core::META_FILEPATH;
|
||||
@@ -184,7 +183,7 @@ impl Directory for RamDirectory {
|
||||
}
|
||||
|
||||
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
|
||||
fail_point!("RamDirectory::delete", |_| {
|
||||
crate::fail_point!("RamDirectory::delete", |_| {
|
||||
Err(DeleteError::IoError {
|
||||
io_error: Arc::new(io::Error::from(io::ErrorKind::Other)),
|
||||
filepath: path.to_path_buf(),
|
||||
|
||||
@@ -446,7 +446,8 @@ mod tests {
|
||||
#[test]
|
||||
fn test_text_fastfield() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT | FAST);
|
||||
let text_options: TextOptions = TextOptions::from(TEXT).set_fast("raw");
|
||||
let text_field = schema_builder.add_text_field("text", text_options);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
@@ -686,12 +687,12 @@ mod tests {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let date_field = schema_builder.add_date_field(
|
||||
"date",
|
||||
DateOptions::from(FAST).set_precision(DateTimePrecision::Nanosecond),
|
||||
DateOptions::from(FAST).set_precision(DateTimePrecision::Nanoseconds),
|
||||
);
|
||||
let multi_date_field = schema_builder.add_date_field(
|
||||
"multi_date",
|
||||
DateOptions::default()
|
||||
.set_precision(DateTimePrecision::Nanosecond)
|
||||
.set_precision(DateTimePrecision::Nanoseconds)
|
||||
.set_fast(),
|
||||
);
|
||||
let schema = schema_builder.build();
|
||||
@@ -862,9 +863,9 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
pub fn test_gcd_date() {
|
||||
let size_prec_sec = test_gcd_date_with_codec(DateTimePrecision::Second);
|
||||
let size_prec_sec = test_gcd_date_with_codec(DateTimePrecision::Seconds);
|
||||
assert!((1000 * 13 / 8..100 + 1000 * 13 / 8).contains(&size_prec_sec.get_bytes())); // 13 bits per val = ceil(log_2(number of seconds in 2hours);
|
||||
let size_prec_micros = test_gcd_date_with_codec(DateTimePrecision::Microsecond);
|
||||
let size_prec_micros = test_gcd_date_with_codec(DateTimePrecision::Microseconds);
|
||||
assert!((1000 * 33 / 8..100 + 1000 * 33 / 8).contains(&size_prec_micros.get_bytes()));
|
||||
// 33 bits per
|
||||
// val = ceil(log_2(number
|
||||
@@ -1082,7 +1083,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_fast_field_in_json_field_expand_dots_disabled() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_option = JsonObjectOptions::default().set_fast(None);
|
||||
let json_option = JsonObjectOptions::default().set_fast("default");
|
||||
let json = schema_builder.add_json_field("json", json_option);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
@@ -1108,7 +1109,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_fast_field_in_json_field_with_tokenizer() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_option = JsonObjectOptions::default().set_fast(Some("default"));
|
||||
let json_option = JsonObjectOptions::default().set_fast("default");
|
||||
let json = schema_builder.add_json_field("json", json_option);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
@@ -1134,7 +1135,7 @@ mod tests {
|
||||
fn test_fast_field_in_json_field_expand_dots_enabled() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_option = JsonObjectOptions::default()
|
||||
.set_fast(None)
|
||||
.set_fast("default")
|
||||
.set_expand_dots_enabled();
|
||||
let json = schema_builder.add_json_field("json", json_option);
|
||||
let schema = schema_builder.build();
|
||||
@@ -1202,13 +1203,13 @@ mod tests {
|
||||
#[test]
|
||||
fn test_fast_field_tokenizer() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let opt = TextOptions::default().set_fast(Some("custom_lowercase"));
|
||||
let opt = TextOptions::default().set_fast("custom_lowercase");
|
||||
let text_field = schema_builder.add_text_field("text", opt);
|
||||
let schema = schema_builder.build();
|
||||
let ff_tokenizer_manager = TokenizerManager::default();
|
||||
let ff_tokenizer_manager = TokenizerManager::default_for_fast_fields();
|
||||
ff_tokenizer_manager.register(
|
||||
"custom_lowercase",
|
||||
TextAnalyzer::builder(RawTokenizer)
|
||||
TextAnalyzer::builder(RawTokenizer::default())
|
||||
.filter(LowerCaser)
|
||||
.build(),
|
||||
);
|
||||
@@ -1238,7 +1239,7 @@ mod tests {
|
||||
.set_index_option(crate::schema::IndexRecordOption::WithFreqs)
|
||||
.set_tokenizer("raw"),
|
||||
)
|
||||
.set_fast(Some("default"))
|
||||
.set_fast("default")
|
||||
.set_stored();
|
||||
|
||||
let log_field = schema_builder.add_text_field("log_level", text_fieldtype);
|
||||
@@ -1271,7 +1272,7 @@ mod tests {
|
||||
fn test_shadowing_fast_field_with_expand_dots() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_option = JsonObjectOptions::default()
|
||||
.set_fast(None)
|
||||
.set_fast("default")
|
||||
.set_expand_dots_enabled();
|
||||
let json_field = schema_builder.add_json_field("jsonfield", json_option.clone());
|
||||
let shadowing_json_field = schema_builder.add_json_field("jsonfield.attr", json_option);
|
||||
|
||||
@@ -88,7 +88,7 @@ impl FastFieldReaders {
|
||||
let Some((field, path)): Option<(Field, &str)> = self
|
||||
.schema
|
||||
.find_field_with_default(field_name, default_field_opt)
|
||||
else{
|
||||
else {
|
||||
return Ok(None);
|
||||
};
|
||||
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
|
||||
@@ -120,7 +120,8 @@ impl FastFieldReaders {
|
||||
T: HasAssociatedColumnType,
|
||||
DynamicColumn: Into<Option<Column<T>>>,
|
||||
{
|
||||
let Some(dynamic_column_handle) = self.dynamic_column_handle(field_name, T::column_type())?
|
||||
let Some(dynamic_column_handle) =
|
||||
self.dynamic_column_handle(field_name, T::column_type())?
|
||||
else {
|
||||
return Ok(None);
|
||||
};
|
||||
@@ -196,7 +197,8 @@ impl FastFieldReaders {
|
||||
|
||||
/// Returns a `str` column.
|
||||
pub fn str(&self, field_name: &str) -> crate::Result<Option<StrColumn>> {
|
||||
let Some(dynamic_column_handle) = self.dynamic_column_handle(field_name, ColumnType::Str)?
|
||||
let Some(dynamic_column_handle) =
|
||||
self.dynamic_column_handle(field_name, ColumnType::Str)?
|
||||
else {
|
||||
return Ok(None);
|
||||
};
|
||||
@@ -206,7 +208,8 @@ impl FastFieldReaders {
|
||||
|
||||
/// Returns a `bytes` column.
|
||||
pub fn bytes(&self, field_name: &str) -> crate::Result<Option<BytesColumn>> {
|
||||
let Some(dynamic_column_handle) = self.dynamic_column_handle(field_name, ColumnType::Bytes)?
|
||||
let Some(dynamic_column_handle) =
|
||||
self.dynamic_column_handle(field_name, ColumnType::Bytes)?
|
||||
else {
|
||||
return Ok(None);
|
||||
};
|
||||
@@ -346,7 +349,7 @@ mod tests {
|
||||
schema_builder.add_json_field(
|
||||
"json_expand_dots_enabled",
|
||||
JsonObjectOptions::default()
|
||||
.set_fast(None)
|
||||
.set_fast("default")
|
||||
.set_expand_dots_enabled(),
|
||||
);
|
||||
let dynamic_field = schema_builder.add_json_field("_dyna", FAST);
|
||||
|
||||
@@ -18,6 +18,8 @@ const JSON_DEPTH_LIMIT: usize = 20;
|
||||
pub struct FastFieldsWriter {
|
||||
columnar_writer: ColumnarWriter,
|
||||
fast_field_names: Vec<Option<String>>, //< TODO see if we can hash the field name hash too.
|
||||
// Field -> Fast field tokenizer mapping.
|
||||
// All text fast fields should have a tokenizer.
|
||||
per_field_tokenizer: Vec<Option<TextAnalyzer>>,
|
||||
date_precisions: Vec<DateTimePrecision>,
|
||||
expand_dots: Vec<bool>,
|
||||
@@ -61,7 +63,7 @@ impl FastFieldsWriter {
|
||||
if let Some(tokenizer_name) = json_object_options.get_fast_field_tokenizer_name() {
|
||||
let text_analyzer = tokenizer_manager.get(tokenizer_name).ok_or_else(|| {
|
||||
TantivyError::InvalidArgument(format!(
|
||||
"Tokenizer {tokenizer_name:?} not found"
|
||||
"Tokenizer `{tokenizer_name}` not found"
|
||||
))
|
||||
})?;
|
||||
per_field_tokenizer[field_id.field_id() as usize] = Some(text_analyzer);
|
||||
@@ -147,7 +149,7 @@ impl FastFieldsWriter {
|
||||
}
|
||||
Value::Str(text_val) => {
|
||||
if let Some(tokenizer) =
|
||||
&self.per_field_tokenizer[field_value.field().field_id() as usize]
|
||||
&mut self.per_field_tokenizer[field_value.field().field_id() as usize]
|
||||
{
|
||||
let mut token_stream = tokenizer.token_stream(text_val);
|
||||
token_stream.process(&mut |token: &Token| {
|
||||
@@ -157,9 +159,6 @@ impl FastFieldsWriter {
|
||||
&token.text,
|
||||
);
|
||||
})
|
||||
} else {
|
||||
self.columnar_writer
|
||||
.record_str(doc_id, field_name.as_str(), text_val);
|
||||
}
|
||||
}
|
||||
Value::Bytes(bytes_val) => {
|
||||
@@ -201,18 +200,20 @@ impl FastFieldsWriter {
|
||||
self.json_path_buffer.clear();
|
||||
self.json_path_buffer.push_str(field_name);
|
||||
|
||||
let text_analyzer =
|
||||
&self.per_field_tokenizer[field_value.field().field_id() as usize];
|
||||
let text_analyzer_opt =
|
||||
&mut self.per_field_tokenizer[field_value.field().field_id() as usize];
|
||||
|
||||
record_json_obj_to_columnar_writer(
|
||||
doc_id,
|
||||
json_obj,
|
||||
expand_dots,
|
||||
JSON_DEPTH_LIMIT,
|
||||
&mut self.json_path_buffer,
|
||||
&mut self.columnar_writer,
|
||||
text_analyzer,
|
||||
);
|
||||
if let Some(text_analyzer) = text_analyzer_opt {
|
||||
record_json_obj_to_columnar_writer(
|
||||
doc_id,
|
||||
json_obj,
|
||||
expand_dots,
|
||||
JSON_DEPTH_LIMIT,
|
||||
&mut self.json_path_buffer,
|
||||
&mut self.columnar_writer,
|
||||
text_analyzer,
|
||||
);
|
||||
}
|
||||
}
|
||||
Value::IpAddr(ip_addr) => {
|
||||
self.columnar_writer
|
||||
@@ -263,7 +264,7 @@ fn record_json_obj_to_columnar_writer(
|
||||
remaining_depth_limit: usize,
|
||||
json_path_buffer: &mut String,
|
||||
columnar_writer: &mut columnar::ColumnarWriter,
|
||||
tokenizer: &Option<TextAnalyzer>,
|
||||
text_analyzer: &mut TextAnalyzer,
|
||||
) {
|
||||
for (key, child) in json_obj {
|
||||
let len_path = json_path_buffer.len();
|
||||
@@ -288,7 +289,7 @@ fn record_json_obj_to_columnar_writer(
|
||||
remaining_depth_limit,
|
||||
json_path_buffer,
|
||||
columnar_writer,
|
||||
tokenizer,
|
||||
text_analyzer,
|
||||
);
|
||||
// popping our sub path.
|
||||
json_path_buffer.truncate(len_path);
|
||||
@@ -302,7 +303,7 @@ fn record_json_value_to_columnar_writer(
|
||||
mut remaining_depth_limit: usize,
|
||||
json_path_writer: &mut String,
|
||||
columnar_writer: &mut columnar::ColumnarWriter,
|
||||
tokenizer: &Option<TextAnalyzer>,
|
||||
text_analyzer: &mut TextAnalyzer,
|
||||
) {
|
||||
if remaining_depth_limit == 0 {
|
||||
return;
|
||||
@@ -321,14 +322,10 @@ fn record_json_value_to_columnar_writer(
|
||||
}
|
||||
}
|
||||
serde_json::Value::String(text) => {
|
||||
if let Some(text_analyzer) = tokenizer {
|
||||
let mut token_stream = text_analyzer.token_stream(text);
|
||||
token_stream.process(&mut |token| {
|
||||
columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text);
|
||||
})
|
||||
} else {
|
||||
columnar_writer.record_str(doc, json_path_writer.as_str(), text);
|
||||
}
|
||||
let mut token_stream = text_analyzer.token_stream(text);
|
||||
token_stream.process(&mut |token| {
|
||||
columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text);
|
||||
});
|
||||
}
|
||||
serde_json::Value::Array(arr) => {
|
||||
for el in arr {
|
||||
@@ -339,7 +336,7 @@ fn record_json_value_to_columnar_writer(
|
||||
remaining_depth_limit,
|
||||
json_path_writer,
|
||||
columnar_writer,
|
||||
tokenizer,
|
||||
text_analyzer,
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -351,7 +348,7 @@ fn record_json_value_to_columnar_writer(
|
||||
remaining_depth_limit,
|
||||
json_path_writer,
|
||||
columnar_writer,
|
||||
tokenizer,
|
||||
text_analyzer,
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -371,6 +368,9 @@ mod tests {
|
||||
) -> ColumnarReader {
|
||||
let mut columnar_writer = ColumnarWriter::default();
|
||||
let mut json_path = String::new();
|
||||
let mut text_analyzer = crate::tokenizer::TokenizerManager::default_for_fast_fields()
|
||||
.get(crate::schema::DEFAULT_FAST_FIELD_TOKENIZER)
|
||||
.unwrap();
|
||||
for (doc, json_doc) in json_docs.iter().enumerate() {
|
||||
record_json_value_to_columnar_writer(
|
||||
doc as u32,
|
||||
@@ -379,7 +379,7 @@ mod tests {
|
||||
JSON_DEPTH_LIMIT,
|
||||
&mut json_path,
|
||||
&mut columnar_writer,
|
||||
&None,
|
||||
&mut text_analyzer,
|
||||
);
|
||||
}
|
||||
let mut buffer = Vec::new();
|
||||
@@ -399,6 +399,7 @@ mod tests {
|
||||
});
|
||||
let columnar_reader = test_columnar_from_jsons_aux(&[json_doc], false);
|
||||
let columns = columnar_reader.list_columns().unwrap();
|
||||
assert_eq!(columns.len(), 5);
|
||||
{
|
||||
assert_eq!(columns[0].0, "arr");
|
||||
let column_arr_opt: Option<StrColumn> = columns[0].1.open().unwrap().into();
|
||||
@@ -434,7 +435,9 @@ mod tests {
|
||||
{
|
||||
assert_eq!(columns[4].0, "text");
|
||||
let column_text_opt: Option<StrColumn> = columns[4].1.open().unwrap().into();
|
||||
assert!(column_text_opt.unwrap().term_ords(0).eq([0].into_iter()));
|
||||
let column_text = column_text_opt.unwrap();
|
||||
let term_ords: Vec<u64> = column_text.term_ords(0).collect();
|
||||
assert_eq!(&term_ords[..], &[0]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -6,7 +6,6 @@ use std::path::PathBuf;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
use fail::fail_point;
|
||||
use rayon::{ThreadPool, ThreadPoolBuilder};
|
||||
|
||||
use super::segment_manager::SegmentManager;
|
||||
@@ -43,7 +42,7 @@ pub(crate) fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate:
|
||||
let mut buffer = serde_json::to_vec_pretty(metas)?;
|
||||
// Just adding a new line at the end of the buffer.
|
||||
writeln!(&mut buffer)?;
|
||||
fail_point!("save_metas", |msg| Err(crate::TantivyError::from(
|
||||
crate::fail_point!("save_metas", |msg| Err(crate::TantivyError::from(
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
msg.unwrap_or_else(|| "Undefined".to_string())
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use columnar::MonotonicallyMappableToU64;
|
||||
use itertools::Itertools;
|
||||
use tokenizer_api::BoxTokenStream;
|
||||
|
||||
use super::doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping};
|
||||
use super::operation::AddOperation;
|
||||
@@ -15,7 +16,7 @@ use crate::postings::{
|
||||
use crate::schema::{FieldEntry, FieldType, Schema, Term, Value, DATE_TIME_PRECISION_INDEXED};
|
||||
use crate::store::{StoreReader, StoreWriter};
|
||||
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer};
|
||||
use crate::{DocId, Document, Opstamp, SegmentComponent};
|
||||
use crate::{DocId, Document, Opstamp, SegmentComponent, TantivyError};
|
||||
|
||||
/// Computes the initial size of the hash table.
|
||||
///
|
||||
@@ -98,14 +99,18 @@ impl SegmentWriter {
|
||||
}
|
||||
_ => None,
|
||||
};
|
||||
text_options
|
||||
.and_then(|text_index_option| {
|
||||
let tokenizer_name = &text_index_option.tokenizer();
|
||||
tokenizer_manager.get(tokenizer_name)
|
||||
})
|
||||
.unwrap_or_default()
|
||||
let tokenizer_name = text_options
|
||||
.map(|text_index_option| text_index_option.tokenizer())
|
||||
.unwrap_or("default");
|
||||
|
||||
tokenizer_manager.get(tokenizer_name).ok_or_else(|| {
|
||||
TantivyError::SchemaError(format!(
|
||||
"Error getting tokenizer for field: {}",
|
||||
field_entry.name()
|
||||
))
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
.collect::<Result<Vec<_>, _>>()?;
|
||||
Ok(SegmentWriter {
|
||||
max_doc: 0,
|
||||
ctx: IndexingContext::new(table_size),
|
||||
@@ -185,10 +190,11 @@ impl SegmentWriter {
|
||||
|
||||
match field_entry.field_type() {
|
||||
FieldType::Facet(_) => {
|
||||
let mut facet_tokenizer = FacetTokenizer::default(); // this can be global
|
||||
for value in values {
|
||||
let facet = value.as_facet().ok_or_else(make_schema_error)?;
|
||||
let facet_str = facet.encoded_str();
|
||||
let mut facet_tokenizer = FacetTokenizer.token_stream(facet_str);
|
||||
let mut facet_tokenizer = facet_tokenizer.token_stream(facet_str);
|
||||
let mut indexing_position = IndexingPosition::default();
|
||||
postings_writer.index_text(
|
||||
doc_id,
|
||||
@@ -204,11 +210,11 @@ impl SegmentWriter {
|
||||
for value in values {
|
||||
let mut token_stream = match value {
|
||||
Value::PreTokStr(tok_str) => {
|
||||
PreTokenizedStream::from(tok_str.clone()).into()
|
||||
BoxTokenStream::new(PreTokenizedStream::from(tok_str.clone()))
|
||||
}
|
||||
Value::Str(ref text) => {
|
||||
let text_analyzer =
|
||||
&self.per_field_text_analyzers[field.field_id() as usize];
|
||||
&mut self.per_field_text_analyzers[field.field_id() as usize];
|
||||
text_analyzer.token_stream(text)
|
||||
}
|
||||
_ => {
|
||||
@@ -304,7 +310,8 @@ impl SegmentWriter {
|
||||
}
|
||||
}
|
||||
FieldType::JsonObject(json_options) => {
|
||||
let text_analyzer = &self.per_field_text_analyzers[field.field_id() as usize];
|
||||
let text_analyzer =
|
||||
&mut self.per_field_text_analyzers[field.field_id() as usize];
|
||||
let json_values_it =
|
||||
values.map(|value| value.as_json().ok_or_else(make_schema_error));
|
||||
index_json_values(
|
||||
@@ -436,7 +443,9 @@ fn remap_and_write(
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::path::Path;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use tempfile::TempDir;
|
||||
|
||||
use super::compute_initial_table_size;
|
||||
use crate::collector::Count;
|
||||
@@ -444,7 +453,9 @@ mod tests {
|
||||
use crate::directory::RamDirectory;
|
||||
use crate::postings::TermInfo;
|
||||
use crate::query::PhraseQuery;
|
||||
use crate::schema::{IndexRecordOption, Schema, Type, STORED, STRING, TEXT};
|
||||
use crate::schema::{
|
||||
IndexRecordOption, Schema, TextFieldIndexing, TextOptions, Type, STORED, STRING, TEXT,
|
||||
};
|
||||
use crate::store::{Compressor, StoreReader, StoreWriter};
|
||||
use crate::time::format_description::well_known::Rfc3339;
|
||||
use crate::time::OffsetDateTime;
|
||||
@@ -457,7 +468,7 @@ mod tests {
|
||||
fn test_hashmap_size() {
|
||||
assert_eq!(compute_initial_table_size(100_000).unwrap(), 1 << 11);
|
||||
assert_eq!(compute_initial_table_size(1_000_000).unwrap(), 1 << 14);
|
||||
assert_eq!(compute_initial_table_size(10_000_000).unwrap(), 1 << 18);
|
||||
assert_eq!(compute_initial_table_size(15_000_000).unwrap(), 1 << 18);
|
||||
assert_eq!(compute_initial_table_size(1_000_000_000).unwrap(), 1 << 19);
|
||||
assert_eq!(compute_initial_table_size(4_000_000_000).unwrap(), 1 << 19);
|
||||
}
|
||||
@@ -898,4 +909,32 @@ mod tests {
|
||||
postings.positions(&mut positions);
|
||||
assert_eq!(positions, &[4]); //< as opposed to 3 if we had a position length of 1.
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_show_error_when_tokenizer_not_registered() {
|
||||
let text_field_indexing = TextFieldIndexing::default()
|
||||
.set_tokenizer("custom_en")
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
||||
let text_options = TextOptions::default()
|
||||
.set_indexing_options(text_field_indexing)
|
||||
.set_stored();
|
||||
let mut schema_builder = Schema::builder();
|
||||
schema_builder.add_text_field("title", text_options);
|
||||
let schema = schema_builder.build();
|
||||
let tempdir = TempDir::new().unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
Index::create_in_dir(&tempdir_path, schema).unwrap();
|
||||
let index = Index::open_in_dir(tempdir_path).unwrap();
|
||||
let schema = index.schema();
|
||||
let mut index_writer = index.writer(50_000_000).unwrap();
|
||||
let title = schema.get_field("title").unwrap();
|
||||
let mut document = Document::default();
|
||||
document.add_text(title, "The Old Man and the Sea");
|
||||
index_writer.add_document(document).unwrap();
|
||||
let error = index_writer.commit().unwrap_err();
|
||||
assert_eq!(
|
||||
error.to_string(),
|
||||
"Schema error: 'Error getting tokenizer for field: title'"
|
||||
);
|
||||
}
|
||||
}
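The new error path exercised by `test_show_error_when_tokenizer_not_registered` is avoided by registering the analyzer under the exact name the schema refers to before indexing. A minimal sketch, not part of this diff (the `custom_en` pipeline below is illustrative):

use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};
use tantivy::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer};
use tantivy::{doc, Index};

let text_field_indexing = TextFieldIndexing::default()
    .set_tokenizer("custom_en")
    .set_index_option(IndexRecordOption::WithFreqsAndPositions);
let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field(
    "title",
    TextOptions::default().set_indexing_options(text_field_indexing),
);
let index = Index::create_in_ram(schema_builder.build());
// Registering "custom_en" before the first commit prevents the
// "Error getting tokenizer for field: title" SchemaError tested above.
index.tokenizers().register(
    "custom_en",
    TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(LowerCaser)
        .build(),
);
let mut index_writer = index.writer(50_000_000).unwrap();
index_writer
    .add_document(doc!(title => "The Old Man and the Sea"))
    .unwrap();
index_writer.commit().unwrap();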
|
||||
|
||||
@@ -101,6 +101,7 @@ mod test {
|
||||
|
||||
use super::Stamper;
|
||||
|
||||
#[allow(clippy::redundant_clone)]
|
||||
#[test]
|
||||
fn test_stamper() {
|
||||
let stamper = Stamper::new(7u64);
|
||||
@@ -116,6 +117,7 @@ mod test {
|
||||
assert_eq!(stamper.stamp(), 15u64);
|
||||
}
|
||||
|
||||
#[allow(clippy::redundant_clone)]
|
||||
#[test]
|
||||
fn test_stamper_revert() {
|
||||
let stamper = Stamper::new(7u64);
|
||||
|
||||
120
src/lib.rs
@@ -191,6 +191,7 @@ pub use crate::schema::{DateOptions, DateTimePrecision, Document, Term};
|
||||
/// Index format version.
|
||||
const INDEX_FORMAT_VERSION: u32 = 5;
|
||||
|
||||
#[cfg(all(feature = "mmap", unix))]
|
||||
pub use memmap2::Advice;
|
||||
|
||||
/// Structure version for the index.
|
||||
@@ -298,9 +299,39 @@ pub struct DocAddress {
|
||||
pub doc_id: DocId,
|
||||
}
|
||||
|
||||
#[macro_export]
|
||||
/// Enable fail_point if feature is enabled.
|
||||
macro_rules! fail_point {
|
||||
($name:expr) => {{
|
||||
#[cfg(feature = "failpoints")]
|
||||
{
|
||||
fail::eval($name, |_| {
|
||||
panic!("Return is not supported for the fail point \"{}\"", $name);
|
||||
});
|
||||
}
|
||||
}};
|
||||
($name:expr, $e:expr) => {{
|
||||
#[cfg(feature = "failpoints")]
|
||||
{
|
||||
if let Some(res) = fail::eval($name, $e) {
|
||||
return res;
|
||||
}
|
||||
}
|
||||
}};
|
||||
($name:expr, $cond:expr, $e:expr) => {{
|
||||
#[cfg(feature = "failpoints")]
|
||||
{
|
||||
if $cond {
|
||||
fail::fail_point!($name, $e);
|
||||
}
|
||||
}
|
||||
}};
|
||||
}
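The exported macro above compiles to a no-op unless the `failpoints` feature is enabled, in which case a registered scenario can force an early return. A hedged sketch of a call site, mirroring `save_metas` and `FieldSerializer::close_term` in this diff (the `flush_for_test` name is illustrative):

fn flush_for_test(buffer: &mut Vec<u8>) -> std::io::Result<()> {
    // With `failpoints` disabled this expands to nothing; with it enabled,
    // configuring the "flush_for_test" fail point makes this return an error.
    crate::fail_point!("flush_for_test", |msg: Option<String>| {
        Err(std::io::Error::new(
            std::io::ErrorKind::Other,
            msg.unwrap_or_else(|| "Undefined".to_string()),
        ))
    });
    buffer.clear();
    Ok(())
}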
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
use common::{BinarySerializable, FixedSize};
|
||||
use query_grammar::{UserInputAst, UserInputLeaf, UserInputLiteral};
|
||||
use rand::distributions::{Bernoulli, Uniform};
|
||||
use rand::rngs::StdRng;
|
||||
use rand::{Rng, SeedableRng};
|
||||
@@ -856,6 +887,95 @@ pub mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_searcher_on_json_field_with_type_inference() {
|
||||
// When indexing and searching a json value, we infer its type.
|
||||
// This test aims to check that type inference is consistent between indexing and search.
|
||||
// Inference order is date, i64, u64, f64, bool.
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_field = schema_builder.add_json_field("json", STORED | TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let json_val: serde_json::Map<String, serde_json::Value> = serde_json::from_str(
|
||||
r#"{
|
||||
"signed": 2,
|
||||
"float": 2.0,
|
||||
"unsigned": 10000000000000,
|
||||
"date": "1985-04-12T23:20:50.52Z",
|
||||
"bool": true
|
||||
}"#,
|
||||
)
|
||||
.unwrap();
|
||||
let doc = doc!(json_field=>json_val);
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut writer = index.writer_for_tests().unwrap();
|
||||
writer.add_document(doc).unwrap();
|
||||
writer.commit().unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let get_doc_ids = |user_input_literal: UserInputLiteral| {
|
||||
let query_parser = crate::query::QueryParser::for_index(&index, Vec::new());
|
||||
let query = query_parser
|
||||
.build_query_from_user_input_ast(UserInputAst::from(UserInputLeaf::Literal(
|
||||
user_input_literal,
|
||||
)))
|
||||
.unwrap();
|
||||
searcher
|
||||
.search(&query, &TEST_COLLECTOR_WITH_SCORE)
|
||||
.map(|topdocs| topdocs.docs().to_vec())
|
||||
.unwrap()
|
||||
};
|
||||
{
|
||||
let user_input_literal = UserInputLiteral {
|
||||
field_name: Some("json.signed".to_string()),
|
||||
phrase: "2".to_string(),
|
||||
delimiter: crate::query_grammar::Delimiter::None,
|
||||
slop: 0,
|
||||
prefix: false,
|
||||
};
|
||||
assert_eq!(get_doc_ids(user_input_literal), vec![DocAddress::new(0, 0)]);
|
||||
}
|
||||
{
|
||||
let user_input_literal = UserInputLiteral {
|
||||
field_name: Some("json.float".to_string()),
|
||||
phrase: "2.0".to_string(),
|
||||
delimiter: crate::query_grammar::Delimiter::None,
|
||||
slop: 0,
|
||||
prefix: false,
|
||||
};
|
||||
assert_eq!(get_doc_ids(user_input_literal), vec![DocAddress::new(0, 0)]);
|
||||
}
|
||||
{
|
||||
let user_input_literal = UserInputLiteral {
|
||||
field_name: Some("json.date".to_string()),
|
||||
phrase: "1985-04-12T23:20:50.52Z".to_string(),
|
||||
delimiter: crate::query_grammar::Delimiter::None,
|
||||
slop: 0,
|
||||
prefix: false,
|
||||
};
|
||||
assert_eq!(get_doc_ids(user_input_literal), vec![DocAddress::new(0, 0)]);
|
||||
}
|
||||
{
|
||||
let user_input_literal = UserInputLiteral {
|
||||
field_name: Some("json.unsigned".to_string()),
|
||||
phrase: "10000000000000".to_string(),
|
||||
delimiter: crate::query_grammar::Delimiter::None,
|
||||
slop: 0,
|
||||
prefix: false,
|
||||
};
|
||||
assert_eq!(get_doc_ids(user_input_literal), vec![DocAddress::new(0, 0)]);
|
||||
}
|
||||
{
|
||||
let user_input_literal = UserInputLiteral {
|
||||
field_name: Some("json.bool".to_string()),
|
||||
phrase: "true".to_string(),
|
||||
delimiter: crate::query_grammar::Delimiter::None,
|
||||
slop: 0,
|
||||
prefix: false,
|
||||
};
|
||||
assert_eq!(get_doc_ids(user_input_literal), vec![DocAddress::new(0, 0)]);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_doc_macro() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
@@ -162,7 +162,7 @@ pub mod tests {
|
||||
let index = Index::create_in_ram(schema);
|
||||
index
|
||||
.tokenizers()
|
||||
.register("simple_no_truncation", SimpleTokenizer);
|
||||
.register("simple_no_truncation", SimpleTokenizer::default());
|
||||
let reader = index.reader()?;
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
|
||||
@@ -194,7 +194,7 @@ pub mod tests {
|
||||
let index = Index::create_in_ram(schema);
|
||||
index
|
||||
.tokenizers()
|
||||
.register("simple_no_truncation", SimpleTokenizer);
|
||||
.register("simple_no_truncation", SimpleTokenizer::default());
|
||||
let reader = index.reader()?;
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
|
||||
|
||||
@@ -2,7 +2,6 @@ use std::cmp::Ordering;
|
||||
use std::io::{self, Write};
|
||||
|
||||
use common::{BinarySerializable, CountingWriter, VInt};
|
||||
use fail::fail_point;
|
||||
|
||||
use super::TermInfo;
|
||||
use crate::core::Segment;
|
||||
@@ -205,7 +204,7 @@ impl<'a> FieldSerializer<'a> {
|
||||
/// If the current block is incomplete, it needs to be encoded
|
||||
/// using `VInt` encoding.
|
||||
pub fn close_term(&mut self) -> io::Result<()> {
|
||||
fail_point!("FieldSerializer::close_term", |msg: Option<String>| {
|
||||
crate::fail_point!("FieldSerializer::close_term", |msg: Option<String>| {
|
||||
Err(io::Error::new(io::ErrorKind::Other, format!("{msg:?}")))
|
||||
});
|
||||
if self.term_open {
|
||||
|
||||
@@ -4,9 +4,7 @@ use std::collections::{BinaryHeap, HashMap};
|
||||
use crate::query::bm25::idf;
|
||||
use crate::query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery};
|
||||
use crate::schema::{Field, FieldType, IndexRecordOption, Term, Value};
|
||||
use crate::tokenizer::{
|
||||
BoxTokenStream, FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer,
|
||||
};
|
||||
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer};
|
||||
use crate::{DocAddress, Result, Searcher, TantivyError};
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
@@ -192,45 +190,48 @@ impl MoreLikeThis {
|
||||
})
|
||||
.collect::<Result<Vec<_>>>()?;
|
||||
for fake_str in facets {
|
||||
FacetTokenizer.token_stream(fake_str).process(&mut |token| {
|
||||
if self.is_noise_word(token.text.clone()) {
|
||||
let term = Term::from_field_text(field, &token.text);
|
||||
*term_frequencies.entry(term).or_insert(0) += 1;
|
||||
}
|
||||
});
|
||||
FacetTokenizer::default()
|
||||
.token_stream(fake_str)
|
||||
.process(&mut |token| {
|
||||
if self.is_noise_word(token.text.clone()) {
|
||||
let term = Term::from_field_text(field, &token.text);
|
||||
*term_frequencies.entry(term).or_insert(0) += 1;
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
FieldType::Str(text_options) => {
|
||||
let mut token_streams: Vec<BoxTokenStream> = vec![];
|
||||
|
||||
for value in values {
|
||||
match value {
|
||||
Value::PreTokStr(tok_str) => {
|
||||
token_streams.push(PreTokenizedStream::from(tok_str.clone()).into());
|
||||
let mut token_stream = PreTokenizedStream::from(tok_str.clone());
|
||||
token_stream.process(&mut |token| {
|
||||
if !self.is_noise_word(token.text.clone()) {
|
||||
let term = Term::from_field_text(field, &token.text);
|
||||
*term_frequencies.entry(term).or_insert(0) += 1;
|
||||
}
|
||||
});
|
||||
}
|
||||
Value::Str(ref text) => {
|
||||
if let Some(tokenizer) = text_options
|
||||
if let Some(mut tokenizer) = text_options
|
||||
.get_indexing_options()
|
||||
.map(|text_indexing_options| {
|
||||
text_indexing_options.tokenizer().to_string()
|
||||
})
|
||||
.and_then(|tokenizer_name| tokenizer_manager.get(&tokenizer_name))
|
||||
{
|
||||
token_streams.push(tokenizer.token_stream(text));
|
||||
let mut token_stream = tokenizer.token_stream(text);
|
||||
token_stream.process(&mut |token| {
|
||||
if !self.is_noise_word(token.text.clone()) {
|
||||
let term = Term::from_field_text(field, &token.text);
|
||||
*term_frequencies.entry(term).or_insert(0) += 1;
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
|
||||
for mut token_stream in token_streams {
|
||||
token_stream.process(&mut |token| {
|
||||
if !self.is_noise_word(token.text.clone()) {
|
||||
let term = Term::from_field_text(field, &token.text);
|
||||
*term_frequencies.entry(term).or_insert(0) += 1;
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
FieldType::U64(_) => {
|
||||
for value in values {
|
||||
|
||||
@@ -88,9 +88,6 @@ impl PhrasePrefixQuery {
|
||||
/// a specialized type [`PhraseQueryWeight`] instead of a Boxed trait.
|
||||
/// If the query was only one term long, this returns `None` whereas [`Query::weight`]
|
||||
/// returns a boxed [`RangeWeight`]
|
||||
///
|
||||
/// Returns `None`, if phrase_terms is empty, which happens if the phrase prefix query was
|
||||
/// built with a single term.
|
||||
pub(crate) fn phrase_prefix_query_weight(
|
||||
&self,
|
||||
enable_scoring: EnableScoring<'_>,
|
||||
|
||||
@@ -8,7 +8,11 @@ use crate::Score;
|
||||
#[derive(Clone)]
|
||||
pub enum LogicalLiteral {
|
||||
Term(Term),
|
||||
Phrase(Vec<(usize, Term)>, u32),
|
||||
Phrase {
|
||||
terms: Vec<(usize, Term)>,
|
||||
slop: u32,
|
||||
prefix: bool,
|
||||
},
|
||||
Range {
|
||||
field: String,
|
||||
value_type: Type,
|
||||
@@ -79,10 +83,16 @@ impl fmt::Debug for LogicalLiteral {
|
||||
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
|
||||
match *self {
|
||||
LogicalLiteral::Term(ref term) => write!(formatter, "{term:?}"),
|
||||
LogicalLiteral::Phrase(ref terms, slop) => {
|
||||
LogicalLiteral::Phrase {
|
||||
ref terms,
|
||||
slop,
|
||||
prefix,
|
||||
} => {
|
||||
write!(formatter, "\"{terms:?}\"")?;
|
||||
if slop > 0 {
|
||||
write!(formatter, "~{slop:?}")
|
||||
} else if prefix {
|
||||
write!(formatter, "*")
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -15,21 +15,12 @@ use crate::core::json_utils::{
|
||||
use crate::core::Index;
|
||||
use crate::query::range_query::{is_type_valid_for_fastfield_range_query, RangeQuery};
|
||||
use crate::query::{
|
||||
AllQuery,
|
||||
BooleanQuery,
|
||||
BoostQuery,
|
||||
EmptyQuery,
|
||||
FuzzyTermQuery,
|
||||
Occur,
|
||||
PhraseQuery,
|
||||
Query,
|
||||
// RangeQuery,
|
||||
TermQuery,
|
||||
TermSetQuery,
|
||||
AllQuery, BooleanQuery, BoostQuery, EmptyQuery, FuzzyTermQuery, Occur, PhrasePrefixQuery,
|
||||
PhraseQuery, Query, TermQuery, TermSetQuery,
|
||||
};
|
||||
use crate::schema::{
|
||||
Facet, FacetParseError, Field, FieldType, IndexRecordOption, IntoIpv6Addr, JsonObjectOptions,
|
||||
Schema, Term, Type,
|
||||
Schema, Term, TextFieldIndexing, Type,
|
||||
};
|
||||
use crate::time::format_description::well_known::Rfc3339;
|
||||
use crate::time::OffsetDateTime;
|
||||
@@ -79,6 +70,17 @@ pub enum QueryParserError {
|
||||
/// have any positions indexed.
|
||||
#[error("The field '{0}' does not have positions indexed")]
|
||||
FieldDoesNotHavePositionsIndexed(String),
|
||||
/// A phrase-prefix query requires at least two terms
|
||||
#[error(
|
||||
"The phrase '{phrase:?}' does not produce at least two terms using the tokenizer \
|
||||
'{tokenizer:?}'"
|
||||
)]
|
||||
PhrasePrefixRequiresAtLeastTwoTerms {
|
||||
/// The phrase which triggered the issue
|
||||
phrase: String,
|
||||
/// The tokenizer configured for the field
|
||||
tokenizer: String,
|
||||
},
|
||||
/// The tokenizer for the given field is unknown
|
||||
/// The two argument strings are the name of the field and the name of the tokenizer.
|
||||
#[error("The tokenizer '{tokenizer:?}' for the field '{field:?}' is unknown")]
|
||||
@@ -194,6 +196,10 @@ fn trim_ast(logical_ast: LogicalAst) -> Option<LogicalAst> {
|
||||
///
|
||||
/// Phrase terms support the `~` slop operator, which allows setting the phrase's matching
|
||||
/// distance in words. `"big wolf"~1` will return documents containing the phrase `"big bad wolf"`.
|
||||
///
|
||||
/// Phrase terms also support the `*` prefix operator which switches the phrase's matching
|
||||
/// to consider all documents which contain the last term as a prefix, e.g. `"big bad wo"*` will
|
||||
/// match `"big bad wolf"`.
|
||||
#[derive(Clone)]
|
||||
pub struct QueryParser {
|
||||
schema: Schema,
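To make the new `*` phrase-prefix operator concrete, here is a hedged sketch following the pattern of the tests further down (the `title` field and sample phrase are illustrative):

use tantivy::query::QueryParser;
use tantivy::schema::{Schema, TEXT};
use tantivy::Index;

let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field("title", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let query_parser = QueryParser::for_index(&index, vec![title]);
// Builds a PhrasePrefixQuery: "big bad" followed by any term starting with "wo",
// so a document containing "big bad wolf" matches.
let _query = query_parser.parse_query("\"big bad wo\"*").unwrap();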
|
||||
@@ -397,7 +403,7 @@ impl QueryParser {
|
||||
// This should have been seen earlier really.
|
||||
QueryParserError::FieldNotIndexed(field_entry.name().to_string())
|
||||
})?;
|
||||
let text_analyzer =
|
||||
let mut text_analyzer =
|
||||
self.tokenizer_manager
|
||||
.get(option.tokenizer())
|
||||
.ok_or_else(|| QueryParserError::UnknownTokenizer {
|
||||
@@ -446,6 +452,7 @@ impl QueryParser {
|
||||
json_path: &str,
|
||||
phrase: &str,
|
||||
slop: u32,
|
||||
prefix: bool,
|
||||
) -> Result<Vec<LogicalLiteral>, QueryParserError> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let field_type = field_entry.field_type();
|
||||
@@ -486,25 +493,25 @@ impl QueryParser {
|
||||
Ok(vec![LogicalLiteral::Term(dt_term)])
|
||||
}
|
||||
FieldType::Str(ref str_options) => {
|
||||
let option = str_options.get_indexing_options().ok_or_else(|| {
|
||||
let indexing_options = str_options.get_indexing_options().ok_or_else(|| {
|
||||
// This should have been seen earlier really.
|
||||
QueryParserError::FieldNotIndexed(field_name.to_string())
|
||||
})?;
|
||||
let text_analyzer =
|
||||
self.tokenizer_manager
|
||||
.get(option.tokenizer())
|
||||
.ok_or_else(|| QueryParserError::UnknownTokenizer {
|
||||
field: field_name.to_string(),
|
||||
tokenizer: option.tokenizer().to_string(),
|
||||
})?;
|
||||
let index_record_option = option.index_option();
|
||||
let mut text_analyzer = self
|
||||
.tokenizer_manager
|
||||
.get(indexing_options.tokenizer())
|
||||
.ok_or_else(|| QueryParserError::UnknownTokenizer {
|
||||
field: field_name.to_string(),
|
||||
tokenizer: indexing_options.tokenizer().to_string(),
|
||||
})?;
|
||||
Ok(generate_literals_for_str(
|
||||
field_name,
|
||||
field,
|
||||
phrase,
|
||||
slop,
|
||||
&text_analyzer,
|
||||
index_record_option,
|
||||
prefix,
|
||||
indexing_options,
|
||||
&mut text_analyzer,
|
||||
)?
|
||||
.into_iter()
|
||||
.collect())
|
||||
@@ -661,9 +668,13 @@ impl QueryParser {
|
||||
self.compute_path_triplets_for_literal(&literal)?;
|
||||
let mut asts: Vec<LogicalAst> = Vec::new();
|
||||
for (field, json_path, phrase) in term_phrases {
|
||||
for ast in
|
||||
self.compute_logical_ast_for_leaf(field, json_path, phrase, literal.slop)?
|
||||
{
|
||||
for ast in self.compute_logical_ast_for_leaf(
|
||||
field,
|
||||
json_path,
|
||||
phrase,
|
||||
literal.slop,
|
||||
literal.prefix,
|
||||
)? {
|
||||
// Apply some field specific boost defined at the query parser level.
|
||||
let boost = self.field_boost(field);
|
||||
asts.push(LogicalAst::Leaf(Box::new(ast)).boost(boost));
|
||||
@@ -753,9 +764,17 @@ fn convert_literal_to_query(
|
||||
Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs))
|
||||
}
|
||||
}
|
||||
LogicalLiteral::Phrase(term_with_offsets, slop) => Box::new(
|
||||
PhraseQuery::new_with_offset_and_slop(term_with_offsets, slop),
|
||||
),
|
||||
LogicalLiteral::Phrase {
|
||||
terms,
|
||||
slop,
|
||||
prefix,
|
||||
} => {
|
||||
if prefix {
|
||||
Box::new(PhrasePrefixQuery::new_with_offset(terms))
|
||||
} else {
|
||||
Box::new(PhraseQuery::new_with_offset_and_slop(terms, slop))
|
||||
}
|
||||
}
|
||||
LogicalLiteral::Range {
|
||||
field,
|
||||
value_type,
|
||||
@@ -774,8 +793,9 @@ fn generate_literals_for_str(
|
||||
field: Field,
|
||||
phrase: &str,
|
||||
slop: u32,
|
||||
text_analyzer: &TextAnalyzer,
|
||||
index_record_option: IndexRecordOption,
|
||||
prefix: bool,
|
||||
indexing_options: &TextFieldIndexing,
|
||||
text_analyzer: &mut TextAnalyzer,
|
||||
) -> Result<Option<LogicalLiteral>, QueryParserError> {
|
||||
let mut terms: Vec<(usize, Term)> = Vec::new();
|
||||
let mut token_stream = text_analyzer.token_stream(phrase);
|
||||
@@ -784,18 +804,28 @@ fn generate_literals_for_str(
|
||||
terms.push((token.position, term));
|
||||
});
|
||||
if terms.len() <= 1 {
|
||||
if prefix {
|
||||
return Err(QueryParserError::PhrasePrefixRequiresAtLeastTwoTerms {
|
||||
phrase: phrase.to_owned(),
|
||||
tokenizer: indexing_options.tokenizer().to_owned(),
|
||||
});
|
||||
}
|
||||
let term_literal_opt = terms
|
||||
.into_iter()
|
||||
.next()
|
||||
.map(|(_, term)| LogicalLiteral::Term(term));
|
||||
return Ok(term_literal_opt);
|
||||
}
|
||||
if !index_record_option.has_positions() {
|
||||
if !indexing_options.index_option().has_positions() {
|
||||
return Err(QueryParserError::FieldDoesNotHavePositionsIndexed(
|
||||
field_name.to_string(),
|
||||
));
|
||||
}
|
||||
Ok(Some(LogicalLiteral::Phrase(terms, slop)))
|
||||
Ok(Some(LogicalLiteral::Phrase {
|
||||
terms,
|
||||
slop,
|
||||
prefix,
|
||||
}))
|
||||
}
|
||||
|
||||
fn generate_literals_for_json_object(
|
||||
@@ -810,7 +840,7 @@ fn generate_literals_for_json_object(
|
||||
// This should have been seen earlier really.
|
||||
QueryParserError::FieldNotIndexed(field_name.to_string())
|
||||
})?;
|
||||
let text_analyzer = tokenizer_manager
|
||||
let mut text_analyzer = tokenizer_manager
|
||||
.get(text_options.tokenizer())
|
||||
.ok_or_else(|| QueryParserError::UnknownTokenizer {
|
||||
field: field_name.to_string(),
|
||||
@@ -828,7 +858,7 @@ fn generate_literals_for_json_object(
|
||||
if let Some(term) = convert_to_fast_value_and_get_term(&mut json_term_writer, phrase) {
|
||||
logical_literals.push(LogicalLiteral::Term(term));
|
||||
}
|
||||
let terms = set_string_and_get_terms(&mut json_term_writer, phrase, &text_analyzer);
|
||||
let terms = set_string_and_get_terms(&mut json_term_writer, phrase, &mut text_analyzer);
|
||||
drop(json_term_writer);
|
||||
if terms.len() <= 1 {
|
||||
for (_, term) in terms {
|
||||
@@ -841,7 +871,11 @@ fn generate_literals_for_json_object(
|
||||
field_name.to_string(),
|
||||
));
|
||||
}
|
||||
logical_literals.push(LogicalLiteral::Phrase(terms, 0));
|
||||
logical_literals.push(LogicalLiteral::Phrase {
|
||||
terms,
|
||||
slop: 0,
|
||||
prefix: false,
|
||||
});
|
||||
Ok(logical_literals)
|
||||
}
|
||||
|
||||
@@ -922,10 +956,10 @@ mod test {
|
||||
.iter()
|
||||
.flat_map(|field_name| schema.get_field(field_name))
|
||||
.collect();
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
let tokenizer_manager = TokenizerManager::default_for_indexing();
|
||||
tokenizer_manager.register(
|
||||
"en_with_stop_words",
|
||||
TextAnalyzer::builder(SimpleTokenizer)
|
||||
TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
.filter(LowerCaser)
|
||||
.filter(StopWordFilter::remove(vec!["the".to_string()]))
|
||||
.build(),
|
||||
@@ -1169,7 +1203,7 @@ mod test {
|
||||
fn test_json_field_possibly_a_number() {
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"json.titi:5",
|
||||
r#"(Term(field=14, type=Json, path=titi, type=U64, 5) Term(field=14, type=Json, path=titi, type=Str, "5"))"#,
|
||||
r#"(Term(field=14, type=Json, path=titi, type=I64, 5) Term(field=14, type=Json, path=titi, type=Str, "5"))"#,
|
||||
true,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
@@ -1177,6 +1211,11 @@ mod test {
|
||||
r#"(Term(field=14, type=Json, path=titi, type=I64, -5) Term(field=14, type=Json, path=titi, type=Str, "5"))"#, //< Yes this is a bit weird after going through the tokenizer we lose the "-".
|
||||
true,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"json.titi:10000000000000000000",
|
||||
r#"(Term(field=14, type=Json, path=titi, type=U64, 10000000000000000000) Term(field=14, type=Json, path=titi, type=Str, "10000000000000000000"))"#,
|
||||
true,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"json.titi:-5.2",
|
||||
r#"(Term(field=14, type=Json, path=titi, type=F64, -5.2) "[(0, Term(field=14, type=Json, path=titi, type=Str, "5")), (1, Term(field=14, type=Json, path=titi, type=Str, "2"))]")"#,
|
||||
@@ -1226,7 +1265,7 @@ mod test {
|
||||
fn test_json_default() {
|
||||
test_query_to_logical_ast_with_default_json(
|
||||
"titi:4",
|
||||
"(Term(field=14, type=Json, path=titi, type=U64, 4) Term(field=14, type=Json, \
|
||||
"(Term(field=14, type=Json, path=titi, type=I64, 4) Term(field=14, type=Json, \
|
||||
path=titi, type=Str, \"4\"))",
|
||||
false,
|
||||
);
|
||||
@@ -1248,7 +1287,7 @@ mod test {
|
||||
for conjunction in [false, true] {
|
||||
test_query_to_logical_ast_with_default_json(
|
||||
"json:4",
|
||||
r#"(Term(field=14, type=Json, path=, type=U64, 4) Term(field=14, type=Json, path=, type=Str, "4"))"#,
|
||||
r#"(Term(field=14, type=Json, path=, type=I64, 4) Term(field=14, type=Json, path=, type=Str, "4"))"#,
|
||||
conjunction,
|
||||
);
|
||||
}
|
||||
@@ -1408,7 +1447,7 @@ mod test {
|
||||
let title = schema_builder.add_text_field("title", text_options);
|
||||
let schema = schema_builder.build();
|
||||
let default_fields = vec![title];
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
let tokenizer_manager = TokenizerManager::default_for_indexing();
|
||||
let query_parser = QueryParser::new(schema, default_fields, tokenizer_manager);
|
||||
|
||||
assert_matches!(
|
||||
@@ -1429,7 +1468,7 @@ mod test {
|
||||
let index = Index::create_in_ram(schema);
|
||||
index
|
||||
.tokenizers()
|
||||
.register("customtokenizer", SimpleTokenizer);
|
||||
.register("customtokenizer", SimpleTokenizer::default());
|
||||
let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
assert_eq!(
|
||||
query_parser.parse_query("title:\"happy tax\"").unwrap_err(),
|
||||
@@ -1583,7 +1622,8 @@ mod test {
|
||||
let mut schema_builder = Schema::builder();
|
||||
schema_builder.add_text_field(r#"a\.b"#, STRING);
|
||||
let schema = schema_builder.build();
|
||||
let query_parser = QueryParser::new(schema, Vec::new(), TokenizerManager::default());
|
||||
let query_parser =
|
||||
QueryParser::new(schema, Vec::new(), TokenizerManager::default_for_indexing());
|
||||
let query = query_parser.parse_query(r#"a\.b:hello"#).unwrap();
|
||||
assert_eq!(
|
||||
format!("{query:?}"),
|
||||
@@ -1600,8 +1640,11 @@ mod test {
|
||||
schema_builder.add_text_field("first.toto.titi", STRING);
|
||||
schema_builder.add_text_field("third.a.b.c", STRING);
|
||||
let schema = schema_builder.build();
|
||||
let query_parser =
|
||||
QueryParser::new(schema.clone(), Vec::new(), TokenizerManager::default());
|
||||
let query_parser = QueryParser::new(
|
||||
schema.clone(),
|
||||
Vec::new(),
|
||||
TokenizerManager::default_for_indexing(),
|
||||
);
|
||||
assert_eq!(
|
||||
query_parser.split_full_path("first.toto"),
|
||||
Some((schema.get_field("first.toto").unwrap(), ""))
|
||||
@@ -1643,6 +1686,48 @@ mod test {
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_phrase_prefix() {
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"\"big bad wo\"*",
|
||||
r#"("[(0, Term(field=0, type=Str, "big")), (1, Term(field=0, type=Str, "bad")), (2, Term(field=0, type=Str, "wo"))]"* "[(0, Term(field=1, type=Str, "big")), (1, Term(field=1, type=Str, "bad")), (2, Term(field=1, type=Str, "wo"))]"*)"#,
|
||||
false,
|
||||
);
|
||||
|
||||
let query_parser = make_query_parser();
|
||||
let query = query_parser.parse_query("\"big bad wo\"*").unwrap();
|
||||
assert_eq!(
|
||||
format!("{query:?}"),
|
||||
"BooleanQuery { subqueries: [(Should, PhrasePrefixQuery { field: Field(0), \
|
||||
phrase_terms: [(0, Term(field=0, type=Str, \"big\")), (1, Term(field=0, type=Str, \
|
||||
\"bad\"))], prefix: (2, Term(field=0, type=Str, \"wo\")), max_expansions: 50 }), \
|
||||
(Should, PhrasePrefixQuery { field: Field(1), phrase_terms: [(0, Term(field=1, \
|
||||
type=Str, \"big\")), (1, Term(field=1, type=Str, \"bad\"))], prefix: (2, \
|
||||
Term(field=1, type=Str, \"wo\")), max_expansions: 50 })] }"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_phrase_prefix_too_short() {
|
||||
let err = parse_query_to_logical_ast("\"wo\"*", true).unwrap_err();
|
||||
assert_eq!(
|
||||
err,
|
||||
QueryParserError::PhrasePrefixRequiresAtLeastTwoTerms {
|
||||
phrase: "wo".to_owned(),
|
||||
tokenizer: "default".to_owned()
|
||||
}
|
||||
);
|
||||
|
||||
let err = parse_query_to_logical_ast("\"\"*", true).unwrap_err();
|
||||
assert_eq!(
|
||||
err,
|
||||
QueryParserError::PhrasePrefixRequiresAtLeastTwoTerms {
|
||||
phrase: "".to_owned(),
|
||||
tokenizer: "default".to_owned()
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_term_set_query() {
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
|
||||
@@ -472,6 +472,7 @@ mod tests {
|
||||
|
||||
use super::RangeQuery;
|
||||
use crate::collector::{Count, TopDocs};
|
||||
use crate::indexer::NoMergePolicy;
|
||||
use crate::query::QueryParser;
|
||||
use crate::schema::{Document, Field, IntoIpv6Addr, Schema, FAST, INDEXED, STORED, TEXT};
|
||||
use crate::{doc, Index};
|
||||
@@ -547,7 +548,8 @@ mod tests {
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(2, 60_000_000)?;
|
||||
let mut index_writer = index.writer_with_num_threads(1, 60_000_000)?;
|
||||
index_writer.set_merge_policy(Box::new(NoMergePolicy));
|
||||
|
||||
for i in 1..100 {
|
||||
let mut doc = Document::new();
|
||||
@@ -557,6 +559,9 @@ mod tests {
|
||||
}
|
||||
}
|
||||
index_writer.add_document(doc)?;
|
||||
if i == 10 {
|
||||
index_writer.commit()?;
|
||||
}
|
||||
}
|
||||
|
||||
index_writer.commit()?;
|
||||
|
||||
@@ -31,9 +31,10 @@ impl IPFastFieldRangeWeight {
|
||||
|
||||
impl Weight for IPFastFieldRangeWeight {
|
||||
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
|
||||
let Some(ip_addr_column): Option<Column<Ipv6Addr>> = reader.fast_fields()
|
||||
.column_opt(&self.field)? else {
|
||||
return Ok(Box::new(EmptyScorer))
|
||||
let Some(ip_addr_column): Option<Column<Ipv6Addr>> =
|
||||
reader.fast_fields().column_opt(&self.field)?
|
||||
else {
|
||||
return Ok(Box::new(EmptyScorer));
|
||||
};
|
||||
let value_range = bound_to_value_range(
|
||||
&self.lower_bound,
|
||||
|
||||
@@ -71,7 +71,9 @@ impl Weight for FastFieldRangeWeight {
|
||||
let column_type_opt_ref: Option<&[ColumnType]> = column_type_opt
|
||||
.as_ref()
|
||||
.map(|column_types| column_types.as_slice());
|
||||
let Some((column, _)) = fast_field_reader.u64_lenient_for_type(column_type_opt_ref, &self.field)? else {
|
||||
let Some((column, _)) =
|
||||
fast_field_reader.u64_lenient_for_type(column_type_opt_ref, &self.field)?
|
||||
else {
|
||||
return Ok(Box::new(EmptyScorer));
|
||||
};
|
||||
let value_range = bound_to_value_range(
|
||||
|
||||
@@ -72,6 +72,14 @@ impl Query for TermSetQuery {
|
||||
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
|
||||
Ok(Box::new(self.specialized_weight(enable_scoring.schema())?))
|
||||
}
|
||||
|
||||
fn query_terms<'a>(&'a self, visitor: &mut dyn FnMut(&'a Term, bool)) {
|
||||
for terms in self.terms_map.values() {
|
||||
for term in terms {
|
||||
visitor(term, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
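The newly added `query_terms` visitor reports every term the set query matches against, always with `false` since term sets never need positions. A hedged, illustrative sketch (field name and terms are assumptions):

use tantivy::query::{Query, TermSetQuery};
use tantivy::schema::{Schema, Term, TEXT};

let mut schema_builder = Schema::builder();
let body = schema_builder.add_text_field("body", TEXT);
let _schema = schema_builder.build();
let query = TermSetQuery::new(vec![
    Term::from_field_text(body, "fox"),
    Term::from_field_text(body, "wolf"),
]);
let mut visited = Vec::new();
// Each visited term is reported once; the bool flag means "positions needed".
query.query_terms(&mut |term, _needs_positions| visited.push(term.clone()));
assert_eq!(visited.len(), 2);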
|
||||
|
||||
struct SetDfaWrapper(Map<Vec<u8>>);
|
||||
|
||||
@@ -8,7 +8,7 @@ use serde::{Deserialize, Serialize};
|
||||
use crate::schema::flags::{FastFlag, IndexedFlag, SchemaFlagList, StoredFlag};
|
||||
|
||||
/// The precision of the indexed date/time values in the inverted index.
|
||||
pub const DATE_TIME_PRECISION_INDEXED: DateTimePrecision = DateTimePrecision::Second;
|
||||
pub const DATE_TIME_PRECISION_INDEXED: DateTimePrecision = DateTimePrecision::Seconds;
|
||||
|
||||
/// Defines how DateTime field should be handled by tantivy.
|
||||
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)]
|
||||
|
||||
@@ -4,7 +4,7 @@ use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::text_options::{FastFieldTextOptions, TokenizerName};
|
||||
use crate::schema::flags::{FastFlag, SchemaFlagList, StoredFlag};
|
||||
use crate::schema::{TextFieldIndexing, TextOptions};
|
||||
use crate::schema::{TextFieldIndexing, TextOptions, DEFAULT_FAST_FIELD_TOKENIZER};
|
||||
|
||||
/// The `JsonObjectOptions` makes it possible to
|
||||
/// configure how a json object field should be indexed and stored.
|
||||
@@ -58,20 +58,19 @@ impl JsonObjectOptions {
|
||||
/// Returns true if and only if the json object fields are
|
||||
/// to be treated as fast fields.
|
||||
pub fn is_fast(&self) -> bool {
|
||||
matches!(self.fast, FastFieldTextOptions::IsEnabled(true))
|
||||
|| matches!(
|
||||
&self.fast,
|
||||
FastFieldTextOptions::EnabledWithTokenizer { with_tokenizer: _ }
|
||||
)
|
||||
match self.fast {
|
||||
FastFieldTextOptions::Disabled => false,
|
||||
FastFieldTextOptions::Enabled { .. } => true,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the name of the fast field tokenizer, if one is configured.
|
||||
pub fn get_fast_field_tokenizer_name(&self) -> Option<&str> {
|
||||
match &self.fast {
|
||||
FastFieldTextOptions::IsEnabled(true) | FastFieldTextOptions::IsEnabled(false) => None,
|
||||
FastFieldTextOptions::EnabledWithTokenizer {
|
||||
with_tokenizer: tokenizer,
|
||||
} => Some(tokenizer.name()),
|
||||
FastFieldTextOptions::Disabled => None,
|
||||
FastFieldTextOptions::Enabled {
|
||||
tokenizer: with_tokenizer,
|
||||
} => Some(with_tokenizer.name()),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -130,15 +129,11 @@ impl JsonObjectOptions {
|
||||
/// [`TermDictionary::ord_to_term()`](crate::termdict::TermDictionary::ord_to_term)
|
||||
/// from the dictionary.
|
||||
#[must_use]
|
||||
pub fn set_fast(mut self, tokenizer_name: Option<&str>) -> Self {
|
||||
if let Some(tokenizer) = tokenizer_name {
|
||||
let tokenizer = TokenizerName::from_name(tokenizer);
|
||||
self.fast = FastFieldTextOptions::EnabledWithTokenizer {
|
||||
with_tokenizer: tokenizer,
|
||||
}
|
||||
} else {
|
||||
self.fast = FastFieldTextOptions::IsEnabled(true);
|
||||
}
|
||||
pub fn set_fast(mut self, tokenizer_name: &str) -> Self {
|
||||
let with_tokenizer = TokenizerName::from_name(tokenizer_name);
|
||||
self.fast = FastFieldTextOptions::Enabled {
|
||||
tokenizer: with_tokenizer,
|
||||
};
|
||||
self
|
||||
}
|
||||
|
||||
@@ -166,7 +161,9 @@ impl From<FastFlag> for JsonObjectOptions {
|
||||
JsonObjectOptions {
|
||||
stored: false,
|
||||
indexing: None,
|
||||
fast: FastFieldTextOptions::IsEnabled(true),
|
||||
fast: FastFieldTextOptions::Enabled {
|
||||
tokenizer: TokenizerName::from_static(DEFAULT_FAST_FIELD_TOKENIZER),
|
||||
},
|
||||
expand_dots_enabled: false,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
//! Schema definition for tantivy's indices.
|
||||
//!
|
||||
//! # Setting your schema in Tantivy
|
||||
//!
|
||||
//!
|
||||
//! Tantivy has a very strict schema.
|
||||
//! The schema defines information about the fields your index contains, that is, for each field:
|
||||
//!
|
||||
@@ -153,6 +153,8 @@ pub use self::term::{Term, ValueBytes, JSON_END_OF_PATH};
|
||||
pub use self::text_options::{TextFieldIndexing, TextOptions, STRING, TEXT};
|
||||
pub use self::value::Value;
|
||||
|
||||
pub(crate) const DEFAULT_FAST_FIELD_TOKENIZER: &str = "default";
|
||||
|
||||
/// Validator for a potential `field_name`.
|
||||
/// Returns true if the name can be used as a field name.
|
||||
///
|
||||
|
||||
@@ -973,7 +973,7 @@ mod tests {
|
||||
"fieldnorms": true,
|
||||
"fast": true,
|
||||
"stored": true,
|
||||
"precision": "second"
|
||||
"precision": "seconds"
|
||||
}
|
||||
},
|
||||
{
|
||||
|
||||
@@ -24,19 +24,68 @@ pub struct TextOptions {
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(untagged)]
|
||||
#[serde(
|
||||
into = "FastFieldTextOptionsForSerialization",
|
||||
from = "FastFieldTextOptionsForSerialization"
|
||||
)]
|
||||
/// Enum to control the fast field setting of a text field.
|
||||
#[derive(Default)]
|
||||
pub(crate) enum FastFieldTextOptions {
|
||||
/// Flag to enable/disable
|
||||
IsEnabled(bool),
|
||||
/// Fastfield disabled
|
||||
#[default]
|
||||
Disabled,
|
||||
/// Enabled with a tokenizer. The tokenizer must be available in the fast field tokenizer manager,
|
||||
/// `Index::fast_field_tokenizer`.
|
||||
EnabledWithTokenizer { with_tokenizer: TokenizerName },
|
||||
Enabled { tokenizer: TokenizerName },
|
||||
}
|
||||
|
||||
impl Default for FastFieldTextOptions {
|
||||
fn default() -> Self {
|
||||
FastFieldTextOptions::IsEnabled(false)
|
||||
/// Enum used to control the way we serialize fast field text options.
|
||||
///
|
||||
/// For backward compatibility reasons, we follow the format introduced in tantivy 0.19.
|
||||
/// `false` -> Disabled
|
||||
/// `true` -> Enabled with default tokenizer
|
||||
/// `{ tokenizer: "something" }` -> Enabled with a specific tokenizer.
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[serde(untagged)]
|
||||
enum FastFieldTextOptionsForSerialization {
|
||||
IsEnabled(bool),
|
||||
EnabledWithTokenizer {
|
||||
#[serde(alias = "with_tokenizer")]
|
||||
tokenizer: TokenizerName,
|
||||
},
|
||||
}
|
||||
|
||||
impl From<FastFieldTextOptionsForSerialization> for FastFieldTextOptions {
|
||||
fn from(value: FastFieldTextOptionsForSerialization) -> Self {
|
||||
match value {
|
||||
FastFieldTextOptionsForSerialization::IsEnabled(enabled) => {
|
||||
if enabled {
|
||||
FastFieldTextOptions::Enabled {
|
||||
tokenizer: TokenizerName::from_static(
|
||||
crate::schema::DEFAULT_FAST_FIELD_TOKENIZER,
|
||||
),
|
||||
}
|
||||
} else {
|
||||
FastFieldTextOptions::Disabled
|
||||
}
|
||||
}
|
||||
FastFieldTextOptionsForSerialization::EnabledWithTokenizer { tokenizer } => {
|
||||
FastFieldTextOptions::Enabled { tokenizer }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<FastFieldTextOptions> for FastFieldTextOptionsForSerialization {
|
||||
fn from(value: FastFieldTextOptions) -> Self {
|
||||
match value {
|
||||
FastFieldTextOptions::Disabled => {
|
||||
FastFieldTextOptionsForSerialization::IsEnabled(false)
|
||||
}
|
||||
FastFieldTextOptions::Enabled { tokenizer } => {
|
||||
FastFieldTextOptionsForSerialization::EnabledWithTokenizer { tokenizer }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
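A hedged round-trip sketch of the three wire formats described above, exercised through the public `TextOptions` type (requires `serde_json`; the "lowercase" tokenizer name is illustrative):

use tantivy::schema::TextOptions;

// `false` -> fast field disabled.
let disabled: TextOptions = serde_json::from_str(r#"{ "fast": false }"#).unwrap();
assert!(!disabled.is_fast());

// `true` -> enabled with the default fast field tokenizer.
let with_default: TextOptions = serde_json::from_str(r#"{ "fast": true }"#).unwrap();
assert!(with_default.is_fast());

// `{ "tokenizer": "..." }` -> enabled with a specific tokenizer.
let with_tokenizer: TextOptions =
    serde_json::from_str(r#"{ "fast": { "tokenizer": "lowercase" } }"#).unwrap();
assert_eq!(with_tokenizer.get_fast_field_tokenizer_name(), Some("lowercase"));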
|
||||
|
||||
@@ -45,23 +94,13 @@ impl BitOr<FastFieldTextOptions> for FastFieldTextOptions {
|
||||
|
||||
fn bitor(self, other: FastFieldTextOptions) -> FastFieldTextOptions {
|
||||
match (self, other) {
|
||||
(
|
||||
FastFieldTextOptions::EnabledWithTokenizer {
|
||||
with_tokenizer: tokenizer,
|
||||
},
|
||||
_,
|
||||
)
|
||||
| (
|
||||
_,
|
||||
FastFieldTextOptions::EnabledWithTokenizer {
|
||||
with_tokenizer: tokenizer,
|
||||
},
|
||||
) => FastFieldTextOptions::EnabledWithTokenizer {
|
||||
with_tokenizer: tokenizer,
|
||||
},
|
||||
(FastFieldTextOptions::IsEnabled(true), _)
|
||||
| (_, FastFieldTextOptions::IsEnabled(true)) => FastFieldTextOptions::IsEnabled(true),
|
||||
(_, FastFieldTextOptions::IsEnabled(false)) => FastFieldTextOptions::IsEnabled(false),
|
||||
(FastFieldTextOptions::Enabled { tokenizer }, _)
|
||||
| (_, FastFieldTextOptions::Enabled { tokenizer }) => {
|
||||
FastFieldTextOptions::Enabled { tokenizer }
|
||||
}
|
||||
(FastFieldTextOptions::Disabled, FastFieldTextOptions::Disabled) => {
|
||||
FastFieldTextOptions::Disabled
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -83,20 +122,17 @@ impl TextOptions {
|
||||
|
||||
/// Returns true if and only if the value is a fast field.
|
||||
pub fn is_fast(&self) -> bool {
|
||||
matches!(self.fast, FastFieldTextOptions::IsEnabled(true))
|
||||
|| matches!(
|
||||
&self.fast,
|
||||
FastFieldTextOptions::EnabledWithTokenizer { with_tokenizer: _ }
|
||||
)
|
||||
match &self.fast {
|
||||
FastFieldTextOptions::Disabled => false,
|
||||
FastFieldTextOptions::Enabled { .. } => true,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the name of the fast field tokenizer, if one is configured.
|
||||
pub fn get_fast_field_tokenizer_name(&self) -> Option<&str> {
|
||||
match &self.fast {
|
||||
FastFieldTextOptions::IsEnabled(true) | FastFieldTextOptions::IsEnabled(false) => None,
|
||||
FastFieldTextOptions::EnabledWithTokenizer {
|
||||
with_tokenizer: tokenizer,
|
||||
} => Some(tokenizer.name()),
|
||||
FastFieldTextOptions::Disabled => None,
|
||||
FastFieldTextOptions::Enabled { tokenizer } => Some(tokenizer.name()),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -121,15 +157,9 @@ impl TextOptions {
|
||||
/// [`TermDictionary::ord_to_term()`](crate::termdict::TermDictionary::ord_to_term)
|
||||
/// from the dictionary.
|
||||
#[must_use]
|
||||
pub fn set_fast(mut self, tokenizer_name: Option<&str>) -> TextOptions {
|
||||
if let Some(tokenizer) = tokenizer_name {
|
||||
let tokenizer = TokenizerName::from_name(tokenizer);
|
||||
self.fast = FastFieldTextOptions::EnabledWithTokenizer {
|
||||
with_tokenizer: tokenizer,
|
||||
}
|
||||
} else {
|
||||
self.fast = FastFieldTextOptions::IsEnabled(true);
|
||||
}
|
||||
pub fn set_fast(mut self, tokenizer_name: &str) -> TextOptions {
|
||||
let tokenizer = TokenizerName::from_name(tokenizer_name);
|
||||
self.fast = FastFieldTextOptions::Enabled { tokenizer };
|
||||
self
|
||||
}
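With the new signature the fast-field tokenizer is always named explicitly instead of being wrapped in an `Option`. A hedged schema-building sketch (field and tokenizer names are illustrative; "default" refers to the default entry of the fast field tokenizer manager):

use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};

let mut schema_builder = Schema::builder();
let opts = TextOptions::default()
    .set_indexing_options(
        TextFieldIndexing::default()
            .set_tokenizer("default")
            .set_index_option(IndexRecordOption::WithFreqsAndPositions),
    )
    // Previously `.set_fast(None)`; the tokenizer name is now mandatory.
    .set_fast("default")
    .set_stored();
schema_builder.add_text_field("title", opts);
let _schema = schema_builder.build();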
|
||||
|
||||
@@ -263,7 +293,7 @@ pub const STRING: TextOptions = TextOptions {
|
||||
record: IndexRecordOption::Basic,
|
||||
}),
|
||||
stored: false,
|
||||
fast: FastFieldTextOptions::IsEnabled(false),
|
||||
fast: FastFieldTextOptions::Disabled,
|
||||
coerce: false,
|
||||
};
|
||||
|
||||
@@ -276,7 +306,7 @@ pub const TEXT: TextOptions = TextOptions {
|
||||
}),
|
||||
stored: false,
|
||||
coerce: false,
|
||||
fast: FastFieldTextOptions::IsEnabled(false),
|
||||
fast: FastFieldTextOptions::Disabled,
|
||||
};
|
||||
|
||||
impl<T: Into<TextOptions>> BitOr<T> for TextOptions {
|
||||
@@ -326,7 +356,9 @@ impl From<FastFlag> for TextOptions {
|
||||
TextOptions {
|
||||
indexing: None,
|
||||
stored: false,
|
||||
fast: FastFieldTextOptions::IsEnabled(true),
|
||||
fast: FastFieldTextOptions::Enabled {
|
||||
tokenizer: TokenizerName::from_static(crate::schema::DEFAULT_FAST_FIELD_TOKENIZER),
|
||||
},
|
||||
coerce: false,
|
||||
}
|
||||
}
|
||||
@@ -392,21 +424,21 @@ mod tests {
|
||||
#[test]
|
||||
fn serde_fast_field_tokenizer() {
|
||||
let json = r#" {
|
||||
"fast": { "with_tokenizer": "default" }
|
||||
"fast": { "tokenizer": "default" }
|
||||
} "#;
|
||||
let options: TextOptions = serde_json::from_str(json).unwrap();
|
||||
assert_eq!(
|
||||
options.fast,
|
||||
FastFieldTextOptions::EnabledWithTokenizer {
|
||||
with_tokenizer: TokenizerName::from_static("default")
|
||||
FastFieldTextOptions::Enabled {
|
||||
tokenizer: TokenizerName::from_static("default")
|
||||
}
|
||||
);
|
||||
let options: TextOptions =
|
||||
serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
|
||||
assert_eq!(
|
||||
options.fast,
|
||||
FastFieldTextOptions::EnabledWithTokenizer {
|
||||
with_tokenizer: TokenizerName::from_static("default")
|
||||
FastFieldTextOptions::Enabled {
|
||||
tokenizer: TokenizerName::from_static("default")
|
||||
}
|
||||
);
|
||||
|
||||
@@ -414,18 +446,28 @@ mod tests {
|
||||
"fast": true
|
||||
} "#;
|
||||
let options: TextOptions = serde_json::from_str(json).unwrap();
|
||||
assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(true));
|
||||
assert_eq!(
|
||||
options.fast,
|
||||
FastFieldTextOptions::Enabled {
|
||||
tokenizer: TokenizerName::from_static(DEFAULT_FAST_FIELD_TOKENIZER)
|
||||
}
|
||||
);
|
||||
let options: TextOptions =
|
||||
serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
|
||||
assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(true));
|
||||
assert_eq!(
|
||||
options.fast,
|
||||
FastFieldTextOptions::Enabled {
|
||||
tokenizer: TokenizerName::from_static(DEFAULT_FAST_FIELD_TOKENIZER)
|
||||
}
|
||||
);
|
||||
|
||||
let json = r#" {
|
||||
"fast": false
|
||||
} "#;
|
||||
let options: TextOptions = serde_json::from_str(json).unwrap();
|
||||
assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(false));
|
||||
assert_eq!(options.fast, FastFieldTextOptions::Disabled);
|
||||
let options: TextOptions =
|
||||
serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
|
||||
assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(false));
|
||||
assert_eq!(options.fast, FastFieldTextOptions::Disabled);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -145,7 +145,7 @@ impl Snippet {
|
||||
/// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]`\
|
||||
/// has to be a valid string.
|
||||
fn search_fragments(
|
||||
tokenizer: &TextAnalyzer,
|
||||
tokenizer: &mut TextAnalyzer,
|
||||
text: &str,
|
||||
terms: &BTreeMap<String, Score>,
|
||||
max_num_chars: usize,
|
||||
@@ -370,8 +370,12 @@ impl SnippetGenerator {
|
||||
|
||||
/// Generates a snippet for the given text.
|
||||
pub fn snippet(&self, text: &str) -> Snippet {
|
||||
let fragment_candidates =
|
||||
search_fragments(&self.tokenizer, text, &self.terms_text, self.max_num_chars);
|
||||
let fragment_candidates = search_fragments(
|
||||
&mut self.tokenizer.clone(),
|
||||
text,
|
||||
&self.terms_text,
|
||||
self.max_num_chars,
|
||||
);
|
||||
select_best_fragment_combination(&fragment_candidates[..], text)
|
||||
}
|
||||
}
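Since `snippet` clones the analyzer internally, the public API stays `&self`. A hedged usage sketch (the `highlight` helper is an assumption, not part of tantivy):

use tantivy::query::Query;
use tantivy::schema::Field;
use tantivy::{Searcher, SnippetGenerator};

fn highlight(
    searcher: &Searcher,
    query: &dyn Query,
    field: Field,
    text: &str,
) -> tantivy::Result<String> {
    // `create` collects the query terms and their weights for the field;
    // `snippet` then scores fragments of `text` and renders the best ones.
    let snippet_generator = SnippetGenerator::create(searcher, query, field)?;
    Ok(snippet_generator.snippet(text).to_html())
}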
|
||||
@@ -408,7 +412,12 @@ Survey in 2016, 2017, and 2018."#;
|
||||
String::from("rust") => 1.0,
|
||||
String::from("language") => 0.9
|
||||
};
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 100);
|
||||
let fragments = search_fragments(
|
||||
&mut From::from(SimpleTokenizer::default()),
|
||||
TEST_TEXT,
|
||||
&terms,
|
||||
100,
|
||||
);
|
||||
assert_eq!(fragments.len(), 7);
|
||||
{
|
||||
let first = &fragments[0];
|
||||
@@ -435,7 +444,12 @@ Survey in 2016, 2017, and 2018."#;
|
||||
String::from("rust") =>1.0,
|
||||
String::from("language") => 0.9
|
||||
};
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 20);
|
||||
let fragments = search_fragments(
|
||||
&mut From::from(SimpleTokenizer::default()),
|
||||
TEST_TEXT,
|
||||
&terms,
|
||||
20,
|
||||
);
|
||||
{
|
||||
let first = &fragments[0];
|
||||
assert_eq!(first.score, 1.0);
|
||||
@@ -449,7 +463,12 @@ Survey in 2016, 2017, and 2018."#;
|
||||
String::from("rust") =>0.9,
|
||||
String::from("language") => 1.0
|
||||
};
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 20);
|
||||
let fragments = search_fragments(
|
||||
&mut From::from(SimpleTokenizer::default()),
|
||||
TEST_TEXT,
|
||||
&terms,
|
||||
20,
|
||||
);
|
||||
// assert_eq!(fragments.len(), 7);
|
||||
{
|
||||
let first = &fragments[0];
|
||||
@@ -468,7 +487,8 @@ Survey in 2016, 2017, and 2018."#;
|
||||
let mut terms = BTreeMap::new();
|
||||
terms.insert(String::from("c"), 1.0);
|
||||
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), text, &terms, 3);
|
||||
let fragments =
|
||||
search_fragments(&mut From::from(SimpleTokenizer::default()), text, &terms, 3);
|
||||
|
||||
assert_eq!(fragments.len(), 1);
|
||||
{
|
||||
@@ -490,7 +510,8 @@ Survey in 2016, 2017, and 2018."#;
|
||||
let mut terms = BTreeMap::new();
|
||||
terms.insert(String::from("f"), 1.0);
|
||||
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), text, &terms, 3);
|
||||
let fragments =
|
||||
search_fragments(&mut From::from(SimpleTokenizer::default()), text, &terms, 3);
|
||||
|
||||
assert_eq!(fragments.len(), 2);
|
||||
{
|
||||
@@ -513,7 +534,8 @@ Survey in 2016, 2017, and 2018."#;
|
||||
terms.insert(String::from("f"), 1.0);
|
||||
terms.insert(String::from("a"), 0.9);
|
||||
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), text, &terms, 7);
|
||||
let fragments =
|
||||
search_fragments(&mut From::from(SimpleTokenizer::default()), text, &terms, 7);
|
||||
|
||||
assert_eq!(fragments.len(), 2);
|
||||
{
|
||||
@@ -535,7 +557,8 @@ Survey in 2016, 2017, and 2018."#;
|
||||
let mut terms = BTreeMap::new();
|
||||
terms.insert(String::from("z"), 1.0);
|
||||
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), text, &terms, 3);
|
||||
let fragments =
|
||||
search_fragments(&mut From::from(SimpleTokenizer::default()), text, &terms, 3);
|
||||
|
||||
assert_eq!(fragments.len(), 0);
|
||||
|
||||
@@ -550,7 +573,8 @@ Survey in 2016, 2017, and 2018."#;
|
||||
let text = "a b c d";
|
||||
|
||||
let terms = BTreeMap::new();
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), text, &terms, 3);
|
||||
let fragments =
|
||||
search_fragments(&mut From::from(SimpleTokenizer::default()), text, &terms, 3);
|
||||
assert_eq!(fragments.len(), 0);
|
||||
|
||||
let snippet = select_best_fragment_combination(&fragments[..], text);
|
||||
@@ -669,7 +693,7 @@ Survey in 2016, 2017, and 2018."#;
|
||||
terms.insert(String::from("bc"), 1.0);
|
||||
|
||||
let fragments = search_fragments(
|
||||
&From::from(NgramTokenizer::all_ngrams(2, 2)),
|
||||
&mut From::from(NgramTokenizer::all_ngrams(2, 2).unwrap()),
|
||||
text,
|
||||
&terms,
|
||||
3,
|
||||
@@ -691,7 +715,12 @@ Survey in 2016, 2017, and 2018."#;
|
||||
#[test]
|
||||
fn test_snippet_generator_custom_highlighted_elements() {
|
||||
let terms = btreemap! { String::from("rust") => 1.0, String::from("language") => 0.9 };
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 100);
|
||||
let fragments = search_fragments(
|
||||
&mut From::from(SimpleTokenizer::default()),
|
||||
TEST_TEXT,
|
||||
&terms,
|
||||
100,
|
||||
);
|
||||
let mut snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT);
|
||||
assert_eq!(
|
||||
snippet.to_html(),
|
||||
|
||||
@@ -1,19 +0,0 @@
use std::io;

#[inline]
pub fn compress(mut uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
let params = brotli::enc::BrotliEncoderParams {
quality: 5,
..Default::default()
};
compressed.clear();
brotli::BrotliCompress(&mut uncompressed, compressed, &params)?;
Ok(())
}

#[inline]
pub fn decompress(mut compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
decompressed.clear();
brotli::BrotliDecompress(&mut compressed, decompressed)?;
Ok(())
}
@@ -1,17 +0,0 @@
use std::io::{self, Read, Write};

#[inline]
pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
compressed.clear();
let mut encoder = snap::write::FrameEncoder::new(compressed);
encoder.write_all(uncompressed)?;
encoder.flush()?;
Ok(())
}

#[inline]
pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
decompressed.clear();
snap::read::FrameDecoder::new(compressed).read_to_end(decompressed)?;
Ok(())
}
@@ -17,12 +17,10 @@ pub enum Compressor {
/// No compression
None,
/// Use the lz4 compressor (block format)
#[cfg(feature = "lz4-compression")]
Lz4,
/// Use the brotli compressor
Brotli,
/// Use the snap compressor
Snappy,
/// Use the zstd compressor
#[cfg(feature = "zstd-compression")]
Zstd(ZstdCompressor),
}

@@ -31,9 +29,9 @@ impl Serialize for Compressor {
where S: serde::Serializer {
match *self {
Compressor::None => serializer.serialize_str("none"),
#[cfg(feature = "lz4-compression")]
Compressor::Lz4 => serializer.serialize_str("lz4"),
Compressor::Brotli => serializer.serialize_str("brotli"),
Compressor::Snappy => serializer.serialize_str("snappy"),
#[cfg(feature = "zstd-compression")]
Compressor::Zstd(zstd) => serializer.serialize_str(&zstd.ser_to_string()),
}
}
@@ -45,27 +43,38 @@ impl<'de> Deserialize<'de> for Compressor {
let buf = String::deserialize(deserializer)?;
let compressor = match buf.as_str() {
"none" => Compressor::None,
#[cfg(feature = "lz4-compression")]
"lz4" => Compressor::Lz4,
"brotli" => Compressor::Brotli,
"snappy" => Compressor::Snappy,
#[cfg(not(feature = "lz4-compression"))]
"lz4" => {
return Err(serde::de::Error::custom(
"unsupported variant `lz4`, please enable Tantivy's `lz4-compression` feature",
))
}
#[cfg(feature = "zstd-compression")]
_ if buf.starts_with("zstd") => Compressor::Zstd(
ZstdCompressor::deser_from_str(&buf).map_err(serde::de::Error::custom)?,
),
#[cfg(not(feature = "zstd-compression"))]
_ if buf.starts_with("zstd") => {
return Err(serde::de::Error::custom(
"unsupported variant `zstd`, please enable Tantivy's `zstd-compression` \
feature",
))
}
_ => {
if buf.starts_with("zstd") {
Compressor::Zstd(
ZstdCompressor::deser_from_str(&buf).map_err(serde::de::Error::custom)?,
)
} else {
return Err(serde::de::Error::unknown_variant(
&buf,
&[
"none",
"lz4",
"brotli",
"snappy",
"zstd",
"zstd(compression_level=5)",
],
));
}
return Err(serde::de::Error::unknown_variant(
&buf,
&[
"none",
#[cfg(feature = "lz4-compression")]
"lz4",
#[cfg(feature = "zstd-compression")]
"zstd",
#[cfg(feature = "zstd-compression")]
"zstd(compression_level=5)",
],
));
}
};

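Aside (not part of the diff): after this change the accepted configuration strings are "none", "lz4", "zstd" and "zstd(compression_level=N)", with the lz4/zstd arms gated behind their feature flags. A hedged round-trip sketch, assuming both compression features are enabled, `serde_json` is available, and `tantivy::store::Compressor` is the re-export path:

    use tantivy::store::Compressor;

    fn compressor_config_round_trip() {
        // Deserializing a zstd setting with an explicit compression level.
        let zstd: Compressor =
            serde_json::from_str("\"zstd(compression_level=5)\"").unwrap();
        // Serializing the lz4 variant back to its string form.
        assert_eq!(serde_json::to_string(&Compressor::Lz4).unwrap(), "\"lz4\"");
        // "brotli" and "snappy" are no longer accepted and fail to deserialize.
        assert!(serde_json::from_str::<Compressor>("\"snappy\"").is_err());
        let _ = zstd;
    }
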
@@ -127,18 +136,15 @@ impl ZstdCompressor {
}

impl Default for Compressor {
#[allow(unreachable_code)]
fn default() -> Self {
if cfg!(feature = "lz4-compression") {
Compressor::Lz4
} else if cfg!(feature = "brotli-compression") {
Compressor::Brotli
} else if cfg!(feature = "snappy-compression") {
Compressor::Snappy
} else if cfg!(feature = "zstd-compression") {
Compressor::Zstd(ZstdCompressor::default())
} else {
Compressor::None
}
#[cfg(feature = "lz4-compression")]
return Compressor::Lz4;

#[cfg(feature = "zstd-compression")]
return Compressor::Zstd(ZstdCompressor::default());

Compressor::None
}
}

@@ -155,50 +161,14 @@ impl Compressor {
|
||||
compressed.extend_from_slice(uncompressed);
|
||||
Ok(())
|
||||
}
|
||||
Self::Lz4 => {
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
{
|
||||
super::compression_lz4_block::compress(uncompressed, compressed)
|
||||
}
|
||||
#[cfg(not(feature = "lz4-compression"))]
|
||||
{
|
||||
panic!("lz4-compression feature flag not activated");
|
||||
}
|
||||
}
|
||||
Self::Brotli => {
|
||||
#[cfg(feature = "brotli-compression")]
|
||||
{
|
||||
super::compression_brotli::compress(uncompressed, compressed)
|
||||
}
|
||||
#[cfg(not(feature = "brotli-compression"))]
|
||||
{
|
||||
panic!("brotli-compression-compression feature flag not activated");
|
||||
}
|
||||
}
|
||||
Self::Snappy => {
|
||||
#[cfg(feature = "snappy-compression")]
|
||||
{
|
||||
super::compression_snap::compress(uncompressed, compressed)
|
||||
}
|
||||
#[cfg(not(feature = "snappy-compression"))]
|
||||
{
|
||||
panic!("snappy-compression feature flag not activated");
|
||||
}
|
||||
}
|
||||
Self::Zstd(_zstd_compressor) => {
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
{
|
||||
super::compression_zstd_block::compress(
|
||||
uncompressed,
|
||||
compressed,
|
||||
_zstd_compressor.compression_level,
|
||||
)
|
||||
}
|
||||
#[cfg(not(feature = "zstd-compression"))]
|
||||
{
|
||||
panic!("zstd-compression feature flag not activated");
|
||||
}
|
||||
}
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
Self::Lz4 => super::compression_lz4_block::compress(uncompressed, compressed),
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
Self::Zstd(_zstd_compressor) => super::compression_zstd_block::compress(
|
||||
uncompressed,
|
||||
compressed,
|
||||
_zstd_compressor.compression_level,
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,12 +16,10 @@ pub enum Decompressor {
|
||||
/// No compression
|
||||
None,
|
||||
/// Use the lz4 decompressor (block format)
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
Lz4,
|
||||
/// Use the brotli decompressor
|
||||
Brotli,
|
||||
/// Use the snap decompressor
|
||||
Snappy,
|
||||
/// Use the zstd decompressor
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
Zstd,
|
||||
}
|
||||
|
||||
@@ -29,9 +27,9 @@ impl From<Compressor> for Decompressor {
|
||||
fn from(compressor: Compressor) -> Self {
|
||||
match compressor {
|
||||
Compressor::None => Decompressor::None,
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
Compressor::Lz4 => Decompressor::Lz4,
|
||||
Compressor::Brotli => Decompressor::Brotli,
|
||||
Compressor::Snappy => Decompressor::Snappy,
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
Compressor::Zstd(_) => Decompressor::Zstd,
|
||||
}
|
||||
}
|
||||
@@ -41,9 +39,9 @@ impl Decompressor {
|
||||
pub(crate) fn from_id(id: u8) -> Decompressor {
|
||||
match id {
|
||||
0 => Decompressor::None,
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
1 => Decompressor::Lz4,
|
||||
2 => Decompressor::Brotli,
|
||||
3 => Decompressor::Snappy,
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
4 => Decompressor::Zstd,
|
||||
_ => panic!("unknown compressor id {id:?}"),
|
||||
}
|
||||
@@ -52,9 +50,9 @@ impl Decompressor {
|
||||
pub(crate) fn get_id(&self) -> u8 {
|
||||
match self {
|
||||
Self::None => 0,
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
Self::Lz4 => 1,
|
||||
Self::Brotli => 2,
|
||||
Self::Snappy => 3,
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
Self::Zstd => 4,
|
||||
}
|
||||
}
|
||||
@@ -77,46 +75,10 @@ impl Decompressor {
|
||||
decompressed.extend_from_slice(compressed);
|
||||
Ok(())
|
||||
}
|
||||
Self::Lz4 => {
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
{
|
||||
super::compression_lz4_block::decompress(compressed, decompressed)
|
||||
}
|
||||
#[cfg(not(feature = "lz4-compression"))]
|
||||
{
|
||||
panic!("lz4-compression feature flag not activated");
|
||||
}
|
||||
}
|
||||
Self::Brotli => {
|
||||
#[cfg(feature = "brotli-compression")]
|
||||
{
|
||||
super::compression_brotli::decompress(compressed, decompressed)
|
||||
}
|
||||
#[cfg(not(feature = "brotli-compression"))]
|
||||
{
|
||||
panic!("brotli-compression feature flag not activated");
|
||||
}
|
||||
}
|
||||
Self::Snappy => {
|
||||
#[cfg(feature = "snappy-compression")]
|
||||
{
|
||||
super::compression_snap::decompress(compressed, decompressed)
|
||||
}
|
||||
#[cfg(not(feature = "snappy-compression"))]
|
||||
{
|
||||
panic!("snappy-compression feature flag not activated");
|
||||
}
|
||||
}
|
||||
Self::Zstd => {
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
{
|
||||
super::compression_zstd_block::decompress(compressed, decompressed)
|
||||
}
|
||||
#[cfg(not(feature = "zstd-compression"))]
|
||||
{
|
||||
panic!("zstd-compression feature flag not activated");
|
||||
}
|
||||
}
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
Self::Lz4 => super::compression_lz4_block::decompress(compressed, decompressed),
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
Self::Zstd => super::compression_zstd_block::decompress(compressed, decompressed),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -129,9 +91,9 @@ mod tests {
|
||||
#[test]
|
||||
fn compressor_decompressor_id_test() {
|
||||
assert_eq!(Decompressor::from(Compressor::None), Decompressor::None);
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
assert_eq!(Decompressor::from(Compressor::Lz4), Decompressor::Lz4);
|
||||
assert_eq!(Decompressor::from(Compressor::Brotli), Decompressor::Brotli);
|
||||
assert_eq!(Decompressor::from(Compressor::Snappy), Decompressor::Snappy);
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
assert_eq!(
|
||||
Decompressor::from(Compressor::Zstd(Default::default())),
|
||||
Decompressor::Zstd
|
||||
|
||||
@@ -4,8 +4,8 @@
//! order to be handled in the `Store`.
//!
//! Internally, documents (or rather their stored fields) are serialized to a buffer.
//! When the buffer exceeds `block_size` (defaults to 16K), the buffer is compressed using `brotli`,
//! `LZ4` or `snappy` and the resulting block is written to disk.
//! When the buffer exceeds `block_size` (defaults to 16K), the buffer is compressed
//! using LZ4 or Zstd and the resulting block is written to disk.
//!
//! One can then request for a specific `DocId`.
//! A skip list helps navigating to the right block,
@@ -48,12 +48,6 @@ pub(crate) const DOC_STORE_VERSION: u32 = 1;
#[cfg(feature = "lz4-compression")]
mod compression_lz4_block;

#[cfg(feature = "brotli-compression")]
mod compression_brotli;

#[cfg(feature = "snappy-compression")]
mod compression_snap;

#[cfg(feature = "zstd-compression")]
mod compression_zstd_block;

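Aside (not part of the diff): with brotli and snappy gone, the doc store is configured with one of the remaining compressors. A hedged sketch of selecting Zstd through `IndexSettings`; the `docstore_compression` field name comes from the merge test further below, while the exact re-export paths (`tantivy::store::{Compressor, ZstdCompressor}`, `tantivy::IndexSettings`) are assumptions:

    use tantivy::store::{Compressor, ZstdCompressor};
    use tantivy::IndexSettings;

    // Leave every other setting at its default and only switch the doc store
    // compressor; the merge test below does the same thing via `settings_mut`.
    let settings = IndexSettings {
        docstore_compression: Compressor::Zstd(ZstdCompressor::default()),
        ..IndexSettings::default()
    };
    let _ = settings;
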
@@ -200,16 +194,6 @@ pub mod tests {
|
||||
fn test_store_lz4_block() -> crate::Result<()> {
|
||||
test_store(Compressor::Lz4, BLOCK_SIZE, true)
|
||||
}
|
||||
#[cfg(feature = "snappy-compression")]
|
||||
#[test]
|
||||
fn test_store_snap() -> crate::Result<()> {
|
||||
test_store(Compressor::Snappy, BLOCK_SIZE, true)
|
||||
}
|
||||
#[cfg(feature = "brotli-compression")]
|
||||
#[test]
|
||||
fn test_store_brotli() -> crate::Result<()> {
|
||||
test_store(Compressor::Brotli, BLOCK_SIZE, true)
|
||||
}
|
||||
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
#[test]
|
||||
@@ -261,8 +245,8 @@ pub mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "snappy-compression")]
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
#[test]
|
||||
fn test_merge_with_changed_compressor() -> crate::Result<()> {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
@@ -294,7 +278,7 @@ pub mod tests {
|
||||
);
|
||||
// Change compressor, this disables stacking on merging
|
||||
let index_settings = index.settings_mut();
|
||||
index_settings.docstore_compression = Compressor::Snappy;
|
||||
index_settings.docstore_compression = Compressor::Zstd(Default::default());
|
||||
// Merging the segments
|
||||
{
|
||||
let segment_ids = index
|
||||
@@ -316,7 +300,7 @@ pub mod tests {
|
||||
LOREM.to_string()
|
||||
);
|
||||
}
|
||||
assert_eq!(store.decompressor(), Decompressor::Snappy);
|
||||
assert_eq!(store.decompressor(), Decompressor::Zstd);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -426,7 +426,7 @@ mod tests {
|
||||
assert_eq!(store.cache_stats().cache_hits, 1);
|
||||
assert_eq!(store.cache_stats().cache_misses, 2);
|
||||
|
||||
assert_eq!(store.cache.peek_lru(), Some(11163));
|
||||
assert_eq!(store.cache.peek_lru(), Some(11207));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
//! ```rust
|
||||
//! use tantivy::tokenizer::*;
|
||||
//!
|
||||
//! let tokenizer = TextAnalyzer::builder(RawTokenizer)
|
||||
//! let mut tokenizer = TextAnalyzer::builder(RawTokenizer::default())
|
||||
//! .filter(AlphaNumOnlyFilter)
|
||||
//! .build();
|
||||
//!
|
||||
@@ -11,7 +11,7 @@
|
||||
//! // contains a space
|
||||
//! assert!(stream.next().is_none());
|
||||
//!
|
||||
//! let tokenizer = TextAnalyzer::builder(SimpleTokenizer)
|
||||
//! let mut tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
//! .filter(AlphaNumOnlyFilter)
|
||||
//! .build();
|
||||
//!
|
||||
@@ -52,7 +52,7 @@ pub struct AlphaNumOnlyFilterWrapper<T>(T);
|
||||
impl<T: Tokenizer> Tokenizer for AlphaNumOnlyFilterWrapper<T> {
|
||||
type TokenStream<'a> = AlphaNumOnlyFilterStream<T::TokenStream<'a>>;
|
||||
|
||||
fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
AlphaNumOnlyFilterStream {
|
||||
tail: self.0.token_stream(text),
|
||||
}
|
||||
@@ -96,7 +96,7 @@ mod tests {
|
||||
}
|
||||
|
||||
fn token_stream_helper(text: &str) -> Vec<Token> {
|
||||
let a = TextAnalyzer::builder(SimpleTokenizer)
|
||||
let mut a = TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
.filter(AlphaNumOnlyFilter)
|
||||
.build();
|
||||
let mut token_stream = a.token_stream(text);
|
||||
|
||||
@@ -12,38 +12,45 @@ impl TokenFilter for AsciiFoldingFilter {
type Tokenizer<T: Tokenizer> = AsciiFoldingFilterWrapper<T>;

fn transform<T: Tokenizer>(self, tokenizer: T) -> AsciiFoldingFilterWrapper<T> {
AsciiFoldingFilterWrapper(tokenizer)
}
}

#[derive(Clone)]
pub struct AsciiFoldingFilterWrapper<T>(T);

impl<T: Tokenizer> Tokenizer for AsciiFoldingFilterWrapper<T> {
type TokenStream<'a> = AsciiFoldingFilterTokenStream<T::TokenStream<'a>>;

fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
AsciiFoldingFilterTokenStream {
buffer: String::with_capacity(100),
tail: self.0.token_stream(text),
AsciiFoldingFilterWrapper {
tokenizer,
buffer: String::new(),
}
}
}

pub struct AsciiFoldingFilterTokenStream<T> {
#[derive(Clone)]
pub struct AsciiFoldingFilterWrapper<T> {
tokenizer: T,
buffer: String,
}

impl<T: Tokenizer> Tokenizer for AsciiFoldingFilterWrapper<T> {
type TokenStream<'a> = AsciiFoldingFilterTokenStream<'a, T::TokenStream<'a>>;

fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
self.buffer.clear();
AsciiFoldingFilterTokenStream {
buffer: &mut self.buffer,
tail: self.tokenizer.token_stream(text),
}
}
}

pub struct AsciiFoldingFilterTokenStream<'a, T> {
buffer: &'a mut String,
tail: T,
}

impl<T: TokenStream> TokenStream for AsciiFoldingFilterTokenStream<T> {
impl<'a, T: TokenStream> TokenStream for AsciiFoldingFilterTokenStream<'a, T> {
fn advance(&mut self) -> bool {
if !self.tail.advance() {
return false;
}
if !self.token_mut().text.is_ascii() {
// ignore its already ascii
to_ascii(&self.tail.token().text, &mut self.buffer);
mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
to_ascii(&self.tail.token().text, self.buffer);
mem::swap(&mut self.tail.token_mut().text, self.buffer);
}
true
}
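
Aside (not part of the diff): the practical effect of the new `token_stream(&'a mut self, ...)` signature is that analyzers must be bound mutably, and the folding buffer now lives on the wrapper and is reused across calls instead of being reallocated per token stream. A small usage sketch against the public API:

    use tantivy::tokenizer::{AsciiFoldingFilter, SimpleTokenizer, TextAnalyzer, TokenStream};

    let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(AsciiFoldingFilter)
        .build();
    // The same analyzer (and its internal buffer) is reused for every call.
    let mut stream = analyzer.token_stream("Ramón");
    while stream.advance() {
        assert_eq!(stream.token().text, "Ramon");
    }
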
@@ -1573,7 +1580,7 @@ mod tests {
|
||||
|
||||
fn folding_helper(text: &str) -> Vec<String> {
|
||||
let mut tokens = Vec::new();
|
||||
TextAnalyzer::builder(SimpleTokenizer)
|
||||
TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
.filter(AsciiFoldingFilter)
|
||||
.build()
|
||||
.token_stream(text)
|
||||
@@ -1584,10 +1591,10 @@ mod tests {
|
||||
}
|
||||
|
||||
fn folding_using_raw_tokenizer_helper(text: &str) -> String {
|
||||
let mut token_stream = TextAnalyzer::builder(RawTokenizer)
|
||||
let mut tokenizer = TextAnalyzer::builder(RawTokenizer::default())
|
||||
.filter(AsciiFoldingFilter)
|
||||
.build()
|
||||
.token_stream(text);
|
||||
.build();
|
||||
let mut token_stream = tokenizer.token_stream(text);
|
||||
token_stream.advance();
|
||||
token_stream.token().text.clone()
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@ pub(crate) struct EmptyTokenizer;
|
||||
|
||||
impl Tokenizer for EmptyTokenizer {
|
||||
type TokenStream<'a> = EmptyTokenStream;
|
||||
fn token_stream(&self, _text: &str) -> EmptyTokenStream {
|
||||
fn token_stream(&mut self, _text: &str) -> EmptyTokenStream {
|
||||
EmptyTokenStream::default()
|
||||
}
|
||||
}
|
||||
@@ -35,7 +35,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_empty_tokenizer() {
|
||||
let tokenizer = super::EmptyTokenizer;
|
||||
let mut tokenizer = super::EmptyTokenizer;
|
||||
let mut empty = tokenizer.token_stream("whatever string");
|
||||
assert!(!empty.advance());
|
||||
}
|
||||
|
||||
@@ -9,8 +9,10 @@ use crate::schema::FACET_SEP_BYTE;
|
||||
/// - `/america/north_america/canada`
|
||||
/// - `/america/north_america`
|
||||
/// - `/america`
|
||||
#[derive(Clone)]
|
||||
pub struct FacetTokenizer;
|
||||
#[derive(Clone, Default)]
|
||||
pub struct FacetTokenizer {
|
||||
token: Token,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
enum State {
|
||||
@@ -22,20 +24,18 @@ enum State {
|
||||
pub struct FacetTokenStream<'a> {
|
||||
text: &'a str,
|
||||
state: State,
|
||||
token: Token,
|
||||
token: &'a mut Token,
|
||||
}
|
||||
|
||||
impl Tokenizer for FacetTokenizer {
|
||||
type TokenStream<'a> = FacetTokenStream<'a>;
|
||||
fn token_stream<'a>(&self, text: &'a str) -> FacetTokenStream<'a> {
|
||||
let token = Token {
|
||||
position: 0,
|
||||
..Default::default()
|
||||
};
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> FacetTokenStream<'a> {
|
||||
self.token.reset();
|
||||
self.token.position = 0;
|
||||
FacetTokenStream {
|
||||
text,
|
||||
state: State::RootFacetNotEmitted, //< pos is the first char that has not been processed yet.
|
||||
token,
|
||||
token: &mut self.token,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -74,11 +74,11 @@ impl<'a> TokenStream for FacetTokenStream<'a> {
|
||||
}
|
||||
|
||||
fn token(&self) -> &Token {
|
||||
&self.token
|
||||
self.token
|
||||
}
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
&mut self.token
|
||||
self.token
|
||||
}
|
||||
}
|
||||
|
||||
@@ -98,7 +98,7 @@ mod tests {
|
||||
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap();
|
||||
tokens.push(format!("{}", facet));
|
||||
};
|
||||
FacetTokenizer
|
||||
FacetTokenizer::default()
|
||||
.token_stream(facet.encoded_str())
|
||||
.process(&mut add_token);
|
||||
}
|
||||
@@ -118,7 +118,7 @@ mod tests {
|
||||
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap(); // ok test
|
||||
tokens.push(format!("{}", facet));
|
||||
};
|
||||
FacetTokenizer
|
||||
FacetTokenizer::default()
|
||||
.token_stream(facet.encoded_str()) // ok test
|
||||
.process(&mut add_token);
|
||||
}
|
||||
|
||||
@@ -10,26 +10,33 @@ impl TokenFilter for LowerCaser {
|
||||
type Tokenizer<T: Tokenizer> = LowerCaserFilter<T>;
|
||||
|
||||
fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T> {
|
||||
LowerCaserFilter(tokenizer)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct LowerCaserFilter<T>(T);
|
||||
|
||||
impl<T: Tokenizer> Tokenizer for LowerCaserFilter<T> {
|
||||
type TokenStream<'a> = LowerCaserTokenStream<T::TokenStream<'a>>;
|
||||
|
||||
fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
LowerCaserTokenStream {
|
||||
tail: self.0.token_stream(text),
|
||||
LowerCaserFilter {
|
||||
tokenizer,
|
||||
buffer: String::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LowerCaserTokenStream<T> {
|
||||
#[derive(Clone)]
|
||||
pub struct LowerCaserFilter<T> {
|
||||
tokenizer: T,
|
||||
buffer: String,
|
||||
}
|
||||
|
||||
impl<T: Tokenizer> Tokenizer for LowerCaserFilter<T> {
|
||||
type TokenStream<'a> = LowerCaserTokenStream<'a, T::TokenStream<'a>>;
|
||||
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
self.buffer.clear();
|
||||
LowerCaserTokenStream {
|
||||
tail: self.tokenizer.token_stream(text),
|
||||
buffer: &mut self.buffer,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LowerCaserTokenStream<'a, T> {
|
||||
buffer: &'a mut String,
|
||||
tail: T,
|
||||
}
|
||||
|
||||
@@ -44,7 +51,7 @@ fn to_lowercase_unicode(text: &str, output: &mut String) {
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: TokenStream> TokenStream for LowerCaserTokenStream<T> {
|
||||
impl<'a, T: TokenStream> TokenStream for LowerCaserTokenStream<'a, T> {
|
||||
fn advance(&mut self) -> bool {
|
||||
if !self.tail.advance() {
|
||||
return false;
|
||||
@@ -53,8 +60,8 @@ impl<T: TokenStream> TokenStream for LowerCaserTokenStream<T> {
|
||||
// fast track for ascii.
|
||||
self.token_mut().text.make_ascii_lowercase();
|
||||
} else {
|
||||
to_lowercase_unicode(&self.tail.token().text, &mut self.buffer);
|
||||
mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
|
||||
to_lowercase_unicode(&self.tail.token().text, self.buffer);
|
||||
mem::swap(&mut self.tail.token_mut().text, self.buffer);
|
||||
}
|
||||
true
|
||||
}
|
||||
@@ -86,10 +93,11 @@ mod tests {
|
||||
}
|
||||
|
||||
fn token_stream_helper(text: &str) -> Vec<Token> {
|
||||
let mut token_stream = TextAnalyzer::builder(SimpleTokenizer)
|
||||
let mut token_stream = TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
.filter(LowerCaser)
|
||||
.build()
|
||||
.token_stream(text);
|
||||
.build();
|
||||
|
||||
let mut token_stream = token_stream.token_stream(text);
|
||||
let mut tokens = vec![];
|
||||
let mut add_token = |token: &Token| {
|
||||
tokens.push(token.clone());
|
||||
|
||||
@@ -66,7 +66,7 @@
|
||||
//! ```rust
|
||||
//! use tantivy::tokenizer::*;
|
||||
//!
|
||||
//! let en_stem = TextAnalyzer::builder(SimpleTokenizer)
|
||||
//! let en_stem = TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
//! .filter(RemoveLongFilter::limit(40))
|
||||
//! .filter(LowerCaser)
|
||||
//! .filter(Stemmer::new(Language::English))
|
||||
@@ -81,7 +81,7 @@
|
||||
//! # use tantivy::tokenizer::*;
|
||||
//! # use tantivy::Index;
|
||||
//! #
|
||||
//! let custom_en_tokenizer = SimpleTokenizer;
|
||||
//! let custom_en_tokenizer = SimpleTokenizer::default();
|
||||
//! # let schema = Schema::builder().build();
|
||||
//! let index = Index::create_in_ram(schema);
|
||||
//! index.tokenizers()
|
||||
@@ -113,7 +113,7 @@
|
||||
//! let index = Index::create_in_ram(schema);
|
||||
//!
|
||||
//! // We need to register our tokenizer :
|
||||
//! let custom_en_tokenizer = TextAnalyzer::builder(SimpleTokenizer)
|
||||
//! let custom_en_tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
//! .filter(RemoveLongFilter::limit(40))
|
||||
//! .filter(LowerCaser)
|
||||
//! .build();
|
||||
@@ -154,7 +154,7 @@ pub use self::split_compound_words::SplitCompoundWords;
|
||||
pub use self::stemmer::{Language, Stemmer};
|
||||
pub use self::stop_word_filter::StopWordFilter;
|
||||
pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
|
||||
pub use self::tokenizer::TextAnalyzer;
|
||||
pub use self::tokenizer::{TextAnalyzer, TextAnalyzerBuilder};
|
||||
pub use self::tokenizer_manager::TokenizerManager;
|
||||
pub use self::whitespace_tokenizer::WhitespaceTokenizer;
|
||||
|
||||
@@ -188,9 +188,9 @@ pub mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_raw_tokenizer() {
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
let en_tokenizer = tokenizer_manager.get("raw").unwrap();
|
||||
fn test_raw_tokenizer2() {
|
||||
let tokenizer_manager = TokenizerManager::default_for_indexing();
|
||||
let mut en_tokenizer = tokenizer_manager.get("raw").unwrap();
|
||||
let mut tokens: Vec<Token> = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| {
|
||||
@@ -206,9 +206,9 @@ pub mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_en_tokenizer() {
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
let tokenizer_manager = TokenizerManager::default_for_indexing();
|
||||
assert!(tokenizer_manager.get("en_doesnotexist").is_none());
|
||||
let en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
|
||||
let mut en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
|
||||
let mut tokens: Vec<Token> = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| {
|
||||
@@ -228,16 +228,16 @@ pub mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_non_en_tokenizer() {
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
let tokenizer_manager = TokenizerManager::default_for_indexing();
|
||||
tokenizer_manager.register(
|
||||
"el_stem",
|
||||
TextAnalyzer::builder(SimpleTokenizer)
|
||||
TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
.filter(RemoveLongFilter::limit(40))
|
||||
.filter(LowerCaser)
|
||||
.filter(Stemmer::new(Language::Greek))
|
||||
.build(),
|
||||
);
|
||||
let en_tokenizer = tokenizer_manager.get("el_stem").unwrap();
|
||||
let mut en_tokenizer = tokenizer_manager.get("el_stem").unwrap();
|
||||
let mut tokens: Vec<Token> = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| {
|
||||
@@ -256,8 +256,8 @@ pub mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_tokenizer_empty() {
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
let en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
|
||||
let tokenizer_manager = TokenizerManager::default_for_indexing();
|
||||
let mut en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
|
||||
{
|
||||
let mut tokens: Vec<Token> = vec![];
|
||||
{
|
||||
@@ -282,8 +282,8 @@ pub mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_whitespace_tokenizer() {
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
let ws_tokenizer = tokenizer_manager.get("whitespace").unwrap();
|
||||
let tokenizer_manager = TokenizerManager::default_for_indexing();
|
||||
let mut ws_tokenizer = tokenizer_manager.get("whitespace").unwrap();
|
||||
let mut tokens: Vec<Token> = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| {
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use super::{Token, TokenStream, Tokenizer};
|
||||
use crate::TantivyError;
|
||||
|
||||
/// Tokenize the text by splitting words into n-grams of the given size(s)
|
||||
///
|
||||
@@ -33,7 +34,7 @@ use super::{Token, TokenStream, Tokenizer};
|
||||
/// ```rust
|
||||
/// use tantivy::tokenizer::*;
|
||||
///
|
||||
/// let tokenizer = NgramTokenizer::new(2, 3, false);
|
||||
/// let mut tokenizer = NgramTokenizer::new(2, 3, false).unwrap();
|
||||
/// let mut stream = tokenizer.token_stream("hello");
|
||||
/// {
|
||||
/// let token = stream.next().unwrap();
|
||||
@@ -79,7 +80,7 @@ use super::{Token, TokenStream, Tokenizer};
|
||||
/// }
|
||||
/// assert!(stream.next().is_none());
|
||||
/// ```
|
||||
#[derive(Clone)]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct NgramTokenizer {
|
||||
/// min size of the n-gram
|
||||
min_gram: usize,
|
||||
@@ -87,33 +88,44 @@ pub struct NgramTokenizer {
max_gram: usize,
/// if true, will only parse the leading edge of the input
prefix_only: bool,
token: Token,
}

impl NgramTokenizer {
/// Configures a new Ngram tokenizer
pub fn new(min_gram: usize, max_gram: usize, prefix_only: bool) -> NgramTokenizer {
assert!(min_gram > 0, "min_gram must be greater than 0");
assert!(
min_gram <= max_gram,
"min_gram must not be greater than max_gram"
);
NgramTokenizer {
pub fn new(
min_gram: usize,
max_gram: usize,
prefix_only: bool,
) -> crate::Result<NgramTokenizer> {
if min_gram == 0 {
return Err(TantivyError::InvalidArgument(
"min_gram must be greater than 0".to_string(),
));
}
if min_gram > max_gram {
return Err(TantivyError::InvalidArgument(
"min_gram must not be greater than max_gram".to_string(),
));
}
Ok(NgramTokenizer {
min_gram,
max_gram,
prefix_only,
}
token: Token::default(),
})
}

/// Create a `NGramTokenizer` which generates tokens for all inner ngrams.
///
/// This is as opposed to only prefix ngrams .
pub fn all_ngrams(min_gram: usize, max_gram: usize) -> NgramTokenizer {
pub fn all_ngrams(min_gram: usize, max_gram: usize) -> crate::Result<NgramTokenizer> {
Self::new(min_gram, max_gram, false)
}

/// Create a `NGramTokenizer` which only generates tokens for the
/// prefix ngrams.
pub fn prefix_only(min_gram: usize, max_gram: usize) -> NgramTokenizer {
pub fn prefix_only(min_gram: usize, max_gram: usize) -> crate::Result<NgramTokenizer> {
Self::new(min_gram, max_gram, true)
}
}
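
Aside (not part of the diff): constructing an `NgramTokenizer` is now fallible, returning `TantivyError::InvalidArgument` instead of panicking on bad bounds. A short sketch of the updated call sites:

    use tantivy::tokenizer::{NgramTokenizer, TokenStream, Tokenizer};

    let mut tokenizer = NgramTokenizer::new(2, 3, false).expect("valid ngram bounds");
    let mut stream = tokenizer.token_stream("hello");
    assert_eq!(stream.next().unwrap().text, "he");
    // Invalid bounds now surface as errors rather than panics.
    assert!(NgramTokenizer::new(0, 2, false).is_err());
    assert!(NgramTokenizer::all_ngrams(3, 2).is_err());
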
@@ -127,12 +139,13 @@ pub struct NgramTokenStream<'a> {
|
||||
/// input
|
||||
text: &'a str,
|
||||
/// output
|
||||
token: Token,
|
||||
token: &'a mut Token,
|
||||
}
|
||||
|
||||
impl Tokenizer for NgramTokenizer {
|
||||
type TokenStream<'a> = NgramTokenStream<'a>;
|
||||
fn token_stream<'a>(&self, text: &'a str) -> NgramTokenStream<'a> {
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> NgramTokenStream<'a> {
|
||||
self.token.reset();
|
||||
NgramTokenStream {
|
||||
ngram_charidx_iterator: StutteringIterator::new(
|
||||
CodepointFrontiers::for_str(text),
|
||||
@@ -141,7 +154,7 @@ impl Tokenizer for NgramTokenizer {
|
||||
),
|
||||
prefix_only: self.prefix_only,
|
||||
text,
|
||||
token: Token::default(),
|
||||
token: &mut self.token,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -164,10 +177,10 @@ impl<'a> TokenStream for NgramTokenStream<'a> {
|
||||
}
|
||||
|
||||
fn token(&self) -> &Token {
|
||||
&self.token
|
||||
self.token
|
||||
}
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
&mut self.token
|
||||
self.token
|
||||
}
|
||||
}
|
||||
|
||||
@@ -346,7 +359,11 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_ngram_tokenizer_1_2_false() {
|
||||
let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hello"));
|
||||
let tokens = test_helper(
|
||||
NgramTokenizer::all_ngrams(1, 2)
|
||||
.unwrap()
|
||||
.token_stream("hello"),
|
||||
);
|
||||
assert_eq!(tokens.len(), 9);
|
||||
assert_token(&tokens[0], 0, "h", 0, 1);
|
||||
assert_token(&tokens[1], 0, "he", 0, 2);
|
||||
@@ -361,7 +378,11 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_ngram_tokenizer_min_max_equal() {
|
||||
let tokens = test_helper(NgramTokenizer::all_ngrams(3, 3).token_stream("hello"));
|
||||
let tokens = test_helper(
|
||||
NgramTokenizer::all_ngrams(3, 3)
|
||||
.unwrap()
|
||||
.token_stream("hello"),
|
||||
);
|
||||
assert_eq!(tokens.len(), 3);
|
||||
assert_token(&tokens[0], 0, "hel", 0, 3);
|
||||
assert_token(&tokens[1], 0, "ell", 1, 4);
|
||||
@@ -370,7 +391,11 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_ngram_tokenizer_2_5_prefix() {
|
||||
let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("frankenstein"));
|
||||
let tokens = test_helper(
|
||||
NgramTokenizer::prefix_only(2, 5)
|
||||
.unwrap()
|
||||
.token_stream("frankenstein"),
|
||||
);
|
||||
assert_eq!(tokens.len(), 4);
|
||||
assert_token(&tokens[0], 0, "fr", 0, 2);
|
||||
assert_token(&tokens[1], 0, "fra", 0, 3);
|
||||
@@ -380,7 +405,11 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_ngram_non_ascii_1_2() {
|
||||
let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hεllo"));
|
||||
let tokens = test_helper(
|
||||
NgramTokenizer::all_ngrams(1, 2)
|
||||
.unwrap()
|
||||
.token_stream("hεllo"),
|
||||
);
|
||||
assert_eq!(tokens.len(), 9);
|
||||
assert_token(&tokens[0], 0, "h", 0, 1);
|
||||
assert_token(&tokens[1], 0, "hε", 0, 3);
|
||||
@@ -395,7 +424,11 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_ngram_non_ascii_2_5_prefix() {
|
||||
let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("hεllo"));
|
||||
let tokens = test_helper(
|
||||
NgramTokenizer::prefix_only(2, 5)
|
||||
.unwrap()
|
||||
.token_stream("hεllo"),
|
||||
);
|
||||
assert_eq!(tokens.len(), 4);
|
||||
assert_token(&tokens[0], 0, "hε", 0, 3);
|
||||
assert_token(&tokens[1], 0, "hεl", 0, 4);
|
||||
@@ -405,22 +438,26 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_ngram_empty() {
|
||||
let tokens = test_helper(NgramTokenizer::all_ngrams(1, 5).token_stream(""));
|
||||
let tokens = test_helper(NgramTokenizer::all_ngrams(1, 5).unwrap().token_stream(""));
|
||||
assert!(tokens.is_empty());
|
||||
let tokens = test_helper(NgramTokenizer::all_ngrams(2, 5).token_stream(""));
|
||||
let tokens = test_helper(NgramTokenizer::all_ngrams(2, 5).unwrap().token_stream(""));
|
||||
assert!(tokens.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic(expected = "min_gram must be greater than 0")]
|
||||
fn test_ngram_min_max_interval_empty() {
|
||||
test_helper(NgramTokenizer::all_ngrams(0, 2).token_stream("hellossss"));
|
||||
test_helper(
|
||||
NgramTokenizer::all_ngrams(0, 2)
|
||||
.unwrap()
|
||||
.token_stream("hellossss"),
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic(expected = "min_gram must not be greater than max_gram")]
|
||||
fn test_invalid_interval_should_panic_if_smaller() {
|
||||
NgramTokenizer::all_ngrams(2, 1);
|
||||
NgramTokenizer::all_ngrams(2, 1).unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -1,32 +1,34 @@
|
||||
use super::{Token, TokenStream, Tokenizer};
|
||||
|
||||
/// For each value of the field, emit a single unprocessed token.
|
||||
#[derive(Clone)]
|
||||
pub struct RawTokenizer;
|
||||
|
||||
pub struct RawTokenStream {
|
||||
#[derive(Clone, Default)]
|
||||
pub struct RawTokenizer {
|
||||
token: Token,
|
||||
}
|
||||
|
||||
pub struct RawTokenStream<'a> {
|
||||
token: &'a mut Token,
|
||||
has_token: bool,
|
||||
}
|
||||
|
||||
impl Tokenizer for RawTokenizer {
|
||||
type TokenStream<'a> = RawTokenStream;
|
||||
fn token_stream(&self, text: &str) -> RawTokenStream {
|
||||
let token = Token {
|
||||
offset_from: 0,
|
||||
offset_to: text.len(),
|
||||
position: 0,
|
||||
text: text.to_string(),
|
||||
position_length: 1,
|
||||
};
|
||||
type TokenStream<'a> = RawTokenStream<'a>;
|
||||
fn token_stream<'a>(&'a mut self, text: &str) -> RawTokenStream<'a> {
|
||||
self.token.reset();
|
||||
self.token.position = 0;
|
||||
self.token.position_length = 1;
|
||||
self.token.offset_from = 0;
|
||||
self.token.offset_to = text.len();
|
||||
self.token.text.clear();
|
||||
self.token.text.push_str(text);
|
||||
RawTokenStream {
|
||||
token,
|
||||
token: &mut self.token,
|
||||
has_token: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TokenStream for RawTokenStream {
|
||||
impl<'a> TokenStream for RawTokenStream<'a> {
|
||||
fn advance(&mut self) -> bool {
|
||||
let result = self.has_token;
|
||||
self.has_token = false;
|
||||
@@ -34,11 +36,11 @@ impl TokenStream for RawTokenStream {
|
||||
}
|
||||
|
||||
fn token(&self) -> &Token {
|
||||
&self.token
|
||||
self.token
|
||||
}
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
&mut self.token
|
||||
self.token
|
||||
}
|
||||
}
|
||||
|
||||
@@ -55,7 +57,7 @@ mod tests {
|
||||
}
|
||||
|
||||
fn token_stream_helper(text: &str) -> Vec<Token> {
|
||||
let a = TextAnalyzer::from(RawTokenizer);
|
||||
let mut a = TextAnalyzer::from(RawTokenizer::default());
|
||||
let mut token_stream = a.token_stream(text);
|
||||
let mut tokens: Vec<Token> = vec![];
|
||||
let mut add_token = |token: &Token| {
|
||||
|
||||
@@ -22,7 +22,7 @@ use crate::TantivyError;
|
||||
/// ```rust
|
||||
/// use tantivy::tokenizer::*;
|
||||
///
|
||||
/// let tokenizer = RegexTokenizer::new(r"'(?:\w*)'").unwrap();
|
||||
/// let mut tokenizer = RegexTokenizer::new(r"'(?:\w*)'").unwrap();
|
||||
/// let mut stream = tokenizer.token_stream("'aaa' bbb 'ccc' 'ddd'");
|
||||
/// {
|
||||
/// let token = stream.next().unwrap();
|
||||
@@ -48,6 +48,7 @@ use crate::TantivyError;
|
||||
#[derive(Clone)]
|
||||
pub struct RegexTokenizer {
|
||||
regex: Regex,
|
||||
token: Token,
|
||||
}
|
||||
|
||||
impl RegexTokenizer {
|
||||
@@ -55,17 +56,21 @@ impl RegexTokenizer {
|
||||
pub fn new(regex_pattern: &str) -> crate::Result<RegexTokenizer> {
|
||||
Regex::new(regex_pattern)
|
||||
.map_err(|_| TantivyError::InvalidArgument(regex_pattern.to_owned()))
|
||||
.map(|regex| Self { regex })
|
||||
.map(|regex| Self {
|
||||
regex,
|
||||
token: Token::default(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl Tokenizer for RegexTokenizer {
|
||||
type TokenStream<'a> = RegexTokenStream<'a>;
|
||||
fn token_stream<'a>(&self, text: &'a str) -> RegexTokenStream<'a> {
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> RegexTokenStream<'a> {
|
||||
self.token.reset();
|
||||
RegexTokenStream {
|
||||
regex: self.regex.clone(),
|
||||
text,
|
||||
token: Token::default(),
|
||||
token: &mut self.token,
|
||||
cursor: 0,
|
||||
}
|
||||
}
|
||||
@@ -74,7 +79,7 @@ impl Tokenizer for RegexTokenizer {
|
||||
pub struct RegexTokenStream<'a> {
|
||||
regex: Regex,
|
||||
text: &'a str,
|
||||
token: Token,
|
||||
token: &'a mut Token,
|
||||
cursor: usize,
|
||||
}
|
||||
|
||||
@@ -100,11 +105,11 @@ impl<'a> TokenStream for RegexTokenStream<'a> {
|
||||
}
|
||||
|
||||
fn token(&self) -> &Token {
|
||||
&self.token
|
||||
self.token
|
||||
}
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
&mut self.token
|
||||
self.token
|
||||
}
|
||||
}
|
||||
|
||||
@@ -147,7 +152,7 @@ mod tests {
|
||||
|
||||
fn token_stream_helper(text: &str, pattern: &str) -> Vec<Token> {
|
||||
let r = RegexTokenizer::new(pattern).unwrap();
|
||||
let a = TextAnalyzer::from(r);
|
||||
let mut a = TextAnalyzer::from(r);
|
||||
let mut token_stream = a.token_stream(text);
|
||||
let mut tokens: Vec<Token> = vec![];
|
||||
let mut add_token = |token: &Token| {
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
//! ```rust
|
||||
//! use tantivy::tokenizer::*;
|
||||
//!
|
||||
//! let tokenizer = TextAnalyzer::builder(SimpleTokenizer)
|
||||
//! let mut tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
//! .filter(RemoveLongFilter::limit(5))
|
||||
//! .build();
|
||||
//!
|
||||
@@ -57,7 +57,7 @@ pub struct RemoveLongFilterWrapper<T: Tokenizer> {
|
||||
impl<T: Tokenizer> Tokenizer for RemoveLongFilterWrapper<T> {
|
||||
type TokenStream<'a> = RemoveLongFilterStream<T::TokenStream<'a>>;
|
||||
|
||||
fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
RemoveLongFilterStream {
|
||||
token_length_limit: self.length_limit,
|
||||
tail: self.inner.token_stream(text),
|
||||
@@ -103,7 +103,7 @@ mod tests {
|
||||
}
|
||||
|
||||
fn token_stream_helper(text: &str) -> Vec<Token> {
|
||||
let a = TextAnalyzer::builder(SimpleTokenizer)
|
||||
let mut a = TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
.filter(RemoveLongFilter::limit(6))
|
||||
.build();
|
||||
let mut token_stream = a.token_stream(text);
|
||||
|
||||
@@ -3,23 +3,26 @@ use std::str::CharIndices;
|
||||
use super::{Token, TokenStream, Tokenizer};
|
||||
|
||||
/// Tokenize the text by splitting on whitespaces and punctuation.
|
||||
#[derive(Clone)]
|
||||
pub struct SimpleTokenizer;
|
||||
#[derive(Clone, Default)]
|
||||
pub struct SimpleTokenizer {
|
||||
token: Token,
|
||||
}
|
||||
|
||||
/// TokenStream produced by the `SimpleTokenizer`.
|
||||
pub struct SimpleTokenStream<'a> {
|
||||
text: &'a str,
|
||||
chars: CharIndices<'a>,
|
||||
token: Token,
|
||||
token: &'a mut Token,
|
||||
}
|
||||
|
||||
impl Tokenizer for SimpleTokenizer {
|
||||
type TokenStream<'a> = SimpleTokenStream<'a>;
|
||||
fn token_stream<'a>(&self, text: &'a str) -> SimpleTokenStream<'a> {
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> SimpleTokenStream<'a> {
|
||||
self.token.reset();
|
||||
SimpleTokenStream {
|
||||
text,
|
||||
chars: text.char_indices(),
|
||||
token: Token::default(),
|
||||
token: &mut self.token,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -52,11 +55,11 @@ impl<'a> TokenStream for SimpleTokenStream<'a> {
|
||||
}
|
||||
|
||||
fn token(&self) -> &Token {
|
||||
&self.token
|
||||
self.token
|
||||
}
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
&mut self.token
|
||||
self.token
|
||||
}
|
||||
}
|
||||
|
||||
@@ -76,7 +79,7 @@ mod tests {
|
||||
}
|
||||
|
||||
fn token_stream_helper(text: &str) -> Vec<Token> {
|
||||
let a = TextAnalyzer::from(SimpleTokenizer);
|
||||
let mut a = TextAnalyzer::from(SimpleTokenizer::default());
|
||||
let mut token_stream = a.token_stream(text);
|
||||
let mut tokens: Vec<Token> = vec![];
|
||||
let mut add_token = |token: &Token| {
|
||||
|
||||
@@ -20,8 +20,8 @@ use super::{Token, TokenFilter, TokenStream, Tokenizer};
|
||||
/// ```rust
|
||||
/// use tantivy::tokenizer::{SimpleTokenizer, SplitCompoundWords, TextAnalyzer};
|
||||
///
|
||||
/// let tokenizer =
|
||||
/// TextAnalyzer::builder(SimpleTokenizer)
|
||||
/// let mut tokenizer =
|
||||
/// TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
/// .filter(
|
||||
/// SplitCompoundWords::from_dictionary([
|
||||
/// "dampf", "schiff", "fahrt", "brot", "backen", "automat",
|
||||
@@ -29,13 +29,13 @@ use super::{Token, TokenFilter, TokenStream, Tokenizer};
|
||||
/// .unwrap()
|
||||
/// )
|
||||
/// .build();
|
||||
///
|
||||
/// let mut stream = tokenizer.token_stream("dampfschifffahrt");
|
||||
/// assert_eq!(stream.next().unwrap().text, "dampf");
|
||||
/// assert_eq!(stream.next().unwrap().text, "schiff");
|
||||
/// assert_eq!(stream.next().unwrap().text, "fahrt");
|
||||
/// assert_eq!(stream.next(), None);
|
||||
///
|
||||
/// {
|
||||
/// let mut stream = tokenizer.token_stream("dampfschifffahrt");
|
||||
/// assert_eq!(stream.next().unwrap().text, "dampf");
|
||||
/// assert_eq!(stream.next().unwrap().text, "schiff");
|
||||
/// assert_eq!(stream.next().unwrap().text, "fahrt");
|
||||
/// assert_eq!(stream.next(), None);
|
||||
/// }
|
||||
/// let mut stream = tokenizer.token_stream("brotbackautomat");
|
||||
/// assert_eq!(stream.next().unwrap().text, "brotbackautomat");
|
||||
/// assert_eq!(stream.next(), None);
|
||||
@@ -86,6 +86,8 @@ impl TokenFilter for SplitCompoundWords {
|
||||
SplitCompoundWordsFilter {
|
||||
dict: self.dict,
|
||||
inner: tokenizer,
|
||||
cuts: Vec::new(),
|
||||
parts: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -94,29 +96,33 @@ impl TokenFilter for SplitCompoundWords {
|
||||
pub struct SplitCompoundWordsFilter<T> {
|
||||
dict: AhoCorasick,
|
||||
inner: T,
|
||||
}
|
||||
|
||||
impl<T: Tokenizer> Tokenizer for SplitCompoundWordsFilter<T> {
|
||||
type TokenStream<'a> = SplitCompoundWordsTokenStream<T::TokenStream<'a>>;
|
||||
|
||||
fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
SplitCompoundWordsTokenStream {
|
||||
dict: self.dict.clone(),
|
||||
tail: self.inner.token_stream(text),
|
||||
cuts: Vec::new(),
|
||||
parts: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct SplitCompoundWordsTokenStream<T> {
|
||||
dict: AhoCorasick,
|
||||
tail: T,
|
||||
cuts: Vec<usize>,
|
||||
parts: Vec<Token>,
|
||||
}
|
||||
|
||||
impl<T: TokenStream> SplitCompoundWordsTokenStream<T> {
|
||||
impl<T: Tokenizer> Tokenizer for SplitCompoundWordsFilter<T> {
|
||||
type TokenStream<'a> = SplitCompoundWordsTokenStream<'a, T::TokenStream<'a>>;
|
||||
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
self.cuts.clear();
|
||||
self.parts.clear();
|
||||
SplitCompoundWordsTokenStream {
|
||||
dict: self.dict.clone(),
|
||||
tail: self.inner.token_stream(text),
|
||||
cuts: &mut self.cuts,
|
||||
parts: &mut self.parts,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct SplitCompoundWordsTokenStream<'a, T> {
|
||||
dict: AhoCorasick,
|
||||
tail: T,
|
||||
cuts: &'a mut Vec<usize>,
|
||||
parts: &'a mut Vec<Token>,
|
||||
}
|
||||
|
||||
impl<'a, T: TokenStream> SplitCompoundWordsTokenStream<'a, T> {
|
||||
// Will use `self.cuts` to fill `self.parts` if `self.tail.token()`
|
||||
// can fully be split into consecutive matches against `self.dict`.
|
||||
fn split(&mut self) {
|
||||
@@ -152,7 +158,7 @@ impl<T: TokenStream> SplitCompoundWordsTokenStream<T> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: TokenStream> TokenStream for SplitCompoundWordsTokenStream<T> {
|
||||
impl<'a, T: TokenStream> TokenStream for SplitCompoundWordsTokenStream<'a, T> {
|
||||
fn advance(&mut self) -> bool {
|
||||
self.parts.pop();
|
||||
|
||||
@@ -188,7 +194,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn splitting_compound_words_works() {
|
||||
let tokenizer = TextAnalyzer::builder(SimpleTokenizer)
|
||||
let mut tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
.filter(SplitCompoundWords::from_dictionary(["foo", "bar"]).unwrap())
|
||||
.build();
|
||||
|
||||
|
||||
@@ -100,7 +100,7 @@ pub struct StemmerFilter<T> {
|
||||
impl<T: Tokenizer> Tokenizer for StemmerFilter<T> {
|
||||
type TokenStream<'a> = StemmerTokenStream<T::TokenStream<'a>>;
|
||||
|
||||
fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
let stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
|
||||
StemmerTokenStream {
|
||||
tail: self.inner.token_stream(text),
|
||||
|
||||
@@ -6,6 +6,7 @@ LANGUAGES = [
|
||||
"finnish",
|
||||
"french",
|
||||
"german",
|
||||
"hungarian",
|
||||
"italian",
|
||||
"norwegian",
|
||||
"portuguese",
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
//! ```rust
|
||||
//! use tantivy::tokenizer::*;
|
||||
//!
|
||||
//! let tokenizer = TextAnalyzer::builder(SimpleTokenizer)
|
||||
//! let mut tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
//! .filter(StopWordFilter::remove(vec!["the".to_string(), "is".to_string()]))
|
||||
//! .build();
|
||||
//!
|
||||
@@ -50,6 +50,7 @@ impl StopWordFilter {
|
||||
Language::Finnish => stopwords::FINNISH,
|
||||
Language::French => stopwords::FRENCH,
|
||||
Language::German => stopwords::GERMAN,
|
||||
Language::Hungarian => stopwords::HUNGARIAN,
|
||||
Language::Italian => stopwords::ITALIAN,
|
||||
Language::Norwegian => stopwords::NORWEGIAN,
|
||||
Language::Portuguese => stopwords::PORTUGUESE,
|
||||
@@ -90,7 +91,7 @@ pub struct StopWordFilterWrapper<T> {
|
||||
impl<T: Tokenizer> Tokenizer for StopWordFilterWrapper<T> {
|
||||
type TokenStream<'a> = StopWordFilterStream<T::TokenStream<'a>>;
|
||||
|
||||
fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
StopWordFilterStream {
|
||||
words: self.words.clone(),
|
||||
tail: self.inner.token_stream(text),
|
||||
@@ -151,7 +152,7 @@ mod tests {
|
||||
"am".to_string(),
|
||||
"i".to_string(),
|
||||
];
|
||||
let a = TextAnalyzer::builder(SimpleTokenizer)
|
||||
let mut a = TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
.filter(StopWordFilter::remove(stops))
|
||||
.build();
|
||||
let mut token_stream = a.token_stream(text);
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
These stop word lists are from the Snowball project (https://snowballstem.org/)
|
||||
which carries the following license:
|
||||
which carries the following copyright and license:
|
||||
|
||||
Copyright (c) 2001, Dr Martin Porter
|
||||
Copyright (c) 2004,2005, Richard Boulton
|
||||
@@ -862,6 +862,208 @@ pub const GERMAN: &[&str] = &[
|
||||
"zwischen",
|
||||
];
|
||||
|
||||
pub const HUNGARIAN: &[&str] = &[
|
||||
"a",
|
||||
"ahogy",
|
||||
"ahol",
|
||||
"aki",
|
||||
"akik",
|
||||
"akkor",
|
||||
"alatt",
|
||||
"által",
|
||||
"általában",
|
||||
"amely",
|
||||
"amelyek",
|
||||
"amelyekben",
|
||||
"amelyeket",
|
||||
"amelyet",
|
||||
"amelynek",
|
||||
"ami",
|
||||
"amit",
|
||||
"amolyan",
|
||||
"amíg",
|
||||
"amikor",
|
||||
"át",
|
||||
"abban",
|
||||
"ahhoz",
|
||||
"annak",
|
||||
"arra",
|
||||
"arról",
|
||||
"az",
|
||||
"azok",
|
||||
"azon",
|
||||
"azt",
|
||||
"azzal",
|
||||
"azért",
|
||||
"aztán",
|
||||
"azután",
|
||||
"azonban",
|
||||
"bár",
|
||||
"be",
|
||||
"belül",
|
||||
"benne",
|
||||
"cikk",
|
||||
"cikkek",
|
||||
"cikkeket",
|
||||
"csak",
|
||||
"de",
|
||||
"e",
|
||||
"eddig",
|
||||
"egész",
|
||||
"egy",
|
||||
"egyes",
|
||||
"egyetlen",
|
||||
"egyéb",
|
||||
"egyik",
|
||||
"egyre",
|
||||
"ekkor",
|
||||
"el",
|
||||
"elég",
|
||||
"ellen",
|
||||
"elő",
|
||||
"először",
|
||||
"előtt",
|
||||
"első",
|
||||
"én",
|
||||
"éppen",
|
||||
"ebben",
|
||||
"ehhez",
|
||||
"emilyen",
|
||||
"ennek",
|
||||
"erre",
|
||||
"ez",
|
||||
"ezt",
|
||||
"ezek",
|
||||
"ezen",
|
||||
"ezzel",
|
||||
"ezért",
|
||||
"és",
|
||||
"fel",
|
||||
"felé",
|
||||
"hanem",
|
||||
"hiszen",
|
||||
"hogy",
|
||||
"hogyan",
|
||||
"igen",
|
||||
"így",
|
||||
"illetve",
|
||||
"ill.",
|
||||
"ill",
|
||||
"ilyen",
|
||||
"ilyenkor",
|
||||
"ison",
|
||||
"ismét",
|
||||
"itt",
|
||||
"jó",
|
||||
"jól",
|
||||
"jobban",
|
||||
"kell",
|
||||
"kellett",
|
||||
"keresztül",
|
||||
"keressünk",
|
||||
"ki",
|
||||
"kívül",
|
||||
"között",
|
||||
"közül",
|
||||
"legalább",
|
||||
"lehet",
|
||||
"lehetett",
|
||||
"legyen",
|
||||
"lenne",
|
||||
"lenni",
|
||||
"lesz",
|
||||
"lett",
|
||||
"maga",
|
||||
"magát",
|
||||
"majd",
|
||||
"majd",
|
||||
"már",
|
||||
"más",
|
||||
"másik",
|
||||
"meg",
|
||||
"még",
|
||||
"mellett",
|
||||
"mert",
|
||||
"mely",
|
||||
"melyek",
|
||||
"mi",
|
||||
"mit",
|
||||
"míg",
|
||||
"miért",
|
||||
"milyen",
|
||||
"mikor",
|
||||
"minden",
|
||||
"mindent",
|
||||
"mindenki",
|
||||
"mindig",
|
||||
"mint",
|
||||
"mintha",
|
||||
"mivel",
|
||||
"most",
|
||||
"nagy",
|
||||
"nagyobb",
|
||||
"nagyon",
|
||||
"ne",
|
||||
"néha",
|
||||
"nekem",
|
||||
"neki",
|
||||
"nem",
|
||||
"néhány",
|
||||
"nélkül",
|
||||
"nincs",
|
||||
"olyan",
|
||||
"ott",
|
||||
"össze",
|
||||
"ő",
|
||||
"ők",
|
||||
"őket",
|
||||
"pedig",
|
||||
"persze",
|
||||
"rá",
|
||||
"s",
|
||||
"saját",
|
||||
"sem",
|
||||
"semmi",
|
||||
"sok",
|
||||
"sokat",
|
||||
"sokkal",
|
||||
"számára",
|
||||
"szemben",
|
||||
"szerint",
|
||||
"szinte",
|
||||
"talán",
|
||||
"tehát",
|
||||
"teljes",
|
||||
"tovább",
|
||||
"továbbá",
|
||||
"több",
|
||||
"úgy",
|
||||
"ugyanis",
|
||||
"új",
|
||||
"újabb",
|
||||
"újra",
|
||||
"után",
|
||||
"utána",
|
||||
"utolsó",
|
||||
"vagy",
|
||||
"vagyis",
|
||||
"valaki",
|
||||
"valami",
|
||||
"valamint",
|
||||
"való",
|
||||
"vagyok",
|
||||
"van",
|
||||
"vannak",
|
||||
"volt",
|
||||
"voltam",
|
||||
"voltak",
|
||||
"voltunk",
|
||||
"vissza",
|
||||
"vele",
|
||||
"viszont",
|
||||
"volna",
|
||||
];
|
||||
|
||||
pub const ITALIAN: &[&str] = &[
|
||||
"ad",
|
||||
"al",
|
||||
|
||||
@@ -5,35 +5,47 @@ use tokenizer_api::{BoxTokenStream, TokenFilter, Tokenizer};

use crate::tokenizer::empty_tokenizer::EmptyTokenizer;

/// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
#[derive(Clone)]
pub struct TextAnalyzer {
    tokenizer: Box<dyn BoxableTokenizer>,
}

impl Tokenizer for Box<dyn BoxableTokenizer> {
    type TokenStream<'a> = BoxTokenStream<'a>;

    // Note: we want to call `box_token_stream` on the concrete `Tokenizer`
    // implementation, not the `BoxableTokenizer` one as it will cause
    // a recursive call (and a stack overflow).
    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
        (**self).box_token_stream(text)
    }
}

impl Clone for Box<dyn BoxableTokenizer> {
    // Note: we want to call `box_clone` on the concrete `Tokenizer`
    // implementation in order to clone the concrete `Tokenizer`.
    fn clone(&self) -> Self {
        (**self).box_clone()
    }
}

/// A boxable `Tokenizer`, with its `TokenStream` type erased.
trait BoxableTokenizer: 'static + Send + Sync {
pub trait BoxableTokenizer: 'static + Send + Sync {
    /// Creates a boxed token stream for a given `str`.
    fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
    fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a>;
    /// Clone this tokenizer.
    fn box_clone(&self) -> Box<dyn BoxableTokenizer>;
}

impl<T: Tokenizer> BoxableTokenizer for T {
    fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
        self.token_stream(text).into()
    fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
        BoxTokenStream::new(self.token_stream(text))
    }
    fn box_clone(&self) -> Box<dyn BoxableTokenizer> {
        Box::new(self.clone())
    }
}

impl Clone for TextAnalyzer {
    fn clone(&self) -> Self {
        TextAnalyzer {
            tokenizer: self.tokenizer.box_clone(),
        }
    }
}

impl Default for TextAnalyzer {
    fn default() -> TextAnalyzer {
        TextAnalyzer::from(EmptyTokenizer)
@@ -53,13 +65,13 @@ impl TextAnalyzer {
    }

    /// Creates a token stream for a given `str`.
    pub fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
        self.tokenizer.box_token_stream(text)
    pub fn token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
        self.tokenizer.token_stream(text)
    }
}

/// Builder helper for [`TextAnalyzer`]
pub struct TextAnalyzerBuilder<T> {
pub struct TextAnalyzerBuilder<T = Box<dyn BoxableTokenizer>> {
    tokenizer: T,
}

@@ -71,7 +83,7 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
    /// ```rust
    /// use tantivy::tokenizer::*;
    ///
    /// let en_stem = TextAnalyzer::builder(SimpleTokenizer)
    /// let en_stem = TextAnalyzer::builder(SimpleTokenizer::default())
    ///     .filter(RemoveLongFilter::limit(40))
    ///     .filter(LowerCaser)
    ///     .filter(Stemmer::default())
@@ -83,6 +95,23 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
        }
    }

    /// Boxes the internal tokenizer. This is useful for adding dynamic filters.
    /// Note: this will be less performant than the non boxed version.
    pub fn dynamic(self) -> TextAnalyzerBuilder {
        let boxed_tokenizer = Box::new(self.tokenizer);
        TextAnalyzerBuilder {
            tokenizer: boxed_tokenizer,
        }
    }

    /// Appends a token filter to the current builder and returns a boxed version of the
    /// tokenizer. This is useful when you want to build a `TextAnalyzer` dynamically.
    /// Prefer using `TextAnalyzer::builder(tokenizer).filter(token_filter).build()` if
    /// possible as it will be more performant and create less boxes.
    pub fn filter_dynamic<F: TokenFilter>(self, token_filter: F) -> TextAnalyzerBuilder {
        self.filter(token_filter).dynamic()
    }

    /// Finalize building the TextAnalyzer
    pub fn build(self) -> TextAnalyzer {
        TextAnalyzer {
@@ -90,3 +119,57 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
        }
    }
}

#[cfg(test)]
mod tests {

    use super::*;
    use crate::tokenizer::{LowerCaser, RemoveLongFilter, SimpleTokenizer};

    #[test]
    fn test_text_analyzer_builder() {
        let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
            .filter(RemoveLongFilter::limit(40))
            .filter(LowerCaser)
            .build();
        let mut stream = analyzer.token_stream("- first bullet point");
        assert_eq!(stream.next().unwrap().text, "first");
        assert_eq!(stream.next().unwrap().text, "bullet");
    }

    #[test]
    fn test_text_analyzer_with_filters_boxed() {
        // This test shows how one can build a TextAnalyzer dynamically, by stacking a list
        // of parametrizable token filters.
        //
        // The following enum is the thing that would be serializable.
        // Note that token filters can have their own parameters, too, like the RemoveLongFilter
        enum SerializableTokenFilterEnum {
            LowerCaser(LowerCaser),
            RemoveLongFilter(RemoveLongFilter),
        }
        // Note that everything below is dynamic.
        let filters: Vec<SerializableTokenFilterEnum> = vec![
            SerializableTokenFilterEnum::LowerCaser(LowerCaser),
            SerializableTokenFilterEnum::RemoveLongFilter(RemoveLongFilter::limit(12)),
        ];
        let mut analyzer_builder: TextAnalyzerBuilder =
            TextAnalyzer::builder(SimpleTokenizer::default())
                .filter_dynamic(RemoveLongFilter::limit(40))
                .filter_dynamic(LowerCaser);
        for filter in filters {
            analyzer_builder = match filter {
                SerializableTokenFilterEnum::LowerCaser(lower_caser) => {
                    analyzer_builder.filter_dynamic(lower_caser)
                }
                SerializableTokenFilterEnum::RemoveLongFilter(remove_long_filter) => {
                    analyzer_builder.filter_dynamic(remove_long_filter)
                }
            }
        }
        let mut analyzer = analyzer_builder.build();
        let mut stream = analyzer.token_stream("first bullet point");
        assert_eq!(stream.next().unwrap().text, "first");
        assert_eq!(stream.next().unwrap().text, "bullet");
    }
}
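To make the API change above concrete, here is a standalone usage sketch (not part of the diff) combining the statically-typed `filter` calls with the new `filter_dynamic`, and calling `token_stream`, which now takes `&mut self`. The `tantivy::tokenizer::*` import mirrors the doc example in the hunk; the sample text is arbitrary.

use tantivy::tokenizer::*;

fn main() {
    // Static filters stay monomorphized; `filter_dynamic` boxes the chain, which is
    // useful when the filter list is only known at runtime (see the test above).
    let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(RemoveLongFilter::limit(40))
        .filter(LowerCaser)
        .filter_dynamic(Stemmer::default())
        .build();

    // `token_stream` now takes `&mut self`, so both the analyzer and the stream
    // need to be mutable.
    let mut stream = analyzer.token_stream("Searching Tantivy indexes quickly");
    while let Some(token) = stream.next() {
        println!("{} (pos {})", token.text, token.position);
    }
}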
@@ -27,6 +27,7 @@ pub struct TokenizerManager {

impl TokenizerManager {
    /// Creates an empty tokenizer manager.
    #[allow(clippy::new_without_default)]
    pub fn new() -> Self {
        Self {
            tokenizers: Arc::new(RwLock::new(HashMap::new())),
@@ -51,30 +52,52 @@ impl TokenizerManager {
            .get(tokenizer_name)
            .cloned()
    }
}

impl Default for TokenizerManager {
    /// Creates an `TokenizerManager` prepopulated with
    /// the default pre-configured tokenizers of `tantivy`.
    fn default() -> TokenizerManager {
    pub fn default_for_indexing() -> TokenizerManager {
        let manager = TokenizerManager::new();
        manager.register("raw", RawTokenizer);
        manager.register("raw", RawTokenizer::default());
        manager.register(
            "default",
            TextAnalyzer::builder(SimpleTokenizer)
            TextAnalyzer::builder(SimpleTokenizer::default())
                .filter(RemoveLongFilter::limit(40))
                .filter(LowerCaser)
                .build(),
        );
        manager.register(
            "en_stem",
            TextAnalyzer::builder(SimpleTokenizer)
            TextAnalyzer::builder(SimpleTokenizer::default())
                .filter(RemoveLongFilter::limit(40))
                .filter(LowerCaser)
                .filter(Stemmer::new(Language::English))
                .build(),
        );
        manager.register("whitespace", WhitespaceTokenizer);
        manager.register("whitespace", WhitespaceTokenizer::default());
        manager
    }

    /// Creates an `TokenizerManager` prepopulated with
    /// the default pre-configured tokenizers of `tantivy`
    /// for fast fields.
    ///
    /// Fast fields usually do not really tokenize the text.
    /// It is however very useful to filter / normalize the text.
    pub fn default_for_fast_fields() -> TokenizerManager {
        let manager = TokenizerManager::new();
        let raw_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
            .filter(RemoveLongFilter::limit(255))
            .build();
        let lower_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
            .filter(RemoveLongFilter::limit(255))
            .filter(LowerCaser)
            .build();
        manager.register(
            crate::schema::DEFAULT_FAST_FIELD_TOKENIZER,
            lower_tokenizer.clone(),
        );
        manager.register("raw", raw_tokenizer);
        manager.register("lower", lower_tokenizer);
        manager
    }
}
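A short hedged sketch of the two constructors added in this hunk, plus registering and retrieving a custom analyzer. It is not part of the PR: the `"en_lower"` name is hypothetical, and `get` returning an `Option` with a cloned analyzer is inferred from the `.get(..).cloned()` body shown above rather than stated in the diff.

use tantivy::tokenizer::*;

fn main() {
    // Indexing-side manager ships "raw", "default", "en_stem" and "whitespace";
    // the fast-field-side manager ships "raw" and "lower" (per the diff above).
    let indexing_tokenizers = TokenizerManager::default_for_indexing();
    let _fast_field_tokenizers = TokenizerManager::default_for_fast_fields();

    // Registration takes &self (the map sits behind an Arc<RwLock<..>>), so the
    // manager itself does not need to be mutable.
    indexing_tokenizers.register(
        "en_lower",
        TextAnalyzer::builder(SimpleTokenizer::default())
            .filter(LowerCaser)
            .build(),
    );

    // Assumption: `get` hands back a clone of the registered analyzer.
    let mut analyzer = indexing_tokenizers
        .get("en_lower")
        .expect("registered just above");
    let mut stream = analyzer.token_stream("Hello, Tantivy!");
    while let Some(token) = stream.next() {
        println!("{}", token.text);
    }
}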
@@ -3,22 +3,25 @@ use std::str::CharIndices

use super::{Token, TokenStream, Tokenizer};

/// Tokenize the text by splitting on whitespaces.
#[derive(Clone)]
pub struct WhitespaceTokenizer;
#[derive(Clone, Default)]
pub struct WhitespaceTokenizer {
    token: Token,
}

pub struct WhitespaceTokenStream<'a> {
    text: &'a str,
    chars: CharIndices<'a>,
    token: Token,
    token: &'a mut Token,
}

impl Tokenizer for WhitespaceTokenizer {
    type TokenStream<'a> = WhitespaceTokenStream<'a>;
    fn token_stream<'a>(&self, text: &'a str) -> WhitespaceTokenStream<'a> {
    fn token_stream<'a>(&'a mut self, text: &'a str) -> WhitespaceTokenStream<'a> {
        self.token.reset();
        WhitespaceTokenStream {
            text,
            chars: text.char_indices(),
            token: Token::default(),
            token: &mut self.token,
        }
    }
}
@@ -51,11 +54,11 @@ impl<'a> TokenStream for WhitespaceTokenStream<'a> {
    }

    fn token(&self) -> &Token {
        &self.token
        self.token
    }

    fn token_mut(&mut self) -> &mut Token {
        &mut self.token
        self.token
    }
}

@@ -75,7 +78,7 @@ mod tests {
    }

    fn token_stream_helper(text: &str) -> Vec<Token> {
        let a = TextAnalyzer::from(WhitespaceTokenizer);
        let mut a = TextAnalyzer::from(WhitespaceTokenizer::default());
        let mut token_stream = a.token_stream(text);
        let mut tokens: Vec<Token> = vec![];
        let mut add_token = |token: &Token| {
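The hunk above moves the reusable `Token` into the tokenizer and hands the stream a `&mut Token`, which is why `token_stream` now needs `&'a mut self`. Below is a minimal sketch of a custom tokenizer following the same pattern. The `CommaTokenizer` is hypothetical and not part of the PR; it assumes the `Tokenizer`/`TokenStream` trait signatures and public `Token` fields visible in the hunks above, plus the usual `advance(&mut self) -> bool` method of `TokenStream`.

use tantivy::tokenizer::{Token, TokenStream, Tokenizer};

// Hypothetical comma splitter; like WhitespaceTokenizer above, it owns one Token
// that is reset in `token_stream` and reused by every stream it hands out.
#[derive(Clone, Default)]
pub struct CommaTokenizer {
    token: Token,
}

pub struct CommaTokenStream<'a> {
    text: &'a str,
    offset: usize,
    position: usize,
    token: &'a mut Token,
}

impl Tokenizer for CommaTokenizer {
    type TokenStream<'a> = CommaTokenStream<'a>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> CommaTokenStream<'a> {
        self.token.reset();
        CommaTokenStream {
            text,
            offset: 0,
            position: 0,
            token: &mut self.token,
        }
    }
}

impl<'a> TokenStream for CommaTokenStream<'a> {
    fn advance(&mut self) -> bool {
        while self.offset < self.text.len() {
            let rest = &self.text[self.offset..];
            let end = rest.find(',').unwrap_or(rest.len());
            let segment = &rest[..end];
            let start = self.offset;
            // Step past the segment and its comma; overshooting at the end is fine,
            // the loop condition stops us before we index again.
            self.offset += end + 1;
            if segment.is_empty() {
                continue;
            }
            // Refill the shared token in place instead of allocating a new one.
            self.token.reset();
            self.token.offset_from = start;
            self.token.offset_to = start + end;
            self.token.position = self.position;
            self.token.text.push_str(segment);
            self.position += 1;
            return true;
        }
        false
    }

    fn token(&self) -> &Token {
        self.token
    }

    fn token_mut(&mut self) -> &mut Token {
        self.token
    }
}

fn main() {
    let mut tokenizer = CommaTokenizer::default();
    let mut stream = tokenizer.token_stream("a,b,,c");
    while stream.advance() {
        let token = stream.token();
        println!("{:?} [{}..{}]", token.text, token.offset_from, token.offset_to);
    }
}

Resetting the shared token in `token_stream` mirrors what the diff does for `WhitespaceTokenizer`: it keeps one allocation alive across streams instead of creating a fresh `Token` per stream.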
@@ -3,9 +3,14 @@ name = "tantivy-sstable"
version = "0.1.0"
edition = "2021"
license = "MIT"
homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"
keywords = ["search", "information", "retrieval", "sstable"]
categories = ["database-implementations", "data-structures", "compression"]
description = "sstables for tantivy"

[dependencies]
common = {path="../common", package="tantivy-common"}
common = {version= "0.5", path="../common", package="tantivy-common"}
tantivy-fst = "0.4"
# experimental gives us access to Decompressor::upper_bound
zstd = { version = "0.12", features = ["experimental"] }
@@ -3,10 +3,14 @@ name = "tantivy-stacker"
version = "0.1.0"
edition = "2021"
license = "MIT"
homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"
description = "term hashmap used for indexing"

[dependencies]
murmurhash32 = "0.3"
common = { version = "0.5", path = "../common/", package = "tantivy-common" }
ahash = { version = "0.8.3", default-features = false, optional = true }

[[bench]]
harness = false
@@ -20,8 +24,10 @@ path = "example/hashmap.rs"
[dev-dependencies]
rand = "0.8.5"
zipf = "7.0.0"
criterion = "0.5.0"
criterion = { git = "https://github.com/PSeitz/criterion.rs/", rev = "e6f98ee"} # This fork includes stack randomization to reduce caching effects
rustc-hash = "1.1.0"
proptest = "1.2.0"

[features]
compare_hash_only = ["ahash"] # Compare hash only, not the key in the Hashmap
unstable = [] # useful for benches.
Some files were not shown because too many files have changed in this diff.