mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-04 08:12:54 +00:00
Compare commits
22 Commits
removed-ex
...
addconvers
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1d72745bf5 | ||
|
|
8199aa7de7 | ||
|
|
657f0cd3bd | ||
|
|
3a82ef2560 | ||
|
|
3546e7fc63 | ||
|
|
862f367f9e | ||
|
|
14137d91c4 | ||
|
|
924fc70cb5 | ||
|
|
07023948aa | ||
|
|
0cb53207ec | ||
|
|
17c783b4db | ||
|
|
7220df8a09 | ||
|
|
e3eacb4388 | ||
|
|
fdecb79273 | ||
|
|
27f202083c | ||
|
|
ccb09aaa83 | ||
|
|
4b7c485a08 | ||
|
|
3942fc6d2b | ||
|
|
b325d569ad | ||
|
|
7ee78bda52 | ||
|
|
184a9daa8a | ||
|
|
47e01b345b |
5
.github/workflows/coverage.yml
vendored
5
.github/workflows/coverage.yml
vendored
@@ -6,6 +6,11 @@ on:
|
||||
pull_request:
|
||||
branches: [main]
|
||||
|
||||
# Ensures that we cancel running jobs for the same PR / same workflow.
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
coverage:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
5
.github/workflows/long_running.yml
vendored
5
.github/workflows/long_running.yml
vendored
@@ -8,6 +8,11 @@ env:
|
||||
CARGO_TERM_COLOR: always
|
||||
NUM_FUNCTIONAL_TEST_ITERATIONS: 20000
|
||||
|
||||
# Ensures that we cancel running jobs for the same PR / same workflow.
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
test:
|
||||
|
||||
|
||||
5
.github/workflows/test.yml
vendored
5
.github/workflows/test.yml
vendored
@@ -9,6 +9,11 @@ on:
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
|
||||
# Ensures that we cancel running jobs for the same PR / same workflow.
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
check:
|
||||
|
||||
|
||||
78
CHANGELOG.md
78
CHANGELOG.md
@@ -1,3 +1,81 @@
|
||||
|
||||
Tantivy 0.20 [Unreleased]
|
||||
================================
|
||||
#### Bugfixes
|
||||
- Fix phrase queries with slop (slop supports now transpositions, algorithm that carries slop so far for num terms > 2) [#2031](https://github.com/quickwit-oss/tantivy/issues/2031)[#2020](https://github.com/quickwit-oss/tantivy/issues/2020)(@PSeitz)
|
||||
- Handle error for exists on MMapDirectory [#1988](https://github.com/quickwit-oss/tantivy/issues/1988) (@PSeitz)
|
||||
- Aggregation
|
||||
- Fix min doc_count empty merge bug [#2057](https://github.com/quickwit-oss/tantivy/issues/2057) (@PSeitz)
|
||||
- Fix: Sort order for term aggregations (sort order on key was inverted) [#1858](https://github.com/quickwit-oss/tantivy/issues/1858) (@PSeitz)
|
||||
|
||||
#### Features/Improvements
|
||||
- Add PhrasePrefixQuery [#1842](https://github.com/quickwit-oss/tantivy/issues/1842) (@trinity-1686a)
|
||||
- Add `coerce` option for text and numbers types (convert the value instead of returning an error during indexing) [#1904](https://github.com/quickwit-oss/tantivy/issues/1904) (@PSeitz)
|
||||
- Add regex tokenizer [#1759](https://github.com/quickwit-oss/tantivy/issues/1759)(@mkleen)
|
||||
- Move tokenizer API to seperate crate. Having a seperate crate with a stable API will allow us to use tokenizers with different tantivy versions. [#1767](https://github.com/quickwit-oss/tantivy/issues/1767) (@PSeitz)
|
||||
- **Columnar crate**: New fast field handling (@fulmicoton @PSeitz) [#1806](https://github.com/quickwit-oss/tantivy/issues/1806)[#1809](https://github.com/quickwit-oss/tantivy/issues/1809)
|
||||
- Support for fast fields with optional values. Previously tantivy supported only single-valued and multi-value fast fields. The encoding of optional fast fields is now very compact.
|
||||
- Fast field Support for JSON (schemaless fast fields). Support multiple types on the same column. [#1876](https://github.com/quickwit-oss/tantivy/issues/1876) (@fulmicoton)
|
||||
- Unified access for fast fields over different cardinalities.
|
||||
- Unified storage for typed and untyped fields.
|
||||
- Move fastfield codecs into columnar. [#1782](https://github.com/quickwit-oss/tantivy/issues/1782) (@fulmicoton)
|
||||
- Sparse dense index for optional values [#1716](https://github.com/quickwit-oss/tantivy/issues/1716) (@PSeitz)
|
||||
- Switch to nanosecond precision in DateTime fastfield [#2016](https://github.com/quickwit-oss/tantivy/issues/2016) (@PSeitz)
|
||||
- **Aggregation**
|
||||
- Add `date_histogram` aggregation (only `fixed_interval` for now) [#1900](https://github.com/quickwit-oss/tantivy/issues/1900) (@PSeitz)
|
||||
- Add `percentiles` aggregations [#1984](https://github.com/quickwit-oss/tantivy/issues/1984) (@PSeitz)
|
||||
- [**breaking**] Drop JSON support on intermediate agg result (we use postcard as format in `quickwit` to send intermediate results) [#1992](https://github.com/quickwit-oss/tantivy/issues/1992) (@PSeitz)
|
||||
- Set memory limit in bytes for aggregations after which they abort (Previously there was only the bucket limit) [#1942](https://github.com/quickwit-oss/tantivy/issues/1942)[#1957](https://github.com/quickwit-oss/tantivy/issues/1957)(@PSeitz)
|
||||
- Add support for u64,i64,f64 fields in term aggregation [#1883](https://github.com/quickwit-oss/tantivy/issues/1883) (@PSeitz)
|
||||
- Allow histogram bounds to be passed as Rfc3339 [#2076](https://github.com/quickwit-oss/tantivy/issues/2076) (@PSeitz)
|
||||
- Add count, min, max, and sum aggregations [#1794](https://github.com/quickwit-oss/tantivy/issues/1794) (@guilload)
|
||||
- Switch to Aggregation without serde_untagged => better deserialization errors. [#2003](https://github.com/quickwit-oss/tantivy/issues/2003) (@PSeitz)
|
||||
- Switch to ms in histogram for date type (ES compatibility) [#2045](https://github.com/quickwit-oss/tantivy/issues/2045) (@PSeitz)
|
||||
- Reduce term aggregation memory consumption [#2013](https://github.com/quickwit-oss/tantivy/issues/2013) (@PSeitz)
|
||||
- Reduce agg memory consumption: Replace generic aggregation collector (which has a high memory requirement per instance) in aggregation tree with optimized versions behind a trait.
|
||||
- Split term collection count and sub_agg (Faster term agg with less memory consumption for cases without sub-aggs) [#1921](https://github.com/quickwit-oss/tantivy/issues/1921) (@PSeitz)
|
||||
- Schemaless aggregations: In combination with stacker tantivy supports now schemaless aggregations via the JSON type.
|
||||
- Add aggregation support for JSON type [#1888](https://github.com/quickwit-oss/tantivy/issues/1888) (@PSeitz)
|
||||
- Mixed types support on JSON fields in aggs [#1971](https://github.com/quickwit-oss/tantivy/issues/1971) (@PSeitz)
|
||||
- Perf: Fetch blocks of vals in aggregation for all cardinality [#1950](https://github.com/quickwit-oss/tantivy/issues/1950) (@PSeitz)
|
||||
- `Searcher` with disabled scoring via `EnableScoring::Disabled` [#1780](https://github.com/quickwit-oss/tantivy/issues/1780) (@shikhar)
|
||||
- Enable tokenizer on json fields [#2053](https://github.com/quickwit-oss/tantivy/issues/2053) (@PSeitz)
|
||||
- Enforcing "NOT" and "-" queries consistency in UserInputAst [#1609](https://github.com/quickwit-oss/tantivy/issues/1609) (@bazhenov)
|
||||
- Faster indexing
|
||||
- Refactor tokenization pipeline to use GATs [#1924](https://github.com/quickwit-oss/tantivy/issues/1924) (@trinity-1686a)
|
||||
- Faster term hash map [#2058](https://github.com/quickwit-oss/tantivy/issues/2058)[#1940](https://github.com/quickwit-oss/tantivy/issues/1940) (@PSeitz)
|
||||
- Refactor vint [#2010](https://github.com/quickwit-oss/tantivy/issues/2010) (@PSeitz)
|
||||
- Faster search
|
||||
- Work in batches of docs on the SegmentCollector (Only for cases without score for now) [#1937](https://github.com/quickwit-oss/tantivy/issues/1937) (@PSeitz)
|
||||
- Faster fast field range queries using SIMD [#1954](https://github.com/quickwit-oss/tantivy/issues/1954) (@fulmicoton)
|
||||
- Improve fast field range query performance [#1864](https://github.com/quickwit-oss/tantivy/issues/1864) (@PSeitz)
|
||||
- Make BM25 scoring more flexible [#1855](https://github.com/quickwit-oss/tantivy/issues/1855) (@alexcole)
|
||||
- Switch fs2 to fs4 as it is now unmaintained and does not support illumos [#1944](https://github.com/quickwit-oss/tantivy/issues/1944) (@Toasterson)
|
||||
- Made BooleanWeight and BoostWeight public [#1991](https://github.com/quickwit-oss/tantivy/issues/1991) (@fulmicoton)
|
||||
- Make index compatible with virtual drives on Windows [#1843](https://github.com/quickwit-oss/tantivy/issues/1843) (@gyk)
|
||||
- Add stop words for Hungarian language [#2069](https://github.com/quickwit-oss/tantivy/issues/2069) (@tnxbutno)
|
||||
- Auto downgrade index record option, instead of vint error [#1857](https://github.com/quickwit-oss/tantivy/issues/1857) (@PSeitz)
|
||||
- Enable range query on fast field for u64 compatible types [#1762](https://github.com/quickwit-oss/tantivy/issues/1762) (@PSeitz) [#1876]
|
||||
- sstable
|
||||
- Isolating sstable and stacker in independant crates. [#1718](https://github.com/quickwit-oss/tantivy/issues/1718) (@fulmicoton)
|
||||
- New sstable format [#1943](https://github.com/quickwit-oss/tantivy/issues/1943)[#1953](https://github.com/quickwit-oss/tantivy/issues/1953) (@trinity-1686a)
|
||||
- Use DeltaReader directly to implement Dictionnary::ord_to_term [#1928](https://github.com/quickwit-oss/tantivy/issues/1928) (@trinity-1686a)
|
||||
- Use DeltaReader directly to implement Dictionnary::term_ord [#1925](https://github.com/quickwit-oss/tantivy/issues/1925) (@trinity-1686a)
|
||||
- Add seperate tokenizer manager for fast fields [#2019](https://github.com/quickwit-oss/tantivy/issues/2019) (@PSeitz)
|
||||
- Make construction of LevenshteinAutomatonBuilder for FuzzyTermQuery instances lazy. [#1756](https://github.com/quickwit-oss/tantivy/issues/1756) (@adamreichold)
|
||||
- Added support for madvise when opening an mmaped Index [#2036](https://github.com/quickwit-oss/tantivy/issues/2036) (@fulmicoton)
|
||||
- Rename `DatePrecision` to `DateTimePrecision` [#2051](https://github.com/quickwit-oss/tantivy/issues/2051) (@guilload)
|
||||
- Query Parser
|
||||
- Quotation mark can now be used for phrase queries. [#2050](https://github.com/quickwit-oss/tantivy/issues/2050) (@fulmicoton)
|
||||
- PhrasePrefixQuery is supported in the query parser via: `field:"phrase ter"*` [#2044](https://github.com/quickwit-oss/tantivy/issues/2044) (@adamreichold)
|
||||
- Docs
|
||||
- Update examples for literate docs [#1880](https://github.com/quickwit-oss/tantivy/issues/1880) (@PSeitz)
|
||||
- Add ip field example [#1775](https://github.com/quickwit-oss/tantivy/issues/1775) (@PSeitz)
|
||||
- Fix doc store cache documentation [#1821](https://github.com/quickwit-oss/tantivy/issues/1821) (@PSeitz)
|
||||
- Fix BooleanQuery document [#1999](https://github.com/quickwit-oss/tantivy/issues/1999) (@RT_Enzyme)
|
||||
- Update comments in the faceted search example [#1737](https://github.com/quickwit-oss/tantivy/issues/1737) (@DawChihLiou)
|
||||
|
||||
|
||||
Tantivy 0.19
|
||||
================================
|
||||
#### Bugfixes
|
||||
|
||||
17
Cargo.toml
17
Cargo.toml
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tantivy"
|
||||
version = "0.19.0"
|
||||
version = "0.20.2"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = ["database-implementations", "data-structures"]
|
||||
@@ -12,6 +12,7 @@ readme = "README.md"
|
||||
keywords = ["search", "information", "retrieval"]
|
||||
edition = "2021"
|
||||
rust-version = "1.62"
|
||||
exclude = ["benches/*.json", "benches/*.txt"]
|
||||
|
||||
[dependencies]
|
||||
oneshot = "0.1.5"
|
||||
@@ -55,13 +56,13 @@ measure_time = "0.8.2"
|
||||
async-trait = "0.1.53"
|
||||
arc-swap = "1.5.0"
|
||||
|
||||
columnar = { version="0.1", path="./columnar", package ="tantivy-columnar" }
|
||||
sstable = { version="0.1", path="./sstable", package ="tantivy-sstable", optional = true }
|
||||
stacker = { version="0.1", path="./stacker", package ="tantivy-stacker" }
|
||||
query-grammar = { version= "0.19.0", path="./query-grammar", package = "tantivy-query-grammar" }
|
||||
tantivy-bitpacker = { version= "0.3", path="./bitpacker" }
|
||||
columnar = { version= "0.1", path="./columnar", package ="tantivy-columnar" }
|
||||
sstable = { version= "0.1", path="./sstable", package ="tantivy-sstable", optional = true }
|
||||
stacker = { version= "0.1", path="./stacker", package ="tantivy-stacker" }
|
||||
query-grammar = { version= "0.20.0", path="./query-grammar", package = "tantivy-query-grammar" }
|
||||
tantivy-bitpacker = { version= "0.4", path="./bitpacker" }
|
||||
common = { version= "0.5", path = "./common/", package = "tantivy-common" }
|
||||
tokenizer-api = { version="0.1", path="./tokenizer-api", package="tantivy-tokenizer-api" }
|
||||
tokenizer-api = { version= "0.1", path="./tokenizer-api", package="tantivy-tokenizer-api" }
|
||||
sketches-ddsketch = { version = "0.2.1", features = ["use_serde"] }
|
||||
futures-util = { version = "0.3.28", optional = true }
|
||||
|
||||
@@ -77,7 +78,7 @@ proptest = "1.0.0"
|
||||
criterion = "0.5"
|
||||
test-log = "0.2.10"
|
||||
env_logger = "0.10.0"
|
||||
pprof = { version = "0.11.0", features = ["flamegraph", "criterion"] }
|
||||
pprof = { git = "https://github.com/PSeitz/pprof-rs/", rev = "53af24b", features = ["flamegraph", "criterion"] } # temp fork that works with criterion 0.5
|
||||
futures = "0.3.21"
|
||||
paste = "1.0.11"
|
||||
more-asserts = "0.3.1"
|
||||
|
||||
21
RELEASE.md
Normal file
21
RELEASE.md
Normal file
@@ -0,0 +1,21 @@
|
||||
# Release a new Tantivy Version
|
||||
|
||||
## Steps
|
||||
|
||||
1. Identify new packages in workspace since last release
|
||||
2. Identify changed packages in workspace since last release
|
||||
3. Bump version in `Cargo.toml` and their dependents for all changed packages
|
||||
4. Update version of root `Cargo.toml`
|
||||
5. Publish version starting with leaf nodes
|
||||
6. Set git tag with new version
|
||||
|
||||
|
||||
In conjucation with `cargo-release` Steps 1-4 (I'm not sure if the change detection works):
|
||||
Set new packages to version 0.0.0
|
||||
|
||||
Replace prev-tag-name
|
||||
```bash
|
||||
cargo release --workspace --no-publish -v --prev-tag-name 0.19 --push-remote origin minor --no-tag --execute
|
||||
```
|
||||
|
||||
no-tag or it will create tags for all the subpackages
|
||||
23
appveyor.yml
23
appveyor.yml
@@ -1,23 +0,0 @@
|
||||
# Appveyor configuration template for Rust using rustup for Rust installation
|
||||
# https://github.com/starkat99/appveyor-rust
|
||||
|
||||
os: Visual Studio 2015
|
||||
environment:
|
||||
matrix:
|
||||
- channel: stable
|
||||
target: x86_64-pc-windows-msvc
|
||||
|
||||
install:
|
||||
- appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe
|
||||
- rustup-init -yv --default-toolchain %channel% --default-host %target%
|
||||
- set PATH=%PATH%;%USERPROFILE%\.cargo\bin
|
||||
- if defined msys_bits set PATH=%PATH%;C:\msys64\mingw%msys_bits%\bin
|
||||
- rustc -vV
|
||||
- cargo -vV
|
||||
|
||||
build: false
|
||||
|
||||
test_script:
|
||||
- REM SET RUST_LOG=tantivy,test & cargo test --all --verbose --no-default-features --features lz4-compression --features mmap
|
||||
- REM SET RUST_LOG=tantivy,test & cargo test test_store --verbose --no-default-features --features lz4-compression --features snappy-compression --features brotli-compression --features mmap
|
||||
- REM SET RUST_BACKTRACE=1 & cargo build --examples
|
||||
@@ -5,7 +5,7 @@ const ALICE_TXT: &str = include_str!("alice.txt");
|
||||
|
||||
pub fn criterion_benchmark(c: &mut Criterion) {
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
let tokenizer = tokenizer_manager.get("default").unwrap();
|
||||
let mut tokenizer = tokenizer_manager.get("default").unwrap();
|
||||
c.bench_function("default-tokenize-alice", |b| {
|
||||
b.iter(|| {
|
||||
let mut word_count = 0;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tantivy-bitpacker"
|
||||
version = "0.3.0"
|
||||
version = "0.4.0"
|
||||
edition = "2021"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
|
||||
89
cliff.toml
Normal file
89
cliff.toml
Normal file
@@ -0,0 +1,89 @@
|
||||
# configuration file for git-cliff{ pattern = "foo", replace = "bar"}
|
||||
# see https://github.com/orhun/git-cliff#configuration-file
|
||||
|
||||
[changelog]
|
||||
# changelog header
|
||||
header = """
|
||||
"""
|
||||
# template for the changelog body
|
||||
# https://tera.netlify.app/docs/#introduction
|
||||
body = """
|
||||
{% if version %}\
|
||||
{{ version | trim_start_matches(pat="v") }} ({{ timestamp | date(format="%Y-%m-%d") }})
|
||||
==================
|
||||
{% else %}\
|
||||
## [unreleased]
|
||||
{% endif %}\
|
||||
{% for commit in commits %}
|
||||
- {% if commit.breaking %}[**breaking**] {% endif %}{{ commit.message | split(pat="\n") | first | trim | upper_first }}(@{{ commit.author.name }})\
|
||||
{% endfor %}
|
||||
"""
|
||||
# remove the leading and trailing whitespace from the template
|
||||
trim = true
|
||||
# changelog footer
|
||||
footer = """
|
||||
"""
|
||||
|
||||
postprocessors = [
|
||||
{ pattern = 'Paul Masurel', replace = "fulmicoton"}, # replace with github user
|
||||
{ pattern = 'PSeitz', replace = "PSeitz"}, # replace with github user
|
||||
{ pattern = 'Adam Reichold', replace = "adamreichold"}, # replace with github user
|
||||
{ pattern = 'trinity-1686a', replace = "trinity-1686a"}, # replace with github user
|
||||
{ pattern = 'Michael Kleen', replace = "mkleen"}, # replace with github user
|
||||
{ pattern = 'Adrien Guillo', replace = "guilload"}, # replace with github user
|
||||
{ pattern = 'François Massot', replace = "fmassot"}, # replace with github user
|
||||
{ pattern = '', replace = ""}, # replace with github user
|
||||
]
|
||||
|
||||
[git]
|
||||
# parse the commits based on https://www.conventionalcommits.org
|
||||
# This is required or commit.message contains the whole commit message and not just the title
|
||||
conventional_commits = true
|
||||
# filter out the commits that are not conventional
|
||||
filter_unconventional = false
|
||||
# process each line of a commit as an individual commit
|
||||
split_commits = false
|
||||
# regex for preprocessing the commit messages
|
||||
commit_preprocessors = [
|
||||
{ pattern = '\((\w+\s)?#([0-9]+)\)', replace = "[#${2}](https://github.com/quickwit-oss/tantivy/issues/${2})"}, # replace issue numbers
|
||||
]
|
||||
#link_parsers = [
|
||||
#{ pattern = "#(\\d+)", href = "https://github.com/quickwit-oss/tantivy/pulls/$1"},
|
||||
#]
|
||||
# regex for parsing and grouping commits
|
||||
commit_parsers = [
|
||||
{ message = "^feat", group = "Features"},
|
||||
{ message = "^fix", group = "Bug Fixes"},
|
||||
{ message = "^doc", group = "Documentation"},
|
||||
{ message = "^perf", group = "Performance"},
|
||||
{ message = "^refactor", group = "Refactor"},
|
||||
{ message = "^style", group = "Styling"},
|
||||
{ message = "^test", group = "Testing"},
|
||||
{ message = "^chore\\(release\\): prepare for", skip = true},
|
||||
{ message = "(?i)clippy", skip = true},
|
||||
{ message = "(?i)dependabot", skip = true},
|
||||
{ message = "(?i)fmt", skip = true},
|
||||
{ message = "(?i)bump", skip = true},
|
||||
{ message = "(?i)readme", skip = true},
|
||||
{ message = "(?i)comment", skip = true},
|
||||
{ message = "(?i)spelling", skip = true},
|
||||
{ message = "^chore", group = "Miscellaneous Tasks"},
|
||||
{ body = ".*security", group = "Security"},
|
||||
{ message = ".*", group = "Other", default_scope = "other"},
|
||||
]
|
||||
# protect breaking changes from being skipped due to matching a skipping commit_parser
|
||||
protect_breaking_commits = false
|
||||
# filter out the commits that are not matched by commit parsers
|
||||
filter_commits = false
|
||||
# glob pattern for matching git tags
|
||||
tag_pattern = "v[0-9]*"
|
||||
# regex for skipping tags
|
||||
skip_tags = "v0.1.0-beta.1"
|
||||
# regex for ignoring tags
|
||||
ignore_tags = ""
|
||||
# sort the tags topologically
|
||||
topo_order = false
|
||||
# sort the commits inside sections by oldest/newest order
|
||||
sort_commits = "newest"
|
||||
# limit the number of commits included in the changelog.
|
||||
# limit_commits = 42
|
||||
@@ -3,16 +3,20 @@ name = "tantivy-columnar"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "MIT"
|
||||
homepage = "https://github.com/quickwit-oss/tantivy"
|
||||
repository = "https://github.com/quickwit-oss/tantivy"
|
||||
desciption = "column oriented storage for tantivy"
|
||||
categories = ["database-implementations", "data-structures", "compression"]
|
||||
|
||||
[dependencies]
|
||||
itertools = "0.10.5"
|
||||
fnv = "1.0.7"
|
||||
fastdivide = "0.4.0"
|
||||
|
||||
stacker = { path = "../stacker", package="tantivy-stacker"}
|
||||
sstable = { path = "../sstable", package = "tantivy-sstable" }
|
||||
common = { path = "../common", package = "tantivy-common" }
|
||||
tantivy-bitpacker = { version= "0.3", path = "../bitpacker/" }
|
||||
stacker = { version= "0.1", path = "../stacker", package="tantivy-stacker"}
|
||||
sstable = { version= "0.1", path = "../sstable", package = "tantivy-sstable" }
|
||||
common = { version= "0.5", path = "../common", package = "tantivy-common" }
|
||||
tantivy-bitpacker = { version= "0.4", path = "../bitpacker/" }
|
||||
serde = "1.0.152"
|
||||
|
||||
[dev-dependencies]
|
||||
|
||||
@@ -14,18 +14,14 @@ use time::{OffsetDateTime, PrimitiveDateTime, UtcOffset};
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum DateTimePrecision {
|
||||
/// Second precision.
|
||||
#[serde(alias = "seconds")]
|
||||
#[default]
|
||||
Second,
|
||||
#[serde(alias = "milliseconds")]
|
||||
Seconds,
|
||||
/// Millisecond precision.
|
||||
Millisecond,
|
||||
#[serde(alias = "microseconds")]
|
||||
Milliseconds,
|
||||
/// Microsecond precision.
|
||||
Microsecond,
|
||||
#[serde(alias = "nanoseconds")]
|
||||
Microseconds,
|
||||
/// Nanosecond precision.
|
||||
Nanosecond,
|
||||
Nanoseconds,
|
||||
}
|
||||
|
||||
#[deprecated(since = "0.20.0", note = "Use `DateTimePrecision` instead")]
|
||||
@@ -151,10 +147,10 @@ impl DateTime {
|
||||
/// Truncates the microseconds value to the corresponding precision.
|
||||
pub fn truncate(self, precision: DateTimePrecision) -> Self {
|
||||
let truncated_timestamp_micros = match precision {
|
||||
DateTimePrecision::Second => (self.timestamp_nanos / 1_000_000_000) * 1_000_000_000,
|
||||
DateTimePrecision::Millisecond => (self.timestamp_nanos / 1_000_000) * 1_000_000,
|
||||
DateTimePrecision::Microsecond => (self.timestamp_nanos / 1_000) * 1_000,
|
||||
DateTimePrecision::Nanosecond => self.timestamp_nanos,
|
||||
DateTimePrecision::Seconds => (self.timestamp_nanos / 1_000_000_000) * 1_000_000_000,
|
||||
DateTimePrecision::Milliseconds => (self.timestamp_nanos / 1_000_000) * 1_000_000,
|
||||
DateTimePrecision::Microseconds => (self.timestamp_nanos / 1_000) * 1_000,
|
||||
DateTimePrecision::Nanoseconds => self.timestamp_nanos,
|
||||
};
|
||||
Self {
|
||||
timestamp_nanos: truncated_timestamp_micros,
|
||||
@@ -163,7 +159,7 @@ impl DateTime {
|
||||
}
|
||||
|
||||
impl fmt::Debug for DateTime {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
let utc_rfc3339 = self.into_utc().format(&Rfc3339).map_err(|_| fmt::Error)?;
|
||||
f.write_str(&utc_rfc3339)
|
||||
}
|
||||
|
||||
@@ -13,7 +13,7 @@ fn main() -> tantivy::Result<()> {
|
||||
let opts = DateOptions::from(INDEXED)
|
||||
.set_stored()
|
||||
.set_fast()
|
||||
.set_precision(tantivy::DateTimePrecision::Second);
|
||||
.set_precision(tantivy::DateTimePrecision::Seconds);
|
||||
// Add `occurred_at` date field type
|
||||
let occurred_at = schema_builder.add_date_field("occurred_at", opts);
|
||||
let event_type = schema_builder.add_text_field("event", STRING | STORED);
|
||||
|
||||
79
examples/phrase_prefix_search.rs
Normal file
79
examples/phrase_prefix_search.rs
Normal file
@@ -0,0 +1,79 @@
|
||||
use tantivy::collector::TopDocs;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::*;
|
||||
use tantivy::{doc, Index, ReloadPolicy, Result};
|
||||
use tempfile::TempDir;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let index_path = TempDir::new()?;
|
||||
|
||||
let mut schema_builder = Schema::builder();
|
||||
schema_builder.add_text_field("title", TEXT | STORED);
|
||||
schema_builder.add_text_field("body", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let title = schema.get_field("title").unwrap();
|
||||
let body = schema.get_field("body").unwrap();
|
||||
|
||||
let index = Index::create_in_dir(&index_path, schema)?;
|
||||
|
||||
let mut index_writer = index.writer(50_000_000)?;
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
title => "The Old Man and the Sea",
|
||||
body => "He was an old man who fished alone in a skiff in the Gulf Stream and he had gone \
|
||||
eighty-four days now without taking a fish.",
|
||||
))?;
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
title => "Of Mice and Men",
|
||||
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
|
||||
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
|
||||
over the yellow sands in the sunlight before reaching the narrow pool. On one \
|
||||
side of the river the golden foothill slopes curve up to the strong and rocky \
|
||||
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
|
||||
fresh and green with every spring, carrying in their lower leaf junctures the \
|
||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
||||
limbs and branches that arch over the pool"
|
||||
))?;
|
||||
|
||||
// Multivalued field just need to be repeated.
|
||||
index_writer.add_document(doc!(
|
||||
title => "Frankenstein",
|
||||
title => "The Modern Prometheus",
|
||||
body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
|
||||
enterprise which you have regarded with such evil forebodings. I arrived here \
|
||||
yesterday, and my first task is to assure my dear sister of my welfare and \
|
||||
increasing confidence in the success of my undertaking."
|
||||
))?;
|
||||
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::OnCommit)
|
||||
.try_into()?;
|
||||
|
||||
let searcher = reader.searcher();
|
||||
|
||||
let query_parser = QueryParser::for_index(&index, vec![title, body]);
|
||||
// This will match documents containing the phrase "in the"
|
||||
// followed by some word starting with "su",
|
||||
// i.e. it will match "in the sunlight" and "in the success",
|
||||
// but not "in the Gulf Stream".
|
||||
let query = query_parser.parse_query("\"in the su\"*")?;
|
||||
|
||||
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
|
||||
let mut titles = top_docs
|
||||
.into_iter()
|
||||
.map(|(_score, doc_address)| {
|
||||
let doc = searcher.doc(doc_address)?;
|
||||
let title = doc.get_first(title).unwrap().as_text().unwrap().to_owned();
|
||||
Ok(title)
|
||||
})
|
||||
.collect::<Result<Vec<_>>>()?;
|
||||
titles.sort_unstable();
|
||||
assert_eq!(titles, ["Frankenstein", "Of Mice and Men"]);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -17,7 +17,8 @@ use tantivy::{doc, Index, ReloadPolicy};
|
||||
use tempfile::TempDir;
|
||||
|
||||
fn pre_tokenize_text(text: &str) -> Vec<Token> {
|
||||
let mut token_stream = SimpleTokenizer.token_stream(text);
|
||||
let mut tokenizer = SimpleTokenizer::default();
|
||||
let mut token_stream = tokenizer.token_stream(text);
|
||||
let mut tokens = vec![];
|
||||
while token_stream.advance() {
|
||||
tokens.push(token_stream.token().clone());
|
||||
|
||||
@@ -50,7 +50,7 @@ fn main() -> tantivy::Result<()> {
|
||||
|
||||
// This tokenizer lowers all of the text (to help with stop word matching)
|
||||
// then removes all instances of `the` and `and` from the corpus
|
||||
let tokenizer = TextAnalyzer::builder(SimpleTokenizer)
|
||||
let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
.filter(LowerCaser)
|
||||
.filter(StopWordFilter::remove(vec![
|
||||
"the".to_string(),
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tantivy-query-grammar"
|
||||
version = "0.19.0"
|
||||
version = "0.20.0"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = ["database-implementations", "data-structures"]
|
||||
|
||||
@@ -162,14 +162,22 @@ fn term_val<'a>() -> impl Parser<&'a str, Output = (Delimiter, String)> {
|
||||
}
|
||||
|
||||
fn term_query<'a>() -> impl Parser<&'a str, Output = UserInputLiteral> {
|
||||
(field_name(), term_val(), slop_val()).map(|(field_name, (delimiter, phrase), slop)| {
|
||||
UserInputLiteral {
|
||||
(field_name(), term_val(), slop_or_prefix_val()).map(
|
||||
|(field_name, (delimiter, phrase), (slop, prefix))| UserInputLiteral {
|
||||
field_name: Some(field_name),
|
||||
phrase,
|
||||
delimiter,
|
||||
slop,
|
||||
}
|
||||
})
|
||||
prefix,
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
fn slop_or_prefix_val<'a>() -> impl Parser<&'a str, Output = (u32, bool)> {
|
||||
let prefix_val = char('*').map(|_ast| (0, true));
|
||||
let slop_val = slop_val().map(|slop| (slop, false));
|
||||
|
||||
prefix_val.or(slop_val)
|
||||
}
|
||||
|
||||
fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> {
|
||||
@@ -186,11 +194,14 @@ fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> {
|
||||
|
||||
fn literal<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
|
||||
let term_default_field =
|
||||
(term_val(), slop_val()).map(|((delimiter, phrase), slop)| UserInputLiteral {
|
||||
field_name: None,
|
||||
phrase,
|
||||
delimiter,
|
||||
slop,
|
||||
(term_val(), slop_or_prefix_val()).map(|((delimiter, phrase), (slop, prefix))| {
|
||||
UserInputLiteral {
|
||||
field_name: None,
|
||||
phrase,
|
||||
delimiter,
|
||||
slop,
|
||||
prefix,
|
||||
}
|
||||
});
|
||||
|
||||
attempt(term_query())
|
||||
@@ -872,6 +883,16 @@ mod test {
|
||||
test_parse_query_to_ast_helper("\"a b\"~300^2", "(\"a b\"~300)^2");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_phrase_prefix() {
|
||||
test_parse_query_to_ast_helper("\"a b\"*", "\"a b\"*");
|
||||
test_parse_query_to_ast_helper("\"a\"*", "\"a\"*");
|
||||
test_parse_query_to_ast_helper("\"\"*", "\"\"*");
|
||||
test_parse_query_to_ast_helper("foo:\"a b\"*", "\"foo\":\"a b\"*");
|
||||
test_parse_query_to_ast_helper("foo:\"a\"*", "\"foo\":\"a\"*");
|
||||
test_parse_query_to_ast_helper("foo:\"\"*", "\"foo\":\"\"*");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_not_queries_are_consistent() {
|
||||
test_parse_query_to_ast_helper("tata -toto", "(*tata -toto)");
|
||||
|
||||
@@ -66,6 +66,7 @@ pub struct UserInputLiteral {
|
||||
pub phrase: String,
|
||||
pub delimiter: Delimiter,
|
||||
pub slop: u32,
|
||||
pub prefix: bool,
|
||||
}
|
||||
|
||||
impl fmt::Debug for UserInputLiteral {
|
||||
@@ -86,6 +87,8 @@ impl fmt::Debug for UserInputLiteral {
|
||||
}
|
||||
if self.slop > 0 {
|
||||
write!(formatter, "~{}", self.slop)?;
|
||||
} else if self.prefix {
|
||||
write!(formatter, "*")?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -60,6 +60,8 @@ impl AggregationLimits {
|
||||
/// *bucket_limit*
|
||||
/// Limits the maximum number of buckets returned from an aggregation request.
|
||||
/// bucket_limit will default to `DEFAULT_BUCKET_LIMIT` (65000)
|
||||
///
|
||||
/// Note: The returned instance contains a Arc shared counter to track memory consumption.
|
||||
pub fn new(memory_limit: Option<u64>, bucket_limit: Option<u32>) -> Self {
|
||||
Self {
|
||||
memory_consumption: Default::default(),
|
||||
|
||||
@@ -74,14 +74,14 @@ impl AggregationWithAccessor {
|
||||
ColumnType::I64,
|
||||
ColumnType::U64,
|
||||
ColumnType::F64,
|
||||
ColumnType::Bytes,
|
||||
ColumnType::Str,
|
||||
// ColumnType::Bytes Unsupported
|
||||
// ColumnType::Bool Unsupported
|
||||
// ColumnType::IpAddr Unsupported
|
||||
// ColumnType::DateTime Unsupported
|
||||
];
|
||||
let mut columns =
|
||||
get_all_ff_reader(reader, field_name, Some(&allowed_column_types))?;
|
||||
get_all_ff_reader_or_empty(reader, field_name, Some(&allowed_column_types))?;
|
||||
let first = columns.pop().unwrap();
|
||||
accessor2 = columns.pop();
|
||||
first
|
||||
@@ -177,7 +177,7 @@ fn get_ff_reader(
|
||||
/// Get all fast field reader or empty as default.
|
||||
///
|
||||
/// Is guaranteed to return at least one column.
|
||||
fn get_all_ff_reader(
|
||||
fn get_all_ff_reader_or_empty(
|
||||
reader: &SegmentReader,
|
||||
field_name: &str,
|
||||
allowed_column_types: Option<&[ColumnType]>,
|
||||
|
||||
@@ -604,6 +604,42 @@ mod tests {
|
||||
});
|
||||
assert_eq!(res, expected_res);
|
||||
}
|
||||
|
||||
{
|
||||
// 1day + hard_bounds as Rfc3339
|
||||
let elasticsearch_compatible_json = json!(
|
||||
{
|
||||
"sales_over_time": {
|
||||
"date_histogram": {
|
||||
"field": "date",
|
||||
"fixed_interval": "1d",
|
||||
"hard_bounds": {
|
||||
"min": "2015-01-02T00:00:00Z",
|
||||
"max": "2015-01-02T12:00:00Z"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
let agg_req: Aggregations = serde_json::from_str(
|
||||
&serde_json::to_string(&elasticsearch_compatible_json).unwrap(),
|
||||
)
|
||||
.unwrap();
|
||||
let res = exec_request(agg_req, &index).unwrap();
|
||||
let expected_res = json!({
|
||||
"sales_over_time" : {
|
||||
"buckets": [
|
||||
{
|
||||
"doc_count": 1,
|
||||
"key": 1420156800000.0,
|
||||
"key_as_string": "2015-01-02T00:00:00Z"
|
||||
}
|
||||
]
|
||||
}
|
||||
});
|
||||
assert_eq!(res, expected_res);
|
||||
}
|
||||
}
|
||||
#[test]
|
||||
fn histogram_test_invalid_req() {
|
||||
|
||||
@@ -177,11 +177,38 @@ impl HistogramAggregation {
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct HistogramBounds {
|
||||
/// The lower bounds.
|
||||
#[serde(deserialize_with = "deserialize_date_or_num")]
|
||||
pub min: f64,
|
||||
/// The upper bounds.
|
||||
#[serde(deserialize_with = "deserialize_date_or_num")]
|
||||
pub max: f64,
|
||||
}
|
||||
|
||||
fn deserialize_date_or_num<'de, D>(deserializer: D) -> Result<f64, D::Error>
|
||||
where D: serde::Deserializer<'de> {
|
||||
let value: serde_json::Value = Deserialize::deserialize(deserializer)?;
|
||||
|
||||
// Check if the value is a string representing an Rfc3339 formatted date
|
||||
if let serde_json::Value::String(date_str) = value {
|
||||
// Parse the Rfc3339 formatted date string into a DateTime<Utc>
|
||||
let date =
|
||||
time::OffsetDateTime::parse(&date_str, &time::format_description::well_known::Rfc3339)
|
||||
.map_err(|_| serde::de::Error::custom("Invalid Rfc3339 formatted date"))?;
|
||||
|
||||
let milliseconds: i64 = (date.unix_timestamp_nanos() / 1_000_000)
|
||||
.try_into()
|
||||
.map_err(|_| serde::de::Error::custom("{date_str} out of allowed range"))?;
|
||||
|
||||
// Return the milliseconds as f64
|
||||
Ok(milliseconds as f64)
|
||||
} else {
|
||||
// The value is not a string, so assume it's a regular f64 number
|
||||
value
|
||||
.as_f64()
|
||||
.ok_or_else(|| serde::de::Error::custom("Invalid number format"))
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for HistogramBounds {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_fmt(format_args!("[{},{}]", self.min, self.max))
|
||||
|
||||
@@ -428,6 +428,12 @@ impl SegmentTermCollector {
|
||||
field_type: ColumnType,
|
||||
accessor_idx: usize,
|
||||
) -> crate::Result<Self> {
|
||||
if field_type == ColumnType::Bytes || field_type == ColumnType::Bool {
|
||||
return Err(TantivyError::InvalidArgument(format!(
|
||||
"terms aggregation is not supported for column type {:?}",
|
||||
field_type
|
||||
)));
|
||||
}
|
||||
let term_buckets = TermBuckets::default();
|
||||
|
||||
if let Some(custom_order) = req.order.as_ref() {
|
||||
@@ -1500,4 +1506,41 @@ mod tests {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn terms_aggregation_bytes() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let bytes_field = schema_builder.add_bytes_field("bytes", FAST);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
|
||||
index_writer.set_merge_policy(Box::new(NoMergePolicy));
|
||||
index_writer.add_document(doc!(
|
||||
bytes_field => vec![1,2,3],
|
||||
))?;
|
||||
index_writer.commit()?;
|
||||
}
|
||||
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"my_texts": {
|
||||
"terms": {
|
||||
"field": "bytes"
|
||||
},
|
||||
}
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let res = exec_request_with_query(agg_req, &index, None)?;
|
||||
|
||||
// TODO: Returning an error would be better instead of an empty result, since this is not a
|
||||
// JSON field
|
||||
assert_eq!(
|
||||
res["my_texts"]["buckets"][0]["key"],
|
||||
serde_json::Value::Null
|
||||
);
|
||||
assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
|
||||
assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -161,6 +161,21 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
||||
/// ]);
|
||||
/// }
|
||||
///
|
||||
/// {
|
||||
/// let mut facet_collector = FacetCollector::for_field("facet");
|
||||
/// facet_collector.add_facet("/");
|
||||
/// let facet_counts = searcher.search(&AllQuery, &facet_collector)?;
|
||||
///
|
||||
/// // This lists all of the facet counts
|
||||
/// let facets: Vec<(&Facet, u64)> = facet_counts
|
||||
/// .get("/")
|
||||
/// .collect();
|
||||
/// assert_eq!(facets, vec![
|
||||
/// (&Facet::from("/category"), 4),
|
||||
/// (&Facet::from("/lang"), 4)
|
||||
/// ]);
|
||||
/// }
|
||||
///
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// # assert!(example().is_ok());
|
||||
@@ -285,6 +300,9 @@ fn is_child_facet(parent_facet: &[u8], possible_child_facet: &[u8]) -> bool {
|
||||
if !possible_child_facet.starts_with(parent_facet) {
|
||||
return false;
|
||||
}
|
||||
if parent_facet.is_empty() {
|
||||
return true;
|
||||
}
|
||||
possible_child_facet.get(parent_facet.len()).copied() == Some(0u8)
|
||||
}
|
||||
|
||||
@@ -789,6 +807,15 @@ mod tests {
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn is_child_facet() {
|
||||
assert!(super::is_child_facet(&b"foo"[..], &b"foo\0bar"[..]));
|
||||
assert!(super::is_child_facet(&b""[..], &b"foo\0bar"[..]));
|
||||
assert!(super::is_child_facet(&b""[..], &b"foo"[..]));
|
||||
assert!(!super::is_child_facet(&b"foo\0bar"[..], &b"foo"[..]));
|
||||
assert!(!super::is_child_facet(&b"foo"[..], &b"foobar\0baz"[..]));
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
|
||||
@@ -67,7 +67,7 @@ impl IndexingPositionsPerPath {
|
||||
pub(crate) fn index_json_values<'a>(
|
||||
doc: DocId,
|
||||
json_values: impl Iterator<Item = crate::Result<&'a serde_json::Map<String, serde_json::Value>>>,
|
||||
text_analyzer: &TextAnalyzer,
|
||||
text_analyzer: &mut TextAnalyzer,
|
||||
expand_dots_enabled: bool,
|
||||
term_buffer: &mut Term,
|
||||
postings_writer: &mut dyn PostingsWriter,
|
||||
@@ -93,7 +93,7 @@ pub(crate) fn index_json_values<'a>(
|
||||
fn index_json_object(
|
||||
doc: DocId,
|
||||
json_value: &serde_json::Map<String, serde_json::Value>,
|
||||
text_analyzer: &TextAnalyzer,
|
||||
text_analyzer: &mut TextAnalyzer,
|
||||
json_term_writer: &mut JsonTermWriter,
|
||||
postings_writer: &mut dyn PostingsWriter,
|
||||
ctx: &mut IndexingContext,
|
||||
@@ -117,7 +117,7 @@ fn index_json_object(
|
||||
fn index_json_value(
|
||||
doc: DocId,
|
||||
json_value: &serde_json::Value,
|
||||
text_analyzer: &TextAnalyzer,
|
||||
text_analyzer: &mut TextAnalyzer,
|
||||
json_term_writer: &mut JsonTermWriter,
|
||||
postings_writer: &mut dyn PostingsWriter,
|
||||
ctx: &mut IndexingContext,
|
||||
@@ -212,12 +212,12 @@ pub fn convert_to_fast_value_and_get_term(
|
||||
DateTime::from_utc(dt_utc),
|
||||
));
|
||||
}
|
||||
if let Ok(u64_val) = str::parse::<u64>(phrase) {
|
||||
return Some(set_fastvalue_and_get_term(json_term_writer, u64_val));
|
||||
}
|
||||
if let Ok(i64_val) = str::parse::<i64>(phrase) {
|
||||
return Some(set_fastvalue_and_get_term(json_term_writer, i64_val));
|
||||
}
|
||||
if let Ok(u64_val) = str::parse::<u64>(phrase) {
|
||||
return Some(set_fastvalue_and_get_term(json_term_writer, u64_val));
|
||||
}
|
||||
if let Ok(f64_val) = str::parse::<f64>(phrase) {
|
||||
return Some(set_fastvalue_and_get_term(json_term_writer, f64_val));
|
||||
}
|
||||
@@ -239,7 +239,7 @@ pub(crate) fn set_fastvalue_and_get_term<T: FastValue>(
|
||||
pub(crate) fn set_string_and_get_terms(
|
||||
json_term_writer: &mut JsonTermWriter,
|
||||
value: &str,
|
||||
text_analyzer: &TextAnalyzer,
|
||||
text_analyzer: &mut TextAnalyzer,
|
||||
) -> Vec<(usize, Term)> {
|
||||
let mut positions_and_terms = Vec::<(usize, Term)>::new();
|
||||
json_term_writer.close_path_and_set_type(Type::Str);
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::fs::{self, File, OpenOptions};
|
||||
use std::io::{self, BufWriter, Read, Seek, Write};
|
||||
use std::ops::Deref;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{Arc, RwLock, Weak};
|
||||
use std::{fmt, result};
|
||||
|
||||
use common::StableDeref;
|
||||
use fs4::FileExt;
|
||||
@@ -21,6 +21,7 @@ use crate::directory::{
|
||||
AntiCallToken, Directory, DirectoryLock, FileHandle, Lock, OwnedBytes, TerminatingWrite,
|
||||
WatchCallback, WatchHandle, WritePtr,
|
||||
};
|
||||
#[cfg(unix)]
|
||||
use crate::Advice;
|
||||
|
||||
pub type ArcBytes = Arc<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
|
||||
@@ -33,10 +34,7 @@ pub(crate) fn make_io_err(msg: String) -> io::Error {
|
||||
|
||||
/// Returns `None` iff the file exists, can be read, but is empty (and hence
|
||||
/// cannot be mmapped)
|
||||
fn open_mmap(
|
||||
full_path: &Path,
|
||||
madvice_opt: Option<Advice>,
|
||||
) -> result::Result<Option<Mmap>, OpenReadError> {
|
||||
fn open_mmap(full_path: &Path) -> Result<Option<Mmap>, OpenReadError> {
|
||||
let file = File::open(full_path).map_err(|io_err| {
|
||||
if io_err.kind() == io::ErrorKind::NotFound {
|
||||
OpenReadError::FileDoesNotExist(full_path.to_path_buf())
|
||||
@@ -59,9 +57,7 @@ fn open_mmap(
|
||||
.map(Some)
|
||||
.map_err(|io_err| OpenReadError::wrap_io_error(io_err, full_path.to_path_buf()))
|
||||
}?;
|
||||
if let (Some(mmap), Some(madvice)) = (&mmap_opt, madvice_opt) {
|
||||
let _ = mmap.advise(madvice);
|
||||
}
|
||||
|
||||
Ok(mmap_opt)
|
||||
}
|
||||
|
||||
@@ -83,18 +79,25 @@ pub struct CacheInfo {
|
||||
struct MmapCache {
|
||||
counters: CacheCounters,
|
||||
cache: HashMap<PathBuf, WeakArcBytes>,
|
||||
#[cfg(unix)]
|
||||
madvice_opt: Option<Advice>,
|
||||
}
|
||||
|
||||
impl MmapCache {
|
||||
fn new(madvice_opt: Option<Advice>) -> MmapCache {
|
||||
fn new() -> MmapCache {
|
||||
MmapCache {
|
||||
counters: CacheCounters::default(),
|
||||
cache: HashMap::default(),
|
||||
madvice_opt,
|
||||
#[cfg(unix)]
|
||||
madvice_opt: None,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(unix)]
|
||||
fn set_advice(&mut self, madvice: Advice) {
|
||||
self.madvice_opt = Some(madvice);
|
||||
}
|
||||
|
||||
fn get_info(&self) -> CacheInfo {
|
||||
let paths: Vec<PathBuf> = self.cache.keys().cloned().collect();
|
||||
CacheInfo {
|
||||
@@ -115,6 +118,16 @@ impl MmapCache {
|
||||
}
|
||||
}
|
||||
|
||||
fn open_mmap_impl(&self, full_path: &Path) -> Result<Option<Mmap>, OpenReadError> {
|
||||
let mmap_opt = open_mmap(full_path)?;
|
||||
#[cfg(unix)]
|
||||
if let (Some(mmap), Some(madvice)) = (mmap_opt.as_ref(), self.madvice_opt) {
|
||||
// We ignore madvise errors.
|
||||
let _ = mmap.advise(madvice);
|
||||
}
|
||||
Ok(mmap_opt)
|
||||
}
|
||||
|
||||
// Returns None if the file exists but as a len of 0 (and hence is not mmappable).
|
||||
fn get_mmap(&mut self, full_path: &Path) -> Result<Option<ArcBytes>, OpenReadError> {
|
||||
if let Some(mmap_weak) = self.cache.get(full_path) {
|
||||
@@ -125,7 +138,7 @@ impl MmapCache {
|
||||
}
|
||||
self.cache.remove(full_path);
|
||||
self.counters.miss += 1;
|
||||
let mmap_opt = open_mmap(full_path, self.madvice_opt)?;
|
||||
let mmap_opt = self.open_mmap_impl(full_path)?;
|
||||
Ok(mmap_opt.map(|mmap| {
|
||||
let mmap_arc: ArcBytes = Arc::new(mmap);
|
||||
let mmap_weak = Arc::downgrade(&mmap_arc);
|
||||
@@ -160,13 +173,9 @@ struct MmapDirectoryInner {
|
||||
}
|
||||
|
||||
impl MmapDirectoryInner {
|
||||
fn new(
|
||||
root_path: PathBuf,
|
||||
temp_directory: Option<TempDir>,
|
||||
madvice_opt: Option<Advice>,
|
||||
) -> MmapDirectoryInner {
|
||||
fn new(root_path: PathBuf, temp_directory: Option<TempDir>) -> MmapDirectoryInner {
|
||||
MmapDirectoryInner {
|
||||
mmap_cache: RwLock::new(MmapCache::new(madvice_opt)),
|
||||
mmap_cache: RwLock::new(MmapCache::new()),
|
||||
_temp_directory: temp_directory,
|
||||
watcher: FileWatcher::new(&root_path.join(*META_FILEPATH)),
|
||||
root_path,
|
||||
@@ -185,12 +194,8 @@ impl fmt::Debug for MmapDirectory {
|
||||
}
|
||||
|
||||
impl MmapDirectory {
|
||||
fn new(
|
||||
root_path: PathBuf,
|
||||
temp_directory: Option<TempDir>,
|
||||
madvice_opt: Option<Advice>,
|
||||
) -> MmapDirectory {
|
||||
let inner = MmapDirectoryInner::new(root_path, temp_directory, madvice_opt);
|
||||
fn new(root_path: PathBuf, temp_directory: Option<TempDir>) -> MmapDirectory {
|
||||
let inner = MmapDirectoryInner::new(root_path, temp_directory);
|
||||
MmapDirectory {
|
||||
inner: Arc::new(inner),
|
||||
}
|
||||
@@ -206,29 +211,33 @@ impl MmapDirectory {
|
||||
Ok(MmapDirectory::new(
|
||||
tempdir.path().to_path_buf(),
|
||||
Some(tempdir),
|
||||
None,
|
||||
))
|
||||
}
|
||||
|
||||
/// Opens a MmapDirectory in a directory, with a given access pattern.
|
||||
///
|
||||
/// This is only supported on unix platforms.
|
||||
#[cfg(unix)]
|
||||
pub fn open_with_madvice(
|
||||
directory_path: impl AsRef<Path>,
|
||||
madvice: Advice,
|
||||
) -> Result<MmapDirectory, OpenDirectoryError> {
|
||||
let dir = Self::open_impl_to_avoid_monomorphization(directory_path.as_ref())?;
|
||||
dir.inner.mmap_cache.write().unwrap().set_advice(madvice);
|
||||
Ok(dir)
|
||||
}
|
||||
|
||||
/// Opens a MmapDirectory in a directory.
|
||||
///
|
||||
/// Returns an error if the `directory_path` does not
|
||||
/// exist or if it is not a directory.
|
||||
pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<MmapDirectory, OpenDirectoryError> {
|
||||
Self::open_with_access_pattern_impl(directory_path.as_ref(), None)
|
||||
pub fn open(directory_path: impl AsRef<Path>) -> Result<MmapDirectory, OpenDirectoryError> {
|
||||
Self::open_impl_to_avoid_monomorphization(directory_path.as_ref())
|
||||
}
|
||||
|
||||
/// Opens a MmapDirectory in a directory, with a given access pattern.
|
||||
pub fn open_with_madvice<P: AsRef<Path>>(
|
||||
directory_path: P,
|
||||
madvice: Advice,
|
||||
) -> Result<MmapDirectory, OpenDirectoryError> {
|
||||
Self::open_with_access_pattern_impl(directory_path.as_ref(), Some(madvice))
|
||||
}
|
||||
|
||||
fn open_with_access_pattern_impl(
|
||||
#[inline(never)]
|
||||
fn open_impl_to_avoid_monomorphization(
|
||||
directory_path: &Path,
|
||||
madvice_opt: Option<Advice>,
|
||||
) -> Result<MmapDirectory, OpenDirectoryError> {
|
||||
if !directory_path.exists() {
|
||||
return Err(OpenDirectoryError::DoesNotExist(PathBuf::from(
|
||||
@@ -256,7 +265,7 @@ impl MmapDirectory {
|
||||
directory_path,
|
||||
)));
|
||||
}
|
||||
Ok(MmapDirectory::new(canonical_path, None, madvice_opt))
|
||||
Ok(MmapDirectory::new(canonical_path, None))
|
||||
}
|
||||
|
||||
/// Joins a relative_path to the directory `root_path`
|
||||
@@ -365,7 +374,7 @@ pub(crate) fn atomic_write(path: &Path, content: &[u8]) -> io::Result<()> {
|
||||
}
|
||||
|
||||
impl Directory for MmapDirectory {
|
||||
fn get_file_handle(&self, path: &Path) -> result::Result<Arc<dyn FileHandle>, OpenReadError> {
|
||||
fn get_file_handle(&self, path: &Path) -> Result<Arc<dyn FileHandle>, OpenReadError> {
|
||||
debug!("Open Read {:?}", path);
|
||||
let full_path = self.resolve_path(path);
|
||||
|
||||
@@ -388,7 +397,7 @@ impl Directory for MmapDirectory {
|
||||
|
||||
/// Any entry associated with the path in the mmap will be
|
||||
/// removed before the file is deleted.
|
||||
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
|
||||
fn delete(&self, path: &Path) -> Result<(), DeleteError> {
|
||||
let full_path = self.resolve_path(path);
|
||||
fs::remove_file(full_path).map_err(|e| {
|
||||
if e.kind() == io::ErrorKind::NotFound {
|
||||
|
||||
@@ -686,12 +686,12 @@ mod tests {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let date_field = schema_builder.add_date_field(
|
||||
"date",
|
||||
DateOptions::from(FAST).set_precision(DateTimePrecision::Nanosecond),
|
||||
DateOptions::from(FAST).set_precision(DateTimePrecision::Nanoseconds),
|
||||
);
|
||||
let multi_date_field = schema_builder.add_date_field(
|
||||
"multi_date",
|
||||
DateOptions::default()
|
||||
.set_precision(DateTimePrecision::Nanosecond)
|
||||
.set_precision(DateTimePrecision::Nanoseconds)
|
||||
.set_fast(),
|
||||
);
|
||||
let schema = schema_builder.build();
|
||||
@@ -862,9 +862,9 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
pub fn test_gcd_date() {
|
||||
let size_prec_sec = test_gcd_date_with_codec(DateTimePrecision::Second);
|
||||
let size_prec_sec = test_gcd_date_with_codec(DateTimePrecision::Seconds);
|
||||
assert!((1000 * 13 / 8..100 + 1000 * 13 / 8).contains(&size_prec_sec.get_bytes())); // 13 bits per val = ceil(log_2(number of seconds in 2hours);
|
||||
let size_prec_micros = test_gcd_date_with_codec(DateTimePrecision::Microsecond);
|
||||
let size_prec_micros = test_gcd_date_with_codec(DateTimePrecision::Microseconds);
|
||||
assert!((1000 * 33 / 8..100 + 1000 * 33 / 8).contains(&size_prec_micros.get_bytes()));
|
||||
// 33 bits per
|
||||
// val = ceil(log_2(number
|
||||
@@ -1208,7 +1208,7 @@ mod tests {
|
||||
let ff_tokenizer_manager = TokenizerManager::default();
|
||||
ff_tokenizer_manager.register(
|
||||
"custom_lowercase",
|
||||
TextAnalyzer::builder(RawTokenizer)
|
||||
TextAnalyzer::builder(RawTokenizer::default())
|
||||
.filter(LowerCaser)
|
||||
.build(),
|
||||
);
|
||||
|
||||
@@ -147,7 +147,7 @@ impl FastFieldsWriter {
|
||||
}
|
||||
Value::Str(text_val) => {
|
||||
if let Some(tokenizer) =
|
||||
&self.per_field_tokenizer[field_value.field().field_id() as usize]
|
||||
&mut self.per_field_tokenizer[field_value.field().field_id() as usize]
|
||||
{
|
||||
let mut token_stream = tokenizer.token_stream(text_val);
|
||||
token_stream.process(&mut |token: &Token| {
|
||||
@@ -202,7 +202,7 @@ impl FastFieldsWriter {
|
||||
self.json_path_buffer.push_str(field_name);
|
||||
|
||||
let text_analyzer =
|
||||
&self.per_field_tokenizer[field_value.field().field_id() as usize];
|
||||
&mut self.per_field_tokenizer[field_value.field().field_id() as usize];
|
||||
|
||||
record_json_obj_to_columnar_writer(
|
||||
doc_id,
|
||||
@@ -263,7 +263,7 @@ fn record_json_obj_to_columnar_writer(
|
||||
remaining_depth_limit: usize,
|
||||
json_path_buffer: &mut String,
|
||||
columnar_writer: &mut columnar::ColumnarWriter,
|
||||
tokenizer: &Option<TextAnalyzer>,
|
||||
tokenizer: &mut Option<TextAnalyzer>,
|
||||
) {
|
||||
for (key, child) in json_obj {
|
||||
let len_path = json_path_buffer.len();
|
||||
@@ -302,7 +302,7 @@ fn record_json_value_to_columnar_writer(
|
||||
mut remaining_depth_limit: usize,
|
||||
json_path_writer: &mut String,
|
||||
columnar_writer: &mut columnar::ColumnarWriter,
|
||||
tokenizer: &Option<TextAnalyzer>,
|
||||
tokenizer: &mut Option<TextAnalyzer>,
|
||||
) {
|
||||
if remaining_depth_limit == 0 {
|
||||
return;
|
||||
@@ -321,7 +321,7 @@ fn record_json_value_to_columnar_writer(
|
||||
}
|
||||
}
|
||||
serde_json::Value::String(text) => {
|
||||
if let Some(text_analyzer) = tokenizer {
|
||||
if let Some(text_analyzer) = tokenizer.as_mut() {
|
||||
let mut token_stream = text_analyzer.token_stream(text);
|
||||
token_stream.process(&mut |token| {
|
||||
columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text);
|
||||
@@ -379,7 +379,7 @@ mod tests {
|
||||
JSON_DEPTH_LIMIT,
|
||||
&mut json_path,
|
||||
&mut columnar_writer,
|
||||
&None,
|
||||
&mut None,
|
||||
);
|
||||
}
|
||||
let mut buffer = Vec::new();
|
||||
|
||||
@@ -185,10 +185,11 @@ impl SegmentWriter {
|
||||
|
||||
match field_entry.field_type() {
|
||||
FieldType::Facet(_) => {
|
||||
let mut facet_tokenizer = FacetTokenizer::default(); // this can be global
|
||||
for value in values {
|
||||
let facet = value.as_facet().ok_or_else(make_schema_error)?;
|
||||
let facet_str = facet.encoded_str();
|
||||
let mut facet_tokenizer = FacetTokenizer.token_stream(facet_str);
|
||||
let mut facet_tokenizer = facet_tokenizer.token_stream(facet_str);
|
||||
let mut indexing_position = IndexingPosition::default();
|
||||
postings_writer.index_text(
|
||||
doc_id,
|
||||
@@ -208,7 +209,7 @@ impl SegmentWriter {
|
||||
}
|
||||
Value::Str(ref text) => {
|
||||
let text_analyzer =
|
||||
&self.per_field_text_analyzers[field.field_id() as usize];
|
||||
&mut self.per_field_text_analyzers[field.field_id() as usize];
|
||||
text_analyzer.token_stream(text)
|
||||
}
|
||||
_ => {
|
||||
@@ -304,7 +305,8 @@ impl SegmentWriter {
|
||||
}
|
||||
}
|
||||
FieldType::JsonObject(json_options) => {
|
||||
let text_analyzer = &self.per_field_text_analyzers[field.field_id() as usize];
|
||||
let text_analyzer =
|
||||
&mut self.per_field_text_analyzers[field.field_id() as usize];
|
||||
let json_values_it =
|
||||
values.map(|value| value.as_json().ok_or_else(make_schema_error));
|
||||
index_json_values(
|
||||
@@ -457,7 +459,7 @@ mod tests {
|
||||
fn test_hashmap_size() {
|
||||
assert_eq!(compute_initial_table_size(100_000).unwrap(), 1 << 11);
|
||||
assert_eq!(compute_initial_table_size(1_000_000).unwrap(), 1 << 14);
|
||||
assert_eq!(compute_initial_table_size(10_000_000).unwrap(), 1 << 18);
|
||||
assert_eq!(compute_initial_table_size(15_000_000).unwrap(), 1 << 18);
|
||||
assert_eq!(compute_initial_table_size(1_000_000_000).unwrap(), 1 << 19);
|
||||
assert_eq!(compute_initial_table_size(4_000_000_000).unwrap(), 1 << 19);
|
||||
}
|
||||
|
||||
91
src/lib.rs
91
src/lib.rs
@@ -191,6 +191,7 @@ pub use crate::schema::{DateOptions, DateTimePrecision, Document, Term};
|
||||
/// Index format version.
|
||||
const INDEX_FORMAT_VERSION: u32 = 5;
|
||||
|
||||
#[cfg(unix)]
|
||||
pub use memmap2::Advice;
|
||||
|
||||
/// Structure version for the index.
|
||||
@@ -301,6 +302,7 @@ pub struct DocAddress {
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
use common::{BinarySerializable, FixedSize};
|
||||
use query_grammar::{UserInputAst, UserInputLeaf, UserInputLiteral};
|
||||
use rand::distributions::{Bernoulli, Uniform};
|
||||
use rand::rngs::StdRng;
|
||||
use rand::{Rng, SeedableRng};
|
||||
@@ -856,6 +858,95 @@ pub mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_searcher_on_json_field_with_type_inference() {
|
||||
// When indexing and searching a json value, we infer its type.
|
||||
// This tests aims to check the type infereence is consistent between indexing and search.
|
||||
// Inference order is date, i64, u64, f64, bool.
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_field = schema_builder.add_json_field("json", STORED | TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let json_val: serde_json::Map<String, serde_json::Value> = serde_json::from_str(
|
||||
r#"{
|
||||
"signed": 2,
|
||||
"float": 2.0,
|
||||
"unsigned": 10000000000000,
|
||||
"date": "1985-04-12T23:20:50.52Z",
|
||||
"bool": true
|
||||
}"#,
|
||||
)
|
||||
.unwrap();
|
||||
let doc = doc!(json_field=>json_val.clone());
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
let mut writer = index.writer_for_tests().unwrap();
|
||||
writer.add_document(doc).unwrap();
|
||||
writer.commit().unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let get_doc_ids = |user_input_literal: UserInputLiteral| {
|
||||
let query_parser = crate::query::QueryParser::for_index(&index, Vec::new());
|
||||
let query = query_parser
|
||||
.build_query_from_user_input_ast(UserInputAst::from(UserInputLeaf::Literal(
|
||||
user_input_literal,
|
||||
)))
|
||||
.unwrap();
|
||||
searcher
|
||||
.search(&query, &TEST_COLLECTOR_WITH_SCORE)
|
||||
.map(|topdocs| topdocs.docs().to_vec())
|
||||
.unwrap()
|
||||
};
|
||||
{
|
||||
let user_input_literal = UserInputLiteral {
|
||||
field_name: Some("json.signed".to_string()),
|
||||
phrase: "2".to_string(),
|
||||
delimiter: crate::query_grammar::Delimiter::None,
|
||||
slop: 0,
|
||||
prefix: false,
|
||||
};
|
||||
assert_eq!(get_doc_ids(user_input_literal), vec![DocAddress::new(0, 0)]);
|
||||
}
|
||||
{
|
||||
let user_input_literal = UserInputLiteral {
|
||||
field_name: Some("json.float".to_string()),
|
||||
phrase: "2.0".to_string(),
|
||||
delimiter: crate::query_grammar::Delimiter::None,
|
||||
slop: 0,
|
||||
prefix: false,
|
||||
};
|
||||
assert_eq!(get_doc_ids(user_input_literal), vec![DocAddress::new(0, 0)]);
|
||||
}
|
||||
{
|
||||
let user_input_literal = UserInputLiteral {
|
||||
field_name: Some("json.date".to_string()),
|
||||
phrase: "1985-04-12T23:20:50.52Z".to_string(),
|
||||
delimiter: crate::query_grammar::Delimiter::None,
|
||||
slop: 0,
|
||||
prefix: false,
|
||||
};
|
||||
assert_eq!(get_doc_ids(user_input_literal), vec![DocAddress::new(0, 0)]);
|
||||
}
|
||||
{
|
||||
let user_input_literal = UserInputLiteral {
|
||||
field_name: Some("json.unsigned".to_string()),
|
||||
phrase: "10000000000000".to_string(),
|
||||
delimiter: crate::query_grammar::Delimiter::None,
|
||||
slop: 0,
|
||||
prefix: false,
|
||||
};
|
||||
assert_eq!(get_doc_ids(user_input_literal), vec![DocAddress::new(0, 0)]);
|
||||
}
|
||||
{
|
||||
let user_input_literal = UserInputLiteral {
|
||||
field_name: Some("json.bool".to_string()),
|
||||
phrase: "true".to_string(),
|
||||
delimiter: crate::query_grammar::Delimiter::None,
|
||||
slop: 0,
|
||||
prefix: false,
|
||||
};
|
||||
assert_eq!(get_doc_ids(user_input_literal), vec![DocAddress::new(0, 0)]);
|
||||
}
|
||||
}
|
||||
|
||||
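The type inference exercised by the test above can also be reached through the plain query-parser syntax. The sketch below is illustrative only and is not part of the diff: the field name `attributes` is made up, and it assumes `serde_json` is available as a dependency.

```rust
use tantivy::collector::Count;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let attributes = schema_builder.add_json_field("attributes", STORED | TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer = index.writer(15_000_000)?;
    let json: serde_json::Map<String, serde_json::Value> =
        serde_json::from_str(r#"{"signed": 2, "bool": true}"#).unwrap();
    writer.add_document(doc!(attributes => json))?;
    writer.commit()?;

    let searcher = index.reader()?.searcher();
    let parser = QueryParser::for_index(&index, vec![attributes]);
    // "2" is inferred as an integer term here; it is also indexed as the text "2".
    let query = parser.parse_query("attributes.signed:2")?;
    assert_eq!(searcher.search(&query, &Count)?, 1);
    Ok(())
}
```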
#[test]
|
||||
fn test_doc_macro() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
@@ -162,7 +162,7 @@ pub mod tests {
|
||||
let index = Index::create_in_ram(schema);
|
||||
index
|
||||
.tokenizers()
|
||||
.register("simple_no_truncation", SimpleTokenizer);
|
||||
.register("simple_no_truncation", SimpleTokenizer::default());
|
||||
let reader = index.reader()?;
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
|
||||
@@ -194,7 +194,7 @@ pub mod tests {
|
||||
let index = Index::create_in_ram(schema);
|
||||
index
|
||||
.tokenizers()
|
||||
.register("simple_no_truncation", SimpleTokenizer);
|
||||
.register("simple_no_truncation", SimpleTokenizer::default());
|
||||
let reader = index.reader()?;
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
|
||||
|
||||
@@ -192,45 +192,49 @@ impl MoreLikeThis {
|
||||
})
|
||||
.collect::<Result<Vec<_>>>()?;
|
||||
for fake_str in facets {
|
||||
FacetTokenizer.token_stream(fake_str).process(&mut |token| {
|
||||
if self.is_noise_word(token.text.clone()) {
|
||||
let term = Term::from_field_text(field, &token.text);
|
||||
*term_frequencies.entry(term).or_insert(0) += 1;
|
||||
}
|
||||
});
|
||||
FacetTokenizer::default()
|
||||
.token_stream(fake_str)
|
||||
.process(&mut |token| {
|
||||
if self.is_noise_word(token.text.clone()) {
|
||||
let term = Term::from_field_text(field, &token.text);
|
||||
*term_frequencies.entry(term).or_insert(0) += 1;
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
FieldType::Str(text_options) => {
|
||||
let mut token_streams: Vec<BoxTokenStream> = vec![];
|
||||
|
||||
for value in values {
|
||||
match value {
|
||||
Value::PreTokStr(tok_str) => {
|
||||
token_streams.push(PreTokenizedStream::from(tok_str.clone()).into());
|
||||
let mut token_stream: BoxTokenStream =
|
||||
PreTokenizedStream::from(tok_str.clone()).into();
|
||||
token_stream.process(&mut |token| {
|
||||
if !self.is_noise_word(token.text.clone()) {
|
||||
let term = Term::from_field_text(field, &token.text);
|
||||
*term_frequencies.entry(term).or_insert(0) += 1;
|
||||
}
|
||||
});
|
||||
}
|
||||
Value::Str(ref text) => {
|
||||
if let Some(tokenizer) = text_options
|
||||
if let Some(mut tokenizer) = text_options
|
||||
.get_indexing_options()
|
||||
.map(|text_indexing_options| {
|
||||
text_indexing_options.tokenizer().to_string()
|
||||
})
|
||||
.and_then(|tokenizer_name| tokenizer_manager.get(&tokenizer_name))
|
||||
{
|
||||
token_streams.push(tokenizer.token_stream(text));
|
||||
let mut token_stream = tokenizer.token_stream(text);
|
||||
token_stream.process(&mut |token| {
|
||||
if !self.is_noise_word(token.text.clone()) {
|
||||
let term = Term::from_field_text(field, &token.text);
|
||||
*term_frequencies.entry(term).or_insert(0) += 1;
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
|
||||
for mut token_stream in token_streams {
|
||||
token_stream.process(&mut |token| {
|
||||
if !self.is_noise_word(token.text.clone()) {
|
||||
let term = Term::from_field_text(field, &token.text);
|
||||
*term_frequencies.entry(term).or_insert(0) += 1;
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
FieldType::U64(_) => {
|
||||
for value in values {
|
||||
|
||||
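The `more_like_this` plumbing shown above is internal; the public entry point is `MoreLikeThisQuery`. Below is a hedged, minimal sketch (not part of the diff) of how such a query is typically assembled; `DocAddress::new(0, 0)` is a placeholder for a document that is assumed to already exist in the index, and the thresholds are illustrative values rather than recommendations.

```rust
use tantivy::query::MoreLikeThisQuery;
use tantivy::DocAddress;

// Build a query returning documents similar to an already indexed document.
fn build_more_like_this_query(reference_doc: DocAddress) -> MoreLikeThisQuery {
    MoreLikeThisQuery::builder()
        .with_min_doc_frequency(1)
        .with_min_term_frequency(1)
        .with_min_word_length(2)
        .with_document(reference_doc)
}

fn main() {
    let query = build_more_like_this_query(DocAddress::new(0, 0));
    // `query` can now be passed to `Searcher::search` like any other `Query`.
    let _ = query;
}
```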
@@ -88,9 +88,6 @@ impl PhrasePrefixQuery {
/// a specialized type [`PhraseQueryWeight`] instead of a Boxed trait.
/// If the query was only one term long, this returns `None` whereas [`Query::weight`]
/// returns a boxed [`RangeWeight`]
///
/// Returns `None`, if phrase_terms is empty, which happens if the phrase prefix query was
/// built with a single term.
pub(crate) fn phrase_prefix_query_weight(
&self,
enable_scoring: EnableScoring<'_>,
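For readers who want to build this query programmatically rather than through the parser, here is a hedged, minimal sketch (not part of the diff) using the `new_with_offset` constructor that appears elsewhere in this change set, assuming it is public; the `title` field is an assumption for illustration.

```rust
use tantivy::query::PhrasePrefixQuery;
use tantivy::schema::{Schema, TEXT};
use tantivy::Term;

fn main() {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let _schema = schema_builder.build();

    // Matches documents whose title contains "big bad" followed by a term
    // starting with "wo" (e.g. "big bad wolf").
    let query = PhrasePrefixQuery::new_with_offset(vec![
        (0, Term::from_field_text(title, "big")),
        (1, Term::from_field_text(title, "bad")),
        (2, Term::from_field_text(title, "wo")),
    ]);
    let _ = query;
}
```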
|
||||
@@ -8,7 +8,11 @@ use crate::Score;
|
||||
#[derive(Clone)]
|
||||
pub enum LogicalLiteral {
|
||||
Term(Term),
|
||||
Phrase(Vec<(usize, Term)>, u32),
|
||||
Phrase {
|
||||
terms: Vec<(usize, Term)>,
|
||||
slop: u32,
|
||||
prefix: bool,
|
||||
},
|
||||
Range {
|
||||
field: String,
|
||||
value_type: Type,
|
||||
@@ -79,10 +83,16 @@ impl fmt::Debug for LogicalLiteral {
|
||||
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
|
||||
match *self {
|
||||
LogicalLiteral::Term(ref term) => write!(formatter, "{term:?}"),
|
||||
LogicalLiteral::Phrase(ref terms, slop) => {
|
||||
LogicalLiteral::Phrase {
|
||||
ref terms,
|
||||
slop,
|
||||
prefix,
|
||||
} => {
|
||||
write!(formatter, "\"{terms:?}\"")?;
|
||||
if slop > 0 {
|
||||
write!(formatter, "~{slop:?}")
|
||||
} else if prefix {
|
||||
write!(formatter, "*")
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -15,21 +15,12 @@ use crate::core::json_utils::{
|
||||
use crate::core::Index;
|
||||
use crate::query::range_query::{is_type_valid_for_fastfield_range_query, RangeQuery};
|
||||
use crate::query::{
|
||||
AllQuery,
|
||||
BooleanQuery,
|
||||
BoostQuery,
|
||||
EmptyQuery,
|
||||
FuzzyTermQuery,
|
||||
Occur,
|
||||
PhraseQuery,
|
||||
Query,
|
||||
// RangeQuery,
|
||||
TermQuery,
|
||||
TermSetQuery,
|
||||
AllQuery, BooleanQuery, BoostQuery, EmptyQuery, FuzzyTermQuery, Occur, PhrasePrefixQuery,
|
||||
PhraseQuery, Query, TermQuery, TermSetQuery,
|
||||
};
|
||||
use crate::schema::{
|
||||
Facet, FacetParseError, Field, FieldType, IndexRecordOption, IntoIpv6Addr, JsonObjectOptions,
|
||||
Schema, Term, Type,
|
||||
Schema, Term, TextFieldIndexing, Type,
|
||||
};
|
||||
use crate::time::format_description::well_known::Rfc3339;
|
||||
use crate::time::OffsetDateTime;
|
||||
@@ -79,6 +70,17 @@ pub enum QueryParserError {
/// have any positions indexed.
#[error("The field '{0}' does not have positions indexed")]
FieldDoesNotHavePositionsIndexed(String),
/// A phrase-prefix query requires at least two terms
#[error(
"The phrase '{phrase:?}' does not produce at least two terms using the tokenizer \
'{tokenizer:?}'"
)]
PhrasePrefixRequiresAtLeastTwoTerms {
/// The phrase which triggered the issue
phrase: String,
/// The tokenizer configured for the field
tokenizer: String,
},
/// The tokenizer for the given field is unknown
/// The two argument strings are the name of the field, the name of the tokenizer
#[error("The tokenizer '{tokenizer:?}' for the field '{field:?}' is unknown")]
@@ -194,6 +196,10 @@ fn trim_ast(logical_ast: LogicalAst) -> Option<LogicalAst> {
///
/// Phrase terms support the `~` slop operator which allows setting the phrase's matching
/// distance in words. `"big wolf"~1` will return documents containing the phrase `"big bad wolf"`.
///
/// Phrase terms also support the `*` prefix operator which switches the phrase's matching
/// to consider all documents which contain the last term as a prefix, e.g. `"big bad wo"*` will
/// match `"big bad wolf"`.
#[derive(Clone)]
pub struct QueryParser {
schema: Schema,
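A short, hedged usage sketch of the two operators documented above (not part of the diff); the `title` field, the in-RAM index, and the default tokenizer are assumptions for illustration.

```rust
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, TEXT};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let parser = QueryParser::for_index(&index, vec![title]);

    // Slop: also matches "big bad wolf", one extra word between the two terms.
    let _slop_query = parser.parse_query("\"big wolf\"~1")?;
    // Prefix: matches phrases starting with "big bad" whose next term starts with "wo".
    let _prefix_query = parser.parse_query("\"big bad wo\"*")?;
    Ok(())
}
```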
@@ -397,7 +403,7 @@ impl QueryParser {
|
||||
// This should have been seen earlier really.
|
||||
QueryParserError::FieldNotIndexed(field_entry.name().to_string())
|
||||
})?;
|
||||
let text_analyzer =
|
||||
let mut text_analyzer =
|
||||
self.tokenizer_manager
|
||||
.get(option.tokenizer())
|
||||
.ok_or_else(|| QueryParserError::UnknownTokenizer {
|
||||
@@ -446,6 +452,7 @@ impl QueryParser {
|
||||
json_path: &str,
|
||||
phrase: &str,
|
||||
slop: u32,
|
||||
prefix: bool,
|
||||
) -> Result<Vec<LogicalLiteral>, QueryParserError> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let field_type = field_entry.field_type();
|
||||
@@ -486,25 +493,25 @@ impl QueryParser {
|
||||
Ok(vec![LogicalLiteral::Term(dt_term)])
|
||||
}
|
||||
FieldType::Str(ref str_options) => {
|
||||
let option = str_options.get_indexing_options().ok_or_else(|| {
|
||||
let indexing_options = str_options.get_indexing_options().ok_or_else(|| {
|
||||
// This should have been seen earlier really.
|
||||
QueryParserError::FieldNotIndexed(field_name.to_string())
|
||||
})?;
|
||||
let text_analyzer =
|
||||
self.tokenizer_manager
|
||||
.get(option.tokenizer())
|
||||
.ok_or_else(|| QueryParserError::UnknownTokenizer {
|
||||
field: field_name.to_string(),
|
||||
tokenizer: option.tokenizer().to_string(),
|
||||
})?;
|
||||
let index_record_option = option.index_option();
|
||||
let mut text_analyzer = self
|
||||
.tokenizer_manager
|
||||
.get(indexing_options.tokenizer())
|
||||
.ok_or_else(|| QueryParserError::UnknownTokenizer {
|
||||
field: field_name.to_string(),
|
||||
tokenizer: indexing_options.tokenizer().to_string(),
|
||||
})?;
|
||||
Ok(generate_literals_for_str(
|
||||
field_name,
|
||||
field,
|
||||
phrase,
|
||||
slop,
|
||||
&text_analyzer,
|
||||
index_record_option,
|
||||
prefix,
|
||||
indexing_options,
|
||||
&mut text_analyzer,
|
||||
)?
|
||||
.into_iter()
|
||||
.collect())
|
||||
@@ -661,9 +668,13 @@ impl QueryParser {
|
||||
self.compute_path_triplets_for_literal(&literal)?;
|
||||
let mut asts: Vec<LogicalAst> = Vec::new();
|
||||
for (field, json_path, phrase) in term_phrases {
|
||||
for ast in
|
||||
self.compute_logical_ast_for_leaf(field, json_path, phrase, literal.slop)?
|
||||
{
|
||||
for ast in self.compute_logical_ast_for_leaf(
|
||||
field,
|
||||
json_path,
|
||||
phrase,
|
||||
literal.slop,
|
||||
literal.prefix,
|
||||
)? {
|
||||
// Apply some field specific boost defined at the query parser level.
|
||||
let boost = self.field_boost(field);
|
||||
asts.push(LogicalAst::Leaf(Box::new(ast)).boost(boost));
|
||||
@@ -753,9 +764,17 @@ fn convert_literal_to_query(
|
||||
Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs))
|
||||
}
|
||||
}
|
||||
LogicalLiteral::Phrase(term_with_offsets, slop) => Box::new(
|
||||
PhraseQuery::new_with_offset_and_slop(term_with_offsets, slop),
|
||||
),
|
||||
LogicalLiteral::Phrase {
|
||||
terms,
|
||||
slop,
|
||||
prefix,
|
||||
} => {
|
||||
if prefix {
|
||||
Box::new(PhrasePrefixQuery::new_with_offset(terms))
|
||||
} else {
|
||||
Box::new(PhraseQuery::new_with_offset_and_slop(terms, slop))
|
||||
}
|
||||
}
|
||||
LogicalLiteral::Range {
|
||||
field,
|
||||
value_type,
|
||||
@@ -774,8 +793,9 @@ fn generate_literals_for_str(
|
||||
field: Field,
|
||||
phrase: &str,
|
||||
slop: u32,
|
||||
text_analyzer: &TextAnalyzer,
|
||||
index_record_option: IndexRecordOption,
|
||||
prefix: bool,
|
||||
indexing_options: &TextFieldIndexing,
|
||||
text_analyzer: &mut TextAnalyzer,
|
||||
) -> Result<Option<LogicalLiteral>, QueryParserError> {
|
||||
let mut terms: Vec<(usize, Term)> = Vec::new();
|
||||
let mut token_stream = text_analyzer.token_stream(phrase);
|
||||
@@ -784,18 +804,28 @@ fn generate_literals_for_str(
|
||||
terms.push((token.position, term));
|
||||
});
|
||||
if terms.len() <= 1 {
|
||||
if prefix {
|
||||
return Err(QueryParserError::PhrasePrefixRequiresAtLeastTwoTerms {
|
||||
phrase: phrase.to_owned(),
|
||||
tokenizer: indexing_options.tokenizer().to_owned(),
|
||||
});
|
||||
}
|
||||
let term_literal_opt = terms
|
||||
.into_iter()
|
||||
.next()
|
||||
.map(|(_, term)| LogicalLiteral::Term(term));
|
||||
return Ok(term_literal_opt);
|
||||
}
|
||||
if !index_record_option.has_positions() {
|
||||
if !indexing_options.index_option().has_positions() {
|
||||
return Err(QueryParserError::FieldDoesNotHavePositionsIndexed(
|
||||
field_name.to_string(),
|
||||
));
|
||||
}
|
||||
Ok(Some(LogicalLiteral::Phrase(terms, slop)))
|
||||
Ok(Some(LogicalLiteral::Phrase {
|
||||
terms,
|
||||
slop,
|
||||
prefix,
|
||||
}))
|
||||
}
|
||||
|
||||
fn generate_literals_for_json_object(
|
||||
@@ -810,7 +840,7 @@ fn generate_literals_for_json_object(
|
||||
// This should have been seen earlier really.
|
||||
QueryParserError::FieldNotIndexed(field_name.to_string())
|
||||
})?;
|
||||
let text_analyzer = tokenizer_manager
|
||||
let mut text_analyzer = tokenizer_manager
|
||||
.get(text_options.tokenizer())
|
||||
.ok_or_else(|| QueryParserError::UnknownTokenizer {
|
||||
field: field_name.to_string(),
|
||||
@@ -828,7 +858,7 @@ fn generate_literals_for_json_object(
|
||||
if let Some(term) = convert_to_fast_value_and_get_term(&mut json_term_writer, phrase) {
|
||||
logical_literals.push(LogicalLiteral::Term(term));
|
||||
}
|
||||
let terms = set_string_and_get_terms(&mut json_term_writer, phrase, &text_analyzer);
|
||||
let terms = set_string_and_get_terms(&mut json_term_writer, phrase, &mut text_analyzer);
|
||||
drop(json_term_writer);
|
||||
if terms.len() <= 1 {
|
||||
for (_, term) in terms {
|
||||
@@ -841,7 +871,11 @@ fn generate_literals_for_json_object(
|
||||
field_name.to_string(),
|
||||
));
|
||||
}
|
||||
logical_literals.push(LogicalLiteral::Phrase(terms, 0));
|
||||
logical_literals.push(LogicalLiteral::Phrase {
|
||||
terms,
|
||||
slop: 0,
|
||||
prefix: false,
|
||||
});
|
||||
Ok(logical_literals)
|
||||
}
|
||||
|
||||
@@ -925,7 +959,7 @@ mod test {
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
tokenizer_manager.register(
|
||||
"en_with_stop_words",
|
||||
TextAnalyzer::builder(SimpleTokenizer)
|
||||
TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
.filter(LowerCaser)
|
||||
.filter(StopWordFilter::remove(vec!["the".to_string()]))
|
||||
.build(),
|
||||
@@ -1169,7 +1203,7 @@ mod test {
|
||||
fn test_json_field_possibly_a_number() {
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"json.titi:5",
|
||||
r#"(Term(field=14, type=Json, path=titi, type=U64, 5) Term(field=14, type=Json, path=titi, type=Str, "5"))"#,
|
||||
r#"(Term(field=14, type=Json, path=titi, type=I64, 5) Term(field=14, type=Json, path=titi, type=Str, "5"))"#,
|
||||
true,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
@@ -1177,6 +1211,11 @@ mod test {
|
||||
r#"(Term(field=14, type=Json, path=titi, type=I64, -5) Term(field=14, type=Json, path=titi, type=Str, "5"))"#, //< Yes this is a bit weird after going through the tokenizer we lose the "-".
|
||||
true,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"json.titi:10000000000000000000",
|
||||
r#"(Term(field=14, type=Json, path=titi, type=U64, 10000000000000000000) Term(field=14, type=Json, path=titi, type=Str, "10000000000000000000"))"#,
|
||||
true,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"json.titi:-5.2",
|
||||
r#"(Term(field=14, type=Json, path=titi, type=F64, -5.2) "[(0, Term(field=14, type=Json, path=titi, type=Str, "5")), (1, Term(field=14, type=Json, path=titi, type=Str, "2"))]")"#,
|
||||
@@ -1226,7 +1265,7 @@ mod test {
|
||||
fn test_json_default() {
|
||||
test_query_to_logical_ast_with_default_json(
|
||||
"titi:4",
|
||||
"(Term(field=14, type=Json, path=titi, type=U64, 4) Term(field=14, type=Json, \
|
||||
"(Term(field=14, type=Json, path=titi, type=I64, 4) Term(field=14, type=Json, \
|
||||
path=titi, type=Str, \"4\"))",
|
||||
false,
|
||||
);
|
||||
@@ -1248,7 +1287,7 @@ mod test {
|
||||
for conjunction in [false, true] {
|
||||
test_query_to_logical_ast_with_default_json(
|
||||
"json:4",
|
||||
r#"(Term(field=14, type=Json, path=, type=U64, 4) Term(field=14, type=Json, path=, type=Str, "4"))"#,
|
||||
r#"(Term(field=14, type=Json, path=, type=I64, 4) Term(field=14, type=Json, path=, type=Str, "4"))"#,
|
||||
conjunction,
|
||||
);
|
||||
}
|
||||
@@ -1429,7 +1468,7 @@ mod test {
|
||||
let index = Index::create_in_ram(schema);
|
||||
index
|
||||
.tokenizers()
|
||||
.register("customtokenizer", SimpleTokenizer);
|
||||
.register("customtokenizer", SimpleTokenizer::default());
|
||||
let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
assert_eq!(
|
||||
query_parser.parse_query("title:\"happy tax\"").unwrap_err(),
|
||||
@@ -1643,6 +1682,48 @@ mod test {
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_phrase_prefix() {
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"\"big bad wo\"*",
|
||||
r#"("[(0, Term(field=0, type=Str, "big")), (1, Term(field=0, type=Str, "bad")), (2, Term(field=0, type=Str, "wo"))]"* "[(0, Term(field=1, type=Str, "big")), (1, Term(field=1, type=Str, "bad")), (2, Term(field=1, type=Str, "wo"))]"*)"#,
|
||||
false,
|
||||
);
|
||||
|
||||
let query_parser = make_query_parser();
|
||||
let query = query_parser.parse_query("\"big bad wo\"*").unwrap();
|
||||
assert_eq!(
|
||||
format!("{query:?}"),
|
||||
"BooleanQuery { subqueries: [(Should, PhrasePrefixQuery { field: Field(0), \
|
||||
phrase_terms: [(0, Term(field=0, type=Str, \"big\")), (1, Term(field=0, type=Str, \
|
||||
\"bad\"))], prefix: (2, Term(field=0, type=Str, \"wo\")), max_expansions: 50 }), \
|
||||
(Should, PhrasePrefixQuery { field: Field(1), phrase_terms: [(0, Term(field=1, \
|
||||
type=Str, \"big\")), (1, Term(field=1, type=Str, \"bad\"))], prefix: (2, \
|
||||
Term(field=1, type=Str, \"wo\")), max_expansions: 50 })] }"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_phrase_prefix_too_short() {
|
||||
let err = parse_query_to_logical_ast("\"wo\"*", true).unwrap_err();
|
||||
assert_eq!(
|
||||
err,
|
||||
QueryParserError::PhrasePrefixRequiresAtLeastTwoTerms {
|
||||
phrase: "wo".to_owned(),
|
||||
tokenizer: "default".to_owned()
|
||||
}
|
||||
);
|
||||
|
||||
let err = parse_query_to_logical_ast("\"\"*", true).unwrap_err();
|
||||
assert_eq!(
|
||||
err,
|
||||
QueryParserError::PhrasePrefixRequiresAtLeastTwoTerms {
|
||||
phrase: "".to_owned(),
|
||||
tokenizer: "default".to_owned()
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_term_set_query() {
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
|
||||
@@ -8,7 +8,7 @@ use serde::{Deserialize, Serialize};
|
||||
use crate::schema::flags::{FastFlag, IndexedFlag, SchemaFlagList, StoredFlag};
|
||||
|
||||
/// The precision of the indexed date/time values in the inverted index.
|
||||
pub const DATE_TIME_PRECISION_INDEXED: DateTimePrecision = DateTimePrecision::Second;
|
||||
pub const DATE_TIME_PRECISION_INDEXED: DateTimePrecision = DateTimePrecision::Seconds;
|
||||
|
||||
/// Defines how a DateTime field should be handled by tantivy.
|
||||
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)]
|
||||
|
||||
@@ -973,7 +973,7 @@ mod tests {
|
||||
"fieldnorms": true,
|
||||
"fast": true,
|
||||
"stored": true,
|
||||
"precision": "second"
|
||||
"precision": "seconds"
|
||||
}
|
||||
},
|
||||
{
|
||||
|
||||
@@ -145,7 +145,7 @@ impl Snippet {
|
||||
/// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]`\
|
||||
/// has to be a valid string.
|
||||
fn search_fragments(
|
||||
tokenizer: &TextAnalyzer,
|
||||
tokenizer: &mut TextAnalyzer,
|
||||
text: &str,
|
||||
terms: &BTreeMap<String, Score>,
|
||||
max_num_chars: usize,
|
||||
@@ -370,8 +370,12 @@ impl SnippetGenerator {
|
||||
|
||||
/// Generates a snippet for the given text.
|
||||
pub fn snippet(&self, text: &str) -> Snippet {
|
||||
let fragment_candidates =
|
||||
search_fragments(&self.tokenizer, text, &self.terms_text, self.max_num_chars);
|
||||
let fragment_candidates = search_fragments(
|
||||
&mut self.tokenizer.clone(),
|
||||
text,
|
||||
&self.terms_text,
|
||||
self.max_num_chars,
|
||||
);
|
||||
select_best_fragment_combination(&fragment_candidates[..], text)
|
||||
}
|
||||
}
|
||||
@@ -408,7 +412,12 @@ Survey in 2016, 2017, and 2018."#;
|
||||
String::from("rust") => 1.0,
|
||||
String::from("language") => 0.9
|
||||
};
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 100);
|
||||
let fragments = search_fragments(
|
||||
&mut From::from(SimpleTokenizer::default()),
|
||||
TEST_TEXT,
|
||||
&terms,
|
||||
100,
|
||||
);
|
||||
assert_eq!(fragments.len(), 7);
|
||||
{
|
||||
let first = &fragments[0];
|
||||
@@ -435,7 +444,12 @@ Survey in 2016, 2017, and 2018."#;
|
||||
String::from("rust") =>1.0,
|
||||
String::from("language") => 0.9
|
||||
};
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 20);
|
||||
let fragments = search_fragments(
|
||||
&mut From::from(SimpleTokenizer::default()),
|
||||
TEST_TEXT,
|
||||
&terms,
|
||||
20,
|
||||
);
|
||||
{
|
||||
let first = &fragments[0];
|
||||
assert_eq!(first.score, 1.0);
|
||||
@@ -449,7 +463,12 @@ Survey in 2016, 2017, and 2018."#;
|
||||
String::from("rust") =>0.9,
|
||||
String::from("language") => 1.0
|
||||
};
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 20);
|
||||
let fragments = search_fragments(
|
||||
&mut From::from(SimpleTokenizer::default()),
|
||||
TEST_TEXT,
|
||||
&terms,
|
||||
20,
|
||||
);
|
||||
// assert_eq!(fragments.len(), 7);
|
||||
{
|
||||
let first = &fragments[0];
|
||||
@@ -468,7 +487,8 @@ Survey in 2016, 2017, and 2018."#;
|
||||
let mut terms = BTreeMap::new();
|
||||
terms.insert(String::from("c"), 1.0);
|
||||
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), text, &terms, 3);
|
||||
let fragments =
|
||||
search_fragments(&mut From::from(SimpleTokenizer::default()), text, &terms, 3);
|
||||
|
||||
assert_eq!(fragments.len(), 1);
|
||||
{
|
||||
@@ -490,7 +510,8 @@ Survey in 2016, 2017, and 2018."#;
|
||||
let mut terms = BTreeMap::new();
|
||||
terms.insert(String::from("f"), 1.0);
|
||||
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), text, &terms, 3);
|
||||
let fragments =
|
||||
search_fragments(&mut From::from(SimpleTokenizer::default()), text, &terms, 3);
|
||||
|
||||
assert_eq!(fragments.len(), 2);
|
||||
{
|
||||
@@ -513,7 +534,8 @@ Survey in 2016, 2017, and 2018."#;
|
||||
terms.insert(String::from("f"), 1.0);
|
||||
terms.insert(String::from("a"), 0.9);
|
||||
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), text, &terms, 7);
|
||||
let fragments =
|
||||
search_fragments(&mut From::from(SimpleTokenizer::default()), text, &terms, 7);
|
||||
|
||||
assert_eq!(fragments.len(), 2);
|
||||
{
|
||||
@@ -535,7 +557,8 @@ Survey in 2016, 2017, and 2018."#;
|
||||
let mut terms = BTreeMap::new();
|
||||
terms.insert(String::from("z"), 1.0);
|
||||
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), text, &terms, 3);
|
||||
let fragments =
|
||||
search_fragments(&mut From::from(SimpleTokenizer::default()), text, &terms, 3);
|
||||
|
||||
assert_eq!(fragments.len(), 0);
|
||||
|
||||
@@ -550,7 +573,8 @@ Survey in 2016, 2017, and 2018."#;
|
||||
let text = "a b c d";
|
||||
|
||||
let terms = BTreeMap::new();
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), text, &terms, 3);
|
||||
let fragments =
|
||||
search_fragments(&mut From::from(SimpleTokenizer::default()), text, &terms, 3);
|
||||
assert_eq!(fragments.len(), 0);
|
||||
|
||||
let snippet = select_best_fragment_combination(&fragments[..], text);
|
||||
@@ -669,7 +693,7 @@ Survey in 2016, 2017, and 2018."#;
|
||||
terms.insert(String::from("bc"), 1.0);
|
||||
|
||||
let fragments = search_fragments(
|
||||
&From::from(NgramTokenizer::all_ngrams(2, 2)),
|
||||
&mut From::from(NgramTokenizer::all_ngrams(2, 2)),
|
||||
text,
|
||||
&terms,
|
||||
3,
|
||||
@@ -691,7 +715,12 @@ Survey in 2016, 2017, and 2018."#;
|
||||
#[test]
|
||||
fn test_snippet_generator_custom_highlighted_elements() {
|
||||
let terms = btreemap! { String::from("rust") => 1.0, String::from("language") => 0.9 };
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 100);
|
||||
let fragments = search_fragments(
|
||||
&mut From::from(SimpleTokenizer::default()),
|
||||
TEST_TEXT,
|
||||
&terms,
|
||||
100,
|
||||
);
|
||||
let mut snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT);
|
||||
assert_eq!(
|
||||
snippet.to_html(),
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
//! ```rust
|
||||
//! use tantivy::tokenizer::*;
|
||||
//!
|
||||
//! let tokenizer = TextAnalyzer::builder(RawTokenizer)
|
||||
//! let mut tokenizer = TextAnalyzer::builder(RawTokenizer::default())
|
||||
//! .filter(AlphaNumOnlyFilter)
|
||||
//! .build();
|
||||
//!
|
||||
@@ -11,7 +11,7 @@
|
||||
//! // contains a space
|
||||
//! assert!(stream.next().is_none());
|
||||
//!
|
||||
//! let tokenizer = TextAnalyzer::builder(SimpleTokenizer)
|
||||
//! let mut tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
//! .filter(AlphaNumOnlyFilter)
|
||||
//! .build();
|
||||
//!
|
||||
@@ -52,7 +52,7 @@ pub struct AlphaNumOnlyFilterWrapper<T>(T);
|
||||
impl<T: Tokenizer> Tokenizer for AlphaNumOnlyFilterWrapper<T> {
|
||||
type TokenStream<'a> = AlphaNumOnlyFilterStream<T::TokenStream<'a>>;
|
||||
|
||||
fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
AlphaNumOnlyFilterStream {
|
||||
tail: self.0.token_stream(text),
|
||||
}
|
||||
@@ -96,7 +96,7 @@ mod tests {
|
||||
}
|
||||
|
||||
fn token_stream_helper(text: &str) -> Vec<Token> {
|
||||
let a = TextAnalyzer::builder(SimpleTokenizer)
|
||||
let mut a = TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
.filter(AlphaNumOnlyFilter)
|
||||
.build();
|
||||
let mut token_stream = a.token_stream(text);
|
||||
|
||||
@@ -12,38 +12,45 @@ impl TokenFilter for AsciiFoldingFilter {
|
||||
type Tokenizer<T: Tokenizer> = AsciiFoldingFilterWrapper<T>;
|
||||
|
||||
fn transform<T: Tokenizer>(self, tokenizer: T) -> AsciiFoldingFilterWrapper<T> {
|
||||
AsciiFoldingFilterWrapper(tokenizer)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct AsciiFoldingFilterWrapper<T>(T);
|
||||
|
||||
impl<T: Tokenizer> Tokenizer for AsciiFoldingFilterWrapper<T> {
|
||||
type TokenStream<'a> = AsciiFoldingFilterTokenStream<T::TokenStream<'a>>;
|
||||
|
||||
fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
AsciiFoldingFilterTokenStream {
|
||||
buffer: String::with_capacity(100),
|
||||
tail: self.0.token_stream(text),
|
||||
AsciiFoldingFilterWrapper {
|
||||
tokenizer,
|
||||
buffer: String::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct AsciiFoldingFilterTokenStream<T> {
|
||||
#[derive(Clone)]
|
||||
pub struct AsciiFoldingFilterWrapper<T> {
|
||||
tokenizer: T,
|
||||
buffer: String,
|
||||
}
|
||||
|
||||
impl<T: Tokenizer> Tokenizer for AsciiFoldingFilterWrapper<T> {
|
||||
type TokenStream<'a> = AsciiFoldingFilterTokenStream<'a, T::TokenStream<'a>>;
|
||||
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
self.buffer.clear();
|
||||
AsciiFoldingFilterTokenStream {
|
||||
buffer: &mut self.buffer,
|
||||
tail: self.tokenizer.token_stream(text),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct AsciiFoldingFilterTokenStream<'a, T> {
|
||||
buffer: &'a mut String,
|
||||
tail: T,
|
||||
}
|
||||
|
||||
impl<T: TokenStream> TokenStream for AsciiFoldingFilterTokenStream<T> {
|
||||
impl<'a, T: TokenStream> TokenStream for AsciiFoldingFilterTokenStream<'a, T> {
|
||||
fn advance(&mut self) -> bool {
|
||||
if !self.tail.advance() {
|
||||
return false;
|
||||
}
|
||||
if !self.token_mut().text.is_ascii() {
|
||||
// only fold when the text is not already ascii
|
||||
to_ascii(&self.tail.token().text, &mut self.buffer);
|
||||
mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
|
||||
to_ascii(&self.tail.token().text, self.buffer);
|
||||
mem::swap(&mut self.tail.token_mut().text, self.buffer);
|
||||
}
|
||||
true
|
||||
}
|
||||
@@ -1573,7 +1580,7 @@ mod tests {
|
||||
|
||||
fn folding_helper(text: &str) -> Vec<String> {
|
||||
let mut tokens = Vec::new();
|
||||
TextAnalyzer::builder(SimpleTokenizer)
|
||||
TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
.filter(AsciiFoldingFilter)
|
||||
.build()
|
||||
.token_stream(text)
|
||||
@@ -1584,10 +1591,10 @@ mod tests {
|
||||
}
|
||||
|
||||
fn folding_using_raw_tokenizer_helper(text: &str) -> String {
|
||||
let mut token_stream = TextAnalyzer::builder(RawTokenizer)
|
||||
let mut tokenizer = TextAnalyzer::builder(RawTokenizer::default())
|
||||
.filter(AsciiFoldingFilter)
|
||||
.build()
|
||||
.token_stream(text);
|
||||
.build();
|
||||
let mut token_stream = tokenizer.token_stream(text);
|
||||
token_stream.advance();
|
||||
token_stream.token().text.clone()
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@ pub(crate) struct EmptyTokenizer;
|
||||
|
||||
impl Tokenizer for EmptyTokenizer {
|
||||
type TokenStream<'a> = EmptyTokenStream;
|
||||
fn token_stream(&self, _text: &str) -> EmptyTokenStream {
|
||||
fn token_stream(&mut self, _text: &str) -> EmptyTokenStream {
|
||||
EmptyTokenStream::default()
|
||||
}
|
||||
}
|
||||
@@ -35,7 +35,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_empty_tokenizer() {
|
||||
let tokenizer = super::EmptyTokenizer;
|
||||
let mut tokenizer = super::EmptyTokenizer;
|
||||
let mut empty = tokenizer.token_stream("whatever string");
|
||||
assert!(!empty.advance());
|
||||
}
|
||||
|
||||
@@ -9,8 +9,10 @@ use crate::schema::FACET_SEP_BYTE;
|
||||
/// - `/america/north_america/canada`
|
||||
/// - `/america/north_america`
|
||||
/// - `/america`
|
||||
#[derive(Clone)]
|
||||
pub struct FacetTokenizer;
|
||||
#[derive(Clone, Default)]
|
||||
pub struct FacetTokenizer {
|
||||
token: Token,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
enum State {
|
||||
@@ -22,20 +24,18 @@ enum State {
|
||||
pub struct FacetTokenStream<'a> {
|
||||
text: &'a str,
|
||||
state: State,
|
||||
token: Token,
|
||||
token: &'a mut Token,
|
||||
}
|
||||
|
||||
impl Tokenizer for FacetTokenizer {
|
||||
type TokenStream<'a> = FacetTokenStream<'a>;
|
||||
fn token_stream<'a>(&self, text: &'a str) -> FacetTokenStream<'a> {
|
||||
let token = Token {
|
||||
position: 0,
|
||||
..Default::default()
|
||||
};
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> FacetTokenStream<'a> {
|
||||
self.token.reset();
|
||||
self.token.position = 0;
|
||||
FacetTokenStream {
|
||||
text,
|
||||
state: State::RootFacetNotEmitted, //< pos is the first char that has not been processed yet.
|
||||
token,
|
||||
token: &mut self.token,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -74,11 +74,11 @@ impl<'a> TokenStream for FacetTokenStream<'a> {
|
||||
}
|
||||
|
||||
fn token(&self) -> &Token {
|
||||
&self.token
|
||||
self.token
|
||||
}
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
&mut self.token
|
||||
self.token
|
||||
}
|
||||
}
|
||||
|
||||
@@ -98,7 +98,7 @@ mod tests {
|
||||
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap();
|
||||
tokens.push(format!("{}", facet));
|
||||
};
|
||||
FacetTokenizer
|
||||
FacetTokenizer::default()
|
||||
.token_stream(facet.encoded_str())
|
||||
.process(&mut add_token);
|
||||
}
|
||||
@@ -118,7 +118,7 @@ mod tests {
|
||||
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap(); // ok test
|
||||
tokens.push(format!("{}", facet));
|
||||
};
|
||||
FacetTokenizer
|
||||
FacetTokenizer::default()
|
||||
.token_stream(facet.encoded_str()) // ok test
|
||||
.process(&mut add_token);
|
||||
}
|
||||
|
||||
@@ -10,26 +10,33 @@ impl TokenFilter for LowerCaser {
|
||||
type Tokenizer<T: Tokenizer> = LowerCaserFilter<T>;
|
||||
|
||||
fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T> {
|
||||
LowerCaserFilter(tokenizer)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct LowerCaserFilter<T>(T);
|
||||
|
||||
impl<T: Tokenizer> Tokenizer for LowerCaserFilter<T> {
|
||||
type TokenStream<'a> = LowerCaserTokenStream<T::TokenStream<'a>>;
|
||||
|
||||
fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
LowerCaserTokenStream {
|
||||
tail: self.0.token_stream(text),
|
||||
LowerCaserFilter {
|
||||
tokenizer,
|
||||
buffer: String::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LowerCaserTokenStream<T> {
|
||||
#[derive(Clone)]
|
||||
pub struct LowerCaserFilter<T> {
|
||||
tokenizer: T,
|
||||
buffer: String,
|
||||
}
|
||||
|
||||
impl<T: Tokenizer> Tokenizer for LowerCaserFilter<T> {
|
||||
type TokenStream<'a> = LowerCaserTokenStream<'a, T::TokenStream<'a>>;
|
||||
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
self.buffer.clear();
|
||||
LowerCaserTokenStream {
|
||||
tail: self.tokenizer.token_stream(text),
|
||||
buffer: &mut self.buffer,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LowerCaserTokenStream<'a, T> {
|
||||
buffer: &'a mut String,
|
||||
tail: T,
|
||||
}
|
||||
|
||||
@@ -44,7 +51,7 @@ fn to_lowercase_unicode(text: &str, output: &mut String) {
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: TokenStream> TokenStream for LowerCaserTokenStream<T> {
|
||||
impl<'a, T: TokenStream> TokenStream for LowerCaserTokenStream<'a, T> {
|
||||
fn advance(&mut self) -> bool {
|
||||
if !self.tail.advance() {
|
||||
return false;
|
||||
@@ -53,8 +60,8 @@ impl<T: TokenStream> TokenStream for LowerCaserTokenStream<T> {
|
||||
// fast track for ascii.
|
||||
self.token_mut().text.make_ascii_lowercase();
|
||||
} else {
|
||||
to_lowercase_unicode(&self.tail.token().text, &mut self.buffer);
|
||||
mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
|
||||
to_lowercase_unicode(&self.tail.token().text, self.buffer);
|
||||
mem::swap(&mut self.tail.token_mut().text, self.buffer);
|
||||
}
|
||||
true
|
||||
}
|
||||
@@ -86,10 +93,11 @@ mod tests {
|
||||
}
|
||||
|
||||
fn token_stream_helper(text: &str) -> Vec<Token> {
|
||||
let mut token_stream = TextAnalyzer::builder(SimpleTokenizer)
|
||||
let mut token_stream = TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
.filter(LowerCaser)
|
||||
.build()
|
||||
.token_stream(text);
|
||||
.build();
|
||||
|
||||
let mut token_stream = token_stream.token_stream(text);
|
||||
let mut tokens = vec![];
|
||||
let mut add_token = |token: &Token| {
|
||||
tokens.push(token.clone());
|
||||
|
||||
@@ -66,7 +66,7 @@
|
||||
//! ```rust
|
||||
//! use tantivy::tokenizer::*;
|
||||
//!
|
||||
//! let en_stem = TextAnalyzer::builder(SimpleTokenizer)
|
||||
//! let en_stem = TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
//! .filter(RemoveLongFilter::limit(40))
|
||||
//! .filter(LowerCaser)
|
||||
//! .filter(Stemmer::new(Language::English))
|
||||
@@ -81,7 +81,7 @@
|
||||
//! # use tantivy::tokenizer::*;
|
||||
//! # use tantivy::Index;
|
||||
//! #
|
||||
//! let custom_en_tokenizer = SimpleTokenizer;
|
||||
//! let custom_en_tokenizer = SimpleTokenizer::default();
|
||||
//! # let schema = Schema::builder().build();
|
||||
//! let index = Index::create_in_ram(schema);
|
||||
//! index.tokenizers()
|
||||
@@ -113,7 +113,7 @@
|
||||
//! let index = Index::create_in_ram(schema);
|
||||
//!
|
||||
//! // We need to register our tokenizer :
|
||||
//! let custom_en_tokenizer = TextAnalyzer::builder(SimpleTokenizer)
|
||||
//! let custom_en_tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
//! .filter(RemoveLongFilter::limit(40))
|
||||
//! .filter(LowerCaser)
|
||||
//! .build();
|
||||
@@ -188,9 +188,9 @@ pub mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_raw_tokenizer() {
|
||||
fn test_raw_tokenizer2() {
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
let en_tokenizer = tokenizer_manager.get("raw").unwrap();
|
||||
let mut en_tokenizer = tokenizer_manager.get("raw").unwrap();
|
||||
let mut tokens: Vec<Token> = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| {
|
||||
@@ -208,7 +208,7 @@ pub mod tests {
|
||||
fn test_en_tokenizer() {
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
assert!(tokenizer_manager.get("en_doesnotexist").is_none());
|
||||
let en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
|
||||
let mut en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
|
||||
let mut tokens: Vec<Token> = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| {
|
||||
@@ -231,13 +231,13 @@ pub mod tests {
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
tokenizer_manager.register(
|
||||
"el_stem",
|
||||
TextAnalyzer::builder(SimpleTokenizer)
|
||||
TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
.filter(RemoveLongFilter::limit(40))
|
||||
.filter(LowerCaser)
|
||||
.filter(Stemmer::new(Language::Greek))
|
||||
.build(),
|
||||
);
|
||||
let en_tokenizer = tokenizer_manager.get("el_stem").unwrap();
|
||||
let mut en_tokenizer = tokenizer_manager.get("el_stem").unwrap();
|
||||
let mut tokens: Vec<Token> = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| {
|
||||
@@ -257,7 +257,7 @@ pub mod tests {
|
||||
#[test]
|
||||
fn test_tokenizer_empty() {
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
let en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
|
||||
let mut en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
|
||||
{
|
||||
let mut tokens: Vec<Token> = vec![];
|
||||
{
|
||||
@@ -283,7 +283,7 @@ pub mod tests {
|
||||
#[test]
|
||||
fn test_whitespace_tokenizer() {
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
let ws_tokenizer = tokenizer_manager.get("whitespace").unwrap();
|
||||
let mut ws_tokenizer = tokenizer_manager.get("whitespace").unwrap();
|
||||
let mut tokens: Vec<Token> = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| {
|
||||
|
||||
@@ -33,7 +33,7 @@ use super::{Token, TokenStream, Tokenizer};
|
||||
/// ```rust
|
||||
/// use tantivy::tokenizer::*;
|
||||
///
|
||||
/// let tokenizer = NgramTokenizer::new(2, 3, false);
|
||||
/// let mut tokenizer = NgramTokenizer::new(2, 3, false);
|
||||
/// let mut stream = tokenizer.token_stream("hello");
|
||||
/// {
|
||||
/// let token = stream.next().unwrap();
|
||||
@@ -87,6 +87,7 @@ pub struct NgramTokenizer {
|
||||
max_gram: usize,
|
||||
/// if true, will only parse the leading edge of the input
|
||||
prefix_only: bool,
|
||||
token: Token,
|
||||
}
|
||||
|
||||
impl NgramTokenizer {
|
||||
@@ -101,6 +102,7 @@ impl NgramTokenizer {
|
||||
min_gram,
|
||||
max_gram,
|
||||
prefix_only,
|
||||
token: Token::default(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -127,12 +129,13 @@ pub struct NgramTokenStream<'a> {
|
||||
/// input
|
||||
text: &'a str,
|
||||
/// output
|
||||
token: Token,
|
||||
token: &'a mut Token,
|
||||
}
|
||||
|
||||
impl Tokenizer for NgramTokenizer {
|
||||
type TokenStream<'a> = NgramTokenStream<'a>;
|
||||
fn token_stream<'a>(&self, text: &'a str) -> NgramTokenStream<'a> {
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> NgramTokenStream<'a> {
|
||||
self.token.reset();
|
||||
NgramTokenStream {
|
||||
ngram_charidx_iterator: StutteringIterator::new(
|
||||
CodepointFrontiers::for_str(text),
|
||||
@@ -141,7 +144,7 @@ impl Tokenizer for NgramTokenizer {
|
||||
),
|
||||
prefix_only: self.prefix_only,
|
||||
text,
|
||||
token: Token::default(),
|
||||
token: &mut self.token,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -164,10 +167,10 @@ impl<'a> TokenStream for NgramTokenStream<'a> {
|
||||
}
|
||||
|
||||
fn token(&self) -> &Token {
|
||||
&self.token
|
||||
self.token
|
||||
}
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
&mut self.token
|
||||
self.token
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,32 +1,34 @@
|
||||
use super::{Token, TokenStream, Tokenizer};
|
||||
|
||||
/// For each value of the field, emit a single unprocessed token.
|
||||
#[derive(Clone)]
|
||||
pub struct RawTokenizer;
|
||||
|
||||
pub struct RawTokenStream {
|
||||
#[derive(Clone, Default)]
|
||||
pub struct RawTokenizer {
|
||||
token: Token,
|
||||
}
|
||||
|
||||
pub struct RawTokenStream<'a> {
|
||||
token: &'a mut Token,
|
||||
has_token: bool,
|
||||
}
|
||||
|
||||
impl Tokenizer for RawTokenizer {
|
||||
type TokenStream<'a> = RawTokenStream;
|
||||
fn token_stream(&self, text: &str) -> RawTokenStream {
|
||||
let token = Token {
|
||||
offset_from: 0,
|
||||
offset_to: text.len(),
|
||||
position: 0,
|
||||
text: text.to_string(),
|
||||
position_length: 1,
|
||||
};
|
||||
type TokenStream<'a> = RawTokenStream<'a>;
|
||||
fn token_stream<'a>(&'a mut self, text: &str) -> RawTokenStream<'a> {
|
||||
self.token.reset();
|
||||
self.token.position = 0;
|
||||
self.token.position_length = 1;
|
||||
self.token.offset_from = 0;
|
||||
self.token.offset_to = text.len();
|
||||
self.token.text.clear();
|
||||
self.token.text.push_str(text);
|
||||
RawTokenStream {
|
||||
token,
|
||||
token: &mut self.token,
|
||||
has_token: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TokenStream for RawTokenStream {
|
||||
impl<'a> TokenStream for RawTokenStream<'a> {
|
||||
fn advance(&mut self) -> bool {
|
||||
let result = self.has_token;
|
||||
self.has_token = false;
|
||||
@@ -34,11 +36,11 @@ impl TokenStream for RawTokenStream {
|
||||
}
|
||||
|
||||
fn token(&self) -> &Token {
|
||||
&self.token
|
||||
self.token
|
||||
}
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
&mut self.token
|
||||
self.token
|
||||
}
|
||||
}
|
||||
|
||||
@@ -55,7 +57,7 @@ mod tests {
|
||||
}
|
||||
|
||||
fn token_stream_helper(text: &str) -> Vec<Token> {
|
||||
let a = TextAnalyzer::from(RawTokenizer);
|
||||
let mut a = TextAnalyzer::from(RawTokenizer::default());
|
||||
let mut token_stream = a.token_stream(text);
|
||||
let mut tokens: Vec<Token> = vec![];
|
||||
let mut add_token = |token: &Token| {
|
||||
|
||||
@@ -22,7 +22,7 @@ use crate::TantivyError;
|
||||
/// ```rust
|
||||
/// use tantivy::tokenizer::*;
|
||||
///
|
||||
/// let tokenizer = RegexTokenizer::new(r"'(?:\w*)'").unwrap();
|
||||
/// let mut tokenizer = RegexTokenizer::new(r"'(?:\w*)'").unwrap();
|
||||
/// let mut stream = tokenizer.token_stream("'aaa' bbb 'ccc' 'ddd'");
|
||||
/// {
|
||||
/// let token = stream.next().unwrap();
|
||||
@@ -48,6 +48,7 @@ use crate::TantivyError;
|
||||
#[derive(Clone)]
|
||||
pub struct RegexTokenizer {
|
||||
regex: Regex,
|
||||
token: Token,
|
||||
}
|
||||
|
||||
impl RegexTokenizer {
|
||||
@@ -55,17 +56,21 @@ impl RegexTokenizer {
|
||||
pub fn new(regex_pattern: &str) -> crate::Result<RegexTokenizer> {
|
||||
Regex::new(regex_pattern)
|
||||
.map_err(|_| TantivyError::InvalidArgument(regex_pattern.to_owned()))
|
||||
.map(|regex| Self { regex })
|
||||
.map(|regex| Self {
|
||||
regex,
|
||||
token: Token::default(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl Tokenizer for RegexTokenizer {
|
||||
type TokenStream<'a> = RegexTokenStream<'a>;
|
||||
fn token_stream<'a>(&self, text: &'a str) -> RegexTokenStream<'a> {
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> RegexTokenStream<'a> {
|
||||
self.token.reset();
|
||||
RegexTokenStream {
|
||||
regex: self.regex.clone(),
|
||||
text,
|
||||
token: Token::default(),
|
||||
token: &mut self.token,
|
||||
cursor: 0,
|
||||
}
|
||||
}
|
||||
@@ -74,7 +79,7 @@ impl Tokenizer for RegexTokenizer {
|
||||
pub struct RegexTokenStream<'a> {
|
||||
regex: Regex,
|
||||
text: &'a str,
|
||||
token: Token,
|
||||
token: &'a mut Token,
|
||||
cursor: usize,
|
||||
}
|
||||
|
||||
@@ -100,11 +105,11 @@ impl<'a> TokenStream for RegexTokenStream<'a> {
|
||||
}
|
||||
|
||||
fn token(&self) -> &Token {
|
||||
&self.token
|
||||
self.token
|
||||
}
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
&mut self.token
|
||||
self.token
|
||||
}
|
||||
}
|
||||
|
||||
@@ -147,7 +152,7 @@ mod tests {
|
||||
|
||||
fn token_stream_helper(text: &str, pattern: &str) -> Vec<Token> {
|
||||
let r = RegexTokenizer::new(pattern).unwrap();
|
||||
let a = TextAnalyzer::from(r);
|
||||
let mut a = TextAnalyzer::from(r);
|
||||
let mut token_stream = a.token_stream(text);
|
||||
let mut tokens: Vec<Token> = vec![];
|
||||
let mut add_token = |token: &Token| {
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
//! ```rust
|
||||
//! use tantivy::tokenizer::*;
|
||||
//!
|
||||
//! let tokenizer = TextAnalyzer::builder(SimpleTokenizer)
|
||||
//! let mut tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
//! .filter(RemoveLongFilter::limit(5))
|
||||
//! .build();
|
||||
//!
|
||||
@@ -57,7 +57,7 @@ pub struct RemoveLongFilterWrapper<T: Tokenizer> {
|
||||
impl<T: Tokenizer> Tokenizer for RemoveLongFilterWrapper<T> {
|
||||
type TokenStream<'a> = RemoveLongFilterStream<T::TokenStream<'a>>;
|
||||
|
||||
fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
RemoveLongFilterStream {
|
||||
token_length_limit: self.length_limit,
|
||||
tail: self.inner.token_stream(text),
|
||||
@@ -103,7 +103,7 @@ mod tests {
|
||||
}
|
||||
|
||||
fn token_stream_helper(text: &str) -> Vec<Token> {
|
||||
let a = TextAnalyzer::builder(SimpleTokenizer)
|
||||
let mut a = TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
.filter(RemoveLongFilter::limit(6))
|
||||
.build();
|
||||
let mut token_stream = a.token_stream(text);
|
||||
|
||||
@@ -3,23 +3,26 @@ use std::str::CharIndices;
|
||||
use super::{Token, TokenStream, Tokenizer};
|
||||
|
||||
/// Tokenize the text by splitting on whitespaces and punctuation.
|
||||
#[derive(Clone)]
|
||||
pub struct SimpleTokenizer;
|
||||
#[derive(Clone, Default)]
|
||||
pub struct SimpleTokenizer {
|
||||
token: Token,
|
||||
}
|
||||
|
||||
/// TokenStream produced by the `SimpleTokenizer`.
|
||||
pub struct SimpleTokenStream<'a> {
|
||||
text: &'a str,
|
||||
chars: CharIndices<'a>,
|
||||
token: Token,
|
||||
token: &'a mut Token,
|
||||
}
|
||||
|
||||
impl Tokenizer for SimpleTokenizer {
|
||||
type TokenStream<'a> = SimpleTokenStream<'a>;
|
||||
fn token_stream<'a>(&self, text: &'a str) -> SimpleTokenStream<'a> {
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> SimpleTokenStream<'a> {
|
||||
self.token.reset();
|
||||
SimpleTokenStream {
|
||||
text,
|
||||
chars: text.char_indices(),
|
||||
token: Token::default(),
|
||||
token: &mut self.token,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -52,11 +55,11 @@ impl<'a> TokenStream for SimpleTokenStream<'a> {
|
||||
}
|
||||
|
||||
fn token(&self) -> &Token {
|
||||
&self.token
|
||||
self.token
|
||||
}
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
&mut self.token
|
||||
self.token
|
||||
}
|
||||
}
|
||||
|
||||
@@ -76,7 +79,7 @@ mod tests {
|
||||
}
|
||||
|
||||
fn token_stream_helper(text: &str) -> Vec<Token> {
|
||||
let a = TextAnalyzer::from(SimpleTokenizer);
|
||||
let mut a = TextAnalyzer::from(SimpleTokenizer::default());
|
||||
let mut token_stream = a.token_stream(text);
|
||||
let mut tokens: Vec<Token> = vec![];
|
||||
let mut add_token = |token: &Token| {
|
||||
|
||||
@@ -20,8 +20,8 @@ use super::{Token, TokenFilter, TokenStream, Tokenizer};
|
||||
/// ```rust
|
||||
/// use tantivy::tokenizer::{SimpleTokenizer, SplitCompoundWords, TextAnalyzer};
|
||||
///
|
||||
/// let tokenizer =
|
||||
/// TextAnalyzer::builder(SimpleTokenizer)
|
||||
/// let mut tokenizer =
|
||||
/// TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
/// .filter(
|
||||
/// SplitCompoundWords::from_dictionary([
|
||||
/// "dampf", "schiff", "fahrt", "brot", "backen", "automat",
|
||||
@@ -29,13 +29,13 @@ use super::{Token, TokenFilter, TokenStream, Tokenizer};
|
||||
/// .unwrap()
|
||||
/// )
|
||||
/// .build();
|
||||
///
|
||||
/// let mut stream = tokenizer.token_stream("dampfschifffahrt");
|
||||
/// assert_eq!(stream.next().unwrap().text, "dampf");
|
||||
/// assert_eq!(stream.next().unwrap().text, "schiff");
|
||||
/// assert_eq!(stream.next().unwrap().text, "fahrt");
|
||||
/// assert_eq!(stream.next(), None);
|
||||
///
|
||||
/// {
|
||||
/// let mut stream = tokenizer.token_stream("dampfschifffahrt");
|
||||
/// assert_eq!(stream.next().unwrap().text, "dampf");
|
||||
/// assert_eq!(stream.next().unwrap().text, "schiff");
|
||||
/// assert_eq!(stream.next().unwrap().text, "fahrt");
|
||||
/// assert_eq!(stream.next(), None);
|
||||
/// }
|
||||
/// let mut stream = tokenizer.token_stream("brotbackautomat");
|
||||
/// assert_eq!(stream.next().unwrap().text, "brotbackautomat");
|
||||
/// assert_eq!(stream.next(), None);
|
||||
@@ -99,7 +99,7 @@ pub struct SplitCompoundWordsFilter<T> {
|
||||
impl<T: Tokenizer> Tokenizer for SplitCompoundWordsFilter<T> {
|
||||
type TokenStream<'a> = SplitCompoundWordsTokenStream<T::TokenStream<'a>>;
|
||||
|
||||
fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
SplitCompoundWordsTokenStream {
|
||||
dict: self.dict.clone(),
|
||||
tail: self.inner.token_stream(text),
|
||||
@@ -188,7 +188,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn splitting_compound_words_works() {
|
||||
let tokenizer = TextAnalyzer::builder(SimpleTokenizer)
|
||||
let mut tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
.filter(SplitCompoundWords::from_dictionary(["foo", "bar"]).unwrap())
|
||||
.build();
|
||||
|
||||
|
||||
@@ -100,7 +100,7 @@ pub struct StemmerFilter<T> {
|
||||
impl<T: Tokenizer> Tokenizer for StemmerFilter<T> {
|
||||
type TokenStream<'a> = StemmerTokenStream<T::TokenStream<'a>>;
|
||||
|
||||
fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
let stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
|
||||
StemmerTokenStream {
|
||||
tail: self.inner.token_stream(text),
|
||||
|
||||
@@ -6,6 +6,7 @@ LANGUAGES = [
|
||||
"finnish",
|
||||
"french",
|
||||
"german",
|
||||
"hungarian",
|
||||
"italian",
|
||||
"norwegian",
|
||||
"portuguese",
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
//! ```rust
|
||||
//! use tantivy::tokenizer::*;
|
||||
//!
|
||||
//! let tokenizer = TextAnalyzer::builder(SimpleTokenizer)
|
||||
//! let mut tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
//! .filter(StopWordFilter::remove(vec!["the".to_string(), "is".to_string()]))
|
||||
//! .build();
|
||||
//!
|
||||
@@ -50,6 +50,7 @@ impl StopWordFilter {
|
||||
Language::Finnish => stopwords::FINNISH,
|
||||
Language::French => stopwords::FRENCH,
|
||||
Language::German => stopwords::GERMAN,
|
||||
Language::Hungarian => stopwords::HUNGARIAN,
|
||||
Language::Italian => stopwords::ITALIAN,
|
||||
Language::Norwegian => stopwords::NORWEGIAN,
|
||||
Language::Portuguese => stopwords::PORTUGUESE,
|
||||
@@ -90,7 +91,7 @@ pub struct StopWordFilterWrapper<T> {
|
||||
impl<T: Tokenizer> Tokenizer for StopWordFilterWrapper<T> {
|
||||
type TokenStream<'a> = StopWordFilterStream<T::TokenStream<'a>>;
|
||||
|
||||
fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
StopWordFilterStream {
|
||||
words: self.words.clone(),
|
||||
tail: self.inner.token_stream(text),
|
||||
@@ -151,7 +152,7 @@ mod tests {
|
||||
"am".to_string(),
|
||||
"i".to_string(),
|
||||
];
|
||||
let a = TextAnalyzer::builder(SimpleTokenizer)
|
||||
let mut a = TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
.filter(StopWordFilter::remove(stops))
|
||||
.build();
|
||||
let mut token_stream = a.token_stream(text);
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
These stop word lists are from the Snowball project (https://snowballstem.org/)
|
||||
which carries the following license:
|
||||
which carries the following copyright and license:
|
||||
|
||||
Copyright (c) 2001, Dr Martin Porter
|
||||
Copyright (c) 2004,2005, Richard Boulton
|
||||
@@ -862,6 +862,208 @@ pub const GERMAN: &[&str] = &[
    "zwischen",
];

pub const HUNGARIAN: &[&str] = &[
    "a", "ahogy", "ahol", "aki", "akik", "akkor", "alatt", "által", "általában", "amely",
    "amelyek", "amelyekben", "amelyeket", "amelyet", "amelynek", "ami", "amit", "amolyan",
    "amíg", "amikor", "át", "abban", "ahhoz", "annak", "arra", "arról", "az", "azok", "azon",
    "azt", "azzal", "azért", "aztán", "azután", "azonban", "bár", "be", "belül", "benne",
    "cikk", "cikkek", "cikkeket", "csak", "de", "e", "eddig", "egész", "egy", "egyes",
    "egyetlen", "egyéb", "egyik", "egyre", "ekkor", "el", "elég", "ellen", "elő", "először",
    "előtt", "első", "én", "éppen", "ebben", "ehhez", "emilyen", "ennek", "erre", "ez", "ezt",
    "ezek", "ezen", "ezzel", "ezért", "és", "fel", "felé", "hanem", "hiszen", "hogy", "hogyan",
    "igen", "így", "illetve", "ill.", "ill", "ilyen", "ilyenkor", "ison", "ismét", "itt",
    "jó", "jól", "jobban", "kell", "kellett", "keresztül", "keressünk", "ki", "kívül",
    "között", "közül", "legalább", "lehet", "lehetett", "legyen", "lenne", "lenni", "lesz",
    "lett", "maga", "magát", "majd", "majd", "már", "más", "másik", "meg", "még", "mellett",
    "mert", "mely", "melyek", "mi", "mit", "míg", "miért", "milyen", "mikor", "minden",
    "mindent", "mindenki", "mindig", "mint", "mintha", "mivel", "most", "nagy", "nagyobb",
    "nagyon", "ne", "néha", "nekem", "neki", "nem", "néhány", "nélkül", "nincs", "olyan",
    "ott", "össze", "ő", "ők", "őket", "pedig", "persze", "rá", "s", "saját", "sem", "semmi",
    "sok", "sokat", "sokkal", "számára", "szemben", "szerint", "szinte", "talán", "tehát",
    "teljes", "tovább", "továbbá", "több", "úgy", "ugyanis", "új", "újabb", "újra", "után",
    "utána", "utolsó", "vagy", "vagyis", "valaki", "valami", "valamint", "való", "vagyok",
    "van", "vannak", "volt", "voltam", "voltak", "voltunk", "vissza", "vele", "viszont",
    "volna",
];

pub const ITALIAN: &[&str] = &[
    "ad",
    "al",
@@ -12,13 +12,13 @@ pub struct TextAnalyzer {
/// A boxable `Tokenizer`, with its `TokenStream` type erased.
trait BoxableTokenizer: 'static + Send + Sync {
    /// Creates a boxed token stream for a given `str`.
    fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
    fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a>;
    /// Clone this tokenizer.
    fn box_clone(&self) -> Box<dyn BoxableTokenizer>;
}

impl<T: Tokenizer> BoxableTokenizer for T {
    fn box_token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
    fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
        self.token_stream(text).into()
    }
    fn box_clone(&self) -> Box<dyn BoxableTokenizer> {

@@ -53,7 +53,7 @@ impl TextAnalyzer {
    }

    /// Creates a token stream for a given `str`.
    pub fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
    pub fn token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
        self.tokenizer.box_token_stream(text)
    }
}

@@ -71,7 +71,7 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
    /// ```rust
    /// use tantivy::tokenizer::*;
    ///
    /// let en_stem = TextAnalyzer::builder(SimpleTokenizer)
    /// let en_stem = TextAnalyzer::builder(SimpleTokenizer::default())
    ///     .filter(RemoveLongFilter::limit(40))
    ///     .filter(LowerCaser)
    ///     .filter(Stemmer::default())
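The three hunks above are all one change: `token_stream` now takes `&'a mut self`, so a `TextAnalyzer` has to be mutable at the call site and the tokenizer can reuse its internal `Token` buffer between calls. A minimal sketch of what calling code looks like under the new signature; the `advance()`/`token()` loop is assumed from tantivy's `TokenStream` trait rather than shown in this diff:

```rust
// Sketch only: calling code under the new `&mut self` signature.
use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer};

fn collect_terms(text: &str) -> Vec<String> {
    // The analyzer must be mutable now: `token_stream` borrows it mutably so
    // the tokenizer can reuse its internal `Token` buffer between calls.
    let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(RemoveLongFilter::limit(40))
        .filter(LowerCaser)
        .build();
    let mut stream = analyzer.token_stream(text);
    let mut terms = Vec::new();
    while stream.advance() {
        terms.push(stream.token().text.clone());
    }
    terms
}
```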
@@ -58,23 +58,23 @@ impl Default for TokenizerManager {
    /// the default pre-configured tokenizers of `tantivy`.
    fn default() -> TokenizerManager {
        let manager = TokenizerManager::new();
        manager.register("raw", RawTokenizer);
        manager.register("raw", RawTokenizer::default());
        manager.register(
            "default",
            TextAnalyzer::builder(SimpleTokenizer)
            TextAnalyzer::builder(SimpleTokenizer::default())
                .filter(RemoveLongFilter::limit(40))
                .filter(LowerCaser)
                .build(),
        );
        manager.register(
            "en_stem",
            TextAnalyzer::builder(SimpleTokenizer)
            TextAnalyzer::builder(SimpleTokenizer::default())
                .filter(RemoveLongFilter::limit(40))
                .filter(LowerCaser)
                .filter(Stemmer::new(Language::English))
                .build(),
        );
        manager.register("whitespace", WhitespaceTokenizer);
        manager.register("whitespace", WhitespaceTokenizer::default());
        manager
    }
}
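For reference, a short sketch of registering a custom analyzer alongside the defaults above. `TokenizerManager::get` returning a clone of the registered `TextAnalyzer` is an assumption here; only `register` appears in this diff:

```rust
use tantivy::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer, TokenizerManager};

fn main() {
    let manager = TokenizerManager::default();
    // Tokenizers are now registered as values built via `::default()`.
    manager.register(
        "lowercase_only",
        TextAnalyzer::builder(SimpleTokenizer::default())
            .filter(LowerCaser)
            .build(),
    );
    // Assumed API: `get` hands back a clone of the registered analyzer, which
    // must be mutable before tokenizing because `token_stream` takes `&mut self`.
    let mut analyzer = manager.get("lowercase_only").expect("registered above");
    let mut stream = analyzer.token_stream("Hello, Tantivy!");
    while stream.advance() {
        println!("{}", stream.token().text);
    }
}
```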
@@ -3,22 +3,25 @@ use std::str::CharIndices;
use super::{Token, TokenStream, Tokenizer};

/// Tokenize the text by splitting on whitespaces.
#[derive(Clone)]
pub struct WhitespaceTokenizer;
#[derive(Clone, Default)]
pub struct WhitespaceTokenizer {
    token: Token,
}

pub struct WhitespaceTokenStream<'a> {
    text: &'a str,
    chars: CharIndices<'a>,
    token: Token,
    token: &'a mut Token,
}

impl Tokenizer for WhitespaceTokenizer {
    type TokenStream<'a> = WhitespaceTokenStream<'a>;
    fn token_stream<'a>(&self, text: &'a str) -> WhitespaceTokenStream<'a> {
    fn token_stream<'a>(&'a mut self, text: &'a str) -> WhitespaceTokenStream<'a> {
        self.token.reset();
        WhitespaceTokenStream {
            text,
            chars: text.char_indices(),
            token: Token::default(),
            token: &mut self.token,
        }
    }
}

@@ -51,11 +54,11 @@ impl<'a> TokenStream for WhitespaceTokenStream<'a> {
    }

    fn token(&self) -> &Token {
        &self.token
        self.token
    }

    fn token_mut(&mut self) -> &mut Token {
        &mut self.token
        self.token
    }
}

@@ -75,7 +78,7 @@ mod tests {
    }

    fn token_stream_helper(text: &str) -> Vec<Token> {
        let a = TextAnalyzer::from(WhitespaceTokenizer);
        let mut a = TextAnalyzer::from(WhitespaceTokenizer::default());
        let mut token_stream = a.token_stream(text);
        let mut tokens: Vec<Token> = vec![];
        let mut add_token = |token: &Token| {
@@ -3,9 +3,14 @@ name = "tantivy-sstable"
version = "0.1.0"
edition = "2021"
license = "MIT"
homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"
keywords = ["search", "information", "retrieval", "sstable"]
categories = ["database-implementations", "data-structures", "compression"]
description = "sstables for tantivy"

[dependencies]
common = {path="../common", package="tantivy-common"}
common = {version= "0.5", path="../common", package="tantivy-common"}
tantivy-fst = "0.4"
# experimental gives us access to Decompressor::upper_bound
zstd = { version = "0.12", features = ["experimental"] }
@@ -3,10 +3,14 @@ name = "tantivy-stacker"
version = "0.1.0"
edition = "2021"
license = "MIT"
homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"
description = "term hashmap used for indexing"

[dependencies]
murmurhash32 = "0.3"
common = { version = "0.5", path = "../common/", package = "tantivy-common" }
ahash = { version = "0.8.3", default-features = false, optional = true }

[[bench]]
harness = false
@@ -20,8 +24,10 @@ path = "example/hashmap.rs"
[dev-dependencies]
rand = "0.8.5"
zipf = "7.0.0"
criterion = "0.5.0"
criterion = { git = "https://github.com/PSeitz/criterion.rs/", rev = "e6f98ee"} # This fork includes stack randomization to reduce caching effects
rustc-hash = "1.1.0"
proptest = "1.2.0"

[features]
compare_hash_only = ["ahash"] # Compare hash only, not the key in the Hashmap
unstable = [] # useful for benches.

stacker/Performance.md (new file, 14 lines)
@@ -0,0 +1,14 @@
# Notes

- `extend_from_slice(&key)` calls memcpy, which is relatively slow since most keys are short. For now there is a specialized version to avoid the memcpy calls.
  A wild copy of 16 bytes in a loop would be faster still, but it would require a guard against overflow on the caller side. (We can probably do that.)
- Comparing two slices of unknown length calls memcmp. Same as above, we can use a specialized version.

  fastcmp and fastcpy both employ the same trick to handle slices of odd length: e.g. two unconditional operations on 4 bytes instead of three conditional operations (one 4-byte, one 2-byte, one 1-byte). See the sketch after these notes.

  [1, 2, 3, 4, 5, 6, 7]
  [1, 2, 3, 4]
           [4, 5, 6, 7]

- Since the hashmap writes the values on every key insert/update, values like the expull should be small. Therefore inlining of the values has been removed.
- Currently the first call to the expull gets a capacity of 0. It would be beneficial if it could be initialized with some memory, so that the first call doesn't have to allocate. But that would mean we lose the `Default` impls.
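An illustration of the trick described in these notes, not the crate's code (the real implementations follow in stacker/src/fastcmp.rs and stacker/src/fastcpy.rs): a 4..=8 byte equality check done as two unconditional u32 comparisons over overlapping head and tail windows.

```rust
fn eq_4_to_8(left: &[u8], right: &[u8]) -> bool {
    if left.len() != right.len() || left.len() < 4 || left.len() > 8 {
        return false;
    }
    let head = |s: &[u8]| u32::from_le_bytes(s[..4].try_into().unwrap());
    let tail = |s: &[u8]| u32::from_le_bytes(s[s.len() - 4..].try_into().unwrap());
    // When len < 8 the two windows overlap; the overlapping bytes are simply
    // compared twice, which is still cheaper than a per-byte loop.
    head(left) == head(right) && tail(left) == tail(right)
}

fn main() {
    assert!(eq_4_to_8(&[1, 2, 3, 4, 5, 6, 7], &[1, 2, 3, 4, 5, 6, 7]));
    assert!(!eq_4_to_8(&[1, 2, 3, 4, 5, 6, 7], &[1, 2, 3, 4, 5, 6, 0]));
}
```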
@@ -15,11 +15,19 @@ fn bench_hashmap_throughput(c: &mut Criterion) {
    group.plot_config(plot_config);

    let input_bytes = ALICE.len() as u64;

    let alice_terms_as_bytes: Vec<&[u8]> = ALICE
        .split_ascii_whitespace()
        .map(|el| el.as_bytes())
        .collect();

    let alice_terms_as_bytes_with_docid: Vec<(u32, &[u8])> = ALICE
        .split_ascii_whitespace()
        .map(|el| el.as_bytes())
        .enumerate()
        .map(|(docid, el)| (docid as u32, el))
        .collect();

    group.throughput(Throughput::Bytes(input_bytes));

    group.bench_with_input(
@@ -29,8 +37,8 @@ fn bench_hashmap_throughput(c: &mut Criterion) {
    );
    group.bench_with_input(
        BenchmarkId::new("alice_expull".to_string(), input_bytes),
        &alice_terms_as_bytes,
        |b, i| b.iter(|| create_hash_map_with_expull(i.iter())),
        &alice_terms_as_bytes_with_docid,
        |b, i| b.iter(|| create_hash_map_with_expull(i.iter().cloned())),
    );

    group.bench_with_input(
@@ -48,11 +56,24 @@ fn bench_hashmap_throughput(c: &mut Criterion) {
    // numbers
    let input_bytes = 1_000_000 * 8 as u64;
    group.throughput(Throughput::Bytes(input_bytes));
    let numbers: Vec<[u8; 8]> = (0..1_000_000u64).map(|el| el.to_le_bytes()).collect();

    group.bench_with_input(
        BenchmarkId::new("numbers".to_string(), input_bytes),
        &(0..1_000_000u64),
        |b, i| b.iter(|| create_hash_map(i.clone().map(|el| el.to_le_bytes()))),
        &numbers,
        |b, i| b.iter(|| create_hash_map(i.iter().cloned())),
    );

    let numbers_with_doc: Vec<_> = numbers
        .iter()
        .enumerate()
        .map(|(docid, el)| (docid as u32, el))
        .collect();

    group.bench_with_input(
        BenchmarkId::new("ids_expull".to_string(), input_bytes),
        &numbers_with_doc,
        |b, i| b.iter(|| create_hash_map_with_expull(i.iter().cloned())),
    );

    // numbers zipf
@@ -63,11 +84,14 @@ fn bench_hashmap_throughput(c: &mut Criterion) {

    let input_bytes = 1_000_000 * 8 as u64;
    group.throughput(Throughput::Bytes(input_bytes));
    let zipf_numbers: Vec<[u8; 8]> = (0..1_000_000u64)
        .map(|_| zipf.sample(&mut rng).to_le_bytes())
        .collect();

    group.bench_with_input(
        BenchmarkId::new("numbers_zipf".to_string(), input_bytes),
        &(0..1_000_000u64),
        |b, i| b.iter(|| create_hash_map(i.clone().map(|_el| zipf.sample(&mut rng).to_le_bytes()))),
        &zipf_numbers,
        |b, i| b.iter(|| create_hash_map(i.iter().cloned())),
    );

    group.finish();
@@ -102,14 +126,15 @@ fn create_hash_map<'a, T: AsRef<[u8]>>(terms: impl Iterator<Item = T>) -> ArenaH
    map
}

fn create_hash_map_with_expull<'a, T: AsRef<[u8]>>(terms: impl Iterator<Item = T>) -> ArenaHashMap {
    let terms = terms.enumerate();
fn create_hash_map_with_expull<'a, T: AsRef<[u8]>>(
    terms: impl Iterator<Item = (u32, T)>,
) -> ArenaHashMap {
    let mut memory_arena = MemoryArena::default();
    let mut map = ArenaHashMap::with_capacity(HASHMAP_SIZE);
    for (i, term) in terms {
        map.mutate_or_create(term.as_ref(), |val: Option<DocIdRecorder>| {
            if let Some(mut rec) = val {
                rec.new_doc(i as u32, &mut memory_arena);
                rec.new_doc(i, &mut memory_arena);
                rec
            } else {
                DocIdRecorder::default()
@@ -2,6 +2,7 @@ use std::iter::{Cloned, Filter};
use std::mem;

use super::{Addr, MemoryArena};
use crate::fastcpy::fast_short_slice_copy;
use crate::memory_arena::store;
use crate::UnorderedId;

@@ -12,8 +13,12 @@ pub fn compute_table_memory_size(capacity: usize) -> usize {
    capacity * mem::size_of::<KeyValue>()
}

#[cfg(not(feature = "compare_hash_only"))]
type HashType = u32;

#[cfg(feature = "compare_hash_only")]
type HashType = u64;

/// `KeyValue` is the item stored in the hash table.
/// The key is actually a `BytesRef` object stored in an external memory arena.
/// The `value_addr` also points to an address in the memory arena.
@@ -62,25 +67,24 @@ pub struct ArenaHashMap {
}

struct LinearProbing {
    hash: HashType,
    i: u32,
    mask: u32,
    pos: usize,
    mask: usize,
}

impl LinearProbing {
    #[inline]
    fn compute(hash: HashType, mask: usize) -> LinearProbing {
        LinearProbing {
            hash,
            i: 0,
            mask: mask as u32,
            pos: hash as usize,
            mask,
        }
    }

    #[inline]
    fn next_probe(&mut self) -> usize {
        self.i += 1;
        ((self.hash + self.i) & self.mask) as usize
        // Not saving the masked version removes a dependency.
        self.pos = self.pos.wrapping_add(1);
        self.pos & self.mask
    }
}

@@ -133,10 +137,21 @@ impl ArenaHashMap {
    }

    #[inline]
    #[cfg(not(feature = "compare_hash_only"))]
    fn get_hash(&self, key: &[u8]) -> HashType {
        murmurhash32::murmurhash2(key)
    }

    #[inline]
    #[cfg(feature = "compare_hash_only")]
    fn get_hash(&self, key: &[u8]) -> HashType {
        /// Since we compare only the hash we need a high quality hash.
        use std::hash::Hasher;
        let mut hasher = ahash::AHasher::default();
        hasher.write(key);
        hasher.finish() as HashType
    }

    #[inline]
    pub fn read<Item: Copy + 'static>(&self, addr: Addr) -> Item {
        self.memory_arena.read(addr)
@@ -160,17 +175,19 @@ impl ArenaHashMap {
    #[inline]
    fn get_key_value(&self, addr: Addr) -> (&[u8], Addr) {
        let data = self.memory_arena.slice_from(addr);
        let (key_bytes_len_bytes, data) = data.split_at(2);
        let key_bytes_len_bytes = unsafe { data.get_unchecked(..2) };
        let key_bytes_len = u16::from_le_bytes(key_bytes_len_bytes.try_into().unwrap());
        let key_bytes: &[u8] = &data[..key_bytes_len as usize];
        let key_bytes: &[u8] = unsafe { data.get_unchecked(2..2 + key_bytes_len as usize) };
        (key_bytes, addr.offset(2 + key_bytes_len as u32))
    }

    #[inline]
    #[cfg(not(feature = "compare_hash_only"))]
    fn get_value_addr_if_key_match(&self, target_key: &[u8], addr: Addr) -> Option<Addr> {
        use crate::fastcmp::fast_short_slice_compare;

        let (stored_key, value_addr) = self.get_key_value(addr);
        if stored_key == target_key {
        if fast_short_slice_compare(stored_key, target_key) {
            Some(value_addr)
        } else {
            None
@@ -179,6 +196,8 @@ impl ArenaHashMap {
    #[inline]
    #[cfg(feature = "compare_hash_only")]
    fn get_value_addr_if_key_match(&self, _target_key: &[u8], addr: Addr) -> Option<Addr> {
        // For the compare_hash_only feature, it would make sense to store the keys at a different
        // memory location. Here they will just pollute the cache.
        let data = self.memory_arena.slice_from(addr);
        let key_bytes_len_bytes = &data[..2];
        let key_bytes_len = u16::from_le_bytes(key_bytes_len_bytes.try_into().unwrap());
@@ -284,9 +303,9 @@ impl ArenaHashMap {
        }
        let hash = self.get_hash(key);
        let mut probe = self.probe(hash);
        let mut bucket = probe.next_probe();
        let mut kv: KeyValue = self.table[bucket];
        loop {
            let bucket = probe.next_probe();
            let kv: KeyValue = self.table[bucket];
            if kv.is_empty() {
                // The key does not exist yet.
                let val = updater(None);
@@ -294,14 +313,16 @@ impl ArenaHashMap {
                let key_addr = self.memory_arena.allocate_space(num_bytes);
                {
                    let data = self.memory_arena.slice_mut(key_addr, num_bytes);
                    data[..2].copy_from_slice(&(key.len() as u16).to_le_bytes());
                    let key_len_bytes: [u8; 2] = (key.len() as u16).to_le_bytes();
                    data[..2].copy_from_slice(&key_len_bytes);
                    let stop = 2 + key.len();
                    data[2..stop].copy_from_slice(key);
                    fast_short_slice_copy(key, &mut data[2..stop]);
                    store(&mut data[stop..], val);
                }

                return self.set_bucket(hash, key_addr, bucket);
            } else if kv.hash == hash {
            }
            if kv.hash == hash {
                if let Some(val_addr) = self.get_value_addr_if_key_match(key, kv.key_value_addr) {
                    let v = self.memory_arena.read(val_addr);
                    let new_v = updater(Some(v));
@@ -309,6 +330,9 @@ impl ArenaHashMap {
                    return kv.unordered_id;
                }
            }
            // This allows fetching the next bucket before the loop jmp
            bucket = probe.next_probe();
            kv = self.table[bucket];
        }
    }
}
@@ -356,4 +380,23 @@ mod tests {
        assert_eq!(compute_previous_power_of_two(7), 4);
        assert_eq!(compute_previous_power_of_two(u64::MAX as usize), 1 << 63);
    }

    #[test]
    fn test_many_terms() {
        let mut terms: Vec<String> = (0..20_000).map(|val| val.to_string()).collect();
        let mut hash_map: ArenaHashMap = ArenaHashMap::default();
        for term in terms.iter() {
            hash_map.mutate_or_create(term.as_bytes(), |_opt_val: Option<u32>| 5u32);
        }
        let mut terms_back: Vec<String> = hash_map
            .iter()
            .map(|(bytes, _, _)| String::from_utf8(bytes.to_vec()).unwrap())
            .collect();
        terms_back.sort();
        terms.sort();

        for pos in 0..terms.len() {
            assert_eq!(terms[pos], terms_back[pos]);
        }
    }
}
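The `LinearProbing` change above keeps the unmasked position and applies the power-of-two mask on every probe. A small self-contained sketch of the same probing scheme (illustrative only; `Probe` is not a type in the crate):

```rust
struct Probe {
    pos: usize,
    mask: usize, // table_len - 1, where table_len is a power of two
}

impl Probe {
    fn new(hash: u32, table_len: usize) -> Probe {
        debug_assert!(table_len.is_power_of_two());
        Probe {
            pos: hash as usize,
            mask: table_len - 1,
        }
    }

    fn next(&mut self) -> usize {
        // Wrapping add keeps the running position; masking happens late.
        self.pos = self.pos.wrapping_add(1);
        self.pos & self.mask
    }
}

fn main() {
    let mut probe = Probe::new(0xDEAD_BEEF, 8);
    // Probes walk the table linearly and wrap at the table size.
    let buckets: Vec<usize> = (0..4).map(|_| probe.next()).collect();
    println!("{buckets:?}"); // [0, 1, 2, 3] for this hash and an 8-bucket table
}
```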
@@ -2,40 +2,10 @@ use std::mem;

use common::serialize_vint_u32;

use crate::memory_arena::{load, store};
use crate::fastcpy::fast_short_slice_copy;
use crate::{Addr, MemoryArena};

const MAX_BLOCK_LEN: u32 = 1u32 << 15;
const FIRST_BLOCK: usize = 16;
const INLINED_BLOCK_LEN: usize = FIRST_BLOCK + mem::size_of::<Addr>();

enum CapacityResult {
    Available(u32),
    NeedAlloc(u32),
}

fn len_to_capacity(len: u32) -> CapacityResult {
    match len {
        0..=15 => CapacityResult::Available(FIRST_BLOCK as u32 - len),
        16..=MAX_BLOCK_LEN => {
            let cap = 1 << (32u32 - (len - 1u32).leading_zeros());
            let available = cap - len;
            if available == 0 {
                CapacityResult::NeedAlloc(len)
            } else {
                CapacityResult::Available(available)
            }
        }
        n => {
            let available = n % MAX_BLOCK_LEN;
            if available == 0 {
                CapacityResult::NeedAlloc(MAX_BLOCK_LEN)
            } else {
                CapacityResult::Available(MAX_BLOCK_LEN - available)
            }
        }
    }
}
const FIRST_BLOCK_NUM: u16 = 2;

/// An exponential unrolled link.
///
@@ -52,17 +22,33 @@ fn len_to_capacity(len: u32) -> CapacityResult {
/// problem of selecting an adequate block size using a strategy similar to
/// that of the `Vec` amortized resize strategy.
///
/// Data is stored in a linked list of blocks. The first block has a size of `4`
/// Data is stored in a linked list of blocks. The first block has a size of `8`
/// and each block has a length of twice that of the previous block up to
/// `MAX_BLOCK_LEN = 32768`.
/// `MAX_BLOCK_LEN = 1<<15`.
///
/// This strategy is a good trade off to handle numerous very rare terms
/// and avoid wasting half of the memory for very frequent terms.
#[derive(Debug, Clone, Copy)]
pub struct ExpUnrolledLinkedList {
    len: u32,
    // u16, since the max size of each block is (1<<next_cap_pow_2)
    // Limited to 15, so we don't overflow remaining_cap.
    remaining_cap: u16,
    // To get the current number of blocks: block_num - FIRST_BLOCK_NUM
    block_num: u16,
    head: Addr,
    tail: Addr,
    inlined_data: [u8; INLINED_BLOCK_LEN],
}

impl Default for ExpUnrolledLinkedList {
    fn default() -> Self {
        Self {
            // 0 to trigger an initial allocation. Init with MemoryArena would be better.
            remaining_cap: 0,
            block_num: FIRST_BLOCK_NUM,
            head: Addr::null_pointer(),
            tail: Addr::null_pointer(),
        }
    }
}

pub struct ExpUnrolledLinkedListWriter<'a> {
@@ -70,32 +56,22 @@ pub struct ExpUnrolledLinkedListWriter<'a> {
    arena: &'a mut MemoryArena,
}

#[inline]
fn ensure_capacity<'a>(
    eull: &'a mut ExpUnrolledLinkedList,
    arena: &'a mut MemoryArena,
) -> &'a mut [u8] {
    if eull.len <= FIRST_BLOCK as u32 {
        // We are still hitting the inline block.
        if eull.len < FIRST_BLOCK as u32 {
            return &mut eull.inlined_data[eull.len as usize..FIRST_BLOCK];
        }
        // We need to allocate a new block!
        let new_block_addr: Addr = arena.allocate_space(FIRST_BLOCK + mem::size_of::<Addr>());
        store(&mut eull.inlined_data[FIRST_BLOCK..], new_block_addr);
        eull.tail = new_block_addr;
        return arena.slice_mut(eull.tail, FIRST_BLOCK);
    allocate: u32,
) {
    let new_block_addr: Addr = arena.allocate_space(allocate as usize + mem::size_of::<Addr>());
    // Check first write
    if eull.head.is_null() {
        eull.head = new_block_addr;
    } else {
        arena.write_at(eull.tail, new_block_addr);
    }
    let len = match len_to_capacity(eull.len) {
        CapacityResult::NeedAlloc(new_block_len) => {
            let new_block_addr: Addr =
                arena.allocate_space(new_block_len as usize + mem::size_of::<Addr>());
            arena.write_at(eull.tail, new_block_addr);
            eull.tail = new_block_addr;
            new_block_len
        }
        CapacityResult::Available(available) => available,
    };
    arena.slice_mut(eull.tail, len as usize)

    eull.tail = new_block_addr;
    eull.remaining_cap = allocate as u16;
}

impl<'a> ExpUnrolledLinkedListWriter<'a> {
@@ -111,56 +87,63 @@ impl<'a> ExpUnrolledLinkedListWriter<'a> {
        while !buf.is_empty() {
            let add_len: usize;
            {
                let output_buf = ensure_capacity(self.eull, self.arena);
                if self.eull.remaining_cap == 0 {
                    // Double the next cap
                    self.eull.increment_num_blocks();
                    let block_size = get_block_size(self.eull.block_num);
                    ensure_capacity(self.eull, self.arena, block_size as u32);
                }

                let output_buf = self
                    .arena
                    .slice_mut(self.eull.tail, self.eull.remaining_cap as usize);
                add_len = buf.len().min(output_buf.len());
                output_buf[..add_len].copy_from_slice(&buf[..add_len]);
                let output_buf = &mut output_buf[..add_len];
                let buf = &buf[..add_len];

                fast_short_slice_copy(buf, output_buf);
            }
            self.eull.len += add_len as u32;
            self.eull.remaining_cap -= add_len as u16;
            self.eull.tail = self.eull.tail.offset(add_len as u32);
            buf = &buf[add_len..];
        }
    }
}

impl Default for ExpUnrolledLinkedList {
    fn default() -> ExpUnrolledLinkedList {
        ExpUnrolledLinkedList {
            len: 0u32,
            tail: Addr::null_pointer(),
            inlined_data: [0u8; INLINED_BLOCK_LEN],
        }
    }
// The block size is 2^block_num + 2, but max 2^15 = 32k
// Initial size is 8, for the first block => block_num == 1
#[inline]
fn get_block_size(block_num: u16) -> u16 {
    1 << block_num.min(15)
}

impl ExpUnrolledLinkedList {
    pub fn increment_num_blocks(&mut self) {
        self.block_num += 1;
    }

    #[inline]
    pub fn writer<'a>(&'a mut self, arena: &'a mut MemoryArena) -> ExpUnrolledLinkedListWriter<'a> {
        ExpUnrolledLinkedListWriter { eull: self, arena }
    }

    pub fn read_to_end(&self, arena: &MemoryArena, output: &mut Vec<u8>) {
        let len = self.len as usize;
        if len <= FIRST_BLOCK {
            output.extend_from_slice(&self.inlined_data[..len]);
        let mut addr = self.head;
        if addr.is_null() {
            return;
        }
        output.extend_from_slice(&self.inlined_data[..FIRST_BLOCK]);
        let mut cur = FIRST_BLOCK;
        let mut addr = load(&self.inlined_data[FIRST_BLOCK..]);
        loop {
            let cap = match len_to_capacity(cur as u32) {
                CapacityResult::Available(capacity) => capacity,
                CapacityResult::NeedAlloc(capacity) => capacity,
            } as usize;
        let last_block_len = get_block_size(self.block_num) as usize - self.remaining_cap as usize;

        // Full Blocks
        for block_num in FIRST_BLOCK_NUM + 1..self.block_num {
            let cap = get_block_size(block_num) as usize;
            let data = arena.slice(addr, cap);
            if cur + cap >= len {
                output.extend_from_slice(&data[..(len - cur)]);
                return;
            }
            output.extend_from_slice(data);
            cur += cap;
            addr = arena.read(addr.offset(cap as u32));
        }
        // Last Block
        let data = arena.slice(addr, last_block_len);
        output.extend_from_slice(data);
    }
}

@@ -169,10 +152,21 @@ mod tests {
    use common::{read_u32_vint, write_u32_vint};

    use super::super::MemoryArena;
    use super::{len_to_capacity, *};
    use super::*;

    #[test]
    fn test_eull() {
    fn test_eull_empty() {
        let arena = MemoryArena::default();
        let stack = ExpUnrolledLinkedList::default();
        {
            let mut buffer = Vec::new();
            stack.read_to_end(&arena, &mut buffer);
            assert_eq!(&buffer[..], &[]);
        }
    }

    #[test]
    fn test_eull1() {
        let mut arena = MemoryArena::default();
        let mut stack = ExpUnrolledLinkedList::default();
        stack.writer(&mut arena).extend_from_slice(&[1u8]);
@@ -186,6 +180,35 @@ mod tests {
        }
    }

    #[test]
    fn test_eull_vint1() {
        let mut arena = MemoryArena::default();
        let mut stack = ExpUnrolledLinkedList::default();
        stack.writer(&mut arena).extend_from_slice(&[1u8]);
        stack.writer(&mut arena).extend_from_slice(&[2u8]);
        stack.writer(&mut arena).extend_from_slice(&[3u8, 4u8]);
        stack.writer(&mut arena).extend_from_slice(&[5u8]);
        {
            let mut buffer = Vec::new();
            stack.read_to_end(&arena, &mut buffer);
            assert_eq!(&buffer[..], &[1u8, 2u8, 3u8, 4u8, 5u8]);
        }
    }

    #[test]
    fn test_eull_first_write_extends_cap() {
        let mut arena = MemoryArena::default();
        let mut stack = ExpUnrolledLinkedList::default();
        stack
            .writer(&mut arena)
            .extend_from_slice(&[1u8, 2, 3, 4, 5, 6, 7, 8, 9]);
        {
            let mut buffer = Vec::new();
            stack.read_to_end(&arena, &mut buffer);
            assert_eq!(&buffer[..], &[1u8, 2, 3, 4, 5, 6, 7, 8, 9]);
        }
    }

    #[test]
    fn test_eull_long() {
        let mut arena = MemoryArena::default();
@@ -204,9 +227,18 @@ mod tests {
        assert_eq!(&result[..], &data[..]);
    }

    #[test]
    fn test_eull_limit() {
        let mut eull = ExpUnrolledLinkedList::default();
        for _ in 0..100 {
            eull.increment_num_blocks();
        }
        assert_eq!(get_block_size(eull.block_num), 1 << 15);
    }

    #[test]
    fn test_eull_interlaced() {
        let mut eull = MemoryArena::default();
        let mut arena = MemoryArena::default();
        let mut stack = ExpUnrolledLinkedList::default();
        let mut stack2 = ExpUnrolledLinkedList::default();

@@ -214,68 +246,20 @@ mod tests {
        let mut vec2: Vec<u8> = vec![];

        for i in 0..9 {
            stack.writer(&mut eull).write_u32_vint(i);
            stack.writer(&mut arena).write_u32_vint(i);
            assert!(write_u32_vint(i, &mut vec1).is_ok());
            if i % 2 == 0 {
                stack2.writer(&mut eull).write_u32_vint(i);
                stack2.writer(&mut arena).write_u32_vint(i);
                assert!(write_u32_vint(i, &mut vec2).is_ok());
            }
        }
        let mut res1 = vec![];
        let mut res2 = vec![];
        stack.read_to_end(&eull, &mut res1);
        stack2.read_to_end(&eull, &mut res2);
        stack.read_to_end(&arena, &mut res1);
        stack2.read_to_end(&arena, &mut res2);
        assert_eq!(&vec1[..], &res1[..]);
        assert_eq!(&vec2[..], &res2[..]);
    }

    #[test]
    fn test_jump_if_needed() {
        let mut available = 16u32;
        for i in 0..10_000_000 {
            match len_to_capacity(i) {
                CapacityResult::NeedAlloc(cap) => {
                    assert_eq!(available, 0, "Failed len={i}: Expected 0 got {cap}");
                    available = cap;
                }
                CapacityResult::Available(cap) => {
                    assert_eq!(
                        available, cap,
                        "Failed len={i}: Expected {available} Got {cap}"
                    );
                }
            }
            available -= 1;
        }
    }

    #[test]
    fn test_jump_if_needed_progression() {
        let mut v = vec![];
        for i in 0.. {
            if v.len() >= 10 {
                break;
            }
            if let CapacityResult::NeedAlloc(cap) = len_to_capacity(i) {
                v.push((i, cap));
            }
        }
        assert_eq!(
            &v[..],
            &[
                (16, 16),
                (32, 32),
                (64, 64),
                (128, 128),
                (256, 256),
                (512, 512),
                (1024, 1024),
                (2048, 2048),
                (4096, 4096),
                (8192, 8192)
            ]
        );
    }
}

#[cfg(all(test, feature = "unstable"))]
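The rewritten `ExpUnrolledLinkedList` above replaces `len_to_capacity` with a per-block counter: each newly allocated block doubles in size until it saturates at `MAX_BLOCK_LEN = 1 << 15`. A sketch of that progression, using the constants visible in the hunks (`FIRST_BLOCK_NUM = 2`, block number incremented before the first allocation, 32 KiB cap); the helper below is illustrative, not the crate's API:

```rust
fn block_size(block_num: u16) -> u32 {
    1u32 << block_num.min(15)
}

fn main() {
    // The progression starts at block_num == 3, i.e. a first block of 8 bytes.
    let sizes: Vec<u32> = (3..=16).map(block_size).collect();
    println!("{sizes:?}"); // [8, 16, 32, ..., 16384, 32768, 32768]
}
```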
stacker/src/fastcmp.rs (new file, 147 lines)
@@ -0,0 +1,147 @@
/// fastcmp employs a trick to speed up the comparison of two slices of bytes.
/// Unlike a memcmp call, it can also be inlined.
///
/// E.g. equality of slices of length 7 is checked in two steps, by comparing two 4-byte slices
/// unconditionally instead of comparing the remaining 3 bytes only if the first comparison was equal.
/// [1, 2, 3, 4, 5, 6, 7]
/// [1, 2, 3, 4]
/// [4, 5, 6, 7]
///
/// This method uses the XMM register for byte slices bigger than 16, else regular registers.
#[inline]
pub fn fast_short_slice_compare(left: &[u8], right: &[u8]) -> bool {
    let len = left.len();
    if len != right.len() {
        return false;
    }

    // This could be less equals, but to make the job a little bit easier for the branch predictor
    // we put the length 8 into the bigger group (8-16 bytes), that compares two u64,
    // assuming that the range 8-16 is more common than 4-7.

    // This weird branching is done on purpose to get the best assembly.
    // if len < 4 {
    // ..
    // if len < 8
    // will cause assembly inlined instead of jumps
    if len < 8 {
        if len >= 4 {
            return double_check_trick::<4>(left, right);
        } else {
            return short_compare(left, right);
        }
    }

    if len > 16 {
        return fast_nbyte_slice_compare::<16>(left, right);
    }

    double_check_trick::<8>(left, right)
}

// Note: The straightforward left.chunks_exact(SIZE).zip(right.chunks_exact(SIZE)) produces slower
// assembly
#[inline]
pub fn fast_nbyte_slice_compare<const SIZE: usize>(left: &[u8], right: &[u8]) -> bool {
    let last = left.len() - left.len() % SIZE;
    let mut i = 0;
    loop {
        if unsafe { left.get_unchecked(i..i + SIZE) != right.get_unchecked(i..i + SIZE) } {
            return false;
        }
        i += SIZE;
        if i >= last {
            break;
        }
    }
    unsafe { left.get_unchecked(left.len() - SIZE..) == right.get_unchecked(right.len() - SIZE..) }
}

#[inline(always)]
fn short_compare(left: &[u8], right: &[u8]) -> bool {
    for (l, r) in left.iter().zip(right) {
        if l != r {
            return false;
        }
    }
    true
}

#[inline(always)]
fn double_check_trick<const SIZE: usize>(left: &[u8], right: &[u8]) -> bool {
    left[0..SIZE] == right[0..SIZE] && left[left.len() - SIZE..] == right[right.len() - SIZE..]
}

#[cfg(test)]
mod tests {
    use proptest::prelude::*;

    use super::*;

    #[test]
    fn test_slice_compare_bytes_len_8() {
        let a = &[1, 2, 3, 4, 5, 6, 7, 8];
        let b = &[1, 2, 3, 4, 5, 6, 7, 8];
        let c = &[1, 2, 3, 4, 5, 6, 7, 7];

        assert!(fast_short_slice_compare(a, b));
        assert!(!fast_short_slice_compare(a, c));
    }

    #[test]
    fn test_slice_compare_bytes_len_9() {
        let a = &[1, 2, 3, 4, 5, 6, 7, 8, 9];
        let b = &[1, 2, 3, 4, 5, 6, 7, 8, 9];
        let c = &[0, 2, 3, 4, 5, 6, 7, 8, 9];

        assert!(fast_short_slice_compare(a, b));
        assert!(!fast_short_slice_compare(a, c));
    }

    #[test]
    fn test_slice_compare_bytes_len_16() {
        let a = &[1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
        let b = &[1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
        let c = &[1, 2, 3, 4, 5, 6, 7, 7, 1, 2, 3, 4, 5, 6, 7, 8];

        assert!(fast_short_slice_compare(a, b));
        assert!(!fast_short_slice_compare(a, c));
    }

    #[test]
    fn test_slice_compare_bytes_short() {
        let a = &[1, 2, 3, 4];
        let b = &[1, 2, 3, 4];

        assert!(fast_short_slice_compare(a, b));

        let a = &[1, 2, 3];
        let b = &[1, 2, 3];

        assert!(fast_short_slice_compare(a, b));

        let a = &[1, 2];
        let b = &[1, 2];

        assert!(fast_short_slice_compare(a, b));
    }

    proptest! {
        #[test]
        fn test_fast_short_slice_compare(left in prop::collection::vec(any::<u8>(), 0..100),
            right in prop::collection::vec(any::<u8>(), 0..100)) {
            let result = fast_short_slice_compare(&left, &right);
            let expected = left == right;
            prop_assert_eq!(result, expected, "left: {:?}, right: {:?}", left, right);
        }

        #[test]
        fn test_fast_short_slice_compare_equal(left in prop::collection::vec(any::<u8>(), 0..100),
        ) {
            let result = fast_short_slice_compare(&left, &left);
            let expected = left == left;
            prop_assert_eq!(result, expected, "left: {:?}, right: {:?}", left, left);
        }
    }
}
stacker/src/fastcpy.rs (new file, 117 lines)
@@ -0,0 +1,117 @@
/// Optimized copy for small sizes. All bounds checks are elided.
/// Avoids a call to memcpy.
/// Applies the unbranched copy trick for sizes 8, 16, 32.
///
/// `src` and `dst` must have the same length.
#[inline]
pub fn fast_short_slice_copy(src: &[u8], dst: &mut [u8]) {
    #[inline(never)]
    #[cold]
    #[track_caller]
    fn len_mismatch_fail(dst_len: usize, src_len: usize) -> ! {
        panic!(
            "source slice length ({}) does not match destination slice length ({})",
            src_len, dst_len,
        );
    }

    if src.len() != dst.len() {
        len_mismatch_fail(src.len(), dst.len());
    }
    let len = src.len();

    if src.is_empty() {
        return;
    }

    if len < 4 {
        short_copy(src, dst);
        return;
    }

    if len < 8 {
        double_copy_trick::<4>(src, dst);
        return;
    }

    if len <= 16 {
        double_copy_trick::<8>(src, dst);
        return;
    }

    if len <= 32 {
        double_copy_trick::<16>(src, dst);
        return;
    }

    /// The code will use the vmovdqu instruction to copy 32 bytes at a time.
    #[cfg(target_feature = "avx")]
    {
        if len <= 64 {
            double_copy_trick::<32>(src, dst);
            return;
        }
    }

    // For larger sizes we use the default, which calls memcpy.
    // memcpy does some virtual memory tricks to copy large chunks of memory.
    //
    // The theory should be that the checks above don't cost much relative to the copy call for
    // larger copies.
    // The bounds checks in `copy_from_slice` are elided.
    dst.copy_from_slice(src);
}

#[inline(always)]
fn short_copy(src: &[u8], dst: &mut [u8]) {
    debug_assert_ne!(src.len(), 0);
    debug_assert_eq!(src.len(), dst.len());
    let len = src.len();

    // length 1-3
    dst[0] = src[0];
    if len >= 2 {
        double_copy_trick::<2>(src, dst);
    }
}

#[inline(always)]
fn double_copy_trick<const SIZE: usize>(src: &[u8], dst: &mut [u8]) {
    debug_assert!(src.len() >= SIZE);
    debug_assert!(dst.len() >= SIZE);
    dst[0..SIZE].copy_from_slice(&src[0..SIZE]);
    dst[src.len() - SIZE..].copy_from_slice(&src[src.len() - SIZE..]);
}

#[cfg(test)]
mod tests {
    use super::*;

    fn copy_test<const SIZE: usize>() {
        let src: Vec<u8> = (0..SIZE as u8).collect();
        let mut dst = [0u8; SIZE];
        fast_short_slice_copy(&src, &mut dst);
        assert_eq!(src, dst);
    }

    #[test]
    fn copy_test_n() {
        copy_test::<1>();
        copy_test::<2>();
        copy_test::<3>();
        copy_test::<4>();
        copy_test::<5>();
        copy_test::<6>();
        copy_test::<7>();
        copy_test::<8>();
        copy_test::<9>();
        copy_test::<10>();
        copy_test::<11>();
        copy_test::<31>();
        copy_test::<32>();
        copy_test::<33>();
        copy_test::<47>();
        copy_test::<48>();
        copy_test::<49>();
    }
}
@@ -5,6 +5,9 @@ extern crate test;

mod arena_hashmap;
mod expull;
#[allow(dead_code)]
mod fastcmp;
mod fastcpy;
mod memory_arena;

pub use self::arena_hashmap::{compute_table_memory_size, ArenaHashMap};
@@ -74,7 +74,7 @@ impl Addr {
    }
}

#[inline]
#[inline(always)]
pub fn store<Item: Copy + 'static>(dest: &mut [u8], val: Item) {
    debug_assert_eq!(dest.len(), std::mem::size_of::<Item>());
    unsafe {
@@ -104,12 +104,6 @@ impl Default for MemoryArena {
}

impl MemoryArena {
    fn add_page(&mut self) -> &mut Page {
        let new_page_id = self.pages.len();
        self.pages.push(Page::new(new_page_id));
        &mut self.pages[new_page_id]
    }

    /// Returns an estimate in number of bytes
    /// of resident memory consumed by the `MemoryArena`.
    ///
@@ -134,36 +128,58 @@ impl MemoryArena {
    pub fn read<Item: Copy + 'static>(&self, addr: Addr) -> Item {
        load(self.slice(addr, mem::size_of::<Item>()))
    }
    #[inline]
    fn get_page(&self, page_id: usize) -> &Page {
        unsafe { self.pages.get_unchecked(page_id) }
    }
    #[inline]
    fn get_page_mut(&mut self, page_id: usize) -> &mut Page {
        unsafe { self.pages.get_unchecked_mut(page_id) }
    }

    #[inline]
    pub fn slice(&self, addr: Addr, len: usize) -> &[u8] {
        self.pages[addr.page_id()].slice(addr.page_local_addr(), len)
        self.get_page(addr.page_id())
            .slice(addr.page_local_addr(), len)
    }

    #[inline]
    pub fn slice_from(&self, addr: Addr) -> &[u8] {
        self.pages[addr.page_id()].slice_from(addr.page_local_addr())
        self.get_page(addr.page_id())
            .slice_from(addr.page_local_addr())
    }

    #[inline]
    pub fn slice_mut(&mut self, addr: Addr, len: usize) -> &mut [u8] {
        self.pages[addr.page_id()].slice_mut(addr.page_local_addr(), len)
        self.get_page_mut(addr.page_id())
            .slice_mut(addr.page_local_addr(), len)
    }

    /// Add a page and allocate len on it.
    /// Return the address
    fn add_page(&mut self, len: usize) -> Addr {
        let new_page_id = self.pages.len();
        let mut page = Page::new(new_page_id);
        page.len = len;
        self.pages.push(page);
        Addr::new(new_page_id, 0)
    }

    /// Allocates `len` bytes and returns the allocated address.
    #[inline]
    pub fn allocate_space(&mut self, len: usize) -> Addr {
        let page_id = self.pages.len() - 1;
        if let Some(addr) = self.pages[page_id].allocate_space(len) {
        if let Some(addr) = self.get_page_mut(page_id).allocate_space(len) {
            return addr;
        }
        self.add_page().allocate_space(len).unwrap()
        self.add_page(len)
    }
}

struct Page {
    page_id: usize,
    len: usize,
    data: Box<[u8]>,
    data: Box<[u8; PAGE_SIZE]>,
}

impl Page {
@@ -171,7 +187,7 @@ impl Page {
        Page {
            page_id,
            len: 0,
            data: vec![0u8; PAGE_SIZE].into_boxed_slice(),
            data: vec![0u8; PAGE_SIZE].into_boxed_slice().try_into().unwrap(),
        }
    }

@@ -182,7 +198,8 @@ impl Page {

    #[inline]
    fn slice(&self, local_addr: usize, len: usize) -> &[u8] {
        &self.slice_from(local_addr)[..len]
        let data = &self.slice_from(local_addr);
        unsafe { data.get_unchecked(..len) }
    }

    #[inline]
@@ -192,9 +209,11 @@ impl Page {

    #[inline]
    fn slice_mut(&mut self, local_addr: usize, len: usize) -> &mut [u8] {
        &mut self.data[local_addr..][..len]
        let data = &mut self.data[local_addr..];
        unsafe { data.get_unchecked_mut(..len) }
    }

    #[inline]
    fn allocate_space(&mut self, len: usize) -> Option<Addr> {
        if self.is_available(len) {
            let addr = Addr::new(self.page_id, self.len);
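A short usage sketch of the arena calls touched above (`allocate_space`, `slice_mut`, `read`). It assumes `MemoryArena` is exported by the tantivy-stacker crate, as the benches use it; the write goes through the raw byte slice and `read` reinterprets the bytes at that address:

```rust
use tantivy_stacker::MemoryArena;

fn main() {
    let mut arena = MemoryArena::default();
    let num_bytes = std::mem::size_of::<u64>();
    // `allocate_space` bumps the current page (or adds a page) and returns an Addr.
    let addr = arena.allocate_space(num_bytes);
    // Writes go through a raw byte slice at that address...
    arena
        .slice_mut(addr, num_bytes)
        .copy_from_slice(&42u64.to_ne_bytes());
    // ...and `read` copies the bytes back out as a typed value.
    let value: u64 = arena.read(addr);
    assert_eq!(value, 42);
}
```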
@@ -4,6 +4,8 @@
license = "MIT"
edition = "2021"
description = "Tokenizer API of tantivy"
homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@@ -34,26 +34,44 @@ impl Default for Token {
            offset_from: 0,
            offset_to: 0,
            position: usize::MAX,
            text: String::with_capacity(200),
            text: String::new(),
            position_length: 1,
        }
    }
}

impl Token {
    /// reset to default
    pub fn reset(&mut self) {
        self.offset_from = 0;
        self.offset_to = 0;
        self.position = usize::MAX;
        self.text.clear();
        self.position_length = 1;
    }
}

/// `Tokenizer`s are in charge of splitting text into a stream of tokens
/// before indexing.
pub trait Tokenizer: 'static + Clone + Send + Sync {
    /// The token stream returned by this Tokenizer.
    type TokenStream<'a>: TokenStream;
    /// Creates a token stream for a given `str`.
    fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a>;
    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a>;
}

/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);

impl<'a> From<BoxTokenStream<'a>> for Box<dyn TokenStream + 'a> {
    fn from(token_stream: BoxTokenStream<'a>) -> Self {
        token_stream.0
    }
}

impl<'a, T> From<T> for BoxTokenStream<'a>
where T: TokenStream + 'a
where
    T: TokenStream + 'a,
{
    fn from(token_stream: T) -> BoxTokenStream<'a> {
        BoxTokenStream(Box::new(token_stream))
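To close, a minimal `Tokenizer` implementation under the new `&'a mut self` contract, mirroring the `WhitespaceTokenizer` pattern above: the tokenizer owns the `Token` buffer and the stream borrows it mutably. The tokenizer-api crate path and the exact `TokenStream` method set (`advance`/`token`/`token_mut`) are assumptions, not part of this diff.

```rust
use tantivy_tokenizer_api::{Token, TokenStream, Tokenizer};

/// Emits the whole input as a single token, reusing the owned `Token` buffer.
#[derive(Clone, Default)]
pub struct WholeTextTokenizer {
    token: Token,
}

pub struct WholeTextStream<'a> {
    text: &'a str,
    emitted: bool,
    token: &'a mut Token,
}

impl Tokenizer for WholeTextTokenizer {
    type TokenStream<'a> = WholeTextStream<'a>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> WholeTextStream<'a> {
        // Reset the reused buffer, exactly like WhitespaceTokenizer does.
        self.token.reset();
        WholeTextStream {
            text,
            emitted: false,
            token: &mut self.token,
        }
    }
}

impl<'a> TokenStream for WholeTextStream<'a> {
    fn advance(&mut self) -> bool {
        if self.emitted || self.text.is_empty() {
            return false;
        }
        self.emitted = true;
        self.token.position = 0;
        self.token.offset_from = 0;
        self.token.offset_to = self.text.len();
        self.token.text.push_str(self.text);
        true
    }

    fn token(&self) -> &Token {
        self.token
    }

    fn token_mut(&mut self) -> &mut Token {
        self.token
    }
}
```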