Compare commits


7 Commits

Author SHA1 Message Date
François Massot
36d585a402 Perform only one `mem::swap`. 2023-07-17 16:24:48 +09:00
François Massot
90b9059a93 Add `preserve_original` option to the ASCII folding filter. 2023-07-17 16:15:59 +09:00
Adam Reichold
820f126075 Remove support for Brotli and Snappy compression (#2123)
LZ4 provides fast and simple compression, whereas Zstd is exceptionally flexible;
the additional support for Brotli and Snappy therefore does not add any distinct
functionality on top of those two algorithms.

Removing them reduces our maintenance burden and the number of choices
users have to make when setting up a project based on Tantivy.
2023-07-14 16:54:59 +09:00
Adam Reichold
7e6c4a1856 Include only built-in compression algorithms as enum variants (#2121)
* Include only built-in compression algorithms as enum variants

This enables compile-time errors when a compression algorithm is requested that
is not actually enabled for the current Cargo project. The cost is that indexes
using other compression algorithms cannot even be loaded (even though they
are not fully accessible in any case).

As a drive-by, this also fixes `--no-default-features` on `cfg(unix)`.

* Provide more instructive error messages for unsupported, but not unknown compression variants.
2023-07-14 11:02:49 +09:00
Adam Reichold
5fafe4b1ab Add missing query_terms impl for TermSetQuery. (#2120) 2023-07-13 14:54:29 +02:00
PSeitz
1e7cd48cfa remove allocations in split compound words (#2080)
* remove allocations in split compound words

* clear reused data
2023-07-13 09:43:02 +09:00
dependabot[bot]
7f51d85bbd Update lru requirement from 0.10.0 to 0.11.0 (#2117)
Updates the requirements on [lru](https://github.com/jeromefroe/lru-rs) to permit the latest version.
- [Changelog](https://github.com/jeromefroe/lru-rs/blob/master/CHANGELOG.md)
- [Commits](https://github.com/jeromefroe/lru-rs/compare/0.10.0...0.11.0)

---
updated-dependencies:
- dependency-name: lru
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-07-13 09:42:21 +09:00
13 changed files with 180 additions and 234 deletions

View File

@@ -53,7 +53,7 @@ jobs:
strategy:
matrix:
features: [
{ label: "all", flags: "mmap,stopwords,brotli-compression,lz4-compression,snappy-compression,zstd-compression,failpoints" },
{ label: "all", flags: "mmap,stopwords,lz4-compression,zstd-compression,failpoints" },
{ label: "quickwit", flags: "mmap,quickwit,failpoints" }
]

View File

@@ -25,9 +25,7 @@ aho-corasick = "1.0"
tantivy-fst = "0.4.0"
memmap2 = { version = "0.7.1", optional = true }
lz4_flex = { version = "0.11", default-features = false, optional = true }
brotli = { version = "3.3.4", optional = true }
zstd = { version = "0.12", optional = true, default-features = false }
snap = { version = "1.0.5", optional = true }
tempfile = { version = "3.3.0", optional = true }
log = "0.4.16"
serde = { version = "1.0.136", features = ["derive"] }
@@ -49,7 +47,7 @@ murmurhash32 = "0.3.0"
time = { version = "0.3.10", features = ["serde-well-known"] }
smallvec = "1.8.0"
rayon = "1.5.2"
lru = "0.10.0"
lru = "0.11.0"
fastdivide = "0.4.0"
itertools = "0.11.0"
measure_time = "0.8.2"
@@ -107,9 +105,7 @@ default = ["mmap", "stopwords", "lz4-compression"]
mmap = ["fs4", "tempfile", "memmap2"]
stopwords = []
brotli-compression = ["brotli"]
lz4-compression = ["lz4_flex"]
snappy-compression = ["snap"]
zstd-compression = ["zstd"]
failpoints = ["fail", "fail/failpoints"]

View File

@@ -44,7 +44,7 @@ Details about the benchmark can be found at this [repository](https://github.com
- Single valued and multivalued u64, i64, and f64 fast fields (equivalent of doc values in Lucene)
- `&[u8]` fast fields
- Text, i64, u64, f64, dates, ip, bool, and hierarchical facet fields
- Compressed document store (LZ4, Zstd, None, Brotli, Snap)
- Compressed document store (LZ4, Zstd, None)
- Range queries
- Faceted search
- Configurable indexing (optional term frequency and position indexing)
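
For context, the document-store compressor is chosen through `IndexSettings` at index creation time. A minimal sketch, assuming tantivy's `IndexBuilder` API (`Index::builder()`) and default values for the remaining settings fields:

```rust
use tantivy::schema::{Schema, TEXT};
use tantivy::store::{Compressor, ZstdCompressor};
use tantivy::{Index, IndexSettings};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("text", TEXT);
    let schema = schema_builder.build();

    // Requires the `zstd-compression` feature; with it disabled, the
    // `Compressor::Zstd` variant no longer exists and this fails to compile.
    let settings = IndexSettings {
        docstore_compression: Compressor::Zstd(ZstdCompressor::default()),
        ..IndexSettings::default()
    };

    let _index = Index::builder()
        .schema(schema)
        .settings(settings)
        .create_in_ram()?;
    Ok(())
}
```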

View File

@@ -410,7 +410,9 @@ mod tests {
use super::IndexMeta;
use crate::core::index_meta::UntrackedIndexMeta;
use crate::schema::{Schema, TEXT};
use crate::store::{Compressor, ZstdCompressor};
use crate::store::Compressor;
#[cfg(feature = "zstd-compression")]
use crate::store::ZstdCompressor;
use crate::{IndexSettings, IndexSortByField, Order};
#[test]
@@ -446,6 +448,7 @@ mod tests {
}
#[test]
#[cfg(feature = "zstd-compression")]
fn test_serialize_metas_zstd_compressor() {
let schema = {
let mut schema_builder = Schema::builder();
@@ -482,13 +485,14 @@ mod tests {
}
#[test]
#[cfg(all(feature = "lz4-compression", feature = "zstd-compression"))]
fn test_serialize_metas_invalid_comp() {
let json = r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zsstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
let err = serde_json::from_str::<UntrackedIndexMeta>(json).unwrap_err();
assert_eq!(
err.to_string(),
"unknown variant `zsstd`, expected one of `none`, `lz4`, `brotli`, `snappy`, `zstd`, \
"unknown variant `zsstd`, expected one of `none`, `lz4`, `zstd`, \
`zstd(compression_level=5)` at line 1 column 96"
.to_string()
);
@@ -502,6 +506,20 @@ mod tests {
);
}
#[test]
#[cfg(not(feature = "zstd-compression"))]
fn test_serialize_metas_unsupported_comp() {
let json = r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
let err = serde_json::from_str::<UntrackedIndexMeta>(json).unwrap_err();
assert_eq!(
err.to_string(),
"unsupported variant `zstd`, please enable Tantivy's `zstd-compression` feature at \
line 1 column 95"
.to_string()
);
}
#[test]
#[cfg(feature = "lz4-compression")]
fn test_index_settings_default() {

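The distinction these tests exercise can also be probed directly on `Compressor`, which (de)serializes from a plain string. A rough sketch, assuming `serde_json` is available and both the `lz4-compression` and `zstd-compression` features are enabled:

```rust
use tantivy::store::Compressor;

fn main() {
    // Known and enabled variants parse.
    let lz4: Compressor = serde_json::from_str("\"lz4\"").unwrap();
    assert!(matches!(lz4, Compressor::Lz4));

    // A misspelled variant is *unknown* and the error lists the valid choices.
    let err = serde_json::from_str::<Compressor>("\"zsstd\"").unwrap_err();
    assert!(err.to_string().starts_with("unknown variant `zsstd`"));

    // With `zstd-compression` disabled, "zstd" would instead produce the
    // *unsupported* variant message pointing at the missing Cargo feature.
}
```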
View File

@@ -191,7 +191,7 @@ pub use crate::schema::{DateOptions, DateTimePrecision, Document, Term};
/// Index format version.
const INDEX_FORMAT_VERSION: u32 = 5;
#[cfg(unix)]
#[cfg(all(feature = "mmap", unix))]
pub use memmap2::Advice;
/// Structure version for the index.

View File

@@ -72,6 +72,14 @@ impl Query for TermSetQuery {
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
Ok(Box::new(self.specialized_weight(enable_scoring.schema())?))
}
fn query_terms<'a>(&'a self, visitor: &mut dyn FnMut(&'a Term, bool)) {
for terms in self.terms_map.values() {
for term in terms {
visitor(term, false);
}
}
}
}
struct SetDfaWrapper(Map<Vec<u8>>);

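A small usage sketch for the new impl; `TermSetQuery::new` and `Term::from_field_text` are part of tantivy's public API, and the `bool` flag passed to the visitor (always `false` here) signals whether positions are required:

```rust
use tantivy::query::{Query, TermSetQuery};
use tantivy::schema::{Schema, TEXT};
use tantivy::Term;

fn main() {
    let mut schema_builder = Schema::builder();
    let field = schema_builder.add_text_field("body", TEXT);
    let _schema = schema_builder.build();

    let query = TermSetQuery::new(vec![
        Term::from_field_text(field, "apple"),
        Term::from_field_text(field, "pear"),
    ]);

    // Collect every term the query will touch.
    let mut terms = Vec::new();
    query.query_terms(&mut |term, _needs_positions| terms.push(term.clone()));
    assert_eq!(terms.len(), 2);
}
```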
View File

@@ -1,19 +0,0 @@
use std::io;
#[inline]
pub fn compress(mut uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
let params = brotli::enc::BrotliEncoderParams {
quality: 5,
..Default::default()
};
compressed.clear();
brotli::BrotliCompress(&mut uncompressed, compressed, &params)?;
Ok(())
}
#[inline]
pub fn decompress(mut compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
decompressed.clear();
brotli::BrotliDecompress(&mut compressed, decompressed)?;
Ok(())
}

View File

@@ -1,17 +0,0 @@
use std::io::{self, Read, Write};
#[inline]
pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
compressed.clear();
let mut encoder = snap::write::FrameEncoder::new(compressed);
encoder.write_all(uncompressed)?;
encoder.flush()?;
Ok(())
}
#[inline]
pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
decompressed.clear();
snap::read::FrameDecoder::new(compressed).read_to_end(decompressed)?;
Ok(())
}

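Both deleted helpers shared the codec shape that the surviving backends keep: compress and decompress into a caller-provided, reusable buffer. For comparison, a rough LZ4 equivalent using `lz4_flex`; tantivy's actual block codec differs in detail, so this is illustrative only:

```rust
use std::io;

pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
    compressed.clear();
    // Prepends the uncompressed length so decompression can size its output.
    compressed.extend_from_slice(&lz4_flex::compress_prepend_size(uncompressed));
    Ok(())
}

pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
    decompressed.clear();
    let out = lz4_flex::decompress_size_prepended(compressed)
        .map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err))?;
    decompressed.extend_from_slice(&out);
    Ok(())
}
```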
View File

@@ -17,12 +17,10 @@ pub enum Compressor {
/// No compression
None,
/// Use the lz4 compressor (block format)
#[cfg(feature = "lz4-compression")]
Lz4,
/// Use the brotli compressor
Brotli,
/// Use the snap compressor
Snappy,
/// Use the zstd compressor
#[cfg(feature = "zstd-compression")]
Zstd(ZstdCompressor),
}
@@ -31,9 +29,9 @@ impl Serialize for Compressor {
where S: serde::Serializer {
match *self {
Compressor::None => serializer.serialize_str("none"),
#[cfg(feature = "lz4-compression")]
Compressor::Lz4 => serializer.serialize_str("lz4"),
Compressor::Brotli => serializer.serialize_str("brotli"),
Compressor::Snappy => serializer.serialize_str("snappy"),
#[cfg(feature = "zstd-compression")]
Compressor::Zstd(zstd) => serializer.serialize_str(&zstd.ser_to_string()),
}
}
@@ -45,27 +43,38 @@ impl<'de> Deserialize<'de> for Compressor {
let buf = String::deserialize(deserializer)?;
let compressor = match buf.as_str() {
"none" => Compressor::None,
#[cfg(feature = "lz4-compression")]
"lz4" => Compressor::Lz4,
"brotli" => Compressor::Brotli,
"snappy" => Compressor::Snappy,
#[cfg(not(feature = "lz4-compression"))]
"lz4" => {
return Err(serde::de::Error::custom(
"unsupported variant `lz4`, please enable Tantivy's `lz4-compression` feature",
))
}
#[cfg(feature = "zstd-compression")]
_ if buf.starts_with("zstd") => Compressor::Zstd(
ZstdCompressor::deser_from_str(&buf).map_err(serde::de::Error::custom)?,
),
#[cfg(not(feature = "zstd-compression"))]
_ if buf.starts_with("zstd") => {
return Err(serde::de::Error::custom(
"unsupported variant `zstd`, please enable Tantivy's `zstd-compression` \
feature",
))
}
_ => {
if buf.starts_with("zstd") {
Compressor::Zstd(
ZstdCompressor::deser_from_str(&buf).map_err(serde::de::Error::custom)?,
)
} else {
return Err(serde::de::Error::unknown_variant(
&buf,
&[
"none",
"lz4",
"brotli",
"snappy",
"zstd",
"zstd(compression_level=5)",
],
));
}
return Err(serde::de::Error::unknown_variant(
&buf,
&[
"none",
#[cfg(feature = "lz4-compression")]
"lz4",
#[cfg(feature = "zstd-compression")]
"zstd",
#[cfg(feature = "zstd-compression")]
"zstd(compression_level=5)",
],
));
}
};
@@ -127,18 +136,15 @@ impl ZstdCompressor {
}
impl Default for Compressor {
#[allow(unreachable_code)]
fn default() -> Self {
if cfg!(feature = "lz4-compression") {
Compressor::Lz4
} else if cfg!(feature = "brotli-compression") {
Compressor::Brotli
} else if cfg!(feature = "snappy-compression") {
Compressor::Snappy
} else if cfg!(feature = "zstd-compression") {
Compressor::Zstd(ZstdCompressor::default())
} else {
Compressor::None
}
#[cfg(feature = "lz4-compression")]
return Compressor::Lz4;
#[cfg(feature = "zstd-compression")]
return Compressor::Zstd(ZstdCompressor::default());
Compressor::None
}
}
@@ -155,50 +161,14 @@ impl Compressor {
compressed.extend_from_slice(uncompressed);
Ok(())
}
Self::Lz4 => {
#[cfg(feature = "lz4-compression")]
{
super::compression_lz4_block::compress(uncompressed, compressed)
}
#[cfg(not(feature = "lz4-compression"))]
{
panic!("lz4-compression feature flag not activated");
}
}
Self::Brotli => {
#[cfg(feature = "brotli-compression")]
{
super::compression_brotli::compress(uncompressed, compressed)
}
#[cfg(not(feature = "brotli-compression"))]
{
panic!("brotli-compression-compression feature flag not activated");
}
}
Self::Snappy => {
#[cfg(feature = "snappy-compression")]
{
super::compression_snap::compress(uncompressed, compressed)
}
#[cfg(not(feature = "snappy-compression"))]
{
panic!("snappy-compression feature flag not activated");
}
}
Self::Zstd(_zstd_compressor) => {
#[cfg(feature = "zstd-compression")]
{
super::compression_zstd_block::compress(
uncompressed,
compressed,
_zstd_compressor.compression_level,
)
}
#[cfg(not(feature = "zstd-compression"))]
{
panic!("zstd-compression feature flag not activated");
}
}
#[cfg(feature = "lz4-compression")]
Self::Lz4 => super::compression_lz4_block::compress(uncompressed, compressed),
#[cfg(feature = "zstd-compression")]
Self::Zstd(_zstd_compressor) => super::compression_zstd_block::compress(
uncompressed,
compressed,
_zstd_compressor.compression_level,
),
}
}
}

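The pattern is worth spelling out in isolation, since it recurs in `Decompressor` below: variants exist only when their feature is compiled in, so using a disabled algorithm is a type error rather than a runtime panic. A self-contained sketch with hypothetical `lz4`/`zstd` features:

```rust
pub enum Codec {
    None,
    #[cfg(feature = "lz4")]
    Lz4,
    #[cfg(feature = "zstd")]
    Zstd { level: i32 },
}

impl Default for Codec {
    // Mirrors the new `Compressor::default`: the attribute silences the
    // warning for whichever code the active features make unreachable.
    #[allow(unreachable_code)]
    fn default() -> Self {
        #[cfg(feature = "lz4")]
        return Codec::Lz4;
        #[cfg(feature = "zstd")]
        return Codec::Zstd { level: 3 };
        Codec::None
    }
}

fn main() {
    // `Codec::Lz4` fails to compile unless the `lz4` feature is enabled:
    // the compile-time error the commit message describes.
    let codec = Codec::default();
    let _ = matches!(codec, Codec::None);
}
```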
View File

@@ -16,12 +16,10 @@ pub enum Decompressor {
/// No compression
None,
/// Use the lz4 decompressor (block format)
#[cfg(feature = "lz4-compression")]
Lz4,
/// Use the brotli decompressor
Brotli,
/// Use the snap decompressor
Snappy,
/// Use the zstd decompressor
#[cfg(feature = "zstd-compression")]
Zstd,
}
@@ -29,9 +27,9 @@ impl From<Compressor> for Decompressor {
fn from(compressor: Compressor) -> Self {
match compressor {
Compressor::None => Decompressor::None,
#[cfg(feature = "lz4-compression")]
Compressor::Lz4 => Decompressor::Lz4,
Compressor::Brotli => Decompressor::Brotli,
Compressor::Snappy => Decompressor::Snappy,
#[cfg(feature = "zstd-compression")]
Compressor::Zstd(_) => Decompressor::Zstd,
}
}
@@ -41,9 +39,9 @@ impl Decompressor {
pub(crate) fn from_id(id: u8) -> Decompressor {
match id {
0 => Decompressor::None,
#[cfg(feature = "lz4-compression")]
1 => Decompressor::Lz4,
2 => Decompressor::Brotli,
3 => Decompressor::Snappy,
#[cfg(feature = "zstd-compression")]
4 => Decompressor::Zstd,
_ => panic!("unknown compressor id {id:?}"),
}
@@ -52,9 +50,9 @@ impl Decompressor {
pub(crate) fn get_id(&self) -> u8 {
match self {
Self::None => 0,
#[cfg(feature = "lz4-compression")]
Self::Lz4 => 1,
Self::Brotli => 2,
Self::Snappy => 3,
#[cfg(feature = "zstd-compression")]
Self::Zstd => 4,
}
}
@@ -77,46 +75,10 @@ impl Decompressor {
decompressed.extend_from_slice(compressed);
Ok(())
}
Self::Lz4 => {
#[cfg(feature = "lz4-compression")]
{
super::compression_lz4_block::decompress(compressed, decompressed)
}
#[cfg(not(feature = "lz4-compression"))]
{
panic!("lz4-compression feature flag not activated");
}
}
Self::Brotli => {
#[cfg(feature = "brotli-compression")]
{
super::compression_brotli::decompress(compressed, decompressed)
}
#[cfg(not(feature = "brotli-compression"))]
{
panic!("brotli-compression feature flag not activated");
}
}
Self::Snappy => {
#[cfg(feature = "snappy-compression")]
{
super::compression_snap::decompress(compressed, decompressed)
}
#[cfg(not(feature = "snappy-compression"))]
{
panic!("snappy-compression feature flag not activated");
}
}
Self::Zstd => {
#[cfg(feature = "zstd-compression")]
{
super::compression_zstd_block::decompress(compressed, decompressed)
}
#[cfg(not(feature = "zstd-compression"))]
{
panic!("zstd-compression feature flag not activated");
}
}
#[cfg(feature = "lz4-compression")]
Self::Lz4 => super::compression_lz4_block::decompress(compressed, decompressed),
#[cfg(feature = "zstd-compression")]
Self::Zstd => super::compression_zstd_block::decompress(compressed, decompressed),
}
}
}
@@ -129,9 +91,9 @@ mod tests {
#[test]
fn compressor_decompressor_id_test() {
assert_eq!(Decompressor::from(Compressor::None), Decompressor::None);
#[cfg(feature = "lz4-compression")]
assert_eq!(Decompressor::from(Compressor::Lz4), Decompressor::Lz4);
assert_eq!(Decompressor::from(Compressor::Brotli), Decompressor::Brotli);
assert_eq!(Decompressor::from(Compressor::Snappy), Decompressor::Snappy);
#[cfg(feature = "zstd-compression")]
assert_eq!(
Decompressor::from(Compressor::Zstd(Default::default())),
Decompressor::Zstd

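One subtlety in `from_id`/`get_id` above: the numeric ids appear to be persisted with the store, and the removal keeps `Lz4 = 1` and `Zstd = 4` while retiring 2 (brotli) and 3 (snappy) rather than renumbering. A sketch of the resulting stable mapping; the helper is illustrative, not part of tantivy:

```rust
// Ids 2 and 3 stay reserved so an old index compressed with brotli or
// snappy panics with "unknown compressor id" instead of being silently
// decoded with the wrong algorithm.
fn decompressor_id(name: &str) -> Option<u8> {
    match name {
        "none" => Some(0),
        "lz4" => Some(1),
        // 2 = brotli (removed), 3 = snappy (removed)
        "zstd" => Some(4),
        _ => None,
    }
}

fn main() {
    assert_eq!(decompressor_id("zstd"), Some(4));
    assert_eq!(decompressor_id("snappy"), None);
}
```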
View File

@@ -4,8 +4,8 @@
//! order to be handled in the `Store`.
//!
//! Internally, documents (or rather their stored fields) are serialized to a buffer.
//! When the buffer exceeds `block_size` (defaults to 16K), the buffer is compressed using `brotli`,
//! `LZ4` or `snappy` and the resulting block is written to disk.
//! When the buffer exceeds `block_size` (defaults to 16K), the buffer is compressed
//! using LZ4 or Zstd and the resulting block is written to disk.
//!
//! One can then request for a specific `DocId`.
//! A skip list helps navigating to the right block,
@@ -48,12 +48,6 @@ pub(crate) const DOC_STORE_VERSION: u32 = 1;
#[cfg(feature = "lz4-compression")]
mod compression_lz4_block;
#[cfg(feature = "brotli-compression")]
mod compression_brotli;
#[cfg(feature = "snappy-compression")]
mod compression_snap;
#[cfg(feature = "zstd-compression")]
mod compression_zstd_block;
@@ -200,16 +194,6 @@ pub mod tests {
fn test_store_lz4_block() -> crate::Result<()> {
test_store(Compressor::Lz4, BLOCK_SIZE, true)
}
#[cfg(feature = "snappy-compression")]
#[test]
fn test_store_snap() -> crate::Result<()> {
test_store(Compressor::Snappy, BLOCK_SIZE, true)
}
#[cfg(feature = "brotli-compression")]
#[test]
fn test_store_brotli() -> crate::Result<()> {
test_store(Compressor::Brotli, BLOCK_SIZE, true)
}
#[cfg(feature = "zstd-compression")]
#[test]
@@ -261,8 +245,8 @@ pub mod tests {
Ok(())
}
#[cfg(feature = "snappy-compression")]
#[cfg(feature = "lz4-compression")]
#[cfg(feature = "zstd-compression")]
#[test]
fn test_merge_with_changed_compressor() -> crate::Result<()> {
let mut schema_builder = schema::Schema::builder();
@@ -294,7 +278,7 @@ pub mod tests {
);
// Change compressor, this disables stacking on merging
let index_settings = index.settings_mut();
index_settings.docstore_compression = Compressor::Snappy;
index_settings.docstore_compression = Compressor::Zstd(Default::default());
// Merging the segments
{
let segment_ids = index
@@ -316,7 +300,7 @@ pub mod tests {
LOREM.to_string()
);
}
assert_eq!(store.decompressor(), Decompressor::Snappy);
assert_eq!(store.decompressor(), Decompressor::Zstd);
Ok(())
}

View File

@@ -2,11 +2,22 @@ use std::mem;
use super::{Token, TokenFilter, TokenStream, Tokenizer};
/// This class converts alphabetic, numeric, and symbolic Unicode characters
/// `AsciiFoldingFilter` converts alphabetic, numeric, and symbolic Unicode characters
/// which are not in the first 127 ASCII characters (the "Basic Latin" Unicode
/// block) into their ASCII equivalents, if one exists.
#[derive(Clone)]
pub struct AsciiFoldingFilter;
/// If `preserve_original` is `true`, the filter emits both the original token and
/// the folded token at the same position when they differ.
#[derive(Clone, Default)]
pub struct AsciiFoldingFilter {
preserve_original: bool,
}
impl AsciiFoldingFilter {
/// Creates a new `AsciiFoldingFilter`.
pub fn new(preserve_original: bool) -> Self {
Self { preserve_original }
}
}
impl TokenFilter for AsciiFoldingFilter {
type Tokenizer<T: Tokenizer> = AsciiFoldingFilterWrapper<T>;
@@ -14,6 +25,7 @@ impl TokenFilter for AsciiFoldingFilter {
fn transform<T: Tokenizer>(self, tokenizer: T) -> AsciiFoldingFilterWrapper<T> {
AsciiFoldingFilterWrapper {
tokenizer,
preserve_original: self.preserve_original,
buffer: String::new(),
}
}
@@ -22,6 +34,7 @@ impl TokenFilter for AsciiFoldingFilter {
#[derive(Clone)]
pub struct AsciiFoldingFilterWrapper<T> {
tokenizer: T,
preserve_original: bool,
buffer: String,
}
@@ -31,6 +44,8 @@ impl<T: Tokenizer> Tokenizer for AsciiFoldingFilterWrapper<T> {
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
self.buffer.clear();
AsciiFoldingFilterTokenStream {
preserve_original: self.preserve_original,
emit_folded_token_on_advance: false,
buffer: &mut self.buffer,
tail: self.tokenizer.token_stream(text),
}
@@ -38,18 +53,31 @@ impl<T: Tokenizer> Tokenizer for AsciiFoldingFilterWrapper<T> {
}
pub struct AsciiFoldingFilterTokenStream<'a, T> {
preserve_original: bool,
emit_folded_token_on_advance: bool,
buffer: &'a mut String,
tail: T,
}
impl<'a, T: TokenStream> TokenStream for AsciiFoldingFilterTokenStream<'a, T> {
fn advance(&mut self) -> bool {
if self.emit_folded_token_on_advance {
self.emit_folded_token_on_advance = false;
mem::swap(&mut self.tail.token_mut().text, self.buffer);
return true;
}
if !self.tail.advance() {
return false;
}
let mut text_has_changed = false;
if !self.token_mut().text.is_ascii() {
// Fold only if the text is not already ASCII.
to_ascii(&self.tail.token().text, self.buffer);
text_has_changed = to_ascii(&self.tail.token().text, self.buffer);
}
// If `preserve_original` is true and the original differs from the folded text,
// the folded token will be emitted on the next call to `advance`.
if self.preserve_original && text_has_changed {
self.emit_folded_token_on_advance = true;
} else if text_has_changed {
mem::swap(&mut self.tail.token_mut().text, self.buffer);
}
true
@@ -1546,17 +1574,21 @@ fn fold_non_ascii_char(c: char) -> Option<&'static str> {
}
}
// Writes the folded version of the text to the `output`.
// Returns true if the text was modified.
// https://github.com/apache/lucene-solr/blob/master/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.java#L187
fn to_ascii(text: &str, output: &mut String) {
fn to_ascii(text: &str, output: &mut String) -> bool {
output.clear();
let mut is_text_modified = false;
for c in text.chars() {
if let Some(folded) = fold_non_ascii_char(c) {
output.push_str(folded);
is_text_modified = true;
} else {
output.push(c);
}
}
is_text_modified
}
#[cfg(test)]
@@ -1568,20 +1600,26 @@ mod tests {
#[test]
fn test_ascii_folding() {
assert_eq!(&folding_helper("Ràmon"), &["Ramon"]);
assert_eq!(&folding_helper("accentué"), &["accentue"]);
assert_eq!(&folding_helper("âäàéè"), &["aaaee"]);
assert_eq!(&folding_helper("Ràmon", false), &["Ramon"]);
assert_eq!(&folding_helper("accentué", false), &["accentue"]);
assert_eq!(&folding_helper("âäàéè", false), &["aaaee"]);
assert_eq!(
&folding_helper("Ràmon âäàéè ", true),
&["Ràmon", "Ramon", "âäàéè", "aaaee"]
);
assert_eq!(&folding_helper("Ràmon", true), &["Ràmon", "Ramon"]);
}
#[test]
fn test_no_change() {
assert_eq!(&folding_helper("Usagi"), &["Usagi"]);
assert_eq!(&folding_helper("Usagi", false), &["Usagi"]);
assert_eq!(&folding_helper("Usagi", true), &["Usagi"]);
}
fn folding_helper(text: &str) -> Vec<String> {
fn folding_helper(text: &str, preserve_original: bool) -> Vec<String> {
let mut tokens = Vec::new();
TextAnalyzer::builder(SimpleTokenizer::default())
.filter(AsciiFoldingFilter)
.filter(AsciiFoldingFilter::new(preserve_original))
.build()
.token_stream(text)
.process(&mut |token| {
@@ -1592,7 +1630,7 @@ mod tests {
fn folding_using_raw_tokenizer_helper(text: &str) -> String {
let mut tokenizer = TextAnalyzer::builder(RawTokenizer::default())
.filter(AsciiFoldingFilter)
.filter(AsciiFoldingFilter::default())
.build();
let mut token_stream = tokenizer.token_stream(text);
token_stream.advance();
@@ -1634,7 +1672,7 @@ mod tests {
vec.extend(iter::repeat("y").take(2));
vec.extend(iter::repeat("fi").take(1));
vec.extend(iter::repeat("fl").take(1));
assert_eq!(folding_helper(latin1_string), vec);
assert_eq!(folding_helper(latin1_string, false), vec);
}
#[test]

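Putting the new option together, a minimal end-to-end sketch based on the API added in this diff (the original token is emitted first, then the folded one at the same position):

```rust
use tantivy::tokenizer::{AsciiFoldingFilter, SimpleTokenizer, TextAnalyzer};

fn main() {
    let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(AsciiFoldingFilter::new(true)) // preserve_original = true
        .build();

    let mut texts = Vec::new();
    analyzer
        .token_stream("Ràmon")
        .process(&mut |token| texts.push(token.text.clone()));

    // Non-ASCII input yields two tokens; pure ASCII input would yield one.
    assert_eq!(texts, vec!["Ràmon".to_string(), "Ramon".to_string()]);
}
```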
View File

@@ -86,6 +86,8 @@ impl TokenFilter for SplitCompoundWords {
SplitCompoundWordsFilter {
dict: self.dict,
inner: tokenizer,
cuts: Vec::new(),
parts: Vec::new(),
}
}
}
@@ -94,29 +96,33 @@ impl TokenFilter for SplitCompoundWords {
pub struct SplitCompoundWordsFilter<T> {
dict: AhoCorasick,
inner: T,
}
impl<T: Tokenizer> Tokenizer for SplitCompoundWordsFilter<T> {
type TokenStream<'a> = SplitCompoundWordsTokenStream<T::TokenStream<'a>>;
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
SplitCompoundWordsTokenStream {
dict: self.dict.clone(),
tail: self.inner.token_stream(text),
cuts: Vec::new(),
parts: Vec::new(),
}
}
}
pub struct SplitCompoundWordsTokenStream<T> {
dict: AhoCorasick,
tail: T,
cuts: Vec<usize>,
parts: Vec<Token>,
}
impl<T: TokenStream> SplitCompoundWordsTokenStream<T> {
impl<T: Tokenizer> Tokenizer for SplitCompoundWordsFilter<T> {
type TokenStream<'a> = SplitCompoundWordsTokenStream<'a, T::TokenStream<'a>>;
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
self.cuts.clear();
self.parts.clear();
SplitCompoundWordsTokenStream {
dict: self.dict.clone(),
tail: self.inner.token_stream(text),
cuts: &mut self.cuts,
parts: &mut self.parts,
}
}
}
pub struct SplitCompoundWordsTokenStream<'a, T> {
dict: AhoCorasick,
tail: T,
cuts: &'a mut Vec<usize>,
parts: &'a mut Vec<Token>,
}
impl<'a, T: TokenStream> SplitCompoundWordsTokenStream<'a, T> {
// Will use `self.cuts` to fill `self.parts` if `self.tail.token()`
// can fully be split into consecutive matches against `self.dict`.
fn split(&mut self) {
@@ -152,7 +158,7 @@ impl<T: TokenStream> SplitCompoundWordsTokenStream<T> {
}
}
impl<T: TokenStream> TokenStream for SplitCompoundWordsTokenStream<T> {
impl<'a, T: TokenStream> TokenStream for SplitCompoundWordsTokenStream<'a, T> {
fn advance(&mut self) -> bool {
self.parts.pop();
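
The allocation-removal trick above generalizes: a long-lived owner keeps the scratch buffers and each short-lived stream borrows them, so repeated `token_stream` calls reuse the same heap allocations. A stripped-down sketch of the ownership shape (names are illustrative):

```rust
// Owner lives across calls and keeps the scratch space.
struct Splitter {
    parts: Vec<String>,
}

// Worker borrows the scratch space for the duration of one stream.
struct SplitStream<'a> {
    parts: &'a mut Vec<String>,
}

impl Splitter {
    fn stream(&mut self) -> SplitStream<'_> {
        // Clearing keeps the Vec's capacity, so no reallocation on reuse.
        self.parts.clear();
        SplitStream { parts: &mut self.parts }
    }
}

fn main() {
    let mut splitter = Splitter { parts: Vec::with_capacity(8) };
    for _ in 0..3 {
        let stream = splitter.stream();
        stream.parts.push("token".to_string());
        // The Vec's buffer is retained across iterations.
    }
}
```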