Compare commits


7 Commits

Author SHA1 Message Date
François Massot
36d585a402 Perform only one `mem::swap`. 2023-07-17 16:24:48 +09:00
François Massot
90b9059a93 Add `preserve_original` option to the ASCII folding filter. 2023-07-17 16:15:59 +09:00
Adam Reichold
820f126075 Remove support for Brotli and Snappy compression (#2123)
LZ4 provides fast and simple compression, whereas Zstd is exceptionally flexible;
the additional support for Brotli and Snappy therefore does not add any distinct
functionality on top of those two algorithms.

Removing them reduces our maintenance burden and the number of choices
users have to make when setting up a project based on Tantivy.
2023-07-14 16:54:59 +09:00
Adam Reichold
7e6c4a1856 Include only built-in compression algorithms as enum variants (#2121)
* Include only built-in compression algorithms as enum variants

This enables compile-time errors when a compression algorithm is requested that
is not actually enabled for the current Cargo project. The cost is that indexes
using other compression algorithms cannot even be loaded (even though they
are not fully accessible in any case).

As a drive-by, this also fixes `--no-default-features` on `cfg(unix)`.

* Provide more instructive error messages for unsupported, but not unknown compression variants.
2023-07-14 11:02:49 +09:00
Adam Reichold
5fafe4b1ab Add missing query_terms impl for TermSetQuery. (#2120) 2023-07-13 14:54:29 +02:00
PSeitz
1e7cd48cfa remove allocations in split compound words (#2080)
* remove allocations in split compound words

* clear reused data
2023-07-13 09:43:02 +09:00
dependabot[bot]
7f51d85bbd Update lru requirement from 0.10.0 to 0.11.0 (#2117)
Updates the requirements on [lru](https://github.com/jeromefroe/lru-rs) to permit the latest version.
- [Changelog](https://github.com/jeromefroe/lru-rs/blob/master/CHANGELOG.md)
- [Commits](https://github.com/jeromefroe/lru-rs/compare/0.10.0...0.11.0)

---
updated-dependencies:
- dependency-name: lru
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-07-13 09:42:21 +09:00
13 changed files with 180 additions and 234 deletions

View File

@@ -53,7 +53,7 @@ jobs:
strategy:
matrix:
features: [
{ label: "all", flags: "mmap,stopwords,brotli-compression,lz4-compression,snappy-compression,zstd-compression,failpoints" },
{ label: "all", flags: "mmap,stopwords,lz4-compression,zstd-compression,failpoints" },
{ label: "quickwit", flags: "mmap,quickwit,failpoints" }
]

View File

@@ -25,9 +25,7 @@ aho-corasick = "1.0"
tantivy-fst = "0.4.0"
memmap2 = { version = "0.7.1", optional = true }
lz4_flex = { version = "0.11", default-features = false, optional = true }
brotli = { version = "3.3.4", optional = true }
zstd = { version = "0.12", optional = true, default-features = false }
snap = { version = "1.0.5", optional = true }
tempfile = { version = "3.3.0", optional = true }
log = "0.4.16"
serde = { version = "1.0.136", features = ["derive"] }
@@ -49,7 +47,7 @@ murmurhash32 = "0.3.0"
time = { version = "0.3.10", features = ["serde-well-known"] }
smallvec = "1.8.0"
rayon = "1.5.2"
lru = "0.10.0"
lru = "0.11.0"
fastdivide = "0.4.0"
itertools = "0.11.0"
measure_time = "0.8.2"
@@ -107,9 +105,7 @@ default = ["mmap", "stopwords", "lz4-compression"]
mmap = ["fs4", "tempfile", "memmap2"]
stopwords = []
brotli-compression = ["brotli"]
lz4-compression = ["lz4_flex"]
snappy-compression = ["snap"]
zstd-compression = ["zstd"]
failpoints = ["fail", "fail/failpoints"]

View File

@@ -44,7 +44,7 @@ Details about the benchmark can be found at this [repository](https://github.com
- Single valued and multivalued u64, i64, and f64 fast fields (equivalent of doc values in Lucene)
- `&[u8]` fast fields
- Text, i64, u64, f64, dates, ip, bool, and hierarchical facet fields
- Compressed document store (LZ4, Zstd, None, Brotli, Snap)
- Compressed document store (LZ4, Zstd, None)
- Range queries
- Faceted search
- Configurable indexing (optional term frequency and position indexing)
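
For context, the document-store compressor is chosen through `IndexSettings` at index creation time. A minimal sketch, assuming tantivy's `IndexBuilder` API (`Index::builder()`) and default values for the remaining settings fields:

```rust
use tantivy::schema::{Schema, TEXT};
use tantivy::store::{Compressor, ZstdCompressor};
use tantivy::{Index, IndexSettings};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("text", TEXT);
    let schema = schema_builder.build();

    // Requires the `zstd-compression` feature; with it disabled, the
    // `Compressor::Zstd` variant no longer exists and this fails to compile.
    let settings = IndexSettings {
        docstore_compression: Compressor::Zstd(ZstdCompressor::default()),
        ..IndexSettings::default()
    };

    let _index = Index::builder()
        .schema(schema)
        .settings(settings)
        .create_in_ram()?;
    Ok(())
}
```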

View File

@@ -410,7 +410,9 @@ mod tests {
use super::IndexMeta;
use crate::core::index_meta::UntrackedIndexMeta;
use crate::schema::{Schema, TEXT};
use crate::store::{Compressor, ZstdCompressor};
use crate::store::Compressor;
#[cfg(feature = "zstd-compression")]
use crate::store::ZstdCompressor;
use crate::{IndexSettings, IndexSortByField, Order};
#[test]
@@ -446,6 +448,7 @@ mod tests {
}
#[test]
#[cfg(feature = "zstd-compression")]
fn test_serialize_metas_zstd_compressor() {
let schema = {
let mut schema_builder = Schema::builder();
@@ -482,13 +485,14 @@ mod tests {
}
#[test]
#[cfg(all(feature = "lz4-compression", feature = "zstd-compression"))]
fn test_serialize_metas_invalid_comp() {
let json = r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zsstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
let err = serde_json::from_str::<UntrackedIndexMeta>(json).unwrap_err();
assert_eq!(
err.to_string(),
"unknown variant `zsstd`, expected one of `none`, `lz4`, `brotli`, `snappy`, `zstd`, \
"unknown variant `zsstd`, expected one of `none`, `lz4`, `zstd`, \
`zstd(compression_level=5)` at line 1 column 96"
.to_string()
);
@@ -502,6 +506,20 @@ mod tests {
);
}
#[test]
#[cfg(not(feature = "zstd-compression"))]
fn test_serialize_metas_unsupported_comp() {
let json = r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
let err = serde_json::from_str::<UntrackedIndexMeta>(json).unwrap_err();
assert_eq!(
err.to_string(),
"unsupported variant `zstd`, please enable Tantivy's `zstd-compression` feature at \
line 1 column 95"
.to_string()
);
}
#[test]
#[cfg(feature = "lz4-compression")]
fn test_index_settings_default() {

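The distinction these tests exercise can also be probed directly on `Compressor`, which (de)serializes from a plain string. A rough sketch, assuming `serde_json` is available and both the `lz4-compression` and `zstd-compression` features are enabled:

```rust
use tantivy::store::Compressor;

fn main() {
    // Known and enabled variants parse.
    let lz4: Compressor = serde_json::from_str("\"lz4\"").unwrap();
    assert!(matches!(lz4, Compressor::Lz4));

    // A misspelled variant is *unknown* and the error lists the valid choices.
    let err = serde_json::from_str::<Compressor>("\"zsstd\"").unwrap_err();
    assert!(err.to_string().starts_with("unknown variant `zsstd`"));

    // With `zstd-compression` disabled, "zstd" would instead produce the
    // *unsupported* variant message pointing at the missing Cargo feature.
}
```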
View File

@@ -191,7 +191,7 @@ pub use crate::schema::{DateOptions, DateTimePrecision, Document, Term};
/// Index format version.
const INDEX_FORMAT_VERSION: u32 = 5;
#[cfg(unix)]
#[cfg(all(feature = "mmap", unix))]
pub use memmap2::Advice;
/// Structure version for the index.

View File

@@ -72,6 +72,14 @@ impl Query for TermSetQuery {
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
Ok(Box::new(self.specialized_weight(enable_scoring.schema())?))
}
fn query_terms<'a>(&'a self, visitor: &mut dyn FnMut(&'a Term, bool)) {
for terms in self.terms_map.values() {
for term in terms {
visitor(term, false);
}
}
}
}
struct SetDfaWrapper(Map<Vec<u8>>);

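A small usage sketch for the new impl; `TermSetQuery::new` and `Term::from_field_text` are part of tantivy's public API, and the `bool` flag passed to the visitor (always `false` here) signals whether positions are required:

```rust
use tantivy::query::{Query, TermSetQuery};
use tantivy::schema::{Schema, TEXT};
use tantivy::Term;

fn main() {
    let mut schema_builder = Schema::builder();
    let field = schema_builder.add_text_field("body", TEXT);
    let _schema = schema_builder.build();

    let query = TermSetQuery::new(vec![
        Term::from_field_text(field, "apple"),
        Term::from_field_text(field, "pear"),
    ]);

    // Collect every term the query will touch.
    let mut terms = Vec::new();
    query.query_terms(&mut |term, _needs_positions| terms.push(term.clone()));
    assert_eq!(terms.len(), 2);
}
```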
View File

@@ -1,19 +0,0 @@
use std::io;
#[inline]
pub fn compress(mut uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
let params = brotli::enc::BrotliEncoderParams {
quality: 5,
..Default::default()
};
compressed.clear();
brotli::BrotliCompress(&mut uncompressed, compressed, &params)?;
Ok(())
}
#[inline]
pub fn decompress(mut compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
decompressed.clear();
brotli::BrotliDecompress(&mut compressed, decompressed)?;
Ok(())
}

View File

@@ -1,17 +0,0 @@
use std::io::{self, Read, Write};
#[inline]
pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
compressed.clear();
let mut encoder = snap::write::FrameEncoder::new(compressed);
encoder.write_all(uncompressed)?;
encoder.flush()?;
Ok(())
}
#[inline]
pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
decompressed.clear();
snap::read::FrameDecoder::new(compressed).read_to_end(decompressed)?;
Ok(())
}

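Both deleted helpers shared the codec shape that the surviving backends keep: compress and decompress into a caller-provided, reusable buffer. For comparison, a rough LZ4 equivalent using `lz4_flex`; tantivy's actual block codec differs in detail, so this is illustrative only:

```rust
use std::io;

pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
    compressed.clear();
    // Prepends the uncompressed length so decompression can size its output.
    compressed.extend_from_slice(&lz4_flex::compress_prepend_size(uncompressed));
    Ok(())
}

pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
    decompressed.clear();
    let out = lz4_flex::decompress_size_prepended(compressed)
        .map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err))?;
    decompressed.extend_from_slice(&out);
    Ok(())
}
```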
View File

@@ -17,12 +17,10 @@ pub enum Compressor {
/// No compression
None,
/// Use the lz4 compressor (block format)
#[cfg(feature = "lz4-compression")]
Lz4,
/// Use the brotli compressor
Brotli,
/// Use the snap compressor
Snappy,
/// Use the zstd compressor
#[cfg(feature = "zstd-compression")]
Zstd(ZstdCompressor),
}
@@ -31,9 +29,9 @@ impl Serialize for Compressor {
where S: serde::Serializer {
match *self {
Compressor::None => serializer.serialize_str("none"),
#[cfg(feature = "lz4-compression")]
Compressor::Lz4 => serializer.serialize_str("lz4"),
Compressor::Brotli => serializer.serialize_str("brotli"),
Compressor::Snappy => serializer.serialize_str("snappy"),
#[cfg(feature = "zstd-compression")]
Compressor::Zstd(zstd) => serializer.serialize_str(&zstd.ser_to_string()),
}
}
@@ -45,27 +43,38 @@ impl<'de> Deserialize<'de> for Compressor {
let buf = String::deserialize(deserializer)?;
let compressor = match buf.as_str() {
"none" => Compressor::None,
#[cfg(feature = "lz4-compression")]
"lz4" => Compressor::Lz4,
"brotli" => Compressor::Brotli,
"snappy" => Compressor::Snappy,
#[cfg(not(feature = "lz4-compression"))]
"lz4" => {
return Err(serde::de::Error::custom(
"unsupported variant `lz4`, please enable Tantivy's `lz4-compression` feature",
))
}
#[cfg(feature = "zstd-compression")]
_ if buf.starts_with("zstd") => Compressor::Zstd(
ZstdCompressor::deser_from_str(&buf).map_err(serde::de::Error::custom)?,
),
#[cfg(not(feature = "zstd-compression"))]
_ if buf.starts_with("zstd") => {
return Err(serde::de::Error::custom(
"unsupported variant `zstd`, please enable Tantivy's `zstd-compression` \
feature",
))
}
_ => {
if buf.starts_with("zstd") {
Compressor::Zstd(
ZstdCompressor::deser_from_str(&buf).map_err(serde::de::Error::custom)?,
)
} else {
return Err(serde::de::Error::unknown_variant(
&buf,
&[
"none",
"lz4",
"brotli",
"snappy",
"zstd",
"zstd(compression_level=5)",
],
));
}
return Err(serde::de::Error::unknown_variant(
&buf,
&[
"none",
#[cfg(feature = "lz4-compression")]
"lz4",
#[cfg(feature = "zstd-compression")]
"zstd",
#[cfg(feature = "zstd-compression")]
"zstd(compression_level=5)",
],
));
}
};
@@ -127,18 +136,15 @@ impl ZstdCompressor {
}
impl Default for Compressor {
#[allow(unreachable_code)]
fn default() -> Self {
if cfg!(feature = "lz4-compression") {
Compressor::Lz4
} else if cfg!(feature = "brotli-compression") {
Compressor::Brotli
} else if cfg!(feature = "snappy-compression") {
Compressor::Snappy
} else if cfg!(feature = "zstd-compression") {
Compressor::Zstd(ZstdCompressor::default())
} else {
Compressor::None
}
#[cfg(feature = "lz4-compression")]
return Compressor::Lz4;
#[cfg(feature = "zstd-compression")]
return Compressor::Zstd(ZstdCompressor::default());
Compressor::None
}
}
@@ -155,50 +161,14 @@ impl Compressor {
compressed.extend_from_slice(uncompressed);
Ok(())
}
Self::Lz4 => {
#[cfg(feature = "lz4-compression")]
{
super::compression_lz4_block::compress(uncompressed, compressed)
}
#[cfg(not(feature = "lz4-compression"))]
{
panic!("lz4-compression feature flag not activated");
}
}
Self::Brotli => {
#[cfg(feature = "brotli-compression")]
{
super::compression_brotli::compress(uncompressed, compressed)
}
#[cfg(not(feature = "brotli-compression"))]
{
panic!("brotli-compression-compression feature flag not activated");
}
}
Self::Snappy => {
#[cfg(feature = "snappy-compression")]
{
super::compression_snap::compress(uncompressed, compressed)
}
#[cfg(not(feature = "snappy-compression"))]
{
panic!("snappy-compression feature flag not activated");
}
}
Self::Zstd(_zstd_compressor) => {
#[cfg(feature = "zstd-compression")]
{
super::compression_zstd_block::compress(
uncompressed,
compressed,
_zstd_compressor.compression_level,
)
}
#[cfg(not(feature = "zstd-compression"))]
{
panic!("zstd-compression feature flag not activated");
}
}
#[cfg(feature = "lz4-compression")]
Self::Lz4 => super::compression_lz4_block::compress(uncompressed, compressed),
#[cfg(feature = "zstd-compression")]
Self::Zstd(_zstd_compressor) => super::compression_zstd_block::compress(
uncompressed,
compressed,
_zstd_compressor.compression_level,
),
}
}
}

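The pattern is worth spelling out in isolation, since it recurs in `Decompressor` below: variants exist only when their feature is compiled in, so using a disabled algorithm is a type error rather than a runtime panic. A self-contained sketch with hypothetical `lz4`/`zstd` features:

```rust
pub enum Codec {
    None,
    #[cfg(feature = "lz4")]
    Lz4,
    #[cfg(feature = "zstd")]
    Zstd { level: i32 },
}

impl Default for Codec {
    // Mirrors the new `Compressor::default`: the attribute silences the
    // warning for whichever code the active features make unreachable.
    #[allow(unreachable_code)]
    fn default() -> Self {
        #[cfg(feature = "lz4")]
        return Codec::Lz4;
        #[cfg(feature = "zstd")]
        return Codec::Zstd { level: 3 };
        Codec::None
    }
}

fn main() {
    // `Codec::Lz4` fails to compile unless the `lz4` feature is enabled:
    // the compile-time error the commit message describes.
    let codec = Codec::default();
    let _ = matches!(codec, Codec::None);
}
```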
View File

@@ -16,12 +16,10 @@ pub enum Decompressor {
/// No compression
None,
/// Use the lz4 decompressor (block format)
#[cfg(feature = "lz4-compression")]
Lz4,
/// Use the brotli decompressor
Brotli,
/// Use the snap decompressor
Snappy,
/// Use the zstd decompressor
#[cfg(feature = "zstd-compression")]
Zstd,
}
@@ -29,9 +27,9 @@ impl From<Compressor> for Decompressor {
fn from(compressor: Compressor) -> Self {
match compressor {
Compressor::None => Decompressor::None,
#[cfg(feature = "lz4-compression")]
Compressor::Lz4 => Decompressor::Lz4,
Compressor::Brotli => Decompressor::Brotli,
Compressor::Snappy => Decompressor::Snappy,
#[cfg(feature = "zstd-compression")]
Compressor::Zstd(_) => Decompressor::Zstd,
}
}
@@ -41,9 +39,9 @@ impl Decompressor {
pub(crate) fn from_id(id: u8) -> Decompressor {
match id {
0 => Decompressor::None,
#[cfg(feature = "lz4-compression")]
1 => Decompressor::Lz4,
2 => Decompressor::Brotli,
3 => Decompressor::Snappy,
#[cfg(feature = "zstd-compression")]
4 => Decompressor::Zstd,
_ => panic!("unknown compressor id {id:?}"),
}
@@ -52,9 +50,9 @@ impl Decompressor {
pub(crate) fn get_id(&self) -> u8 {
match self {
Self::None => 0,
#[cfg(feature = "lz4-compression")]
Self::Lz4 => 1,
Self::Brotli => 2,
Self::Snappy => 3,
#[cfg(feature = "zstd-compression")]
Self::Zstd => 4,
}
}
@@ -77,46 +75,10 @@ impl Decompressor {
decompressed.extend_from_slice(compressed);
Ok(())
}
Self::Lz4 => {
#[cfg(feature = "lz4-compression")]
{
super::compression_lz4_block::decompress(compressed, decompressed)
}
#[cfg(not(feature = "lz4-compression"))]
{
panic!("lz4-compression feature flag not activated");
}
}
Self::Brotli => {
#[cfg(feature = "brotli-compression")]
{
super::compression_brotli::decompress(compressed, decompressed)
}
#[cfg(not(feature = "brotli-compression"))]
{
panic!("brotli-compression feature flag not activated");
}
}
Self::Snappy => {
#[cfg(feature = "snappy-compression")]
{
super::compression_snap::decompress(compressed, decompressed)
}
#[cfg(not(feature = "snappy-compression"))]
{
panic!("snappy-compression feature flag not activated");
}
}
Self::Zstd => {
#[cfg(feature = "zstd-compression")]
{
super::compression_zstd_block::decompress(compressed, decompressed)
}
#[cfg(not(feature = "zstd-compression"))]
{
panic!("zstd-compression feature flag not activated");
}
}
#[cfg(feature = "lz4-compression")]
Self::Lz4 => super::compression_lz4_block::decompress(compressed, decompressed),
#[cfg(feature = "zstd-compression")]
Self::Zstd => super::compression_zstd_block::decompress(compressed, decompressed),
}
}
}
@@ -129,9 +91,9 @@ mod tests {
#[test]
fn compressor_decompressor_id_test() {
assert_eq!(Decompressor::from(Compressor::None), Decompressor::None);
#[cfg(feature = "lz4-compression")]
assert_eq!(Decompressor::from(Compressor::Lz4), Decompressor::Lz4);
assert_eq!(Decompressor::from(Compressor::Brotli), Decompressor::Brotli);
assert_eq!(Decompressor::from(Compressor::Snappy), Decompressor::Snappy);
#[cfg(feature = "zstd-compression")]
assert_eq!(
Decompressor::from(Compressor::Zstd(Default::default())),
Decompressor::Zstd

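One subtlety in `from_id`/`get_id` above: the numeric ids appear to be persisted with the store, and the removal keeps `Lz4 = 1` and `Zstd = 4` while retiring 2 (brotli) and 3 (snappy) rather than renumbering. A sketch of the resulting stable mapping; the helper is illustrative, not part of tantivy:

```rust
// Ids 2 and 3 stay reserved so an old index compressed with brotli or
// snappy panics with "unknown compressor id" instead of being silently
// decoded with the wrong algorithm.
fn decompressor_id(name: &str) -> Option<u8> {
    match name {
        "none" => Some(0),
        "lz4" => Some(1),
        // 2 = brotli (removed), 3 = snappy (removed)
        "zstd" => Some(4),
        _ => None,
    }
}

fn main() {
    assert_eq!(decompressor_id("zstd"), Some(4));
    assert_eq!(decompressor_id("snappy"), None);
}
```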
View File

@@ -4,8 +4,8 @@
//! order to be handled in the `Store`.
//!
//! Internally, documents (or rather their stored fields) are serialized to a buffer.
//! When the buffer exceeds `block_size` (defaults to 16K), the buffer is compressed using `brotli`,
//! `LZ4` or `snappy` and the resulting block is written to disk.
//! When the buffer exceeds `block_size` (defaults to 16K), the buffer is compressed
//! using LZ4 or Zstd and the resulting block is written to disk.
//!
//! One can then request for a specific `DocId`.
//! A skip list helps navigating to the right block,
@@ -48,12 +48,6 @@ pub(crate) const DOC_STORE_VERSION: u32 = 1;
#[cfg(feature = "lz4-compression")]
mod compression_lz4_block;
#[cfg(feature = "brotli-compression")]
mod compression_brotli;
#[cfg(feature = "snappy-compression")]
mod compression_snap;
#[cfg(feature = "zstd-compression")]
mod compression_zstd_block;
@@ -200,16 +194,6 @@ pub mod tests {
fn test_store_lz4_block() -> crate::Result<()> {
test_store(Compressor::Lz4, BLOCK_SIZE, true)
}
#[cfg(feature = "snappy-compression")]
#[test]
fn test_store_snap() -> crate::Result<()> {
test_store(Compressor::Snappy, BLOCK_SIZE, true)
}
#[cfg(feature = "brotli-compression")]
#[test]
fn test_store_brotli() -> crate::Result<()> {
test_store(Compressor::Brotli, BLOCK_SIZE, true)
}
#[cfg(feature = "zstd-compression")]
#[test]
@@ -261,8 +245,8 @@ pub mod tests {
Ok(())
}
#[cfg(feature = "snappy-compression")]
#[cfg(feature = "lz4-compression")]
#[cfg(feature = "zstd-compression")]
#[test]
fn test_merge_with_changed_compressor() -> crate::Result<()> {
let mut schema_builder = schema::Schema::builder();
@@ -294,7 +278,7 @@ pub mod tests {
);
// Change compressor, this disables stacking on merging
let index_settings = index.settings_mut();
index_settings.docstore_compression = Compressor::Snappy;
index_settings.docstore_compression = Compressor::Zstd(Default::default());
// Merging the segments
{
let segment_ids = index
@@ -316,7 +300,7 @@ pub mod tests {
LOREM.to_string()
);
}
assert_eq!(store.decompressor(), Decompressor::Snappy);
assert_eq!(store.decompressor(), Decompressor::Zstd);
Ok(())
}

View File

@@ -2,11 +2,22 @@ use std::mem;
use super::{Token, TokenFilter, TokenStream, Tokenizer};
/// This class converts alphabetic, numeric, and symbolic Unicode characters
/// `AsciiFoldingFilter` converts alphabetic, numeric, and symbolic Unicode characters
/// which are not in the first 127 ASCII characters (the "Basic Latin" Unicode
/// block) into their ASCII equivalents, if one exists.
#[derive(Clone)]
pub struct AsciiFoldingFilter;
/// If `preserve_original` is `true`, the filter emits both the original token and
/// the folded token at the same position when they differ.
#[derive(Clone, Default)]
pub struct AsciiFoldingFilter {
preserve_original: bool,
}
impl AsciiFoldingFilter {
/// Creates a new `AsciiFoldingFilter`.
pub fn new(preserve_original: bool) -> Self {
Self { preserve_original }
}
}
impl TokenFilter for AsciiFoldingFilter {
type Tokenizer<T: Tokenizer> = AsciiFoldingFilterWrapper<T>;
@@ -14,6 +25,7 @@ impl TokenFilter for AsciiFoldingFilter {
fn transform<T: Tokenizer>(self, tokenizer: T) -> AsciiFoldingFilterWrapper<T> {
AsciiFoldingFilterWrapper {
tokenizer,
preserve_original: self.preserve_original,
buffer: String::new(),
}
}
@@ -22,6 +34,7 @@ impl TokenFilter for AsciiFoldingFilter {
#[derive(Clone)]
pub struct AsciiFoldingFilterWrapper<T> {
tokenizer: T,
preserve_original: bool,
buffer: String,
}
@@ -31,6 +44,8 @@ impl<T: Tokenizer> Tokenizer for AsciiFoldingFilterWrapper<T> {
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
self.buffer.clear();
AsciiFoldingFilterTokenStream {
preserve_original: self.preserve_original,
emit_folded_token_on_advance: false,
buffer: &mut self.buffer,
tail: self.tokenizer.token_stream(text),
}
@@ -38,18 +53,31 @@ impl<T: Tokenizer> Tokenizer for AsciiFoldingFilterWrapper<T> {
}
pub struct AsciiFoldingFilterTokenStream<'a, T> {
preserve_original: bool,
emit_folded_token_on_advance: bool,
buffer: &'a mut String,
tail: T,
}
impl<'a, T: TokenStream> TokenStream for AsciiFoldingFilterTokenStream<'a, T> {
fn advance(&mut self) -> bool {
if self.emit_folded_token_on_advance {
self.emit_folded_token_on_advance = false;
mem::swap(&mut self.tail.token_mut().text, self.buffer);
return true;
}
if !self.tail.advance() {
return false;
}
let mut text_has_changed = false;
if !self.token_mut().text.is_ascii() {
// Fold only if the text is not already ASCII.
to_ascii(&self.tail.token().text, self.buffer);
text_has_changed = to_ascii(&self.tail.token().text, self.buffer);
}
// If `preserve_original` is true and the original differs from the folded text,
// the folded token will be emitted on the next call to `advance`.
if self.preserve_original && text_has_changed {
self.emit_folded_token_on_advance = true;
} else if text_has_changed {
mem::swap(&mut self.tail.token_mut().text, self.buffer);
}
true
@@ -1546,17 +1574,21 @@ fn fold_non_ascii_char(c: char) -> Option<&'static str> {
}
}
// Writes the folded version of the text to the `output`.
// Returns true if the text was modified.
// https://github.com/apache/lucene-solr/blob/master/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.java#L187
fn to_ascii(text: &str, output: &mut String) {
fn to_ascii(text: &str, output: &mut String) -> bool {
output.clear();
let mut is_text_modified = false;
for c in text.chars() {
if let Some(folded) = fold_non_ascii_char(c) {
output.push_str(folded);
is_text_modified = true;
} else {
output.push(c);
}
}
is_text_modified
}
#[cfg(test)]
@@ -1568,20 +1600,26 @@ mod tests {
#[test]
fn test_ascii_folding() {
assert_eq!(&folding_helper("Ràmon"), &["Ramon"]);
assert_eq!(&folding_helper("accentué"), &["accentue"]);
assert_eq!(&folding_helper("âäàéè"), &["aaaee"]);
assert_eq!(&folding_helper("Ràmon", false), &["Ramon"]);
assert_eq!(&folding_helper("accentué", false), &["accentue"]);
assert_eq!(&folding_helper("âäàéè", false), &["aaaee"]);
assert_eq!(
&folding_helper("Ràmon âäàéè ", true),
&["Ràmon", "Ramon", "âäàéè", "aaaee"]
);
assert_eq!(&folding_helper("Ràmon", true), &["Ràmon", "Ramon"]);
}
#[test]
fn test_no_change() {
assert_eq!(&folding_helper("Usagi"), &["Usagi"]);
assert_eq!(&folding_helper("Usagi", false), &["Usagi"]);
assert_eq!(&folding_helper("Usagi", true), &["Usagi"]);
}
fn folding_helper(text: &str) -> Vec<String> {
fn folding_helper(text: &str, preserve_original: bool) -> Vec<String> {
let mut tokens = Vec::new();
TextAnalyzer::builder(SimpleTokenizer::default())
.filter(AsciiFoldingFilter)
.filter(AsciiFoldingFilter::new(preserve_original))
.build()
.token_stream(text)
.process(&mut |token| {
@@ -1592,7 +1630,7 @@ mod tests {
fn folding_using_raw_tokenizer_helper(text: &str) -> String {
let mut tokenizer = TextAnalyzer::builder(RawTokenizer::default())
.filter(AsciiFoldingFilter)
.filter(AsciiFoldingFilter::default())
.build();
let mut token_stream = tokenizer.token_stream(text);
token_stream.advance();
@@ -1634,7 +1672,7 @@ mod tests {
vec.extend(iter::repeat("y").take(2));
vec.extend(iter::repeat("fi").take(1));
vec.extend(iter::repeat("fl").take(1));
assert_eq!(folding_helper(latin1_string), vec);
assert_eq!(folding_helper(latin1_string, false), vec);
}
#[test]

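Putting the new option together, a minimal end-to-end sketch based on the API added in this diff (the original token is emitted first, then the folded one at the same position):

```rust
use tantivy::tokenizer::{AsciiFoldingFilter, SimpleTokenizer, TextAnalyzer};

fn main() {
    let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(AsciiFoldingFilter::new(true)) // preserve_original = true
        .build();

    let mut texts = Vec::new();
    analyzer
        .token_stream("Ràmon")
        .process(&mut |token| texts.push(token.text.clone()));

    // Non-ASCII input yields two tokens; pure ASCII input would yield one.
    assert_eq!(texts, vec!["Ràmon".to_string(), "Ramon".to_string()]);
}
```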
View File

@@ -86,6 +86,8 @@ impl TokenFilter for SplitCompoundWords {
SplitCompoundWordsFilter {
dict: self.dict,
inner: tokenizer,
cuts: Vec::new(),
parts: Vec::new(),
}
}
}
@@ -94,29 +96,33 @@ impl TokenFilter for SplitCompoundWords {
pub struct SplitCompoundWordsFilter<T> {
dict: AhoCorasick,
inner: T,
}
impl<T: Tokenizer> Tokenizer for SplitCompoundWordsFilter<T> {
type TokenStream<'a> = SplitCompoundWordsTokenStream<T::TokenStream<'a>>;
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
SplitCompoundWordsTokenStream {
dict: self.dict.clone(),
tail: self.inner.token_stream(text),
cuts: Vec::new(),
parts: Vec::new(),
}
}
}
pub struct SplitCompoundWordsTokenStream<T> {
dict: AhoCorasick,
tail: T,
cuts: Vec<usize>,
parts: Vec<Token>,
}
impl<T: TokenStream> SplitCompoundWordsTokenStream<T> {
impl<T: Tokenizer> Tokenizer for SplitCompoundWordsFilter<T> {
type TokenStream<'a> = SplitCompoundWordsTokenStream<'a, T::TokenStream<'a>>;
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
self.cuts.clear();
self.parts.clear();
SplitCompoundWordsTokenStream {
dict: self.dict.clone(),
tail: self.inner.token_stream(text),
cuts: &mut self.cuts,
parts: &mut self.parts,
}
}
}
pub struct SplitCompoundWordsTokenStream<'a, T> {
dict: AhoCorasick,
tail: T,
cuts: &'a mut Vec<usize>,
parts: &'a mut Vec<Token>,
}
impl<'a, T: TokenStream> SplitCompoundWordsTokenStream<'a, T> {
// Will use `self.cuts` to fill `self.parts` if `self.tail.token()`
// can fully be split into consecutive matches against `self.dict`.
fn split(&mut self) {
@@ -152,7 +158,7 @@ impl<T: TokenStream> SplitCompoundWordsTokenStream<T> {
}
}
impl<T: TokenStream> TokenStream for SplitCompoundWordsTokenStream<T> {
impl<'a, T: TokenStream> TokenStream for SplitCompoundWordsTokenStream<'a, T> {
fn advance(&mut self) -> bool {
self.parts.pop();
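
The allocation-removal trick above generalizes: a long-lived owner keeps the scratch buffers and each short-lived stream borrows them, so repeated `token_stream` calls reuse the same heap allocations. A stripped-down sketch of the ownership shape (names are illustrative):

```rust
// Owner lives across calls and keeps the scratch space.
struct Splitter {
    parts: Vec<String>,
}

// Worker borrows the scratch space for the duration of one stream.
struct SplitStream<'a> {
    parts: &'a mut Vec<String>,
}

impl Splitter {
    fn stream(&mut self) -> SplitStream<'_> {
        // Clearing keeps the Vec's capacity, so no reallocation on reuse.
        self.parts.clear();
        SplitStream { parts: &mut self.parts }
    }
}

fn main() {
    let mut splitter = Splitter { parts: Vec::with_capacity(8) };
    for _ in 0..3 {
        let stream = splitter.stream();
        stream.parts.push("token".to_string());
        // The Vec's buffer is retained across iterations.
    }
}
```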