Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2026-01-06 17:22:54 +00:00)

Compare commits: straightfo ... fmassot/ad (7 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | 36d585a402 |  |
|  | 90b9059a93 |  |
|  | 820f126075 |  |
|  | 7e6c4a1856 |  |
|  | 5fafe4b1ab |  |
|  | 1e7cd48cfa |  |
|  | 7f51d85bbd |  |
.github/workflows/test.yml (vendored): 2 changed lines
@@ -53,7 +53,7 @@ jobs:
     strategy:
       matrix:
         features: [
-          { label: "all", flags: "mmap,stopwords,brotli-compression,lz4-compression,snappy-compression,zstd-compression,failpoints" },
+          { label: "all", flags: "mmap,stopwords,lz4-compression,zstd-compression,failpoints" },
           { label: "quickwit", flags: "mmap,quickwit,failpoints" }
         ]
@@ -25,9 +25,7 @@ aho-corasick = "1.0"
 tantivy-fst = "0.4.0"
 memmap2 = { version = "0.7.1", optional = true }
 lz4_flex = { version = "0.11", default-features = false, optional = true }
-brotli = { version = "3.3.4", optional = true }
 zstd = { version = "0.12", optional = true, default-features = false }
-snap = { version = "1.0.5", optional = true }
 tempfile = { version = "3.3.0", optional = true }
 log = "0.4.16"
 serde = { version = "1.0.136", features = ["derive"] }
@@ -49,7 +47,7 @@ murmurhash32 = "0.3.0"
 time = { version = "0.3.10", features = ["serde-well-known"] }
 smallvec = "1.8.0"
 rayon = "1.5.2"
-lru = "0.10.0"
+lru = "0.11.0"
 fastdivide = "0.4.0"
 itertools = "0.11.0"
 measure_time = "0.8.2"
@@ -107,9 +105,7 @@ default = ["mmap", "stopwords", "lz4-compression"]
 mmap = ["fs4", "tempfile", "memmap2"]
 stopwords = []
 
-brotli-compression = ["brotli"]
 lz4-compression = ["lz4_flex"]
-snappy-compression = ["snap"]
 zstd-compression = ["zstd"]
 
 failpoints = ["fail", "fail/failpoints"]
@@ -44,7 +44,7 @@ Details about the benchmark can be found at this [repository](https://github.com
 - Single valued and multivalued u64, i64, and f64 fast fields (equivalent of doc values in Lucene)
 - `&[u8]` fast fields
 - Text, i64, u64, f64, dates, ip, bool, and hierarchical facet fields
-- Compressed document store (LZ4, Zstd, None, Brotli, Snap)
+- Compressed document store (LZ4, Zstd, None)
 - Range queries
 - Faceted search
 - Configurable indexing (optional term frequency and position indexing)
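For context on the feature list above, a minimal sketch of declaring a schema with a stored text field and a u64 fast field; the field names `title` and `count` are illustrative only and not part of this change set:

```rust
use tantivy::schema::{Schema, FAST, INDEXED, STORED, TEXT};

fn main() {
    // A stored, indexed text field plus an indexed u64 fast field
    // (the columnar "fast field" storage mentioned in the feature list).
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("title", TEXT | STORED);
    schema_builder.add_u64_field("count", INDEXED | FAST);
    let _schema = schema_builder.build();
}
```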
@@ -410,7 +410,9 @@ mod tests {
     use super::IndexMeta;
     use crate::core::index_meta::UntrackedIndexMeta;
     use crate::schema::{Schema, TEXT};
-    use crate::store::{Compressor, ZstdCompressor};
+    use crate::store::Compressor;
+    #[cfg(feature = "zstd-compression")]
+    use crate::store::ZstdCompressor;
     use crate::{IndexSettings, IndexSortByField, Order};
 
     #[test]
@@ -446,6 +448,7 @@ mod tests {
     }
 
     #[test]
+    #[cfg(feature = "zstd-compression")]
     fn test_serialize_metas_zstd_compressor() {
         let schema = {
             let mut schema_builder = Schema::builder();
@@ -482,13 +485,14 @@ mod tests {
     }
 
     #[test]
+    #[cfg(all(feature = "lz4-compression", feature = "zstd-compression"))]
     fn test_serialize_metas_invalid_comp() {
         let json = r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zsstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
 
         let err = serde_json::from_str::<UntrackedIndexMeta>(json).unwrap_err();
         assert_eq!(
             err.to_string(),
-            "unknown variant `zsstd`, expected one of `none`, `lz4`, `brotli`, `snappy`, `zstd`, \
+            "unknown variant `zsstd`, expected one of `none`, `lz4`, `zstd`, \
              `zstd(compression_level=5)` at line 1 column 96"
                 .to_string()
         );
@@ -502,6 +506,20 @@ mod tests {
         );
     }
 
+    #[test]
+    #[cfg(not(feature = "zstd-compression"))]
+    fn test_serialize_metas_unsupported_comp() {
+        let json = r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
+
+        let err = serde_json::from_str::<UntrackedIndexMeta>(json).unwrap_err();
+        assert_eq!(
+            err.to_string(),
+            "unsupported variant `zstd`, please enable Tantivy's `zstd-compression` feature at \
+             line 1 column 95"
+                .to_string()
+        );
+    }
+
     #[test]
     #[cfg(feature = "lz4-compression")]
     fn test_index_settings_default() {
@@ -191,7 +191,7 @@ pub use crate::schema::{DateOptions, DateTimePrecision, Document, Term};
 /// Index format version.
 const INDEX_FORMAT_VERSION: u32 = 5;
 
-#[cfg(unix)]
+#[cfg(all(feature = "mmap", unix))]
 pub use memmap2::Advice;
 
 /// Structure version for the index.
@@ -72,6 +72,14 @@ impl Query for TermSetQuery {
     fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
         Ok(Box::new(self.specialized_weight(enable_scoring.schema())?))
     }
+
+    fn query_terms<'a>(&'a self, visitor: &mut dyn FnMut(&'a Term, bool)) {
+        for terms in self.terms_map.values() {
+            for term in terms {
+                visitor(term, false);
+            }
+        }
+    }
 }
 
 struct SetDfaWrapper(Map<Vec<u8>>);
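Not part of the patch, but a sketch of how the new `query_terms` visitor can be used from calling code; the field name and term values are made up:

```rust
use tantivy::query::{Query, TermSetQuery};
use tantivy::schema::{Schema, TEXT};
use tantivy::Term;

fn main() {
    let mut schema_builder = Schema::builder();
    let body = schema_builder.add_text_field("body", TEXT);
    let _schema = schema_builder.build();

    let query = TermSetQuery::new(vec![
        Term::from_field_text(body, "apple"),
        Term::from_field_text(body, "banana"),
    ]);

    // The visitor receives each term plus a flag indicating whether positions
    // are needed; per the hunk above, TermSetQuery passes `false` for every term.
    let mut terms = Vec::new();
    query.query_terms(&mut |term: &Term, _need_positions: bool| {
        terms.push(term.clone());
    });
    assert_eq!(terms.len(), 2);
}
```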
@@ -1,19 +0,0 @@
-use std::io;
-
-#[inline]
-pub fn compress(mut uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
-    let params = brotli::enc::BrotliEncoderParams {
-        quality: 5,
-        ..Default::default()
-    };
-    compressed.clear();
-    brotli::BrotliCompress(&mut uncompressed, compressed, &params)?;
-    Ok(())
-}
-
-#[inline]
-pub fn decompress(mut compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
-    decompressed.clear();
-    brotli::BrotliDecompress(&mut compressed, decompressed)?;
-    Ok(())
-}
@@ -1,17 +0,0 @@
-use std::io::{self, Read, Write};
-
-#[inline]
-pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
-    compressed.clear();
-    let mut encoder = snap::write::FrameEncoder::new(compressed);
-    encoder.write_all(uncompressed)?;
-    encoder.flush()?;
-    Ok(())
-}
-
-#[inline]
-pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
-    decompressed.clear();
-    snap::read::FrameDecoder::new(compressed).read_to_end(decompressed)?;
-    Ok(())
-}
@@ -17,12 +17,10 @@ pub enum Compressor {
     /// No compression
     None,
     /// Use the lz4 compressor (block format)
+    #[cfg(feature = "lz4-compression")]
     Lz4,
-    /// Use the brotli compressor
-    Brotli,
-    /// Use the snap compressor
-    Snappy,
     /// Use the zstd compressor
+    #[cfg(feature = "zstd-compression")]
     Zstd(ZstdCompressor),
 }
 
@@ -31,9 +29,9 @@ impl Serialize for Compressor {
     where S: serde::Serializer {
         match *self {
             Compressor::None => serializer.serialize_str("none"),
+            #[cfg(feature = "lz4-compression")]
             Compressor::Lz4 => serializer.serialize_str("lz4"),
-            Compressor::Brotli => serializer.serialize_str("brotli"),
-            Compressor::Snappy => serializer.serialize_str("snappy"),
+            #[cfg(feature = "zstd-compression")]
             Compressor::Zstd(zstd) => serializer.serialize_str(&zstd.ser_to_string()),
         }
     }
@@ -45,27 +43,38 @@ impl<'de> Deserialize<'de> for Compressor {
         let buf = String::deserialize(deserializer)?;
         let compressor = match buf.as_str() {
             "none" => Compressor::None,
+            #[cfg(feature = "lz4-compression")]
             "lz4" => Compressor::Lz4,
-            "brotli" => Compressor::Brotli,
-            "snappy" => Compressor::Snappy,
+            #[cfg(not(feature = "lz4-compression"))]
+            "lz4" => {
+                return Err(serde::de::Error::custom(
+                    "unsupported variant `lz4`, please enable Tantivy's `lz4-compression` feature",
+                ))
+            }
+            #[cfg(feature = "zstd-compression")]
+            _ if buf.starts_with("zstd") => Compressor::Zstd(
+                ZstdCompressor::deser_from_str(&buf).map_err(serde::de::Error::custom)?,
+            ),
+            #[cfg(not(feature = "zstd-compression"))]
+            _ if buf.starts_with("zstd") => {
+                return Err(serde::de::Error::custom(
+                    "unsupported variant `zstd`, please enable Tantivy's `zstd-compression` \
+                     feature",
+                ))
+            }
             _ => {
-                if buf.starts_with("zstd") {
-                    Compressor::Zstd(
-                        ZstdCompressor::deser_from_str(&buf).map_err(serde::de::Error::custom)?,
-                    )
-                } else {
-                    return Err(serde::de::Error::unknown_variant(
-                        &buf,
-                        &[
-                            "none",
-                            "lz4",
-                            "brotli",
-                            "snappy",
-                            "zstd",
-                            "zstd(compression_level=5)",
-                        ],
-                    ));
-                }
+                return Err(serde::de::Error::unknown_variant(
+                    &buf,
+                    &[
+                        "none",
+                        #[cfg(feature = "lz4-compression")]
+                        "lz4",
+                        #[cfg(feature = "zstd-compression")]
+                        "zstd",
+                        #[cfg(feature = "zstd-compression")]
+                        "zstd(compression_level=5)",
+                    ],
+                ));
             }
         };
 
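A sketch of what this `Deserialize` implementation accepts, assuming both the `lz4-compression` and `zstd-compression` features are enabled and `serde_json` is available as a dependency; this is not part of the patch:

```rust
use tantivy::store::Compressor;

fn main() {
    // Plain variant names are accepted as JSON strings and round-trip.
    let lz4: Compressor = serde_json::from_str("\"lz4\"").unwrap();
    assert_eq!(serde_json::to_string(&lz4).unwrap(), "\"lz4\"");

    // Zstd optionally takes an explicit compression level.
    let _zstd: Compressor = serde_json::from_str("\"zstd(compression_level=5)\"").unwrap();

    // Anything else is rejected with an `unknown variant` error,
    // as exercised by the metas tests earlier in this diff.
    assert!(serde_json::from_str::<Compressor>("\"zsstd\"").is_err());
}
```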
@@ -127,18 +136,15 @@ impl ZstdCompressor {
 }
 
 impl Default for Compressor {
+    #[allow(unreachable_code)]
     fn default() -> Self {
-        if cfg!(feature = "lz4-compression") {
-            Compressor::Lz4
-        } else if cfg!(feature = "brotli-compression") {
-            Compressor::Brotli
-        } else if cfg!(feature = "snappy-compression") {
-            Compressor::Snappy
-        } else if cfg!(feature = "zstd-compression") {
-            Compressor::Zstd(ZstdCompressor::default())
-        } else {
-            Compressor::None
-        }
+        #[cfg(feature = "lz4-compression")]
+        return Compressor::Lz4;
+
+        #[cfg(feature = "zstd-compression")]
+        return Compressor::Zstd(ZstdCompressor::default());
+
+        Compressor::None
     }
 }
 
@@ -155,50 +161,14 @@ impl Compressor {
                 compressed.extend_from_slice(uncompressed);
                 Ok(())
             }
-            Self::Lz4 => {
-                #[cfg(feature = "lz4-compression")]
-                {
-                    super::compression_lz4_block::compress(uncompressed, compressed)
-                }
-                #[cfg(not(feature = "lz4-compression"))]
-                {
-                    panic!("lz4-compression feature flag not activated");
-                }
-            }
-            Self::Brotli => {
-                #[cfg(feature = "brotli-compression")]
-                {
-                    super::compression_brotli::compress(uncompressed, compressed)
-                }
-                #[cfg(not(feature = "brotli-compression"))]
-                {
-                    panic!("brotli-compression-compression feature flag not activated");
-                }
-            }
-            Self::Snappy => {
-                #[cfg(feature = "snappy-compression")]
-                {
-                    super::compression_snap::compress(uncompressed, compressed)
-                }
-                #[cfg(not(feature = "snappy-compression"))]
-                {
-                    panic!("snappy-compression feature flag not activated");
-                }
-            }
-            Self::Zstd(_zstd_compressor) => {
-                #[cfg(feature = "zstd-compression")]
-                {
-                    super::compression_zstd_block::compress(
-                        uncompressed,
-                        compressed,
-                        _zstd_compressor.compression_level,
-                    )
-                }
-                #[cfg(not(feature = "zstd-compression"))]
-                {
-                    panic!("zstd-compression feature flag not activated");
-                }
-            }
+            #[cfg(feature = "lz4-compression")]
+            Self::Lz4 => super::compression_lz4_block::compress(uncompressed, compressed),
+            #[cfg(feature = "zstd-compression")]
+            Self::Zstd(_zstd_compressor) => super::compression_zstd_block::compress(
+                uncompressed,
+                compressed,
+                _zstd_compressor.compression_level,
+            ),
         }
     }
 }
@@ -16,12 +16,10 @@ pub enum Decompressor {
     /// No compression
     None,
     /// Use the lz4 decompressor (block format)
+    #[cfg(feature = "lz4-compression")]
    Lz4,
-    /// Use the brotli decompressor
-    Brotli,
-    /// Use the snap decompressor
-    Snappy,
     /// Use the zstd decompressor
+    #[cfg(feature = "zstd-compression")]
     Zstd,
 }
 
@@ -29,9 +27,9 @@ impl From<Compressor> for Decompressor {
     fn from(compressor: Compressor) -> Self {
         match compressor {
             Compressor::None => Decompressor::None,
+            #[cfg(feature = "lz4-compression")]
             Compressor::Lz4 => Decompressor::Lz4,
-            Compressor::Brotli => Decompressor::Brotli,
-            Compressor::Snappy => Decompressor::Snappy,
+            #[cfg(feature = "zstd-compression")]
             Compressor::Zstd(_) => Decompressor::Zstd,
         }
     }
@@ -41,9 +39,9 @@ impl Decompressor {
     pub(crate) fn from_id(id: u8) -> Decompressor {
         match id {
             0 => Decompressor::None,
+            #[cfg(feature = "lz4-compression")]
             1 => Decompressor::Lz4,
-            2 => Decompressor::Brotli,
-            3 => Decompressor::Snappy,
+            #[cfg(feature = "zstd-compression")]
             4 => Decompressor::Zstd,
             _ => panic!("unknown compressor id {id:?}"),
         }
@@ -52,9 +50,9 @@ impl Decompressor {
     pub(crate) fn get_id(&self) -> u8 {
         match self {
             Self::None => 0,
+            #[cfg(feature = "lz4-compression")]
             Self::Lz4 => 1,
-            Self::Brotli => 2,
-            Self::Snappy => 3,
+            #[cfg(feature = "zstd-compression")]
             Self::Zstd => 4,
         }
     }
@@ -77,46 +75,10 @@ impl Decompressor {
                 decompressed.extend_from_slice(compressed);
                 Ok(())
             }
-            Self::Lz4 => {
-                #[cfg(feature = "lz4-compression")]
-                {
-                    super::compression_lz4_block::decompress(compressed, decompressed)
-                }
-                #[cfg(not(feature = "lz4-compression"))]
-                {
-                    panic!("lz4-compression feature flag not activated");
-                }
-            }
-            Self::Brotli => {
-                #[cfg(feature = "brotli-compression")]
-                {
-                    super::compression_brotli::decompress(compressed, decompressed)
-                }
-                #[cfg(not(feature = "brotli-compression"))]
-                {
-                    panic!("brotli-compression feature flag not activated");
-                }
-            }
-            Self::Snappy => {
-                #[cfg(feature = "snappy-compression")]
-                {
-                    super::compression_snap::decompress(compressed, decompressed)
-                }
-                #[cfg(not(feature = "snappy-compression"))]
-                {
-                    panic!("snappy-compression feature flag not activated");
-                }
-            }
-            Self::Zstd => {
-                #[cfg(feature = "zstd-compression")]
-                {
-                    super::compression_zstd_block::decompress(compressed, decompressed)
-                }
-                #[cfg(not(feature = "zstd-compression"))]
-                {
-                    panic!("zstd-compression feature flag not activated");
-                }
-            }
+            #[cfg(feature = "lz4-compression")]
+            Self::Lz4 => super::compression_lz4_block::decompress(compressed, decompressed),
+            #[cfg(feature = "zstd-compression")]
+            Self::Zstd => super::compression_zstd_block::decompress(compressed, decompressed),
         }
     }
 }
@@ -129,9 +91,9 @@ mod tests {
     #[test]
     fn compressor_decompressor_id_test() {
        assert_eq!(Decompressor::from(Compressor::None), Decompressor::None);
+        #[cfg(feature = "lz4-compression")]
        assert_eq!(Decompressor::from(Compressor::Lz4), Decompressor::Lz4);
-        assert_eq!(Decompressor::from(Compressor::Brotli), Decompressor::Brotli);
-        assert_eq!(Decompressor::from(Compressor::Snappy), Decompressor::Snappy);
+        #[cfg(feature = "zstd-compression")]
        assert_eq!(
            Decompressor::from(Compressor::Zstd(Default::default())),
            Decompressor::Zstd
@@ -4,8 +4,8 @@
 //! order to be handled in the `Store`.
 //!
 //! Internally, documents (or rather their stored fields) are serialized to a buffer.
-//! When the buffer exceeds `block_size` (defaults to 16K), the buffer is compressed using `brotli`,
-//! `LZ4` or `snappy` and the resulting block is written to disk.
+//! When the buffer exceeds `block_size` (defaults to 16K), the buffer is compressed
+//! using LZ4 or Zstd and the resulting block is written to disk.
 //!
 //! One can then request for a specific `DocId`.
 //! A skip list helps navigating to the right block,
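To connect the module docs above to configuration: a hypothetical sketch of picking the doc store compressor and block size through `IndexSettings`. The field names follow the metas JSON shown in the tests; the `Index::builder()` chain is an assumption about the public API and is not part of this patch, and the zstd compressor requires the non-default `zstd-compression` feature:

```rust
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::store::{Compressor, ZstdCompressor};
use tantivy::{Index, IndexSettings};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("body", TEXT | STORED);
    let schema = schema_builder.build();

    // With this change set, only `none`, `lz4`, and `zstd(...)` remain as choices.
    let settings = IndexSettings {
        docstore_compression: Compressor::Zstd(ZstdCompressor::default()),
        docstore_blocksize: 16_384, // roughly the 16K default mentioned above
        ..Default::default()
    };

    let _index = Index::builder()
        .schema(schema)
        .settings(settings)
        .create_in_ram()?;
    Ok(())
}
```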
@@ -48,12 +48,6 @@ pub(crate) const DOC_STORE_VERSION: u32 = 1;
 #[cfg(feature = "lz4-compression")]
 mod compression_lz4_block;
 
-#[cfg(feature = "brotli-compression")]
-mod compression_brotli;
-
-#[cfg(feature = "snappy-compression")]
-mod compression_snap;
-
 #[cfg(feature = "zstd-compression")]
 mod compression_zstd_block;
 
@@ -200,16 +194,6 @@ pub mod tests {
     fn test_store_lz4_block() -> crate::Result<()> {
         test_store(Compressor::Lz4, BLOCK_SIZE, true)
     }
-    #[cfg(feature = "snappy-compression")]
-    #[test]
-    fn test_store_snap() -> crate::Result<()> {
-        test_store(Compressor::Snappy, BLOCK_SIZE, true)
-    }
-    #[cfg(feature = "brotli-compression")]
-    #[test]
-    fn test_store_brotli() -> crate::Result<()> {
-        test_store(Compressor::Brotli, BLOCK_SIZE, true)
-    }
 
     #[cfg(feature = "zstd-compression")]
     #[test]
@@ -261,8 +245,8 @@ pub mod tests {
         Ok(())
     }
 
-    #[cfg(feature = "snappy-compression")]
     #[cfg(feature = "lz4-compression")]
+    #[cfg(feature = "zstd-compression")]
     #[test]
     fn test_merge_with_changed_compressor() -> crate::Result<()> {
         let mut schema_builder = schema::Schema::builder();
@@ -294,7 +278,7 @@ pub mod tests {
         );
         // Change compressor, this disables stacking on merging
         let index_settings = index.settings_mut();
-        index_settings.docstore_compression = Compressor::Snappy;
+        index_settings.docstore_compression = Compressor::Zstd(Default::default());
         // Merging the segments
         {
             let segment_ids = index
@@ -316,7 +300,7 @@ pub mod tests {
                 LOREM.to_string()
             );
         }
-        assert_eq!(store.decompressor(), Decompressor::Snappy);
+        assert_eq!(store.decompressor(), Decompressor::Zstd);
 
         Ok(())
     }
@@ -2,11 +2,22 @@ use std::mem;
 
 use super::{Token, TokenFilter, TokenStream, Tokenizer};
 
-/// This class converts alphabetic, numeric, and symbolic Unicode characters
+/// `AsciiFoldingFilter` converts alphabetic, numeric, and symbolic Unicode characters
 /// which are not in the first 127 ASCII characters (the "Basic Latin" Unicode
 /// block) into their ASCII equivalents, if one exists.
-#[derive(Clone)]
-pub struct AsciiFoldingFilter;
+/// If `preserve_original` is `true`, the filter emits both original token and
+/// folded token with the same position if tokens are different.
+#[derive(Clone, Default)]
+pub struct AsciiFoldingFilter {
+    preserve_original: bool,
+}
+
+impl AsciiFoldingFilter {
+    /// Creates a new `AsciiFoldingFilter`.
+    pub fn new(preserve_original: bool) -> Self {
+        Self { preserve_original }
+    }
+}
 
 impl TokenFilter for AsciiFoldingFilter {
     type Tokenizer<T: Tokenizer> = AsciiFoldingFilterWrapper<T>;
@@ -14,6 +25,7 @@ impl TokenFilter for AsciiFoldingFilter {
     fn transform<T: Tokenizer>(self, tokenizer: T) -> AsciiFoldingFilterWrapper<T> {
         AsciiFoldingFilterWrapper {
             tokenizer,
+            preserve_original: self.preserve_original,
             buffer: String::new(),
         }
     }
@@ -22,6 +34,7 @@ impl TokenFilter for AsciiFoldingFilter {
 #[derive(Clone)]
 pub struct AsciiFoldingFilterWrapper<T> {
     tokenizer: T,
+    preserve_original: bool,
     buffer: String,
 }
 
@@ -31,6 +44,8 @@ impl<T: Tokenizer> Tokenizer for AsciiFoldingFilterWrapper<T> {
     fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
         self.buffer.clear();
         AsciiFoldingFilterTokenStream {
+            preserve_original: self.preserve_original,
+            emit_folded_token_on_advance: false,
             buffer: &mut self.buffer,
             tail: self.tokenizer.token_stream(text),
         }
@@ -38,18 +53,31 @@ impl<T: Tokenizer> Tokenizer for AsciiFoldingFilterWrapper<T> {
 }
 
 pub struct AsciiFoldingFilterTokenStream<'a, T> {
+    preserve_original: bool,
+    emit_folded_token_on_advance: bool,
     buffer: &'a mut String,
     tail: T,
 }
 
 impl<'a, T: TokenStream> TokenStream for AsciiFoldingFilterTokenStream<'a, T> {
     fn advance(&mut self) -> bool {
+        if self.emit_folded_token_on_advance {
+            self.emit_folded_token_on_advance = false;
+            mem::swap(&mut self.tail.token_mut().text, self.buffer);
+            return true;
+        }
         if !self.tail.advance() {
             return false;
         }
+        let mut text_has_changed = false;
         if !self.token_mut().text.is_ascii() {
             // ignore its already ascii
-            to_ascii(&self.tail.token().text, self.buffer);
-            mem::swap(&mut self.tail.token_mut().text, self.buffer);
+            text_has_changed = to_ascii(&self.tail.token().text, self.buffer);
         }
+        // If preserve original is true and the original is different from the folded text,
+        // the folded token will be emitted on the next call to `advance`.
+        if self.preserve_original && text_has_changed {
+            self.emit_folded_token_on_advance = true;
+        } else if text_has_changed {
+            mem::swap(&mut self.tail.token_mut().text, self.buffer);
+        }
         true
@@ -1546,17 +1574,21 @@ fn fold_non_ascii_char(c: char) -> Option<&'static str> {
     }
 }
 
 // Writes the folded version of the text to the `output`.
+// Returns true if the text was modified.
 // https://github.com/apache/lucene-solr/blob/master/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.java#L187
-fn to_ascii(text: &str, output: &mut String) {
+fn to_ascii(text: &str, output: &mut String) -> bool {
     output.clear();
 
+    let mut is_text_modified = false;
     for c in text.chars() {
         if let Some(folded) = fold_non_ascii_char(c) {
             output.push_str(folded);
+            is_text_modified = true;
         } else {
             output.push(c);
         }
     }
+    is_text_modified
 }
 
 #[cfg(test)]
@@ -1568,20 +1600,26 @@ mod tests {
 
     #[test]
     fn test_ascii_folding() {
-        assert_eq!(&folding_helper("Ràmon"), &["Ramon"]);
-        assert_eq!(&folding_helper("accentué"), &["accentue"]);
-        assert_eq!(&folding_helper("âäàéè"), &["aaaee"]);
+        assert_eq!(&folding_helper("Ràmon", false), &["Ramon"]);
+        assert_eq!(&folding_helper("accentué", false), &["accentue"]);
+        assert_eq!(&folding_helper("âäàéè", false), &["aaaee"]);
+        assert_eq!(
+            &folding_helper("Ràmon âäàéè ", true),
+            &["Ràmon", "Ramon", "âäàéè", "aaaee"]
+        );
+        assert_eq!(&folding_helper("Ràmon", true), &["Ràmon", "Ramon"]);
     }
 
     #[test]
     fn test_no_change() {
-        assert_eq!(&folding_helper("Usagi"), &["Usagi"]);
+        assert_eq!(&folding_helper("Usagi", false), &["Usagi"]);
+        assert_eq!(&folding_helper("Usagi", true), &["Usagi"]);
     }
 
-    fn folding_helper(text: &str) -> Vec<String> {
+    fn folding_helper(text: &str, preserve_original: bool) -> Vec<String> {
         let mut tokens = Vec::new();
         TextAnalyzer::builder(SimpleTokenizer::default())
-            .filter(AsciiFoldingFilter)
+            .filter(AsciiFoldingFilter::new(preserve_original))
             .build()
             .token_stream(text)
             .process(&mut |token| {
@@ -1592,7 +1630,7 @@ mod tests {
 
     fn folding_using_raw_tokenizer_helper(text: &str) -> String {
         let mut tokenizer = TextAnalyzer::builder(RawTokenizer::default())
-            .filter(AsciiFoldingFilter)
+            .filter(AsciiFoldingFilter::default())
             .build();
         let mut token_stream = tokenizer.token_stream(text);
         token_stream.advance();
@@ -1634,7 +1672,7 @@ mod tests {
         vec.extend(iter::repeat("y").take(2));
         vec.extend(iter::repeat("fi").take(1));
         vec.extend(iter::repeat("fl").take(1));
-        assert_eq!(folding_helper(latin1_string), vec);
+        assert_eq!(folding_helper(latin1_string, false), vec);
     }
 
     #[test]
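A small usage sketch of the new `preserve_original` mode, mirroring the test helper above (not part of the patch):

```rust
use tantivy::tokenizer::{AsciiFoldingFilter, SimpleTokenizer, TextAnalyzer};

fn main() {
    // With preserve_original = true, the folded token is emitted in addition to
    // the original one (at the same position) whenever folding changes the text.
    let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(AsciiFoldingFilter::new(true))
        .build();

    let mut tokens = Vec::new();
    analyzer
        .token_stream("Ràmon")
        .process(&mut |token| tokens.push(token.text.clone()));

    assert_eq!(tokens, vec!["Ràmon".to_string(), "Ramon".to_string()]);
}
```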
@@ -86,6 +86,8 @@ impl TokenFilter for SplitCompoundWords {
         SplitCompoundWordsFilter {
             dict: self.dict,
             inner: tokenizer,
+            cuts: Vec::new(),
+            parts: Vec::new(),
         }
     }
 }
@@ -94,29 +96,33 @@ impl TokenFilter for SplitCompoundWords {
 pub struct SplitCompoundWordsFilter<T> {
     dict: AhoCorasick,
     inner: T,
-}
-
-impl<T: Tokenizer> Tokenizer for SplitCompoundWordsFilter<T> {
-    type TokenStream<'a> = SplitCompoundWordsTokenStream<T::TokenStream<'a>>;
-
-    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
-        SplitCompoundWordsTokenStream {
-            dict: self.dict.clone(),
-            tail: self.inner.token_stream(text),
-            cuts: Vec::new(),
-            parts: Vec::new(),
-        }
-    }
-}
-
-pub struct SplitCompoundWordsTokenStream<T> {
-    dict: AhoCorasick,
-    tail: T,
     cuts: Vec<usize>,
     parts: Vec<Token>,
 }
 
-impl<T: TokenStream> SplitCompoundWordsTokenStream<T> {
+impl<T: Tokenizer> Tokenizer for SplitCompoundWordsFilter<T> {
+    type TokenStream<'a> = SplitCompoundWordsTokenStream<'a, T::TokenStream<'a>>;
+
+    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
+        self.cuts.clear();
+        self.parts.clear();
+        SplitCompoundWordsTokenStream {
+            dict: self.dict.clone(),
+            tail: self.inner.token_stream(text),
+            cuts: &mut self.cuts,
+            parts: &mut self.parts,
+        }
+    }
+}
+
+pub struct SplitCompoundWordsTokenStream<'a, T> {
+    dict: AhoCorasick,
+    tail: T,
+    cuts: &'a mut Vec<usize>,
+    parts: &'a mut Vec<Token>,
+}
+
+impl<'a, T: TokenStream> SplitCompoundWordsTokenStream<'a, T> {
     // Will use `self.cuts` to fill `self.parts` if `self.tail.token()`
     // can fully be split into consecutive matches against `self.dict`.
     fn split(&mut self) {
@@ -152,7 +158,7 @@ impl<T: TokenStream> SplitCompoundWordsTokenStream<T> {
     }
 }
 
-impl<T: TokenStream> TokenStream for SplitCompoundWordsTokenStream<T> {
+impl<'a, T: TokenStream> TokenStream for SplitCompoundWordsTokenStream<'a, T> {
     fn advance(&mut self) -> bool {
         self.parts.pop();
 
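These hunks move the `cuts` and `parts` buffers from the token stream into the filter, so they are reused across `token_stream` calls rather than reallocated for every text. For completeness, a usage sketch of `SplitCompoundWords` itself; the `from_dictionary` constructor is taken from tantivy's documentation, not from this diff:

```rust
use tantivy::tokenizer::{SimpleTokenizer, SplitCompoundWords, TextAnalyzer};

fn main() {
    // Split "dampfschiff" into "dampf" + "schiff" when the whole token can be
    // covered by consecutive dictionary matches.
    let splitter = SplitCompoundWords::from_dictionary(["dampf", "schiff", "fahrt"])
        .expect("dictionary should build");

    let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(splitter)
        .build();

    let mut tokens = Vec::new();
    analyzer
        .token_stream("dampfschiff")
        .process(&mut |token| tokens.push(token.text.clone()));

    assert_eq!(tokens, vec!["dampf".to_string(), "schiff".to_string()]);
}
```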