Mirror of https://github.com/quickwit-oss/tantivy.git, synced 2026-01-02 23:32:54 +00:00

Compare commits: commit-cha… → issue/407 (4 commits)
| Author | SHA1 | Date |
|---|---|---|
| | f3099a83eb | |
| | f745bb9d2a | |
| | d9417acbc6 | |
| | 38540c3826 | |
Cargo.toml

```diff
@@ -49,6 +49,7 @@ failure = "0.1"
 htmlescape = "0.3.1"
 fail = "0.2"
 scoped-pool = "1.0"
+aho-corasick = "0.6"
 
 [target.'cfg(windows)'.dependencies]
 winapi = "0.2"
```
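For context, the 0.6-era `aho-corasick` API that the new module builds on compiles the patterns into an `AcAutomaton` and yields matches carrying the pattern index as `pati`. A minimal standalone sketch of that API (not part of the commit):

```rust
// Assumes Cargo.toml declares: aho-corasick = "0.6"
extern crate aho_corasick;

use aho_corasick::{AcAutomaton, Automaton};

fn main() {
    // Compile the search patterns once, then scan a haystack.
    let automaton = AcAutomaton::new(vec!["apple", "maple"]);
    for m in automaton.find("I like maple apples.") {
        // `pati` is the index of the pattern that matched;
        // `start` and `end` are byte offsets into the haystack.
        println!("pattern {} at {}..{}", m.pati, m.start, m.end);
    }
}
```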
src/tokenizer/char_processing/char_filter.rs (new file, +78 lines)
```rust
extern crate aho_corasick;
use self::aho_corasick::{AcAutomaton, Automaton};
use std::mem;

use super::{OffsetIncrements, OffsetIncrementsBuilder};

pub trait CharMogrifier {
    fn process_text(&mut self, text: &str, dest: &mut String, correction: &mut OffsetIncrementsBuilder);
}

pub struct CharFilter {
    text: String,
    buffer: String,
    mogrifiers: Vec<Box<CharMogrifier>>,
}

impl CharFilter {
    fn process_text(&mut self, text: &str) {
        self.text.clear();
        self.text.push_str(text);
        self.buffer.clear();
        let mut offset_increment_builder = OffsetIncrements::builder();
        for mogrifier in &mut self.mogrifiers {
            mogrifier.process_text(&self.text,
                                   &mut self.buffer,
                                   &mut offset_increment_builder);
            mem::swap(&mut self.text, &mut self.buffer);
            // After the swap, `buffer` holds the previous layer's input:
            // clear it so the next mogrifier appends to an empty dest.
            self.buffer.clear();
            offset_increment_builder.new_layer();
        }
    }
}

pub struct SubstringReplacer<'a> {
    automaton: AcAutomaton<&'a str>,
    replacements: Vec<&'a str>,
}

impl SubstringReplacer<'static> {
    fn new(from_tos: Vec<(&'static str, &'static str)>) -> SubstringReplacer<'static> {
        let from_ptns: Vec<&'static str> = from_tos
            .iter()
            .map(|(from_str, _)| *from_str)
            .collect();
        let to_strs: Vec<&'static str> = from_tos
            .iter()
            .map(|(_, to_str)| *to_str)
            .collect();
        let automaton = AcAutomaton::new(from_ptns);
        SubstringReplacer {
            automaton,
            replacements: to_strs,
        }
    }
}

impl<'a> CharMogrifier for SubstringReplacer<'a> {
    // `correction` records, at each replacement site, how many bytes the
    // text grew or shrank, so that offsets in the transformed text can be
    // mapped back to offsets in the original text.
    fn process_text(&mut self, text: &str, dest: &mut String, correction: &mut OffsetIncrementsBuilder) {
        let mut start = 0;
        for m in self.automaton.find(text) {
            dest.push_str(&text[start..m.start]);
            let replacement = self.replacements[m.pati];
            let previous_len = m.end - m.start;
            correction.register_inc(m.end, (replacement.len() as isize) - (previous_len as isize));
            dest.push_str(replacement);
            start = m.end;
        }
        dest.push_str(&text[start..]);
    }
}

// Candidate mogrifiers:
// lowercasing
// Unicode normalization*
// '
// accent simplification
```
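To see how the pieces compose, here is a hedged sketch of a round trip, written as a hypothetical `#[cfg(test)]` child module of `char_filter.rs`. It assumes `OffsetIncrementsBuilder::build` and `OffsetIncrementsReader::convert_offset` were widened to `pub(crate)` visibility (both are private in this commit):

```rust
#[cfg(test)]
mod sketch {
    use super::{CharMogrifier, SubstringReplacer};
    use super::super::OffsetIncrements;

    #[test]
    fn substring_replacer_round_trip() {
        // "can't" (5 bytes) -> "cannot" (6 bytes): delta = +1, registered
        // at the end offset of the match (byte 7).
        let mut replacer = SubstringReplacer::new(vec![("can't", "cannot")]);
        let mut builder = OffsetIncrements::builder();
        let mut dest = String::new();
        replacer.process_text("I can't stop", &mut dest, &mut builder);
        assert_eq!(dest, "I cannot stop");

        // Map offsets in the rewritten text back to the original text.
        // Targets must be fed in non-decreasing order.
        let mut reader = builder.build().reader();
        assert_eq!(reader.convert_offset(2), 2); // start of "cannot" / of "can't"
        assert_eq!(reader.convert_offset(9), 8); // start of "stop" in both texts
    }
}
```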
src/tokenizer/char_processing/mod.rs (new file, +4 lines)
```rust
mod char_filter;
mod offset_increments;

pub use self::offset_increments::{OffsetIncrements, OffsetIncrementsBuilder};
```
src/tokenizer/char_processing/offset_increments.rs (new file, +264 lines)
````rust
/*!
Stores an increasing mapping from naturals to naturals.

`CharFilter`s may make the original text longer or shorter,
while token offsets need to refer to offsets in the original
text.

This module is in charge of efficiently book-keeping these
shifts in offsets, and provides a mapping from the transformed
text back to the original text.

We define the inverse of an increasing mapping `f` as:

    g(i) = max {j | f(j) <= i}
        != min {j | f(j) >= i}

The name `inverse` is a bit misleading:
this is not really an involution.

Note that settling on a single definition has some bad side effects.
For instance, when trying to convert a segment of chars to
its offsets in the original string, the reverse mapping may
return an empty segment.

We could use a different definition of the reverse mapping
when computing the lower bound and the upper bound of the segment,
but then non-overlapping tokens could have overlapping origins.

# Example

```text
forward mapping
[0, 1, 2, 5, 6, 7]
encoded sparsely as (3, 2)

reverse mapping
[0, 1, 2, 2, 2, 3, 4, 5]
encoded sparsely as [(3, -1), (4, -2)]
```
*/

/// Builds a reverse mapping using a sparse representation of the
/// forward mapping.
pub struct OffsetIncrementsBuilder {
    cumulated: isize,
    incs: Vec<(usize, isize)>,
}

impl OffsetIncrementsBuilder {
    /// We require `from_offset + delta >= 0`.
    /// There is no need to call this function if `delta == 0`.
    pub fn register_inc(&mut self, from_offset: usize, delta: isize) {
        let mut cumulated = self.cumulated;
        let from_offset_isize = from_offset as isize;
        let to_offset = (from_offset_isize + self.cumulated) as usize;
        if delta > 0 {
            for i in 0..delta as usize {
                cumulated += 1;
                self.incs.push((to_offset + i, -cumulated));
            }
        } else {
            // Shrinking is only handled one byte at a time for now.
            assert_eq!(delta, -1);
            cumulated -= 1;
            self.incs.push((to_offset + 1, -cumulated));
        }
        self.cumulated = cumulated;
    }

    pub fn new_layer(&self) {
        // Composing the increments of successive mogrifier layers is not
        // implemented yet.
        panic!();
    }

    fn build(self) -> OffsetIncrements {
        OffsetIncrements {
            incs: self.incs,
        }
    }
}

#[derive(Default)]
pub struct OffsetIncrementsReader {
    shifts: Vec<(usize, isize)>,
    current_shift: isize,
    idx: usize,
}

impl OffsetIncrementsReader {
    fn new(shifts: Vec<(usize, isize)>) -> OffsetIncrementsReader {
        OffsetIncrementsReader {
            shifts,
            current_shift: 0,
            idx: 0,
        }
    }

    /// Maps an offset in the transformed text back to an offset in the
    /// original text. The reader is a forward-only cursor: successive
    /// calls must pass non-decreasing targets.
    fn convert_offset(&mut self, target: usize) -> usize {
        while self.idx < self.shifts.len() {
            let (offset, shift) = self.shifts[self.idx];
            if offset > target {
                break;
            } else {
                self.current_shift = shift;
            }
            self.idx += 1;
        }
        (self.current_shift + target as isize) as usize
    }
}

pub struct OffsetIncrements {
    incs: Vec<(usize, isize)>,
}

impl OffsetIncrements {
    pub fn builder() -> OffsetIncrementsBuilder {
        OffsetIncrementsBuilder {
            cumulated: 0,
            incs: Vec::new(),
        }
    }

    pub fn reader(&self) -> OffsetIncrementsReader {
        OffsetIncrementsReader::new(self.incs.clone()) // TODO Fixme, no clone
    }
}

#[cfg(test)]
mod tests {
    use super::OffsetIncrements;
    use super::OffsetIncrementsReader;

    #[test]
    fn test_offset_increment_reader_empty() {
        let mut reader = OffsetIncrementsReader::new(vec![]);
        for i in 0..3 {
            assert_eq!(reader.convert_offset(i), i);
        }
    }

    #[test]
    fn test_offset_increment_reader_step() {
        let mut reader = OffsetIncrementsReader::new(vec![(1, 1), (3, 3), (6, 2), (7, 1), (8, 0), (9, -1)]);
        assert_eq!(reader.convert_offset(0), 0);
        assert_eq!(reader.convert_offset(1), 2);
        assert_eq!(reader.convert_offset(2), 3);
        assert_eq!(reader.convert_offset(3), 6);
        assert_eq!(reader.convert_offset(4), 7);
        assert_eq!(reader.convert_offset(5), 8);
        assert_eq!(reader.convert_offset(6), 8);
        assert_eq!(reader.convert_offset(7), 8);
        assert_eq!(reader.convert_offset(8), 8);
        assert_eq!(reader.convert_offset(9), 8);
    }

    #[test]
    fn test_offset_increment_reader_step_neg() {
        let mut reader = OffsetIncrementsReader::new(vec![(1, -1), (2, -2), (3, -3)]);
        assert_eq!(reader.convert_offset(0), 0);
        assert_eq!(reader.convert_offset(1), 0);
        assert_eq!(reader.convert_offset(2), 0);
        assert_eq!(reader.convert_offset(3), 0);
        assert_eq!(reader.convert_offset(4), 1);
        assert_eq!(reader.convert_offset(5), 2);
        assert_eq!(reader.convert_offset(6), 3);
        assert_eq!(reader.convert_offset(7), 4);
    }

    fn aux_test_increment(increments: OffsetIncrements, expected: &[usize]) {
        let mut reader = increments.reader();
        for (i, el) in expected.iter().cloned().enumerate() {
            assert_eq!(reader.convert_offset(i), el);
        }
    }

    fn assert_is_increasing(v: &[usize]) {
        assert!(v.len() > 0);
        assert_eq!(v[0], 0);
        let mut prec = 0;
        for &val in &v[1..] {
            assert!(val >= prec);
            prec = val;
        }
    }

    fn is_inverse(fwd: &[usize], rev: &[usize]) {
        assert_is_increasing(fwd);
        assert_is_increasing(rev);
        for (i, &antecedent) in rev.iter().enumerate() {
            let expected = fwd
                .iter()
                .enumerate()
                .filter(|(_, v)| **v <= i)
                .map(|(ord, _)| ord)
                .last()
                .unwrap();
            assert_eq!(expected, antecedent);
        }
    }

    #[test]
    fn test_is_inverse() {
        is_inverse(&[0, 1, 1, 1, 2], &[0, 3, 4]);
    }

    fn is_reciprocal(left: &[usize], right: &[usize]) {
        is_inverse(left, right);
    }

    #[test]
    fn test_offset_increments_shorten() {
        {
            let mut offset_increment_builder = OffsetIncrements::builder();
            // abcd -> abd
            offset_increment_builder.register_inc(2, -1);
            aux_test_increment(offset_increment_builder.build(), &[0, 1, 2, 4]);
        }
        {
            let mut offset_increment_builder = OffsetIncrements::builder();
            // abcdefgh -> abcdfgh
            offset_increment_builder.register_inc(4, -1);
            aux_test_increment(offset_increment_builder.build(), &[0, 1, 2, 3, 4, 6]);
        }
        {
            let mut offset_increment_builder = OffsetIncrements::builder();
            // abcd -> bcd
            offset_increment_builder.register_inc(0, -1);
            aux_test_increment(offset_increment_builder.build(), &[0, 2, 3]);
        }
    }

    #[test]
    fn test_offset_increments_builder() {
        {
            let mut offset_increment_builder = OffsetIncrements::builder();
            offset_increment_builder.register_inc(2, 1);
            // forward mapping: [0, 1, 3, 4, 5]
            aux_test_increment(offset_increment_builder.build(), &[0, 1, 1, 2, 3, 4, 5]);
        }
        {
            let mut offset_increment_builder = OffsetIncrements::builder();
            offset_increment_builder.register_inc(3, 2);
            // forward mapping: [0, 1, 2, 5, 6, 7]
            aux_test_increment(offset_increment_builder.build(), &[0, 1, 2, 2, 2, 3, 4, 5]);
        }
        {
            let mut offset_increment_builder = OffsetIncrements::builder();
            // reverse mapping starts 0, 0, 1, 2, 2, 2
            offset_increment_builder.register_inc(1, 1);
            offset_increment_builder.register_inc(3, 3);
            aux_test_increment(offset_increment_builder.build(), &[0, 0, 1, 2, 2, 2, 2, 3, 4]);
        }
    }
}
````
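A behavioral subtlety worth noting: `OffsetIncrementsReader` is a forward-only cursor, not a random-access map; `convert_offset` advances `idx` and never rewinds. A hedged sketch of the consequence, written as if it lived in the `tests` module above:

```rust
#[test]
fn reader_is_forward_only() {
    // Sparse reverse encoding from the module docs: two bytes inserted
    // at offset 3 yield the shifts (3, -1) and (4, -2).
    let mut reader = OffsetIncrementsReader::new(vec![(3, -1), (4, -2)]);
    assert_eq!(reader.convert_offset(2), 2);
    assert_eq!(reader.convert_offset(3), 2);
    assert_eq!(reader.convert_offset(7), 5);
    // `current_shift` is now -2. Querying a smaller offset such as 0
    // would compute (0 - 2) as usize and wrap around instead of
    // returning 0, so callers must query in non-decreasing order.
}
```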
src/tokenizer/mod.rs

```diff
@@ -140,6 +140,7 @@ mod stop_word_filter;
 mod token_stream_chain;
 mod tokenizer;
 mod tokenizer_manager;
+mod char_processing;
 
 pub use self::alphanum_only::AlphaNumOnlyFilter;
 pub use self::facet_tokenizer::FacetTokenizer;
```