Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2025-12-28 04:52:55 +00:00)

Compare commits: missing_te... → issue/407

4 Commits
| Author | SHA1 | Date |
|---|---|---|
| | f3099a83eb | |
| | f745bb9d2a | |
| | d9417acbc6 | |
| | 38540c3826 | |
Cargo.toml

```diff
@@ -49,6 +49,7 @@ failure = "0.1"
 htmlescape = "0.3.1"
 fail = "0.2"
 scoped-pool = "1.0"
+aho-corasick = "0.6"
 
 [target.'cfg(windows)'.dependencies]
 winapi = "0.2"
```
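The one added line pulls in aho-corasick 0.6, on which the new `char_filter` module builds its substring automaton. For reference, a minimal sketch of the 0.6 API as it is used there (patterns and input are illustrative, not from the diff):

```rust
extern crate aho_corasick; // aho-corasick = "0.6"

use aho_corasick::{AcAutomaton, Automaton};

fn main() {
    // Build an automaton over the patterns to search for.
    let automaton = AcAutomaton::new(vec!["ae", "oe"]);
    for m in automaton.find("caesar and phoenix") {
        // `pati` is the index of the pattern that matched.
        println!("pattern {} matched at [{}, {})", m.pati, m.start, m.end);
    }
}
```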
src/tokenizer/char_processing/char_filter.rs (new file, 78 lines)
@@ -0,0 +1,78 @@
```rust
extern crate aho_corasick;

use self::aho_corasick::{AcAutomaton, Automaton};
use std::mem;

use super::{OffsetIncrements, OffsetIncrementsBuilder};

pub trait CharMogrifier {
    fn process_text(&mut self, text: &str, dest: &mut String, correction: &mut OffsetIncrementsBuilder);
}

pub struct CharFilter {
    text: String,
    buffer: String,
    mogrifiers: Vec<Box<CharMogrifier>>,
}

impl CharFilter {
    fn process_text(&mut self, text: &str) {
        self.text.clear();
        self.text.push_str(text);
        self.buffer.clear();
        let mut offset_increment_builder = OffsetIncrements::builder();
        for mogrifier in &mut self.mogrifiers {
            mogrifier.process_text(&self.text, &mut self.buffer, &mut offset_increment_builder);
            // Ping-pong buffers: the output of this mogrifier becomes the
            // input of the next one. Clear the buffer so the next mogrifier
            // does not append to stale text.
            mem::swap(&mut self.text, &mut self.buffer);
            self.buffer.clear();
            offset_increment_builder.new_layer();
        }
    }
}

pub struct SubstringReplacer<'a> {
    automaton: AcAutomaton<&'a str>,
    replacements: Vec<&'a str>,
}

impl SubstringReplacer<'static> {
    fn new(from_tos: Vec<(&'static str, &'static str)>) -> SubstringReplacer<'static> {
        let from_ptns: Vec<&'static str> = from_tos
            .iter()
            .map(|(from_str, _)| *from_str)
            .collect();
        let to_strs: Vec<&'static str> = from_tos
            .iter()
            .map(|(_, to_str)| *to_str)
            .collect();
        let automaton = AcAutomaton::new(from_ptns);
        SubstringReplacer {
            automaton,
            replacements: to_strs,
        }
    }
}

impl<'a> CharMogrifier for SubstringReplacer<'a> {
    // `correction` encodes the mapping old_offset -> new_offset.
    // Its length is `text.len() + 1`.
    fn process_text(&mut self, text: &str, dest: &mut String, correction: &mut OffsetIncrementsBuilder) {
        let mut start = 0;
        for m in self.automaton.find(text) {
            dest.push_str(&text[start..m.start]);
            let replacement = self.replacements[m.pati];
            let previous_len = m.end - m.start;
            // Only register a shift when the replacement changes the length;
            // `register_inc` does not accept a delta of 0.
            if replacement.len() != previous_len {
                correction.register_inc(m.end, (replacement.len() as isize) - (previous_len as isize));
            }
            dest.push_str(replacement);
            start = m.end;
        }
        dest.push_str(&text[start..]);
    }
}

// lowercasing
// Unicode normalization*
// '
// accent simplification
```
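A usage sketch of the replacer, not part of the diff: `SubstringReplacer::new` and `OffsetIncrementsBuilder::build` are private above, so this assumes test-level access; the replacement grows the text so that an offset shift actually gets recorded.

```rust
// Sketch only: assumes access to the private `new` and `build` above.
let mut replacer = SubstringReplacer::new(vec![("&", "and")]);
let mut dest = String::new();
let mut builder = OffsetIncrements::builder();
replacer.process_text("black & white", &mut dest, &mut builder);
assert_eq!(dest, "black and white");

// "white" starts at byte 10 in the transformed text but at byte 8 in the
// original; the recorded increments map the new offset back.
let mut reader = builder.build().reader();
assert_eq!(reader.convert_offset(10), 8);
```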
src/tokenizer/char_processing/mod.rs (new file, 4 lines)
@@ -0,0 +1,4 @@
```rust
mod char_filter;
mod offset_increments;

pub use self::offset_increments::{OffsetIncrements, OffsetIncrementsBuilder};
```
src/tokenizer/char_processing/offset_increments.rs (new file, 264 lines)
@@ -0,0 +1,264 @@
```rust
/*!
Stores an increasing mapping from naturals to naturals.

`CharFilter`s may make the original text longer or shorter.
Tokens' offsets need to refer to their offsets in the original
text.

This struct is in charge of efficiently book-keeping these
shifts in offsets, and provides a mapping from the transformed
text back to the original text.

We define the inverse of an increasing mapping `f` as
`g(i) = max {j | f(j) <= i}`, which differs from
`min {j | f(j) >= i}`.

The name `inverse` is a bit misleading:
this is not really an involution.

Note that having a single definition has some bad side effects.
For instance, when trying to convert a segment of chars to
its offsets in the original string, the reverse mapping may
return an empty segment.

We could use a different definition of the reverse mapping
when computing the lower bound and the upper bound of the segment,
but then non-overlapping tokens could have overlapping origins.

# Example

```text
forward mapping
[0,1,2,5,6,7]
Encoded sparsely as (3, 2)

reverse mapping
[0,1,2,2,2,3,4,5]
Encoded sparsely as [(3, -1), (4, -1), (5, -1)]
```
*/

/// Builds a reverse mapping using a sparse representation of the
/// forward mapping.
pub struct OffsetIncrementsBuilder {
    cumulated: isize,
    incs: Vec<(usize, isize)>,
}

impl OffsetIncrementsBuilder {
    /// We require `from_offset + delta >= 0`.
    /// There is no need to call this function if `delta == 0`.
    pub fn register_inc(&mut self, from_offset: usize, delta: isize) {
        let mut cumulated = self.cumulated;
        let from_offset_isize = from_offset as isize;
        let to_offset = (from_offset_isize + self.cumulated) as usize;
        if delta > 0 {
            for i in 0..delta as usize {
                cumulated += 1;
                self.incs.push((to_offset + i, -cumulated));
            }
        } else {
            // Only single-character deletions are handled so far.
            assert_eq!(delta, -1);
            cumulated -= 1;
            self.incs.push((to_offset + 1, -cumulated));
        }
        self.cumulated = cumulated;
    }

    pub fn new_layer(&self) {
        // TODO: composing the corrections of successive
        // `CharMogrifier`s is not implemented yet.
        panic!();
    }

    fn build(self) -> OffsetIncrements {
        OffsetIncrements { incs: self.incs }
    }
}

#[derive(Default)]
pub struct OffsetIncrementsReader {
    shifts: Vec<(usize, isize)>,
    current_shift: isize,
    idx: usize,
}

impl OffsetIncrementsReader {
    fn new(shifts: Vec<(usize, isize)>) -> OffsetIncrementsReader {
        OffsetIncrementsReader {
            shifts,
            current_shift: 0,
            idx: 0,
        }
    }

    /// Converts an offset in the transformed text into an offset in the
    /// original text. Offsets must be queried in increasing order.
    fn convert_offset(&mut self, target: usize) -> usize {
        while self.idx < self.shifts.len() {
            let (offset, shift) = self.shifts[self.idx];
            if offset > target {
                break;
            } else {
                self.current_shift = shift;
            }
            self.idx += 1;
        }
        (self.current_shift + target as isize) as usize
    }
}

pub struct OffsetIncrements {
    incs: Vec<(usize, isize)>,
}

impl OffsetIncrements {
    pub fn builder() -> OffsetIncrementsBuilder {
        OffsetIncrementsBuilder {
            cumulated: 0,
            incs: Vec::new(),
        }
    }

    pub fn reader(&self) -> OffsetIncrementsReader {
        OffsetIncrementsReader::new(self.incs.clone()) // TODO: avoid this clone.
    }
}

#[cfg(test)]
mod tests {
    use super::OffsetIncrements;
    use super::OffsetIncrementsReader;

    #[test]
    fn test_offset_increment_reader_empty() {
        let mut reader = OffsetIncrementsReader::new(vec![]);
        for i in 0..3 {
            assert_eq!(reader.convert_offset(i), i);
        }
    }

    #[test]
    fn test_offset_increment_reader_step() {
        let mut reader = OffsetIncrementsReader::new(vec![(1, 1), (3, 3), (6, 2), (7, 1), (8, 0), (9, -1)]);
        assert_eq!(reader.convert_offset(0), 0);
        assert_eq!(reader.convert_offset(1), 2);
        assert_eq!(reader.convert_offset(2), 3);
        assert_eq!(reader.convert_offset(3), 6);
        assert_eq!(reader.convert_offset(4), 7);
        assert_eq!(reader.convert_offset(5), 8);
        assert_eq!(reader.convert_offset(6), 8);
        assert_eq!(reader.convert_offset(7), 8);
        assert_eq!(reader.convert_offset(8), 8);
        assert_eq!(reader.convert_offset(9), 8);
    }

    #[test]
    fn test_offset_increment_reader_step_neg() {
        let mut reader = OffsetIncrementsReader::new(vec![(1, -1), (2, -2), (3, -3)]);
        assert_eq!(reader.convert_offset(0), 0);
        assert_eq!(reader.convert_offset(1), 0);
        assert_eq!(reader.convert_offset(2), 0);
        assert_eq!(reader.convert_offset(3), 0);
        assert_eq!(reader.convert_offset(4), 1);
        assert_eq!(reader.convert_offset(5), 2);
        assert_eq!(reader.convert_offset(6), 3);
        assert_eq!(reader.convert_offset(7), 4);
    }

    fn aux_test_increment(increments: OffsetIncrements, expected: &[usize]) {
        let mut reader = increments.reader();
        for (i, el) in expected.iter().cloned().enumerate() {
            assert_eq!(reader.convert_offset(i), el);
        }
    }

    fn assert_is_increasing(v: &[usize]) {
        assert!(!v.is_empty());
        assert_eq!(v[0], 0);
        let mut prec = 0;
        for &val in &v[1..] {
            assert!(val >= prec);
            prec = val;
        }
    }

    fn is_inverse(fwd: &[usize], rev: &[usize]) {
        assert_is_increasing(fwd);
        assert_is_increasing(rev);
        for (i, &antecedent) in rev.iter().enumerate() {
            let expected = fwd
                .iter()
                .enumerate()
                .filter(|(_, v)| **v <= i)
                .map(|(ord, _)| ord)
                .last()
                .unwrap();
            assert_eq!(expected, antecedent);
        }
    }

    #[test]
    fn test_is_inverse() {
        is_inverse(&[0, 1, 1, 1, 2], &[0, 3, 4]);
    }

    fn is_reciprocal(left: &[usize], right: &[usize]) {
        is_inverse(left, right);
    }

    #[test]
    fn test_offset_increments_shorten() {
        {
            let mut offset_increment_builder = OffsetIncrements::builder();
            // abcd -> abd
            offset_increment_builder.register_inc(2, -1);
            aux_test_increment(offset_increment_builder.build(), &[0, 1, 2, 4]);
        }
        {
            let mut offset_increment_builder = OffsetIncrements::builder();
            // abcdefgh -> abcdfgh
            offset_increment_builder.register_inc(4, -1);
            aux_test_increment(offset_increment_builder.build(), &[0, 1, 2, 3, 4, 6]);
        }
        {
            let mut offset_increment_builder = OffsetIncrements::builder();
            // abcd -> bcd
            offset_increment_builder.register_inc(0, -1);
            aux_test_increment(offset_increment_builder.build(), &[0, 2, 3]);
        }
    }

    #[test]
    fn test_offset_increments_builder() {
        {
            let mut offset_increment_builder = OffsetIncrements::builder();
            offset_increment_builder.register_inc(2, 1);
            // forward mapping: [0, 1, 3, 4, 5]
            aux_test_increment(offset_increment_builder.build(), &[0, 1, 1, 2, 3, 4, 5]);
        }
        {
            let mut offset_increment_builder = OffsetIncrements::builder();
            offset_increment_builder.register_inc(3, 2);
            // forward mapping: [0, 1, 2, 5, 6, 7]
            aux_test_increment(offset_increment_builder.build(), &[0, 1, 2, 2, 2, 3, 4, 5]);
        }
        {
            let mut offset_increment_builder = OffsetIncrements::builder();
            // 0, 0, 1, 2, 2, 2
            offset_increment_builder.register_inc(1, 1);
            offset_increment_builder.register_inc(3, 3);
            aux_test_increment(offset_increment_builder.build(), &[0, 0, 1, 2, 2, 2, 2, 3, 4]);
        }
    }
}
```
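To tie the module's doc-comment example to the builder and reader, a worked round-trip sketch (not part of the diff; it mirrors the second case of `test_offset_increments_builder`, and like the unit tests it assumes access to the private `build`):

```rust
// Two characters inserted at old offset 3: forward mapping [0, 1, 2, 5, 6, 7].
let mut builder = OffsetIncrements::builder();
builder.register_inc(3, 2);

// The reader recovers the reverse mapping [0, 1, 2, 2, 2, 3, 4, 5]:
// offsets in the transformed text map back to the original text.
let mut reader = builder.build().reader();
for (new_offset, &old_offset) in [0, 1, 2, 2, 2, 3, 4, 5].iter().enumerate() {
    assert_eq!(reader.convert_offset(new_offset), old_offset);
}
```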
src/tokenizer/mod.rs

```diff
@@ -140,6 +140,7 @@ mod stop_word_filter;
 mod token_stream_chain;
 mod tokenizer;
 mod tokenizer_manager;
+mod char_processing;
 
 pub use self::alphanum_only::AlphaNumOnlyFilter;
 pub use self::facet_tokenizer::FacetTokenizer;
```