Compare commits

...

4 Commits

Author SHA1 Message Date
Paul Masurel
f3099a83eb Blop 2018-12-24 11:41:18 +09:00
Paul Masurel
f745bb9d2a blop 2018-12-24 11:28:08 +09:00
Paul Masurel
d9417acbc6 done 2018-12-11 09:01:45 +09:00
Paul Masurel
38540c3826 small step 2018-12-09 15:26:19 +09:00
5 changed files with 348 additions and 0 deletions

View File

@@ -49,6 +49,7 @@ failure = "0.1"
htmlescape = "0.3.1"
fail = "0.2"
scoped-pool = "1.0"
aho-corasick = "0.6"
[target.'cfg(windows)'.dependencies]
winapi = "0.2"

View File

@@ -0,0 +1,78 @@
extern crate aho_corasick;
use self::aho_corasick::{AcAutomaton, Automaton};
use std::mem;
use super::{OffsetIncrements, OffsetIncrementsBuilder};
/// A text-rewriting stage that `CharFilter` applies to a text, one stage after another.
pub trait CharMogrifier {
/// Rewrites `text` into `dest` and reports length changes through `correction`.
///
/// `CharFilter` passes the output of the previous stage as `text` and a scratch
/// `String` as `dest`.
/// NOTE(review): the implementation visible in this file appends to `dest`
/// (it never clears it), so `dest` is presumably expected to be empty on entry;
/// confirm before adding new implementations.
fn process_text(&mut self, text: &str, dest: &mut String, correction: &mut OffsetIncrementsBuilder);
}
pub struct CharFilter {
text: String,
buffer: String,
mogrifiers: Vec<Box<CharMogrifier>>
}
impl CharFilter {
fn process_text(&mut self, text: &str) {
self.text.clear();
self.text.push_str(text);
self.buffer.clear();
let mut offset_increment_builder = OffsetIncrements::builder();
for mogrifier in &mut self.mogrifiers {
mogrifier.process_text(&self.text,
&mut self.buffer,
&mut offset_increment_builder);
mem::swap(&mut self.text, &mut self.buffer);
offset_increment_builder.new_layer();
}
}
}
/// Replaces every occurrence of a set of substrings by a per-substring
/// replacement, using an Aho-Corasick automaton to find the occurrences.
pub struct SubstringReplacer<'a> {
    /// Automaton built over the substrings to look for.
    automaton: AcAutomaton<&'a str>,
    /// Replacement strings, indexed by the pattern index (`pati`) of a match.
    replacements: Vec<&'a str>,
}

impl<'a> SubstringReplacer<'a> {
    /// Builds a replacer from `(from, to)` pairs.
    ///
    /// Patterns are handed to the automaton in the order of `from_tos`, and
    /// `replacements` is kept in that same order so that both can be addressed
    /// with the same index.
    ///
    /// The strings only need to outlive the replacer; `'static` data is
    /// accepted as before.
    fn new(from_tos: Vec<(&'a str, &'a str)>) -> SubstringReplacer<'a> {
        let (from_ptns, replacements): (Vec<&'a str>, Vec<&'a str>) =
            from_tos.into_iter().unzip();
        SubstringReplacer {
            automaton: AcAutomaton::new(from_ptns),
            replacements,
        }
    }
}
impl<'a> CharMogrifier for SubstringReplacer<'a> {
    /// Appends to `dest` a copy of `text` in which every match of the automaton
    /// is replaced by the replacement registered for the matched pattern.
    ///
    /// Each replacement that changes the length of the text is reported to
    /// `correction` as `(end of the match in text, length difference)`.
    ///
    /// NOTE(review): the match end is reported for both growing and shrinking
    /// replacements; confirm that this is the offset convention expected by
    /// `OffsetIncrementsBuilder::register_inc` in the shrinking case.
    fn process_text(&mut self, text: &str, dest: &mut String, correction: &mut OffsetIncrementsBuilder) {
        // Start of the part of `text` that has not been copied yet.
        let mut start = 0;
        for m in self.automaton.find(text) {
            // Copy the unchanged text preceding the match.
            dest.push_str(&text[start..m.start]);
            let replacement = self.replacements[m.pati];
            let delta = replacement.len() as isize - (m.end - m.start) as isize;
            // `register_inc` documents that it does not need to be called for
            // a zero delta: same-length replacements do not move any offset.
            if delta != 0 {
                correction.register_inc(m.end, delta);
            }
            dest.push_str(replacement);
            start = m.end;
        }
        // Copy whatever follows the last match (the whole text if none matched).
        dest.push_str(&text[start..]);
    }
}
// lowercasing
// Unicode normalization*
// '
// accent simplification

View File

@@ -0,0 +1,4 @@
mod char_filter;
mod offset_increments;
pub use self::offset_increments::{OffsetIncrements, OffsetIncrementsBuilder};

View File

@@ -0,0 +1,264 @@
/*!
Stores an increasing mapping from naturals to naturals.
`CharFilter`s may make the original text longer or shorter.
Tokens' offsets need to refer to their position in the original
text.
This struct is in charge of doing efficient book-keeping
of these shifts in offsets and of providing a mapping
from the transformed text to the original text.
We define the inverse of an increasing mapping `f` as:
g(i) = max {j | f(j) <= i}
!= min {j | f(j) >= i}
The name `inverse` is a bit misleading:
this is not really an involution.
Note that having a single definition has some bad side effects.
For instance, when trying to convert a segment of chars to
its offset in the original string, the reverse mapping may
return an empty string.
We could use a different definition of the reverse mapping
when computing the lower bound and the upper bound of the segment,
but then non-overlapping tokens could have overlapping origins.
# Example
```text
forward mapping
[0,1,2,5,6,7]
Encoded sparsely as (3, 2)
reverse mapping
[0,1,2,2,2,3,4,5]
Encoded sparsely as [(3, -1), (4, -2)]
```
*/
/// Builds a reverse mapping using a sparse representation of the
/// forward mapping.
pub struct OffsetIncrementsBuilder {
    /// Sum of all the deltas registered so far, i.e. how much longer
    /// (or shorter, if negative) the transformed text currently is.
    cumulated: isize,
    /// Sparse reverse mapping: `(offset in the transformed text, shift)`
    /// entries, pushed in the order in which they are registered.
    incs: Vec<(usize, isize)>,
}

impl OffsetIncrementsBuilder {
    /// Registers that the text grows (`delta > 0`) or shrinks (`delta < 0`)
    /// by `|delta|` at `from_offset`.
    ///
    /// We require
    /// - `from_offset + delta >= 0`
    ///
    /// There is no need to call this function if delta = 0: such a call
    /// is a no-op.
    ///
    /// A growth of `delta` produces `delta` entries, one per added position,
    /// each shifting one step further back. A shrink produces a single entry
    /// located right after the registered offset, jumping over everything that
    /// was removed.
    pub fn register_inc(&mut self, from_offset: usize, delta: isize) {
        if delta == 0 {
            return;
        }
        // Position of `from_offset` once the previously registered deltas are
        // applied. Callers must keep this non-negative: a negative value would
        // wrap around in the cast.
        let to_offset = (from_offset as isize + self.cumulated) as usize;
        let base = self.cumulated;
        if delta > 0 {
            self.incs
                .extend((0..delta).map(|i| (to_offset + i as usize, -(base + i + 1))));
        } else {
            self.incs.push((to_offset + 1, -(base + delta)));
        }
        self.cumulated = base + delta;
    }

    /// Marks the end of one transformation pass.
    ///
    /// # Panics
    ///
    /// Always: stacking several layers is not implemented yet.
    pub fn new_layer(&self) {
        unimplemented!("stacking several layers of offset increments is not supported yet");
    }

    /// Consumes the builder and returns the registered increments.
    fn build(self) -> OffsetIncrements {
        OffsetIncrements { incs: self.incs }
    }
}
/// Forward-only cursor over a sparse list of `(offset, shift)` entries.
///
/// The cursor never moves backwards, so offsets are meant to be converted
/// in increasing order.
#[derive(Default)]
pub struct OffsetIncrementsReader {
    /// `(offset, shift)` entries, consumed from the front.
    shifts: Vec<(usize, isize)>,
    /// Shift of the last entry that has been consumed (0 before any entry).
    current_shift: isize,
    /// Index of the first entry that has not been consumed yet.
    idx: usize,
}

impl OffsetIncrementsReader {
    /// Creates a reader positioned before the first entry of `shifts`.
    fn new(shifts: Vec<(usize, isize)>) -> OffsetIncrementsReader {
        Self {
            shifts,
            current_shift: 0,
            idx: 0,
        }
    }

    /// Returns `target` plus the shift of the last entry whose offset is
    /// lower than or equal to `target`.
    ///
    /// Entries consumed by this call stay consumed for the following ones.
    fn convert_offset(&mut self, target: usize) -> usize {
        while let Some(&(offset, shift)) = self.shifts.get(self.idx) {
            if offset > target {
                break;
            }
            self.current_shift = shift;
            self.idx += 1;
        }
        (self.current_shift + target as isize) as usize
    }
}
/// Sparse list of `(offset, shift)` entries produced by an
/// `OffsetIncrementsBuilder` and consumed through an `OffsetIncrementsReader`.
pub struct OffsetIncrements {
    /// Entries, in the order in which the builder registered them.
    incs: Vec<(usize, isize)>,
}

impl OffsetIncrements {
    /// Returns a builder with no registered increment.
    pub fn builder() -> OffsetIncrementsBuilder {
        let cumulated = 0;
        let incs = Vec::new();
        OffsetIncrementsBuilder { cumulated, incs }
    }

    /// Returns a reader positioned at the start of the entries.
    pub fn reader(&self) -> OffsetIncrementsReader {
        // TODO Fixme, no clone: the reader currently gets its own copy.
        let shifts = self.incs.to_vec();
        OffsetIncrementsReader::new(shifts)
    }
}
// Unit tests for the offset book-keeping: the reader on hand-written shift
// tables, then the builder + reader round trip.
#[cfg(test)]
mod tests {
use super::OffsetIncrements;
use super::OffsetIncrementsReader;
// Without any shift, every offset maps to itself.
#[test]
fn test_offset_increment_reader_empty() {
let mut reader = OffsetIncrementsReader::new(vec![]);
for i in 0..3 {
assert_eq!(reader.convert_offset(i), i);
}
}
// Each expected value is `target + shift`, where `shift` comes from the last
// entry whose offset is <= target (0 before the first entry).
#[test]
fn test_offset_increment_reader_step() {
let mut reader = OffsetIncrementsReader::new(vec![(1, 1), (3, 3), (6, 2), (7, 1), (8, 0), (9, -1)]);
assert_eq!(reader.convert_offset(0), 0);
assert_eq!(reader.convert_offset(1), 2);
assert_eq!(reader.convert_offset(2), 3);
assert_eq!(reader.convert_offset(3), 6);
assert_eq!(reader.convert_offset(4), 7);
assert_eq!(reader.convert_offset(5), 8);
assert_eq!(reader.convert_offset(6), 8);
assert_eq!(reader.convert_offset(7), 8);
assert_eq!(reader.convert_offset(8), 8);
assert_eq!(reader.convert_offset(9), 8);
}
// Same as above with negative shifts only: offsets 1, 2 and 3 all collapse to 0,
// and every later offset is shifted by -3.
#[test]
fn test_offset_increment_reader_step_neg() {
let mut reader = OffsetIncrementsReader::new(vec![(1, -1), (2, -2), (3, -3)]);
assert_eq!(reader.convert_offset(0), 0);
assert_eq!(reader.convert_offset(1), 0);
assert_eq!(reader.convert_offset(2), 0);
assert_eq!(reader.convert_offset(3), 0);
assert_eq!(reader.convert_offset(4), 1);
assert_eq!(reader.convert_offset(5), 2);
assert_eq!(reader.convert_offset(6), 3);
assert_eq!(reader.convert_offset(7), 4);
}
// Checks that a reader over `increments` converts offset `i` into `expected[i]`,
// querying the offsets in increasing order.
fn aux_test_increment(increments: OffsetIncrements, expected: &[usize]) {
let mut reader = increments.reader();
for (i, el) in expected.iter().cloned().enumerate() {
assert_eq!(reader.convert_offset(i), el);
}
}
// Asserts that `v` is non-empty, starts at 0 and never decreases.
fn assert_is_increasing(v: &[usize]) {
assert!(v.len() > 0);
assert_eq!(v[0], 0);
let mut prec = 0;
for &val in &v[1..] {
assert!(val >= prec);
prec = val;
}
}
// Asserts that `rev` is the inverse of `fwd` as defined in the module
// documentation: `rev[i]` must be the largest index `ord` such that `fwd[ord] <= i`.
fn is_inverse(fwd: &[usize], rev: &[usize]) {
assert_is_increasing(fwd);
assert_is_increasing(rev);
println!("fwd {:?} rev {:?}", fwd, rev);
for (i, &antecedant) in rev.iter().enumerate() {
// `fwd[0] == 0` was asserted above, so at least one index passes the
// filter and the `unwrap` cannot fail.
let expected = fwd
.iter()
.enumerate()
.filter(|(_, v)| **v <= i)
.map(|(ord, _)| ord)
.last()
.unwrap();
println!("i {}", i);
assert_eq!(expected, antecedant);
}
}
// Sanity check of the `is_inverse` helper itself on a hand-computed pair.
#[test]
fn test_is_inverse() {
is_inverse(&[0,1,1,1,2], &[0, 3, 4]);
}
// Thin wrapper around `is_inverse`; it only checks one direction and is not
// called by any test in this module.
fn is_reciprocal(left: &[usize], right: &[usize]) {
is_inverse(left, right);
}
// Builder + reader round trip when one character is removed.
#[test]
fn test_offset_increments_shorten() {
{
let mut offset_increment_builder = OffsetIncrements::builder();
// abcd -> abd
offset_increment_builder.register_inc(2, -1);
aux_test_increment(offset_increment_builder.build(), &[0, 1, 2, 4]);
}
{
let mut offset_increment_builder = OffsetIncrements::builder();
// abcdefgh -> abcdfgh
offset_increment_builder.register_inc(4, -1);
aux_test_increment(offset_increment_builder.build(), &[0, 1, 2, 3, 4, 6]);
}
{
let mut offset_increment_builder = OffsetIncrements::builder();
// abcd -> bcd
offset_increment_builder.register_inc(0, -1);
aux_test_increment(offset_increment_builder.build(), &[0, 2, 3]);
}
}
// Builder + reader round trip when characters are added.
#[test]
fn test_offset_increments_builder() {
{
let mut offset_increment_builder = OffsetIncrements::builder();
offset_increment_builder.register_inc(2, 1);
// forward mapping: [0, 1, 3, 4, 5]
aux_test_increment(offset_increment_builder.build(), &[0,1,1,2,3,4,5]);
}
{
let mut offset_increment_builder = OffsetIncrements::builder();
offset_increment_builder.register_inc(3, 2);
// forward mapping: [0, 1, 2, 5, 6, 7] (the example of the module documentation)
aux_test_increment(offset_increment_builder.build(), &[0,1,2,2,2,3,4,5]);
}
{
let mut offset_increment_builder = OffsetIncrements::builder();
// 0, 0, 1, 2, 2, 2
// Two successive growths: the second one is registered on top of the first.
offset_increment_builder.register_inc(1, 1);
offset_increment_builder.register_inc(3, 3);
aux_test_increment(offset_increment_builder.build(), &[0,0,1,2,2,2,2,3,4]);
}
}
}

View File

@@ -140,6 +140,7 @@ mod stop_word_filter;
mod token_stream_chain;
mod tokenizer;
mod tokenizer_manager;
mod char_processing;
pub use self::alphanum_only::AlphaNumOnlyFilter;
pub use self::facet_tokenizer::FacetTokenizer;