From 5f7d027a52557944c6143f2f4c835d8ab4e19a81 Mon Sep 17 00:00:00 2001 From: Adam Reichold Date: Wed, 26 Oct 2022 17:08:09 +0200 Subject: [PATCH] Avoid unconditional allocation in StemmerTokenStream. This fixes the TODO in two ways: If the stemmer already yields an owned string, it is used directly as the new text of the token. Otherwise, a temporary buffer is used to copy the stemmed text (just as before) and is then swapped into the token to reuse its existing buffer. --- src/tokenizer/stemmer.rs | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/tokenizer/stemmer.rs b/src/tokenizer/stemmer.rs index 77a35ec18..b76361ec3 100644 --- a/src/tokenizer/stemmer.rs +++ b/src/tokenizer/stemmer.rs @@ -1,3 +1,6 @@ +use std::borrow::Cow; +use std::mem; + use rust_stemmers::{self, Algorithm}; use serde::{Deserialize, Serialize}; @@ -84,6 +87,7 @@ impl TokenFilter for Stemmer { BoxTokenStream::from(StemmerTokenStream { tail: token_stream, stemmer: inner_stemmer, + buffer: String::new(), }) } } @@ -91,6 +95,7 @@ impl TokenFilter for Stemmer { pub struct StemmerTokenStream<'a> { tail: BoxTokenStream<'a>, stemmer: rust_stemmers::Stemmer, + buffer: String, } impl<'a> TokenStream for StemmerTokenStream<'a> { @@ -98,10 +103,16 @@ impl<'a> TokenStream for StemmerTokenStream<'a> { if !self.tail.advance() { return false; } - // TODO remove allocation - let stemmed_str: String = self.stemmer.stem(&self.token().text).into_owned(); - self.token_mut().text.clear(); - self.token_mut().text.push_str(&stemmed_str); + let token = self.tail.token_mut(); + let stemmed_str = self.stemmer.stem(&token.text); + match stemmed_str { + Cow::Owned(stemmed_str) => token.text = stemmed_str, + Cow::Borrowed(stemmed_str) => { + self.buffer.clear(); + self.buffer.push_str(stemmed_str); + mem::swap(&mut token.text, &mut self.buffer); + } + } true }