mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-23 02:29:57 +00:00
try with custom Cow<str>
This commit is contained in:
@@ -9,7 +9,7 @@ use tantivy::schema::{
|
||||
use tantivy::{Document, Index, IndexBuilder};
|
||||
|
||||
const HDFS_LOGS: &str = include_str!("hdfs.json");
|
||||
const NUM_REPEATS: usize = 10;
|
||||
const NUM_REPEATS: usize = 20;
|
||||
|
||||
pub fn hdfs_index_benchmark(c: &mut Criterion) {
|
||||
let mut schema_builder = tantivy::schema::SchemaBuilder::new();
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::io::{self, Read, Write};
|
||||
use std::net::Ipv6Addr;
|
||||
@@ -11,6 +10,7 @@ use yoke::erased::ErasedArcCart;
|
||||
use yoke::Yoke;
|
||||
|
||||
use super::*;
|
||||
use crate::schema::value::MaybeOwnedString;
|
||||
use crate::tokenizer::PreTokenizedString;
|
||||
use crate::DateTime;
|
||||
|
||||
@@ -155,7 +155,7 @@ impl Document {
|
||||
|
||||
/// Add a text field.
|
||||
pub fn add_text<S: ToString>(&mut self, field: Field, text: S) {
|
||||
let value = Value::Str(Cow::Owned(text.to_string()));
|
||||
let value = Value::Str(MaybeOwnedString::from_string(text.to_string()));
|
||||
self.add_field_value(field, value);
|
||||
}
|
||||
|
||||
@@ -307,7 +307,9 @@ impl Document {
|
||||
} => {
|
||||
let field_value = FieldValue {
|
||||
field: *field,
|
||||
value: Value::Str(Cow::Owned(pre_tokenized_text.text.to_string())),
|
||||
value: Value::Str(MaybeOwnedString::from_string(
|
||||
pre_tokenized_text.text.to_string(),
|
||||
)),
|
||||
};
|
||||
field_value.serialize(writer)?;
|
||||
}
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
use std::borrow::Cow;
|
||||
use std::net::IpAddr;
|
||||
use std::str::FromStr;
|
||||
|
||||
@@ -10,6 +9,7 @@ use super::ip_options::IpAddrOptions;
|
||||
use super::{Cardinality, IntoIpv6Addr};
|
||||
use crate::schema::bytes_options::BytesOptions;
|
||||
use crate::schema::facet_options::FacetOptions;
|
||||
use crate::schema::value::MaybeOwnedString;
|
||||
use crate::schema::{
|
||||
DateOptions, Facet, IndexRecordOption, JsonObjectOptions, NumericOptions, TextFieldIndexing,
|
||||
TextOptions, Value,
|
||||
@@ -342,7 +342,7 @@ impl FieldType {
|
||||
})?;
|
||||
Ok(DateTime::from_utc(dt_with_fixed_tz).into())
|
||||
}
|
||||
FieldType::Str(_) => Ok(Value::Str(Cow::Owned(field_text))),
|
||||
FieldType::Str(_) => Ok(Value::Str(MaybeOwnedString::from_string(field_text))),
|
||||
FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) => {
|
||||
Err(ValueParsingError::TypeError {
|
||||
expected: "an integer",
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use std::borrow::Cow;
|
||||
use std::fmt;
|
||||
use std::net::Ipv6Addr;
|
||||
|
||||
pub use not_safe::MaybeOwnedString;
|
||||
use serde::de::Visitor;
|
||||
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
||||
use serde_json::Map;
|
||||
@@ -15,7 +15,7 @@ use crate::DateTime;
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
pub enum Value<'a> {
|
||||
/// The str type is used for any text information.
|
||||
Str(Cow<'a, str>),
|
||||
Str(MaybeOwnedString<'a>),
|
||||
/// Pre-tokenized str type,
|
||||
PreTokStr(PreTokenizedString),
|
||||
/// Unsigned 64-bits Integer `u64`
|
||||
@@ -45,7 +45,7 @@ impl<'a> Value<'a> {
|
||||
pub fn into_owned(self) -> Value<'static> {
|
||||
use Value::*;
|
||||
match self {
|
||||
Str(val) => Str(Cow::Owned(val.into_owned())),
|
||||
Str(val) => Str(MaybeOwnedString::from_string(val.into_string())),
|
||||
PreTokStr(val) => PreTokStr(val),
|
||||
U64(val) => U64(val),
|
||||
I64(val) => I64(val),
|
||||
@@ -118,11 +118,11 @@ impl<'de> Deserialize<'de> for Value<'de> {
|
||||
|
||||
// TODO add visit_borrowed_str
|
||||
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E> {
|
||||
Ok(Value::Str(Cow::Owned(v.to_owned())))
|
||||
Ok(Value::Str(MaybeOwnedString::from_string(v.to_owned())))
|
||||
}
|
||||
|
||||
fn visit_string<E>(self, v: String) -> Result<Self::Value, E> {
|
||||
Ok(Value::Str(Cow::Owned(v)))
|
||||
Ok(Value::Str(MaybeOwnedString::from_string(v)))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -250,7 +250,7 @@ impl<'a> Value<'a> {
|
||||
|
||||
impl From<String> for Value<'static> {
|
||||
fn from(s: String) -> Value<'static> {
|
||||
Value::Str(Cow::Owned(s))
|
||||
Value::Str(MaybeOwnedString::from_string(s))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -292,7 +292,7 @@ impl From<DateTime> for Value<'static> {
|
||||
|
||||
impl<'a> From<&'a str> for Value<'a> {
|
||||
fn from(s: &'a str) -> Value<'a> {
|
||||
Value::Str(Cow::Borrowed(s))
|
||||
Value::Str(MaybeOwnedString::from_str(s))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -339,14 +339,13 @@ impl From<serde_json::Value> for Value<'static> {
|
||||
}
|
||||
|
||||
mod binary_serialize {
|
||||
use std::borrow::Cow;
|
||||
use std::io::{self, Read, Write};
|
||||
use std::net::Ipv6Addr;
|
||||
|
||||
use common::{f64_to_u64, u64_to_f64, BinarySerializable};
|
||||
use fastfield_codecs::MonotonicallyMappableToU128;
|
||||
|
||||
use super::Value;
|
||||
use super::{MaybeOwnedString, Value};
|
||||
use crate::schema::Facet;
|
||||
use crate::tokenizer::PreTokenizedString;
|
||||
use crate::DateTime;
|
||||
@@ -372,7 +371,8 @@ mod binary_serialize {
|
||||
match *self {
|
||||
Value::Str(ref text) => {
|
||||
TEXT_CODE.serialize(writer)?;
|
||||
text.serialize(writer)
|
||||
// TODO impl trait for MaybeOwnedString
|
||||
text.as_str().to_owned().serialize(writer)
|
||||
}
|
||||
Value::PreTokStr(ref tok_str) => {
|
||||
EXT_CODE.serialize(writer)?;
|
||||
@@ -434,7 +434,7 @@ mod binary_serialize {
|
||||
match type_code {
|
||||
TEXT_CODE => {
|
||||
let text = String::deserialize(reader)?;
|
||||
Ok(Value::Str(Cow::Owned(text)))
|
||||
Ok(Value::Str(MaybeOwnedString::from_string(text)))
|
||||
}
|
||||
U64_CODE => {
|
||||
let value = u64::deserialize(reader)?;
|
||||
@@ -576,3 +576,104 @@ mod tests {
|
||||
assert_eq!(serialized_value_json, r#""1996-12-20T01:39:57Z""#);
|
||||
}
|
||||
}
|
||||
|
||||
mod not_safe {
|
||||
use std::ops::Deref;
|
||||
|
||||
union Ref<'a, T: ?Sized> {
|
||||
shared: &'a T,
|
||||
uniq: &'a mut T,
|
||||
}
|
||||
|
||||
pub struct MaybeOwnedString<'a> {
|
||||
string: Ref<'a, str>,
|
||||
capacity: usize,
|
||||
}
|
||||
|
||||
impl<'a> MaybeOwnedString<'a> {
|
||||
pub fn from_str(string: &'a str) -> MaybeOwnedString<'a> {
|
||||
MaybeOwnedString {
|
||||
string: Ref { shared: string },
|
||||
capacity: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_string(mut string: String) -> MaybeOwnedString<'static> {
|
||||
string.shrink_to_fit(); // <= actually important for safety, todo use the Vec .as_ptr instead
|
||||
|
||||
let mut s = std::mem::ManuallyDrop::new(string);
|
||||
let ptr = s.as_mut_ptr();
|
||||
let len = s.len();
|
||||
let capacity = s.capacity();
|
||||
|
||||
let string = unsafe {
|
||||
std::str::from_utf8_unchecked_mut(std::slice::from_raw_parts_mut(ptr, len))
|
||||
};
|
||||
MaybeOwnedString {
|
||||
string: Ref { uniq: string },
|
||||
capacity,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn into_string(mut self) -> String {
|
||||
if self.capacity != 0 {
|
||||
let string = unsafe { &mut self.string.uniq };
|
||||
unsafe {
|
||||
return String::from_raw_parts(string.as_mut_ptr(), self.len(), self.capacity);
|
||||
};
|
||||
}
|
||||
self.deref().to_owned()
|
||||
}
|
||||
|
||||
pub fn as_str(&self) -> &str {
|
||||
self.deref()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Deref for MaybeOwnedString<'a> {
|
||||
type Target = str;
|
||||
|
||||
#[inline]
|
||||
fn deref(&self) -> &str {
|
||||
unsafe { self.string.shared }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Drop for MaybeOwnedString<'a> {
|
||||
fn drop(&mut self) {
|
||||
// if capacity is 0, either it's an empty String so there is no dealloc to do, or it's
|
||||
// borrowed
|
||||
if self.capacity != 0 {
|
||||
let string = unsafe { &mut self.string.uniq };
|
||||
unsafe { String::from_raw_parts(string.as_mut_ptr(), self.len(), self.capacity) };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Clone for MaybeOwnedString<'a> {
|
||||
fn clone(&self) -> Self {
|
||||
if self.capacity == 0 {
|
||||
MaybeOwnedString {
|
||||
string: Ref {
|
||||
shared: unsafe { self.string.shared },
|
||||
},
|
||||
capacity: 0,
|
||||
}
|
||||
} else {
|
||||
MaybeOwnedString::from_string(self.deref().to_owned())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> std::fmt::Debug for MaybeOwnedString<'a> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(self.deref())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> PartialEq for MaybeOwnedString<'a> {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.deref() == other.deref()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user