Compare commits

4 Commits

Author          SHA1         Message      Date
Paul Masurel    f3099a83eb   Blop         2018-12-24 11:41:18 +09:00
Paul Masurel    f745bb9d2a   blop         2018-12-24 11:28:08 +09:00
Paul Masurel    d9417acbc6   done         2018-12-11 09:01:45 +09:00
Paul Masurel    38540c3826   small step   2018-12-09 15:26:19 +09:00
61 changed files with 1063 additions and 801 deletions

View File

@@ -1,4 +1,4 @@
Tantivy 0.8.0
Tantivy 0.8.1
=====================
*No change in the index format*
- API Breaking change in the collector API. (@jwolfe, @fulmicoton)

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.8.3"
version = "0.8.0-dev"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -16,8 +16,8 @@ base64 = "0.10.0"
byteorder = "1.0"
lazy_static = "1"
regex = "1.0"
tantivy-fst = {path="../tantivy-search/fst", version="0.1"}
memmap = "0.7"
fst = {version="0.3", default-features=false}
fst-regex = { version="0.2" }
lz4 = {version="1.20", optional=true}
snap = {version="0.2"}
atomicwrites = {version="0.2.2", optional=true}
@@ -29,8 +29,8 @@ serde = "1.0"
serde_derive = "1.0"
serde_json = "1.0"
num_cpus = "1.2"
itertools = "0.8"
levenshtein_automata = {version="0.1"}
itertools = "0.7"
levenshtein_automata = {version="0.1", features=["fst_automaton"]}
bit-set = "0.5"
uuid = { version = "0.7", features = ["v4", "serde"] }
crossbeam = "0.5"
@@ -49,7 +49,7 @@ failure = "0.1"
htmlescape = "0.3.1"
fail = "0.2"
scoped-pool = "1.0"
murmurhash32 = "0.2"
aho-corasick = "0.6"
[target.'cfg(windows)'.dependencies]
winapi = "0.2"
@@ -70,7 +70,7 @@ overflow-checks = true
[features]
# by default no-fail is disabled. We manually enable it when running test.
default = ["mmap", "no_fail"]
mmap = ["atomicwrites"]
mmap = ["fst/mmap", "atomicwrites"]
lz4-compression = ["lz4"]
no_fail = ["fail/no_fail"]
unstable = [] # useful for benches.

View File

@@ -106,37 +106,37 @@ fn main() -> tantivy::Result<()> {
// For convenience, tantivy also comes with a macro to
// reduce the boilerplate above.
index_writer.add_document(doc!(
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));
index_writer.add_document(doc!(
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));
// Multivalued field just need to be repeated.
index_writer.add_document(doc!(
title => "Frankenstein",
title => "The Modern Prometheus",
body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking."
title => "Frankenstein",
title => "The Modern Prometheus",
body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking."
));
// This is an example, so we will only index 3 documents

View File

@@ -68,12 +68,12 @@ fn main() -> tantivy::Result<()> {
// heap for the indexer can increase its throughput.
let mut index_writer = index.writer(50_000_000)?;
index_writer.add_document(doc!(
title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish."
title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish."
));
index_writer.add_document(doc!(
title => "Of Mice and Men",
title => "Of Mice and Men",
body => r#"A few miles south of Soledad, the Salinas River drops in close to the hillside
bank and runs deep and green. The water is warm too, for it has slipped twinkling
over the yellow sands in the sunlight before reaching the narrow pool. On one
@@ -84,7 +84,7 @@ fn main() -> tantivy::Result<()> {
limbs and branches that arch over the pool"#
));
index_writer.add_document(doc!(
title => "Frankenstein",
title => "Frankenstein",
body => r#"You will rejoice to hear that no disaster has accompanied the commencement of an
enterprise which you have regarded with such evil forebodings. I arrived here
yesterday, and my first task is to assure my dear sister of my welfare and

View File

@@ -35,15 +35,15 @@ fn main() -> tantivy::Result<()> {
// we'll only need one doc for this example.
index_writer.add_document(doc!(
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));
// ...
index_writer.commit()?;
@@ -56,7 +56,7 @@ fn main() -> tantivy::Result<()> {
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;
let snippet_generator = SnippetGenerator::new(&searcher, &*query, body)?;
for (score, doc_address) in top_docs {
let doc = searcher.doc(doc_address)?;

View File

@@ -72,26 +72,26 @@ fn main() -> tantivy::Result<()> {
title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish."
));
));
index_writer.add_document(doc!(
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));
index_writer.add_document(doc!(
title => "Frankenstein",
body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking."
title => "Frankenstein",
body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking."
));
index_writer.commit()?;

View File

@@ -197,7 +197,7 @@ fn skip<'a, I: Iterator<Item = &'a Facet>>(
) -> SkipResult {
loop {
match collapse_it.peek() {
Some(facet_bytes) => match facet_bytes.encoded_str().as_bytes().cmp(target) {
Some(facet_bytes) => match facet_bytes.encoded_bytes().cmp(target) {
Ordering::Less => {}
Ordering::Greater => {
return SkipResult::OverStep;
@@ -369,8 +369,7 @@ impl SegmentCollector for FacetSegmentCollector {
let mut facet = vec![];
let facet_ord = self.collapse_facet_ords[collapsed_facet_ord];
facet_dict.ord_to_term(facet_ord as u64, &mut facet);
// TODO
facet_counts.insert(Facet::from_encoded(facet).unwrap(), count);
facet_counts.insert(unsafe { Facet::from_encoded(facet) }, count);
}
FacetCounts { facet_counts }
}
@@ -404,9 +403,9 @@ impl FacetCounts {
let right_bound = if facet.is_root() {
Bound::Unbounded
} else {
let mut facet_after_bytes: String = facet.encoded_str().to_owned();
facet_after_bytes.push('\u{1}');
let facet_after = Facet::from_encoded_string(facet_after_bytes);
let mut facet_after_bytes: Vec<u8> = facet.encoded_bytes().to_owned();
facet_after_bytes.push(1u8);
let facet_after = unsafe { Facet::from_encoded(facet_after_bytes) }; // ok logic
Bound::Excluded(facet_after)
};
let underlying: btree_map::Range<_, _> = self.facet_counts.range((left_bound, right_bound));
@@ -475,8 +474,7 @@ mod tests {
n /= 4;
let leaf = n % 5;
Facet::from(&format!("/top{}/mid{}/leaf{}", top, mid, leaf))
})
.collect();
}).collect();
for i in 0..num_facets * 10 {
let mut doc = Document::new();
doc.add_facet(facet_field, facets[i % num_facets].clone());
@@ -502,16 +500,18 @@ mod tests {
("/top1/mid2", 50),
("/top1/mid3", 50),
]
.iter()
.map(|&(facet_str, count)| (String::from(facet_str), count))
.collect::<Vec<_>>()
.iter()
.map(|&(facet_str, count)| (String::from(facet_str), count))
.collect::<Vec<_>>()
);
}
}
#[test]
#[should_panic(expected = "Tried to add a facet which is a descendant of \
an already added facet.")]
#[should_panic(
expected = "Tried to add a facet which is a descendant of \
an already added facet."
)]
fn test_misused_facet_collector() {
let mut facet_collector = FacetCollector::for_field(Field(0));
facet_collector.add_facet(Facet::from("/country"));
@@ -563,15 +563,13 @@ mod tests {
let facet = Facet::from(&format!("/facet/{}", c));
let doc = doc!(facet_field => facet);
iter::repeat(doc).take(count)
})
.map(|mut doc| {
}).map(|mut doc| {
doc.add_facet(
facet_field,
&format!("/facet/{}", thread_rng().sample(&uniform)),
);
doc
})
.collect();
}).collect();
docs[..].shuffle(&mut thread_rng());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();

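The right-bound construction above (appending '\u{1}' to the encoded facet before querying the BTreeMap) is what makes listing a facet's descendants a single range scan. A small std-only sketch of that ordering trick, assuming the encoded form in which facet segments are joined by '\u{0}'; the names below are illustrative and not tantivy's:

use std::collections::BTreeMap;
use std::ops::Bound;

// All strict descendants of an encoded facet fall in the open interval
// (facet, facet + '\u{1}'), because '\u{0}' (the segment separator) sorts
// below '\u{1}', which itself sorts below any printable byte.
fn descendants<'a>(
    counts: &'a BTreeMap<String, u64>,
    encoded_facet: &str, // e.g. "top1" or "top1\u{0}mid2"
) -> impl Iterator<Item = (&'a String, &'a u64)> {
    let mut after = encoded_facet.to_owned();
    after.push('\u{1}');
    counts.range((
        Bound::Excluded(encoded_facet.to_owned()),
        Bound::Excluded(after),
    ))
}

fn main() {
    let mut counts = BTreeMap::new();
    counts.insert("top1\u{0}mid1".to_owned(), 50u64);
    counts.insert("top1\u{0}mid2".to_owned(), 50u64);
    counts.insert("top2\u{0}mid1".to_owned(), 25u64);
    assert_eq!(descendants(&counts, "top1").count(), 2);
}
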
View File

@@ -43,8 +43,7 @@ impl<TCollector: Collector> Collector for CollectorWrapper<TCollector> {
let err_msg = format!("Failed to cast child collector fruit. {:?}", e);
TantivyError::InvalidArgument(err_msg)
})
})
.collect::<Result<_>>()?;
}).collect::<Result<_>>()?;
let merged_fruit = self.0.merge_fruits(typed_fruit)?;
Ok(Box::new(merged_fruit))
}
@@ -148,8 +147,6 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
/// Ok(())
/// }
/// ```
#[allow(clippy::type_complexity)]
#[derive(Default)]
pub struct MultiCollector<'a> {
collector_wrappers:
Vec<Box<Collector<Child = Box<BoxableSegmentCollector>, Fruit = Box<Fruit>> + 'a>>,
@@ -157,8 +154,10 @@ pub struct MultiCollector<'a> {
impl<'a> MultiCollector<'a> {
/// Create a new `MultiCollector`
pub fn new() -> Self {
Default::default()
pub fn new() -> MultiCollector<'a> {
MultiCollector {
collector_wrappers: Vec::new(),
}
}
/// Add a new collector to our `MultiCollector`.
@@ -214,8 +213,7 @@ impl<'a> Collector for MultiCollector<'a> {
.zip(segment_fruits_list)
.map(|(child_collector, segment_fruits)| {
Ok(Some(child_collector.merge_fruits(segment_fruits)?))
})
.collect::<Result<_>>()?;
}).collect::<Result<_>>()?;
Ok(MultiFruit { sub_fruits })
}
}

View File

@@ -84,9 +84,11 @@ where
for (feature, doc) in child_fruit {
if top_collector.len() < self.limit {
top_collector.push(ComparableDoc { feature, doc });
} else if let Some(mut head) = top_collector.peek_mut() {
if head.feature < feature {
*head = ComparableDoc { feature, doc };
} else {
if let Some(mut head) = top_collector.peek_mut() {
if head.feature < feature {
*head = ComparableDoc { feature, doc };
}
}
}
}
@@ -140,8 +142,7 @@ impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
comparable_doc.feature,
DocAddress(segment_id, comparable_doc.doc),
)
})
.collect()
}).collect()
}
/// Return true iff at least K documents have gone through

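The segment collector above keeps at most `limit` documents in a heap and swaps out the current worst entry in place through `peek_mut`. A self-contained sketch of the same bounded top-K pattern over plain `u64` scores, using only std's `BinaryHeap` and `Reverse` (tantivy's version is generic over the feature type and tracks doc addresses):

use std::cmp::Reverse;
use std::collections::BinaryHeap;

// Keep a heap of at most `limit` scores with the current worst on top
// (hence Reverse); a better score replaces it in place via peek_mut.
fn top_k(scores: impl IntoIterator<Item = u64>, limit: usize) -> Vec<u64> {
    let mut heap: BinaryHeap<Reverse<u64>> = BinaryHeap::with_capacity(limit);
    for score in scores {
        if heap.len() < limit {
            heap.push(Reverse(score));
        } else if let Some(mut head) = heap.peek_mut() {
            if head.0 < score {
                *head = Reverse(score);
            }
        }
    }
    let mut best: Vec<u64> = heap.into_iter().map(|Reverse(score)| score).collect();
    best.sort_unstable_by(|a, b| b.cmp(a));
    best
}

fn main() {
    assert_eq!(top_k(vec![3, 9, 1, 7, 5], 3), vec![9, 7, 5]);
}
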
View File

@@ -1,6 +1,9 @@
use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
use common::serialize::BinarySerializable;
use std::io;
use std::io::Write;
use std::mem;
use std::ops::Deref;
use std::ptr;
pub(crate) struct BitPacker {
mini_buffer: u64,
@@ -15,7 +18,7 @@ impl BitPacker {
}
}
pub fn write<TWrite: io::Write>(
pub fn write<TWrite: Write>(
&mut self,
val: u64,
num_bits: u8,
@@ -25,14 +28,14 @@ impl BitPacker {
let num_bits = num_bits as usize;
if self.mini_buffer_written + num_bits > 64 {
self.mini_buffer |= val_u64.wrapping_shl(self.mini_buffer_written as u32);
output.write_u64::<LittleEndian>(self.mini_buffer)?;
self.mini_buffer.serialize(output)?;
self.mini_buffer = val_u64.wrapping_shr((64 - self.mini_buffer_written) as u32);
self.mini_buffer_written = self.mini_buffer_written + num_bits - 64;
} else {
self.mini_buffer |= val_u64 << self.mini_buffer_written;
self.mini_buffer_written += num_bits;
if self.mini_buffer_written == 64 {
output.write_u64::<LittleEndian>(self.mini_buffer)?;
self.mini_buffer.serialize(output)?;
self.mini_buffer_written = 0;
self.mini_buffer = 0u64;
}
@@ -40,18 +43,17 @@ impl BitPacker {
Ok(())
}
pub fn flush<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
pub fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
if self.mini_buffer_written > 0 {
let num_bytes = (self.mini_buffer_written + 7) / 8;
let mut arr: [u8; 8] = [0u8; 8];
LittleEndian::write_u64(&mut arr, self.mini_buffer);
let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer.to_le()) };
output.write_all(&arr[..num_bytes])?;
self.mini_buffer_written = 0;
}
Ok(())
}
pub fn close<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
pub fn close<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
self.flush(output)?;
// Padding the write file to simplify reads.
output.write_all(&[0u8; 7])?;
@@ -100,7 +102,9 @@ where
addr + 8 <= data.len(),
"The fast field field should have been padded with 7 bytes."
);
let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[addr..]);
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
let val_unshifted_unmasked: u64 =
u64::from_le(unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) });
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
val_shifted & mask
}
@@ -122,7 +126,9 @@ where
for output_val in output.iter_mut() {
let addr = addr_in_bits >> 3;
let bit_shift = addr_in_bits & 7;
let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[addr..]);
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
let val_unshifted_unmasked: u64 =
unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
*output_val = val_shifted & mask;
addr_in_bits += num_bits;

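Both sides of the hunk above funnel writes through a 64-bit mini-buffer: bits are OR-ed in at the current offset, the buffer is emitted little-endian once 64 bits are full, and `flush` writes out the last partial word. A simplified, self-contained sketch of that technique; it is not tantivy's BitPacker and omits the 7-byte padding that `close` appends:

/// Simplified bit packer: callers are expected to pass values that fit in `num_bits`.
struct TinyBitPacker {
    mini_buffer: u64,
    written: usize, // number of valid bits currently held in mini_buffer
    out: Vec<u8>,
}

impl TinyBitPacker {
    fn new() -> TinyBitPacker {
        TinyBitPacker { mini_buffer: 0, written: 0, out: Vec::new() }
    }

    fn write(&mut self, val: u64, num_bits: usize) {
        assert!(num_bits <= 64);
        if self.written + num_bits > 64 {
            // The value straddles the 64-bit boundary: emit the low part now
            // and keep the high part for the next mini-buffer.
            self.mini_buffer |= val.wrapping_shl(self.written as u32);
            self.out.extend_from_slice(&self.mini_buffer.to_le_bytes());
            self.mini_buffer = val.wrapping_shr((64 - self.written) as u32);
            self.written = self.written + num_bits - 64;
        } else {
            self.mini_buffer |= val << self.written;
            self.written += num_bits;
            if self.written == 64 {
                self.out.extend_from_slice(&self.mini_buffer.to_le_bytes());
                self.mini_buffer = 0;
                self.written = 0;
            }
        }
    }

    fn flush(&mut self) {
        if self.written > 0 {
            let num_bytes = (self.written + 7) / 8;
            let bytes = self.mini_buffer.to_le_bytes();
            self.out.extend_from_slice(&bytes[..num_bytes]);
            self.mini_buffer = 0;
            self.written = 0;
        }
    }
}

fn main() {
    let mut packer = TinyBitPacker::new();
    for val in 0u64..10 {
        packer.write(val, 5); // ten 5-bit values = 50 bits
    }
    packer.flush();
    assert_eq!(packer.out.len(), 7); // ceil(50 / 8) bytes
}
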
View File

@@ -64,18 +64,17 @@ impl Executor {
// This is important as it makes it possible for the fruit_receiver iteration to
// terminate.
};
// This is lame, but it does not use unsafe code.
let mut results_with_position = Vec::with_capacity(num_fruits);
let mut results = Vec::with_capacity(num_fruits);
unsafe { results.set_len(num_fruits) };
let mut num_items = 0;
for (pos, fruit_res) in fruit_receiver {
let fruit = fruit_res?;
results_with_position.push((pos, fruit));
results[pos] = fruit_res?;
num_items += 1;
}
results_with_position.sort_by_key(|(pos, _)| *pos);
assert_eq!(results_with_position.len(), num_fruits);
Ok(results_with_position
.into_iter()
.map(|(_, fruit)| fruit)
.collect::<Vec<_>>())
// this check ensures that we filled all of this
// uninitialized memory.
assert_eq!(num_items, results.len());
Ok(results)
}
}
}
@@ -95,8 +94,7 @@ mod tests {
panic!("panic should propagate");
},
vec![0].into_iter(),
)
.unwrap();
).unwrap();
}
#[test]
@@ -108,8 +106,7 @@ mod tests {
panic!("panic should propagate");
},
vec![0].into_iter(),
)
.unwrap();
).unwrap();
}
#[test]

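The replaced executor code gathers fruits tagged with their original position and sorts them back into place, which is what lets it avoid the `set_len` over uninitialized memory used on the other side of the hunk. A tiny sketch of that order-preserving gather, with the channel and error handling stripped out:

// Results arrive tagged with their original position, are sorted back into
// place, and only then unwrapped.
fn gather_in_order<T>(tagged: impl IntoIterator<Item = (usize, T)>, expected: usize) -> Vec<T> {
    let mut results_with_position: Vec<(usize, T)> = tagged.into_iter().collect();
    results_with_position.sort_by_key(|&(pos, _)| pos);
    assert_eq!(results_with_position.len(), expected);
    results_with_position.into_iter().map(|(_, fruit)| fruit).collect()
}

fn main() {
    let out_of_order = vec![(2usize, "c"), (0, "a"), (1, "b")];
    assert_eq!(gather_in_order(out_of_order, 3), vec!["a", "b", "c"]);
}
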
View File

@@ -13,7 +13,6 @@ use directory::ManagedDirectory;
#[cfg(feature = "mmap")]
use directory::MmapDirectory;
use directory::{Directory, RAMDirectory};
use error::DataCorruption;
use error::TantivyError;
use indexer::index_writer::open_index_writer;
use indexer::index_writer::HEAP_SIZE_MIN;
@@ -38,13 +37,7 @@ fn load_metas(directory: &Directory) -> Result<IndexMeta> {
let meta_data = directory.atomic_read(&META_FILEPATH)?;
let meta_string = String::from_utf8_lossy(&meta_data);
serde_json::from_str(&meta_string)
.map_err(|e| {
DataCorruption::new(
META_FILEPATH.clone(),
format!("Meta file cannot be deserialized. {:?}.", e),
)
})
.map_err(From::from)
.map_err(|_| TantivyError::CorruptedFile(META_FILEPATH.clone()))
}
/// Search Index
@@ -142,7 +135,7 @@ impl Index {
/// Creates a new index given an implementation of the trait `Directory`
pub fn create<Dir: Directory>(dir: Dir, schema: Schema) -> Result<Index> {
let directory = ManagedDirectory::wrap(dir)?;
let directory = ManagedDirectory::new(dir)?;
Index::from_directory(directory, schema)
}
@@ -150,7 +143,7 @@ impl Index {
///
/// This will overwrite existing meta.json
fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
save_new_metas(schema.clone(), directory.borrow_mut())?;
save_new_metas(schema.clone(), 0, directory.borrow_mut())?;
let metas = IndexMeta::with_schema(schema);
Index::create_from_metas(directory, &metas)
}
@@ -206,7 +199,7 @@ impl Index {
/// Open the index using the provided directory
pub fn open<D: Directory>(directory: D) -> Result<Index> {
let directory = ManagedDirectory::wrap(directory)?;
let directory = ManagedDirectory::new(directory)?;
let metas = load_metas(&directory)?;
Index::create_from_metas(directory, &metas)
}

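`load_metas` above reads `meta.json`, decodes it as lossy UTF-8 and deserializes it with serde_json, mapping a parse failure to a corruption error. A hedged sketch of the same pattern, assuming the serde and serde_json dependencies already listed in Cargo.toml; `IndexMetaLike`, `CorruptedFile` and the `raw` argument are stand-ins for tantivy's `IndexMeta`, its error type and the bytes returned by `directory.atomic_read(&META_FILEPATH)`:

use serde::Deserialize;

// Stand-in for IndexMeta; segments and schema are omitted here.
#[derive(Deserialize)]
struct IndexMetaLike {
    opstamp: u64,
    payload: Option<String>,
}

#[derive(Debug)]
struct CorruptedFile(String);

fn load_metas_like(raw: &[u8]) -> Result<IndexMetaLike, CorruptedFile> {
    let meta_string = String::from_utf8_lossy(raw);
    serde_json::from_str(&meta_string)
        .map_err(|e| CorruptedFile(format!("Meta file cannot be deserialized. {:?}.", e)))
}

fn main() {
    let raw = br#"{ "segments": [], "opstamp": 42, "payload": null }"#;
    let meta = load_metas_like(raw).expect("valid meta");
    assert_eq!(meta.opstamp, 42);
    assert!(meta.payload.is_none());
}
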
View File

@@ -32,7 +32,10 @@ pub struct InvertedIndexReader {
}
impl InvertedIndexReader {
#[cfg_attr(feature = "cargo-clippy", allow(clippy::needless_pass_by_value))] // for symetry
#[cfg_attr(
feature = "cargo-clippy",
allow(clippy::needless_pass_by_value)
)] // for symetry
pub(crate) fn new(
termdict: TermDictionary,
postings_source: ReadOnlySource,

View File

@@ -104,8 +104,7 @@ impl Searcher {
.iter()
.map(|segment_reader| {
u64::from(segment_reader.inverted_index(term.field()).doc_freq(term))
})
.sum::<u64>()
}).sum::<u64>()
}
/// Return the list of segment readers

View File

@@ -1,7 +1,7 @@
use core::MANAGED_FILEPATH;
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
use directory::{ReadOnlySource, WritePtr};
use error::DataCorruption;
use error::TantivyError;
use indexer::LockType;
use serde_json;
use std::collections::HashSet;
@@ -59,17 +59,12 @@ fn save_managed_paths(
impl ManagedDirectory {
/// Wraps a directory as managed directory.
pub fn wrap<Dir: Directory>(directory: Dir) -> Result<ManagedDirectory> {
pub fn new<Dir: Directory>(directory: Dir) -> Result<ManagedDirectory> {
match directory.atomic_read(&MANAGED_FILEPATH) {
Ok(data) => {
let managed_files_json = String::from_utf8_lossy(&data);
let managed_files: HashSet<PathBuf> = serde_json::from_str(&managed_files_json)
.map_err(|e| {
DataCorruption::new(
MANAGED_FILEPATH.clone(),
format!("Managed file cannot be deserialized: {:?}. ", e),
)
})?;
.map_err(|_| TantivyError::CorruptedFile(MANAGED_FILEPATH.clone()))?;
Ok(ManagedDirectory {
directory: Box::new(directory),
meta_informations: Arc::new(RwLock::new(MetaInformation {
@@ -265,7 +260,7 @@ mod tests {
let tempdir_path = PathBuf::from(tempdir.path());
{
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
{
let mut write_file = managed_directory.open_write(*TEST_PATH1).unwrap();
write_file.flush().unwrap();
@@ -291,7 +286,7 @@ mod tests {
}
{
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
{
assert!(managed_directory.exists(*TEST_PATH1));
assert!(!managed_directory.exists(*TEST_PATH2));
@@ -315,7 +310,7 @@ mod tests {
let living_files = HashSet::new();
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
managed_directory
.atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
.unwrap();

View File

@@ -1,9 +1,12 @@
use atomicwrites;
use common::make_io_err;
use directory::error::{DeleteError, IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
use directory::shared_vec_slice::SharedVecSlice;
use directory::Directory;
use directory::ReadOnlySource;
use directory::WritePtr;
use fst::raw::MmapReadOnly;
use std::collections::hash_map::Entry as HashMapEntry;
use std::collections::HashMap;
use std::convert::From;
use std::fmt;
@@ -16,14 +19,11 @@ use std::result;
use std::sync::Arc;
use std::sync::RwLock;
use tempdir::TempDir;
use memmap::Mmap;
use std::sync::Weak;
use std::ops::Deref;
/// Returns None iff the file exists, can be read, but is empty (and hence
/// cannot be mmapped).
///
fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> {
fn open_mmap(full_path: &Path) -> result::Result<Option<MmapReadOnly>, OpenReadError> {
let file = File::open(full_path).map_err(|e| {
if e.kind() == io::ErrorKind::NotFound {
OpenReadError::FileDoesNotExist(full_path.to_owned())
@@ -42,7 +42,7 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> {
return Ok(None);
}
unsafe {
memmap::Mmap::map(&file)
MmapReadOnly::open(&file)
.map(Some)
.map_err(|e| From::from(IOError::with_path(full_path.to_owned(), e)))
}
@@ -65,7 +65,7 @@ pub struct CacheInfo {
struct MmapCache {
counters: CacheCounters,
cache: HashMap<PathBuf, Weak<Box<Deref<Target=[u8]> + Send + Sync>>>,
cache: HashMap<PathBuf, MmapReadOnly>,
}
impl Default for MmapCache {
@@ -78,6 +78,10 @@ impl Default for MmapCache {
}
impl MmapCache {
/// Removes a `MmapReadOnly` entry from the mmap cache.
fn discard_from_cache(&mut self, full_path: &Path) -> bool {
self.cache.remove(full_path).is_some()
}
fn get_info(&mut self) -> CacheInfo {
let paths: Vec<PathBuf> = self.cache.keys().cloned().collect();
@@ -87,27 +91,23 @@ impl MmapCache {
}
}
// Returns None if the file exists but as a len of 0 (and hence is not mmappable).
fn get_mmap(&mut self, full_path: &Path) -> Result<Option<Arc<Box<Deref<Target=[u8]> + Send + Sync>>>, OpenReadError> {
let path_in_cache = self.cache.contains_key(full_path);
if path_in_cache {
{
let mmap_weak_opt = self.cache.get(full_path);
if let Some(mmap_arc) = mmap_weak_opt.and_then(|mmap_weak| mmap_weak.upgrade()) {
self.counters.hit += 1;
return Ok(Some(mmap_arc));
fn get_mmap(&mut self, full_path: &Path) -> Result<Option<MmapReadOnly>, OpenReadError> {
Ok(match self.cache.entry(full_path.to_owned()) {
HashMapEntry::Occupied(occupied_entry) => {
let mmap = occupied_entry.get();
self.counters.hit += 1;
Some(mmap.clone())
}
HashMapEntry::Vacant(vacant_entry) => {
self.counters.miss += 1;
if let Some(mmap) = open_mmap(full_path)? {
vacant_entry.insert(mmap.clone());
Some(mmap)
} else {
None
}
}
self.cache.remove(full_path);
}
self.counters.miss += 1;
if let Some(mmap) = open_mmap(full_path)? {
let res: Arc<Box<Deref<Target=[u8]> + Send + Sync>> = Arc::new(Box::new(mmap));
self.cache.insert(full_path.to_owned(), Arc::downgrade(&res));
Ok(Some(res))
} else {
Ok(None)
}
})
}
}
@@ -253,10 +253,11 @@ impl Directory for MmapDirectory {
);
IOError::with_path(path.to_owned(), make_io_err(msg))
})?;
Ok(mmap_cache
.get_mmap(&full_path)?
.map(ReadOnlySource::from)
.unwrap_or_else(|| ReadOnlySource::empty()))
.map(ReadOnlySource::Mmap)
.unwrap_or_else(|| ReadOnlySource::Anonymous(SharedVecSlice::empty())))
}
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
@@ -294,6 +295,20 @@ impl Directory for MmapDirectory {
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
debug!("Deleting file {:?}", path);
let full_path = self.resolve_path(path);
let mut mmap_cache = self.mmap_cache.write().map_err(|_| {
let msg = format!(
"Failed to acquired write lock \
on mmap cache while deleting {:?}",
path
);
IOError::with_path(path.to_owned(), make_io_err(msg))
})?;
mmap_cache.discard_from_cache(path);
// Removing the entry in the MMap cache.
// The munmap will appear on Drop,
// when the last reference is gone.
mmap_cache.cache.remove(&full_path);
match fs::remove_file(&full_path) {
Ok(_) => self
.sync_directory()
@@ -388,50 +403,25 @@ mod tests {
w.flush().unwrap();
}
}
let mut keep = vec![];
for (i, path) in paths.iter().enumerate() {
keep.push(mmap_directory.open_read(path).unwrap());
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), i + 1);
}
assert_eq!(mmap_directory.get_cache_info().counters.hit, 0);
assert_eq!(mmap_directory.get_cache_info().counters.miss, 10);
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 10);
for path in paths.iter() {
let _r = mmap_directory.open_read(path).unwrap();
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), num_paths);
{
for (i, path) in paths.iter().enumerate() {
let _r = mmap_directory.open_read(path).unwrap();
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), i + 1);
}
for path in paths.iter() {
let _r = mmap_directory.open_read(path).unwrap();
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), num_paths);
}
for (i, path) in paths.iter().enumerate() {
mmap_directory.delete(path).unwrap();
assert_eq!(
mmap_directory.get_cache_info().mmapped.len(),
num_paths - i - 1
);
}
}
assert_eq!(mmap_directory.get_cache_info().counters.hit, 10);
assert_eq!(mmap_directory.get_cache_info().counters.miss, 10);
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 10);
for path in paths.iter() {
let _r = mmap_directory.open_read(path).unwrap();
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), num_paths);
}
assert_eq!(mmap_directory.get_cache_info().counters.hit, 20);
assert_eq!(mmap_directory.get_cache_info().counters.miss, 10);
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 10);
drop(keep);
for path in paths.iter() {
let _r = mmap_directory.open_read(path).unwrap();
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), num_paths);
}
assert_eq!(mmap_directory.get_cache_info().counters.hit, 20);
assert_eq!(mmap_directory.get_cache_info().counters.miss, 20);
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 10);
for path in &paths {
mmap_directory.delete(path).unwrap();
}
assert_eq!(mmap_directory.get_cache_info().counters.hit, 20);
assert_eq!(mmap_directory.get_cache_info().counters.miss, 20);
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 10);
for path in paths.iter() {
assert!(mmap_directory.open_read(path).is_err());
}
assert_eq!(mmap_directory.get_cache_info().counters.hit, 20);
assert_eq!(mmap_directory.get_cache_info().counters.miss, 30);
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0);
}

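One side of the hunk above keeps the mmap cache as `HashMap<PathBuf, Weak<...>>`, so the cache by itself never keeps a mapping alive: a lookup upgrades the weak handle, and a dead entry is reloaded and counted as a miss. A generic std-only sketch of that pattern, with `load` standing in for `open_mmap`:

use std::collections::HashMap;
use std::path::{Path, PathBuf};
use std::sync::{Arc, Weak};

struct WeakCache<T> {
    hits: usize,
    misses: usize,
    cache: HashMap<PathBuf, Weak<T>>,
}

impl<T> WeakCache<T> {
    fn new() -> Self {
        WeakCache { hits: 0, misses: 0, cache: HashMap::new() }
    }

    fn get_or_load(&mut self, path: &Path, load: impl FnOnce() -> T) -> Arc<T> {
        if let Some(weak) = self.cache.get(path) {
            if let Some(strong) = weak.upgrade() {
                // Some caller still holds the resource: reuse it.
                self.hits += 1;
                return strong;
            }
        }
        // Either never seen or all strong handles were dropped: reload.
        self.misses += 1;
        let strong = Arc::new(load());
        self.cache.insert(path.to_owned(), Arc::downgrade(&strong));
        strong
    }
}

fn main() {
    let mut cache: WeakCache<Vec<u8>> = WeakCache::new();
    let a = cache.get_or_load(Path::new("seg1.idx"), || vec![0u8; 4]);
    let _b = cache.get_or_load(Path::new("seg1.idx"), || unreachable!());
    drop(a);
    // `_b` still holds the data; once every handle is dropped, the next lookup is a miss.
    assert_eq!(cache.hits, 1);
    assert_eq!(cache.misses, 1);
}
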
View File

@@ -11,6 +11,7 @@ mod directory;
mod managed_directory;
mod ram_directory;
mod read_only_source;
mod shared_vec_slice;
/// Errors specific to the directory module.
pub mod error;

View File

@@ -1,3 +1,4 @@
use super::shared_vec_slice::SharedVecSlice;
use common::make_io_err;
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
use directory::WritePtr;
@@ -70,7 +71,7 @@ impl Write for VecWriter {
}
#[derive(Clone)]
struct InnerDirectory(Arc<RwLock<HashMap<PathBuf, ReadOnlySource>>>);
struct InnerDirectory(Arc<RwLock<HashMap<PathBuf, Arc<Vec<u8>>>>>);
impl InnerDirectory {
fn new() -> InnerDirectory {
@@ -84,7 +85,7 @@ impl InnerDirectory {
path
))
})?;
let prev_value = map.insert(path, ReadOnlySource::new(Vec::from(data)));
let prev_value = map.insert(path, Arc::new(Vec::from(data)));
Ok(prev_value.is_some())
}
@@ -99,12 +100,12 @@ impl InnerDirectory {
);
let io_err = make_io_err(msg);
OpenReadError::IOError(IOError::with_path(path.to_owned(), io_err))
})
.and_then(|readable_map| {
}).and_then(|readable_map| {
readable_map
.get(path)
.ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
.map(|el| el.clone())
.map(Arc::clone)
.map(|data| ReadOnlySource::Anonymous(SharedVecSlice::new(data)))
})
}
@@ -119,8 +120,7 @@ impl InnerDirectory {
);
let io_err = make_io_err(msg);
DeleteError::IOError(IOError::with_path(path.to_owned(), io_err))
})
.and_then(|mut writable_map| match writable_map.remove(path) {
}).and_then(|mut writable_map| match writable_map.remove(path) {
Some(_) => Ok(()),
None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))),
})

View File

@@ -1,8 +1,9 @@
use super::shared_vec_slice::SharedVecSlice;
use common::HasLen;
#[cfg(feature = "mmap")]
use fst::raw::MmapReadOnly;
use stable_deref_trait::{CloneStableDeref, StableDeref};
use std::ops::Deref;
use std::sync::Arc;
/// Read object that represents files in tantivy.
///
@@ -10,10 +11,12 @@ use std::sync::Arc;
/// the data in the form of a constant read-only `&[u8]`.
/// Whatever happens to the directory file, the data
/// hold by this object should never be altered or destroyed.
pub struct ReadOnlySource {
data: Arc<Box<Deref<Target=[u8]> + Send + Sync + 'static>>,
start: usize,
stop: usize
pub enum ReadOnlySource {
/// Mmap source of data
#[cfg(feature = "mmap")]
Mmap(MmapReadOnly),
/// Wrapping a `Vec<u8>`
Anonymous(SharedVecSlice),
}
unsafe impl StableDeref for ReadOnlySource {}
@@ -27,41 +30,19 @@ impl Deref for ReadOnlySource {
}
}
impl From<Arc<Box<Deref<Target=[u8]> + Send + Sync>>> for ReadOnlySource {
fn from(data: Arc<Box<Deref<Target=[u8]> + Send + Sync>>) -> Self {
let len = data.len();
ReadOnlySource {
data,
start: 0,
stop: len
}
}
}
const EMPTY_ARRAY: [u8; 0] = [0u8; 0];
impl ReadOnlySource {
/// Creates a new `ReadOnlySource`.
pub fn new<D>(data: D) -> ReadOnlySource
where D: Deref<Target=[u8]> + Send + Sync + 'static {
let len = data.len();
ReadOnlySource {
data: Arc::new(Box::new(data)),
start: 0,
stop: len
}
}
/// Creates an empty ReadOnlySource
pub fn empty() -> ReadOnlySource {
ReadOnlySource::new(&EMPTY_ARRAY[..])
ReadOnlySource::Anonymous(SharedVecSlice::empty())
}
/// Returns the data underlying the ReadOnlySource object.
pub fn as_slice(&self) -> &[u8] {
&self.data[self.start..self.stop]
match *self {
#[cfg(feature = "mmap")]
ReadOnlySource::Mmap(ref mmap_read_only) => mmap_read_only.as_slice(),
ReadOnlySource::Anonymous(ref shared_vec) => shared_vec.as_slice(),
}
}
/// Splits into 2 `ReadOnlySource`, at the offset given
@@ -82,18 +63,22 @@ impl ReadOnlySource {
/// worth of data in anonymous memory, and only a
/// 1KB slice is remaining, the whole `500MBs`
/// are retained in memory.
pub fn slice(&self, start: usize, stop: usize) -> ReadOnlySource {
pub fn slice(&self, from_offset: usize, to_offset: usize) -> ReadOnlySource {
assert!(
start <= stop,
from_offset <= to_offset,
"Requested negative slice [{}..{}]",
start,
stop
from_offset,
to_offset
);
assert!(stop <= self.len());
ReadOnlySource {
data: self.data.clone(),
start: self.start + start,
stop: self.start + stop
match *self {
#[cfg(feature = "mmap")]
ReadOnlySource::Mmap(ref mmap_read_only) => {
let sliced_mmap = mmap_read_only.range(from_offset, to_offset - from_offset);
ReadOnlySource::Mmap(sliced_mmap)
}
ReadOnlySource::Anonymous(ref shared_vec) => {
ReadOnlySource::Anonymous(shared_vec.slice(from_offset, to_offset))
}
}
}
@@ -102,7 +87,8 @@ impl ReadOnlySource {
///
/// Equivalent to `.slice(from_offset, self.len())`
pub fn slice_from(&self, from_offset: usize) -> ReadOnlySource {
self.slice(from_offset, self.len())
let len = self.len();
self.slice(from_offset, len)
}
/// Like `.slice(...)` but enforcing only the `to`
@@ -116,18 +102,19 @@ impl ReadOnlySource {
impl HasLen for ReadOnlySource {
fn len(&self) -> usize {
self.stop - self.start
self.as_slice().len()
}
}
impl Clone for ReadOnlySource {
fn clone(&self) -> Self {
self.slice_from(0)
self.slice(0, self.len())
}
}
impl From<Vec<u8>> for ReadOnlySource {
fn from(data: Vec<u8>) -> ReadOnlySource {
ReadOnlySource::new(data)
let shared_data = SharedVecSlice::from(data);
ReadOnlySource::Anonymous(shared_data)
}
}
}

View File

@@ -0,0 +1,41 @@
use std::sync::Arc;
#[derive(Clone)]
pub struct SharedVecSlice {
pub data: Arc<Vec<u8>>,
pub start: usize,
pub len: usize,
}
impl SharedVecSlice {
pub fn empty() -> SharedVecSlice {
SharedVecSlice::new(Arc::new(Vec::new()))
}
pub fn new(data: Arc<Vec<u8>>) -> SharedVecSlice {
let data_len = data.len();
SharedVecSlice {
data,
start: 0,
len: data_len,
}
}
pub fn as_slice(&self) -> &[u8] {
&self.data[self.start..self.start + self.len]
}
pub fn slice(&self, from_offset: usize, to_offset: usize) -> SharedVecSlice {
SharedVecSlice {
data: Arc::clone(&self.data),
start: self.start + from_offset,
len: to_offset - from_offset,
}
}
}
impl From<Vec<u8>> for SharedVecSlice {
fn from(data: Vec<u8>) -> SharedVecSlice {
SharedVecSlice::new(Arc::new(data))
}
}

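Since `SharedVecSlice` is added in full above, a short usage sketch in the same test style as the rest of the crate: slicing only adjusts `(start, len)` and clones the `Arc`, so no bytes are copied and the backing `Vec<u8>` stays alive as long as any slice does.

#[test]
fn shared_vec_slice_is_zero_copy() {
    let source = SharedVecSlice::from(vec![1u8, 2, 3, 4, 5]);
    let middle = source.slice(1, 4);
    assert_eq!(middle.as_slice(), &[2u8, 3, 4][..]);
    // `middle` and `source` share the same Arc<Vec<u8>>; nothing was copied.
    assert_eq!(source.as_slice().len(), 5);
}
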
View File

@@ -8,42 +8,9 @@ use indexer::LockType;
use query;
use schema;
use serde_json;
use std::fmt;
use std::path::PathBuf;
use std::sync::PoisonError;
pub struct DataCorruption {
filepath: Option<PathBuf>,
comment: String,
}
impl DataCorruption {
pub fn new(filepath: PathBuf, comment: String) -> DataCorruption {
DataCorruption {
filepath: Some(filepath),
comment,
}
}
pub fn comment_only(comment: String) -> DataCorruption {
DataCorruption {
filepath: None,
comment,
}
}
}
impl fmt::Debug for DataCorruption {
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
write!(f, "Data corruption: ")?;
if let Some(ref filepath) = &self.filepath {
write!(f, "(in file `{:?}`)", filepath)?;
}
write!(f, ": {}.", self.comment)?;
Ok(())
}
}
/// The library's failure based error enum
#[derive(Debug, Fail)]
pub enum TantivyError {
@@ -66,8 +33,8 @@ pub enum TantivyError {
#[fail(display = "An IO error occurred: '{}'", _0)]
IOError(#[cause] IOError),
/// Data corruption.
#[fail(display = "{:?}", _0)]
DataCorruption(DataCorruption),
#[fail(display = "File contains corrupted data: '{:?}'", _0)]
CorruptedFile(PathBuf),
/// A thread holding the locked panicked and poisoned the lock.
#[fail(display = "A thread holding the locked panicked and poisoned the lock")]
Poisoned,
@@ -88,12 +55,6 @@ pub enum TantivyError {
SystemError(String),
}
impl From<DataCorruption> for TantivyError {
fn from(data_corruption: DataCorruption) -> TantivyError {
TantivyError::DataCorruption(data_corruption)
}
}
impl From<FastFieldNotAvailableError> for TantivyError {
fn from(fastfield_error: FastFieldNotAvailableError) -> TantivyError {
TantivyError::FastFieldError(fastfield_error)

View File

@@ -1,6 +1,5 @@
use super::MultiValueIntFastFieldReader;
use schema::Facet;
use std::str;
use termdict::TermDictionary;
use termdict::TermOrdinal;
use DocId;
@@ -21,7 +20,6 @@ use DocId;
pub struct FacetReader {
term_ords: MultiValueIntFastFieldReader<u64>,
term_dict: TermDictionary,
buffer: Vec<u8>,
}
impl FacetReader {
@@ -39,7 +37,6 @@ impl FacetReader {
FacetReader {
term_ords,
term_dict,
buffer: vec![],
}
}
@@ -58,18 +55,11 @@ impl FacetReader {
}
/// Given a term ordinal returns the term associated to it.
pub fn facet_from_ord(
&mut self,
facet_ord: TermOrdinal,
output: &mut Facet,
) -> Result<(), str::Utf8Error> {
pub fn facet_from_ord(&self, facet_ord: TermOrdinal, output: &mut Facet) {
let found_term = self
.term_dict
.ord_to_term(facet_ord as u64, &mut self.buffer);
.ord_to_term(facet_ord as u64, output.inner_buffer_mut());
assert!(found_term, "Term ordinal {} no found.", facet_ord);
let facet_str = str::from_utf8(&self.buffer[..])?;
output.set_facet_str(facet_str);
Ok(())
}
/// Return the list of facet ordinals associated to a document.

View File

@@ -82,20 +82,20 @@ mod tests {
let mut facet = Facet::root();
{
facet_reader.facet_from_ord(1, &mut facet).unwrap();
facet_reader.facet_from_ord(1, &mut facet);
assert_eq!(facet, Facet::from("/category"));
}
{
facet_reader.facet_from_ord(2, &mut facet).unwrap();
facet_reader.facet_from_ord(2, &mut facet);
assert_eq!(facet, Facet::from("/category/cat1"));
}
{
facet_reader.facet_from_ord(3, &mut facet).unwrap();
facet_reader.facet_from_ord(3, &mut facet);
assert_eq!(format!("{}", facet), "/category/cat2");
assert_eq!(facet, Facet::from("/category/cat2"));
}
{
facet_reader.facet_from_ord(4, &mut facet).unwrap();
facet_reader.facet_from_ord(4, &mut facet);
assert_eq!(facet, Facet::from("/category/cat3"));
}

View File

@@ -191,7 +191,10 @@ impl DeleteCursor {
}
}
#[cfg_attr(feature = "cargo-clippy", allow(clippy::wrong_self_convention))]
#[cfg_attr(
feature = "cargo-clippy",
allow(clippy::wrong_self_convention)
)]
fn is_behind_opstamp(&mut self, target_opstamp: u64) -> bool {
self.get()
.map(|operation| operation.opstamp < target_opstamp)

View File

@@ -61,8 +61,7 @@ fn initial_table_size(per_thread_memory_budget: usize) -> usize {
"Per thread memory is too small: {}",
per_thread_memory_budget
)
})
.min(19) // we cap it at 512K
}).min(19) // we cap it at 512K
}
/// `IndexWriter` is the user entry-point to add document to an index.
@@ -140,7 +139,7 @@ pub fn open_index_writer(
let stamper = Stamper::new(current_opstamp);
let segment_updater =
SegmentUpdater::create(index.clone(), stamper.clone(), &delete_queue.cursor())?;
SegmentUpdater::new(index.clone(), stamper.clone(), &delete_queue.cursor())?;
let mut index_writer = IndexWriter {
_directory_lock: Some(directory_lock),
@@ -391,8 +390,7 @@ impl IndexWriter {
.name(format!(
"thrd-tantivy-index{}-gen{}",
self.worker_id, generation
))
.spawn(move || {
)).spawn(move || {
loop {
let mut document_iterator =
document_receiver_clone.clone().into_iter().peekable();
@@ -467,8 +465,10 @@ impl IndexWriter {
///
/// Returns the former segment_ready channel.
fn recreate_document_channel(&mut self) -> DocumentReceiver {
let (mut document_sender, mut document_receiver): (DocumentSender, DocumentReceiver) =
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
let (mut document_sender, mut document_receiver): (
DocumentSender,
DocumentReceiver,
) = channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
swap(&mut self.document_sender, &mut document_sender);
swap(&mut self.document_receiver, &mut document_receiver);
document_receiver
@@ -558,8 +558,11 @@ impl IndexWriter {
// and recreate a new one channels.
self.recreate_document_channel();
let former_workers_join_handle =
mem::replace(&mut self.workers_join_handle, Vec::new());
let mut former_workers_join_handle = Vec::new();
swap(
&mut former_workers_join_handle,
&mut self.workers_join_handle,
);
for worker_handle in former_workers_join_handle {
let indexing_worker_result = worker_handle
@@ -736,7 +739,7 @@ mod tests {
index_writer.add_document(doc!(text_field=>"b"));
index_writer.add_document(doc!(text_field=>"c"));
}
assert_eq!(index_writer.commit().unwrap(), 3u64);
assert_eq!(index_writer.commit().unwrap(), 2u64);
index.load_searchers().unwrap();
assert_eq!(num_docs_containing("a"), 0);
assert_eq!(num_docs_containing("b"), 1);
@@ -799,6 +802,7 @@ mod tests {
{
let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
prepared_commit.set_payload("first commit");
assert_eq!(prepared_commit.opstamp(), 100);
prepared_commit.commit().expect("commit failed");
}
{
@@ -832,6 +836,7 @@ mod tests {
{
let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
prepared_commit.set_payload("first commit");
assert_eq!(prepared_commit.opstamp(), 100);
prepared_commit.abort().expect("commit failed");
}
{

View File

@@ -40,15 +40,13 @@ fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
total_tokens += reader.inverted_index(field).total_num_tokens();
}
}
total_tokens
+ count
.iter()
.cloned()
.enumerate()
.map(|(fieldnorm_ord, count)| {
count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8))
})
.sum::<u64>()
total_tokens + count
.iter()
.cloned()
.enumerate()
.map(|(fieldnorm_ord, count)| {
count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8))
}).sum::<u64>()
}
pub struct IndexMerger {
@@ -525,8 +523,7 @@ impl IndexMerger {
}
}
None
})
.collect();
}).collect();
// At this point, `segment_postings` contains the posting list
// of all of the segments containing the given term.
@@ -654,7 +651,6 @@ mod tests {
use schema::IntOptions;
use schema::Term;
use schema::TextFieldIndexing;
use schema::INT_INDEXED;
use std::io::Cursor;
use DocAddress;
use IndexWriter;
@@ -668,8 +664,7 @@ mod tests {
TextFieldIndexing::default()
.set_tokenizer("default")
.set_index_option(IndexRecordOption::WithFreqs),
)
.set_stored();
).set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue);
let score_field = schema_builder.add_u64_field("score", score_fieldtype);
@@ -808,8 +803,7 @@ mod tests {
.search(
&query,
&BytesFastFieldTestCollector::for_field(bytes_score_field),
)
.expect("failed to search")
).expect("failed to search")
};
assert_eq!(
get_fast_vals(vec![Term::from_field_text(text_field, "a")]),
@@ -829,8 +823,7 @@ mod tests {
let text_fieldtype = schema::TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
)
.set_stored();
).set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue);
let score_field = schema_builder.add_u64_field("score", score_fieldtype);
@@ -858,21 +851,21 @@ mod tests {
{
// a first commit
index_writer.add_document(doc!(
text_field => "a b d",
score_field => 1u64,
bytes_score_field => vec![0u8, 0, 0, 1],
));
text_field => "a b d",
score_field => 1u64,
bytes_score_field => vec![0u8, 0, 0, 1],
));
index_writer.add_document(doc!(
text_field => "b c",
score_field => 2u64,
bytes_score_field => vec![0u8, 0, 0, 2],
));
text_field => "b c",
score_field => 2u64,
bytes_score_field => vec![0u8, 0, 0, 2],
));
index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.add_document(doc!(
text_field => "c d",
score_field => 3u64,
bytes_score_field => vec![0u8, 0, 0, 3],
));
text_field => "c d",
score_field => 3u64,
bytes_score_field => vec![0u8, 0, 0, 3],
));
index_writer.commit().expect("committed");
index.load_searchers().unwrap();
let ref searcher = *index.searcher();
@@ -899,27 +892,27 @@ mod tests {
{
// a second commit
index_writer.add_document(doc!(
text_field => "a d e",
score_field => 4_000u64,
bytes_score_field => vec![0u8, 0, 0, 4],
));
text_field => "a d e",
score_field => 4_000u64,
bytes_score_field => vec![0u8, 0, 0, 4],
));
index_writer.add_document(doc!(
text_field => "e f",
score_field => 5_000u64,
bytes_score_field => vec![0u8, 0, 0, 5],
));
text_field => "e f",
score_field => 5_000u64,
bytes_score_field => vec![0u8, 0, 0, 5],
));
index_writer.delete_term(Term::from_field_text(text_field, "a"));
index_writer.delete_term(Term::from_field_text(text_field, "f"));
index_writer.add_document(doc!(
text_field => "f g",
score_field => 6_000u64,
bytes_score_field => vec![0u8, 0, 23, 112],
));
text_field => "f g",
score_field => 6_000u64,
bytes_score_field => vec![0u8, 0, 23, 112],
));
index_writer.add_document(doc!(
text_field => "g h",
score_field => 7_000u64,
bytes_score_field => vec![0u8, 0, 27, 88],
));
text_field => "g h",
score_field => 7_000u64,
bytes_score_field => vec![0u8, 0, 27, 88],
));
index_writer.commit().expect("committed");
index.load_searchers().unwrap();
let searcher = index.searcher();
@@ -984,7 +977,7 @@ mod tests {
.wait()
.expect("Merging failed");
index.load_searchers().unwrap();
let searcher = index.searcher();
let ref searcher = *index.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.num_docs(), 3);
assert_eq!(searcher.segment_readers()[0].num_docs(), 3);
@@ -1030,7 +1023,7 @@ mod tests {
index_writer.commit().unwrap();
index.load_searchers().unwrap();
let searcher = index.searcher();
let ref searcher = *index.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
@@ -1126,7 +1119,6 @@ mod tests {
{
// Test removing all docs
index_writer.delete_term(Term::from_field_text(text_field, "g"));
index_writer.commit().unwrap();
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
@@ -1257,34 +1249,6 @@ mod tests {
}
}
#[test]
fn test_bug_merge() {
let mut schema_builder = schema::Schema::builder();
let int_field = schema_builder.add_u64_field("intvals", INT_INDEXED);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(int_field => 1u64));
index_writer.commit().expect("commit failed");
index_writer.add_document(doc!(int_field => 1u64));
index_writer.commit().expect("commit failed");
index.load_searchers().unwrap();
let searcher = index.searcher();
assert_eq!(searcher.num_docs(), 2);
index_writer.delete_term(Term::from_field_u64(int_field, 1));
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
index_writer
.merge(&segment_ids)
.expect("Failed to initiate merge")
.wait()
.expect("Merging failed");
index.load_searchers().unwrap();
// commit has not been called yet. The document should still be
// there.
assert_eq!(index.searcher().num_docs(), 2);
}
#[test]
fn test_merge_multivalued_int_fields_all_deleted() {
let mut schema_builder = schema::Schema::builder();

View File
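`compute_total_num_tokens` above folds a histogram of fieldnorm ids into a token count by weighting each bucket with the value that id decodes to. A small sketch of that weighted sum; `decode` stands in for `FieldNormReader::id_to_fieldnorm`:

fn total_tokens(histogram: &[u32], decode: impl Fn(u8) -> u64) -> u64 {
    histogram
        .iter()
        .cloned()
        .enumerate()
        .map(|(id, count)| u64::from(count) * decode(id as u8))
        .sum()
}

fn main() {
    // Three entries with fieldnorm id 2 and one with id 3, under a toy decode table.
    let histogram = [0u32, 0, 3, 1];
    let decode = |id: u8| [0u64, 1, 2, 4][id as usize];
    assert_eq!(total_tokens(&histogram, decode), 3 * 2 + 1 * 4);
}
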

@@ -18,6 +18,7 @@ use indexer::delete_queue::DeleteCursor;
use indexer::index_writer::advance_deletes;
use indexer::merger::IndexMerger;
use indexer::stamper::Stamper;
use indexer::MergeCandidate;
use indexer::SegmentEntry;
use indexer::SegmentSerializer;
use indexer::{DefaultMergePolicy, MergePolicy};
@@ -44,15 +45,8 @@ use Result;
/// and flushed.
///
/// This method is not part of tantivy's public API
pub fn save_new_metas(schema: Schema, directory: &mut Directory) -> Result<()> {
save_metas(
&IndexMeta {
segments: Vec::new(),
schema,
opstamp: 0u64,
payload: None
},
directory)
pub fn save_new_metas(schema: Schema, opstamp: u64, directory: &mut Directory) -> Result<()> {
save_metas(vec![], schema, opstamp, None, directory)
}
/// Save the index meta file.
@@ -64,17 +58,20 @@ pub fn save_new_metas(schema: Schema, directory: &mut Directory) -> Result<()> {
/// and flushed.
///
/// This method is not part of tantivy's public API
fn save_metas(
metas: &IndexMeta,
pub fn save_metas(
segment_metas: Vec<SegmentMeta>,
schema: Schema,
opstamp: u64,
payload: Option<String>,
directory: &mut Directory,
) -> Result<()> {
// let metas = IndexMeta {
// segments: segment_metas,
// schema,
// opstamp,
// payload,
// };
let mut buffer = serde_json::to_vec_pretty(metas)?;
let metas = IndexMeta {
segments: segment_metas,
schema,
opstamp,
payload,
};
let mut buffer = serde_json::to_vec_pretty(&metas)?;
writeln!(&mut buffer)?;
directory.atomic_write(&META_FILEPATH, &buffer[..])?;
debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas));
@@ -89,11 +86,6 @@ fn save_metas(
#[derive(Clone)]
pub struct SegmentUpdater(Arc<InnerSegmentUpdater>);
struct MergeOperation {
pub target_opstamp: u64,
pub segment_ids: Vec<SegmentId>,
}
fn perform_merge(
index: &Index,
mut segment_entries: Vec<SegmentEntry>,
@@ -134,13 +126,6 @@ fn perform_merge(
}
struct InnerSegmentUpdater {
// we keep a copy of the current active IndexMeta to
// avoid loading the file everytime we need it in the
// `SegmentUpdater`.
//
// This should be up to date as all update happen through
// the unique active `SegmentUpdater`.
active_metas: RwLock<Arc<IndexMeta>>,
pool: CpuPool,
index: Index,
segment_manager: SegmentManager,
@@ -153,7 +138,7 @@ struct InnerSegmentUpdater {
}
impl SegmentUpdater {
pub fn create(
pub fn new(
index: Index,
stamper: Stamper,
delete_cursor: &DeleteCursor,
@@ -164,9 +149,7 @@ impl SegmentUpdater {
.name_prefix("segment_updater")
.pool_size(1)
.create();
let index_meta = index.load_metas()?;
Ok(SegmentUpdater(Arc::new(InnerSegmentUpdater {
active_metas: RwLock::new(Arc::new(index_meta)),
pool,
index,
segment_manager,
@@ -212,8 +195,7 @@ impl SegmentUpdater {
segment_updater.0.segment_manager.add_segment(segment_entry);
segment_updater.consider_merge_options();
true
})
.forget();
}).forget();
true
} else {
false
@@ -261,26 +243,20 @@ impl SegmentUpdater {
//
// Segment 1 from disk 1, Segment 1 from disk 2, etc.
commited_segment_metas.sort_by_key(|segment_meta| -(segment_meta.max_doc() as i32));
let index_meta = IndexMeta {
segments: commited_segment_metas,
schema: index.schema(),
opstamp,
payload: commit_message
};
save_metas(
&index_meta,
commited_segment_metas,
index.schema(),
opstamp,
commit_message,
directory.box_clone().borrow_mut(),
)
.expect("Could not save metas.");
self.store_meta(&index_meta);
).expect("Could not save metas.");
}
}
pub fn garbage_collect_files(&self) -> Result<()> {
self.run_async(move |segment_updater| {
segment_updater.garbage_collect_files_exec();
})
.wait()
}).wait()
}
fn garbage_collect_files_exec(&self) {
@@ -302,32 +278,19 @@ impl SegmentUpdater {
segment_updater.garbage_collect_files_exec();
segment_updater.consider_merge_options();
}
})
.wait()
}).wait()
}
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> {
//let future_merged_segment = */
let segment_ids_vec = segment_ids.to_vec();
let commit_opstamp = self.load_metas().opstamp;
self.run_async(move |segment_updater| {
segment_updater.start_merge_impl(&segment_ids_vec[..], commit_opstamp)
})
.wait()?
}
fn store_meta(&self, index_meta: &IndexMeta) {
*self.0.active_metas.write().unwrap() = Arc::new(index_meta.clone());
}
fn load_metas(&self) -> Arc<IndexMeta> {
self.0.active_metas.read().unwrap().clone()
segment_updater.start_merge_impl(&segment_ids_vec[..])
}).wait()?
}
// `segment_ids` is required to be non-empty.
fn start_merge_impl(
&self,
segment_ids: &[SegmentId],
target_opstamp: u64,
) -> Result<Receiver<SegmentMeta>> {
fn start_merge_impl(&self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> {
assert!(!segment_ids.is_empty(), "Segment_ids cannot be empty.");
let segment_updater_clone = self.clone();
@@ -342,6 +305,8 @@ impl SegmentUpdater {
);
let (merging_future_send, merging_future_recv) = oneshot();
let target_opstamp = self.0.stamper.stamp();
// first we need to apply deletes to our segment.
let merging_join_handle = thread::Builder::new()
.name(format!("mergingthread-{}", merging_thread_id))
@@ -387,8 +352,7 @@ impl SegmentUpdater {
.unwrap()
.remove(&merging_thread_id);
Ok(())
})
.expect("Failed to spawn a thread.");
}).expect("Failed to spawn a thread.");
self.0
.merging_threads
.write()
@@ -403,32 +367,11 @@ impl SegmentUpdater {
// Committed segments cannot be merged with uncommitted_segments.
// We therefore consider merges using these two sets of segments independently.
let merge_policy = self.get_merge_policy();
let current_opstamp = self.0.stamper.stamp();
let mut merge_candidates = merge_policy
.compute_merge_candidates(&uncommitted_segments)
.into_iter()
.map(|merge_candidate| MergeOperation {
target_opstamp: current_opstamp,
segment_ids: merge_candidate.0,
})
.collect::<Vec<_>>();
let commit_opstamp = self.load_metas().opstamp;
let committed_merge_candidates = merge_policy
.compute_merge_candidates(&committed_segments)
.into_iter()
.map(|merge_candidate| MergeOperation {
target_opstamp: commit_opstamp,
segment_ids: merge_candidate.0,
})
.collect::<Vec<_>>();
merge_candidates.extend(committed_merge_candidates.into_iter());
for MergeOperation {
target_opstamp,
segment_ids,
} in merge_candidates
{
match self.start_merge_impl(&segment_ids, target_opstamp) {
let mut merge_candidates = merge_policy.compute_merge_candidates(&uncommitted_segments);
let committed_merge_candidates = merge_policy.compute_merge_candidates(&committed_segments);
merge_candidates.extend_from_slice(&committed_merge_candidates[..]);
for MergeCandidate(segment_metas) in merge_candidates {
match self.start_merge_impl(&segment_metas) {
Ok(merge_future) => {
if let Err(e) = merge_future.fuse().poll() {
error!("The merge task failed quickly after starting: {:?}", e);
@@ -463,7 +406,12 @@ impl SegmentUpdater {
info!("End merge {:?}", after_merge_segment_entry.meta());
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
if let Some(delete_operation) = delete_cursor.get() {
let committed_opstamp = segment_updater.load_metas().opstamp;
let committed_opstamp = segment_updater
.0
.index
.load_metas()
.expect("Failed to read opstamp")
.opstamp;
if delete_operation.opstamp < committed_opstamp {
let index = &segment_updater.0.index;
let segment = index.segment(after_merge_segment_entry.meta().clone());
@@ -492,11 +440,10 @@ impl SegmentUpdater {
.end_merge(&before_merge_segment_ids, after_merge_segment_entry);
segment_updater.consider_merge_options();
info!("save metas");
let previous_metas = segment_updater.load_metas();
segment_updater.save_metas(previous_metas.opstamp, previous_metas.payload.clone());
let previous_metas = segment_updater.0.index.load_metas().unwrap();
segment_updater.save_metas(previous_metas.opstamp, previous_metas.payload);
segment_updater.garbage_collect_files_exec();
})
.wait()
}).wait()
}
/// Wait for current merging threads.

View File

@@ -62,8 +62,7 @@ impl SegmentWriter {
segment.index().tokenizers().get(tokenizer_name)
}),
_ => None,
})
.collect();
}).collect();
Ok(SegmentWriter {
max_doc: 0,
multifield_postings,
@@ -111,18 +110,18 @@ impl SegmentWriter {
}
match *field_options.field_type() {
FieldType::HierarchicalFacet => {
let facets: Vec<&str> = field_values
let facets: Vec<&[u8]> = field_values
.iter()
.flat_map(|field_value| match *field_value.value() {
Value::Facet(ref facet) => Some(facet.encoded_str()),
Value::Facet(ref facet) => Some(facet.encoded_bytes()),
_ => {
panic!("Expected hierarchical facet");
}
})
.collect();
}).collect();
let mut term = Term::for_field(field); // we set the Term
for fake_str in facets {
for facet_bytes in facets {
let mut unordered_term_id_opt = None;
let fake_str = unsafe { str::from_utf8_unchecked(facet_bytes) };
FacetTokenizer.token_stream(fake_str).process(&mut |token| {
term.set_text(&token.text);
let unordered_term_id =
@@ -146,8 +145,7 @@ impl SegmentWriter {
.flat_map(|field_value| match *field_value.value() {
Value::Str(ref text) => Some(text.as_str()),
_ => None,
})
.collect();
}).collect();
if texts.is_empty() {
0
} else {

View File

@@ -1,68 +1,50 @@
use std::sync::Arc;
use std::sync::atomic::Ordering;
// AtomicU64 has not landed in stable yet.
// For the moment, let's just use AtomicUsize on
// 64-bit x86 platforms, and a mutex on other platforms.
#[cfg(target_arch = "x86_64")]
#[cfg(target = "x86_64")]
mod archicture_impl {
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
#[derive(Default)]
pub struct AtomicU64Ersatz(AtomicUsize);
#[derive(Clone, Default)]
pub struct Stamper(Arc<AtomicU64>);
impl AtomicU64Ersatz {
pub fn new(first_opstamp: u64) -> AtomicU64Ersatz {
AtomicU64Ersatz(AtomicUsize::new(first_opstamp as usize))
impl Stamper {
pub fn new(first_opstamp: u64) -> Stamper {
Stamper(Arc::new(AtomicU64::new(first_opstamp)))
}
pub fn fetch_add(&self, val: u64, order: Ordering) -> u64 {
self.0.fetch_add(val as usize, order) as u64
pub fn stamp(&self) -> u64 {
self.0.fetch_add(1u64, Ordering::SeqCst) as u64
}
}
}
#[cfg(not(target_arch = "x86_64"))]
#[cfg(not(target = "x86_64"))]
mod archicture_impl {
use std::sync::atomic::Ordering;
/// On other architectures, we rely on a mutex.
use std::sync::RwLock;
use std::sync::{Arc, Mutex};
#[derive(Default)]
pub struct AtomicU64Ersatz(RwLock<u64>);
#[derive(Clone, Default)]
pub struct Stamper(Arc<Mutex<u64>>);
impl AtomicU64Ersatz {
pub fn new(first_opstamp: u64) -> AtomicU64Ersatz {
AtomicU64Ersatz(RwLock::new(first_opstamp))
impl Stamper {
pub fn new(first_opstamp: u64) -> Stamper {
Stamper(Arc::new(Mutex::new(first_opstamp)))
}
pub fn fetch_add(&self, incr: u64, _order: Ordering) -> u64 {
let mut lock = self.0.write().unwrap();
let previous_val = *lock;
*lock = previous_val + incr;
pub fn stamp(&self) -> u64 {
let mut guard = self.0.lock().expect("Failed to lock the stamper");
let previous_val = *guard;
*guard = previous_val + 1;
previous_val
}
}
}
use self::archicture_impl::AtomicU64Ersatz;
#[derive(Clone, Default)]
pub struct Stamper(Arc<AtomicU64Ersatz>);
impl Stamper {
pub fn new(first_opstamp: u64) -> Stamper {
Stamper(Arc::new(AtomicU64Ersatz::new(first_opstamp)))
}
pub fn stamp(&self) -> u64 {
self.0.fetch_add(1u64, Ordering::SeqCst) as u64
}
}
pub use self::archicture_impl::Stamper;
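For illustration, here is a minimal, self-contained sketch of the stamping pattern above. It uses `AtomicUsize` so that it compiles on stable as-is; it is not the exact type exported by this module.

```
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;

// Each call to `stamp()` returns the previous value and increments the
// shared counter, so concurrent callers always receive distinct,
// monotonically increasing opstamps.
#[derive(Clone, Default)]
pub struct Stamper(Arc<AtomicUsize>);

impl Stamper {
    pub fn new(first_opstamp: u64) -> Stamper {
        Stamper(Arc::new(AtomicUsize::new(first_opstamp as usize)))
    }
    pub fn stamp(&self) -> u64 {
        self.0.fetch_add(1, Ordering::SeqCst) as u64
    }
}

fn main() {
    let stamper = Stamper::new(10);
    assert_eq!(stamper.stamp(), 10);
    assert_eq!(stamper.stamp(), 11);
    // Clones share the same counter.
    assert_eq!(stamper.clone().stamp(), 12);
}
```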
#[cfg(test)]
mod test {

View File

@@ -1,5 +1,6 @@
#![doc(html_logo_url = "http://fulmicoton.com/tantivy-logo/tantivy-logo.png")]
#![cfg_attr(all(feature = "unstable", test), feature(test))]
#![cfg_attr(feature = "cargo-clippy", feature(tool_lints))]
#![cfg_attr(feature = "cargo-clippy", allow(clippy::module_inception))]
#![doc(test(attr(allow(unused_variables), deny(warnings))))]
#![warn(missing_docs)]
@@ -123,8 +124,6 @@ extern crate log;
#[macro_use]
extern crate failure;
#[cfg(feature = "mmap")]
extern crate memmap;
#[cfg(feature = "mmap")]
extern crate atomicwrites;
extern crate base64;
@@ -137,7 +136,8 @@ extern crate combine;
extern crate crossbeam;
extern crate fnv;
extern crate tantivy_fst;
extern crate fst;
extern crate fst_regex;
extern crate futures;
extern crate futures_cpupool;
extern crate htmlescape;
@@ -184,7 +184,10 @@ mod macros;
pub use error::TantivyError;
#[deprecated(since = "0.7.0", note = "please use `tantivy::TantivyError` instead")]
#[deprecated(
since = "0.7.0",
note = "please use `tantivy::TantivyError` instead"
)]
pub use error::TantivyError as Error;
extern crate census;
@@ -214,7 +217,7 @@ pub mod store;
pub mod termdict;
mod snippet;
pub use self::snippet::{Snippet, SnippetGenerator};
pub use self::snippet::{SnippetGenerator, Snippet};
mod docset;
pub use self::docset::{DocSet, SkipResult};
@@ -511,9 +514,11 @@ mod tests {
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(text_field);
assert!(inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none());
assert!(
inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none()
);
{
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
@@ -548,9 +553,11 @@ mod tests {
let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(term_abcd.field());
assert!(inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none());
assert!(
inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none()
);
{
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
@@ -584,9 +591,11 @@ mod tests {
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(term_abcd.field());
assert!(inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none());
assert!(
inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none()
);
{
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
@@ -734,9 +743,11 @@ mod tests {
let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(text_field);
let term_abcd = Term::from_field_text(text_field, "abcd");
assert!(inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none());
assert!(
inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none()
);
let term_af = Term::from_field_text(text_field, "af");
let mut postings = inverted_index
.read_postings(&term_af, IndexRecordOption::WithFreqsAndPositions)

View File

@@ -77,10 +77,10 @@ mod test {
let likes = schema_builder.add_u64_field("num_u64", FAST);
let _schema = schema_builder.build();
let _doc = doc!(
title => "Life Aquatic",
author => "Wes Anderson",
likes => 4u64
);
title => "Life Aquatic",
author => "Wes Anderson",
likes => 4u64
);
}
#[test]
@@ -91,9 +91,9 @@ mod test {
let likes = schema_builder.add_u64_field("num_u64", FAST);
let _schema = schema_builder.build();
let _doc = doc!(
title => "Life Aquatic",
author => "Wes Anderson",
likes => 4u64,
);
title => "Life Aquatic",
author => "Wes Anderson",
likes => 4u64,
);
}
}

View File

@@ -221,10 +221,12 @@ pub mod tests {
}
{
let term_a = Term::from_field_text(text_field, "abcdef");
assert!(segment_reader
.inverted_index(term_a.field())
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.is_none());
assert!(
segment_reader
.inverted_index(term_a.field())
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.is_none()
);
}
{
let term_a = Term::from_field_text(text_field, "a");

View File

@@ -29,8 +29,7 @@ fn posting_from_field_entry(field_entry: &FieldEntry) -> Box<PostingsWriter> {
IndexRecordOption::WithFreqsAndPositions => {
SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed()
}
})
.unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
}).unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
FieldType::U64(_) | FieldType::I64(_) | FieldType::HierarchicalFacet => {
SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
}
@@ -108,8 +107,10 @@ impl MultiFieldPostingsWriter {
.map(|(key, _, _)| Term::wrap(key).field())
.enumerate();
let mut unordered_term_mappings: HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>> =
HashMap::new();
let mut unordered_term_mappings: HashMap<
Field,
HashMap<UnorderedTermId, TermOrdinal>,
> = HashMap::new();
let mut prev_field = Field(u32::max_value());
for (offset, field) in term_offsets_it {
@@ -137,8 +138,7 @@ impl MultiFieldPostingsWriter {
.enumerate()
.map(|(term_ord, unord_term_id)| {
(unord_term_id as UnorderedTermId, term_ord as TermOrdinal)
})
.collect();
}).collect();
unordered_term_mappings.insert(field, mapping);
}
FieldType::U64(_) | FieldType::I64(_) => {}

View File

@@ -2,7 +2,7 @@ use common::BitSet;
use common::HasLen;
use common::{BinarySerializable, VInt};
use docset::{DocSet, SkipResult};
use tantivy_fst::Streamer;
use fst::Streamer;
use owned_read::OwnedRead;
use positions::PositionReader;
use postings::compression::compressed_block_size;
@@ -126,6 +126,7 @@ impl SegmentPostings {
fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) {
let mut start = 0;
let end = arr.len();
debug_assert!(target >= arr[start]);
debug_assert!(target <= arr[end - 1]);
let mut jump = 1;
loop {
@@ -215,10 +216,11 @@ impl DocSet for SegmentPostings {
// we're in the right block now, start with an exponential search
let block_docs = self.block_cursor.docs();
debug_assert!(target >= self.doc());
let new_cur = self
.cur
.wrapping_add(search_within_block(&block_docs[self.cur..], target));
if need_positions {
sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur]
.iter()
@@ -531,8 +533,7 @@ impl BlockSegmentPostings {
} else {
BlockSegmentPostingsSkipResult::Terminated
}
})
.unwrap_or(BlockSegmentPostingsSkipResult::Terminated);
}).unwrap_or(BlockSegmentPostingsSkipResult::Terminated);
}
BlockSegmentPostingsSkipResult::Terminated
}
@@ -620,7 +621,6 @@ impl<'b> Streamer<'b> for BlockSegmentPostings {
#[cfg(test)]
mod tests {
use super::exponential_search;
use super::search_within_block;
use super::BlockSegmentPostings;
use super::BlockSegmentPostingsSkipResult;
@@ -628,13 +628,12 @@ mod tests {
use common::HasLen;
use core::Index;
use docset::DocSet;
use tantivy_fst::Streamer;
use fst::Streamer;
use schema::IndexRecordOption;
use schema::Schema;
use schema::Term;
use schema::INT_INDEXED;
use DocId;
use SkipResult;
#[test]
fn test_empty_segment_postings() {
@@ -662,16 +661,6 @@ mod tests {
.0
}
#[test]
fn test_exponentiel_search() {
assert_eq!(exponential_search(0, &[1, 2]), (0, 1));
assert_eq!(exponential_search(1, &[1, 2]), (0, 1));
assert_eq!(
exponential_search(7, &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
(3, 7)
);
}
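The `test_exponentiel_search` assertions above pin down the helper's contract: it returns a `(begin, end)` window of the block that the caller then binary-searches. Below is a self-contained sketch consistent with those assertions; the actual body, truncated in this diff, may differ in its details.

```
/// Double the jump at each probe until the probed value reaches `target`,
/// and return the last two probe positions as the window to search next.
fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) {
    let end = arr.len();
    let mut begin = 0;
    let mut jump = 1;
    loop {
        let new_begin = begin + jump;
        if new_begin >= end {
            return (begin, end);
        }
        if arr[new_begin] >= target {
            return (begin, new_begin);
        }
        begin = new_begin;
        jump *= 2;
    }
}

fn main() {
    assert_eq!(exponential_search(0, &[1, 2]), (0, 1));
    assert_eq!(
        exponential_search(7, &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
        (3, 7)
    );
}
```

Doubling the jump keeps the number of probes logarithmic in the distance to the target, which is what makes it a good fit for the short, mostly-forward skips performed inside a block.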
fn util_test_search_within_block(block: &[u32], target: u32) {
assert_eq!(
search_within_block(block, target),
@@ -703,7 +692,7 @@ mod tests {
#[test]
fn test_block_segment_postings() {
let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>());
let mut block_segments = build_block_postings((0..100_000).collect::<Vec<u32>>());
let mut offset: u32 = 0u32;
// checking that the block before calling advance is empty
assert!(block_segments.docs().is_empty());
@@ -717,44 +706,14 @@ mod tests {
}
}
#[test]
fn test_skip_right_at_new_block() {
let mut doc_ids = (0..128).collect::<Vec<u32>>();
doc_ids.push(129);
doc_ids.push(130);
{
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.skip_next(128), SkipResult::OverStep);
assert_eq!(docset.doc(), 129);
assert!(docset.advance());
assert_eq!(docset.doc(), 130);
assert!(!docset.advance());
}
{
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.skip_next(129), SkipResult::Reached);
assert_eq!(docset.doc(), 129);
assert!(docset.advance());
assert_eq!(docset.doc(), 130);
assert!(!docset.advance());
}
{
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.skip_next(131), SkipResult::End);
}
}
fn build_block_postings(docs: &[DocId]) -> BlockSegmentPostings {
fn build_block_postings(docs: Vec<DocId>) -> BlockSegmentPostings {
let mut schema_builder = Schema::builder();
let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let mut last_doc = 0u32;
for &doc in docs {
for doc in docs {
for _ in last_doc..doc {
index_writer.add_document(doc!(int_field=>1u64));
}
@@ -774,7 +733,7 @@ mod tests {
#[test]
fn test_block_segment_postings_skip() {
for i in 0..4 {
let mut block_postings = build_block_postings(&[3]);
let mut block_postings = build_block_postings(vec![3]);
assert_eq!(
block_postings.skip_to(i),
BlockSegmentPostingsSkipResult::Success(0u32)
@@ -784,7 +743,7 @@ mod tests {
BlockSegmentPostingsSkipResult::Terminated
);
}
let mut block_postings = build_block_postings(&[3]);
let mut block_postings = build_block_postings(vec![3]);
assert_eq!(
block_postings.skip_to(4u32),
BlockSegmentPostingsSkipResult::Terminated
@@ -797,7 +756,7 @@ mod tests {
for i in 0..1300 {
docs.push((i * i / 100) + i);
}
let mut block_postings = build_block_postings(&docs[..]);
let mut block_postings = build_block_postings(docs.clone());
for i in vec![0, 424, 10000] {
assert_eq!(
block_postings.skip_to(i),

View File

@@ -55,7 +55,7 @@ pub struct InvertedIndexSerializer {
impl InvertedIndexSerializer {
/// Open a new `PostingsSerializer` for the given segment
fn create(
fn new(
terms_write: CompositeWrite<WritePtr>,
postings_write: CompositeWrite<WritePtr>,
positions_write: CompositeWrite<WritePtr>,
@@ -74,7 +74,7 @@ impl InvertedIndexSerializer {
/// Open a new `PostingsSerializer` for the given segment
pub fn open(segment: &mut Segment) -> Result<InvertedIndexSerializer> {
use SegmentComponent::{POSITIONS, POSITIONSSKIP, POSTINGS, TERMS};
InvertedIndexSerializer::create(
InvertedIndexSerializer::new(
CompositeWrite::wrap(segment.open_write(TERMS)?),
CompositeWrite::wrap(segment.open_write(POSTINGS)?),
CompositeWrite::wrap(segment.open_write(POSITIONS)?),
@@ -99,7 +99,7 @@ impl InvertedIndexSerializer {
let positions_write = self.positions_write.for_field(field);
let positionsidx_write = self.positionsidx_write.for_field(field);
let field_type: FieldType = (*field_entry.field_type()).clone();
FieldSerializer::create(
FieldSerializer::new(
&field_type,
term_dictionary_write,
postings_write,
@@ -130,7 +130,7 @@ pub struct FieldSerializer<'a> {
}
impl<'a> FieldSerializer<'a> {
fn create(
fn new(
field_type: &FieldType,
term_dictionary_write: &'a mut CountingWriter<WritePtr>,
postings_write: &'a mut CountingWriter<WritePtr>,
@@ -152,7 +152,7 @@ impl<'a> FieldSerializer<'a> {
_ => (false, false),
};
let term_dictionary_builder =
TermDictionaryBuilder::create(term_dictionary_write, &field_type)?;
TermDictionaryBuilder::new(term_dictionary_write, &field_type)?;
let postings_serializer =
PostingsSerializer::new(postings_write, term_freq_enabled, position_enabled);
let positions_serializer_opt = if position_enabled {

View File

@@ -1,7 +1,9 @@
mod expull;
mod memory_arena;
mod murmurhash2;
mod term_hashmap;
pub use self::expull::ExpUnrolledLinkedList;
pub use self::memory_arena::{Addr, ArenaStorable, MemoryArena};
use self::murmurhash2::murmurhash2;
pub use self::term_hashmap::{compute_table_size, TermHashMap};

View File

@@ -0,0 +1,87 @@
use std::ptr;
const SEED: u32 = 3_242_157_231u32;
const M: u32 = 0x5bd1_e995;
#[inline(always)]
pub fn murmurhash2(key: &[u8]) -> u32 {
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
let mut key_ptr: *const u32 = key.as_ptr() as *const u32;
let len = key.len() as u32;
let mut h: u32 = SEED ^ len;
let num_blocks = len >> 2;
for _ in 0..num_blocks {
let mut k: u32 = unsafe { ptr::read_unaligned(key_ptr) }; // ok because of num_blocks definition
k = k.wrapping_mul(M);
k ^= k >> 24;
k = k.wrapping_mul(M);
h = h.wrapping_mul(M);
h ^= k;
key_ptr = key_ptr.wrapping_offset(1);
}
// Handle the last few bytes of the input array
let remaining: &[u8] = &key[key.len() & !3..];
match remaining.len() {
3 => {
h ^= u32::from(remaining[2]) << 16;
h ^= u32::from(remaining[1]) << 8;
h ^= u32::from(remaining[0]);
h = h.wrapping_mul(M);
}
2 => {
h ^= u32::from(remaining[1]) << 8;
h ^= u32::from(remaining[0]);
h = h.wrapping_mul(M);
}
1 => {
h ^= u32::from(remaining[0]);
h = h.wrapping_mul(M);
}
_ => {}
}
h ^= h >> 13;
h = h.wrapping_mul(M);
h ^ (h >> 15)
}
#[cfg(test)]
mod test {
use super::murmurhash2;
use std::collections::HashSet;
#[test]
fn test_murmur() {
let s1 = "abcdef";
let s2 = "abcdeg";
for i in 0..5 {
assert_eq!(
murmurhash2(&s1[i..5].as_bytes()),
murmurhash2(&s2[i..5].as_bytes())
);
}
}
#[test]
fn test_murmur_against_reference_impl() {
assert_eq!(murmurhash2("".as_bytes()), 3632506080);
assert_eq!(murmurhash2("a".as_bytes()), 455683869);
assert_eq!(murmurhash2("ab".as_bytes()), 2448092234);
assert_eq!(murmurhash2("abc".as_bytes()), 2066295634);
assert_eq!(murmurhash2("abcd".as_bytes()), 2588571162);
assert_eq!(murmurhash2("abcde".as_bytes()), 2988696942);
assert_eq!(murmurhash2("abcdefghijklmnop".as_bytes()), 2350868870);
}
#[test]
fn test_murmur_collisions() {
let mut set: HashSet<u32> = HashSet::default();
for i in 0..10_000 {
let s = format!("hash{}", i);
let hash = murmurhash2(s.as_bytes());
set.insert(hash);
}
assert_eq!(set.len(), 10_000);
}
}

View File

@@ -1,7 +1,4 @@
extern crate murmurhash32;
use self::murmurhash32::murmurhash2;
use super::murmurhash2;
use super::{Addr, ArenaStorable, MemoryArena};
use std::iter;
use std::mem;
@@ -209,7 +206,7 @@ impl TermHashMap {
self.resize();
}
let key_bytes: &[u8] = key.as_ref();
let hash = murmurhash2(key.as_ref());
let hash = murmurhash2::murmurhash2(key.as_ref());
let mut probe = self.probe(hash);
loop {
let bucket = probe.next_probe();

View File

@@ -1,6 +1,6 @@
use common::BitSet;
use core::SegmentReader;
use tantivy_fst::Automaton;
use fst::Automaton;
use query::BitSetDocSet;
use query::ConstScorer;
use query::{Scorer, Weight};

View File

@@ -63,8 +63,7 @@ impl BM25Weight {
.map(|term| {
let term_doc_freq = searcher.doc_freq(term);
idf(term_doc_freq, total_num_docs)
})
.sum::<f32>();
}).sum::<f32>();
BM25Weight::new(idf, average_fieldnorm)
}

View File

@@ -47,8 +47,7 @@ impl Query for BooleanQuery {
.iter()
.map(|&(ref occur, ref subquery)| {
Ok((*occur, subquery.weight(searcher, scoring_enabled)?))
})
.collect::<Result<_>>()?;
}).collect::<Result<_>>()?;
Ok(Box::new(BooleanWeight::new(sub_weights, scoring_enabled)))
}
@@ -69,8 +68,7 @@ impl BooleanQuery {
let term_query: Box<Query> =
Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs));
(Occur::Should, term_query)
})
.collect();
}).collect();
BooleanQuery::from(occur_term_queries)
}

View File

@@ -134,8 +134,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
.into_iter()
.map(|(offset, postings)| {
PostingsWithOffset::new(postings, (max_offset - offset) as u32)
})
.collect::<Vec<_>>();
}).collect::<Vec<_>>();
PhraseScorer {
intersection_docset: Intersection::new(postings_with_offsets),
num_docsets,

View File

@@ -68,8 +68,7 @@ fn trim_ast(logical_ast: LogicalAST) -> Option<LogicalAST> {
.into_iter()
.flat_map(|(occur, child)| {
trim_ast(child).map(|trimmed_child| (occur, trimmed_child))
})
.collect::<Vec<_>>();
}).collect::<Vec<_>>();
if trimmed_children.is_empty() {
None
} else {
@@ -423,8 +422,7 @@ impl QueryParser {
lower: self.resolve_bound(field, &lower)?,
upper: self.resolve_bound(field, &upper)?,
})))
})
.collect::<Result<Vec<_>, QueryParserError>>()?;
}).collect::<Result<Vec<_>, QueryParserError>>()?;
let result_ast = if clauses.len() == 1 {
clauses.pop().unwrap()
} else {
@@ -600,19 +598,25 @@ mod test {
assert!(query_parser.parse_query("signed:2324").is_ok());
assert!(query_parser.parse_query("signed:\"22\"").is_ok());
assert!(query_parser.parse_query("signed:\"-2234\"").is_ok());
assert!(query_parser
.parse_query("signed:\"-9999999999999\"")
.is_ok());
assert!(
query_parser
.parse_query("signed:\"-9999999999999\"")
.is_ok()
);
assert!(query_parser.parse_query("signed:\"a\"").is_err());
assert!(query_parser.parse_query("signed:\"2a\"").is_err());
assert!(query_parser
.parse_query("signed:\"18446744073709551615\"")
.is_err());
assert!(
query_parser
.parse_query("signed:\"18446744073709551615\"")
.is_err()
);
assert!(query_parser.parse_query("unsigned:\"2\"").is_ok());
assert!(query_parser.parse_query("unsigned:\"-2\"").is_err());
assert!(query_parser
.parse_query("unsigned:\"18446744073709551615\"")
.is_ok());
assert!(
query_parser
.parse_query("unsigned:\"18446744073709551615\"")
.is_ok()
);
test_parse_query_to_logical_ast_helper(
"unsigned:2324",
"Term([0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20])",

View File

@@ -1,5 +1,5 @@
use error::TantivyError;
use tantivy_fst::Regex;
use fst_regex::Regex;
use query::{AutomatonWeight, Query, Weight};
use schema::Field;
use std::clone::Clone;

View File

@@ -1,3 +1,4 @@
mod term_query;
mod term_scorer;
mod term_weight;

View File

@@ -55,8 +55,7 @@ where
None
}
},
)
.collect();
).collect();
Union {
docsets: non_empty_docsets,
bitsets: Box::new([TinySet::empty(); HORIZON_NUM_TINYBITSETS]),
@@ -215,7 +214,10 @@ where
// The target is outside of the buffered horizon.
// advance all docsets to a doc >= to the target.
#[cfg_attr(feature = "cargo-clippy", allow(clippy::clippy::collapsible_if))]
#[cfg_attr(
feature = "cargo-clippy",
allow(clippy::clippy::collapsible_if)
)]
unordered_drain_filter(&mut self.docsets, |docset| {
if docset.doc() < target {
if docset.skip_next(target) == SkipResult::End {

View File

@@ -6,7 +6,6 @@ use std::borrow::Cow;
use std::fmt::{self, Debug, Display, Formatter};
use std::io::{self, Read, Write};
use std::str;
use std::string::FromUtf8Error;
const SLASH_BYTE: u8 = b'/';
const ESCAPE_BYTE: u8 = b'\\';
@@ -15,10 +14,6 @@ const ESCAPE_BYTE: u8 = b'\\';
/// representation of facets.
pub const FACET_SEP_BYTE: u8 = 0u8;
/// `char` used as a level separation in the binary
/// representation of facets. (It is the null codepoint.)
pub const FACET_SEP_CHAR: char = '\u{0}';
/// A Facet represents a point in a given hierarchy.
///
/// They are typically represented similarly to a filepath.
@@ -31,18 +26,18 @@ pub const FACET_SEP_CHAR: char = '\u{0}';
/// its facet. In the example above, `/electronics/tv_and_video/`
/// and `/electronics`.
#[derive(Clone, Eq, Hash, PartialEq, Ord, PartialOrd)]
pub struct Facet(String);
pub struct Facet(Vec<u8>);
impl Facet {
/// Returns a new instance of the "root facet"
/// Equivalent to `/`.
pub fn root() -> Facet {
Facet("".to_string())
Facet(vec![])
}
/// Returns true iff the facet is the root facet `/`.
pub fn is_root(&self) -> bool {
self.encoded_str().is_empty()
self.encoded_bytes().is_empty()
}
/// Returns a binary representation of the facet.
@@ -54,19 +49,13 @@ impl Facet {
/// This representation has the benefit of making it possible to
/// express "being a child of a given facet" as a range over
/// the term ordinals.
pub fn encoded_str(&self) -> &str {
pub fn encoded_bytes(&self) -> &[u8] {
&self.0
}
pub(crate) fn from_encoded_string(facet_string: String) -> Facet {
Facet(facet_string)
}
/// Creates a `Facet` from its binary representation.
pub fn from_encoded(encoded_bytes: Vec<u8>) -> Result<Facet, FromUtf8Error> {
// facet bytes validation. `0u8` is used as a separator, but that is still legal utf-8
//Ok(Facet(String::from_utf8(encoded_bytes)?))
String::from_utf8(encoded_bytes).map(Facet)
pub(crate) unsafe fn from_encoded(encoded_bytes: Vec<u8>) -> Facet {
Facet(encoded_bytes)
}
/// Parse a text representation of a facet.
@@ -90,37 +79,36 @@ impl Facet {
Path: IntoIterator,
Path::Item: ToString,
{
let mut facet_string: String = String::with_capacity(100);
let mut facet_bytes: Vec<u8> = Vec::with_capacity(100);
let mut step_it = path.into_iter();
if let Some(step) = step_it.next() {
facet_string.push_str(&step.to_string());
facet_bytes.extend_from_slice(step.to_string().as_bytes());
}
for step in step_it {
facet_string.push(FACET_SEP_CHAR);
facet_string.push_str(&step.to_string());
facet_bytes.push(FACET_SEP_BYTE);
facet_bytes.extend_from_slice(step.to_string().as_bytes());
}
Facet(facet_string)
Facet(facet_bytes)
}
/// Accessor for the inner buffer of the `Facet`.
pub(crate) fn set_facet_str(&mut self, facet_str: &str) {
self.0.clear();
self.0.push_str(facet_str);
pub(crate) fn inner_buffer_mut(&mut self) -> &mut Vec<u8> {
&mut self.0
}
/// Returns `true` iff other is a subfacet of `self`.
pub fn is_prefix_of(&self, other: &Facet) -> bool {
let self_str = self.encoded_str();
let other_str = other.encoded_str();
self_str.len() < other_str.len()
&& other_str.starts_with(self_str)
&& other_str.as_bytes()[self_str.len()] == FACET_SEP_BYTE
let self_bytes: &[u8] = self.encoded_bytes();
let other_bytes: &[u8] = other.encoded_bytes();
self_bytes.len() < other_bytes.len()
&& other_bytes.starts_with(self_bytes)
&& other_bytes[self_bytes.len()] == 0u8
}
}
impl Borrow<str> for Facet {
fn borrow(&self) -> &str {
self.encoded_str()
impl Borrow<[u8]> for Facet {
fn borrow(&self) -> &[u8] {
self.encoded_bytes()
}
}
@@ -132,51 +120,45 @@ impl<'a, T: ?Sized + AsRef<str>> From<&'a T> for Facet {
Idle,
}
let path: &str = path_asref.as_ref();
assert!(!path.is_empty());
assert!(path.starts_with("/"));
let mut facet_encoded = String::new();
let mut facet_encoded = Vec::new();
let mut state = State::Idle;
let path_bytes = path.as_bytes();
let mut last_offset = 1;
for i in 1..path_bytes.len() {
let c = path_bytes[i];
for &c in &path_bytes[1..] {
match (state, c) {
(State::Idle, ESCAPE_BYTE) => {
facet_encoded.push_str(&path[last_offset..i]);
last_offset = i + 1;
state = State::Escaped
}
(State::Idle, ESCAPE_BYTE) => state = State::Escaped,
(State::Idle, SLASH_BYTE) => {
facet_encoded.push_str(&path[last_offset..i]);
facet_encoded.push(FACET_SEP_CHAR);
last_offset = i + 1;
facet_encoded.push(FACET_SEP_BYTE);
}
(State::Escaped, _escaped_char) => {
(State::Escaped, any_char) => {
state = State::Idle;
facet_encoded.push(any_char);
}
(State::Idle, other_char) => {
facet_encoded.push(other_char);
}
(State::Idle, _any_char) => {}
}
}
facet_encoded.push_str(&path[last_offset..]);
Facet(facet_encoded)
}
}
impl BinarySerializable for Facet {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
<String as BinarySerializable>::serialize(&self.0, writer)
<Vec<u8> as BinarySerializable>::serialize(&self.0, writer)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
Ok(Facet(<String as BinarySerializable>::deserialize(reader)?))
let bytes = <Vec<u8> as BinarySerializable>::deserialize(reader)?;
Ok(Facet(bytes))
}
}
impl Display for Facet {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
for step in self.0.split(FACET_SEP_CHAR) {
for step in self.0.split(|&b| b == FACET_SEP_BYTE) {
write!(f, "/")?;
write!(f, "{}", escape_slashes(step))?;
let step_str = unsafe { str::from_utf8_unchecked(step) };
write!(f, "{}", escape_slashes(step_str))?;
}
Ok(())
}
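To make the new binary representation concrete, here is a small sketch of how a facet path relates to the `0u8`-separated encoding and to `is_prefix_of` (byte values assumed from the `from_text` logic above):

```
fn main() {
    // "/electronics/tv_and_video" loses its leading '/' and every other
    // unescaped '/' becomes the 0u8 separator byte:
    let child: &[u8] = b"electronics\0tv_and_video";
    let parent: &[u8] = b"electronics";

    // "parent is an ancestor of child" is exactly what `is_prefix_of`
    // checks: strictly shorter, a byte prefix, and followed by the separator.
    assert!(parent.len() < child.len());
    assert!(child.starts_with(parent));
    assert_eq!(child[parent.len()], 0u8);
}
```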

View File

@@ -232,14 +232,12 @@ impl Schema {
let field_entry = self.get_field_entry(field);
let field_type = field_entry.field_type();
match *json_value {
JsonValue::Array(ref json_items) => {
for json_item in json_items {
let value = field_type.value_from_json(json_item).map_err(|e| {
DocParsingError::ValueError(field_name.clone(), e)
})?;
doc.add(FieldValue::new(field, value));
}
}
JsonValue::Array(ref json_items) => for json_item in json_items {
let value = field_type
.value_from_json(json_item)
.map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?;
doc.add(FieldValue::new(field, value));
},
_ => {
let value = field_type
.value_from_json(json_value)
@@ -448,8 +446,7 @@ mod tests {
"count": 4,
"popularity": 10
}"#,
)
.unwrap();
).unwrap();
assert_eq!(doc.get_first(title_field).unwrap().text(), Some("my title"));
assert_eq!(
doc.get_first(author_field).unwrap().text(),

View File

@@ -32,7 +32,7 @@ impl Term {
/// Creates a `Term` given a facet.
pub fn from_facet(field: Field, facet: &Facet) -> Term {
let bytes = facet.encoded_str().as_bytes();
let bytes = facet.encoded_bytes();
let buffer = Vec::with_capacity(4 + bytes.len());
let mut term = Term(buffer);
term.set_field(field);
@@ -68,7 +68,12 @@ impl Term {
term
}
/// Creates a new Term for a given field.
/// Creates a new Term with an empty value buffer,
/// but with a preallocated capacity.
///
/// Only the field is written; the value bytes are
/// left unset until the caller fills them in.
pub(crate) fn for_field(field: Field) -> Term {
let mut term = Term(Vec::with_capacity(100));
term.set_field(field);
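A hypothetical sketch (the helper name is invented, not tantivy API) of the buffer layout this constructor prepares: a 4-byte big-endian field id followed by the raw value bytes, as suggested by the `Term([0, 0, 0, 3, ...])` expectation in the query-parser test elsewhere in this diff.

```
// Hypothetical helper: build the raw bytes of a term from a field id and
// a value, mirroring what `Term::for_field` prepares before the value is set.
fn raw_term_bytes(field_id: u32, value: &[u8]) -> Vec<u8> {
    let mut buffer = Vec::with_capacity(4 + value.len());
    buffer.extend_from_slice(&field_id.to_be_bytes()); // field id, big-endian
    buffer.extend_from_slice(value);                    // value bytes
    buffer
}

fn main() {
    // Field 3, u64 value 2324 (0x0914) big-endian: matches the
    // `Term([0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20])` expectation in the
    // query-parser tests of this diff.
    let bytes = raw_term_bytes(3, &[0, 0, 0, 0, 0, 0, 9, 20]);
    assert_eq!(bytes, vec![0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20]);
}
```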

View File

@@ -192,8 +192,7 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
item.start - fragment.start_offset,
item.stop - fragment.start_offset,
)
})
.collect();
}).collect();
Snippet {
fragments: fragment_text.to_string(),
highlighted,
@@ -243,7 +242,7 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
/// let query = query_parser.parse_query("haleurs flamands").unwrap();
/// # index.load_searchers()?;
/// # let searcher = index.searcher();
/// let mut snippet_generator = SnippetGenerator::create(&searcher, &*query, text_field)?;
/// let mut snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field)?;
/// snippet_generator.set_max_num_chars(100);
/// let snippet = snippet_generator.snippet_from_doc(&doc);
/// let snippet_html: String = snippet.to_html();
@@ -260,7 +259,7 @@ pub struct SnippetGenerator {
impl SnippetGenerator {
/// Creates a new snippet generator
pub fn create(searcher: &Searcher, query: &Query, field: Field) -> Result<SnippetGenerator> {
pub fn new(searcher: &Searcher, query: &Query, field: Field) -> Result<SnippetGenerator> {
let mut terms = BTreeSet::new();
query.query_terms(&mut terms);
let terms_text: BTreeMap<String, f32> = terms
@@ -274,8 +273,7 @@ impl SnippetGenerator {
} else {
None
}
})
.collect();
}).collect();
let tokenizer = searcher.index().tokenizer_for_field(field)?;
Ok(SnippetGenerator {
terms_text,
@@ -534,14 +532,12 @@ Survey in 2016, 2017, and 2018."#;
let query_parser = QueryParser::for_index(&index, vec![text_field]);
{
let query = query_parser.parse_query("e").unwrap();
let snippet_generator =
SnippetGenerator::create(&searcher, &*query, text_field).unwrap();
let snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap();
assert!(snippet_generator.terms_text().is_empty());
}
{
let query = query_parser.parse_query("a").unwrap();
let snippet_generator =
SnippetGenerator::create(&searcher, &*query, text_field).unwrap();
let snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap();
assert_eq!(
&btreemap!("a".to_string() => 0.25f32),
snippet_generator.terms_text()
@@ -549,8 +545,7 @@ Survey in 2016, 2017, and 2018."#;
}
{
let query = query_parser.parse_query("a b").unwrap();
let snippet_generator =
SnippetGenerator::create(&searcher, &*query, text_field).unwrap();
let snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap();
assert_eq!(
&btreemap!("a".to_string() => 0.25f32, "b".to_string() => 0.5),
snippet_generator.terms_text()
@@ -558,8 +553,7 @@ Survey in 2016, 2017, and 2018."#;
}
{
let query = query_parser.parse_query("a b c").unwrap();
let snippet_generator =
SnippetGenerator::create(&searcher, &*query, text_field).unwrap();
let snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap();
assert_eq!(
&btreemap!("a".to_string() => 0.25f32, "b".to_string() => 0.5),
snippet_generator.terms_text()
@@ -591,8 +585,7 @@ Survey in 2016, 2017, and 2018."#;
let searcher = index.searcher();
let query_parser = QueryParser::for_index(&index, vec![text_field]);
let query = query_parser.parse_query("rust design").unwrap();
let mut snippet_generator =
SnippetGenerator::create(&searcher, &*query, text_field).unwrap();
let mut snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap();
{
let snippet = snippet_generator.snippet(TEST_TEXT);
assert_eq!(snippet.to_html(), "imperative-procedural paradigms. <b>Rust</b> is syntactically similar to C++[according to whom?],\nbut its <b>designers</b> intend it to provide better memory safety");

View File

@@ -80,7 +80,6 @@ pub struct SegmentSpaceUsage {
}
impl SegmentSpaceUsage {
#[allow(clippy::too_many_arguments)]
pub(crate) fn new(
num_docs: u32,
termdict: PerFieldSpaceUsage,

View File

@@ -95,7 +95,10 @@ impl StoreReader {
}
}
#[cfg_attr(feature = "cargo-clippy", allow(clippy::needless_pass_by_value))]
#[cfg_attr(
feature = "cargo-clippy",
allow(clippy::needless_pass_by_value)
)]
fn split_source(data: ReadOnlySource) -> (ReadOnlySource, ReadOnlySource, DocId) {
let data_len = data.len();
let footer_offset = data_len - size_of::<u64>() - size_of::<u32>();

View File

@@ -53,8 +53,7 @@ impl<'a> TermMerger<'a> {
.map(|(ord, streamer)| HeapItem {
streamer,
segment_ord: ord,
})
.collect(),
}).collect(),
}
}
@@ -123,7 +122,10 @@ impl<'a> TermMerger<'a> {
}
/// Iterates through terms
#[cfg_attr(feature = "cargo-clippy", allow(clippy::should_implement_trait))]
#[cfg_attr(
feature = "cargo-clippy",
allow(clippy::should_implement_trait)
)]
pub fn next(&mut self) -> Option<Term<&[u8]>> {
if self.advance() {
Some(Term::wrap(self.current_streamers[0].streamer.key()))

View File

@@ -66,7 +66,7 @@ mod tests {
let write = directory.open_write(&path).unwrap();
let field_type = FieldType::Str(TEXT);
let mut term_dictionary_builder =
TermDictionaryBuilder::create(write, &field_type).unwrap();
TermDictionaryBuilder::new(write, &field_type).unwrap();
for term in COUNTRIES.iter() {
term_dictionary_builder
.insert(term.as_bytes(), &make_term_info(0u64))
@@ -92,7 +92,7 @@ mod tests {
let write = directory.open_write(&path).unwrap();
let field_type = FieldType::Str(TEXT);
let mut term_dictionary_builder =
TermDictionaryBuilder::create(write, &field_type).unwrap();
TermDictionaryBuilder::new(write, &field_type).unwrap();
term_dictionary_builder
.insert("abc".as_bytes(), &make_term_info(34u64))
.unwrap();
@@ -167,7 +167,7 @@ mod tests {
let mut term_string = String::new();
while term_it.advance() {
//let term = Term::from_bytes(term_it.key());
term_string.push_str(str::from_utf8(term_it.key()).expect("test"));
term_string.push_str(unsafe { str::from_utf8_unchecked(term_it.key()) }); // ok test
}
assert_eq!(&*term_string, "abcdef");
}
@@ -180,7 +180,7 @@ mod tests {
let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
let mut term_dictionary_builder =
TermDictionaryBuilder::create(vec![], &field_type).unwrap();
TermDictionaryBuilder::new(vec![], &field_type).unwrap();
for &(ref id, ref i) in &ids {
term_dictionary_builder
.insert(id.as_bytes(), &make_term_info(*i as u64))
@@ -210,7 +210,7 @@ mod tests {
let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
let mut term_dictionary_builder =
TermDictionaryBuilder::create(vec![], &field_type).unwrap();
TermDictionaryBuilder::new(vec![], &field_type).unwrap();
// term requires more than 16bits
term_dictionary_builder
.insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1))
@@ -245,7 +245,7 @@ mod tests {
let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
let mut term_dictionary_builder =
TermDictionaryBuilder::create(vec![], &field_type).unwrap();
TermDictionaryBuilder::new(vec![], &field_type).unwrap();
for &(ref id, ref i) in &ids {
term_dictionary_builder
.insert(id.as_bytes(), &make_term_info(*i as u64))
@@ -314,7 +314,7 @@ mod tests {
let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
let mut term_dictionary_builder =
TermDictionaryBuilder::create(vec![], &field_type).unwrap();
TermDictionaryBuilder::new(vec![], &field_type).unwrap();
term_dictionary_builder
.insert(&[], &make_term_info(1 as u64))
.unwrap();
@@ -338,7 +338,7 @@ mod tests {
let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
let mut term_dictionary_builder =
TermDictionaryBuilder::create(vec![], &field_type).unwrap();
TermDictionaryBuilder::new(vec![], &field_type).unwrap();
for i in 0u8..10u8 {
let number_arr = [i; 1];
term_dictionary_builder
@@ -408,7 +408,7 @@ mod tests {
let write = directory.open_write(&path).unwrap();
let field_type = FieldType::Str(TEXT);
let mut term_dictionary_builder =
TermDictionaryBuilder::create(write, &field_type).unwrap();
TermDictionaryBuilder::new(write, &field_type).unwrap();
for term in COUNTRIES.iter() {
term_dictionary_builder
.insert(term.as_bytes(), &make_term_info(0u64))

View File

@@ -1,8 +1,8 @@
use super::TermDictionary;
use tantivy_fst::automaton::AlwaysMatch;
use tantivy_fst::map::{Stream, StreamBuilder};
use tantivy_fst::Automaton;
use tantivy_fst::{IntoStreamer, Streamer};
use fst::automaton::AlwaysMatch;
use fst::map::{Stream, StreamBuilder};
use fst::Automaton;
use fst::{IntoStreamer, Streamer};
use postings::TermInfo;
use termdict::TermOrdinal;
@@ -132,7 +132,10 @@ where
}
/// Return the next `(key, value)` pair.
#[cfg_attr(feature = "cargo-clippy", allow(clippy::should_implement_trait))]
#[cfg_attr(
feature = "cargo-clippy",
allow(clippy::should_implement_trait)
)]
pub fn next(&mut self) -> Option<(&[u8], &TermInfo)> {
if self.advance() {
Some((self.key(), self.value()))

View File

@@ -3,15 +3,15 @@ use super::{TermStreamer, TermStreamerBuilder};
use common::BinarySerializable;
use common::CountingWriter;
use directory::ReadOnlySource;
use tantivy_fst;
use tantivy_fst::raw::Fst;
use tantivy_fst::Automaton;
use fst;
use fst::raw::Fst;
use fst::Automaton;
use postings::TermInfo;
use schema::FieldType;
use std::io::{self, Write};
use termdict::TermOrdinal;
fn convert_fst_error(e: tantivy_fst::Error) -> io::Error {
fn convert_fst_error(e: fst::Error) -> io::Error {
io::Error::new(io::ErrorKind::Other, e)
}
@@ -19,7 +19,7 @@ fn convert_fst_error(e: tantivy_fst::Error) -> io::Error {
///
/// Inserting must be done in the order of the `keys`.
pub struct TermDictionaryBuilder<W> {
fst_builder: tantivy_fst::MapBuilder<W>,
fst_builder: fst::MapBuilder<W>,
term_info_store_writer: TermInfoStoreWriter,
term_ord: u64,
}
@@ -29,8 +29,8 @@ where
W: Write,
{
/// Creates a new `TermDictionaryBuilder`
pub fn create(w: W, _field_type: &FieldType) -> io::Result<Self> {
let fst_builder = tantivy_fst::MapBuilder::new(w).map_err(convert_fst_error)?;
pub fn new(w: W, _field_type: &FieldType) -> io::Result<Self> {
let fst_builder = fst::MapBuilder::new(w).map_err(convert_fst_error)?;
Ok(TermDictionaryBuilder {
fst_builder,
term_info_store_writer: TermInfoStoreWriter::new(),
@@ -87,9 +87,17 @@ where
}
}
fn open_fst_index(source: ReadOnlySource) -> tantivy_fst::Map<ReadOnlySource> {
let fst = Fst::new(source).expect("FST data is corrupted");
tantivy_fst::Map::from(fst)
fn open_fst_index(source: ReadOnlySource) -> fst::Map {
let fst = match source {
ReadOnlySource::Anonymous(data) => {
Fst::from_shared_bytes(data.data, data.start, data.len).expect("FST data is corrupted")
}
#[cfg(feature = "mmap")]
ReadOnlySource::Mmap(mmap_readonly) => {
Fst::from_mmap(mmap_readonly).expect("FST data is corrupted")
}
};
fst::Map::from(fst)
}
/// The term dictionary contains all of the terms in
@@ -99,7 +107,7 @@ fn open_fst_index(source: ReadOnlySource) -> tantivy_fst::Map<ReadOnlySource> {
/// respective `TermOrdinal`. The `TermInfoStore` then makes it
/// possible to fetch the associated `TermInfo`.
pub struct TermDictionary {
fst_index: tantivy_fst::Map<ReadOnlySource>,
fst_index: fst::Map,
term_info_store: TermInfoStore,
}
@@ -124,11 +132,11 @@ impl TermDictionary {
/// Creates an empty term dictionary which contains no terms.
pub fn empty(field_type: &FieldType) -> Self {
let term_dictionary_data: Vec<u8> =
TermDictionaryBuilder::create(Vec::<u8>::new(), &field_type)
TermDictionaryBuilder::new(Vec::<u8>::new(), &field_type)
.expect("Creating a TermDictionaryBuilder in a Vec<u8> should never fail")
.finish()
.expect("Writing in a Vec<u8> should never fail");
let source = ReadOnlySource::new(term_dictionary_data);
let source = ReadOnlySource::from(term_dictionary_data);
Self::from_source(&source)
}

View File

@@ -0,0 +1,78 @@
extern crate aho_corasick;
use self::aho_corasick::{AcAutomaton, Automaton};
use std::mem;
use super::{OffsetIncrements, OffsetIncrementsBuilder};
pub trait CharMogrifier {
fn process_text(&mut self, text: &str, dest: &mut String, correction: &mut OffsetIncrementsBuilder);
}
pub struct CharFilter {
text: String,
buffer: String,
mogrifiers: Vec<Box<CharMogrifier>>
}
impl CharFilter {
fn process_text(&mut self, text: &str) {
self.text.clear();
self.text.push_str(text);
self.buffer.clear();
let mut offset_increment_builder = OffsetIncrements::builder();
for mogrifier in &mut self.mogrifiers {
mogrifier.process_text(&self.text,
&mut self.buffer,
&mut offset_increment_builder);
mem::swap(&mut self.text, &mut self.buffer);
offset_increment_builder.new_layer();
}
}
}
pub struct SubstringReplacer<'a> {
automaton: AcAutomaton<&'a str>,
replacements: Vec<&'a str>
}
impl SubstringReplacer<'static> {
fn new(from_tos: Vec<(&'static str, &'static str)>) -> SubstringReplacer<'static> {
let from_ptns: Vec<&'static str> = from_tos
.iter()
.map(|(from_str, _)| *from_str)
.collect();
let to_strs: Vec<&'static str> = from_tos
.iter()
.map(|(_, to_str)| *to_str)
.collect();
let automaton = AcAutomaton::new(from_ptns);
SubstringReplacer {
automaton,
replacements: to_strs
}
}
}
impl<'a> CharMogrifier for SubstringReplacer<'a> {
// correction is an array that goes from old_offset -> new_offset.
// correction len is `text.len() + 1`
fn process_text(&mut self, text: &str, dest: &mut String, correction: &mut OffsetIncrementsBuilder) {
let mut start = 0;
for m in self.automaton.find(text) {
dest.push_str(&text[start..m.start]);
let replacement = self.replacements[m.pati];
let previous_len = m.end - m.start;
correction.register_inc(m.end, (replacement.len() as isize) - (previous_len as isize));
dest.push_str(replacement);
start = m.end;
}
dest.push_str(&text[start..]);
}
}
// lowercasing
// Unicode normalization*
// '
// accent simplification
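For reference, a standalone sketch of the find-and-replace pass that `SubstringReplacer::process_text` performs, written against the aho-corasick 0.6 API imported above; the offset corrections are omitted here, only the text rewriting is shown.

```
extern crate aho_corasick; // version 0.6, as added to Cargo.toml

use aho_corasick::{AcAutomaton, Automaton};

// Replace every occurrence of the patterns with their counterpart in a
// single left-to-right pass, using the same loop shape as `process_text`.
fn replace_all(text: &str, from_tos: &[(&str, &str)]) -> String {
    let patterns: Vec<&str> = from_tos.iter().map(|&(from, _)| from).collect();
    let replacements: Vec<&str> = from_tos.iter().map(|&(_, to)| to).collect();
    let automaton = AcAutomaton::new(patterns);
    let mut dest = String::new();
    let mut start = 0;
    for m in automaton.find(text) {
        dest.push_str(&text[start..m.start]);
        dest.push_str(replacements[m.pati]);
        start = m.end;
    }
    dest.push_str(&text[start..]);
    dest
}

fn main() {
    assert_eq!(replace_all("cœur", &[("œ", "oe")]), "coeur");
}
```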

View File

@@ -0,0 +1,4 @@
mod char_filter;
mod offset_increments;
pub use self::offset_increments::{OffsetIncrements, OffsetIncrementsBuilder};

View File

@@ -0,0 +1,264 @@
/*!
Stores an increasing mapping from naturals to naturals.
`CharFilter`s may make the original text longer or shorter.
Tokens' offsets need to refer to their offsets in the original
text.
This struct is in charge of efficiently book-keeping these
shifts in offsets, and provides a mapping from the transformed
text back to the original text.
We define the inverse of an increasing mapping `f` as:
g(i) = max {j | f(j) <= i}
   != min {j | f(j) >= i}
The name `inverse` is a bit misleading:
this is not a true inverse.
Note that having a single definition has some bad side effects.
For instance, when trying to convert a segment of chars to
its offsets in the original string, the reverse mapping may
return an empty range.
We could use a different definition of the reverse mapping
when computing the lower bound and the upper bound of the segment,
but then non-overlapping tokens could have overlapping origins.
# Example
```
forward mapping
[0,1,2,5,6,7]
Encoded sparsely as (3, 2)
reverse mapping
[0,1,2,2,2,3,4,5]
Encoded sparsely as [(3, -1), (4,-1), (5,-1)]
```
*/
/// Builds a reverse mapping using a sparse representation of the
/// forward mapping.
pub struct OffsetIncrementsBuilder {
cumulated: isize,
incs: Vec<(usize, isize)>,
}
impl OffsetIncrementsBuilder {
/// We require
/// - `from_offset + delta >= 0`
/// There is no need to call this function if delta = 0.
pub fn register_inc(&mut self, from_offset: usize, delta: isize) {
let mut cumulated = self.cumulated;
let from_offset_isize = from_offset as isize;
let to_offset = (from_offset_isize + self.cumulated) as usize;
if delta > 0 {
for i in 0..delta as usize {
cumulated += 1;
self.incs.push((to_offset + i, -cumulated));
}
} else {
assert_eq!(delta, -1);
cumulated -= 1;
self.incs.push((to_offset + 1, -cumulated));
}
println!("incs {:?}", self.incs);
self.cumulated = cumulated;
}
pub fn new_layer(&self) {
panic!();
}
fn build(self) -> OffsetIncrements {
OffsetIncrements {
incs: self.incs
}
}
}
#[derive(Default)]
pub struct OffsetIncrementsReader {
shifts: Vec<(usize, isize)>,
current_shift: isize,
idx: usize,
}
impl OffsetIncrementsReader {
fn new(shifts: Vec<(usize, isize)>) -> OffsetIncrementsReader {
OffsetIncrementsReader {
shifts,
current_shift: 0,
idx: 0,
}
}
fn convert_offset(&mut self, target: usize) -> usize {
while self.idx < self.shifts.len() {
let (offset, shift) = self.shifts[self.idx];
if offset > target {
break;
} else {
self.current_shift = shift;
}
self.idx += 1;
}
return (self.current_shift + target as isize) as usize;
}
}
pub struct OffsetIncrements {
incs: Vec<(usize, isize)>
}
impl OffsetIncrements {
pub fn builder() -> OffsetIncrementsBuilder {
OffsetIncrementsBuilder {
cumulated: 0,
incs: Vec::new(),
}
}
pub fn reader(&self) -> OffsetIncrementsReader {
OffsetIncrementsReader::new(self.incs.clone()) // TODO Fixme, no clone
}
}
#[cfg(test)]
mod tests {
use super::OffsetIncrements;
use super::OffsetIncrementsReader;
#[test]
fn test_offset_increment_reader_empty() {
let mut reader = OffsetIncrementsReader::new(vec![]);
for i in 0..3 {
assert_eq!(reader.convert_offset(i), i);
}
}
#[test]
fn test_offset_increment_reader_step() {
let mut reader = OffsetIncrementsReader::new(vec![(1, 1), (3, 3), (6, 2), (7, 1), (8, 0), (9, -1)]);
assert_eq!(reader.convert_offset(0), 0);
assert_eq!(reader.convert_offset(1), 2);
assert_eq!(reader.convert_offset(2), 3);
assert_eq!(reader.convert_offset(3), 6);
assert_eq!(reader.convert_offset(4), 7);
assert_eq!(reader.convert_offset(5), 8);
assert_eq!(reader.convert_offset(6), 8);
assert_eq!(reader.convert_offset(7), 8);
assert_eq!(reader.convert_offset(8), 8);
assert_eq!(reader.convert_offset(9), 8);
}
#[test]
fn test_offset_increment_reader_step_neg() {
let mut reader = OffsetIncrementsReader::new(vec![(1, -1), (2, -2), (3, -3)]);
assert_eq!(reader.convert_offset(0), 0);
assert_eq!(reader.convert_offset(1), 0);
assert_eq!(reader.convert_offset(2), 0);
assert_eq!(reader.convert_offset(3), 0);
assert_eq!(reader.convert_offset(4), 1);
assert_eq!(reader.convert_offset(5), 2);
assert_eq!(reader.convert_offset(6), 3);
assert_eq!(reader.convert_offset(7), 4);
}
fn aux_test_increment(increments: OffsetIncrements, expected: &[usize]) {
let mut reader = increments.reader();
for (i, el) in expected.iter().cloned().enumerate() {
assert_eq!(reader.convert_offset(i), el);
}
}
fn assert_is_increasing(v: &[usize]) {
assert!(v.len() > 0);
assert_eq!(v[0], 0);
let mut prec = 0;
for &val in &v[1..] {
assert!(val >= prec);
prec = val;
}
}
fn is_inverse(fwd: &[usize], rev: &[usize]) {
assert_is_increasing(fwd);
assert_is_increasing(rev);
println!("fwd {:?} rev {:?}", fwd, rev);
for (i, &antecedent) in rev.iter().enumerate() {
let expected = fwd
.iter()
.enumerate()
.filter(|(_, v)| **v <= i)
.map(|(ord, _)| ord)
.last()
.unwrap();
println!("i {}", i);
assert_eq!(expected, antecedent);
}
}
#[test]
fn test_is_inverse() {
is_inverse(&[0,1,1,1,2], &[0, 3, 4]);
}
fn is_reciprocal(left: &[usize], right: &[usize]) {
is_inverse(left, right);
}
#[test]
fn test_offset_increments_shorten() {
{
let mut offset_increment_builder = OffsetIncrements::builder();
// abcd -> abd
offset_increment_builder.register_inc(2, -1);
aux_test_increment(offset_increment_builder.build(), &[0, 1, 2, 4]);
}
{
let mut offset_increment_builder = OffsetIncrements::builder();
// abcdefgh -> abcdfgh
offset_increment_builder.register_inc(4, -1);
aux_test_increment(offset_increment_builder.build(), &[0, 1, 2, 3, 4, 6]);
}
{
let mut offset_increment_builder = OffsetIncrements::builder();
// abcd -> bcd
offset_increment_builder.register_inc(0, -1);
aux_test_increment(offset_increment_builder.build(), &[0, 2, 3]);
}
}
#[test]
fn test_offset_increments_builder() {
{
let mut offset_increment_builder = OffsetIncrements::builder();
offset_increment_builder.register_inc(2, 1);
// [0, 1, 3, 4, 5]
aux_test_increment(offset_increment_builder.build(), &[0,1,1,2,3,4,5]);
}
{
let mut offset_increment_builder = OffsetIncrements::builder();
offset_increment_builder.register_inc(3, 2);
// [0, 1, 2, 4, 5, 6]
aux_test_increment(offset_increment_builder.build(), &[0,1,2,2,2,3,4,5]);
}
{
let mut offset_increment_builder = OffsetIncrements::builder();
// 0, 0, 1, 2, 2, 2
offset_increment_builder.register_inc(1, 1);
offset_increment_builder.register_inc(3, 3);
aux_test_increment(offset_increment_builder.build(), &[0,0,1,2,2,2,2,3,4]);
}
}
}
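To tie the builder and reader together, here is a hypothetical extra test in the spirit of the ones above (not part of this change): replacing the 1-byte `&` of "fish & chips" with the 3-byte `and` registers a `+2` increment, and transformed offsets then map back to the original bytes.

```
#[test]
fn test_offset_increments_grow() {
    // "fish & chips" -> "fish and chips": the 1-byte "&" ending at byte 6
    // is replaced by the 3-byte "and", so a +2 increment is registered at
    // offset 6 (the same call `SubstringReplacer::process_text` would make).
    let mut builder = OffsetIncrements::builder();
    builder.register_inc(6, 2);
    let increments = builder.build();
    let mut reader = increments.reader();
    assert_eq!(reader.convert_offset(4), 4); // "fish " is untouched
    assert_eq!(reader.convert_offset(5), 5); // 'a' maps back to the '&'
    assert_eq!(reader.convert_offset(9), 7); // 'c' of "chips" maps back to byte 7
}
```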

View File

@@ -1,5 +1,6 @@
use super::{Token, TokenStream, Tokenizer};
use schema::FACET_SEP_BYTE;
use std::str;
/// The `FacetTokenizer` processes a `Facet` binary representation
/// and emits a token for each of its parents.
@@ -56,11 +57,12 @@ impl<'a> TokenStream for FacetTokenStream<'a> {
.position(|b| b == FACET_SEP_BYTE)
.map(|pos| cursor + 1 + pos)
{
let facet_part = &self.text[cursor..next_sep_pos];
let facet_part =
unsafe { str::from_utf8_unchecked(&bytes[cursor..next_sep_pos]) };
self.token.text.push_str(facet_part);
self.state = State::UpToPosition(next_sep_pos);
} else {
let facet_part = &self.text[cursor..];
let facet_part = unsafe { str::from_utf8_unchecked(&bytes[cursor..]) };
self.token.text.push_str(facet_part);
self.state = State::Terminated;
}
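As a concrete illustration, here is a sketch reusing the imports and helpers of the tests below (not an additional test in this change): tokenizing the facet `/electronics/tv_and_video` yields one token per ancestor facet, which is what makes a document findable under any of its ancestors.

```
let facet = Facet::from("/electronics/tv_and_video");
let mut ancestors: Vec<String> = vec![];
{
    let mut add_token = |token: &Token| {
        // The token text is the 0u8-separated encoding; decode it for display.
        let ancestor = unsafe { Facet::from_encoded(token.text.as_bytes().to_owned()) };
        ancestors.push(format!("{}", ancestor));
    };
    FacetTokenizer
        .token_stream(unsafe { str::from_utf8_unchecked(facet.encoded_bytes()) })
        .process(&mut add_token);
}
// `ancestors` ends with "/electronics" followed by
// "/electronics/tv_and_video": one token per ancestor facet.
```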
@@ -84,6 +86,7 @@ mod tests {
use super::FacetTokenizer;
use schema::Facet;
use std::str;
use tokenizer::{Token, TokenStream, Tokenizer};
#[test]
@@ -92,11 +95,11 @@ mod tests {
let mut tokens = vec![];
{
let mut add_token = |token: &Token| {
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap();
let facet = unsafe { Facet::from_encoded(token.text.as_bytes().to_owned()) }; // ok test
tokens.push(format!("{}", facet));
};
FacetTokenizer
.token_stream(facet.encoded_str())
.token_stream(unsafe { str::from_utf8_unchecked(facet.encoded_bytes()) })
.process(&mut add_token);
}
assert_eq!(tokens.len(), 4);
@@ -112,11 +115,11 @@ mod tests {
let mut tokens = vec![];
{
let mut add_token = |token: &Token| {
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap(); // ok test
let facet = unsafe { Facet::from_encoded(token.text.as_bytes().to_owned()) }; // ok test
tokens.push(format!("{}", facet));
};
FacetTokenizer
.token_stream(facet.encoded_str()) // ok test
.token_stream(unsafe { str::from_utf8_unchecked(facet.encoded_bytes()) }) // ok test
.process(&mut add_token);
}
assert_eq!(tokens.len(), 1);

View File

@@ -140,6 +140,7 @@ mod stop_word_filter;
mod token_stream_chain;
mod tokenizer;
mod tokenizer_manager;
mod char_processing;
pub use self::alphanum_only::AlphaNumOnlyFilter;
pub use self::facet_tokenizer::FacetTokenizer;