Compare commits

..

4 Commits

Author SHA1 Message Date
Paul Masurel
f3099a83eb Blop 2018-12-24 11:41:18 +09:00
Paul Masurel
f745bb9d2a blop 2018-12-24 11:28:08 +09:00
Paul Masurel
d9417acbc6 done 2018-12-11 09:01:45 +09:00
Paul Masurel
38540c3826 small step 2018-12-09 15:26:19 +09:00
54 changed files with 820 additions and 497 deletions

View File

@@ -1,4 +1,4 @@
Tantivy 0.8.0
Tantivy 0.8.1
=====================
*No change in the index format*
- API Breaking change in the collector API. (@jwolfe, @fulmicoton)

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.8.0"
version = "0.8.0-dev"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -29,7 +29,7 @@ serde = "1.0"
serde_derive = "1.0"
serde_json = "1.0"
num_cpus = "1.2"
itertools = "0.8"
itertools = "0.7"
levenshtein_automata = {version="0.1", features=["fst_automaton"]}
bit-set = "0.5"
uuid = { version = "0.7", features = ["v4", "serde"] }
@@ -49,7 +49,7 @@ failure = "0.1"
htmlescape = "0.3.1"
fail = "0.2"
scoped-pool = "1.0"
murmurhash32 = "0.2"
aho-corasick = "0.6"
[target.'cfg(windows)'.dependencies]
winapi = "0.2"

View File

@@ -106,37 +106,37 @@ fn main() -> tantivy::Result<()> {
// For convenience, tantivy also comes with a macro to
// reduce the boilerplate above.
index_writer.add_document(doc!(
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));
index_writer.add_document(doc!(
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));
// Multivalued field just need to be repeated.
index_writer.add_document(doc!(
title => "Frankenstein",
title => "The Modern Prometheus",
body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking."
title => "Frankenstein",
title => "The Modern Prometheus",
body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking."
));
// This is an example, so we will only index 3 documents

View File

@@ -68,12 +68,12 @@ fn main() -> tantivy::Result<()> {
// heap for the indexer can increase its throughput.
let mut index_writer = index.writer(50_000_000)?;
index_writer.add_document(doc!(
title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish."
title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish."
));
index_writer.add_document(doc!(
title => "Of Mice and Men",
title => "Of Mice and Men",
body => r#"A few miles south of Soledad, the Salinas River drops in close to the hillside
bank and runs deep and green. The water is warm too, for it has slipped twinkling
over the yellow sands in the sunlight before reaching the narrow pool. On one
@@ -84,7 +84,7 @@ fn main() -> tantivy::Result<()> {
limbs and branches that arch over the pool"#
));
index_writer.add_document(doc!(
title => "Frankenstein",
title => "Frankenstein",
body => r#"You will rejoice to hear that no disaster has accompanied the commencement of an
enterprise which you have regarded with such evil forebodings. I arrived here
yesterday, and my first task is to assure my dear sister of my welfare and

View File

@@ -35,15 +35,15 @@ fn main() -> tantivy::Result<()> {
// we'll only need one doc for this example.
index_writer.add_document(doc!(
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));
// ...
index_writer.commit()?;
@@ -56,7 +56,7 @@ fn main() -> tantivy::Result<()> {
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;
let snippet_generator = SnippetGenerator::new(&searcher, &*query, body)?;
for (score, doc_address) in top_docs {
let doc = searcher.doc(doc_address)?;

View File

@@ -72,26 +72,26 @@ fn main() -> tantivy::Result<()> {
title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish."
));
));
index_writer.add_document(doc!(
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));
index_writer.add_document(doc!(
title => "Frankenstein",
body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking."
title => "Frankenstein",
body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking."
));
index_writer.commit()?;

View File

@@ -197,7 +197,7 @@ fn skip<'a, I: Iterator<Item = &'a Facet>>(
) -> SkipResult {
loop {
match collapse_it.peek() {
Some(facet_bytes) => match facet_bytes.encoded_str().as_bytes().cmp(target) {
Some(facet_bytes) => match facet_bytes.encoded_bytes().cmp(target) {
Ordering::Less => {}
Ordering::Greater => {
return SkipResult::OverStep;
@@ -369,8 +369,7 @@ impl SegmentCollector for FacetSegmentCollector {
let mut facet = vec![];
let facet_ord = self.collapse_facet_ords[collapsed_facet_ord];
facet_dict.ord_to_term(facet_ord as u64, &mut facet);
// TODO
facet_counts.insert(Facet::from_encoded(facet).unwrap(), count);
facet_counts.insert(unsafe { Facet::from_encoded(facet) }, count);
}
FacetCounts { facet_counts }
}
@@ -404,9 +403,9 @@ impl FacetCounts {
let right_bound = if facet.is_root() {
Bound::Unbounded
} else {
let mut facet_after_bytes: String = facet.encoded_str().to_owned();
facet_after_bytes.push('\u{1}');
let facet_after = Facet::from_encoded_string(facet_after_bytes);
let mut facet_after_bytes: Vec<u8> = facet.encoded_bytes().to_owned();
facet_after_bytes.push(1u8);
let facet_after = unsafe { Facet::from_encoded(facet_after_bytes) }; // ok logic
Bound::Excluded(facet_after)
};
let underlying: btree_map::Range<_, _> = self.facet_counts.range((left_bound, right_bound));
@@ -475,8 +474,7 @@ mod tests {
n /= 4;
let leaf = n % 5;
Facet::from(&format!("/top{}/mid{}/leaf{}", top, mid, leaf))
})
.collect();
}).collect();
for i in 0..num_facets * 10 {
let mut doc = Document::new();
doc.add_facet(facet_field, facets[i % num_facets].clone());
@@ -502,16 +500,18 @@ mod tests {
("/top1/mid2", 50),
("/top1/mid3", 50),
]
.iter()
.map(|&(facet_str, count)| (String::from(facet_str), count))
.collect::<Vec<_>>()
.iter()
.map(|&(facet_str, count)| (String::from(facet_str), count))
.collect::<Vec<_>>()
);
}
}
#[test]
#[should_panic(expected = "Tried to add a facet which is a descendant of \
an already added facet.")]
#[should_panic(
expected = "Tried to add a facet which is a descendant of \
an already added facet."
)]
fn test_misused_facet_collector() {
let mut facet_collector = FacetCollector::for_field(Field(0));
facet_collector.add_facet(Facet::from("/country"));
@@ -563,15 +563,13 @@ mod tests {
let facet = Facet::from(&format!("/facet/{}", c));
let doc = doc!(facet_field => facet);
iter::repeat(doc).take(count)
})
.map(|mut doc| {
}).map(|mut doc| {
doc.add_facet(
facet_field,
&format!("/facet/{}", thread_rng().sample(&uniform)),
);
doc
})
.collect();
}).collect();
docs[..].shuffle(&mut thread_rng());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();

View File

@@ -43,8 +43,7 @@ impl<TCollector: Collector> Collector for CollectorWrapper<TCollector> {
let err_msg = format!("Failed to cast child collector fruit. {:?}", e);
TantivyError::InvalidArgument(err_msg)
})
})
.collect::<Result<_>>()?;
}).collect::<Result<_>>()?;
let merged_fruit = self.0.merge_fruits(typed_fruit)?;
Ok(Box::new(merged_fruit))
}
@@ -148,8 +147,6 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
/// Ok(())
/// }
/// ```
#[allow(clippy::type_complexity)]
#[derive(Default)]
pub struct MultiCollector<'a> {
collector_wrappers:
Vec<Box<Collector<Child = Box<BoxableSegmentCollector>, Fruit = Box<Fruit>> + 'a>>,
@@ -157,8 +154,10 @@ pub struct MultiCollector<'a> {
impl<'a> MultiCollector<'a> {
/// Create a new `MultiCollector`
pub fn new() -> Self {
Default::default()
pub fn new() -> MultiCollector<'a> {
MultiCollector {
collector_wrappers: Vec::new(),
}
}
/// Add a new collector to our `MultiCollector`.
@@ -214,8 +213,7 @@ impl<'a> Collector for MultiCollector<'a> {
.zip(segment_fruits_list)
.map(|(child_collector, segment_fruits)| {
Ok(Some(child_collector.merge_fruits(segment_fruits)?))
})
.collect::<Result<_>>()?;
}).collect::<Result<_>>()?;
Ok(MultiFruit { sub_fruits })
}
}

View File

@@ -84,9 +84,11 @@ where
for (feature, doc) in child_fruit {
if top_collector.len() < self.limit {
top_collector.push(ComparableDoc { feature, doc });
} else if let Some(mut head) = top_collector.peek_mut() {
if head.feature < feature {
*head = ComparableDoc { feature, doc };
} else {
if let Some(mut head) = top_collector.peek_mut() {
if head.feature < feature {
*head = ComparableDoc { feature, doc };
}
}
}
}
@@ -140,8 +142,7 @@ impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
comparable_doc.feature,
DocAddress(segment_id, comparable_doc.doc),
)
})
.collect()
}).collect()
}
/// Return true iff at least K documents have gone through

View File

@@ -1,6 +1,9 @@
use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
use common::serialize::BinarySerializable;
use std::io;
use std::io::Write;
use std::mem;
use std::ops::Deref;
use std::ptr;
pub(crate) struct BitPacker {
mini_buffer: u64,
@@ -15,7 +18,7 @@ impl BitPacker {
}
}
pub fn write<TWrite: io::Write>(
pub fn write<TWrite: Write>(
&mut self,
val: u64,
num_bits: u8,
@@ -25,14 +28,14 @@ impl BitPacker {
let num_bits = num_bits as usize;
if self.mini_buffer_written + num_bits > 64 {
self.mini_buffer |= val_u64.wrapping_shl(self.mini_buffer_written as u32);
output.write_u64::<LittleEndian>(self.mini_buffer)?;
self.mini_buffer.serialize(output)?;
self.mini_buffer = val_u64.wrapping_shr((64 - self.mini_buffer_written) as u32);
self.mini_buffer_written = self.mini_buffer_written + num_bits - 64;
} else {
self.mini_buffer |= val_u64 << self.mini_buffer_written;
self.mini_buffer_written += num_bits;
if self.mini_buffer_written == 64 {
output.write_u64::<LittleEndian>(self.mini_buffer)?;
self.mini_buffer.serialize(output)?;
self.mini_buffer_written = 0;
self.mini_buffer = 0u64;
}
@@ -40,18 +43,17 @@ impl BitPacker {
Ok(())
}
pub fn flush<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
pub fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
if self.mini_buffer_written > 0 {
let num_bytes = (self.mini_buffer_written + 7) / 8;
let mut arr: [u8; 8] = [0u8; 8];
LittleEndian::write_u64(&mut arr, self.mini_buffer);
let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer.to_le()) };
output.write_all(&arr[..num_bytes])?;
self.mini_buffer_written = 0;
}
Ok(())
}
pub fn close<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
pub fn close<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
self.flush(output)?;
// Padding the write file to simplify reads.
output.write_all(&[0u8; 7])?;
@@ -100,7 +102,9 @@ where
addr + 8 <= data.len(),
"The fast field field should have been padded with 7 bytes."
);
let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[addr..]);
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
let val_unshifted_unmasked: u64 =
u64::from_le(unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) });
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
val_shifted & mask
}
@@ -122,7 +126,9 @@ where
for output_val in output.iter_mut() {
let addr = addr_in_bits >> 3;
let bit_shift = addr_in_bits & 7;
let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[addr..]);
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
let val_unshifted_unmasked: u64 =
unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
*output_val = val_shifted & mask;
addr_in_bits += num_bits;

View File

@@ -64,18 +64,17 @@ impl Executor {
// This is important as it makes it possible for the fruit_receiver iteration to
// terminate.
};
// This is lame, but it does not use unsafe code.
let mut results_with_position = Vec::with_capacity(num_fruits);
let mut results = Vec::with_capacity(num_fruits);
unsafe { results.set_len(num_fruits) };
let mut num_items = 0;
for (pos, fruit_res) in fruit_receiver {
let fruit = fruit_res?;
results_with_position.push((pos, fruit));
results[pos] = fruit_res?;
num_items += 1;
}
results_with_position.sort_by_key(|(pos, _)| *pos);
assert_eq!(results_with_position.len(), num_fruits);
Ok(results_with_position
.into_iter()
.map(|(_, fruit)| fruit)
.collect::<Vec<_>>())
// this checks ensures that we filled of this
// uninitialized memory.
assert_eq!(num_items, results.len());
Ok(results)
}
}
}
@@ -95,8 +94,7 @@ mod tests {
panic!("panic should propagate");
},
vec![0].into_iter(),
)
.unwrap();
).unwrap();
}
#[test]
@@ -108,8 +106,7 @@ mod tests {
panic!("panic should propagate");
},
vec![0].into_iter(),
)
.unwrap();
).unwrap();
}
#[test]

View File

@@ -13,7 +13,6 @@ use directory::ManagedDirectory;
#[cfg(feature = "mmap")]
use directory::MmapDirectory;
use directory::{Directory, RAMDirectory};
use error::DataCorruption;
use error::TantivyError;
use indexer::index_writer::open_index_writer;
use indexer::index_writer::HEAP_SIZE_MIN;
@@ -38,13 +37,7 @@ fn load_metas(directory: &Directory) -> Result<IndexMeta> {
let meta_data = directory.atomic_read(&META_FILEPATH)?;
let meta_string = String::from_utf8_lossy(&meta_data);
serde_json::from_str(&meta_string)
.map_err(|e| {
DataCorruption::new(
META_FILEPATH.clone(),
format!("Meta file cannot be deserialized. {:?}.", e),
)
})
.map_err(From::from)
.map_err(|_| TantivyError::CorruptedFile(META_FILEPATH.clone()))
}
/// Search Index
@@ -142,7 +135,7 @@ impl Index {
/// Creates a new index given an implementation of the trait `Directory`
pub fn create<Dir: Directory>(dir: Dir, schema: Schema) -> Result<Index> {
let directory = ManagedDirectory::wrap(dir)?;
let directory = ManagedDirectory::new(dir)?;
Index::from_directory(directory, schema)
}
@@ -206,7 +199,7 @@ impl Index {
/// Open the index using the provided directory
pub fn open<D: Directory>(directory: D) -> Result<Index> {
let directory = ManagedDirectory::wrap(directory)?;
let directory = ManagedDirectory::new(directory)?;
let metas = load_metas(&directory)?;
Index::create_from_metas(directory, &metas)
}

View File

@@ -32,7 +32,10 @@ pub struct InvertedIndexReader {
}
impl InvertedIndexReader {
#[cfg_attr(feature = "cargo-clippy", allow(clippy::needless_pass_by_value))] // for symetry
#[cfg_attr(
feature = "cargo-clippy",
allow(clippy::needless_pass_by_value)
)] // for symetry
pub(crate) fn new(
termdict: TermDictionary,
postings_source: ReadOnlySource,

View File

@@ -104,8 +104,7 @@ impl Searcher {
.iter()
.map(|segment_reader| {
u64::from(segment_reader.inverted_index(term.field()).doc_freq(term))
})
.sum::<u64>()
}).sum::<u64>()
}
/// Return the list of segment readers

View File

@@ -1,7 +1,7 @@
use core::MANAGED_FILEPATH;
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
use directory::{ReadOnlySource, WritePtr};
use error::DataCorruption;
use error::TantivyError;
use indexer::LockType;
use serde_json;
use std::collections::HashSet;
@@ -59,17 +59,12 @@ fn save_managed_paths(
impl ManagedDirectory {
/// Wraps a directory as managed directory.
pub fn wrap<Dir: Directory>(directory: Dir) -> Result<ManagedDirectory> {
pub fn new<Dir: Directory>(directory: Dir) -> Result<ManagedDirectory> {
match directory.atomic_read(&MANAGED_FILEPATH) {
Ok(data) => {
let managed_files_json = String::from_utf8_lossy(&data);
let managed_files: HashSet<PathBuf> = serde_json::from_str(&managed_files_json)
.map_err(|e| {
DataCorruption::new(
MANAGED_FILEPATH.clone(),
format!("Managed file cannot be deserialized: {:?}. ", e),
)
})?;
.map_err(|_| TantivyError::CorruptedFile(MANAGED_FILEPATH.clone()))?;
Ok(ManagedDirectory {
directory: Box::new(directory),
meta_informations: Arc::new(RwLock::new(MetaInformation {
@@ -265,7 +260,7 @@ mod tests {
let tempdir_path = PathBuf::from(tempdir.path());
{
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
{
let mut write_file = managed_directory.open_write(*TEST_PATH1).unwrap();
write_file.flush().unwrap();
@@ -291,7 +286,7 @@ mod tests {
}
{
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
{
assert!(managed_directory.exists(*TEST_PATH1));
assert!(!managed_directory.exists(*TEST_PATH2));
@@ -315,7 +310,7 @@ mod tests {
let living_files = HashSet::new();
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
managed_directory
.atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
.unwrap();

View File

@@ -100,8 +100,7 @@ impl InnerDirectory {
);
let io_err = make_io_err(msg);
OpenReadError::IOError(IOError::with_path(path.to_owned(), io_err))
})
.and_then(|readable_map| {
}).and_then(|readable_map| {
readable_map
.get(path)
.ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
@@ -121,8 +120,7 @@ impl InnerDirectory {
);
let io_err = make_io_err(msg);
DeleteError::IOError(IOError::with_path(path.to_owned(), io_err))
})
.and_then(|mut writable_map| match writable_map.remove(path) {
}).and_then(|mut writable_map| match writable_map.remove(path) {
Some(_) => Ok(()),
None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))),
})

View File

@@ -8,42 +8,9 @@ use indexer::LockType;
use query;
use schema;
use serde_json;
use std::fmt;
use std::path::PathBuf;
use std::sync::PoisonError;
pub struct DataCorruption {
filepath: Option<PathBuf>,
comment: String,
}
impl DataCorruption {
pub fn new(filepath: PathBuf, comment: String) -> DataCorruption {
DataCorruption {
filepath: Some(filepath),
comment,
}
}
pub fn comment_only(comment: String) -> DataCorruption {
DataCorruption {
filepath: None,
comment,
}
}
}
impl fmt::Debug for DataCorruption {
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
write!(f, "Data corruption: ")?;
if let Some(ref filepath) = &self.filepath {
write!(f, "(in file `{:?}`)", filepath)?;
}
write!(f, ": {}.", self.comment)?;
Ok(())
}
}
/// The library's failure based error enum
#[derive(Debug, Fail)]
pub enum TantivyError {
@@ -66,8 +33,8 @@ pub enum TantivyError {
#[fail(display = "An IO error occurred: '{}'", _0)]
IOError(#[cause] IOError),
/// Data corruption.
#[fail(display = "{:?}", _0)]
DataCorruption(DataCorruption),
#[fail(display = "File contains corrupted data: '{:?}'", _0)]
CorruptedFile(PathBuf),
/// A thread holding the locked panicked and poisoned the lock.
#[fail(display = "A thread holding the locked panicked and poisoned the lock")]
Poisoned,
@@ -88,12 +55,6 @@ pub enum TantivyError {
SystemError(String),
}
impl From<DataCorruption> for TantivyError {
fn from(data_corruption: DataCorruption) -> TantivyError {
TantivyError::DataCorruption(data_corruption)
}
}
impl From<FastFieldNotAvailableError> for TantivyError {
fn from(fastfield_error: FastFieldNotAvailableError) -> TantivyError {
TantivyError::FastFieldError(fastfield_error)

View File

@@ -1,6 +1,5 @@
use super::MultiValueIntFastFieldReader;
use schema::Facet;
use std::str;
use termdict::TermDictionary;
use termdict::TermOrdinal;
use DocId;
@@ -21,7 +20,6 @@ use DocId;
pub struct FacetReader {
term_ords: MultiValueIntFastFieldReader<u64>,
term_dict: TermDictionary,
buffer: Vec<u8>,
}
impl FacetReader {
@@ -39,7 +37,6 @@ impl FacetReader {
FacetReader {
term_ords,
term_dict,
buffer: vec![],
}
}
@@ -58,18 +55,11 @@ impl FacetReader {
}
/// Given a term ordinal returns the term associated to it.
pub fn facet_from_ord(
&mut self,
facet_ord: TermOrdinal,
output: &mut Facet,
) -> Result<(), str::Utf8Error> {
pub fn facet_from_ord(&self, facet_ord: TermOrdinal, output: &mut Facet) {
let found_term = self
.term_dict
.ord_to_term(facet_ord as u64, &mut self.buffer);
.ord_to_term(facet_ord as u64, output.inner_buffer_mut());
assert!(found_term, "Term ordinal {} no found.", facet_ord);
let facet_str = str::from_utf8(&self.buffer[..])?;
output.set_facet_str(facet_str);
Ok(())
}
/// Return the list of facet ordinals associated to a document.

View File

@@ -82,20 +82,20 @@ mod tests {
let mut facet = Facet::root();
{
facet_reader.facet_from_ord(1, &mut facet).unwrap();
facet_reader.facet_from_ord(1, &mut facet);
assert_eq!(facet, Facet::from("/category"));
}
{
facet_reader.facet_from_ord(2, &mut facet).unwrap();
facet_reader.facet_from_ord(2, &mut facet);
assert_eq!(facet, Facet::from("/category/cat1"));
}
{
facet_reader.facet_from_ord(3, &mut facet).unwrap();
facet_reader.facet_from_ord(3, &mut facet);
assert_eq!(format!("{}", facet), "/category/cat2");
assert_eq!(facet, Facet::from("/category/cat2"));
}
{
facet_reader.facet_from_ord(4, &mut facet).unwrap();
facet_reader.facet_from_ord(4, &mut facet);
assert_eq!(facet, Facet::from("/category/cat3"));
}

View File

@@ -191,7 +191,10 @@ impl DeleteCursor {
}
}
#[cfg_attr(feature = "cargo-clippy", allow(clippy::wrong_self_convention))]
#[cfg_attr(
feature = "cargo-clippy",
allow(clippy::wrong_self_convention)
)]
fn is_behind_opstamp(&mut self, target_opstamp: u64) -> bool {
self.get()
.map(|operation| operation.opstamp < target_opstamp)

View File

@@ -61,8 +61,7 @@ fn initial_table_size(per_thread_memory_budget: usize) -> usize {
"Per thread memory is too small: {}",
per_thread_memory_budget
)
})
.min(19) // we cap it at 512K
}).min(19) // we cap it at 512K
}
/// `IndexWriter` is the user entry-point to add document to an index.
@@ -140,7 +139,7 @@ pub fn open_index_writer(
let stamper = Stamper::new(current_opstamp);
let segment_updater =
SegmentUpdater::create(index.clone(), stamper.clone(), &delete_queue.cursor())?;
SegmentUpdater::new(index.clone(), stamper.clone(), &delete_queue.cursor())?;
let mut index_writer = IndexWriter {
_directory_lock: Some(directory_lock),
@@ -391,8 +390,7 @@ impl IndexWriter {
.name(format!(
"thrd-tantivy-index{}-gen{}",
self.worker_id, generation
))
.spawn(move || {
)).spawn(move || {
loop {
let mut document_iterator =
document_receiver_clone.clone().into_iter().peekable();
@@ -467,8 +465,10 @@ impl IndexWriter {
///
/// Returns the former segment_ready channel.
fn recreate_document_channel(&mut self) -> DocumentReceiver {
let (mut document_sender, mut document_receiver): (DocumentSender, DocumentReceiver) =
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
let (mut document_sender, mut document_receiver): (
DocumentSender,
DocumentReceiver,
) = channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
swap(&mut self.document_sender, &mut document_sender);
swap(&mut self.document_receiver, &mut document_receiver);
document_receiver

View File

@@ -40,15 +40,13 @@ fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
total_tokens += reader.inverted_index(field).total_num_tokens();
}
}
total_tokens
+ count
.iter()
.cloned()
.enumerate()
.map(|(fieldnorm_ord, count)| {
count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8))
})
.sum::<u64>()
total_tokens + count
.iter()
.cloned()
.enumerate()
.map(|(fieldnorm_ord, count)| {
count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8))
}).sum::<u64>()
}
pub struct IndexMerger {
@@ -525,8 +523,7 @@ impl IndexMerger {
}
}
None
})
.collect();
}).collect();
// At this point, `segment_postings` contains the posting list
// of all of the segments containing the given term.
@@ -667,8 +664,7 @@ mod tests {
TextFieldIndexing::default()
.set_tokenizer("default")
.set_index_option(IndexRecordOption::WithFreqs),
)
.set_stored();
).set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue);
let score_field = schema_builder.add_u64_field("score", score_fieldtype);
@@ -807,8 +803,7 @@ mod tests {
.search(
&query,
&BytesFastFieldTestCollector::for_field(bytes_score_field),
)
.expect("failed to search")
).expect("failed to search")
};
assert_eq!(
get_fast_vals(vec![Term::from_field_text(text_field, "a")]),
@@ -828,8 +823,7 @@ mod tests {
let text_fieldtype = schema::TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
)
.set_stored();
).set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue);
let score_field = schema_builder.add_u64_field("score", score_fieldtype);
@@ -857,21 +851,21 @@ mod tests {
{
// a first commit
index_writer.add_document(doc!(
text_field => "a b d",
score_field => 1u64,
bytes_score_field => vec![0u8, 0, 0, 1],
));
text_field => "a b d",
score_field => 1u64,
bytes_score_field => vec![0u8, 0, 0, 1],
));
index_writer.add_document(doc!(
text_field => "b c",
score_field => 2u64,
bytes_score_field => vec![0u8, 0, 0, 2],
));
text_field => "b c",
score_field => 2u64,
bytes_score_field => vec![0u8, 0, 0, 2],
));
index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.add_document(doc!(
text_field => "c d",
score_field => 3u64,
bytes_score_field => vec![0u8, 0, 0, 3],
));
text_field => "c d",
score_field => 3u64,
bytes_score_field => vec![0u8, 0, 0, 3],
));
index_writer.commit().expect("committed");
index.load_searchers().unwrap();
let ref searcher = *index.searcher();
@@ -898,27 +892,27 @@ mod tests {
{
// a second commit
index_writer.add_document(doc!(
text_field => "a d e",
score_field => 4_000u64,
bytes_score_field => vec![0u8, 0, 0, 4],
));
text_field => "a d e",
score_field => 4_000u64,
bytes_score_field => vec![0u8, 0, 0, 4],
));
index_writer.add_document(doc!(
text_field => "e f",
score_field => 5_000u64,
bytes_score_field => vec![0u8, 0, 0, 5],
));
text_field => "e f",
score_field => 5_000u64,
bytes_score_field => vec![0u8, 0, 0, 5],
));
index_writer.delete_term(Term::from_field_text(text_field, "a"));
index_writer.delete_term(Term::from_field_text(text_field, "f"));
index_writer.add_document(doc!(
text_field => "f g",
score_field => 6_000u64,
bytes_score_field => vec![0u8, 0, 23, 112],
));
text_field => "f g",
score_field => 6_000u64,
bytes_score_field => vec![0u8, 0, 23, 112],
));
index_writer.add_document(doc!(
text_field => "g h",
score_field => 7_000u64,
bytes_score_field => vec![0u8, 0, 27, 88],
));
text_field => "g h",
score_field => 7_000u64,
bytes_score_field => vec![0u8, 0, 27, 88],
));
index_writer.commit().expect("committed");
index.load_searchers().unwrap();
let searcher = index.searcher();

View File

@@ -138,7 +138,7 @@ struct InnerSegmentUpdater {
}
impl SegmentUpdater {
pub fn create(
pub fn new(
index: Index,
stamper: Stamper,
delete_cursor: &DeleteCursor,
@@ -195,8 +195,7 @@ impl SegmentUpdater {
segment_updater.0.segment_manager.add_segment(segment_entry);
segment_updater.consider_merge_options();
true
})
.forget();
}).forget();
true
} else {
false
@@ -250,16 +249,14 @@ impl SegmentUpdater {
opstamp,
commit_message,
directory.box_clone().borrow_mut(),
)
.expect("Could not save metas.");
).expect("Could not save metas.");
}
}
pub fn garbage_collect_files(&self) -> Result<()> {
self.run_async(move |segment_updater| {
segment_updater.garbage_collect_files_exec();
})
.wait()
}).wait()
}
fn garbage_collect_files_exec(&self) {
@@ -281,8 +278,7 @@ impl SegmentUpdater {
segment_updater.garbage_collect_files_exec();
segment_updater.consider_merge_options();
}
})
.wait()
}).wait()
}
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> {
@@ -290,8 +286,7 @@ impl SegmentUpdater {
let segment_ids_vec = segment_ids.to_vec();
self.run_async(move |segment_updater| {
segment_updater.start_merge_impl(&segment_ids_vec[..])
})
.wait()?
}).wait()?
}
// `segment_ids` is required to be non-empty.
@@ -357,8 +352,7 @@ impl SegmentUpdater {
.unwrap()
.remove(&merging_thread_id);
Ok(())
})
.expect("Failed to spawn a thread.");
}).expect("Failed to spawn a thread.");
self.0
.merging_threads
.write()
@@ -449,8 +443,7 @@ impl SegmentUpdater {
let previous_metas = segment_updater.0.index.load_metas().unwrap();
segment_updater.save_metas(previous_metas.opstamp, previous_metas.payload);
segment_updater.garbage_collect_files_exec();
})
.wait()
}).wait()
}
/// Wait for current merging threads.

View File

@@ -62,8 +62,7 @@ impl SegmentWriter {
segment.index().tokenizers().get(tokenizer_name)
}),
_ => None,
})
.collect();
}).collect();
Ok(SegmentWriter {
max_doc: 0,
multifield_postings,
@@ -111,18 +110,18 @@ impl SegmentWriter {
}
match *field_options.field_type() {
FieldType::HierarchicalFacet => {
let facets: Vec<&str> = field_values
let facets: Vec<&[u8]> = field_values
.iter()
.flat_map(|field_value| match *field_value.value() {
Value::Facet(ref facet) => Some(facet.encoded_str()),
Value::Facet(ref facet) => Some(facet.encoded_bytes()),
_ => {
panic!("Expected hierarchical facet");
}
})
.collect();
}).collect();
let mut term = Term::for_field(field); // we set the Term
for fake_str in facets {
for facet_bytes in facets {
let mut unordered_term_id_opt = None;
let fake_str = unsafe { str::from_utf8_unchecked(facet_bytes) };
FacetTokenizer.token_stream(fake_str).process(&mut |token| {
term.set_text(&token.text);
let unordered_term_id =
@@ -146,8 +145,7 @@ impl SegmentWriter {
.flat_map(|field_value| match *field_value.value() {
Value::Str(ref text) => Some(text.as_str()),
_ => None,
})
.collect();
}).collect();
if texts.is_empty() {
0
} else {

View File

@@ -1,5 +1,6 @@
#![doc(html_logo_url = "http://fulmicoton.com/tantivy-logo/tantivy-logo.png")]
#![cfg_attr(all(feature = "unstable", test), feature(test))]
#![cfg_attr(feature = "cargo-clippy", feature(tool_lints))]
#![cfg_attr(feature = "cargo-clippy", allow(clippy::module_inception))]
#![doc(test(attr(allow(unused_variables), deny(warnings))))]
#![warn(missing_docs)]
@@ -183,7 +184,10 @@ mod macros;
pub use error::TantivyError;
#[deprecated(since = "0.7.0", note = "please use `tantivy::TantivyError` instead")]
#[deprecated(
since = "0.7.0",
note = "please use `tantivy::TantivyError` instead"
)]
pub use error::TantivyError as Error;
extern crate census;
@@ -213,7 +217,7 @@ pub mod store;
pub mod termdict;
mod snippet;
pub use self::snippet::{Snippet, SnippetGenerator};
pub use self::snippet::{SnippetGenerator, Snippet};
mod docset;
pub use self::docset::{DocSet, SkipResult};
@@ -510,9 +514,11 @@ mod tests {
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(text_field);
assert!(inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none());
assert!(
inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none()
);
{
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
@@ -547,9 +553,11 @@ mod tests {
let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(term_abcd.field());
assert!(inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none());
assert!(
inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none()
);
{
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
@@ -583,9 +591,11 @@ mod tests {
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(term_abcd.field());
assert!(inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none());
assert!(
inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none()
);
{
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
@@ -733,9 +743,11 @@ mod tests {
let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(text_field);
let term_abcd = Term::from_field_text(text_field, "abcd");
assert!(inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none());
assert!(
inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none()
);
let term_af = Term::from_field_text(text_field, "af");
let mut postings = inverted_index
.read_postings(&term_af, IndexRecordOption::WithFreqsAndPositions)

View File

@@ -77,10 +77,10 @@ mod test {
let likes = schema_builder.add_u64_field("num_u64", FAST);
let _schema = schema_builder.build();
let _doc = doc!(
title => "Life Aquatic",
author => "Wes Anderson",
likes => 4u64
);
title => "Life Aquatic",
author => "Wes Anderson",
likes => 4u64
);
}
#[test]
@@ -91,9 +91,9 @@ mod test {
let likes = schema_builder.add_u64_field("num_u64", FAST);
let _schema = schema_builder.build();
let _doc = doc!(
title => "Life Aquatic",
author => "Wes Anderson",
likes => 4u64,
);
title => "Life Aquatic",
author => "Wes Anderson",
likes => 4u64,
);
}
}

View File

@@ -221,10 +221,12 @@ pub mod tests {
}
{
let term_a = Term::from_field_text(text_field, "abcdef");
assert!(segment_reader
.inverted_index(term_a.field())
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.is_none());
assert!(
segment_reader
.inverted_index(term_a.field())
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.is_none()
);
}
{
let term_a = Term::from_field_text(text_field, "a");

View File

@@ -29,8 +29,7 @@ fn posting_from_field_entry(field_entry: &FieldEntry) -> Box<PostingsWriter> {
IndexRecordOption::WithFreqsAndPositions => {
SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed()
}
})
.unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
}).unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
FieldType::U64(_) | FieldType::I64(_) | FieldType::HierarchicalFacet => {
SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
}
@@ -108,8 +107,10 @@ impl MultiFieldPostingsWriter {
.map(|(key, _, _)| Term::wrap(key).field())
.enumerate();
let mut unordered_term_mappings: HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>> =
HashMap::new();
let mut unordered_term_mappings: HashMap<
Field,
HashMap<UnorderedTermId, TermOrdinal>,
> = HashMap::new();
let mut prev_field = Field(u32::max_value());
for (offset, field) in term_offsets_it {
@@ -137,8 +138,7 @@ impl MultiFieldPostingsWriter {
.enumerate()
.map(|(term_ord, unord_term_id)| {
(unord_term_id as UnorderedTermId, term_ord as TermOrdinal)
})
.collect();
}).collect();
unordered_term_mappings.insert(field, mapping);
}
FieldType::U64(_) | FieldType::I64(_) => {}

View File

@@ -126,6 +126,7 @@ impl SegmentPostings {
fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) {
let mut start = 0;
let end = arr.len();
debug_assert!(target >= arr[start]);
debug_assert!(target <= arr[end - 1]);
let mut jump = 1;
loop {
@@ -215,10 +216,11 @@ impl DocSet for SegmentPostings {
// we're in the right block now, start with an exponential search
let block_docs = self.block_cursor.docs();
debug_assert!(target >= self.doc());
let new_cur = self
.cur
.wrapping_add(search_within_block(&block_docs[self.cur..], target));
if need_positions {
sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur]
.iter()
@@ -531,8 +533,7 @@ impl BlockSegmentPostings {
} else {
BlockSegmentPostingsSkipResult::Terminated
}
})
.unwrap_or(BlockSegmentPostingsSkipResult::Terminated);
}).unwrap_or(BlockSegmentPostingsSkipResult::Terminated);
}
BlockSegmentPostingsSkipResult::Terminated
}
@@ -620,7 +621,6 @@ impl<'b> Streamer<'b> for BlockSegmentPostings {
#[cfg(test)]
mod tests {
use super::exponential_search;
use super::search_within_block;
use super::BlockSegmentPostings;
use super::BlockSegmentPostingsSkipResult;
@@ -634,7 +634,6 @@ mod tests {
use schema::Term;
use schema::INT_INDEXED;
use DocId;
use SkipResult;
#[test]
fn test_empty_segment_postings() {
@@ -662,16 +661,6 @@ mod tests {
.0
}
#[test]
fn test_exponentiel_search() {
assert_eq!(exponential_search(0, &[1, 2]), (0, 1));
assert_eq!(exponential_search(1, &[1, 2]), (0, 1));
assert_eq!(
exponential_search(7, &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
(3, 7)
);
}
fn util_test_search_within_block(block: &[u32], target: u32) {
assert_eq!(
search_within_block(block, target),
@@ -703,7 +692,7 @@ mod tests {
#[test]
fn test_block_segment_postings() {
let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>());
let mut block_segments = build_block_postings((0..100_000).collect::<Vec<u32>>());
let mut offset: u32 = 0u32;
// checking that the block before calling advance is empty
assert!(block_segments.docs().is_empty());
@@ -717,44 +706,14 @@ mod tests {
}
}
#[test]
fn test_skip_right_at_new_block() {
let mut doc_ids = (0..128).collect::<Vec<u32>>();
doc_ids.push(129);
doc_ids.push(130);
{
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.skip_next(128), SkipResult::OverStep);
assert_eq!(docset.doc(), 129);
assert!(docset.advance());
assert_eq!(docset.doc(), 130);
assert!(!docset.advance());
}
{
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.skip_next(129), SkipResult::Reached);
assert_eq!(docset.doc(), 129);
assert!(docset.advance());
assert_eq!(docset.doc(), 130);
assert!(!docset.advance());
}
{
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.skip_next(131), SkipResult::End);
}
}
fn build_block_postings(docs: &[DocId]) -> BlockSegmentPostings {
fn build_block_postings(docs: Vec<DocId>) -> BlockSegmentPostings {
let mut schema_builder = Schema::builder();
let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let mut last_doc = 0u32;
for &doc in docs {
for doc in docs {
for _ in last_doc..doc {
index_writer.add_document(doc!(int_field=>1u64));
}
@@ -774,7 +733,7 @@ mod tests {
#[test]
fn test_block_segment_postings_skip() {
for i in 0..4 {
let mut block_postings = build_block_postings(&[3]);
let mut block_postings = build_block_postings(vec![3]);
assert_eq!(
block_postings.skip_to(i),
BlockSegmentPostingsSkipResult::Success(0u32)
@@ -784,7 +743,7 @@ mod tests {
BlockSegmentPostingsSkipResult::Terminated
);
}
let mut block_postings = build_block_postings(&[3]);
let mut block_postings = build_block_postings(vec![3]);
assert_eq!(
block_postings.skip_to(4u32),
BlockSegmentPostingsSkipResult::Terminated
@@ -797,7 +756,7 @@ mod tests {
for i in 0..1300 {
docs.push((i * i / 100) + i);
}
let mut block_postings = build_block_postings(&docs[..]);
let mut block_postings = build_block_postings(docs.clone());
for i in vec![0, 424, 10000] {
assert_eq!(
block_postings.skip_to(i),

View File

@@ -55,7 +55,7 @@ pub struct InvertedIndexSerializer {
impl InvertedIndexSerializer {
/// Open a new `PostingsSerializer` for the given segment
fn create(
fn new(
terms_write: CompositeWrite<WritePtr>,
postings_write: CompositeWrite<WritePtr>,
positions_write: CompositeWrite<WritePtr>,
@@ -74,7 +74,7 @@ impl InvertedIndexSerializer {
/// Open a new `PostingsSerializer` for the given segment
pub fn open(segment: &mut Segment) -> Result<InvertedIndexSerializer> {
use SegmentComponent::{POSITIONS, POSITIONSSKIP, POSTINGS, TERMS};
InvertedIndexSerializer::create(
InvertedIndexSerializer::new(
CompositeWrite::wrap(segment.open_write(TERMS)?),
CompositeWrite::wrap(segment.open_write(POSTINGS)?),
CompositeWrite::wrap(segment.open_write(POSITIONS)?),
@@ -99,7 +99,7 @@ impl InvertedIndexSerializer {
let positions_write = self.positions_write.for_field(field);
let positionsidx_write = self.positionsidx_write.for_field(field);
let field_type: FieldType = (*field_entry.field_type()).clone();
FieldSerializer::create(
FieldSerializer::new(
&field_type,
term_dictionary_write,
postings_write,
@@ -130,7 +130,7 @@ pub struct FieldSerializer<'a> {
}
impl<'a> FieldSerializer<'a> {
fn create(
fn new(
field_type: &FieldType,
term_dictionary_write: &'a mut CountingWriter<WritePtr>,
postings_write: &'a mut CountingWriter<WritePtr>,
@@ -152,7 +152,7 @@ impl<'a> FieldSerializer<'a> {
_ => (false, false),
};
let term_dictionary_builder =
TermDictionaryBuilder::create(term_dictionary_write, &field_type)?;
TermDictionaryBuilder::new(term_dictionary_write, &field_type)?;
let postings_serializer =
PostingsSerializer::new(postings_write, term_freq_enabled, position_enabled);
let positions_serializer_opt = if position_enabled {

View File

@@ -1,7 +1,9 @@
mod expull;
mod memory_arena;
mod murmurhash2;
mod term_hashmap;
pub use self::expull::ExpUnrolledLinkedList;
pub use self::memory_arena::{Addr, ArenaStorable, MemoryArena};
use self::murmurhash2::murmurhash2;
pub use self::term_hashmap::{compute_table_size, TermHashMap};

View File

@@ -0,0 +1,87 @@
use std::ptr;
const SEED: u32 = 3_242_157_231u32;
const M: u32 = 0x5bd1_e995;
#[inline(always)]
pub fn murmurhash2(key: &[u8]) -> u32 {
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
let mut key_ptr: *const u32 = key.as_ptr() as *const u32;
let len = key.len() as u32;
let mut h: u32 = SEED ^ len;
let num_blocks = len >> 2;
for _ in 0..num_blocks {
let mut k: u32 = unsafe { ptr::read_unaligned(key_ptr) }; // ok because of num_blocks definition
k = k.wrapping_mul(M);
k ^= k >> 24;
k = k.wrapping_mul(M);
h = h.wrapping_mul(M);
h ^= k;
key_ptr = key_ptr.wrapping_offset(1);
}
// Handle the last few bytes of the input array
let remaining: &[u8] = &key[key.len() & !3..];
match remaining.len() {
3 => {
h ^= u32::from(remaining[2]) << 16;
h ^= u32::from(remaining[1]) << 8;
h ^= u32::from(remaining[0]);
h = h.wrapping_mul(M);
}
2 => {
h ^= u32::from(remaining[1]) << 8;
h ^= u32::from(remaining[0]);
h = h.wrapping_mul(M);
}
1 => {
h ^= u32::from(remaining[0]);
h = h.wrapping_mul(M);
}
_ => {}
}
h ^= h >> 13;
h = h.wrapping_mul(M);
h ^ (h >> 15)
}
#[cfg(test)]
mod test {
use super::murmurhash2;
use std::collections::HashSet;
#[test]
fn test_murmur() {
let s1 = "abcdef";
let s2 = "abcdeg";
for i in 0..5 {
assert_eq!(
murmurhash2(&s1[i..5].as_bytes()),
murmurhash2(&s2[i..5].as_bytes())
);
}
}
#[test]
fn test_murmur_against_reference_impl() {
assert_eq!(murmurhash2("".as_bytes()), 3632506080);
assert_eq!(murmurhash2("a".as_bytes()), 455683869);
assert_eq!(murmurhash2("ab".as_bytes()), 2448092234);
assert_eq!(murmurhash2("abc".as_bytes()), 2066295634);
assert_eq!(murmurhash2("abcd".as_bytes()), 2588571162);
assert_eq!(murmurhash2("abcde".as_bytes()), 2988696942);
assert_eq!(murmurhash2("abcdefghijklmnop".as_bytes()), 2350868870);
}
#[test]
fn test_murmur_collisions() {
let mut set: HashSet<u32> = HashSet::default();
for i in 0..10_000 {
let s = format!("hash{}", i);
let hash = murmurhash2(s.as_bytes());
set.insert(hash);
}
assert_eq!(set.len(), 10_000);
}
}

View File

@@ -1,7 +1,4 @@
extern crate murmurhash32;
use self::murmurhash32::murmurhash2;
use super::murmurhash2;
use super::{Addr, ArenaStorable, MemoryArena};
use std::iter;
use std::mem;
@@ -209,7 +206,7 @@ impl TermHashMap {
self.resize();
}
let key_bytes: &[u8] = key.as_ref();
let hash = murmurhash2(key.as_ref());
let hash = murmurhash2::murmurhash2(key.as_ref());
let mut probe = self.probe(hash);
loop {
let bucket = probe.next_probe();

View File

@@ -63,8 +63,7 @@ impl BM25Weight {
.map(|term| {
let term_doc_freq = searcher.doc_freq(term);
idf(term_doc_freq, total_num_docs)
})
.sum::<f32>();
}).sum::<f32>();
BM25Weight::new(idf, average_fieldnorm)
}

View File

@@ -47,8 +47,7 @@ impl Query for BooleanQuery {
.iter()
.map(|&(ref occur, ref subquery)| {
Ok((*occur, subquery.weight(searcher, scoring_enabled)?))
})
.collect::<Result<_>>()?;
}).collect::<Result<_>>()?;
Ok(Box::new(BooleanWeight::new(sub_weights, scoring_enabled)))
}
@@ -69,8 +68,7 @@ impl BooleanQuery {
let term_query: Box<Query> =
Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs));
(Occur::Should, term_query)
})
.collect();
}).collect();
BooleanQuery::from(occur_term_queries)
}

View File

@@ -134,8 +134,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
.into_iter()
.map(|(offset, postings)| {
PostingsWithOffset::new(postings, (max_offset - offset) as u32)
})
.collect::<Vec<_>>();
}).collect::<Vec<_>>();
PhraseScorer {
intersection_docset: Intersection::new(postings_with_offsets),
num_docsets,

View File

@@ -68,8 +68,7 @@ fn trim_ast(logical_ast: LogicalAST) -> Option<LogicalAST> {
.into_iter()
.flat_map(|(occur, child)| {
trim_ast(child).map(|trimmed_child| (occur, trimmed_child))
})
.collect::<Vec<_>>();
}).collect::<Vec<_>>();
if trimmed_children.is_empty() {
None
} else {
@@ -423,8 +422,7 @@ impl QueryParser {
lower: self.resolve_bound(field, &lower)?,
upper: self.resolve_bound(field, &upper)?,
})))
})
.collect::<Result<Vec<_>, QueryParserError>>()?;
}).collect::<Result<Vec<_>, QueryParserError>>()?;
let result_ast = if clauses.len() == 1 {
clauses.pop().unwrap()
} else {
@@ -600,19 +598,25 @@ mod test {
assert!(query_parser.parse_query("signed:2324").is_ok());
assert!(query_parser.parse_query("signed:\"22\"").is_ok());
assert!(query_parser.parse_query("signed:\"-2234\"").is_ok());
assert!(query_parser
.parse_query("signed:\"-9999999999999\"")
.is_ok());
assert!(
query_parser
.parse_query("signed:\"-9999999999999\"")
.is_ok()
);
assert!(query_parser.parse_query("signed:\"a\"").is_err());
assert!(query_parser.parse_query("signed:\"2a\"").is_err());
assert!(query_parser
.parse_query("signed:\"18446744073709551615\"")
.is_err());
assert!(
query_parser
.parse_query("signed:\"18446744073709551615\"")
.is_err()
);
assert!(query_parser.parse_query("unsigned:\"2\"").is_ok());
assert!(query_parser.parse_query("unsigned:\"-2\"").is_err());
assert!(query_parser
.parse_query("unsigned:\"18446744073709551615\"")
.is_ok());
assert!(
query_parser
.parse_query("unsigned:\"18446744073709551615\"")
.is_ok()
);
test_parse_query_to_logical_ast_helper(
"unsigned:2324",
"Term([0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20])",

View File

@@ -1,3 +1,4 @@
mod term_query;
mod term_scorer;
mod term_weight;

View File

@@ -55,8 +55,7 @@ where
None
}
},
)
.collect();
).collect();
Union {
docsets: non_empty_docsets,
bitsets: Box::new([TinySet::empty(); HORIZON_NUM_TINYBITSETS]),
@@ -215,7 +214,10 @@ where
// The target is outside of the buffered horizon.
// advance all docsets to a doc >= to the target.
#[cfg_attr(feature = "cargo-clippy", allow(clippy::clippy::collapsible_if))]
#[cfg_attr(
feature = "cargo-clippy",
allow(clippy::clippy::collapsible_if)
)]
unordered_drain_filter(&mut self.docsets, |docset| {
if docset.doc() < target {
if docset.skip_next(target) == SkipResult::End {

View File

@@ -6,7 +6,6 @@ use std::borrow::Cow;
use std::fmt::{self, Debug, Display, Formatter};
use std::io::{self, Read, Write};
use std::str;
use std::string::FromUtf8Error;
const SLASH_BYTE: u8 = b'/';
const ESCAPE_BYTE: u8 = b'\\';
@@ -15,10 +14,6 @@ const ESCAPE_BYTE: u8 = b'\\';
/// representation of facets.
pub const FACET_SEP_BYTE: u8 = 0u8;
/// `char` used as a level separation in the binary
/// representation of facets. (It is the null codepoint.)
pub const FACET_SEP_CHAR: char = '\u{0}';
/// A Facet represent a point in a given hierarchy.
///
/// They are typically represented similarly to a filepath.
@@ -31,18 +26,18 @@ pub const FACET_SEP_CHAR: char = '\u{0}';
/// its facet. In the example above, `/electronics/tv_and_video/`
/// and `/electronics`.
#[derive(Clone, Eq, Hash, PartialEq, Ord, PartialOrd)]
pub struct Facet(String);
pub struct Facet(Vec<u8>);
impl Facet {
/// Returns a new instance of the "root facet"
/// Equivalent to `/`.
pub fn root() -> Facet {
Facet("".to_string())
Facet(vec![])
}
/// Returns true iff the facet is the root facet `/`.
pub fn is_root(&self) -> bool {
self.encoded_str().is_empty()
self.encoded_bytes().is_empty()
}
/// Returns a binary representation of the facet.
@@ -54,19 +49,13 @@ impl Facet {
/// This representation has the benefit of making it possible to
/// express "being a child of a given facet" as a range over
/// the term ordinals.
pub fn encoded_str(&self) -> &str {
pub fn encoded_bytes(&self) -> &[u8] {
&self.0
}
pub(crate) fn from_encoded_string(facet_string: String) -> Facet {
Facet(facet_string)
}
/// Creates a `Facet` from its binary representation.
pub fn from_encoded(encoded_bytes: Vec<u8>) -> Result<Facet, FromUtf8Error> {
// facet bytes validation. `0u8` is used a separator but that is still legal utf-8
//Ok(Facet(String::from_utf8(encoded_bytes)?))
String::from_utf8(encoded_bytes).map(Facet)
pub(crate) unsafe fn from_encoded(encoded_bytes: Vec<u8>) -> Facet {
Facet(encoded_bytes)
}
/// Parse a text representation of a facet.
@@ -90,37 +79,36 @@ impl Facet {
Path: IntoIterator,
Path::Item: ToString,
{
let mut facet_string: String = String::with_capacity(100);
let mut facet_bytes: Vec<u8> = Vec::with_capacity(100);
let mut step_it = path.into_iter();
if let Some(step) = step_it.next() {
facet_string.push_str(&step.to_string());
facet_bytes.extend_from_slice(step.to_string().as_bytes());
}
for step in step_it {
facet_string.push(FACET_SEP_CHAR);
facet_string.push_str(&step.to_string());
facet_bytes.push(FACET_SEP_BYTE);
facet_bytes.extend_from_slice(step.to_string().as_bytes());
}
Facet(facet_string)
Facet(facet_bytes)
}
/// Accessor for the inner buffer of the `Facet`.
pub(crate) fn set_facet_str(&mut self, facet_str: &str) {
self.0.clear();
self.0.push_str(facet_str);
pub(crate) fn inner_buffer_mut(&mut self) -> &mut Vec<u8> {
&mut self.0
}
/// Returns `true` iff other is a subfacet of `self`.
pub fn is_prefix_of(&self, other: &Facet) -> bool {
let self_str = self.encoded_str();
let other_str = other.encoded_str();
self_str.len() < other_str.len()
&& other_str.starts_with(self_str)
&& other_str.as_bytes()[self_str.len()] == FACET_SEP_BYTE
let self_bytes: &[u8] = self.encoded_bytes();
let other_bytes: &[u8] = other.encoded_bytes();
self_bytes.len() < other_bytes.len()
&& other_bytes.starts_with(self_bytes)
&& other_bytes[self_bytes.len()] == 0u8
}
}
impl Borrow<str> for Facet {
fn borrow(&self) -> &str {
self.encoded_str()
impl Borrow<[u8]> for Facet {
fn borrow(&self) -> &[u8] {
self.encoded_bytes()
}
}
@@ -132,51 +120,45 @@ impl<'a, T: ?Sized + AsRef<str>> From<&'a T> for Facet {
Idle,
}
let path: &str = path_asref.as_ref();
assert!(!path.is_empty());
assert!(path.starts_with("/"));
let mut facet_encoded = String::new();
let mut facet_encoded = Vec::new();
let mut state = State::Idle;
let path_bytes = path.as_bytes();
let mut last_offset = 1;
for i in 1..path_bytes.len() {
let c = path_bytes[i];
for &c in &path_bytes[1..] {
match (state, c) {
(State::Idle, ESCAPE_BYTE) => {
facet_encoded.push_str(&path[last_offset..i]);
last_offset = i + 1;
state = State::Escaped
}
(State::Idle, ESCAPE_BYTE) => state = State::Escaped,
(State::Idle, SLASH_BYTE) => {
facet_encoded.push_str(&path[last_offset..i]);
facet_encoded.push(FACET_SEP_CHAR);
last_offset = i + 1;
facet_encoded.push(FACET_SEP_BYTE);
}
(State::Escaped, _escaped_char) => {
(State::Escaped, any_char) => {
state = State::Idle;
facet_encoded.push(any_char);
}
(State::Idle, other_char) => {
facet_encoded.push(other_char);
}
(State::Idle, _any_char) => {}
}
}
facet_encoded.push_str(&path[last_offset..]);
Facet(facet_encoded)
}
}
impl BinarySerializable for Facet {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
<String as BinarySerializable>::serialize(&self.0, writer)
<Vec<u8> as BinarySerializable>::serialize(&self.0, writer)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
Ok(Facet(<String as BinarySerializable>::deserialize(reader)?))
let bytes = <Vec<u8> as BinarySerializable>::deserialize(reader)?;
Ok(Facet(bytes))
}
}
impl Display for Facet {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
for step in self.0.split(FACET_SEP_CHAR) {
for step in self.0.split(|&b| b == FACET_SEP_BYTE) {
write!(f, "/")?;
write!(f, "{}", escape_slashes(step))?;
let step_str = unsafe { str::from_utf8_unchecked(step) };
write!(f, "{}", escape_slashes(step_str))?;
}
Ok(())
}

View File

@@ -232,14 +232,12 @@ impl Schema {
let field_entry = self.get_field_entry(field);
let field_type = field_entry.field_type();
match *json_value {
JsonValue::Array(ref json_items) => {
for json_item in json_items {
let value = field_type.value_from_json(json_item).map_err(|e| {
DocParsingError::ValueError(field_name.clone(), e)
})?;
doc.add(FieldValue::new(field, value));
}
}
JsonValue::Array(ref json_items) => for json_item in json_items {
let value = field_type
.value_from_json(json_item)
.map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?;
doc.add(FieldValue::new(field, value));
},
_ => {
let value = field_type
.value_from_json(json_value)
@@ -448,8 +446,7 @@ mod tests {
"count": 4,
"popularity": 10
}"#,
)
.unwrap();
).unwrap();
assert_eq!(doc.get_first(title_field).unwrap().text(), Some("my title"));
assert_eq!(
doc.get_first(author_field).unwrap().text(),

View File

@@ -32,7 +32,7 @@ impl Term {
/// Creates a `Term` given a facet.
pub fn from_facet(field: Field, facet: &Facet) -> Term {
let bytes = facet.encoded_str().as_bytes();
let bytes = facet.encoded_bytes();
let buffer = Vec::with_capacity(4 + bytes.len());
let mut term = Term(buffer);
term.set_field(field);
@@ -68,7 +68,12 @@ impl Term {
term
}
/// Creates a new Term for a given field.
/// Creates a new Term with an empty buffer,
/// but with a given capacity.
///
/// It is declared unsafe, as the term content
/// is not initialized, and a call to `.field()`
/// would panic.
pub(crate) fn for_field(field: Field) -> Term {
let mut term = Term(Vec::with_capacity(100));
term.set_field(field);

View File

@@ -192,8 +192,7 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
item.start - fragment.start_offset,
item.stop - fragment.start_offset,
)
})
.collect();
}).collect();
Snippet {
fragments: fragment_text.to_string(),
highlighted,
@@ -243,7 +242,7 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
/// let query = query_parser.parse_query("haleurs flamands").unwrap();
/// # index.load_searchers()?;
/// # let searcher = index.searcher();
/// let mut snippet_generator = SnippetGenerator::create(&searcher, &*query, text_field)?;
/// let mut snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field)?;
/// snippet_generator.set_max_num_chars(100);
/// let snippet = snippet_generator.snippet_from_doc(&doc);
/// let snippet_html: String = snippet.to_html();
@@ -260,7 +259,7 @@ pub struct SnippetGenerator {
impl SnippetGenerator {
/// Creates a new snippet generator
pub fn create(searcher: &Searcher, query: &Query, field: Field) -> Result<SnippetGenerator> {
pub fn new(searcher: &Searcher, query: &Query, field: Field) -> Result<SnippetGenerator> {
let mut terms = BTreeSet::new();
query.query_terms(&mut terms);
let terms_text: BTreeMap<String, f32> = terms
@@ -274,8 +273,7 @@ impl SnippetGenerator {
} else {
None
}
})
.collect();
}).collect();
let tokenizer = searcher.index().tokenizer_for_field(field)?;
Ok(SnippetGenerator {
terms_text,
@@ -534,14 +532,12 @@ Survey in 2016, 2017, and 2018."#;
let query_parser = QueryParser::for_index(&index, vec![text_field]);
{
let query = query_parser.parse_query("e").unwrap();
let snippet_generator =
SnippetGenerator::create(&searcher, &*query, text_field).unwrap();
let snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap();
assert!(snippet_generator.terms_text().is_empty());
}
{
let query = query_parser.parse_query("a").unwrap();
let snippet_generator =
SnippetGenerator::create(&searcher, &*query, text_field).unwrap();
let snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap();
assert_eq!(
&btreemap!("a".to_string() => 0.25f32),
snippet_generator.terms_text()
@@ -549,8 +545,7 @@ Survey in 2016, 2017, and 2018."#;
}
{
let query = query_parser.parse_query("a b").unwrap();
let snippet_generator =
SnippetGenerator::create(&searcher, &*query, text_field).unwrap();
let snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap();
assert_eq!(
&btreemap!("a".to_string() => 0.25f32, "b".to_string() => 0.5),
snippet_generator.terms_text()
@@ -558,8 +553,7 @@ Survey in 2016, 2017, and 2018."#;
}
{
let query = query_parser.parse_query("a b c").unwrap();
let snippet_generator =
SnippetGenerator::create(&searcher, &*query, text_field).unwrap();
let snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap();
assert_eq!(
&btreemap!("a".to_string() => 0.25f32, "b".to_string() => 0.5),
snippet_generator.terms_text()
@@ -591,8 +585,7 @@ Survey in 2016, 2017, and 2018."#;
let searcher = index.searcher();
let query_parser = QueryParser::for_index(&index, vec![text_field]);
let query = query_parser.parse_query("rust design").unwrap();
let mut snippet_generator =
SnippetGenerator::create(&searcher, &*query, text_field).unwrap();
let mut snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap();
{
let snippet = snippet_generator.snippet(TEST_TEXT);
assert_eq!(snippet.to_html(), "imperative-procedural paradigms. <b>Rust</b> is syntactically similar to C++[according to whom?],\nbut its <b>designers</b> intend it to provide better memory safety");

View File

@@ -80,7 +80,6 @@ pub struct SegmentSpaceUsage {
}
impl SegmentSpaceUsage {
#[allow(clippy::too_many_arguments)]
pub(crate) fn new(
num_docs: u32,
termdict: PerFieldSpaceUsage,

View File

@@ -95,7 +95,10 @@ impl StoreReader {
}
}
#[cfg_attr(feature = "cargo-clippy", allow(clippy::needless_pass_by_value))]
#[cfg_attr(
feature = "cargo-clippy",
allow(clippy::needless_pass_by_value)
)]
fn split_source(data: ReadOnlySource) -> (ReadOnlySource, ReadOnlySource, DocId) {
let data_len = data.len();
let footer_offset = data_len - size_of::<u64>() - size_of::<u32>();

View File

@@ -53,8 +53,7 @@ impl<'a> TermMerger<'a> {
.map(|(ord, streamer)| HeapItem {
streamer,
segment_ord: ord,
})
.collect(),
}).collect(),
}
}
@@ -123,7 +122,10 @@ impl<'a> TermMerger<'a> {
}
/// Iterates through terms
#[cfg_attr(feature = "cargo-clippy", allow(clippy::should_implement_trait))]
#[cfg_attr(
feature = "cargo-clippy",
allow(clippy::should_implement_trait)
)]
pub fn next(&mut self) -> Option<Term<&[u8]>> {
if self.advance() {
Some(Term::wrap(self.current_streamers[0].streamer.key()))

View File

@@ -66,7 +66,7 @@ mod tests {
let write = directory.open_write(&path).unwrap();
let field_type = FieldType::Str(TEXT);
let mut term_dictionary_builder =
TermDictionaryBuilder::create(write, &field_type).unwrap();
TermDictionaryBuilder::new(write, &field_type).unwrap();
for term in COUNTRIES.iter() {
term_dictionary_builder
.insert(term.as_bytes(), &make_term_info(0u64))
@@ -92,7 +92,7 @@ mod tests {
let write = directory.open_write(&path).unwrap();
let field_type = FieldType::Str(TEXT);
let mut term_dictionary_builder =
TermDictionaryBuilder::create(write, &field_type).unwrap();
TermDictionaryBuilder::new(write, &field_type).unwrap();
term_dictionary_builder
.insert("abc".as_bytes(), &make_term_info(34u64))
.unwrap();
@@ -167,7 +167,7 @@ mod tests {
let mut term_string = String::new();
while term_it.advance() {
//let term = Term::from_bytes(term_it.key());
term_string.push_str(str::from_utf8(term_it.key()).expect("test"));
term_string.push_str(unsafe { str::from_utf8_unchecked(term_it.key()) }); // ok test
}
assert_eq!(&*term_string, "abcdef");
}
@@ -180,7 +180,7 @@ mod tests {
let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
let mut term_dictionary_builder =
TermDictionaryBuilder::create(vec![], &field_type).unwrap();
TermDictionaryBuilder::new(vec![], &field_type).unwrap();
for &(ref id, ref i) in &ids {
term_dictionary_builder
.insert(id.as_bytes(), &make_term_info(*i as u64))
@@ -210,7 +210,7 @@ mod tests {
let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
let mut term_dictionary_builder =
TermDictionaryBuilder::create(vec![], &field_type).unwrap();
TermDictionaryBuilder::new(vec![], &field_type).unwrap();
// term requires more than 16bits
term_dictionary_builder
.insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1))
@@ -245,7 +245,7 @@ mod tests {
let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
let mut term_dictionary_builder =
TermDictionaryBuilder::create(vec![], &field_type).unwrap();
TermDictionaryBuilder::new(vec![], &field_type).unwrap();
for &(ref id, ref i) in &ids {
term_dictionary_builder
.insert(id.as_bytes(), &make_term_info(*i as u64))
@@ -314,7 +314,7 @@ mod tests {
let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
let mut term_dictionary_builder =
TermDictionaryBuilder::create(vec![], &field_type).unwrap();
TermDictionaryBuilder::new(vec![], &field_type).unwrap();
term_dictionary_builder
.insert(&[], &make_term_info(1 as u64))
.unwrap();
@@ -338,7 +338,7 @@ mod tests {
let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
let mut term_dictionary_builder =
TermDictionaryBuilder::create(vec![], &field_type).unwrap();
TermDictionaryBuilder::new(vec![], &field_type).unwrap();
for i in 0u8..10u8 {
let number_arr = [i; 1];
term_dictionary_builder
@@ -408,7 +408,7 @@ mod tests {
let write = directory.open_write(&path).unwrap();
let field_type = FieldType::Str(TEXT);
let mut term_dictionary_builder =
TermDictionaryBuilder::create(write, &field_type).unwrap();
TermDictionaryBuilder::new(write, &field_type).unwrap();
for term in COUNTRIES.iter() {
term_dictionary_builder
.insert(term.as_bytes(), &make_term_info(0u64))

View File

@@ -132,7 +132,10 @@ where
}
/// Return the next `(key, value)` pair.
#[cfg_attr(feature = "cargo-clippy", allow(clippy::should_implement_trait))]
#[cfg_attr(
feature = "cargo-clippy",
allow(clippy::should_implement_trait)
)]
pub fn next(&mut self) -> Option<(&[u8], &TermInfo)> {
if self.advance() {
Some((self.key(), self.value()))

View File

@@ -29,7 +29,7 @@ where
W: Write,
{
/// Creates a new `TermDictionaryBuilder`
pub fn create(w: W, _field_type: &FieldType) -> io::Result<Self> {
pub fn new(w: W, _field_type: &FieldType) -> io::Result<Self> {
let fst_builder = fst::MapBuilder::new(w).map_err(convert_fst_error)?;
Ok(TermDictionaryBuilder {
fst_builder,
@@ -132,7 +132,7 @@ impl TermDictionary {
/// Creates an empty term dictionary which contains no terms.
pub fn empty(field_type: &FieldType) -> Self {
let term_dictionary_data: Vec<u8> =
TermDictionaryBuilder::create(Vec::<u8>::new(), &field_type)
TermDictionaryBuilder::new(Vec::<u8>::new(), &field_type)
.expect("Creating a TermDictionaryBuilder in a Vec<u8> should never fail")
.finish()
.expect("Writing in a Vec<u8> should never fail");

View File

@@ -0,0 +1,78 @@
extern crate aho_corasick;
use self::aho_corasick::{AcAutomaton, Automaton};
use std::mem;
use super::{OffsetIncrements, OffsetIncrementsBuilder};
pub trait CharMogrifier {
fn process_text(&mut self, text: &str, dest: &mut String, correction: &mut OffsetIncrementsBuilder);
}
pub struct CharFilter {
text: String,
buffer: String,
mogrifiers: Vec<Box<CharMogrifier>>
}
impl CharFilter {
fn process_text(&mut self, text: &str) {
self.text.clear();
self.text.push_str(text);
self.buffer.clear();
let mut offset_increment_builder = OffsetIncrements::builder();
for mogrifier in &mut self.mogrifiers {
mogrifier.process_text(&self.text,
&mut self.buffer,
&mut offset_increment_builder);
mem::swap(&mut self.text, &mut self.buffer);
offset_increment_builder.new_layer();
}
}
}
pub struct SubstringReplacer<'a> {
automaton: AcAutomaton<&'a str>,
replacements: Vec<&'a str>
}
impl SubstringReplacer<'static> {
fn new(from_tos: Vec<(&'static str, &'static str)>) -> SubstringReplacer<'static> {
let from_ptns: Vec<&'static str> = from_tos
.iter()
.map(|(from_str, _)| *from_str)
.collect();
let to_strs: Vec<&'static str> = from_tos
.iter()
.map(|(_, to_str)| *to_str)
.collect();
let automaton = AcAutomaton::new(from_ptns);
SubstringReplacer {
automaton,
replacements: to_strs
}
}
}
impl<'a> CharMogrifier for SubstringReplacer<'a> {
// correction is an array that goes from old_offset -> new_offset.
// correction len is `text.len() + 1`
fn process_text(&mut self, text: &str, dest: &mut String, correction: &mut OffsetIncrementsBuilder) {
let mut start = 0;
for m in self.automaton.find(text) {
dest.push_str(&text[start..m.start]);
let replacement = self.replacements[m.pati];
let previous_len = m.end - m.start;
correction.register_inc(m.end, (replacement.len() as isize) - (previous_len as isize));
dest.push_str(replacement);
start = m.end;
}
dest.push_str(&text[start..]);
}
}
// lowercasing
// Unicode normalization*
// '
// accent simplification

View File

@@ -0,0 +1,4 @@
mod char_filter;
mod offset_increments;
pub use self::offset_increments::{OffsetIncrements, OffsetIncrementsBuilder};

View File

@@ -0,0 +1,264 @@
/*!
Stores an increasing mapping from naturals to naturals.
`CharFilter`s may make the original text longer or shorter.
Token's offset need to refer to their offset in the original
text.
This struct is in charge of doing an efficient book-keeping
of these shift in offsets and provide a mapping
from the transformed text to the original text.
We define the inverse of an increasing mapping `f` as:
g(i) = max {j | f(j) <= i}
!= min {j | f(i) >= i}
The name `inverse` is a bit misleading:
this is not really an involution.
Note that having a single definition has some bad side effects.
For instance, when trying to convert a segment of chars to
its offset in the original string, the reverse mapping may
return an empty string.
We could use a different definition of the reverse mapping
when computing the lower bound and the upper bound of the segment,
but then non-overlapping tokens could have overlapping origins.
# Example
```
forward mapping
[0,1,2,5,6,7]
Encoded sparsely as (3, 2)
reverse mapping
[0,1,2,2,2,3,4,5]
Encoded sparsely as [(3, -1), (4,-1), (5,-1)]
```
*/
/// Builds a reverse mapping using a sparse representation of the
/// forward mapping.
pub struct OffsetIncrementsBuilder {
cumulated: isize,
incs: Vec<(usize, isize)>,
}
impl OffsetIncrementsBuilder {
/// We require
/// - `from_offset + delta >= 0`
/// There is no need to call this function if delta = 0.
pub fn register_inc(&mut self, from_offset: usize, delta: isize) {
let mut cumulated = self.cumulated;
let from_offset_isize = from_offset as isize;
let to_offset = (from_offset_isize + self.cumulated) as usize;
if delta > 0 {
for i in 0..delta as usize {
cumulated += 1;
self.incs.push((to_offset + i, -cumulated));
}
} else {
assert_eq!(delta, -1);
cumulated -= 1;
self.incs.push((to_offset + 1, -cumulated));
}
println!("incs {:?}", self.incs);
self.cumulated = cumulated;
}
pub fn new_layer(&self) {
panic!();
}
fn build(self) -> OffsetIncrements {
OffsetIncrements {
incs: self.incs
}
}
}
#[derive(Default)]
pub struct OffsetIncrementsReader {
shifts: Vec<(usize, isize)>,
current_shift: isize,
idx: usize,
}
impl OffsetIncrementsReader {
fn new(shifts: Vec<(usize, isize)>) -> OffsetIncrementsReader {
OffsetIncrementsReader {
shifts,
current_shift: 0,
idx: 0,
}
}
fn convert_offset(&mut self, target: usize) -> usize {
while self.idx < self.shifts.len() {
let (offset, shift) = self.shifts[self.idx];
if offset > target {
break;
} else {
self.current_shift = shift;
}
self.idx += 1;
}
return (self.current_shift + target as isize) as usize;
}
}
pub struct OffsetIncrements {
incs: Vec<(usize, isize)>
}
impl OffsetIncrements {
pub fn builder() -> OffsetIncrementsBuilder {
OffsetIncrementsBuilder {
cumulated: 0,
incs: Vec::new(),
}
}
pub fn reader(&self) -> OffsetIncrementsReader {
OffsetIncrementsReader::new(self.incs.clone()) // TODO Fixme, no clone
}
}
#[cfg(test)]
mod tests {
use super::OffsetIncrements;
use super::OffsetIncrementsReader;
#[test]
fn test_offset_increment_reader_empty() {
let mut reader = OffsetIncrementsReader::new(vec![]);
for i in 0..3 {
assert_eq!(reader.convert_offset(i), i);
}
}
#[test]
fn test_offset_increment_reader_step() {
let mut reader = OffsetIncrementsReader::new(vec![(1, 1), (3, 3), (6, 2), (7, 1), (8, 0), (9, -1)]);
assert_eq!(reader.convert_offset(0), 0);
assert_eq!(reader.convert_offset(1), 2);
assert_eq!(reader.convert_offset(2), 3);
assert_eq!(reader.convert_offset(3), 6);
assert_eq!(reader.convert_offset(4), 7);
assert_eq!(reader.convert_offset(5), 8);
assert_eq!(reader.convert_offset(6), 8);
assert_eq!(reader.convert_offset(7), 8);
assert_eq!(reader.convert_offset(8), 8);
assert_eq!(reader.convert_offset(9), 8);
}
#[test]
fn test_offset_increment_reader_step_neg() {
let mut reader = OffsetIncrementsReader::new(vec![(1, -1), (2, -2), (3, -3)]);
assert_eq!(reader.convert_offset(0), 0);
assert_eq!(reader.convert_offset(1), 0);
assert_eq!(reader.convert_offset(2), 0);
assert_eq!(reader.convert_offset(3), 0);
assert_eq!(reader.convert_offset(4), 1);
assert_eq!(reader.convert_offset(5), 2);
assert_eq!(reader.convert_offset(6), 3);
assert_eq!(reader.convert_offset(7), 4);
}
fn aux_test_increment(increments: OffsetIncrements, expected: &[usize]) {
let mut reader = increments.reader();
for (i, el) in expected.iter().cloned().enumerate() {
assert_eq!(reader.convert_offset(i), el);
}
}
fn assert_is_increasing(v: &[usize]) {
assert!(v.len() > 0);
assert_eq!(v[0], 0);
let mut prec = 0;
for &val in &v[1..] {
assert!(val >= prec);
prec = val;
}
}
fn is_inverse(fwd: &[usize], rev: &[usize]) {
assert_is_increasing(fwd);
assert_is_increasing(rev);
println!("fwd {:?} rev {:?}", fwd, rev);
for (i, &antecedant) in rev.iter().enumerate() {
let expected = fwd
.iter()
.enumerate()
.filter(|(_, v)| **v <= i)
.map(|(ord, _)| ord)
.last()
.unwrap();
println!("i {}", i);
assert_eq!(expected, antecedant);
}
}
#[test]
fn test_is_inverse() {
is_inverse(&[0,1,1,1,2], &[0, 3, 4]);
}
fn is_reciprocal(left: &[usize], right: &[usize]) {
is_inverse(left, right);
}
#[test]
fn test_offset_increments_shorten() {
{
let mut offset_increment_builder = OffsetIncrements::builder();
// abcd -> abd
offset_increment_builder.register_inc(2, -1);
aux_test_increment(offset_increment_builder.build(), &[0, 1, 2, 4]);
}
{
let mut offset_increment_builder = OffsetIncrements::builder();
// abcdefgh -> abcdfgh
offset_increment_builder.register_inc(4, -1);
aux_test_increment(offset_increment_builder.build(), &[0, 1, 2, 3, 4, 6]);
}
{
let mut offset_increment_builder = OffsetIncrements::builder();
// abcd -> bcd
offset_increment_builder.register_inc(0, -1);
aux_test_increment(offset_increment_builder.build(), &[0, 2, 3]);
}
}
#[test]
fn test_offset_increments_builder() {
{
let mut offset_increment_builder = OffsetIncrements::builder();
offset_increment_builder.register_inc(2, 1);
// [0, 1, 3, 4, 5]
aux_test_increment(offset_increment_builder.build(), &[0,1,1,2,3,4,5]);
}
{
let mut offset_increment_builder = OffsetIncrements::builder();
offset_increment_builder.register_inc(3, 2);
// [0, 1, 2, 4, 5, 6]
aux_test_increment(offset_increment_builder.build(), &[0,1,2,2,2,3,4,5]);
}
{
let mut offset_increment_builder = OffsetIncrements::builder();
// 0, 0, 1, 2, 2, 2
offset_increment_builder.register_inc(1, 1);
offset_increment_builder.register_inc(3, 3);
aux_test_increment(offset_increment_builder.build(), &[0,0,1,2,2,2,2,3,4]);
}
}
}

View File

@@ -1,5 +1,6 @@
use super::{Token, TokenStream, Tokenizer};
use schema::FACET_SEP_BYTE;
use std::str;
/// The `FacetTokenizer` process a `Facet` binary representation
/// and emits a token for all of its parent.
@@ -56,11 +57,12 @@ impl<'a> TokenStream for FacetTokenStream<'a> {
.position(|b| b == FACET_SEP_BYTE)
.map(|pos| cursor + 1 + pos)
{
let facet_part = &self.text[cursor..next_sep_pos];
let facet_part =
unsafe { str::from_utf8_unchecked(&bytes[cursor..next_sep_pos]) };
self.token.text.push_str(facet_part);
self.state = State::UpToPosition(next_sep_pos);
} else {
let facet_part = &self.text[cursor..];
let facet_part = unsafe { str::from_utf8_unchecked(&bytes[cursor..]) };
self.token.text.push_str(facet_part);
self.state = State::Terminated;
}
@@ -84,6 +86,7 @@ mod tests {
use super::FacetTokenizer;
use schema::Facet;
use std::str;
use tokenizer::{Token, TokenStream, Tokenizer};
#[test]
@@ -92,11 +95,11 @@ mod tests {
let mut tokens = vec![];
{
let mut add_token = |token: &Token| {
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap();
let facet = unsafe { Facet::from_encoded(token.text.as_bytes().to_owned()) }; // ok test
tokens.push(format!("{}", facet));
};
FacetTokenizer
.token_stream(facet.encoded_str())
.token_stream(unsafe { str::from_utf8_unchecked(facet.encoded_bytes()) })
.process(&mut add_token);
}
assert_eq!(tokens.len(), 4);
@@ -112,11 +115,11 @@ mod tests {
let mut tokens = vec![];
{
let mut add_token = |token: &Token| {
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap(); // ok test
let facet = unsafe { Facet::from_encoded(token.text.as_bytes().to_owned()) }; // ok test
tokens.push(format!("{}", facet));
};
FacetTokenizer
.token_stream(facet.encoded_str()) // ok test
.token_stream(unsafe { str::from_utf8_unchecked(facet.encoded_bytes()) }) // ok test
.process(&mut add_token);
}
assert_eq!(tokens.len(), 1);

View File

@@ -140,6 +140,7 @@ mod stop_word_filter;
mod token_stream_chain;
mod tokenizer;
mod tokenizer_manager;
mod char_processing;
pub use self::alphanum_only::AlphaNumOnlyFilter;
pub use self::facet_tokenizer::FacetTokenizer;