Compare commits

...

15 Commits

Author SHA1 Message Date
Paul Masurel
176f67a266 Refactoring 2019-01-23 10:06:40 +09:00
Paul Masurel
19babff849 Closes #476 2019-01-23 10:06:39 +09:00
Paul Masurel
bf2576adf9 Added a broken unit test 2019-01-23 10:04:27 +09:00
Paul Masurel
b8241c5603 0.8.0 2018-12-26 10:18:34 +09:00
Paul Masurel
a4745151c0 Version to 0.8 2018-12-26 10:11:06 +09:00
Paul Masurel
e2ce326a8c Merge branch 'issue/457' 2018-12-18 10:35:01 +09:00
Paul Masurel
bb21d12a70 Bumping version 2018-12-18 10:14:12 +09:00
Paul Masurel
4565aba62a Added unit test for exponential search 2018-12-18 09:24:31 +09:00
Paul Masurel
545a7ec8dd Closes #457 2018-12-18 09:18:46 +09:00
Paul Masurel
e68775d71c Format and update murmurhash32 version 2018-12-17 19:12:38 +09:00
Paul Masurel
dcc92d287e Facet remove unsafe (#456)
* Removing some unsafe

* Removing some unsafe (2)

* Remove murmurhash
2018-12-17 19:08:48 +09:00
Paul Masurel
b48f81c051 Removing unsafe from bitpacking code (#455) 2018-12-17 19:06:37 +09:00
Paul Masurel
a3042e956b Facet remove unsafe (#454)
* Removing some unsafe

* Removing some unsafe (2)
2018-12-17 09:31:09 +09:00
dependabot[bot]
1fa10f0a0b Update itertools requirement from 0.7 to 0.8 (#453)
Updates the requirements on [itertools](https://github.com/bluss/rust-itertools) to permit the latest version.
- [Release notes](https://github.com/bluss/rust-itertools/releases)
- [Commits](https://github.com/bluss/rust-itertools/commits/0.8.0)

Signed-off-by: dependabot[bot] <support@dependabot.com>
2018-12-17 09:28:36 +09:00
Paul Masurel
279a9eb5e3 Closes #449 (#450)
Clippy working on stable.
Clippy warnings addressed
2018-12-10 12:20:59 +09:00
51 changed files with 656 additions and 542 deletions

View File

@@ -1,4 +1,4 @@
Tantivy 0.8.1
Tantivy 0.8.0
=====================
*No change in the index format*
- API Breaking change in the collector API. (@jwolfe, @fulmicoton)

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.8.0-dev"
version = "0.8.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -29,7 +29,7 @@ serde = "1.0"
serde_derive = "1.0"
serde_json = "1.0"
num_cpus = "1.2"
itertools = "0.7"
itertools = "0.8"
levenshtein_automata = {version="0.1", features=["fst_automaton"]}
bit-set = "0.5"
uuid = { version = "0.7", features = ["v4", "serde"] }
@@ -49,6 +49,7 @@ failure = "0.1"
htmlescape = "0.3.1"
fail = "0.2"
scoped-pool = "1.0"
murmurhash32 = "0.2"
[target.'cfg(windows)'.dependencies]
winapi = "0.2"

View File

@@ -106,37 +106,37 @@ fn main() -> tantivy::Result<()> {
// For convenience, tantivy also comes with a macro to
// reduce the boilerplate above.
index_writer.add_document(doc!(
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));
index_writer.add_document(doc!(
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));
// Multivalued field just need to be repeated.
index_writer.add_document(doc!(
title => "Frankenstein",
title => "The Modern Prometheus",
body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking."
title => "Frankenstein",
title => "The Modern Prometheus",
body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking."
));
// This is an example, so we will only index 3 documents

View File

@@ -68,12 +68,12 @@ fn main() -> tantivy::Result<()> {
// heap for the indexer can increase its throughput.
let mut index_writer = index.writer(50_000_000)?;
index_writer.add_document(doc!(
title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish."
title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish."
));
index_writer.add_document(doc!(
title => "Of Mice and Men",
title => "Of Mice and Men",
body => r#"A few miles south of Soledad, the Salinas River drops in close to the hillside
bank and runs deep and green. The water is warm too, for it has slipped twinkling
over the yellow sands in the sunlight before reaching the narrow pool. On one
@@ -84,7 +84,7 @@ fn main() -> tantivy::Result<()> {
limbs and branches that arch over the pool"#
));
index_writer.add_document(doc!(
title => "Frankenstein",
title => "Frankenstein",
body => r#"You will rejoice to hear that no disaster has accompanied the commencement of an
enterprise which you have regarded with such evil forebodings. I arrived here
yesterday, and my first task is to assure my dear sister of my welfare and

View File

@@ -35,15 +35,15 @@ fn main() -> tantivy::Result<()> {
// we'll only need one doc for this example.
index_writer.add_document(doc!(
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));
// ...
index_writer.commit()?;
@@ -56,7 +56,7 @@ fn main() -> tantivy::Result<()> {
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
let snippet_generator = SnippetGenerator::new(&searcher, &*query, body)?;
let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;
for (score, doc_address) in top_docs {
let doc = searcher.doc(doc_address)?;

View File

@@ -72,26 +72,26 @@ fn main() -> tantivy::Result<()> {
title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish."
));
));
index_writer.add_document(doc!(
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));
index_writer.add_document(doc!(
title => "Frankenstein",
body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking."
title => "Frankenstein",
body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking."
));
index_writer.commit()?;

View File

@@ -197,7 +197,7 @@ fn skip<'a, I: Iterator<Item = &'a Facet>>(
) -> SkipResult {
loop {
match collapse_it.peek() {
Some(facet_bytes) => match facet_bytes.encoded_bytes().cmp(target) {
Some(facet_bytes) => match facet_bytes.encoded_str().as_bytes().cmp(target) {
Ordering::Less => {}
Ordering::Greater => {
return SkipResult::OverStep;
@@ -369,7 +369,8 @@ impl SegmentCollector for FacetSegmentCollector {
let mut facet = vec![];
let facet_ord = self.collapse_facet_ords[collapsed_facet_ord];
facet_dict.ord_to_term(facet_ord as u64, &mut facet);
facet_counts.insert(unsafe { Facet::from_encoded(facet) }, count);
// TODO
facet_counts.insert(Facet::from_encoded(facet).unwrap(), count);
}
FacetCounts { facet_counts }
}
@@ -403,9 +404,9 @@ impl FacetCounts {
let right_bound = if facet.is_root() {
Bound::Unbounded
} else {
let mut facet_after_bytes: Vec<u8> = facet.encoded_bytes().to_owned();
facet_after_bytes.push(1u8);
let facet_after = unsafe { Facet::from_encoded(facet_after_bytes) }; // ok logic
let mut facet_after_bytes: String = facet.encoded_str().to_owned();
facet_after_bytes.push('\u{1}');
let facet_after = Facet::from_encoded_string(facet_after_bytes);
Bound::Excluded(facet_after)
};
let underlying: btree_map::Range<_, _> = self.facet_counts.range((left_bound, right_bound));
@@ -474,7 +475,8 @@ mod tests {
n /= 4;
let leaf = n % 5;
Facet::from(&format!("/top{}/mid{}/leaf{}", top, mid, leaf))
}).collect();
})
.collect();
for i in 0..num_facets * 10 {
let mut doc = Document::new();
doc.add_facet(facet_field, facets[i % num_facets].clone());
@@ -500,18 +502,16 @@ mod tests {
("/top1/mid2", 50),
("/top1/mid3", 50),
]
.iter()
.map(|&(facet_str, count)| (String::from(facet_str), count))
.collect::<Vec<_>>()
.iter()
.map(|&(facet_str, count)| (String::from(facet_str), count))
.collect::<Vec<_>>()
);
}
}
#[test]
#[should_panic(
expected = "Tried to add a facet which is a descendant of \
an already added facet."
)]
#[should_panic(expected = "Tried to add a facet which is a descendant of \
an already added facet.")]
fn test_misused_facet_collector() {
let mut facet_collector = FacetCollector::for_field(Field(0));
facet_collector.add_facet(Facet::from("/country"));
@@ -563,13 +563,15 @@ mod tests {
let facet = Facet::from(&format!("/facet/{}", c));
let doc = doc!(facet_field => facet);
iter::repeat(doc).take(count)
}).map(|mut doc| {
})
.map(|mut doc| {
doc.add_facet(
facet_field,
&format!("/facet/{}", thread_rng().sample(&uniform)),
);
doc
}).collect();
})
.collect();
docs[..].shuffle(&mut thread_rng());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();

View File

@@ -43,7 +43,8 @@ impl<TCollector: Collector> Collector for CollectorWrapper<TCollector> {
let err_msg = format!("Failed to cast child collector fruit. {:?}", e);
TantivyError::InvalidArgument(err_msg)
})
}).collect::<Result<_>>()?;
})
.collect::<Result<_>>()?;
let merged_fruit = self.0.merge_fruits(typed_fruit)?;
Ok(Box::new(merged_fruit))
}
@@ -147,6 +148,8 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
/// Ok(())
/// }
/// ```
#[allow(clippy::type_complexity)]
#[derive(Default)]
pub struct MultiCollector<'a> {
collector_wrappers:
Vec<Box<Collector<Child = Box<BoxableSegmentCollector>, Fruit = Box<Fruit>> + 'a>>,
@@ -154,10 +157,8 @@ pub struct MultiCollector<'a> {
impl<'a> MultiCollector<'a> {
/// Create a new `MultiCollector`
pub fn new() -> MultiCollector<'a> {
MultiCollector {
collector_wrappers: Vec::new(),
}
pub fn new() -> Self {
Default::default()
}
/// Add a new collector to our `MultiCollector`.
@@ -213,7 +214,8 @@ impl<'a> Collector for MultiCollector<'a> {
.zip(segment_fruits_list)
.map(|(child_collector, segment_fruits)| {
Ok(Some(child_collector.merge_fruits(segment_fruits)?))
}).collect::<Result<_>>()?;
})
.collect::<Result<_>>()?;
Ok(MultiFruit { sub_fruits })
}
}

View File

@@ -84,11 +84,9 @@ where
for (feature, doc) in child_fruit {
if top_collector.len() < self.limit {
top_collector.push(ComparableDoc { feature, doc });
} else {
if let Some(mut head) = top_collector.peek_mut() {
if head.feature < feature {
*head = ComparableDoc { feature, doc };
}
} else if let Some(mut head) = top_collector.peek_mut() {
if head.feature < feature {
*head = ComparableDoc { feature, doc };
}
}
}
@@ -142,7 +140,8 @@ impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
comparable_doc.feature,
DocAddress(segment_id, comparable_doc.doc),
)
}).collect()
})
.collect()
}
/// Return true iff at least K documents have gone through

View File

@@ -1,9 +1,6 @@
use common::serialize::BinarySerializable;
use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
use std::io;
use std::io::Write;
use std::mem;
use std::ops::Deref;
use std::ptr;
pub(crate) struct BitPacker {
mini_buffer: u64,
@@ -18,7 +15,7 @@ impl BitPacker {
}
}
pub fn write<TWrite: Write>(
pub fn write<TWrite: io::Write>(
&mut self,
val: u64,
num_bits: u8,
@@ -28,14 +25,14 @@ impl BitPacker {
let num_bits = num_bits as usize;
if self.mini_buffer_written + num_bits > 64 {
self.mini_buffer |= val_u64.wrapping_shl(self.mini_buffer_written as u32);
self.mini_buffer.serialize(output)?;
output.write_u64::<LittleEndian>(self.mini_buffer)?;
self.mini_buffer = val_u64.wrapping_shr((64 - self.mini_buffer_written) as u32);
self.mini_buffer_written = self.mini_buffer_written + num_bits - 64;
} else {
self.mini_buffer |= val_u64 << self.mini_buffer_written;
self.mini_buffer_written += num_bits;
if self.mini_buffer_written == 64 {
self.mini_buffer.serialize(output)?;
output.write_u64::<LittleEndian>(self.mini_buffer)?;
self.mini_buffer_written = 0;
self.mini_buffer = 0u64;
}
@@ -43,17 +40,18 @@ impl BitPacker {
Ok(())
}
pub fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
pub fn flush<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
if self.mini_buffer_written > 0 {
let num_bytes = (self.mini_buffer_written + 7) / 8;
let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer.to_le()) };
let mut arr: [u8; 8] = [0u8; 8];
LittleEndian::write_u64(&mut arr, self.mini_buffer);
output.write_all(&arr[..num_bytes])?;
self.mini_buffer_written = 0;
}
Ok(())
}
pub fn close<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
pub fn close<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
self.flush(output)?;
// Padding the write file to simplify reads.
output.write_all(&[0u8; 7])?;
@@ -102,9 +100,7 @@ where
addr + 8 <= data.len(),
"The fast field field should have been padded with 7 bytes."
);
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
let val_unshifted_unmasked: u64 =
u64::from_le(unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) });
let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[addr..]);
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
val_shifted & mask
}
@@ -126,9 +122,7 @@ where
for output_val in output.iter_mut() {
let addr = addr_in_bits >> 3;
let bit_shift = addr_in_bits & 7;
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
let val_unshifted_unmasked: u64 =
unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[addr..]);
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
*output_val = val_shifted & mask;
addr_in_bits += num_bits;

View File

@@ -64,17 +64,18 @@ impl Executor {
// This is important as it makes it possible for the fruit_receiver iteration to
// terminate.
};
let mut results = Vec::with_capacity(num_fruits);
unsafe { results.set_len(num_fruits) };
let mut num_items = 0;
// This is lame, but it does not use unsafe code.
let mut results_with_position = Vec::with_capacity(num_fruits);
for (pos, fruit_res) in fruit_receiver {
results[pos] = fruit_res?;
num_items += 1;
let fruit = fruit_res?;
results_with_position.push((pos, fruit));
}
// this checks ensures that we filled of this
// uninitialized memory.
assert_eq!(num_items, results.len());
Ok(results)
results_with_position.sort_by_key(|(pos, _)| *pos);
assert_eq!(results_with_position.len(), num_fruits);
Ok(results_with_position
.into_iter()
.map(|(_, fruit)| fruit)
.collect::<Vec<_>>())
}
}
}
@@ -94,7 +95,8 @@ mod tests {
panic!("panic should propagate");
},
vec![0].into_iter(),
).unwrap();
)
.unwrap();
}
#[test]
@@ -106,7 +108,8 @@ mod tests {
panic!("panic should propagate");
},
vec![0].into_iter(),
).unwrap();
)
.unwrap();
}
#[test]

View File

@@ -13,6 +13,7 @@ use directory::ManagedDirectory;
#[cfg(feature = "mmap")]
use directory::MmapDirectory;
use directory::{Directory, RAMDirectory};
use error::DataCorruption;
use error::TantivyError;
use indexer::index_writer::open_index_writer;
use indexer::index_writer::HEAP_SIZE_MIN;
@@ -37,7 +38,13 @@ fn load_metas(directory: &Directory) -> Result<IndexMeta> {
let meta_data = directory.atomic_read(&META_FILEPATH)?;
let meta_string = String::from_utf8_lossy(&meta_data);
serde_json::from_str(&meta_string)
.map_err(|_| TantivyError::CorruptedFile(META_FILEPATH.clone()))
.map_err(|e| {
DataCorruption::new(
META_FILEPATH.clone(),
format!("Meta file cannot be deserialized. {:?}.", e),
)
})
.map_err(From::from)
}
/// Search Index
@@ -135,7 +142,7 @@ impl Index {
/// Creates a new index given an implementation of the trait `Directory`
pub fn create<Dir: Directory>(dir: Dir, schema: Schema) -> Result<Index> {
let directory = ManagedDirectory::new(dir)?;
let directory = ManagedDirectory::wrap(dir)?;
Index::from_directory(directory, schema)
}
@@ -143,7 +150,7 @@ impl Index {
///
/// This will overwrite existing meta.json
fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
save_new_metas(schema.clone(), 0, directory.borrow_mut())?;
save_new_metas(schema.clone(), directory.borrow_mut())?;
let metas = IndexMeta::with_schema(schema);
Index::create_from_metas(directory, &metas)
}
@@ -199,7 +206,7 @@ impl Index {
/// Open the index using the provided directory
pub fn open<D: Directory>(directory: D) -> Result<Index> {
let directory = ManagedDirectory::new(directory)?;
let directory = ManagedDirectory::wrap(directory)?;
let metas = load_metas(&directory)?;
Index::create_from_metas(directory, &metas)
}

View File

@@ -32,10 +32,7 @@ pub struct InvertedIndexReader {
}
impl InvertedIndexReader {
#[cfg_attr(
feature = "cargo-clippy",
allow(clippy::needless_pass_by_value)
)] // for symetry
#[cfg_attr(feature = "cargo-clippy", allow(clippy::needless_pass_by_value))] // for symetry
pub(crate) fn new(
termdict: TermDictionary,
postings_source: ReadOnlySource,

View File

@@ -104,7 +104,8 @@ impl Searcher {
.iter()
.map(|segment_reader| {
u64::from(segment_reader.inverted_index(term.field()).doc_freq(term))
}).sum::<u64>()
})
.sum::<u64>()
}
/// Return the list of segment readers

View File

@@ -1,7 +1,7 @@
use core::MANAGED_FILEPATH;
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
use directory::{ReadOnlySource, WritePtr};
use error::TantivyError;
use error::DataCorruption;
use indexer::LockType;
use serde_json;
use std::collections::HashSet;
@@ -59,12 +59,17 @@ fn save_managed_paths(
impl ManagedDirectory {
/// Wraps a directory as managed directory.
pub fn new<Dir: Directory>(directory: Dir) -> Result<ManagedDirectory> {
pub fn wrap<Dir: Directory>(directory: Dir) -> Result<ManagedDirectory> {
match directory.atomic_read(&MANAGED_FILEPATH) {
Ok(data) => {
let managed_files_json = String::from_utf8_lossy(&data);
let managed_files: HashSet<PathBuf> = serde_json::from_str(&managed_files_json)
.map_err(|_| TantivyError::CorruptedFile(MANAGED_FILEPATH.clone()))?;
.map_err(|e| {
DataCorruption::new(
MANAGED_FILEPATH.clone(),
format!("Managed file cannot be deserialized: {:?}. ", e),
)
})?;
Ok(ManagedDirectory {
directory: Box::new(directory),
meta_informations: Arc::new(RwLock::new(MetaInformation {
@@ -260,7 +265,7 @@ mod tests {
let tempdir_path = PathBuf::from(tempdir.path());
{
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
{
let mut write_file = managed_directory.open_write(*TEST_PATH1).unwrap();
write_file.flush().unwrap();
@@ -286,7 +291,7 @@ mod tests {
}
{
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
{
assert!(managed_directory.exists(*TEST_PATH1));
assert!(!managed_directory.exists(*TEST_PATH2));
@@ -310,7 +315,7 @@ mod tests {
let living_files = HashSet::new();
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
managed_directory
.atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
.unwrap();

View File

@@ -100,7 +100,8 @@ impl InnerDirectory {
);
let io_err = make_io_err(msg);
OpenReadError::IOError(IOError::with_path(path.to_owned(), io_err))
}).and_then(|readable_map| {
})
.and_then(|readable_map| {
readable_map
.get(path)
.ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
@@ -120,7 +121,8 @@ impl InnerDirectory {
);
let io_err = make_io_err(msg);
DeleteError::IOError(IOError::with_path(path.to_owned(), io_err))
}).and_then(|mut writable_map| match writable_map.remove(path) {
})
.and_then(|mut writable_map| match writable_map.remove(path) {
Some(_) => Ok(()),
None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))),
})

View File

@@ -8,9 +8,42 @@ use indexer::LockType;
use query;
use schema;
use serde_json;
use std::fmt;
use std::path::PathBuf;
use std::sync::PoisonError;
pub struct DataCorruption {
filepath: Option<PathBuf>,
comment: String,
}
impl DataCorruption {
pub fn new(filepath: PathBuf, comment: String) -> DataCorruption {
DataCorruption {
filepath: Some(filepath),
comment,
}
}
pub fn comment_only(comment: String) -> DataCorruption {
DataCorruption {
filepath: None,
comment,
}
}
}
impl fmt::Debug for DataCorruption {
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
write!(f, "Data corruption: ")?;
if let Some(ref filepath) = &self.filepath {
write!(f, "(in file `{:?}`)", filepath)?;
}
write!(f, ": {}.", self.comment)?;
Ok(())
}
}
/// The library's failure based error enum
#[derive(Debug, Fail)]
pub enum TantivyError {
@@ -33,8 +66,8 @@ pub enum TantivyError {
#[fail(display = "An IO error occurred: '{}'", _0)]
IOError(#[cause] IOError),
/// Data corruption.
#[fail(display = "File contains corrupted data: '{:?}'", _0)]
CorruptedFile(PathBuf),
#[fail(display = "{:?}", _0)]
DataCorruption(DataCorruption),
/// A thread holding the locked panicked and poisoned the lock.
#[fail(display = "A thread holding the locked panicked and poisoned the lock")]
Poisoned,
@@ -55,6 +88,12 @@ pub enum TantivyError {
SystemError(String),
}
impl From<DataCorruption> for TantivyError {
fn from(data_corruption: DataCorruption) -> TantivyError {
TantivyError::DataCorruption(data_corruption)
}
}
impl From<FastFieldNotAvailableError> for TantivyError {
fn from(fastfield_error: FastFieldNotAvailableError) -> TantivyError {
TantivyError::FastFieldError(fastfield_error)

View File

@@ -1,5 +1,6 @@
use super::MultiValueIntFastFieldReader;
use schema::Facet;
use std::str;
use termdict::TermDictionary;
use termdict::TermOrdinal;
use DocId;
@@ -20,6 +21,7 @@ use DocId;
pub struct FacetReader {
term_ords: MultiValueIntFastFieldReader<u64>,
term_dict: TermDictionary,
buffer: Vec<u8>,
}
impl FacetReader {
@@ -37,6 +39,7 @@ impl FacetReader {
FacetReader {
term_ords,
term_dict,
buffer: vec![],
}
}
@@ -55,11 +58,18 @@ impl FacetReader {
}
/// Given a term ordinal returns the term associated to it.
pub fn facet_from_ord(&self, facet_ord: TermOrdinal, output: &mut Facet) {
pub fn facet_from_ord(
&mut self,
facet_ord: TermOrdinal,
output: &mut Facet,
) -> Result<(), str::Utf8Error> {
let found_term = self
.term_dict
.ord_to_term(facet_ord as u64, output.inner_buffer_mut());
.ord_to_term(facet_ord as u64, &mut self.buffer);
assert!(found_term, "Term ordinal {} no found.", facet_ord);
let facet_str = str::from_utf8(&self.buffer[..])?;
output.set_facet_str(facet_str);
Ok(())
}
/// Return the list of facet ordinals associated to a document.

View File

@@ -82,20 +82,20 @@ mod tests {
let mut facet = Facet::root();
{
facet_reader.facet_from_ord(1, &mut facet);
facet_reader.facet_from_ord(1, &mut facet).unwrap();
assert_eq!(facet, Facet::from("/category"));
}
{
facet_reader.facet_from_ord(2, &mut facet);
facet_reader.facet_from_ord(2, &mut facet).unwrap();
assert_eq!(facet, Facet::from("/category/cat1"));
}
{
facet_reader.facet_from_ord(3, &mut facet);
facet_reader.facet_from_ord(3, &mut facet).unwrap();
assert_eq!(format!("{}", facet), "/category/cat2");
assert_eq!(facet, Facet::from("/category/cat2"));
}
{
facet_reader.facet_from_ord(4, &mut facet);
facet_reader.facet_from_ord(4, &mut facet).unwrap();
assert_eq!(facet, Facet::from("/category/cat3"));
}

View File

@@ -191,10 +191,7 @@ impl DeleteCursor {
}
}
#[cfg_attr(
feature = "cargo-clippy",
allow(clippy::wrong_self_convention)
)]
#[cfg_attr(feature = "cargo-clippy", allow(clippy::wrong_self_convention))]
fn is_behind_opstamp(&mut self, target_opstamp: u64) -> bool {
self.get()
.map(|operation| operation.opstamp < target_opstamp)

View File

@@ -61,7 +61,8 @@ fn initial_table_size(per_thread_memory_budget: usize) -> usize {
"Per thread memory is too small: {}",
per_thread_memory_budget
)
}).min(19) // we cap it at 512K
})
.min(19) // we cap it at 512K
}
/// `IndexWriter` is the user entry-point to add document to an index.
@@ -139,7 +140,7 @@ pub fn open_index_writer(
let stamper = Stamper::new(current_opstamp);
let segment_updater =
SegmentUpdater::new(index.clone(), stamper.clone(), &delete_queue.cursor())?;
SegmentUpdater::create(index.clone(), stamper.clone(), &delete_queue.cursor())?;
let mut index_writer = IndexWriter {
_directory_lock: Some(directory_lock),
@@ -390,7 +391,8 @@ impl IndexWriter {
.name(format!(
"thrd-tantivy-index{}-gen{}",
self.worker_id, generation
)).spawn(move || {
))
.spawn(move || {
loop {
let mut document_iterator =
document_receiver_clone.clone().into_iter().peekable();
@@ -465,10 +467,8 @@ impl IndexWriter {
///
/// Returns the former segment_ready channel.
fn recreate_document_channel(&mut self) -> DocumentReceiver {
let (mut document_sender, mut document_receiver): (
DocumentSender,
DocumentReceiver,
) = channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
let (mut document_sender, mut document_receiver): (DocumentSender, DocumentReceiver) =
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
swap(&mut self.document_sender, &mut document_sender);
swap(&mut self.document_receiver, &mut document_receiver);
document_receiver
@@ -558,11 +558,8 @@ impl IndexWriter {
// and recreate a new one channels.
self.recreate_document_channel();
let mut former_workers_join_handle = Vec::new();
swap(
&mut former_workers_join_handle,
&mut self.workers_join_handle,
);
let former_workers_join_handle =
mem::replace(&mut self.workers_join_handle, Vec::new());
for worker_handle in former_workers_join_handle {
let indexing_worker_result = worker_handle
@@ -739,7 +736,7 @@ mod tests {
index_writer.add_document(doc!(text_field=>"b"));
index_writer.add_document(doc!(text_field=>"c"));
}
assert_eq!(index_writer.commit().unwrap(), 2u64);
assert_eq!(index_writer.commit().unwrap(), 3u64);
index.load_searchers().unwrap();
assert_eq!(num_docs_containing("a"), 0);
assert_eq!(num_docs_containing("b"), 1);
@@ -802,7 +799,6 @@ mod tests {
{
let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
prepared_commit.set_payload("first commit");
assert_eq!(prepared_commit.opstamp(), 100);
prepared_commit.commit().expect("commit failed");
}
{
@@ -836,7 +832,6 @@ mod tests {
{
let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
prepared_commit.set_payload("first commit");
assert_eq!(prepared_commit.opstamp(), 100);
prepared_commit.abort().expect("commit failed");
}
{

View File

@@ -40,13 +40,15 @@ fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
total_tokens += reader.inverted_index(field).total_num_tokens();
}
}
total_tokens + count
.iter()
.cloned()
.enumerate()
.map(|(fieldnorm_ord, count)| {
count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8))
}).sum::<u64>()
total_tokens
+ count
.iter()
.cloned()
.enumerate()
.map(|(fieldnorm_ord, count)| {
count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8))
})
.sum::<u64>()
}
pub struct IndexMerger {
@@ -523,7 +525,8 @@ impl IndexMerger {
}
}
None
}).collect();
})
.collect();
// At this point, `segment_postings` contains the posting list
// of all of the segments containing the given term.
@@ -651,6 +654,7 @@ mod tests {
use schema::IntOptions;
use schema::Term;
use schema::TextFieldIndexing;
use schema::INT_INDEXED;
use std::io::Cursor;
use DocAddress;
use IndexWriter;
@@ -664,7 +668,8 @@ mod tests {
TextFieldIndexing::default()
.set_tokenizer("default")
.set_index_option(IndexRecordOption::WithFreqs),
).set_stored();
)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue);
let score_field = schema_builder.add_u64_field("score", score_fieldtype);
@@ -803,7 +808,8 @@ mod tests {
.search(
&query,
&BytesFastFieldTestCollector::for_field(bytes_score_field),
).expect("failed to search")
)
.expect("failed to search")
};
assert_eq!(
get_fast_vals(vec![Term::from_field_text(text_field, "a")]),
@@ -823,7 +829,8 @@ mod tests {
let text_fieldtype = schema::TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
).set_stored();
)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue);
let score_field = schema_builder.add_u64_field("score", score_fieldtype);
@@ -851,21 +858,21 @@ mod tests {
{
// a first commit
index_writer.add_document(doc!(
text_field => "a b d",
score_field => 1u64,
bytes_score_field => vec![0u8, 0, 0, 1],
));
text_field => "a b d",
score_field => 1u64,
bytes_score_field => vec![0u8, 0, 0, 1],
));
index_writer.add_document(doc!(
text_field => "b c",
score_field => 2u64,
bytes_score_field => vec![0u8, 0, 0, 2],
));
text_field => "b c",
score_field => 2u64,
bytes_score_field => vec![0u8, 0, 0, 2],
));
index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.add_document(doc!(
text_field => "c d",
score_field => 3u64,
bytes_score_field => vec![0u8, 0, 0, 3],
));
text_field => "c d",
score_field => 3u64,
bytes_score_field => vec![0u8, 0, 0, 3],
));
index_writer.commit().expect("committed");
index.load_searchers().unwrap();
let ref searcher = *index.searcher();
@@ -892,27 +899,27 @@ mod tests {
{
// a second commit
index_writer.add_document(doc!(
text_field => "a d e",
score_field => 4_000u64,
bytes_score_field => vec![0u8, 0, 0, 4],
));
text_field => "a d e",
score_field => 4_000u64,
bytes_score_field => vec![0u8, 0, 0, 4],
));
index_writer.add_document(doc!(
text_field => "e f",
score_field => 5_000u64,
bytes_score_field => vec![0u8, 0, 0, 5],
));
text_field => "e f",
score_field => 5_000u64,
bytes_score_field => vec![0u8, 0, 0, 5],
));
index_writer.delete_term(Term::from_field_text(text_field, "a"));
index_writer.delete_term(Term::from_field_text(text_field, "f"));
index_writer.add_document(doc!(
text_field => "f g",
score_field => 6_000u64,
bytes_score_field => vec![0u8, 0, 23, 112],
));
text_field => "f g",
score_field => 6_000u64,
bytes_score_field => vec![0u8, 0, 23, 112],
));
index_writer.add_document(doc!(
text_field => "g h",
score_field => 7_000u64,
bytes_score_field => vec![0u8, 0, 27, 88],
));
text_field => "g h",
score_field => 7_000u64,
bytes_score_field => vec![0u8, 0, 27, 88],
));
index_writer.commit().expect("committed");
index.load_searchers().unwrap();
let searcher = index.searcher();
@@ -977,7 +984,7 @@ mod tests {
.wait()
.expect("Merging failed");
index.load_searchers().unwrap();
let ref searcher = *index.searcher();
let searcher = index.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.num_docs(), 3);
assert_eq!(searcher.segment_readers()[0].num_docs(), 3);
@@ -1023,7 +1030,7 @@ mod tests {
index_writer.commit().unwrap();
index.load_searchers().unwrap();
let ref searcher = *index.searcher();
let searcher = index.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
@@ -1119,6 +1126,7 @@ mod tests {
{
// Test removing all docs
index_writer.delete_term(Term::from_field_text(text_field, "g"));
index_writer.commit().unwrap();
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
@@ -1249,6 +1257,34 @@ mod tests {
}
}
#[test]
fn test_bug_merge() {
let mut schema_builder = schema::Schema::builder();
let int_field = schema_builder.add_u64_field("intvals", INT_INDEXED);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(int_field => 1u64));
index_writer.commit().expect("commit failed");
index_writer.add_document(doc!(int_field => 1u64));
index_writer.commit().expect("commit failed");
index.load_searchers().unwrap();
let searcher = index.searcher();
assert_eq!(searcher.num_docs(), 2);
index_writer.delete_term(Term::from_field_u64(int_field, 1));
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
index_writer
.merge(&segment_ids)
.expect("Failed to initiate merge")
.wait()
.expect("Merging failed");
index.load_searchers().unwrap();
// commit has not been called yet. The document should still be
// there.
assert_eq!(index.searcher().num_docs(), 2);
}
#[test]
fn test_merge_multivalued_int_fields_all_deleted() {
let mut schema_builder = schema::Schema::builder();

View File

@@ -18,7 +18,6 @@ use indexer::delete_queue::DeleteCursor;
use indexer::index_writer::advance_deletes;
use indexer::merger::IndexMerger;
use indexer::stamper::Stamper;
use indexer::MergeCandidate;
use indexer::SegmentEntry;
use indexer::SegmentSerializer;
use indexer::{DefaultMergePolicy, MergePolicy};
@@ -45,8 +44,15 @@ use Result;
/// and flushed.
///
/// This method is not part of tantivy's public API
pub fn save_new_metas(schema: Schema, opstamp: u64, directory: &mut Directory) -> Result<()> {
save_metas(vec![], schema, opstamp, None, directory)
pub fn save_new_metas(schema: Schema, directory: &mut Directory) -> Result<()> {
save_metas(
&IndexMeta {
segments: Vec::new(),
schema,
opstamp: 0u64,
payload: None
},
directory)
}
/// Save the index meta file.
@@ -58,20 +64,17 @@ pub fn save_new_metas(schema: Schema, opstamp: u64, directory: &mut Directory) -
/// and flushed.
///
/// This method is not part of tantivy's public API
pub fn save_metas(
segment_metas: Vec<SegmentMeta>,
schema: Schema,
opstamp: u64,
payload: Option<String>,
fn save_metas(
metas: &IndexMeta,
directory: &mut Directory,
) -> Result<()> {
let metas = IndexMeta {
segments: segment_metas,
schema,
opstamp,
payload,
};
let mut buffer = serde_json::to_vec_pretty(&metas)?;
// let metas = IndexMeta {
// segments: segment_metas,
// schema,
// opstamp,
// payload,
// };
let mut buffer = serde_json::to_vec_pretty(metas)?;
writeln!(&mut buffer)?;
directory.atomic_write(&META_FILEPATH, &buffer[..])?;
debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas));
@@ -86,6 +89,11 @@ pub fn save_metas(
#[derive(Clone)]
pub struct SegmentUpdater(Arc<InnerSegmentUpdater>);
struct MergeOperation {
pub target_opstamp: u64,
pub segment_ids: Vec<SegmentId>,
}
fn perform_merge(
index: &Index,
mut segment_entries: Vec<SegmentEntry>,
@@ -126,6 +134,13 @@ fn perform_merge(
}
struct InnerSegmentUpdater {
// we keep a copy of the current active IndexMeta to
// avoid loading the file everytime we need it in the
// `SegmentUpdater`.
//
// This should be up to date as all update happen through
// the unique active `SegmentUpdater`.
active_metas: RwLock<Arc<IndexMeta>>,
pool: CpuPool,
index: Index,
segment_manager: SegmentManager,
@@ -138,7 +153,7 @@ struct InnerSegmentUpdater {
}
impl SegmentUpdater {
pub fn new(
pub fn create(
index: Index,
stamper: Stamper,
delete_cursor: &DeleteCursor,
@@ -149,7 +164,9 @@ impl SegmentUpdater {
.name_prefix("segment_updater")
.pool_size(1)
.create();
let index_meta = index.load_metas()?;
Ok(SegmentUpdater(Arc::new(InnerSegmentUpdater {
active_metas: RwLock::new(Arc::new(index_meta)),
pool,
index,
segment_manager,
@@ -195,7 +212,8 @@ impl SegmentUpdater {
segment_updater.0.segment_manager.add_segment(segment_entry);
segment_updater.consider_merge_options();
true
}).forget();
})
.forget();
true
} else {
false
@@ -243,20 +261,26 @@ impl SegmentUpdater {
//
// Segment 1 from disk 1, Segment 1 from disk 2, etc.
commited_segment_metas.sort_by_key(|segment_meta| -(segment_meta.max_doc() as i32));
save_metas(
commited_segment_metas,
index.schema(),
let index_meta = IndexMeta {
segments: commited_segment_metas,
schema: index.schema(),
opstamp,
commit_message,
payload: commit_message
};
save_metas(
&index_meta,
directory.box_clone().borrow_mut(),
).expect("Could not save metas.");
)
.expect("Could not save metas.");
self.store_meta(&index_meta);
}
}
pub fn garbage_collect_files(&self) -> Result<()> {
self.run_async(move |segment_updater| {
segment_updater.garbage_collect_files_exec();
}).wait()
})
.wait()
}
fn garbage_collect_files_exec(&self) {
@@ -278,19 +302,32 @@ impl SegmentUpdater {
segment_updater.garbage_collect_files_exec();
segment_updater.consider_merge_options();
}
}).wait()
})
.wait()
}
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> {
//let future_merged_segment = */
let segment_ids_vec = segment_ids.to_vec();
let commit_opstamp = self.load_metas().opstamp;
self.run_async(move |segment_updater| {
segment_updater.start_merge_impl(&segment_ids_vec[..])
}).wait()?
segment_updater.start_merge_impl(&segment_ids_vec[..], commit_opstamp)
})
.wait()?
}
fn store_meta(&self, index_meta: &IndexMeta) {
*self.0.active_metas.write().unwrap() = Arc::new(index_meta.clone());
}
fn load_metas(&self) -> Arc<IndexMeta> {
self.0.active_metas.read().unwrap().clone()
}
// `segment_ids` is required to be non-empty.
fn start_merge_impl(&self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> {
fn start_merge_impl(
&self,
segment_ids: &[SegmentId],
target_opstamp: u64,
) -> Result<Receiver<SegmentMeta>> {
assert!(!segment_ids.is_empty(), "Segment_ids cannot be empty.");
let segment_updater_clone = self.clone();
@@ -305,8 +342,6 @@ impl SegmentUpdater {
);
let (merging_future_send, merging_future_recv) = oneshot();
let target_opstamp = self.0.stamper.stamp();
// first we need to apply deletes to our segment.
let merging_join_handle = thread::Builder::new()
.name(format!("mergingthread-{}", merging_thread_id))
@@ -352,7 +387,8 @@ impl SegmentUpdater {
.unwrap()
.remove(&merging_thread_id);
Ok(())
}).expect("Failed to spawn a thread.");
})
.expect("Failed to spawn a thread.");
self.0
.merging_threads
.write()
@@ -367,11 +403,32 @@ impl SegmentUpdater {
// Committed segments cannot be merged with uncommitted_segments.
// We therefore consider merges using these two sets of segments independently.
let merge_policy = self.get_merge_policy();
let mut merge_candidates = merge_policy.compute_merge_candidates(&uncommitted_segments);
let committed_merge_candidates = merge_policy.compute_merge_candidates(&committed_segments);
merge_candidates.extend_from_slice(&committed_merge_candidates[..]);
for MergeCandidate(segment_metas) in merge_candidates {
match self.start_merge_impl(&segment_metas) {
let current_opstamp = self.0.stamper.stamp();
let mut merge_candidates = merge_policy
.compute_merge_candidates(&uncommitted_segments)
.into_iter()
.map(|merge_candidate| MergeOperation {
target_opstamp: current_opstamp,
segment_ids: merge_candidate.0,
})
.collect::<Vec<_>>();
let commit_opstamp = self.load_metas().opstamp;
let committed_merge_candidates = merge_policy
.compute_merge_candidates(&committed_segments)
.into_iter()
.map(|merge_candidate| MergeOperation {
target_opstamp: commit_opstamp,
segment_ids: merge_candidate.0,
})
.collect::<Vec<_>>();
merge_candidates.extend(committed_merge_candidates.into_iter());
for MergeOperation {
target_opstamp,
segment_ids,
} in merge_candidates
{
match self.start_merge_impl(&segment_ids, target_opstamp) {
Ok(merge_future) => {
if let Err(e) = merge_future.fuse().poll() {
error!("The merge task failed quickly after starting: {:?}", e);
@@ -406,12 +463,7 @@ impl SegmentUpdater {
info!("End merge {:?}", after_merge_segment_entry.meta());
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
if let Some(delete_operation) = delete_cursor.get() {
let committed_opstamp = segment_updater
.0
.index
.load_metas()
.expect("Failed to read opstamp")
.opstamp;
let committed_opstamp = segment_updater.load_metas().opstamp;
if delete_operation.opstamp < committed_opstamp {
let index = &segment_updater.0.index;
let segment = index.segment(after_merge_segment_entry.meta().clone());
@@ -440,10 +492,11 @@ impl SegmentUpdater {
.end_merge(&before_merge_segment_ids, after_merge_segment_entry);
segment_updater.consider_merge_options();
info!("save metas");
let previous_metas = segment_updater.0.index.load_metas().unwrap();
segment_updater.save_metas(previous_metas.opstamp, previous_metas.payload);
let previous_metas = segment_updater.load_metas();
segment_updater.save_metas(previous_metas.opstamp, previous_metas.payload.clone());
segment_updater.garbage_collect_files_exec();
}).wait()
})
.wait()
}
/// Wait for current merging threads.

View File

@@ -62,7 +62,8 @@ impl SegmentWriter {
segment.index().tokenizers().get(tokenizer_name)
}),
_ => None,
}).collect();
})
.collect();
Ok(SegmentWriter {
max_doc: 0,
multifield_postings,
@@ -110,18 +111,18 @@ impl SegmentWriter {
}
match *field_options.field_type() {
FieldType::HierarchicalFacet => {
let facets: Vec<&[u8]> = field_values
let facets: Vec<&str> = field_values
.iter()
.flat_map(|field_value| match *field_value.value() {
Value::Facet(ref facet) => Some(facet.encoded_bytes()),
Value::Facet(ref facet) => Some(facet.encoded_str()),
_ => {
panic!("Expected hierarchical facet");
}
}).collect();
})
.collect();
let mut term = Term::for_field(field); // we set the Term
for facet_bytes in facets {
for fake_str in facets {
let mut unordered_term_id_opt = None;
let fake_str = unsafe { str::from_utf8_unchecked(facet_bytes) };
FacetTokenizer.token_stream(fake_str).process(&mut |token| {
term.set_text(&token.text);
let unordered_term_id =
@@ -145,7 +146,8 @@ impl SegmentWriter {
.flat_map(|field_value| match *field_value.value() {
Value::Str(ref text) => Some(text.as_str()),
_ => None,
}).collect();
})
.collect();
if texts.is_empty() {
0
} else {

View File

@@ -1,50 +1,68 @@
use std::sync::Arc;
use std::sync::atomic::Ordering;
// AtomicU64 have not landed in stable.
// For the moment let's just use AtomicUsize on
// x86/64 bit platform, and a mutex on other platform.
#[cfg(target = "x86_64")]
#[cfg(target_arch = "x86_64")]
mod archicture_impl {
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
#[derive(Clone, Default)]
pub struct Stamper(Arc<AtomicU64>);
#[derive(Default)]
pub struct AtomicU64Ersatz(AtomicUsize);
impl Stamper {
pub fn new(first_opstamp: u64) -> Stamper {
Stamper(Arc::new(AtomicU64::new(first_opstamp)))
impl AtomicU64Ersatz {
pub fn new(first_opstamp: u64) -> AtomicU64Ersatz {
AtomicU64Ersatz(AtomicUsize::new(first_opstamp as usize))
}
pub fn stamp(&self) -> u64 {
self.0.fetch_add(1u64, Ordering::SeqCst) as u64
pub fn fetch_add(&self, val: u64, order: Ordering) -> u64 {
self.0.fetch_add(val as usize, order) as u64
}
}
}
#[cfg(not(target = "x86_64"))]
#[cfg(not(target_arch = "x86_64"))]
mod archicture_impl {
use std::sync::{Arc, Mutex};
/// Under other architecture, we rely on a mutex.
use std::sync::Mutex;
use std::sync::atomic::Ordering;
#[derive(Clone, Default)]
pub struct Stamper(Arc<Mutex<u64>>);
#[derive(Default)]
pub struct AtomicU64Ersatz(Mutex<u64>);
impl Stamper {
pub fn new(first_opstamp: u64) -> Stamper {
Stamper(Arc::new(Mutex::new(first_opstamp)))
impl AtomicU64Ersatz {
pub fn new(first_opstamp: u64) -> AtomicU64Ersatz {
AtomicU64Ersatz(AtomicUsize::new(first_opstamp))
}
pub fn stamp(&self) -> u64 {
let mut guard = self.0.lock().expect("Failed to lock the stamper");
let previous_val = *guard;
*guard = previous_val + 1;
pub fn fetch_add(&self, val: u64, _order: Ordering) -> u64 {
let lock = self.0.lock().unwrap();
let previous_val = *lock;
*lock = previous_val + 1;
previous_val
}
}
}
pub use self::archicture_impl::Stamper;
use self::archicture_impl::AtomicU64Ersatz;
#[derive(Clone, Default)]
pub struct Stamper(Arc<AtomicU64Ersatz>);
impl Stamper {
pub fn new(first_opstamp: u64) -> Stamper {
Stamper(Arc::new(AtomicU64Ersatz::new(first_opstamp)))
}
pub fn stamp(&self) -> u64 {
self.0.fetch_add(1u64, Ordering::SeqCst) as u64
}
}
#[cfg(test)]
mod test {

View File

@@ -1,6 +1,5 @@
#![doc(html_logo_url = "http://fulmicoton.com/tantivy-logo/tantivy-logo.png")]
#![cfg_attr(all(feature = "unstable", test), feature(test))]
#![cfg_attr(feature = "cargo-clippy", feature(tool_lints))]
#![cfg_attr(feature = "cargo-clippy", allow(clippy::module_inception))]
#![doc(test(attr(allow(unused_variables), deny(warnings))))]
#![warn(missing_docs)]
@@ -184,10 +183,7 @@ mod macros;
pub use error::TantivyError;
#[deprecated(
since = "0.7.0",
note = "please use `tantivy::TantivyError` instead"
)]
#[deprecated(since = "0.7.0", note = "please use `tantivy::TantivyError` instead")]
pub use error::TantivyError as Error;
extern crate census;
@@ -217,7 +213,7 @@ pub mod store;
pub mod termdict;
mod snippet;
pub use self::snippet::{SnippetGenerator, Snippet};
pub use self::snippet::{Snippet, SnippetGenerator};
mod docset;
pub use self::docset::{DocSet, SkipResult};
@@ -514,11 +510,9 @@ mod tests {
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(text_field);
assert!(
inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none()
);
assert!(inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none());
{
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
@@ -553,11 +547,9 @@ mod tests {
let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(term_abcd.field());
assert!(
inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none()
);
assert!(inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none());
{
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
@@ -591,11 +583,9 @@ mod tests {
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(term_abcd.field());
assert!(
inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none()
);
assert!(inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none());
{
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
@@ -743,11 +733,9 @@ mod tests {
let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(text_field);
let term_abcd = Term::from_field_text(text_field, "abcd");
assert!(
inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none()
);
assert!(inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none());
let term_af = Term::from_field_text(text_field, "af");
let mut postings = inverted_index
.read_postings(&term_af, IndexRecordOption::WithFreqsAndPositions)

View File

@@ -77,10 +77,10 @@ mod test {
let likes = schema_builder.add_u64_field("num_u64", FAST);
let _schema = schema_builder.build();
let _doc = doc!(
title => "Life Aquatic",
author => "Wes Anderson",
likes => 4u64
);
title => "Life Aquatic",
author => "Wes Anderson",
likes => 4u64
);
}
#[test]
@@ -91,9 +91,9 @@ mod test {
let likes = schema_builder.add_u64_field("num_u64", FAST);
let _schema = schema_builder.build();
let _doc = doc!(
title => "Life Aquatic",
author => "Wes Anderson",
likes => 4u64,
);
title => "Life Aquatic",
author => "Wes Anderson",
likes => 4u64,
);
}
}

View File

@@ -221,12 +221,10 @@ pub mod tests {
}
{
let term_a = Term::from_field_text(text_field, "abcdef");
assert!(
segment_reader
.inverted_index(term_a.field())
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.is_none()
);
assert!(segment_reader
.inverted_index(term_a.field())
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.is_none());
}
{
let term_a = Term::from_field_text(text_field, "a");

View File

@@ -29,7 +29,8 @@ fn posting_from_field_entry(field_entry: &FieldEntry) -> Box<PostingsWriter> {
IndexRecordOption::WithFreqsAndPositions => {
SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed()
}
}).unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
})
.unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
FieldType::U64(_) | FieldType::I64(_) | FieldType::HierarchicalFacet => {
SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
}
@@ -107,10 +108,8 @@ impl MultiFieldPostingsWriter {
.map(|(key, _, _)| Term::wrap(key).field())
.enumerate();
let mut unordered_term_mappings: HashMap<
Field,
HashMap<UnorderedTermId, TermOrdinal>,
> = HashMap::new();
let mut unordered_term_mappings: HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>> =
HashMap::new();
let mut prev_field = Field(u32::max_value());
for (offset, field) in term_offsets_it {
@@ -138,7 +137,8 @@ impl MultiFieldPostingsWriter {
.enumerate()
.map(|(term_ord, unord_term_id)| {
(unord_term_id as UnorderedTermId, term_ord as TermOrdinal)
}).collect();
})
.collect();
unordered_term_mappings.insert(field, mapping);
}
FieldType::U64(_) | FieldType::I64(_) => {}

View File

@@ -126,7 +126,6 @@ impl SegmentPostings {
fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) {
let mut start = 0;
let end = arr.len();
debug_assert!(target >= arr[start]);
debug_assert!(target <= arr[end - 1]);
let mut jump = 1;
loop {
@@ -216,11 +215,10 @@ impl DocSet for SegmentPostings {
// we're in the right block now, start with an exponential search
let block_docs = self.block_cursor.docs();
debug_assert!(target >= self.doc());
let new_cur = self
.cur
.wrapping_add(search_within_block(&block_docs[self.cur..], target));
if need_positions {
sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur]
.iter()
@@ -533,7 +531,8 @@ impl BlockSegmentPostings {
} else {
BlockSegmentPostingsSkipResult::Terminated
}
}).unwrap_or(BlockSegmentPostingsSkipResult::Terminated);
})
.unwrap_or(BlockSegmentPostingsSkipResult::Terminated);
}
BlockSegmentPostingsSkipResult::Terminated
}
@@ -621,6 +620,7 @@ impl<'b> Streamer<'b> for BlockSegmentPostings {
#[cfg(test)]
mod tests {
use super::exponential_search;
use super::search_within_block;
use super::BlockSegmentPostings;
use super::BlockSegmentPostingsSkipResult;
@@ -634,6 +634,7 @@ mod tests {
use schema::Term;
use schema::INT_INDEXED;
use DocId;
use SkipResult;
#[test]
fn test_empty_segment_postings() {
@@ -661,6 +662,16 @@ mod tests {
.0
}
#[test]
fn test_exponentiel_search() {
assert_eq!(exponential_search(0, &[1, 2]), (0, 1));
assert_eq!(exponential_search(1, &[1, 2]), (0, 1));
assert_eq!(
exponential_search(7, &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
(3, 7)
);
}
fn util_test_search_within_block(block: &[u32], target: u32) {
assert_eq!(
search_within_block(block, target),
@@ -692,7 +703,7 @@ mod tests {
#[test]
fn test_block_segment_postings() {
let mut block_segments = build_block_postings((0..100_000).collect::<Vec<u32>>());
let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>());
let mut offset: u32 = 0u32;
// checking that the block before calling advance is empty
assert!(block_segments.docs().is_empty());
@@ -706,14 +717,44 @@ mod tests {
}
}
fn build_block_postings(docs: Vec<DocId>) -> BlockSegmentPostings {
#[test]
fn test_skip_right_at_new_block() {
let mut doc_ids = (0..128).collect::<Vec<u32>>();
doc_ids.push(129);
doc_ids.push(130);
{
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.skip_next(128), SkipResult::OverStep);
assert_eq!(docset.doc(), 129);
assert!(docset.advance());
assert_eq!(docset.doc(), 130);
assert!(!docset.advance());
}
{
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.skip_next(129), SkipResult::Reached);
assert_eq!(docset.doc(), 129);
assert!(docset.advance());
assert_eq!(docset.doc(), 130);
assert!(!docset.advance());
}
{
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.skip_next(131), SkipResult::End);
}
}
fn build_block_postings(docs: &[DocId]) -> BlockSegmentPostings {
let mut schema_builder = Schema::builder();
let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let mut last_doc = 0u32;
for doc in docs {
for &doc in docs {
for _ in last_doc..doc {
index_writer.add_document(doc!(int_field=>1u64));
}
@@ -733,7 +774,7 @@ mod tests {
#[test]
fn test_block_segment_postings_skip() {
for i in 0..4 {
let mut block_postings = build_block_postings(vec![3]);
let mut block_postings = build_block_postings(&[3]);
assert_eq!(
block_postings.skip_to(i),
BlockSegmentPostingsSkipResult::Success(0u32)
@@ -743,7 +784,7 @@ mod tests {
BlockSegmentPostingsSkipResult::Terminated
);
}
let mut block_postings = build_block_postings(vec![3]);
let mut block_postings = build_block_postings(&[3]);
assert_eq!(
block_postings.skip_to(4u32),
BlockSegmentPostingsSkipResult::Terminated
@@ -756,7 +797,7 @@ mod tests {
for i in 0..1300 {
docs.push((i * i / 100) + i);
}
let mut block_postings = build_block_postings(docs.clone());
let mut block_postings = build_block_postings(&docs[..]);
for i in vec![0, 424, 10000] {
assert_eq!(
block_postings.skip_to(i),

View File

@@ -55,7 +55,7 @@ pub struct InvertedIndexSerializer {
impl InvertedIndexSerializer {
/// Open a new `PostingsSerializer` for the given segment
fn new(
fn create(
terms_write: CompositeWrite<WritePtr>,
postings_write: CompositeWrite<WritePtr>,
positions_write: CompositeWrite<WritePtr>,
@@ -74,7 +74,7 @@ impl InvertedIndexSerializer {
/// Open a new `PostingsSerializer` for the given segment
pub fn open(segment: &mut Segment) -> Result<InvertedIndexSerializer> {
use SegmentComponent::{POSITIONS, POSITIONSSKIP, POSTINGS, TERMS};
InvertedIndexSerializer::new(
InvertedIndexSerializer::create(
CompositeWrite::wrap(segment.open_write(TERMS)?),
CompositeWrite::wrap(segment.open_write(POSTINGS)?),
CompositeWrite::wrap(segment.open_write(POSITIONS)?),
@@ -99,7 +99,7 @@ impl InvertedIndexSerializer {
let positions_write = self.positions_write.for_field(field);
let positionsidx_write = self.positionsidx_write.for_field(field);
let field_type: FieldType = (*field_entry.field_type()).clone();
FieldSerializer::new(
FieldSerializer::create(
&field_type,
term_dictionary_write,
postings_write,
@@ -130,7 +130,7 @@ pub struct FieldSerializer<'a> {
}
impl<'a> FieldSerializer<'a> {
fn new(
fn create(
field_type: &FieldType,
term_dictionary_write: &'a mut CountingWriter<WritePtr>,
postings_write: &'a mut CountingWriter<WritePtr>,
@@ -152,7 +152,7 @@ impl<'a> FieldSerializer<'a> {
_ => (false, false),
};
let term_dictionary_builder =
TermDictionaryBuilder::new(term_dictionary_write, &field_type)?;
TermDictionaryBuilder::create(term_dictionary_write, &field_type)?;
let postings_serializer =
PostingsSerializer::new(postings_write, term_freq_enabled, position_enabled);
let positions_serializer_opt = if position_enabled {

View File

@@ -1,9 +1,7 @@
mod expull;
mod memory_arena;
mod murmurhash2;
mod term_hashmap;
pub use self::expull::ExpUnrolledLinkedList;
pub use self::memory_arena::{Addr, ArenaStorable, MemoryArena};
use self::murmurhash2::murmurhash2;
pub use self::term_hashmap::{compute_table_size, TermHashMap};

View File

@@ -1,87 +0,0 @@
use std::ptr;
const SEED: u32 = 3_242_157_231u32;
const M: u32 = 0x5bd1_e995;
#[inline(always)]
pub fn murmurhash2(key: &[u8]) -> u32 {
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
let mut key_ptr: *const u32 = key.as_ptr() as *const u32;
let len = key.len() as u32;
let mut h: u32 = SEED ^ len;
let num_blocks = len >> 2;
for _ in 0..num_blocks {
let mut k: u32 = unsafe { ptr::read_unaligned(key_ptr) }; // ok because of num_blocks definition
k = k.wrapping_mul(M);
k ^= k >> 24;
k = k.wrapping_mul(M);
h = h.wrapping_mul(M);
h ^= k;
key_ptr = key_ptr.wrapping_offset(1);
}
// Handle the last few bytes of the input array
let remaining: &[u8] = &key[key.len() & !3..];
match remaining.len() {
3 => {
h ^= u32::from(remaining[2]) << 16;
h ^= u32::from(remaining[1]) << 8;
h ^= u32::from(remaining[0]);
h = h.wrapping_mul(M);
}
2 => {
h ^= u32::from(remaining[1]) << 8;
h ^= u32::from(remaining[0]);
h = h.wrapping_mul(M);
}
1 => {
h ^= u32::from(remaining[0]);
h = h.wrapping_mul(M);
}
_ => {}
}
h ^= h >> 13;
h = h.wrapping_mul(M);
h ^ (h >> 15)
}
#[cfg(test)]
mod test {
use super::murmurhash2;
use std::collections::HashSet;
#[test]
fn test_murmur() {
let s1 = "abcdef";
let s2 = "abcdeg";
for i in 0..5 {
assert_eq!(
murmurhash2(&s1[i..5].as_bytes()),
murmurhash2(&s2[i..5].as_bytes())
);
}
}
#[test]
fn test_murmur_against_reference_impl() {
assert_eq!(murmurhash2("".as_bytes()), 3632506080);
assert_eq!(murmurhash2("a".as_bytes()), 455683869);
assert_eq!(murmurhash2("ab".as_bytes()), 2448092234);
assert_eq!(murmurhash2("abc".as_bytes()), 2066295634);
assert_eq!(murmurhash2("abcd".as_bytes()), 2588571162);
assert_eq!(murmurhash2("abcde".as_bytes()), 2988696942);
assert_eq!(murmurhash2("abcdefghijklmnop".as_bytes()), 2350868870);
}
#[test]
fn test_murmur_collisions() {
let mut set: HashSet<u32> = HashSet::default();
for i in 0..10_000 {
let s = format!("hash{}", i);
let hash = murmurhash2(s.as_bytes());
set.insert(hash);
}
assert_eq!(set.len(), 10_000);
}
}

View File

@@ -1,4 +1,7 @@
use super::murmurhash2;
extern crate murmurhash32;
use self::murmurhash32::murmurhash2;
use super::{Addr, ArenaStorable, MemoryArena};
use std::iter;
use std::mem;
@@ -206,7 +209,7 @@ impl TermHashMap {
self.resize();
}
let key_bytes: &[u8] = key.as_ref();
let hash = murmurhash2::murmurhash2(key.as_ref());
let hash = murmurhash2(key.as_ref());
let mut probe = self.probe(hash);
loop {
let bucket = probe.next_probe();

View File

@@ -63,7 +63,8 @@ impl BM25Weight {
.map(|term| {
let term_doc_freq = searcher.doc_freq(term);
idf(term_doc_freq, total_num_docs)
}).sum::<f32>();
})
.sum::<f32>();
BM25Weight::new(idf, average_fieldnorm)
}

View File

@@ -47,7 +47,8 @@ impl Query for BooleanQuery {
.iter()
.map(|&(ref occur, ref subquery)| {
Ok((*occur, subquery.weight(searcher, scoring_enabled)?))
}).collect::<Result<_>>()?;
})
.collect::<Result<_>>()?;
Ok(Box::new(BooleanWeight::new(sub_weights, scoring_enabled)))
}
@@ -68,7 +69,8 @@ impl BooleanQuery {
let term_query: Box<Query> =
Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs));
(Occur::Should, term_query)
}).collect();
})
.collect();
BooleanQuery::from(occur_term_queries)
}

View File

@@ -134,7 +134,8 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
.into_iter()
.map(|(offset, postings)| {
PostingsWithOffset::new(postings, (max_offset - offset) as u32)
}).collect::<Vec<_>>();
})
.collect::<Vec<_>>();
PhraseScorer {
intersection_docset: Intersection::new(postings_with_offsets),
num_docsets,

View File

@@ -68,7 +68,8 @@ fn trim_ast(logical_ast: LogicalAST) -> Option<LogicalAST> {
.into_iter()
.flat_map(|(occur, child)| {
trim_ast(child).map(|trimmed_child| (occur, trimmed_child))
}).collect::<Vec<_>>();
})
.collect::<Vec<_>>();
if trimmed_children.is_empty() {
None
} else {
@@ -422,7 +423,8 @@ impl QueryParser {
lower: self.resolve_bound(field, &lower)?,
upper: self.resolve_bound(field, &upper)?,
})))
}).collect::<Result<Vec<_>, QueryParserError>>()?;
})
.collect::<Result<Vec<_>, QueryParserError>>()?;
let result_ast = if clauses.len() == 1 {
clauses.pop().unwrap()
} else {
@@ -598,25 +600,19 @@ mod test {
assert!(query_parser.parse_query("signed:2324").is_ok());
assert!(query_parser.parse_query("signed:\"22\"").is_ok());
assert!(query_parser.parse_query("signed:\"-2234\"").is_ok());
assert!(
query_parser
.parse_query("signed:\"-9999999999999\"")
.is_ok()
);
assert!(query_parser
.parse_query("signed:\"-9999999999999\"")
.is_ok());
assert!(query_parser.parse_query("signed:\"a\"").is_err());
assert!(query_parser.parse_query("signed:\"2a\"").is_err());
assert!(
query_parser
.parse_query("signed:\"18446744073709551615\"")
.is_err()
);
assert!(query_parser
.parse_query("signed:\"18446744073709551615\"")
.is_err());
assert!(query_parser.parse_query("unsigned:\"2\"").is_ok());
assert!(query_parser.parse_query("unsigned:\"-2\"").is_err());
assert!(
query_parser
.parse_query("unsigned:\"18446744073709551615\"")
.is_ok()
);
assert!(query_parser
.parse_query("unsigned:\"18446744073709551615\"")
.is_ok());
test_parse_query_to_logical_ast_helper(
"unsigned:2324",
"Term([0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20])",

View File

@@ -1,4 +1,3 @@
mod term_query;
mod term_scorer;
mod term_weight;

View File

@@ -55,7 +55,8 @@ where
None
}
},
).collect();
)
.collect();
Union {
docsets: non_empty_docsets,
bitsets: Box::new([TinySet::empty(); HORIZON_NUM_TINYBITSETS]),
@@ -214,10 +215,7 @@ where
// The target is outside of the buffered horizon.
// advance all docsets to a doc >= to the target.
#[cfg_attr(
feature = "cargo-clippy",
allow(clippy::clippy::collapsible_if)
)]
#[cfg_attr(feature = "cargo-clippy", allow(clippy::clippy::collapsible_if))]
unordered_drain_filter(&mut self.docsets, |docset| {
if docset.doc() < target {
if docset.skip_next(target) == SkipResult::End {

View File

@@ -6,6 +6,7 @@ use std::borrow::Cow;
use std::fmt::{self, Debug, Display, Formatter};
use std::io::{self, Read, Write};
use std::str;
use std::string::FromUtf8Error;
const SLASH_BYTE: u8 = b'/';
const ESCAPE_BYTE: u8 = b'\\';
@@ -14,6 +15,10 @@ const ESCAPE_BYTE: u8 = b'\\';
/// representation of facets.
pub const FACET_SEP_BYTE: u8 = 0u8;
/// `char` used as a level separation in the binary
/// representation of facets. (It is the null codepoint.)
pub const FACET_SEP_CHAR: char = '\u{0}';
/// A Facet represent a point in a given hierarchy.
///
/// They are typically represented similarly to a filepath.
@@ -26,18 +31,18 @@ pub const FACET_SEP_BYTE: u8 = 0u8;
/// its facet. In the example above, `/electronics/tv_and_video/`
/// and `/electronics`.
#[derive(Clone, Eq, Hash, PartialEq, Ord, PartialOrd)]
pub struct Facet(Vec<u8>);
pub struct Facet(String);
impl Facet {
/// Returns a new instance of the "root facet"
/// Equivalent to `/`.
pub fn root() -> Facet {
Facet(vec![])
Facet("".to_string())
}
/// Returns true iff the facet is the root facet `/`.
pub fn is_root(&self) -> bool {
self.encoded_bytes().is_empty()
self.encoded_str().is_empty()
}
/// Returns a binary representation of the facet.
@@ -49,13 +54,19 @@ impl Facet {
/// This representation has the benefit of making it possible to
/// express "being a child of a given facet" as a range over
/// the term ordinals.
pub fn encoded_bytes(&self) -> &[u8] {
pub fn encoded_str(&self) -> &str {
&self.0
}
pub(crate) fn from_encoded_string(facet_string: String) -> Facet {
Facet(facet_string)
}
/// Creates a `Facet` from its binary representation.
pub(crate) unsafe fn from_encoded(encoded_bytes: Vec<u8>) -> Facet {
Facet(encoded_bytes)
pub fn from_encoded(encoded_bytes: Vec<u8>) -> Result<Facet, FromUtf8Error> {
// facet bytes validation. `0u8` is used a separator but that is still legal utf-8
//Ok(Facet(String::from_utf8(encoded_bytes)?))
String::from_utf8(encoded_bytes).map(Facet)
}
/// Parse a text representation of a facet.
@@ -79,36 +90,37 @@ impl Facet {
Path: IntoIterator,
Path::Item: ToString,
{
let mut facet_bytes: Vec<u8> = Vec::with_capacity(100);
let mut facet_string: String = String::with_capacity(100);
let mut step_it = path.into_iter();
if let Some(step) = step_it.next() {
facet_bytes.extend_from_slice(step.to_string().as_bytes());
facet_string.push_str(&step.to_string());
}
for step in step_it {
facet_bytes.push(FACET_SEP_BYTE);
facet_bytes.extend_from_slice(step.to_string().as_bytes());
facet_string.push(FACET_SEP_CHAR);
facet_string.push_str(&step.to_string());
}
Facet(facet_bytes)
Facet(facet_string)
}
/// Accessor for the inner buffer of the `Facet`.
pub(crate) fn inner_buffer_mut(&mut self) -> &mut Vec<u8> {
&mut self.0
pub(crate) fn set_facet_str(&mut self, facet_str: &str) {
self.0.clear();
self.0.push_str(facet_str);
}
/// Returns `true` iff other is a subfacet of `self`.
pub fn is_prefix_of(&self, other: &Facet) -> bool {
let self_bytes: &[u8] = self.encoded_bytes();
let other_bytes: &[u8] = other.encoded_bytes();
self_bytes.len() < other_bytes.len()
&& other_bytes.starts_with(self_bytes)
&& other_bytes[self_bytes.len()] == 0u8
let self_str = self.encoded_str();
let other_str = other.encoded_str();
self_str.len() < other_str.len()
&& other_str.starts_with(self_str)
&& other_str.as_bytes()[self_str.len()] == FACET_SEP_BYTE
}
}
impl Borrow<[u8]> for Facet {
fn borrow(&self) -> &[u8] {
self.encoded_bytes()
impl Borrow<str> for Facet {
fn borrow(&self) -> &str {
self.encoded_str()
}
}
@@ -120,45 +132,51 @@ impl<'a, T: ?Sized + AsRef<str>> From<&'a T> for Facet {
Idle,
}
let path: &str = path_asref.as_ref();
let mut facet_encoded = Vec::new();
assert!(!path.is_empty());
assert!(path.starts_with("/"));
let mut facet_encoded = String::new();
let mut state = State::Idle;
let path_bytes = path.as_bytes();
for &c in &path_bytes[1..] {
let mut last_offset = 1;
for i in 1..path_bytes.len() {
let c = path_bytes[i];
match (state, c) {
(State::Idle, ESCAPE_BYTE) => state = State::Escaped,
(State::Idle, ESCAPE_BYTE) => {
facet_encoded.push_str(&path[last_offset..i]);
last_offset = i + 1;
state = State::Escaped
}
(State::Idle, SLASH_BYTE) => {
facet_encoded.push(FACET_SEP_BYTE);
facet_encoded.push_str(&path[last_offset..i]);
facet_encoded.push(FACET_SEP_CHAR);
last_offset = i + 1;
}
(State::Escaped, any_char) => {
(State::Escaped, _escaped_char) => {
state = State::Idle;
facet_encoded.push(any_char);
}
(State::Idle, other_char) => {
facet_encoded.push(other_char);
}
(State::Idle, _any_char) => {}
}
}
facet_encoded.push_str(&path[last_offset..]);
Facet(facet_encoded)
}
}
impl BinarySerializable for Facet {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
<Vec<u8> as BinarySerializable>::serialize(&self.0, writer)
<String as BinarySerializable>::serialize(&self.0, writer)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let bytes = <Vec<u8> as BinarySerializable>::deserialize(reader)?;
Ok(Facet(bytes))
Ok(Facet(<String as BinarySerializable>::deserialize(reader)?))
}
}
impl Display for Facet {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
for step in self.0.split(|&b| b == FACET_SEP_BYTE) {
for step in self.0.split(FACET_SEP_CHAR) {
write!(f, "/")?;
let step_str = unsafe { str::from_utf8_unchecked(step) };
write!(f, "{}", escape_slashes(step_str))?;
write!(f, "{}", escape_slashes(step))?;
}
Ok(())
}

View File

@@ -232,12 +232,14 @@ impl Schema {
let field_entry = self.get_field_entry(field);
let field_type = field_entry.field_type();
match *json_value {
JsonValue::Array(ref json_items) => for json_item in json_items {
let value = field_type
.value_from_json(json_item)
.map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?;
doc.add(FieldValue::new(field, value));
},
JsonValue::Array(ref json_items) => {
for json_item in json_items {
let value = field_type.value_from_json(json_item).map_err(|e| {
DocParsingError::ValueError(field_name.clone(), e)
})?;
doc.add(FieldValue::new(field, value));
}
}
_ => {
let value = field_type
.value_from_json(json_value)
@@ -446,7 +448,8 @@ mod tests {
"count": 4,
"popularity": 10
}"#,
).unwrap();
)
.unwrap();
assert_eq!(doc.get_first(title_field).unwrap().text(), Some("my title"));
assert_eq!(
doc.get_first(author_field).unwrap().text(),

View File

@@ -32,7 +32,7 @@ impl Term {
/// Creates a `Term` given a facet.
pub fn from_facet(field: Field, facet: &Facet) -> Term {
let bytes = facet.encoded_bytes();
let bytes = facet.encoded_str().as_bytes();
let buffer = Vec::with_capacity(4 + bytes.len());
let mut term = Term(buffer);
term.set_field(field);
@@ -68,12 +68,7 @@ impl Term {
term
}
/// Creates a new Term with an empty buffer,
/// but with a given capacity.
///
/// It is declared unsafe, as the term content
/// is not initialized, and a call to `.field()`
/// would panic.
/// Creates a new Term for a given field.
pub(crate) fn for_field(field: Field) -> Term {
let mut term = Term(Vec::with_capacity(100));
term.set_field(field);

View File

@@ -192,7 +192,8 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
item.start - fragment.start_offset,
item.stop - fragment.start_offset,
)
}).collect();
})
.collect();
Snippet {
fragments: fragment_text.to_string(),
highlighted,
@@ -242,7 +243,7 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
/// let query = query_parser.parse_query("haleurs flamands").unwrap();
/// # index.load_searchers()?;
/// # let searcher = index.searcher();
/// let mut snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field)?;
/// let mut snippet_generator = SnippetGenerator::create(&searcher, &*query, text_field)?;
/// snippet_generator.set_max_num_chars(100);
/// let snippet = snippet_generator.snippet_from_doc(&doc);
/// let snippet_html: String = snippet.to_html();
@@ -259,7 +260,7 @@ pub struct SnippetGenerator {
impl SnippetGenerator {
/// Creates a new snippet generator
pub fn new(searcher: &Searcher, query: &Query, field: Field) -> Result<SnippetGenerator> {
pub fn create(searcher: &Searcher, query: &Query, field: Field) -> Result<SnippetGenerator> {
let mut terms = BTreeSet::new();
query.query_terms(&mut terms);
let terms_text: BTreeMap<String, f32> = terms
@@ -273,7 +274,8 @@ impl SnippetGenerator {
} else {
None
}
}).collect();
})
.collect();
let tokenizer = searcher.index().tokenizer_for_field(field)?;
Ok(SnippetGenerator {
terms_text,
@@ -532,12 +534,14 @@ Survey in 2016, 2017, and 2018."#;
let query_parser = QueryParser::for_index(&index, vec![text_field]);
{
let query = query_parser.parse_query("e").unwrap();
let snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap();
let snippet_generator =
SnippetGenerator::create(&searcher, &*query, text_field).unwrap();
assert!(snippet_generator.terms_text().is_empty());
}
{
let query = query_parser.parse_query("a").unwrap();
let snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap();
let snippet_generator =
SnippetGenerator::create(&searcher, &*query, text_field).unwrap();
assert_eq!(
&btreemap!("a".to_string() => 0.25f32),
snippet_generator.terms_text()
@@ -545,7 +549,8 @@ Survey in 2016, 2017, and 2018."#;
}
{
let query = query_parser.parse_query("a b").unwrap();
let snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap();
let snippet_generator =
SnippetGenerator::create(&searcher, &*query, text_field).unwrap();
assert_eq!(
&btreemap!("a".to_string() => 0.25f32, "b".to_string() => 0.5),
snippet_generator.terms_text()
@@ -553,7 +558,8 @@ Survey in 2016, 2017, and 2018."#;
}
{
let query = query_parser.parse_query("a b c").unwrap();
let snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap();
let snippet_generator =
SnippetGenerator::create(&searcher, &*query, text_field).unwrap();
assert_eq!(
&btreemap!("a".to_string() => 0.25f32, "b".to_string() => 0.5),
snippet_generator.terms_text()
@@ -585,7 +591,8 @@ Survey in 2016, 2017, and 2018."#;
let searcher = index.searcher();
let query_parser = QueryParser::for_index(&index, vec![text_field]);
let query = query_parser.parse_query("rust design").unwrap();
let mut snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap();
let mut snippet_generator =
SnippetGenerator::create(&searcher, &*query, text_field).unwrap();
{
let snippet = snippet_generator.snippet(TEST_TEXT);
assert_eq!(snippet.to_html(), "imperative-procedural paradigms. <b>Rust</b> is syntactically similar to C++[according to whom?],\nbut its <b>designers</b> intend it to provide better memory safety");

View File

@@ -80,6 +80,7 @@ pub struct SegmentSpaceUsage {
}
impl SegmentSpaceUsage {
#[allow(clippy::too_many_arguments)]
pub(crate) fn new(
num_docs: u32,
termdict: PerFieldSpaceUsage,

View File

@@ -95,10 +95,7 @@ impl StoreReader {
}
}
#[cfg_attr(
feature = "cargo-clippy",
allow(clippy::needless_pass_by_value)
)]
#[cfg_attr(feature = "cargo-clippy", allow(clippy::needless_pass_by_value))]
fn split_source(data: ReadOnlySource) -> (ReadOnlySource, ReadOnlySource, DocId) {
let data_len = data.len();
let footer_offset = data_len - size_of::<u64>() - size_of::<u32>();

View File

@@ -53,7 +53,8 @@ impl<'a> TermMerger<'a> {
.map(|(ord, streamer)| HeapItem {
streamer,
segment_ord: ord,
}).collect(),
})
.collect(),
}
}
@@ -122,10 +123,7 @@ impl<'a> TermMerger<'a> {
}
/// Iterates through terms
#[cfg_attr(
feature = "cargo-clippy",
allow(clippy::should_implement_trait)
)]
#[cfg_attr(feature = "cargo-clippy", allow(clippy::should_implement_trait))]
pub fn next(&mut self) -> Option<Term<&[u8]>> {
if self.advance() {
Some(Term::wrap(self.current_streamers[0].streamer.key()))

View File

@@ -66,7 +66,7 @@ mod tests {
let write = directory.open_write(&path).unwrap();
let field_type = FieldType::Str(TEXT);
let mut term_dictionary_builder =
TermDictionaryBuilder::new(write, &field_type).unwrap();
TermDictionaryBuilder::create(write, &field_type).unwrap();
for term in COUNTRIES.iter() {
term_dictionary_builder
.insert(term.as_bytes(), &make_term_info(0u64))
@@ -92,7 +92,7 @@ mod tests {
let write = directory.open_write(&path).unwrap();
let field_type = FieldType::Str(TEXT);
let mut term_dictionary_builder =
TermDictionaryBuilder::new(write, &field_type).unwrap();
TermDictionaryBuilder::create(write, &field_type).unwrap();
term_dictionary_builder
.insert("abc".as_bytes(), &make_term_info(34u64))
.unwrap();
@@ -167,7 +167,7 @@ mod tests {
let mut term_string = String::new();
while term_it.advance() {
//let term = Term::from_bytes(term_it.key());
term_string.push_str(unsafe { str::from_utf8_unchecked(term_it.key()) }); // ok test
term_string.push_str(str::from_utf8(term_it.key()).expect("test"));
}
assert_eq!(&*term_string, "abcdef");
}
@@ -180,7 +180,7 @@ mod tests {
let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
let mut term_dictionary_builder =
TermDictionaryBuilder::new(vec![], &field_type).unwrap();
TermDictionaryBuilder::create(vec![], &field_type).unwrap();
for &(ref id, ref i) in &ids {
term_dictionary_builder
.insert(id.as_bytes(), &make_term_info(*i as u64))
@@ -210,7 +210,7 @@ mod tests {
let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
let mut term_dictionary_builder =
TermDictionaryBuilder::new(vec![], &field_type).unwrap();
TermDictionaryBuilder::create(vec![], &field_type).unwrap();
// term requires more than 16bits
term_dictionary_builder
.insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1))
@@ -245,7 +245,7 @@ mod tests {
let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
let mut term_dictionary_builder =
TermDictionaryBuilder::new(vec![], &field_type).unwrap();
TermDictionaryBuilder::create(vec![], &field_type).unwrap();
for &(ref id, ref i) in &ids {
term_dictionary_builder
.insert(id.as_bytes(), &make_term_info(*i as u64))
@@ -314,7 +314,7 @@ mod tests {
let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
let mut term_dictionary_builder =
TermDictionaryBuilder::new(vec![], &field_type).unwrap();
TermDictionaryBuilder::create(vec![], &field_type).unwrap();
term_dictionary_builder
.insert(&[], &make_term_info(1 as u64))
.unwrap();
@@ -338,7 +338,7 @@ mod tests {
let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
let mut term_dictionary_builder =
TermDictionaryBuilder::new(vec![], &field_type).unwrap();
TermDictionaryBuilder::create(vec![], &field_type).unwrap();
for i in 0u8..10u8 {
let number_arr = [i; 1];
term_dictionary_builder
@@ -408,7 +408,7 @@ mod tests {
let write = directory.open_write(&path).unwrap();
let field_type = FieldType::Str(TEXT);
let mut term_dictionary_builder =
TermDictionaryBuilder::new(write, &field_type).unwrap();
TermDictionaryBuilder::create(write, &field_type).unwrap();
for term in COUNTRIES.iter() {
term_dictionary_builder
.insert(term.as_bytes(), &make_term_info(0u64))

View File

@@ -132,10 +132,7 @@ where
}
/// Return the next `(key, value)` pair.
#[cfg_attr(
feature = "cargo-clippy",
allow(clippy::should_implement_trait)
)]
#[cfg_attr(feature = "cargo-clippy", allow(clippy::should_implement_trait))]
pub fn next(&mut self) -> Option<(&[u8], &TermInfo)> {
if self.advance() {
Some((self.key(), self.value()))

View File

@@ -29,7 +29,7 @@ where
W: Write,
{
/// Creates a new `TermDictionaryBuilder`
pub fn new(w: W, _field_type: &FieldType) -> io::Result<Self> {
pub fn create(w: W, _field_type: &FieldType) -> io::Result<Self> {
let fst_builder = fst::MapBuilder::new(w).map_err(convert_fst_error)?;
Ok(TermDictionaryBuilder {
fst_builder,
@@ -132,7 +132,7 @@ impl TermDictionary {
/// Creates an empty term dictionary which contains no terms.
pub fn empty(field_type: &FieldType) -> Self {
let term_dictionary_data: Vec<u8> =
TermDictionaryBuilder::new(Vec::<u8>::new(), &field_type)
TermDictionaryBuilder::create(Vec::<u8>::new(), &field_type)
.expect("Creating a TermDictionaryBuilder in a Vec<u8> should never fail")
.finish()
.expect("Writing in a Vec<u8> should never fail");

View File

@@ -1,6 +1,5 @@
use super::{Token, TokenStream, Tokenizer};
use schema::FACET_SEP_BYTE;
use std::str;
/// The `FacetTokenizer` process a `Facet` binary representation
/// and emits a token for all of its parent.
@@ -57,12 +56,11 @@ impl<'a> TokenStream for FacetTokenStream<'a> {
.position(|b| b == FACET_SEP_BYTE)
.map(|pos| cursor + 1 + pos)
{
let facet_part =
unsafe { str::from_utf8_unchecked(&bytes[cursor..next_sep_pos]) };
let facet_part = &self.text[cursor..next_sep_pos];
self.token.text.push_str(facet_part);
self.state = State::UpToPosition(next_sep_pos);
} else {
let facet_part = unsafe { str::from_utf8_unchecked(&bytes[cursor..]) };
let facet_part = &self.text[cursor..];
self.token.text.push_str(facet_part);
self.state = State::Terminated;
}
@@ -86,7 +84,6 @@ mod tests {
use super::FacetTokenizer;
use schema::Facet;
use std::str;
use tokenizer::{Token, TokenStream, Tokenizer};
#[test]
@@ -95,11 +92,11 @@ mod tests {
let mut tokens = vec![];
{
let mut add_token = |token: &Token| {
let facet = unsafe { Facet::from_encoded(token.text.as_bytes().to_owned()) }; // ok test
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap();
tokens.push(format!("{}", facet));
};
FacetTokenizer
.token_stream(unsafe { str::from_utf8_unchecked(facet.encoded_bytes()) })
.token_stream(facet.encoded_str())
.process(&mut add_token);
}
assert_eq!(tokens.len(), 4);
@@ -115,11 +112,11 @@ mod tests {
let mut tokens = vec![];
{
let mut add_token = |token: &Token| {
let facet = unsafe { Facet::from_encoded(token.text.as_bytes().to_owned()) }; // ok test
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap(); // ok test
tokens.push(format!("{}", facet));
};
FacetTokenizer
.token_stream(unsafe { str::from_utf8_unchecked(facet.encoded_bytes()) }) // ok test
.token_stream(facet.encoded_str()) // ok test
.process(&mut add_token);
}
assert_eq!(tokens.len(), 1);