Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2025-12-28 04:52:55 +00:00)

Compare commits — 15 commits
| Author | SHA1 | Date |
|---|---|---|
|  | 176f67a266 |  |
|  | 19babff849 |  |
|  | bf2576adf9 |  |
|  | b8241c5603 |  |
|  | a4745151c0 |  |
|  | e2ce326a8c |  |
|  | bb21d12a70 |  |
|  | 4565aba62a |  |
|  | 545a7ec8dd |  |
|  | e68775d71c |  |
|  | dcc92d287e |  |
|  | b48f81c051 |  |
|  | a3042e956b |  |
|  | 1fa10f0a0b |  |
|  | 279a9eb5e3 |  |
@@ -1,4 +1,4 @@
Tantivy 0.8.1
Tantivy 0.8.0
=====================
*No change in the index format*
- API Breaking change in the collector API. (@jwolfe, @fulmicoton)
@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.8.0-dev"
version = "0.8.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]

@@ -29,7 +29,7 @@ serde = "1.0"
serde_derive = "1.0"
serde_json = "1.0"
num_cpus = "1.2"
itertools = "0.7"
itertools = "0.8"
levenshtein_automata = {version="0.1", features=["fst_automaton"]}
bit-set = "0.5"
uuid = { version = "0.7", features = ["v4", "serde"] }

@@ -49,6 +49,7 @@ failure = "0.1"
htmlescape = "0.3.1"
fail = "0.2"
scoped-pool = "1.0"
murmurhash32 = "0.2"

[target.'cfg(windows)'.dependencies]
winapi = "0.2"
@@ -106,37 +106,37 @@ fn main() -> tantivy::Result<()> {
// For convenience, tantivy also comes with a macro to
// reduce the boilerplate above.
index_writer.add_document(doc!(
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));

index_writer.add_document(doc!(
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));

// Multivalued field just need to be repeated.
index_writer.add_document(doc!(
title => "Frankenstein",
title => "The Modern Prometheus",
body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking."
));

// This is an example, so we will only index 3 documents
@@ -68,12 +68,12 @@ fn main() -> tantivy::Result<()> {
// heap for the indexer can increase its throughput.
let mut index_writer = index.writer(50_000_000)?;
index_writer.add_document(doc!(
title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish."
));
index_writer.add_document(doc!(
title => "Of Mice and Men",
body => r#"A few miles south of Soledad, the Salinas River drops in close to the hillside
bank and runs deep and green. The water is warm too, for it has slipped twinkling
over the yellow sands in the sunlight before reaching the narrow pool. On one

@@ -84,7 +84,7 @@ fn main() -> tantivy::Result<()> {
limbs and branches that arch over the pool"#
));
index_writer.add_document(doc!(
title => "Frankenstein",
body => r#"You will rejoice to hear that no disaster has accompanied the commencement of an
enterprise which you have regarded with such evil forebodings. I arrived here
yesterday, and my first task is to assure my dear sister of my welfare and
@@ -35,15 +35,15 @@ fn main() -> tantivy::Result<()> {

// we'll only need one doc for this example.
index_writer.add_document(doc!(
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));
// ...
index_writer.commit()?;

@@ -56,7 +56,7 @@ fn main() -> tantivy::Result<()> {

let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;

let snippet_generator = SnippetGenerator::new(&searcher, &*query, body)?;
let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;

for (score, doc_address) in top_docs {
let doc = searcher.doc(doc_address)?;
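The only API change in this hunk is the constructor rename from `SnippetGenerator::new` to `SnippetGenerator::create`. As a rough usage sketch of the renamed constructor in the snippet example's context: the `searcher`, `query`, `body` field and `top_docs` are assumed to come from the example's surrounding setup, and `snippet_from_doc` / `to_html` are assumed helpers of the snippet module rather than APIs confirmed by this diff.

```rust
// Sketch only, under the assumptions stated above.
let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;
for (_score, doc_address) in top_docs {
    let doc = searcher.doc(doc_address)?;
    // Assumed helpers: build a snippet for this document and render it as HTML.
    let snippet = snippet_generator.snippet_from_doc(&doc);
    println!("{}", snippet.to_html());
}
```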
@@ -72,26 +72,26 @@ fn main() -> tantivy::Result<()> {
title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish."
));

index_writer.add_document(doc!(
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));

index_writer.add_document(doc!(
title => "Frankenstein",
body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking."
));

index_writer.commit()?;
@@ -197,7 +197,7 @@ fn skip<'a, I: Iterator<Item = &'a Facet>>(
) -> SkipResult {
loop {
match collapse_it.peek() {
Some(facet_bytes) => match facet_bytes.encoded_bytes().cmp(target) {
Some(facet_bytes) => match facet_bytes.encoded_str().as_bytes().cmp(target) {
Ordering::Less => {}
Ordering::Greater => {
return SkipResult::OverStep;

@@ -369,7 +369,8 @@ impl SegmentCollector for FacetSegmentCollector {
let mut facet = vec![];
let facet_ord = self.collapse_facet_ords[collapsed_facet_ord];
facet_dict.ord_to_term(facet_ord as u64, &mut facet);
facet_counts.insert(unsafe { Facet::from_encoded(facet) }, count);
// TODO
facet_counts.insert(Facet::from_encoded(facet).unwrap(), count);
}
FacetCounts { facet_counts }
}

@@ -403,9 +404,9 @@ impl FacetCounts {
let right_bound = if facet.is_root() {
Bound::Unbounded
} else {
let mut facet_after_bytes: Vec<u8> = facet.encoded_bytes().to_owned();
facet_after_bytes.push(1u8);
let facet_after = unsafe { Facet::from_encoded(facet_after_bytes) }; // ok logic
let mut facet_after_bytes: String = facet.encoded_str().to_owned();
facet_after_bytes.push('\u{1}');
let facet_after = Facet::from_encoded_string(facet_after_bytes);
Bound::Excluded(facet_after)
};
let underlying: btree_map::Range<_, _> = self.facet_counts.range((left_bound, right_bound));

@@ -474,7 +475,8 @@ mod tests {
n /= 4;
let leaf = n % 5;
Facet::from(&format!("/top{}/mid{}/leaf{}", top, mid, leaf))
}).collect();
})
.collect();
for i in 0..num_facets * 10 {
let mut doc = Document::new();
doc.add_facet(facet_field, facets[i % num_facets].clone());

@@ -500,18 +502,16 @@ mod tests {
("/top1/mid2", 50),
("/top1/mid3", 50),
]
.iter()
.map(|&(facet_str, count)| (String::from(facet_str), count))
.collect::<Vec<_>>()
);
}
}

#[test]
#[should_panic(
expected = "Tried to add a facet which is a descendant of \
an already added facet."
)]
#[should_panic(expected = "Tried to add a facet which is a descendant of \
an already added facet.")]
fn test_misused_facet_collector() {
let mut facet_collector = FacetCollector::for_field(Field(0));
facet_collector.add_facet(Facet::from("/country"));

@@ -563,13 +563,15 @@ mod tests {
let facet = Facet::from(&format!("/facet/{}", c));
let doc = doc!(facet_field => facet);
iter::repeat(doc).take(count)
}).map(|mut doc| {
})
.map(|mut doc| {
doc.add_facet(
facet_field,
&format!("/facet/{}", thread_rng().sample(&uniform)),
);
doc
}).collect();
})
.collect();
docs[..].shuffle(&mut thread_rng());

let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
@@ -43,7 +43,8 @@ impl<TCollector: Collector> Collector for CollectorWrapper<TCollector> {
let err_msg = format!("Failed to cast child collector fruit. {:?}", e);
TantivyError::InvalidArgument(err_msg)
})
}).collect::<Result<_>>()?;
})
.collect::<Result<_>>()?;
let merged_fruit = self.0.merge_fruits(typed_fruit)?;
Ok(Box::new(merged_fruit))
}

@@ -147,6 +148,8 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
/// Ok(())
/// }
/// ```
#[allow(clippy::type_complexity)]
#[derive(Default)]
pub struct MultiCollector<'a> {
collector_wrappers:
Vec<Box<Collector<Child = Box<BoxableSegmentCollector>, Fruit = Box<Fruit>> + 'a>>,

@@ -154,10 +157,8 @@ pub struct MultiCollector<'a> {

impl<'a> MultiCollector<'a> {
/// Create a new `MultiCollector`
pub fn new() -> MultiCollector<'a> {
MultiCollector {
collector_wrappers: Vec::new(),
}
pub fn new() -> Self {
Default::default()
}

/// Add a new collector to our `MultiCollector`.

@@ -213,7 +214,8 @@ impl<'a> Collector for MultiCollector<'a> {
.zip(segment_fruits_list)
.map(|(child_collector, segment_fruits)| {
Ok(Some(child_collector.merge_fruits(segment_fruits)?))
}).collect::<Result<_>>()?;
})
.collect::<Result<_>>()?;
Ok(MultiFruit { sub_fruits })
}
}
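For context on how the wrapped collectors above are meant to be consumed, here is a purely hypothetical usage sketch of `MultiCollector` with fruit handles. The `add_collector` and `extract` method names, and the `TopDocs` / `Count` collectors, are assumptions based on the `FruitHandle` type visible in this hunk; they are not confirmed by this diff.

```rust
// Hypothetical sketch: run two collectors in a single search pass.
let mut multi_collector = MultiCollector::new();
let top_docs_handle = multi_collector.add_collector(TopDocs::with_limit(10));
let count_handle = multi_collector.add_collector(Count);

// One pass over the index produces a combined "multi fruit" ...
let mut fruits = searcher.search(&query, &multi_collector)?;

// ... from which each handle pulls back its own typed result.
let top_docs = top_docs_handle.extract(&mut fruits);
let count = count_handle.extract(&mut fruits);
```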
@@ -84,11 +84,9 @@ where
for (feature, doc) in child_fruit {
if top_collector.len() < self.limit {
top_collector.push(ComparableDoc { feature, doc });
} else {
if let Some(mut head) = top_collector.peek_mut() {
if head.feature < feature {
*head = ComparableDoc { feature, doc };
}
} else if let Some(mut head) = top_collector.peek_mut() {
if head.feature < feature {
*head = ComparableDoc { feature, doc };
}
}
}

@@ -142,7 +140,8 @@ impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
comparable_doc.feature,
DocAddress(segment_id, comparable_doc.doc),
)
}).collect()
})
.collect()
}

/// Return true iff at least K documents have gone through
@@ -1,9 +1,6 @@
use common::serialize::BinarySerializable;
use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
use std::io;
use std::io::Write;
use std::mem;
use std::ops::Deref;
use std::ptr;

pub(crate) struct BitPacker {
mini_buffer: u64,

@@ -18,7 +15,7 @@ impl BitPacker {
}
}

pub fn write<TWrite: Write>(
pub fn write<TWrite: io::Write>(
&mut self,
val: u64,
num_bits: u8,

@@ -28,14 +25,14 @@ impl BitPacker {
let num_bits = num_bits as usize;
if self.mini_buffer_written + num_bits > 64 {
self.mini_buffer |= val_u64.wrapping_shl(self.mini_buffer_written as u32);
self.mini_buffer.serialize(output)?;
output.write_u64::<LittleEndian>(self.mini_buffer)?;
self.mini_buffer = val_u64.wrapping_shr((64 - self.mini_buffer_written) as u32);
self.mini_buffer_written = self.mini_buffer_written + num_bits - 64;
} else {
self.mini_buffer |= val_u64 << self.mini_buffer_written;
self.mini_buffer_written += num_bits;
if self.mini_buffer_written == 64 {
self.mini_buffer.serialize(output)?;
output.write_u64::<LittleEndian>(self.mini_buffer)?;
self.mini_buffer_written = 0;
self.mini_buffer = 0u64;
}

@@ -43,17 +40,18 @@ impl BitPacker {
Ok(())
}

pub fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
pub fn flush<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
if self.mini_buffer_written > 0 {
let num_bytes = (self.mini_buffer_written + 7) / 8;
let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer.to_le()) };
let mut arr: [u8; 8] = [0u8; 8];
LittleEndian::write_u64(&mut arr, self.mini_buffer);
output.write_all(&arr[..num_bytes])?;
self.mini_buffer_written = 0;
}
Ok(())
}

pub fn close<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
pub fn close<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
self.flush(output)?;
// Padding the write file to simplify reads.
output.write_all(&[0u8; 7])?;

@@ -102,9 +100,7 @@ where
addr + 8 <= data.len(),
"The fast field field should have been padded with 7 bytes."
);
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
let val_unshifted_unmasked: u64 =
u64::from_le(unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) });
let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[addr..]);
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
val_shifted & mask
}

@@ -126,9 +122,7 @@ where
for output_val in output.iter_mut() {
let addr = addr_in_bits >> 3;
let bit_shift = addr_in_bits & 7;
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
let val_unshifted_unmasked: u64 =
unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[addr..]);
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
*output_val = val_shifted & mask;
addr_in_bits += num_bits;
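The `flush` change above replaces a `mem::transmute` of the mini buffer with `byteorder`'s safe `LittleEndian::write_u64`. A small self-contained sketch of that pattern, independent of tantivy's own types:

```rust
use byteorder::{ByteOrder, LittleEndian};

// Same idea as the BitPacker::flush change: serialize a u64 into a
// little-endian byte array without any unsafe code.
fn le_bytes(value: u64) -> [u8; 8] {
    let mut buf = [0u8; 8];
    LittleEndian::write_u64(&mut buf, value);
    buf
}

fn main() {
    assert_eq!(le_bytes(1), [1, 0, 0, 0, 0, 0, 0, 0]);
}
```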
@@ -64,17 +64,18 @@ impl Executor {
// This is important as it makes it possible for the fruit_receiver iteration to
// terminate.
};
let mut results = Vec::with_capacity(num_fruits);
unsafe { results.set_len(num_fruits) };
let mut num_items = 0;
// This is lame, but it does not use unsafe code.
let mut results_with_position = Vec::with_capacity(num_fruits);
for (pos, fruit_res) in fruit_receiver {
results[pos] = fruit_res?;
num_items += 1;
let fruit = fruit_res?;
results_with_position.push((pos, fruit));
}
// this checks ensures that we filled of this
// uninitialized memory.
assert_eq!(num_items, results.len());
Ok(results)
results_with_position.sort_by_key(|(pos, _)| *pos);
assert_eq!(results_with_position.len(), num_fruits);
Ok(results_with_position
.into_iter()
.map(|(_, fruit)| fruit)
.collect::<Vec<_>>())
}
}
}

@@ -94,7 +95,8 @@ mod tests {
panic!("panic should propagate");
},
vec![0].into_iter(),
).unwrap();
)
.unwrap();
}

#[test]

@@ -106,7 +108,8 @@ mod tests {
panic!("panic should propagate");
},
vec![0].into_iter(),
).unwrap();
)
.unwrap();
}

#[test]
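The Executor change above drops the `unsafe { results.set_len(num_fruits) }` pre-allocation in favor of collecting `(position, fruit)` pairs and sorting them by position afterwards. A standalone sketch of the same safe pattern, using plain std channels and threads rather than tantivy's executor:

```rust
use std::sync::mpsc;
use std::thread;

fn main() {
    let (tx, rx) = mpsc::channel();
    // Workers finish in arbitrary order, but each result carries its position.
    for pos in 0..4usize {
        let tx = tx.clone();
        thread::spawn(move || tx.send((pos, pos * 10)).unwrap());
    }
    drop(tx); // close the channel so the receiver iteration terminates

    // Collect out-of-order results, then restore the original order by position.
    let mut results_with_position: Vec<(usize, usize)> = rx.into_iter().collect();
    results_with_position.sort_by_key(|(pos, _)| *pos);
    let results: Vec<usize> = results_with_position.into_iter().map(|(_, r)| r).collect();
    assert_eq!(results, vec![0, 10, 20, 30]);
}
```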
@@ -13,6 +13,7 @@ use directory::ManagedDirectory;
#[cfg(feature = "mmap")]
use directory::MmapDirectory;
use directory::{Directory, RAMDirectory};
use error::DataCorruption;
use error::TantivyError;
use indexer::index_writer::open_index_writer;
use indexer::index_writer::HEAP_SIZE_MIN;

@@ -37,7 +38,13 @@ fn load_metas(directory: &Directory) -> Result<IndexMeta> {
let meta_data = directory.atomic_read(&META_FILEPATH)?;
let meta_string = String::from_utf8_lossy(&meta_data);
serde_json::from_str(&meta_string)
.map_err(|_| TantivyError::CorruptedFile(META_FILEPATH.clone()))
.map_err(|e| {
DataCorruption::new(
META_FILEPATH.clone(),
format!("Meta file cannot be deserialized. {:?}.", e),
)
})
.map_err(From::from)
}

/// Search Index

@@ -135,7 +142,7 @@ impl Index {

/// Creates a new index given an implementation of the trait `Directory`
pub fn create<Dir: Directory>(dir: Dir, schema: Schema) -> Result<Index> {
let directory = ManagedDirectory::new(dir)?;
let directory = ManagedDirectory::wrap(dir)?;
Index::from_directory(directory, schema)
}

@@ -143,7 +150,7 @@ impl Index {
///
/// This will overwrite existing meta.json
fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
save_new_metas(schema.clone(), 0, directory.borrow_mut())?;
save_new_metas(schema.clone(), directory.borrow_mut())?;
let metas = IndexMeta::with_schema(schema);
Index::create_from_metas(directory, &metas)
}

@@ -199,7 +206,7 @@ impl Index {

/// Open the index using the provided directory
pub fn open<D: Directory>(directory: D) -> Result<Index> {
let directory = ManagedDirectory::new(directory)?;
let directory = ManagedDirectory::wrap(directory)?;
let metas = load_metas(&directory)?;
Index::create_from_metas(directory, &metas)
}
@@ -32,10 +32,7 @@ pub struct InvertedIndexReader {
}

impl InvertedIndexReader {
#[cfg_attr(
feature = "cargo-clippy",
allow(clippy::needless_pass_by_value)
)] // for symetry
#[cfg_attr(feature = "cargo-clippy", allow(clippy::needless_pass_by_value))] // for symetry
pub(crate) fn new(
termdict: TermDictionary,
postings_source: ReadOnlySource,

@@ -104,7 +104,8 @@ impl Searcher {
.iter()
.map(|segment_reader| {
u64::from(segment_reader.inverted_index(term.field()).doc_freq(term))
}).sum::<u64>()
})
.sum::<u64>()
}

/// Return the list of segment readers
@@ -1,7 +1,7 @@
use core::MANAGED_FILEPATH;
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
use directory::{ReadOnlySource, WritePtr};
use error::TantivyError;
use error::DataCorruption;
use indexer::LockType;
use serde_json;
use std::collections::HashSet;

@@ -59,12 +59,17 @@ fn save_managed_paths(

impl ManagedDirectory {
/// Wraps a directory as managed directory.
pub fn new<Dir: Directory>(directory: Dir) -> Result<ManagedDirectory> {
pub fn wrap<Dir: Directory>(directory: Dir) -> Result<ManagedDirectory> {
match directory.atomic_read(&MANAGED_FILEPATH) {
Ok(data) => {
let managed_files_json = String::from_utf8_lossy(&data);
let managed_files: HashSet<PathBuf> = serde_json::from_str(&managed_files_json)
.map_err(|_| TantivyError::CorruptedFile(MANAGED_FILEPATH.clone()))?;
.map_err(|e| {
DataCorruption::new(
MANAGED_FILEPATH.clone(),
format!("Managed file cannot be deserialized: {:?}. ", e),
)
})?;
Ok(ManagedDirectory {
directory: Box::new(directory),
meta_informations: Arc::new(RwLock::new(MetaInformation {

@@ -260,7 +265,7 @@ mod tests {
let tempdir_path = PathBuf::from(tempdir.path());
{
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
{
let mut write_file = managed_directory.open_write(*TEST_PATH1).unwrap();
write_file.flush().unwrap();

@@ -286,7 +291,7 @@ mod tests {
}
{
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
{
assert!(managed_directory.exists(*TEST_PATH1));
assert!(!managed_directory.exists(*TEST_PATH2));

@@ -310,7 +315,7 @@ mod tests {
let living_files = HashSet::new();

let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
managed_directory
.atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
.unwrap();

@@ -100,7 +100,8 @@ impl InnerDirectory {
);
let io_err = make_io_err(msg);
OpenReadError::IOError(IOError::with_path(path.to_owned(), io_err))
}).and_then(|readable_map| {
})
.and_then(|readable_map| {
readable_map
.get(path)
.ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))

@@ -120,7 +121,8 @@ impl InnerDirectory {
);
let io_err = make_io_err(msg);
DeleteError::IOError(IOError::with_path(path.to_owned(), io_err))
}).and_then(|mut writable_map| match writable_map.remove(path) {
})
.and_then(|mut writable_map| match writable_map.remove(path) {
Some(_) => Ok(()),
None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))),
})
src/error.rs
@@ -8,9 +8,42 @@ use indexer::LockType;
use query;
use schema;
use serde_json;
use std::fmt;
use std::path::PathBuf;
use std::sync::PoisonError;

pub struct DataCorruption {
filepath: Option<PathBuf>,
comment: String,
}

impl DataCorruption {
pub fn new(filepath: PathBuf, comment: String) -> DataCorruption {
DataCorruption {
filepath: Some(filepath),
comment,
}
}

pub fn comment_only(comment: String) -> DataCorruption {
DataCorruption {
filepath: None,
comment,
}
}
}

impl fmt::Debug for DataCorruption {
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
write!(f, "Data corruption: ")?;
if let Some(ref filepath) = &self.filepath {
write!(f, "(in file `{:?}`)", filepath)?;
}
write!(f, ": {}.", self.comment)?;
Ok(())
}
}

/// The library's failure based error enum
#[derive(Debug, Fail)]
pub enum TantivyError {

@@ -33,8 +66,8 @@ pub enum TantivyError {
#[fail(display = "An IO error occurred: '{}'", _0)]
IOError(#[cause] IOError),
/// Data corruption.
#[fail(display = "File contains corrupted data: '{:?}'", _0)]
CorruptedFile(PathBuf),
#[fail(display = "{:?}", _0)]
DataCorruption(DataCorruption),
/// A thread holding the locked panicked and poisoned the lock.
#[fail(display = "A thread holding the locked panicked and poisoned the lock")]
Poisoned,

@@ -55,6 +88,12 @@ pub enum TantivyError {
SystemError(String),
}

impl From<DataCorruption> for TantivyError {
fn from(data_corruption: DataCorruption) -> TantivyError {
TantivyError::DataCorruption(data_corruption)
}
}

impl From<FastFieldNotAvailableError> for TantivyError {
fn from(fastfield_error: FastFieldNotAvailableError) -> TantivyError {
TantivyError::FastFieldError(fastfield_error)
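The `impl From<DataCorruption> for TantivyError` above is what lets the `load_metas` change earlier in this compare end with `.map_err(From::from)` and still return the crate's `Result`: the `?` operator (via `From::from`) converts the narrower error into `TantivyError` automatically. A tiny illustration of the same pattern with stand-in types (these are not the tantivy definitions):

```rust
// Stand-in types for illustration only.
#[derive(Debug)]
struct DataCorruption(String);

#[derive(Debug)]
enum TantivyError {
    DataCorruption(DataCorruption),
}

impl From<DataCorruption> for TantivyError {
    fn from(err: DataCorruption) -> TantivyError {
        TantivyError::DataCorruption(err)
    }
}

fn parse_meta(raw: &str) -> Result<u64, DataCorruption> {
    raw.parse().map_err(|_| DataCorruption(format!("bad meta: {:?}", raw)))
}

fn load_meta(raw: &str) -> Result<u64, TantivyError> {
    // `?` applies From::from, turning DataCorruption into TantivyError.
    Ok(parse_meta(raw)?)
}

fn main() {
    assert!(load_meta("not a number").is_err());
    assert_eq!(load_meta("42").unwrap(), 42);
}
```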
@@ -1,5 +1,6 @@
use super::MultiValueIntFastFieldReader;
use schema::Facet;
use std::str;
use termdict::TermDictionary;
use termdict::TermOrdinal;
use DocId;

@@ -20,6 +21,7 @@ use DocId;
pub struct FacetReader {
term_ords: MultiValueIntFastFieldReader<u64>,
term_dict: TermDictionary,
buffer: Vec<u8>,
}

impl FacetReader {

@@ -37,6 +39,7 @@ impl FacetReader {
FacetReader {
term_ords,
term_dict,
buffer: vec![],
}
}

@@ -55,11 +58,18 @@ impl FacetReader {
}

/// Given a term ordinal returns the term associated to it.
pub fn facet_from_ord(&self, facet_ord: TermOrdinal, output: &mut Facet) {
pub fn facet_from_ord(
&mut self,
facet_ord: TermOrdinal,
output: &mut Facet,
) -> Result<(), str::Utf8Error> {
let found_term = self
.term_dict
.ord_to_term(facet_ord as u64, output.inner_buffer_mut());
.ord_to_term(facet_ord as u64, &mut self.buffer);
assert!(found_term, "Term ordinal {} no found.", facet_ord);
let facet_str = str::from_utf8(&self.buffer[..])?;
output.set_facet_str(facet_str);
Ok(())
}

/// Return the list of facet ordinals associated to a document.

@@ -82,20 +82,20 @@ mod tests {

let mut facet = Facet::root();
{
facet_reader.facet_from_ord(1, &mut facet);
facet_reader.facet_from_ord(1, &mut facet).unwrap();
assert_eq!(facet, Facet::from("/category"));
}
{
facet_reader.facet_from_ord(2, &mut facet);
facet_reader.facet_from_ord(2, &mut facet).unwrap();
assert_eq!(facet, Facet::from("/category/cat1"));
}
{
facet_reader.facet_from_ord(3, &mut facet);
facet_reader.facet_from_ord(3, &mut facet).unwrap();
assert_eq!(format!("{}", facet), "/category/cat2");
assert_eq!(facet, Facet::from("/category/cat2"));
}
{
facet_reader.facet_from_ord(4, &mut facet);
facet_reader.facet_from_ord(4, &mut facet).unwrap();
assert_eq!(facet, Facet::from("/category/cat3"));
}
@@ -191,10 +191,7 @@ impl DeleteCursor {
}
}

#[cfg_attr(
feature = "cargo-clippy",
allow(clippy::wrong_self_convention)
)]
#[cfg_attr(feature = "cargo-clippy", allow(clippy::wrong_self_convention))]
fn is_behind_opstamp(&mut self, target_opstamp: u64) -> bool {
self.get()
.map(|operation| operation.opstamp < target_opstamp)

@@ -61,7 +61,8 @@ fn initial_table_size(per_thread_memory_budget: usize) -> usize {
"Per thread memory is too small: {}",
per_thread_memory_budget
)
}).min(19) // we cap it at 512K
})
.min(19) // we cap it at 512K
}

/// `IndexWriter` is the user entry-point to add document to an index.
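For reference, the `.min(19)` in `initial_table_size` caps an exponent, so the comment's "512K" is 2^19 = 524,288; a quick check of that arithmetic:

```rust
fn main() {
    // Capping the exponent at 19 caps the table size at 2^19 = 524,288 ("512K").
    assert_eq!(1usize << 19, 524_288);
}
```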
@@ -139,7 +140,7 @@ pub fn open_index_writer(
let stamper = Stamper::new(current_opstamp);

let segment_updater =
SegmentUpdater::new(index.clone(), stamper.clone(), &delete_queue.cursor())?;
SegmentUpdater::create(index.clone(), stamper.clone(), &delete_queue.cursor())?;

let mut index_writer = IndexWriter {
_directory_lock: Some(directory_lock),

@@ -390,7 +391,8 @@ impl IndexWriter {
.name(format!(
"thrd-tantivy-index{}-gen{}",
self.worker_id, generation
)).spawn(move || {
))
.spawn(move || {
loop {
let mut document_iterator =
document_receiver_clone.clone().into_iter().peekable();

@@ -465,10 +467,8 @@ impl IndexWriter {
///
/// Returns the former segment_ready channel.
fn recreate_document_channel(&mut self) -> DocumentReceiver {
let (mut document_sender, mut document_receiver): (
DocumentSender,
DocumentReceiver,
) = channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
let (mut document_sender, mut document_receiver): (DocumentSender, DocumentReceiver) =
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
swap(&mut self.document_sender, &mut document_sender);
swap(&mut self.document_receiver, &mut document_receiver);
document_receiver

@@ -558,11 +558,8 @@ impl IndexWriter {
// and recreate a new one channels.
self.recreate_document_channel();

let mut former_workers_join_handle = Vec::new();
swap(
&mut former_workers_join_handle,
&mut self.workers_join_handle,
);
let former_workers_join_handle =
mem::replace(&mut self.workers_join_handle, Vec::new());

for worker_handle in former_workers_join_handle {
let indexing_worker_result = worker_handle

@@ -739,7 +736,7 @@ mod tests {
index_writer.add_document(doc!(text_field=>"b"));
index_writer.add_document(doc!(text_field=>"c"));
}
assert_eq!(index_writer.commit().unwrap(), 2u64);
assert_eq!(index_writer.commit().unwrap(), 3u64);
index.load_searchers().unwrap();
assert_eq!(num_docs_containing("a"), 0);
assert_eq!(num_docs_containing("b"), 1);

@@ -802,7 +799,6 @@ mod tests {
{
let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
prepared_commit.set_payload("first commit");
assert_eq!(prepared_commit.opstamp(), 100);
prepared_commit.commit().expect("commit failed");
}
{

@@ -836,7 +832,6 @@ mod tests {
{
let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
prepared_commit.set_payload("first commit");
assert_eq!(prepared_commit.opstamp(), 100);
prepared_commit.abort().expect("commit failed");
}
{
@@ -40,13 +40,15 @@ fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
total_tokens += reader.inverted_index(field).total_num_tokens();
}
}
total_tokens + count
.iter()
.cloned()
.enumerate()
.map(|(fieldnorm_ord, count)| {
count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8))
}).sum::<u64>()
total_tokens
+ count
.iter()
.cloned()
.enumerate()
.map(|(fieldnorm_ord, count)| {
count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8))
})
.sum::<u64>()
}

pub struct IndexMerger {

@@ -523,7 +525,8 @@ impl IndexMerger {
}
}
None
}).collect();
})
.collect();

// At this point, `segment_postings` contains the posting list
// of all of the segments containing the given term.

@@ -651,6 +654,7 @@ mod tests {
use schema::IntOptions;
use schema::Term;
use schema::TextFieldIndexing;
use schema::INT_INDEXED;
use std::io::Cursor;
use DocAddress;
use IndexWriter;

@@ -664,7 +668,8 @@ mod tests {
TextFieldIndexing::default()
.set_tokenizer("default")
.set_index_option(IndexRecordOption::WithFreqs),
).set_stored();
)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue);
let score_field = schema_builder.add_u64_field("score", score_fieldtype);

@@ -803,7 +808,8 @@ mod tests {
.search(
&query,
&BytesFastFieldTestCollector::for_field(bytes_score_field),
).expect("failed to search")
)
.expect("failed to search")
};
assert_eq!(
get_fast_vals(vec![Term::from_field_text(text_field, "a")]),

@@ -823,7 +829,8 @@ mod tests {
let text_fieldtype = schema::TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
).set_stored();
)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue);
let score_field = schema_builder.add_u64_field("score", score_fieldtype);

@@ -851,21 +858,21 @@ mod tests {
{
// a first commit
index_writer.add_document(doc!(
text_field => "a b d",
score_field => 1u64,
bytes_score_field => vec![0u8, 0, 0, 1],
));
index_writer.add_document(doc!(
text_field => "b c",
score_field => 2u64,
bytes_score_field => vec![0u8, 0, 0, 2],
));
index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.add_document(doc!(
text_field => "c d",
score_field => 3u64,
bytes_score_field => vec![0u8, 0, 0, 3],
));
index_writer.commit().expect("committed");
index.load_searchers().unwrap();
let ref searcher = *index.searcher();

@@ -892,27 +899,27 @@ mod tests {
{
// a second commit
index_writer.add_document(doc!(
text_field => "a d e",
score_field => 4_000u64,
bytes_score_field => vec![0u8, 0, 0, 4],
));
index_writer.add_document(doc!(
text_field => "e f",
score_field => 5_000u64,
bytes_score_field => vec![0u8, 0, 0, 5],
));
index_writer.delete_term(Term::from_field_text(text_field, "a"));
index_writer.delete_term(Term::from_field_text(text_field, "f"));
index_writer.add_document(doc!(
text_field => "f g",
score_field => 6_000u64,
bytes_score_field => vec![0u8, 0, 23, 112],
));
index_writer.add_document(doc!(
text_field => "g h",
score_field => 7_000u64,
bytes_score_field => vec![0u8, 0, 27, 88],
));
index_writer.commit().expect("committed");
index.load_searchers().unwrap();
let searcher = index.searcher();

@@ -977,7 +984,7 @@ mod tests {
.wait()
.expect("Merging failed");
index.load_searchers().unwrap();
let ref searcher = *index.searcher();
let searcher = index.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.num_docs(), 3);
assert_eq!(searcher.segment_readers()[0].num_docs(), 3);

@@ -1023,7 +1030,7 @@ mod tests {
index_writer.commit().unwrap();

index.load_searchers().unwrap();
let ref searcher = *index.searcher();
let searcher = index.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);

@@ -1119,6 +1126,7 @@ mod tests {
{
// Test removing all docs
index_writer.delete_term(Term::from_field_text(text_field, "g"));
index_writer.commit().unwrap();
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");

@@ -1249,6 +1257,34 @@ mod tests {
}
}

#[test]
fn test_bug_merge() {
let mut schema_builder = schema::Schema::builder();
let int_field = schema_builder.add_u64_field("intvals", INT_INDEXED);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(int_field => 1u64));
index_writer.commit().expect("commit failed");
index_writer.add_document(doc!(int_field => 1u64));
index_writer.commit().expect("commit failed");
index.load_searchers().unwrap();
let searcher = index.searcher();
assert_eq!(searcher.num_docs(), 2);
index_writer.delete_term(Term::from_field_u64(int_field, 1));
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
index_writer
.merge(&segment_ids)
.expect("Failed to initiate merge")
.wait()
.expect("Merging failed");
index.load_searchers().unwrap();
// commit has not been called yet. The document should still be
// there.
assert_eq!(index.searcher().num_docs(), 2);
}

#[test]
fn test_merge_multivalued_int_fields_all_deleted() {
let mut schema_builder = schema::Schema::builder();
@@ -18,7 +18,6 @@ use indexer::delete_queue::DeleteCursor;
use indexer::index_writer::advance_deletes;
use indexer::merger::IndexMerger;
use indexer::stamper::Stamper;
use indexer::MergeCandidate;
use indexer::SegmentEntry;
use indexer::SegmentSerializer;
use indexer::{DefaultMergePolicy, MergePolicy};

@@ -45,8 +44,15 @@ use Result;
/// and flushed.
///
/// This method is not part of tantivy's public API
pub fn save_new_metas(schema: Schema, opstamp: u64, directory: &mut Directory) -> Result<()> {
save_metas(vec![], schema, opstamp, None, directory)
pub fn save_new_metas(schema: Schema, directory: &mut Directory) -> Result<()> {
save_metas(
&IndexMeta {
segments: Vec::new(),
schema,
opstamp: 0u64,
payload: None
},
directory)
}

/// Save the index meta file.

@@ -58,20 +64,17 @@ pub fn save_new_metas(schema: Schema, opstamp: u64, directory: &mut Directory) -
/// and flushed.
///
/// This method is not part of tantivy's public API
pub fn save_metas(
segment_metas: Vec<SegmentMeta>,
schema: Schema,
opstamp: u64,
payload: Option<String>,
fn save_metas(
metas: &IndexMeta,
directory: &mut Directory,
) -> Result<()> {
let metas = IndexMeta {
segments: segment_metas,
schema,
opstamp,
payload,
};
let mut buffer = serde_json::to_vec_pretty(&metas)?;
// let metas = IndexMeta {
// segments: segment_metas,
// schema,
// opstamp,
// payload,
// };
let mut buffer = serde_json::to_vec_pretty(metas)?;
writeln!(&mut buffer)?;
directory.atomic_write(&META_FILEPATH, &buffer[..])?;
debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas));

@@ -86,6 +89,11 @@ pub fn save_metas(
#[derive(Clone)]
pub struct SegmentUpdater(Arc<InnerSegmentUpdater>);

struct MergeOperation {
pub target_opstamp: u64,
pub segment_ids: Vec<SegmentId>,
}

fn perform_merge(
index: &Index,
mut segment_entries: Vec<SegmentEntry>,

@@ -126,6 +134,13 @@ fn perform_merge(
}

struct InnerSegmentUpdater {
// we keep a copy of the current active IndexMeta to
// avoid loading the file everytime we need it in the
// `SegmentUpdater`.
//
// This should be up to date as all update happen through
// the unique active `SegmentUpdater`.
active_metas: RwLock<Arc<IndexMeta>>,
pool: CpuPool,
index: Index,
segment_manager: SegmentManager,

@@ -138,7 +153,7 @@ struct InnerSegmentUpdater {
}

impl SegmentUpdater {
pub fn new(
pub fn create(
index: Index,
stamper: Stamper,
delete_cursor: &DeleteCursor,

@@ -149,7 +164,9 @@ impl SegmentUpdater {
.name_prefix("segment_updater")
.pool_size(1)
.create();
let index_meta = index.load_metas()?;
Ok(SegmentUpdater(Arc::new(InnerSegmentUpdater {
active_metas: RwLock::new(Arc::new(index_meta)),
pool,
index,
segment_manager,

@@ -195,7 +212,8 @@ impl SegmentUpdater {
segment_updater.0.segment_manager.add_segment(segment_entry);
segment_updater.consider_merge_options();
true
}).forget();
})
.forget();
true
} else {
false

@@ -243,20 +261,26 @@ impl SegmentUpdater {
//
// Segment 1 from disk 1, Segment 1 from disk 2, etc.
commited_segment_metas.sort_by_key(|segment_meta| -(segment_meta.max_doc() as i32));
save_metas(
commited_segment_metas,
index.schema(),
let index_meta = IndexMeta {
segments: commited_segment_metas,
schema: index.schema(),
opstamp,
commit_message,
payload: commit_message
};
save_metas(
&index_meta,
directory.box_clone().borrow_mut(),
).expect("Could not save metas.");
)
.expect("Could not save metas.");
self.store_meta(&index_meta);
}
}

pub fn garbage_collect_files(&self) -> Result<()> {
self.run_async(move |segment_updater| {
segment_updater.garbage_collect_files_exec();
}).wait()
})
.wait()
}

fn garbage_collect_files_exec(&self) {

@@ -278,19 +302,32 @@ impl SegmentUpdater {
segment_updater.garbage_collect_files_exec();
segment_updater.consider_merge_options();
}
}).wait()
})
.wait()
}

pub fn start_merge(&self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> {
//let future_merged_segment = */
let segment_ids_vec = segment_ids.to_vec();
let commit_opstamp = self.load_metas().opstamp;
self.run_async(move |segment_updater| {
segment_updater.start_merge_impl(&segment_ids_vec[..])
}).wait()?
segment_updater.start_merge_impl(&segment_ids_vec[..], commit_opstamp)
})
.wait()?
}

fn store_meta(&self, index_meta: &IndexMeta) {
*self.0.active_metas.write().unwrap() = Arc::new(index_meta.clone());
}
fn load_metas(&self) -> Arc<IndexMeta> {
self.0.active_metas.read().unwrap().clone()
}

// `segment_ids` is required to be non-empty.
fn start_merge_impl(&self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> {
fn start_merge_impl(
&self,
segment_ids: &[SegmentId],
target_opstamp: u64,
) -> Result<Receiver<SegmentMeta>> {
assert!(!segment_ids.is_empty(), "Segment_ids cannot be empty.");

let segment_updater_clone = self.clone();

@@ -305,8 +342,6 @@ impl SegmentUpdater {
);
let (merging_future_send, merging_future_recv) = oneshot();

let target_opstamp = self.0.stamper.stamp();

// first we need to apply deletes to our segment.
let merging_join_handle = thread::Builder::new()
.name(format!("mergingthread-{}", merging_thread_id))

@@ -352,7 +387,8 @@ impl SegmentUpdater {
.unwrap()
.remove(&merging_thread_id);
Ok(())
}).expect("Failed to spawn a thread.");
})
.expect("Failed to spawn a thread.");
self.0
.merging_threads
.write()

@@ -367,11 +403,32 @@ impl SegmentUpdater {
// Committed segments cannot be merged with uncommitted_segments.
// We therefore consider merges using these two sets of segments independently.
let merge_policy = self.get_merge_policy();
let mut merge_candidates = merge_policy.compute_merge_candidates(&uncommitted_segments);
let committed_merge_candidates = merge_policy.compute_merge_candidates(&committed_segments);
merge_candidates.extend_from_slice(&committed_merge_candidates[..]);
for MergeCandidate(segment_metas) in merge_candidates {
match self.start_merge_impl(&segment_metas) {

let current_opstamp = self.0.stamper.stamp();
let mut merge_candidates = merge_policy
.compute_merge_candidates(&uncommitted_segments)
.into_iter()
.map(|merge_candidate| MergeOperation {
target_opstamp: current_opstamp,
segment_ids: merge_candidate.0,
})
.collect::<Vec<_>>();
let commit_opstamp = self.load_metas().opstamp;
let committed_merge_candidates = merge_policy
.compute_merge_candidates(&committed_segments)
.into_iter()
.map(|merge_candidate| MergeOperation {
target_opstamp: commit_opstamp,
segment_ids: merge_candidate.0,
})
.collect::<Vec<_>>();
merge_candidates.extend(committed_merge_candidates.into_iter());
for MergeOperation {
target_opstamp,
segment_ids,
} in merge_candidates
{
match self.start_merge_impl(&segment_ids, target_opstamp) {
Ok(merge_future) => {
if let Err(e) = merge_future.fuse().poll() {
error!("The merge task failed quickly after starting: {:?}", e);

@@ -406,12 +463,7 @@ impl SegmentUpdater {
info!("End merge {:?}", after_merge_segment_entry.meta());
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
if let Some(delete_operation) = delete_cursor.get() {
let committed_opstamp = segment_updater
.0
.index
.load_metas()
.expect("Failed to read opstamp")
.opstamp;
let committed_opstamp = segment_updater.load_metas().opstamp;
if delete_operation.opstamp < committed_opstamp {
let index = &segment_updater.0.index;
let segment = index.segment(after_merge_segment_entry.meta().clone());

@@ -440,10 +492,11 @@ impl SegmentUpdater {
.end_merge(&before_merge_segment_ids, after_merge_segment_entry);
segment_updater.consider_merge_options();
info!("save metas");
let previous_metas = segment_updater.0.index.load_metas().unwrap();
segment_updater.save_metas(previous_metas.opstamp, previous_metas.payload);
let previous_metas = segment_updater.load_metas();
segment_updater.save_metas(previous_metas.opstamp, previous_metas.payload.clone());
segment_updater.garbage_collect_files_exec();
}).wait()
})
.wait()
}

/// Wait for current merging threads.
@@ -62,7 +62,8 @@ impl SegmentWriter {
segment.index().tokenizers().get(tokenizer_name)
}),
_ => None,
}).collect();
})
.collect();
Ok(SegmentWriter {
max_doc: 0,
multifield_postings,

@@ -110,18 +111,18 @@ impl SegmentWriter {
}
match *field_options.field_type() {
FieldType::HierarchicalFacet => {
let facets: Vec<&[u8]> = field_values
let facets: Vec<&str> = field_values
.iter()
.flat_map(|field_value| match *field_value.value() {
Value::Facet(ref facet) => Some(facet.encoded_bytes()),
Value::Facet(ref facet) => Some(facet.encoded_str()),
_ => {
panic!("Expected hierarchical facet");
}
}).collect();
})
.collect();
let mut term = Term::for_field(field); // we set the Term
for facet_bytes in facets {
for fake_str in facets {
let mut unordered_term_id_opt = None;
let fake_str = unsafe { str::from_utf8_unchecked(facet_bytes) };
FacetTokenizer.token_stream(fake_str).process(&mut |token| {
term.set_text(&token.text);
let unordered_term_id =

@@ -145,7 +146,8 @@ impl SegmentWriter {
.flat_map(|field_value| match *field_value.value() {
Value::Str(ref text) => Some(text.as_str()),
_ => None,
}).collect();
})
.collect();
if texts.is_empty() {
0
} else {
@@ -1,50 +1,68 @@
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::Ordering;
|
||||
|
||||
|
||||
// AtomicU64 have not landed in stable.
|
||||
// For the moment let's just use AtomicUsize on
|
||||
// x86/64 bit platform, and a mutex on other platform.
|
||||
|
||||
#[cfg(target = "x86_64")]
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
mod archicture_impl {
|
||||
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
pub struct Stamper(Arc<AtomicU64>);
|
||||
#[derive(Default)]
|
||||
pub struct AtomicU64Ersatz(AtomicUsize);
|
||||
|
||||
impl Stamper {
|
||||
pub fn new(first_opstamp: u64) -> Stamper {
|
||||
Stamper(Arc::new(AtomicU64::new(first_opstamp)))
|
||||
impl AtomicU64Ersatz {
|
||||
pub fn new(first_opstamp: u64) -> AtomicU64Ersatz {
|
||||
AtomicU64Ersatz(AtomicUsize::new(first_opstamp as usize))
|
||||
}
|
||||
|
||||
pub fn stamp(&self) -> u64 {
|
||||
self.0.fetch_add(1u64, Ordering::SeqCst) as u64
|
||||
pub fn fetch_add(&self, val: u64, order: Ordering) -> u64 {
|
||||
self.0.fetch_add(val as usize, order) as u64
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(target = "x86_64"))]
|
||||
#[cfg(not(target_arch = "x86_64"))]
|
||||
mod archicture_impl {
|
||||
|
||||
use std::sync::{Arc, Mutex};
|
||||
/// Under other architecture, we rely on a mutex.
|
||||
use std::sync::Mutex;
|
||||
use std::sync::atomic::Ordering;
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
pub struct Stamper(Arc<Mutex<u64>>);
|
||||
#[derive(Default)]
|
||||
pub struct AtomicU64Ersatz(Mutex<u64>);
|
||||
|
||||
impl Stamper {
|
||||
pub fn new(first_opstamp: u64) -> Stamper {
|
||||
Stamper(Arc::new(Mutex::new(first_opstamp)))
|
||||
impl AtomicU64Ersatz {
|
||||
pub fn new(first_opstamp: u64) -> AtomicU64Ersatz {
|
||||
AtomicU64Ersatz(AtomicUsize::new(first_opstamp))
|
||||
}
|
||||
|
||||
pub fn stamp(&self) -> u64 {
|
||||
let mut guard = self.0.lock().expect("Failed to lock the stamper");
|
||||
let previous_val = *guard;
|
||||
*guard = previous_val + 1;
|
||||
pub fn fetch_add(&self, val: u64, _order: Ordering) -> u64 {
|
||||
let lock = self.0.lock().unwrap();
|
||||
let previous_val = *lock;
|
||||
*lock = previous_val + 1;
|
||||
previous_val
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub use self::archicture_impl::Stamper;
|
||||
use self::archicture_impl::AtomicU64Ersatz;
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
pub struct Stamper(Arc<AtomicU64Ersatz>);
|
||||
|
||||
impl Stamper {
|
||||
pub fn new(first_opstamp: u64) -> Stamper {
|
||||
Stamper(Arc::new(AtomicU64Ersatz::new(first_opstamp)))
|
||||
}
|
||||
|
||||
pub fn stamp(&self) -> u64 {
|
||||
self.0.fetch_add(1u64, Ordering::SeqCst) as u64
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
|
||||
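The stamper rewrite above factors the platform-dependent counter into a single `AtomicU64Ersatz` type that only exposes `fetch_add`, so `Stamper` itself no longer needs per-architecture variants. Below is a minimal standalone sketch of that pattern, not the tantivy source: the module layout, the `main` function, and the exact cfg gate are illustrative only.

```rust
// Standalone sketch: a 64-bit counter ersatz backed by AtomicUsize on x86_64
// and by a Mutex elsewhere, both behind the same fetch_add API, with one
// cloneable Stamper on top. Illustrative only, not the crate's code.
use std::sync::atomic::Ordering;
use std::sync::Arc;

#[cfg(target_arch = "x86_64")]
mod arch_impl {
    use std::sync::atomic::{AtomicUsize, Ordering};

    #[derive(Default)]
    pub struct AtomicU64Ersatz(AtomicUsize);

    impl AtomicU64Ersatz {
        pub fn new(first: u64) -> AtomicU64Ersatz {
            AtomicU64Ersatz(AtomicUsize::new(first as usize))
        }
        pub fn fetch_add(&self, val: u64, order: Ordering) -> u64 {
            self.0.fetch_add(val as usize, order) as u64
        }
    }
}

#[cfg(not(target_arch = "x86_64"))]
mod arch_impl {
    use std::sync::atomic::Ordering;
    use std::sync::Mutex;

    #[derive(Default)]
    pub struct AtomicU64Ersatz(Mutex<u64>);

    impl AtomicU64Ersatz {
        pub fn new(first: u64) -> AtomicU64Ersatz {
            AtomicU64Ersatz(Mutex::new(first))
        }
        pub fn fetch_add(&self, val: u64, _order: Ordering) -> u64 {
            let mut guard = self.0.lock().unwrap();
            let previous = *guard;
            *guard = previous + val;
            previous
        }
    }
}

use self::arch_impl::AtomicU64Ersatz;

/// Every clone shares the same underlying counter.
#[derive(Clone, Default)]
pub struct Stamper(Arc<AtomicU64Ersatz>);

impl Stamper {
    pub fn new(first_opstamp: u64) -> Stamper {
        Stamper(Arc::new(AtomicU64Ersatz::new(first_opstamp)))
    }
    pub fn stamp(&self) -> u64 {
        self.0.fetch_add(1, Ordering::SeqCst)
    }
}

fn main() {
    let stamper = Stamper::new(10);
    let clone = stamper.clone();
    assert_eq!(stamper.stamp(), 10);
    assert_eq!(clone.stamp(), 11); // the clone sees the incremented counter
}
```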
src/lib.rs
@@ -1,6 +1,5 @@
|
||||
#![doc(html_logo_url = "http://fulmicoton.com/tantivy-logo/tantivy-logo.png")]
|
||||
#![cfg_attr(all(feature = "unstable", test), feature(test))]
|
||||
#![cfg_attr(feature = "cargo-clippy", feature(tool_lints))]
|
||||
#![cfg_attr(feature = "cargo-clippy", allow(clippy::module_inception))]
|
||||
#![doc(test(attr(allow(unused_variables), deny(warnings))))]
|
||||
#![warn(missing_docs)]
|
||||
@@ -184,10 +183,7 @@ mod macros;
|
||||
|
||||
pub use error::TantivyError;
|
||||
|
||||
#[deprecated(
|
||||
since = "0.7.0",
|
||||
note = "please use `tantivy::TantivyError` instead"
|
||||
)]
|
||||
#[deprecated(since = "0.7.0", note = "please use `tantivy::TantivyError` instead")]
|
||||
pub use error::TantivyError as Error;
|
||||
|
||||
extern crate census;
|
||||
@@ -217,7 +213,7 @@ pub mod store;
|
||||
pub mod termdict;
|
||||
|
||||
mod snippet;
|
||||
pub use self::snippet::{SnippetGenerator, Snippet};
|
||||
pub use self::snippet::{Snippet, SnippetGenerator};
|
||||
|
||||
mod docset;
|
||||
pub use self::docset::{DocSet, SkipResult};
|
||||
@@ -514,11 +510,9 @@ mod tests {
|
||||
let searcher = index.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
let inverted_index = reader.inverted_index(text_field);
|
||||
assert!(
|
||||
inverted_index
|
||||
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
|
||||
.is_none()
|
||||
);
|
||||
assert!(inverted_index
|
||||
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
|
||||
.is_none());
|
||||
{
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
|
||||
@@ -553,11 +547,9 @@ mod tests {
|
||||
let reader = searcher.segment_reader(0);
|
||||
let inverted_index = reader.inverted_index(term_abcd.field());
|
||||
|
||||
assert!(
|
||||
inverted_index
|
||||
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
|
||||
.is_none()
|
||||
);
|
||||
assert!(inverted_index
|
||||
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
|
||||
.is_none());
|
||||
{
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
|
||||
@@ -591,11 +583,9 @@ mod tests {
|
||||
let searcher = index.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
let inverted_index = reader.inverted_index(term_abcd.field());
|
||||
assert!(
|
||||
inverted_index
|
||||
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
|
||||
.is_none()
|
||||
);
|
||||
assert!(inverted_index
|
||||
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
|
||||
.is_none());
|
||||
{
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
|
||||
@@ -743,11 +733,9 @@ mod tests {
|
||||
let reader = searcher.segment_reader(0);
|
||||
let inverted_index = reader.inverted_index(text_field);
|
||||
let term_abcd = Term::from_field_text(text_field, "abcd");
|
||||
assert!(
|
||||
inverted_index
|
||||
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
|
||||
.is_none()
|
||||
);
|
||||
assert!(inverted_index
|
||||
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
|
||||
.is_none());
|
||||
let term_af = Term::from_field_text(text_field, "af");
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term_af, IndexRecordOption::WithFreqsAndPositions)
|
||||
|
||||
@@ -77,10 +77,10 @@ mod test {
|
||||
let likes = schema_builder.add_u64_field("num_u64", FAST);
|
||||
let _schema = schema_builder.build();
|
||||
let _doc = doc!(
|
||||
title => "Life Aquatic",
|
||||
author => "Wes Anderson",
|
||||
likes => 4u64
|
||||
);
|
||||
title => "Life Aquatic",
|
||||
author => "Wes Anderson",
|
||||
likes => 4u64
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -91,9 +91,9 @@ mod test {
|
||||
let likes = schema_builder.add_u64_field("num_u64", FAST);
|
||||
let _schema = schema_builder.build();
|
||||
let _doc = doc!(
|
||||
title => "Life Aquatic",
|
||||
author => "Wes Anderson",
|
||||
likes => 4u64,
|
||||
);
|
||||
title => "Life Aquatic",
|
||||
author => "Wes Anderson",
|
||||
likes => 4u64,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -221,12 +221,10 @@ pub mod tests {
|
||||
}
|
||||
{
|
||||
let term_a = Term::from_field_text(text_field, "abcdef");
|
||||
assert!(
|
||||
segment_reader
|
||||
.inverted_index(term_a.field())
|
||||
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
|
||||
.is_none()
|
||||
);
|
||||
assert!(segment_reader
|
||||
.inverted_index(term_a.field())
|
||||
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
|
||||
.is_none());
|
||||
}
|
||||
{
|
||||
let term_a = Term::from_field_text(text_field, "a");
|
||||
|
||||
@@ -29,7 +29,8 @@ fn posting_from_field_entry(field_entry: &FieldEntry) -> Box<PostingsWriter> {
|
||||
IndexRecordOption::WithFreqsAndPositions => {
|
||||
SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed()
|
||||
}
|
||||
}).unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
|
||||
})
|
||||
.unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
|
||||
FieldType::U64(_) | FieldType::I64(_) | FieldType::HierarchicalFacet => {
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
|
||||
}
|
||||
@@ -107,10 +108,8 @@ impl MultiFieldPostingsWriter {
|
||||
.map(|(key, _, _)| Term::wrap(key).field())
|
||||
.enumerate();
|
||||
|
||||
let mut unordered_term_mappings: HashMap<
|
||||
Field,
|
||||
HashMap<UnorderedTermId, TermOrdinal>,
|
||||
> = HashMap::new();
|
||||
let mut unordered_term_mappings: HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>> =
|
||||
HashMap::new();
|
||||
|
||||
let mut prev_field = Field(u32::max_value());
|
||||
for (offset, field) in term_offsets_it {
|
||||
@@ -138,7 +137,8 @@ impl MultiFieldPostingsWriter {
|
||||
.enumerate()
|
||||
.map(|(term_ord, unord_term_id)| {
|
||||
(unord_term_id as UnorderedTermId, term_ord as TermOrdinal)
|
||||
}).collect();
|
||||
})
|
||||
.collect();
|
||||
unordered_term_mappings.insert(field, mapping);
|
||||
}
|
||||
FieldType::U64(_) | FieldType::I64(_) => {}
|
||||
|
||||
@@ -126,7 +126,6 @@ impl SegmentPostings {
|
||||
fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) {
|
||||
let mut start = 0;
|
||||
let end = arr.len();
|
||||
debug_assert!(target >= arr[start]);
|
||||
debug_assert!(target <= arr[end - 1]);
|
||||
let mut jump = 1;
|
||||
loop {
|
||||
@@ -216,11 +215,10 @@ impl DocSet for SegmentPostings {
|
||||
|
||||
// we're in the right block now, start with an exponential search
|
||||
let block_docs = self.block_cursor.docs();
|
||||
|
||||
debug_assert!(target >= self.doc());
|
||||
let new_cur = self
|
||||
.cur
|
||||
.wrapping_add(search_within_block(&block_docs[self.cur..], target));
|
||||
|
||||
if need_positions {
|
||||
sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur]
|
||||
.iter()
|
||||
@@ -533,7 +531,8 @@ impl BlockSegmentPostings {
|
||||
} else {
|
||||
BlockSegmentPostingsSkipResult::Terminated
|
||||
}
|
||||
}).unwrap_or(BlockSegmentPostingsSkipResult::Terminated);
|
||||
})
|
||||
.unwrap_or(BlockSegmentPostingsSkipResult::Terminated);
|
||||
}
|
||||
BlockSegmentPostingsSkipResult::Terminated
|
||||
}
|
||||
@@ -621,6 +620,7 @@ impl<'b> Streamer<'b> for BlockSegmentPostings {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::exponential_search;
|
||||
use super::search_within_block;
|
||||
use super::BlockSegmentPostings;
|
||||
use super::BlockSegmentPostingsSkipResult;
|
||||
@@ -634,6 +634,7 @@ mod tests {
|
||||
use schema::Term;
|
||||
use schema::INT_INDEXED;
|
||||
use DocId;
|
||||
use SkipResult;
|
||||
|
||||
#[test]
|
||||
fn test_empty_segment_postings() {
|
||||
@@ -661,6 +662,16 @@ mod tests {
|
||||
.0
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_exponentiel_search() {
|
||||
assert_eq!(exponential_search(0, &[1, 2]), (0, 1));
|
||||
assert_eq!(exponential_search(1, &[1, 2]), (0, 1));
|
||||
assert_eq!(
|
||||
exponential_search(7, &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
|
||||
(3, 7)
|
||||
);
|
||||
}
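The new `test_exponentiel_search` pins down what the helper is expected to return: a `(start, end)` window of the slice that brackets the target, found by doubling jumps. The following is a minimal sketch consistent with those asserted values; it illustrates galloping search and is not necessarily the exact body of `exponential_search` in this commit.

```rust
// Galloping-search sketch: double the jump until the probed value exceeds the
// target (or the slice ends), and return the bracketing (start, end) window.
// The asserts mirror the values checked in test_exponentiel_search above.
fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) {
    let end = arr.len();
    let mut start = 0;
    let mut jump = 1;
    loop {
        let probe = start + jump;
        if probe >= end {
            return (start, end);
        }
        if arr[probe] > target {
            return (start, probe);
        }
        start = probe;
        jump *= 2;
    }
}

fn main() {
    assert_eq!(exponential_search(0, &[1, 2]), (0, 1));
    assert_eq!(exponential_search(1, &[1, 2]), (0, 1));
    assert_eq!(
        exponential_search(7, &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
        (3, 7)
    );
}
```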
|
||||
|
||||
fn util_test_search_within_block(block: &[u32], target: u32) {
|
||||
assert_eq!(
|
||||
search_within_block(block, target),
|
||||
@@ -692,7 +703,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_block_segment_postings() {
|
||||
let mut block_segments = build_block_postings((0..100_000).collect::<Vec<u32>>());
|
||||
let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>());
|
||||
let mut offset: u32 = 0u32;
|
||||
// checking that the block before calling advance is empty
|
||||
assert!(block_segments.docs().is_empty());
|
||||
@@ -706,14 +717,44 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
fn build_block_postings(docs: Vec<DocId>) -> BlockSegmentPostings {
|
||||
#[test]
|
||||
fn test_skip_right_at_new_block() {
|
||||
let mut doc_ids = (0..128).collect::<Vec<u32>>();
|
||||
doc_ids.push(129);
|
||||
doc_ids.push(130);
|
||||
{
|
||||
let block_segments = build_block_postings(&doc_ids);
|
||||
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
|
||||
assert_eq!(docset.skip_next(128), SkipResult::OverStep);
|
||||
assert_eq!(docset.doc(), 129);
|
||||
assert!(docset.advance());
|
||||
assert_eq!(docset.doc(), 130);
|
||||
assert!(!docset.advance());
|
||||
}
|
||||
{
|
||||
let block_segments = build_block_postings(&doc_ids);
|
||||
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
|
||||
assert_eq!(docset.skip_next(129), SkipResult::Reached);
|
||||
assert_eq!(docset.doc(), 129);
|
||||
assert!(docset.advance());
|
||||
assert_eq!(docset.doc(), 130);
|
||||
assert!(!docset.advance());
|
||||
}
|
||||
{
|
||||
let block_segments = build_block_postings(&doc_ids);
|
||||
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
|
||||
assert_eq!(docset.skip_next(131), SkipResult::End);
|
||||
}
|
||||
}
|
||||
|
||||
fn build_block_postings(docs: &[DocId]) -> BlockSegmentPostings {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
let mut last_doc = 0u32;
|
||||
for doc in docs {
|
||||
for &doc in docs {
|
||||
for _ in last_doc..doc {
|
||||
index_writer.add_document(doc!(int_field=>1u64));
|
||||
}
|
||||
@@ -733,7 +774,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_block_segment_postings_skip() {
|
||||
for i in 0..4 {
|
||||
let mut block_postings = build_block_postings(vec![3]);
|
||||
let mut block_postings = build_block_postings(&[3]);
|
||||
assert_eq!(
|
||||
block_postings.skip_to(i),
|
||||
BlockSegmentPostingsSkipResult::Success(0u32)
|
||||
@@ -743,7 +784,7 @@ mod tests {
|
||||
BlockSegmentPostingsSkipResult::Terminated
|
||||
);
|
||||
}
|
||||
let mut block_postings = build_block_postings(vec![3]);
|
||||
let mut block_postings = build_block_postings(&[3]);
|
||||
assert_eq!(
|
||||
block_postings.skip_to(4u32),
|
||||
BlockSegmentPostingsSkipResult::Terminated
|
||||
@@ -756,7 +797,7 @@ mod tests {
|
||||
for i in 0..1300 {
|
||||
docs.push((i * i / 100) + i);
|
||||
}
|
||||
let mut block_postings = build_block_postings(docs.clone());
|
||||
let mut block_postings = build_block_postings(&docs[..]);
|
||||
for i in vec![0, 424, 10000] {
|
||||
assert_eq!(
|
||||
block_postings.skip_to(i),
|
||||
|
||||
@@ -55,7 +55,7 @@ pub struct InvertedIndexSerializer {
|
||||
|
||||
impl InvertedIndexSerializer {
|
||||
/// Open a new `PostingsSerializer` for the given segment
|
||||
fn new(
|
||||
fn create(
|
||||
terms_write: CompositeWrite<WritePtr>,
|
||||
postings_write: CompositeWrite<WritePtr>,
|
||||
positions_write: CompositeWrite<WritePtr>,
|
||||
@@ -74,7 +74,7 @@ impl InvertedIndexSerializer {
|
||||
/// Open a new `PostingsSerializer` for the given segment
|
||||
pub fn open(segment: &mut Segment) -> Result<InvertedIndexSerializer> {
|
||||
use SegmentComponent::{POSITIONS, POSITIONSSKIP, POSTINGS, TERMS};
|
||||
InvertedIndexSerializer::new(
|
||||
InvertedIndexSerializer::create(
|
||||
CompositeWrite::wrap(segment.open_write(TERMS)?),
|
||||
CompositeWrite::wrap(segment.open_write(POSTINGS)?),
|
||||
CompositeWrite::wrap(segment.open_write(POSITIONS)?),
|
||||
@@ -99,7 +99,7 @@ impl InvertedIndexSerializer {
|
||||
let positions_write = self.positions_write.for_field(field);
|
||||
let positionsidx_write = self.positionsidx_write.for_field(field);
|
||||
let field_type: FieldType = (*field_entry.field_type()).clone();
|
||||
FieldSerializer::new(
|
||||
FieldSerializer::create(
|
||||
&field_type,
|
||||
term_dictionary_write,
|
||||
postings_write,
|
||||
@@ -130,7 +130,7 @@ pub struct FieldSerializer<'a> {
|
||||
}
|
||||
|
||||
impl<'a> FieldSerializer<'a> {
|
||||
fn new(
|
||||
fn create(
|
||||
field_type: &FieldType,
|
||||
term_dictionary_write: &'a mut CountingWriter<WritePtr>,
|
||||
postings_write: &'a mut CountingWriter<WritePtr>,
|
||||
@@ -152,7 +152,7 @@ impl<'a> FieldSerializer<'a> {
|
||||
_ => (false, false),
|
||||
};
|
||||
let term_dictionary_builder =
|
||||
TermDictionaryBuilder::new(term_dictionary_write, &field_type)?;
|
||||
TermDictionaryBuilder::create(term_dictionary_write, &field_type)?;
|
||||
let postings_serializer =
|
||||
PostingsSerializer::new(postings_write, term_freq_enabled, position_enabled);
|
||||
let positions_serializer_opt = if position_enabled {
|
||||
|
||||
@@ -1,9 +1,7 @@
|
||||
mod expull;
|
||||
mod memory_arena;
|
||||
mod murmurhash2;
|
||||
mod term_hashmap;
|
||||
|
||||
pub use self::expull::ExpUnrolledLinkedList;
|
||||
pub use self::memory_arena::{Addr, ArenaStorable, MemoryArena};
|
||||
use self::murmurhash2::murmurhash2;
|
||||
pub use self::term_hashmap::{compute_table_size, TermHashMap};
|
||||
|
||||
@@ -1,87 +0,0 @@
|
||||
use std::ptr;
|
||||
const SEED: u32 = 3_242_157_231u32;
|
||||
const M: u32 = 0x5bd1_e995;
|
||||
|
||||
#[inline(always)]
|
||||
pub fn murmurhash2(key: &[u8]) -> u32 {
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
|
||||
let mut key_ptr: *const u32 = key.as_ptr() as *const u32;
|
||||
let len = key.len() as u32;
|
||||
let mut h: u32 = SEED ^ len;
|
||||
|
||||
let num_blocks = len >> 2;
|
||||
for _ in 0..num_blocks {
|
||||
let mut k: u32 = unsafe { ptr::read_unaligned(key_ptr) }; // ok because of num_blocks definition
|
||||
k = k.wrapping_mul(M);
|
||||
k ^= k >> 24;
|
||||
k = k.wrapping_mul(M);
|
||||
h = h.wrapping_mul(M);
|
||||
h ^= k;
|
||||
key_ptr = key_ptr.wrapping_offset(1);
|
||||
}
|
||||
|
||||
// Handle the last few bytes of the input array
|
||||
let remaining: &[u8] = &key[key.len() & !3..];
|
||||
match remaining.len() {
|
||||
3 => {
|
||||
h ^= u32::from(remaining[2]) << 16;
|
||||
h ^= u32::from(remaining[1]) << 8;
|
||||
h ^= u32::from(remaining[0]);
|
||||
h = h.wrapping_mul(M);
|
||||
}
|
||||
2 => {
|
||||
h ^= u32::from(remaining[1]) << 8;
|
||||
h ^= u32::from(remaining[0]);
|
||||
h = h.wrapping_mul(M);
|
||||
}
|
||||
1 => {
|
||||
h ^= u32::from(remaining[0]);
|
||||
h = h.wrapping_mul(M);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
h ^= h >> 13;
|
||||
h = h.wrapping_mul(M);
|
||||
h ^ (h >> 15)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
|
||||
use super::murmurhash2;
|
||||
use std::collections::HashSet;
|
||||
|
||||
#[test]
|
||||
fn test_murmur() {
|
||||
let s1 = "abcdef";
|
||||
let s2 = "abcdeg";
|
||||
for i in 0..5 {
|
||||
assert_eq!(
|
||||
murmurhash2(&s1[i..5].as_bytes()),
|
||||
murmurhash2(&s2[i..5].as_bytes())
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_murmur_against_reference_impl() {
|
||||
assert_eq!(murmurhash2("".as_bytes()), 3632506080);
|
||||
assert_eq!(murmurhash2("a".as_bytes()), 455683869);
|
||||
assert_eq!(murmurhash2("ab".as_bytes()), 2448092234);
|
||||
assert_eq!(murmurhash2("abc".as_bytes()), 2066295634);
|
||||
assert_eq!(murmurhash2("abcd".as_bytes()), 2588571162);
|
||||
assert_eq!(murmurhash2("abcde".as_bytes()), 2988696942);
|
||||
assert_eq!(murmurhash2("abcdefghijklmnop".as_bytes()), 2350868870);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_murmur_collisions() {
|
||||
let mut set: HashSet<u32> = HashSet::default();
|
||||
for i in 0..10_000 {
|
||||
let s = format!("hash{}", i);
|
||||
let hash = murmurhash2(s.as_bytes());
|
||||
set.insert(hash);
|
||||
}
|
||||
assert_eq!(set.len(), 10_000);
|
||||
}
|
||||
}
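The hand-rolled murmurhash2 module above is deleted; the hunk that follows makes `term_hashmap` pull the same hash from the external `murmurhash32` crate added to Cargo.toml in this commit. A minimal usage sketch, assuming `murmurhash32 = "0.2"` exposes `murmurhash2(&[u8]) -> u32` as the new import suggests:

```rust
// Sketch only: hash a byte key through the external crate instead of the
// removed local module. Assumes murmurhash32 = "0.2" is declared in Cargo.toml.
extern crate murmurhash32;

use murmurhash32::murmurhash2;

fn main() {
    let key: &[u8] = b"example-term-bytes"; // an arbitrary key, as the hashmap would supply
    let hash: u32 = murmurhash2(key);
    println!("hash = {}", hash);
}
```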
|
||||
@@ -1,4 +1,7 @@
|
||||
use super::murmurhash2;
|
||||
extern crate murmurhash32;
|
||||
|
||||
use self::murmurhash32::murmurhash2;
|
||||
|
||||
use super::{Addr, ArenaStorable, MemoryArena};
|
||||
use std::iter;
|
||||
use std::mem;
|
||||
@@ -206,7 +209,7 @@ impl TermHashMap {
|
||||
self.resize();
|
||||
}
|
||||
let key_bytes: &[u8] = key.as_ref();
|
||||
let hash = murmurhash2::murmurhash2(key.as_ref());
|
||||
let hash = murmurhash2(key.as_ref());
|
||||
let mut probe = self.probe(hash);
|
||||
loop {
|
||||
let bucket = probe.next_probe();
|
||||
|
||||
@@ -63,7 +63,8 @@ impl BM25Weight {
|
||||
.map(|term| {
|
||||
let term_doc_freq = searcher.doc_freq(term);
|
||||
idf(term_doc_freq, total_num_docs)
|
||||
}).sum::<f32>();
|
||||
})
|
||||
.sum::<f32>();
|
||||
BM25Weight::new(idf, average_fieldnorm)
|
||||
}
|
||||
|
||||
|
||||
@@ -47,7 +47,8 @@ impl Query for BooleanQuery {
|
||||
.iter()
|
||||
.map(|&(ref occur, ref subquery)| {
|
||||
Ok((*occur, subquery.weight(searcher, scoring_enabled)?))
|
||||
}).collect::<Result<_>>()?;
|
||||
})
|
||||
.collect::<Result<_>>()?;
|
||||
Ok(Box::new(BooleanWeight::new(sub_weights, scoring_enabled)))
|
||||
}
|
||||
|
||||
@@ -68,7 +69,8 @@ impl BooleanQuery {
|
||||
let term_query: Box<Query> =
|
||||
Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs));
|
||||
(Occur::Should, term_query)
|
||||
}).collect();
|
||||
})
|
||||
.collect();
|
||||
BooleanQuery::from(occur_term_queries)
|
||||
}
|
||||
|
||||
|
||||
@@ -134,7 +134,8 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
|
||||
.into_iter()
|
||||
.map(|(offset, postings)| {
|
||||
PostingsWithOffset::new(postings, (max_offset - offset) as u32)
|
||||
}).collect::<Vec<_>>();
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
PhraseScorer {
|
||||
intersection_docset: Intersection::new(postings_with_offsets),
|
||||
num_docsets,
|
||||
|
||||
@@ -68,7 +68,8 @@ fn trim_ast(logical_ast: LogicalAST) -> Option<LogicalAST> {
|
||||
.into_iter()
|
||||
.flat_map(|(occur, child)| {
|
||||
trim_ast(child).map(|trimmed_child| (occur, trimmed_child))
|
||||
}).collect::<Vec<_>>();
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
if trimmed_children.is_empty() {
|
||||
None
|
||||
} else {
|
||||
@@ -422,7 +423,8 @@ impl QueryParser {
|
||||
lower: self.resolve_bound(field, &lower)?,
|
||||
upper: self.resolve_bound(field, &upper)?,
|
||||
})))
|
||||
}).collect::<Result<Vec<_>, QueryParserError>>()?;
|
||||
})
|
||||
.collect::<Result<Vec<_>, QueryParserError>>()?;
|
||||
let result_ast = if clauses.len() == 1 {
|
||||
clauses.pop().unwrap()
|
||||
} else {
|
||||
@@ -598,25 +600,19 @@ mod test {
|
||||
assert!(query_parser.parse_query("signed:2324").is_ok());
|
||||
assert!(query_parser.parse_query("signed:\"22\"").is_ok());
|
||||
assert!(query_parser.parse_query("signed:\"-2234\"").is_ok());
|
||||
assert!(
|
||||
query_parser
|
||||
.parse_query("signed:\"-9999999999999\"")
|
||||
.is_ok()
|
||||
);
|
||||
assert!(query_parser
|
||||
.parse_query("signed:\"-9999999999999\"")
|
||||
.is_ok());
|
||||
assert!(query_parser.parse_query("signed:\"a\"").is_err());
|
||||
assert!(query_parser.parse_query("signed:\"2a\"").is_err());
|
||||
assert!(
|
||||
query_parser
|
||||
.parse_query("signed:\"18446744073709551615\"")
|
||||
.is_err()
|
||||
);
|
||||
assert!(query_parser
|
||||
.parse_query("signed:\"18446744073709551615\"")
|
||||
.is_err());
|
||||
assert!(query_parser.parse_query("unsigned:\"2\"").is_ok());
|
||||
assert!(query_parser.parse_query("unsigned:\"-2\"").is_err());
|
||||
assert!(
|
||||
query_parser
|
||||
.parse_query("unsigned:\"18446744073709551615\"")
|
||||
.is_ok()
|
||||
);
|
||||
assert!(query_parser
|
||||
.parse_query("unsigned:\"18446744073709551615\"")
|
||||
.is_ok());
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"unsigned:2324",
|
||||
"Term([0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20])",
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
|
||||
mod term_query;
|
||||
mod term_scorer;
|
||||
mod term_weight;
|
||||
|
||||
@@ -55,7 +55,8 @@ where
|
||||
None
|
||||
}
|
||||
},
|
||||
).collect();
|
||||
)
|
||||
.collect();
|
||||
Union {
|
||||
docsets: non_empty_docsets,
|
||||
bitsets: Box::new([TinySet::empty(); HORIZON_NUM_TINYBITSETS]),
|
||||
@@ -214,10 +215,7 @@ where
|
||||
|
||||
// The target is outside of the buffered horizon.
|
||||
// advance all docsets to a doc >= to the target.
|
||||
#[cfg_attr(
|
||||
feature = "cargo-clippy",
|
||||
allow(clippy::clippy::collapsible_if)
|
||||
)]
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(clippy::clippy::collapsible_if))]
|
||||
unordered_drain_filter(&mut self.docsets, |docset| {
|
||||
if docset.doc() < target {
|
||||
if docset.skip_next(target) == SkipResult::End {
|
||||
|
||||
@@ -6,6 +6,7 @@ use std::borrow::Cow;
|
||||
use std::fmt::{self, Debug, Display, Formatter};
|
||||
use std::io::{self, Read, Write};
|
||||
use std::str;
|
||||
use std::string::FromUtf8Error;
|
||||
|
||||
const SLASH_BYTE: u8 = b'/';
|
||||
const ESCAPE_BYTE: u8 = b'\\';
|
||||
@@ -14,6 +15,10 @@ const ESCAPE_BYTE: u8 = b'\\';
|
||||
/// representation of facets.
|
||||
pub const FACET_SEP_BYTE: u8 = 0u8;
|
||||
|
||||
/// `char` used as a level separation in the binary
|
||||
/// representation of facets. (It is the null codepoint.)
|
||||
pub const FACET_SEP_CHAR: char = '\u{0}';
|
||||
|
||||
/// A Facet represent a point in a given hierarchy.
|
||||
///
|
||||
/// They are typically represented similarly to a filepath.
|
||||
@@ -26,18 +31,18 @@ pub const FACET_SEP_BYTE: u8 = 0u8;
|
||||
/// its facet. In the example above, `/electronics/tv_and_video/`
|
||||
/// and `/electronics`.
|
||||
#[derive(Clone, Eq, Hash, PartialEq, Ord, PartialOrd)]
|
||||
pub struct Facet(Vec<u8>);
|
||||
pub struct Facet(String);
|
||||
|
||||
impl Facet {
|
||||
/// Returns a new instance of the "root facet"
|
||||
/// Equivalent to `/`.
|
||||
pub fn root() -> Facet {
|
||||
Facet(vec![])
|
||||
Facet("".to_string())
|
||||
}
|
||||
|
||||
/// Returns true iff the facet is the root facet `/`.
|
||||
pub fn is_root(&self) -> bool {
|
||||
self.encoded_bytes().is_empty()
|
||||
self.encoded_str().is_empty()
|
||||
}
|
||||
|
||||
/// Returns a binary representation of the facet.
|
||||
@@ -49,13 +54,19 @@ impl Facet {
|
||||
/// This representation has the benefit of making it possible to
|
||||
/// express "being a child of a given facet" as a range over
|
||||
/// the term ordinals.
|
||||
pub fn encoded_bytes(&self) -> &[u8] {
|
||||
pub fn encoded_str(&self) -> &str {
|
||||
&self.0
|
||||
}
|
||||
|
||||
pub(crate) fn from_encoded_string(facet_string: String) -> Facet {
|
||||
Facet(facet_string)
|
||||
}
|
||||
|
||||
/// Creates a `Facet` from its binary representation.
|
||||
pub(crate) unsafe fn from_encoded(encoded_bytes: Vec<u8>) -> Facet {
|
||||
Facet(encoded_bytes)
|
||||
pub fn from_encoded(encoded_bytes: Vec<u8>) -> Result<Facet, FromUtf8Error> {
|
||||
// facet bytes validation. `0u8` is used a separator but that is still legal utf-8
|
||||
//Ok(Facet(String::from_utf8(encoded_bytes)?))
|
||||
String::from_utf8(encoded_bytes).map(Facet)
|
||||
}
|
||||
|
||||
/// Parse a text representation of a facet.
|
||||
@@ -79,36 +90,37 @@ impl Facet {
|
||||
Path: IntoIterator,
|
||||
Path::Item: ToString,
|
||||
{
|
||||
let mut facet_bytes: Vec<u8> = Vec::with_capacity(100);
|
||||
let mut facet_string: String = String::with_capacity(100);
|
||||
let mut step_it = path.into_iter();
|
||||
if let Some(step) = step_it.next() {
|
||||
facet_bytes.extend_from_slice(step.to_string().as_bytes());
|
||||
facet_string.push_str(&step.to_string());
|
||||
}
|
||||
for step in step_it {
|
||||
facet_bytes.push(FACET_SEP_BYTE);
|
||||
facet_bytes.extend_from_slice(step.to_string().as_bytes());
|
||||
facet_string.push(FACET_SEP_CHAR);
|
||||
facet_string.push_str(&step.to_string());
|
||||
}
|
||||
Facet(facet_bytes)
|
||||
Facet(facet_string)
|
||||
}
|
||||
|
||||
/// Accessor for the inner buffer of the `Facet`.
|
||||
pub(crate) fn inner_buffer_mut(&mut self) -> &mut Vec<u8> {
|
||||
&mut self.0
|
||||
pub(crate) fn set_facet_str(&mut self, facet_str: &str) {
|
||||
self.0.clear();
|
||||
self.0.push_str(facet_str);
|
||||
}
|
||||
|
||||
/// Returns `true` iff other is a subfacet of `self`.
|
||||
pub fn is_prefix_of(&self, other: &Facet) -> bool {
|
||||
let self_bytes: &[u8] = self.encoded_bytes();
|
||||
let other_bytes: &[u8] = other.encoded_bytes();
|
||||
self_bytes.len() < other_bytes.len()
|
||||
&& other_bytes.starts_with(self_bytes)
|
||||
&& other_bytes[self_bytes.len()] == 0u8
|
||||
let self_str = self.encoded_str();
|
||||
let other_str = other.encoded_str();
|
||||
self_str.len() < other_str.len()
|
||||
&& other_str.starts_with(self_str)
|
||||
&& other_str.as_bytes()[self_str.len()] == FACET_SEP_BYTE
|
||||
}
|
||||
}
|
||||
|
||||
impl Borrow<[u8]> for Facet {
|
||||
fn borrow(&self) -> &[u8] {
|
||||
self.encoded_bytes()
|
||||
impl Borrow<str> for Facet {
|
||||
fn borrow(&self) -> &str {
|
||||
self.encoded_str()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -120,45 +132,51 @@ impl<'a, T: ?Sized + AsRef<str>> From<&'a T> for Facet {
|
||||
Idle,
|
||||
}
|
||||
let path: &str = path_asref.as_ref();
|
||||
let mut facet_encoded = Vec::new();
|
||||
assert!(!path.is_empty());
|
||||
assert!(path.starts_with("/"));
|
||||
let mut facet_encoded = String::new();
|
||||
let mut state = State::Idle;
|
||||
let path_bytes = path.as_bytes();
|
||||
for &c in &path_bytes[1..] {
|
||||
let mut last_offset = 1;
|
||||
for i in 1..path_bytes.len() {
|
||||
let c = path_bytes[i];
|
||||
match (state, c) {
|
||||
(State::Idle, ESCAPE_BYTE) => state = State::Escaped,
|
||||
(State::Idle, ESCAPE_BYTE) => {
|
||||
facet_encoded.push_str(&path[last_offset..i]);
|
||||
last_offset = i + 1;
|
||||
state = State::Escaped
|
||||
}
|
||||
(State::Idle, SLASH_BYTE) => {
|
||||
facet_encoded.push(FACET_SEP_BYTE);
|
||||
facet_encoded.push_str(&path[last_offset..i]);
|
||||
facet_encoded.push(FACET_SEP_CHAR);
|
||||
last_offset = i + 1;
|
||||
}
|
||||
(State::Escaped, any_char) => {
|
||||
(State::Escaped, _escaped_char) => {
|
||||
state = State::Idle;
|
||||
facet_encoded.push(any_char);
|
||||
}
|
||||
(State::Idle, other_char) => {
|
||||
facet_encoded.push(other_char);
|
||||
}
|
||||
(State::Idle, _any_char) => {}
|
||||
}
|
||||
}
|
||||
facet_encoded.push_str(&path[last_offset..]);
|
||||
Facet(facet_encoded)
|
||||
}
|
||||
}
|
||||
|
||||
impl BinarySerializable for Facet {
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
<Vec<u8> as BinarySerializable>::serialize(&self.0, writer)
|
||||
<String as BinarySerializable>::serialize(&self.0, writer)
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let bytes = <Vec<u8> as BinarySerializable>::deserialize(reader)?;
|
||||
Ok(Facet(bytes))
|
||||
Ok(Facet(<String as BinarySerializable>::deserialize(reader)?))
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for Facet {
|
||||
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
|
||||
for step in self.0.split(|&b| b == FACET_SEP_BYTE) {
|
||||
for step in self.0.split(FACET_SEP_CHAR) {
|
||||
write!(f, "/")?;
|
||||
let step_str = unsafe { str::from_utf8_unchecked(step) };
|
||||
write!(f, "{}", escape_slashes(step_str))?;
|
||||
write!(f, "{}", escape_slashes(step))?;
|
||||
}
|
||||
Ok(())
|
||||
}
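With `Facet` now wrapping a `String`, a path such as `/electronics/tv_and_video` is stored with its levels joined by the null codepoint, and "is a child of" reduces to a prefix check followed by one separator byte. Below is a dependency-free sketch of that encoding; it is illustrative only and ignores the `\/` escaping handled by `From<&str>` above.

```rust
// Illustrative re-implementation of the facet encoding, not the tantivy API.
const FACET_SEP: &str = "\u{0}"; // the null codepoint (FACET_SEP_CHAR in tantivy)
const FACET_SEP_BYTE: u8 = 0u8;

fn encode_facet(path: &str) -> String {
    // Drop the leading '/' and join the remaining levels with the separator.
    path.trim_start_matches('/')
        .split('/')
        .collect::<Vec<_>>()
        .join(FACET_SEP)
}

fn is_prefix_of(parent: &str, child: &str) -> bool {
    parent.len() < child.len()
        && child.starts_with(parent)
        && child.as_bytes()[parent.len()] == FACET_SEP_BYTE
}

fn main() {
    let parent = encode_facet("/electronics");
    let child = encode_facet("/electronics/tv_and_video");
    assert_eq!(child, "electronics\u{0}tv_and_video");
    assert!(is_prefix_of(&parent, &child));
    assert!(!is_prefix_of(&encode_facet("/elec"), &child));
}
```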
|
||||
|
||||
@@ -232,12 +232,14 @@ impl Schema {
|
||||
let field_entry = self.get_field_entry(field);
|
||||
let field_type = field_entry.field_type();
|
||||
match *json_value {
|
||||
JsonValue::Array(ref json_items) => for json_item in json_items {
|
||||
let value = field_type
|
||||
.value_from_json(json_item)
|
||||
.map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?;
|
||||
doc.add(FieldValue::new(field, value));
|
||||
},
|
||||
JsonValue::Array(ref json_items) => {
|
||||
for json_item in json_items {
|
||||
let value = field_type.value_from_json(json_item).map_err(|e| {
|
||||
DocParsingError::ValueError(field_name.clone(), e)
|
||||
})?;
|
||||
doc.add(FieldValue::new(field, value));
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
let value = field_type
|
||||
.value_from_json(json_value)
|
||||
@@ -446,7 +448,8 @@ mod tests {
|
||||
"count": 4,
|
||||
"popularity": 10
|
||||
}"#,
|
||||
).unwrap();
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(doc.get_first(title_field).unwrap().text(), Some("my title"));
|
||||
assert_eq!(
|
||||
doc.get_first(author_field).unwrap().text(),
|
||||
|
||||
@@ -32,7 +32,7 @@ impl Term {
|
||||
|
||||
/// Creates a `Term` given a facet.
|
||||
pub fn from_facet(field: Field, facet: &Facet) -> Term {
|
||||
let bytes = facet.encoded_bytes();
|
||||
let bytes = facet.encoded_str().as_bytes();
|
||||
let buffer = Vec::with_capacity(4 + bytes.len());
|
||||
let mut term = Term(buffer);
|
||||
term.set_field(field);
|
||||
@@ -68,12 +68,7 @@ impl Term {
|
||||
term
|
||||
}
|
||||
|
||||
/// Creates a new Term with an empty buffer,
|
||||
/// but with a given capacity.
|
||||
///
|
||||
/// It is declared unsafe, as the term content
|
||||
/// is not initialized, and a call to `.field()`
|
||||
/// would panic.
|
||||
/// Creates a new Term for a given field.
|
||||
pub(crate) fn for_field(field: Field) -> Term {
|
||||
let mut term = Term(Vec::with_capacity(100));
|
||||
term.set_field(field);
|
||||
|
||||
@@ -192,7 +192,8 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
|
||||
item.start - fragment.start_offset,
|
||||
item.stop - fragment.start_offset,
|
||||
)
|
||||
}).collect();
|
||||
})
|
||||
.collect();
|
||||
Snippet {
|
||||
fragments: fragment_text.to_string(),
|
||||
highlighted,
|
||||
@@ -242,7 +243,7 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
|
||||
/// let query = query_parser.parse_query("haleurs flamands").unwrap();
|
||||
/// # index.load_searchers()?;
|
||||
/// # let searcher = index.searcher();
|
||||
/// let mut snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field)?;
|
||||
/// let mut snippet_generator = SnippetGenerator::create(&searcher, &*query, text_field)?;
|
||||
/// snippet_generator.set_max_num_chars(100);
|
||||
/// let snippet = snippet_generator.snippet_from_doc(&doc);
|
||||
/// let snippet_html: String = snippet.to_html();
|
||||
@@ -259,7 +260,7 @@ pub struct SnippetGenerator {
|
||||
|
||||
impl SnippetGenerator {
|
||||
/// Creates a new snippet generator
|
||||
pub fn new(searcher: &Searcher, query: &Query, field: Field) -> Result<SnippetGenerator> {
|
||||
pub fn create(searcher: &Searcher, query: &Query, field: Field) -> Result<SnippetGenerator> {
|
||||
let mut terms = BTreeSet::new();
|
||||
query.query_terms(&mut terms);
|
||||
let terms_text: BTreeMap<String, f32> = terms
|
||||
@@ -273,7 +274,8 @@ impl SnippetGenerator {
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}).collect();
|
||||
})
|
||||
.collect();
|
||||
let tokenizer = searcher.index().tokenizer_for_field(field)?;
|
||||
Ok(SnippetGenerator {
|
||||
terms_text,
|
||||
@@ -532,12 +534,14 @@ Survey in 2016, 2017, and 2018."#;
|
||||
let query_parser = QueryParser::for_index(&index, vec![text_field]);
|
||||
{
|
||||
let query = query_parser.parse_query("e").unwrap();
|
||||
let snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap();
|
||||
let snippet_generator =
|
||||
SnippetGenerator::create(&searcher, &*query, text_field).unwrap();
|
||||
assert!(snippet_generator.terms_text().is_empty());
|
||||
}
|
||||
{
|
||||
let query = query_parser.parse_query("a").unwrap();
|
||||
let snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap();
|
||||
let snippet_generator =
|
||||
SnippetGenerator::create(&searcher, &*query, text_field).unwrap();
|
||||
assert_eq!(
|
||||
&btreemap!("a".to_string() => 0.25f32),
|
||||
snippet_generator.terms_text()
|
||||
@@ -545,7 +549,8 @@ Survey in 2016, 2017, and 2018."#;
|
||||
}
|
||||
{
|
||||
let query = query_parser.parse_query("a b").unwrap();
|
||||
let snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap();
|
||||
let snippet_generator =
|
||||
SnippetGenerator::create(&searcher, &*query, text_field).unwrap();
|
||||
assert_eq!(
|
||||
&btreemap!("a".to_string() => 0.25f32, "b".to_string() => 0.5),
|
||||
snippet_generator.terms_text()
|
||||
@@ -553,7 +558,8 @@ Survey in 2016, 2017, and 2018."#;
|
||||
}
|
||||
{
|
||||
let query = query_parser.parse_query("a b c").unwrap();
|
||||
let snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap();
|
||||
let snippet_generator =
|
||||
SnippetGenerator::create(&searcher, &*query, text_field).unwrap();
|
||||
assert_eq!(
|
||||
&btreemap!("a".to_string() => 0.25f32, "b".to_string() => 0.5),
|
||||
snippet_generator.terms_text()
|
||||
@@ -585,7 +591,8 @@ Survey in 2016, 2017, and 2018."#;
|
||||
let searcher = index.searcher();
|
||||
let query_parser = QueryParser::for_index(&index, vec![text_field]);
|
||||
let query = query_parser.parse_query("rust design").unwrap();
|
||||
let mut snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap();
|
||||
let mut snippet_generator =
|
||||
SnippetGenerator::create(&searcher, &*query, text_field).unwrap();
|
||||
{
|
||||
let snippet = snippet_generator.snippet(TEST_TEXT);
|
||||
assert_eq!(snippet.to_html(), "imperative-procedural paradigms. <b>Rust</b> is syntactically similar to C++[according to whom?],\nbut its <b>designers</b> intend it to provide better memory safety");
|
||||
|
||||
@@ -80,6 +80,7 @@ pub struct SegmentSpaceUsage {
|
||||
}
|
||||
|
||||
impl SegmentSpaceUsage {
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) fn new(
|
||||
num_docs: u32,
|
||||
termdict: PerFieldSpaceUsage,
|
||||
|
||||
@@ -95,10 +95,7 @@ impl StoreReader {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg_attr(
|
||||
feature = "cargo-clippy",
|
||||
allow(clippy::needless_pass_by_value)
|
||||
)]
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(clippy::needless_pass_by_value))]
|
||||
fn split_source(data: ReadOnlySource) -> (ReadOnlySource, ReadOnlySource, DocId) {
|
||||
let data_len = data.len();
|
||||
let footer_offset = data_len - size_of::<u64>() - size_of::<u32>();
|
||||
|
||||
@@ -53,7 +53,8 @@ impl<'a> TermMerger<'a> {
|
||||
.map(|(ord, streamer)| HeapItem {
|
||||
streamer,
|
||||
segment_ord: ord,
|
||||
}).collect(),
|
||||
})
|
||||
.collect(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -122,10 +123,7 @@ impl<'a> TermMerger<'a> {
|
||||
}
|
||||
|
||||
/// Iterates through terms
|
||||
#[cfg_attr(
|
||||
feature = "cargo-clippy",
|
||||
allow(clippy::should_implement_trait)
|
||||
)]
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(clippy::should_implement_trait))]
|
||||
pub fn next(&mut self) -> Option<Term<&[u8]>> {
|
||||
if self.advance() {
|
||||
Some(Term::wrap(self.current_streamers[0].streamer.key()))
|
||||
|
||||
@@ -66,7 +66,7 @@ mod tests {
|
||||
let write = directory.open_write(&path).unwrap();
|
||||
let field_type = FieldType::Str(TEXT);
|
||||
let mut term_dictionary_builder =
|
||||
TermDictionaryBuilder::new(write, &field_type).unwrap();
|
||||
TermDictionaryBuilder::create(write, &field_type).unwrap();
|
||||
for term in COUNTRIES.iter() {
|
||||
term_dictionary_builder
|
||||
.insert(term.as_bytes(), &make_term_info(0u64))
|
||||
@@ -92,7 +92,7 @@ mod tests {
|
||||
let write = directory.open_write(&path).unwrap();
|
||||
let field_type = FieldType::Str(TEXT);
|
||||
let mut term_dictionary_builder =
|
||||
TermDictionaryBuilder::new(write, &field_type).unwrap();
|
||||
TermDictionaryBuilder::create(write, &field_type).unwrap();
|
||||
term_dictionary_builder
|
||||
.insert("abc".as_bytes(), &make_term_info(34u64))
|
||||
.unwrap();
|
||||
@@ -167,7 +167,7 @@ mod tests {
|
||||
let mut term_string = String::new();
|
||||
while term_it.advance() {
|
||||
//let term = Term::from_bytes(term_it.key());
|
||||
term_string.push_str(unsafe { str::from_utf8_unchecked(term_it.key()) }); // ok test
|
||||
term_string.push_str(str::from_utf8(term_it.key()).expect("test"));
|
||||
}
|
||||
assert_eq!(&*term_string, "abcdef");
|
||||
}
|
||||
@@ -180,7 +180,7 @@ mod tests {
|
||||
let field_type = FieldType::Str(TEXT);
|
||||
let buffer: Vec<u8> = {
|
||||
let mut term_dictionary_builder =
|
||||
TermDictionaryBuilder::new(vec![], &field_type).unwrap();
|
||||
TermDictionaryBuilder::create(vec![], &field_type).unwrap();
|
||||
for &(ref id, ref i) in &ids {
|
||||
term_dictionary_builder
|
||||
.insert(id.as_bytes(), &make_term_info(*i as u64))
|
||||
@@ -210,7 +210,7 @@ mod tests {
|
||||
let field_type = FieldType::Str(TEXT);
|
||||
let buffer: Vec<u8> = {
|
||||
let mut term_dictionary_builder =
|
||||
TermDictionaryBuilder::new(vec![], &field_type).unwrap();
|
||||
TermDictionaryBuilder::create(vec![], &field_type).unwrap();
|
||||
// term requires more than 16bits
|
||||
term_dictionary_builder
|
||||
.insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1))
|
||||
@@ -245,7 +245,7 @@ mod tests {
|
||||
let field_type = FieldType::Str(TEXT);
|
||||
let buffer: Vec<u8> = {
|
||||
let mut term_dictionary_builder =
|
||||
TermDictionaryBuilder::new(vec![], &field_type).unwrap();
|
||||
TermDictionaryBuilder::create(vec![], &field_type).unwrap();
|
||||
for &(ref id, ref i) in &ids {
|
||||
term_dictionary_builder
|
||||
.insert(id.as_bytes(), &make_term_info(*i as u64))
|
||||
@@ -314,7 +314,7 @@ mod tests {
|
||||
let field_type = FieldType::Str(TEXT);
|
||||
let buffer: Vec<u8> = {
|
||||
let mut term_dictionary_builder =
|
||||
TermDictionaryBuilder::new(vec![], &field_type).unwrap();
|
||||
TermDictionaryBuilder::create(vec![], &field_type).unwrap();
|
||||
term_dictionary_builder
|
||||
.insert(&[], &make_term_info(1 as u64))
|
||||
.unwrap();
|
||||
@@ -338,7 +338,7 @@ mod tests {
|
||||
let field_type = FieldType::Str(TEXT);
|
||||
let buffer: Vec<u8> = {
|
||||
let mut term_dictionary_builder =
|
||||
TermDictionaryBuilder::new(vec![], &field_type).unwrap();
|
||||
TermDictionaryBuilder::create(vec![], &field_type).unwrap();
|
||||
for i in 0u8..10u8 {
|
||||
let number_arr = [i; 1];
|
||||
term_dictionary_builder
|
||||
@@ -408,7 +408,7 @@ mod tests {
|
||||
let write = directory.open_write(&path).unwrap();
|
||||
let field_type = FieldType::Str(TEXT);
|
||||
let mut term_dictionary_builder =
|
||||
TermDictionaryBuilder::new(write, &field_type).unwrap();
|
||||
TermDictionaryBuilder::create(write, &field_type).unwrap();
|
||||
for term in COUNTRIES.iter() {
|
||||
term_dictionary_builder
|
||||
.insert(term.as_bytes(), &make_term_info(0u64))
|
||||
|
||||
@@ -132,10 +132,7 @@ where
|
||||
}
|
||||
|
||||
/// Return the next `(key, value)` pair.
|
||||
#[cfg_attr(
|
||||
feature = "cargo-clippy",
|
||||
allow(clippy::should_implement_trait)
|
||||
)]
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(clippy::should_implement_trait))]
|
||||
pub fn next(&mut self) -> Option<(&[u8], &TermInfo)> {
|
||||
if self.advance() {
|
||||
Some((self.key(), self.value()))
|
||||
|
||||
@@ -29,7 +29,7 @@ where
|
||||
W: Write,
|
||||
{
|
||||
/// Creates a new `TermDictionaryBuilder`
|
||||
pub fn new(w: W, _field_type: &FieldType) -> io::Result<Self> {
|
||||
pub fn create(w: W, _field_type: &FieldType) -> io::Result<Self> {
|
||||
let fst_builder = fst::MapBuilder::new(w).map_err(convert_fst_error)?;
|
||||
Ok(TermDictionaryBuilder {
|
||||
fst_builder,
|
||||
@@ -132,7 +132,7 @@ impl TermDictionary {
|
||||
/// Creates an empty term dictionary which contains no terms.
|
||||
pub fn empty(field_type: &FieldType) -> Self {
|
||||
let term_dictionary_data: Vec<u8> =
|
||||
TermDictionaryBuilder::new(Vec::<u8>::new(), &field_type)
|
||||
TermDictionaryBuilder::create(Vec::<u8>::new(), &field_type)
|
||||
.expect("Creating a TermDictionaryBuilder in a Vec<u8> should never fail")
|
||||
.finish()
|
||||
.expect("Writing in a Vec<u8> should never fail");
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use super::{Token, TokenStream, Tokenizer};
|
||||
use schema::FACET_SEP_BYTE;
|
||||
use std::str;
|
||||
|
||||
/// The `FacetTokenizer` process a `Facet` binary representation
|
||||
/// and emits a token for all of its parent.
|
||||
@@ -57,12 +56,11 @@ impl<'a> TokenStream for FacetTokenStream<'a> {
|
||||
.position(|b| b == FACET_SEP_BYTE)
|
||||
.map(|pos| cursor + 1 + pos)
|
||||
{
|
||||
let facet_part =
|
||||
unsafe { str::from_utf8_unchecked(&bytes[cursor..next_sep_pos]) };
|
||||
let facet_part = &self.text[cursor..next_sep_pos];
|
||||
self.token.text.push_str(facet_part);
|
||||
self.state = State::UpToPosition(next_sep_pos);
|
||||
} else {
|
||||
let facet_part = unsafe { str::from_utf8_unchecked(&bytes[cursor..]) };
|
||||
let facet_part = &self.text[cursor..];
|
||||
self.token.text.push_str(facet_part);
|
||||
self.state = State::Terminated;
|
||||
}
|
||||
@@ -86,7 +84,6 @@ mod tests {
|
||||
|
||||
use super::FacetTokenizer;
|
||||
use schema::Facet;
|
||||
use std::str;
|
||||
use tokenizer::{Token, TokenStream, Tokenizer};
|
||||
|
||||
#[test]
|
||||
@@ -95,11 +92,11 @@ mod tests {
|
||||
let mut tokens = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| {
|
||||
let facet = unsafe { Facet::from_encoded(token.text.as_bytes().to_owned()) }; // ok test
|
||||
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap();
|
||||
tokens.push(format!("{}", facet));
|
||||
};
|
||||
FacetTokenizer
|
||||
.token_stream(unsafe { str::from_utf8_unchecked(facet.encoded_bytes()) })
|
||||
.token_stream(facet.encoded_str())
|
||||
.process(&mut add_token);
|
||||
}
|
||||
assert_eq!(tokens.len(), 4);
|
||||
@@ -115,11 +112,11 @@ mod tests {
|
||||
let mut tokens = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| {
|
||||
let facet = unsafe { Facet::from_encoded(token.text.as_bytes().to_owned()) }; // ok test
|
||||
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap(); // ok test
|
||||
tokens.push(format!("{}", facet));
|
||||
};
|
||||
FacetTokenizer
|
||||
.token_stream(unsafe { str::from_utf8_unchecked(facet.encoded_bytes()) }) // ok test
|
||||
.token_stream(facet.encoded_str()) // ok test
|
||||
.process(&mut add_token);
|
||||
}
|
||||
assert_eq!(tokens.len(), 1);
|
||||
|
||||