Compare commits


27 Commits

Author SHA1 Message Date
Paul Masurel
4d08713c73 Update CHANGELOG with date range queries 2021-04-19 09:33:56 +09:00
Paul Masurel
570c4fdbb1 Cargo check 2021-04-19 08:54:28 +09:00
Paul Masurel
00506594b9 Removing TermMerger::next().
Closing #933
2021-04-19 08:54:28 +09:00
Paul Masurel
3313fd39fc Edited CHANGELOG 2021-04-19 08:54:28 +09:00
Hardik Prajapati
9dd8c268bd Simplified chain orderings 2021-04-19 08:54:28 +09:00
Hardik Prajapati
942cbfb383 Fixed formatting using cargo fmt 2021-04-19 08:54:28 +09:00
Hardik Prajapati
5d1627e52d Implementation of Ord trait changed for Hit
- This will result in lexicographical ordering of facet in BinaryHeep in case of a tie
2021-04-19 08:54:28 +09:00
Hardik Prajapati
84cad5c1aa Added failing test for tie scenario in topk 2021-04-19 08:54:28 +09:00
Rihards Krišlauks
f58345f0f0 Add a date range query example to QueryParser documentation 2021-04-18 22:13:02 +03:00
Rihards Krišlauks
f518012656 Test flexible bounds in date range queries 2021-04-17 19:30:09 +03:00
Rihards Krišlauks
12fb9a95cb Clean up leftover debug comments 2021-04-17 18:52:44 +03:00
Rihards Krišlauks
55e79e34af Verified that the change in datetime range test was correct
The value that was previously there was 3, and it made the test fail when I
enabled it. Verified that it should indeed have been 2 (the testing
code previously contained an error).
2021-04-17 18:16:52 +03:00
Rihards Krišlauks
1649f31258 Make time zone parsing more strict to match rfc3339 2021-04-17 17:57:46 +03:00
Rihards Krišlauks
7849736d80 Move all of the datetime parsing code into a single function
For readability
2021-04-17 17:23:47 +03:00
Rihards Krišlauks
e58401be78 Implement date range support in the query parser
Tests pass but needs cleanup
2021-04-13 23:32:22 +03:00
Paul Masurel
be1d9e0db7 Marks list_all_segment_metas() as crate private
Closes #1004
2021-04-07 23:39:28 +09:00
Paul Masurel
5743b46457 Merge pull request #1006 from tantivy-search/feat-merge-splits
Implements merging several indices into a brand new index.

Closes #1005
2021-04-07 23:38:14 +09:00
Paul Masurel
e67e5ebd46 Minor syntax changes, and passing a tantivy Directory as argument
Closes #1005.
2021-04-07 23:35:03 +09:00
Evance Souamoro
a550c85369 fixed issues & added test on merge_segments feature 2021-04-06 16:15:09 +00:00
Evance Souamoro
b185df2b22 added a scratch implementation but still need to craft one detail and write a test to validate 2021-04-06 11:48:51 +00:00
Evance Souamoro
f82922b354 added a scratch implementation but still need to craft one detail and write a test to validate 2021-04-06 11:46:17 +00:00
Paul Masurel
86b30d9d7f Cargo fmt 2021-03-31 12:20:31 +09:00
Paul Masurel
f1499d5b3e Cargo fmt 2021-03-31 11:44:03 +09:00
Paul Masurel
30b6828d71 Update actions.md 2021-03-31 10:36:13 +09:00
Paul Masurel
e6b7b7da0a Create actions.md 2021-03-31 10:34:33 +09:00
Paul Masurel
38a20ae269 Renamed SegmentLocalId to SegmentOrdinal for more homogeneity and edited
changelog
2021-03-29 09:25:42 +09:00
Stéphane Campinas
a0ec6e1e9d Expand the DocAddress struct with named fields 2021-03-28 19:00:23 +02:00
33 changed files with 627 additions and 298 deletions

.github/ISSUE_TEMPLATE/actions.md

@@ -0,0 +1,13 @@
---
name: Actions
about: Actions not directly related to producing code.
---
# Actions title
Action description.
e.g.
- benchmark
- investigate and report
- etc.


@@ -3,8 +3,10 @@ Tantivy 0.15.0
- API Changes. Using Range instead of (start, end) in the API and internals (`FileSlice`, `OwnedBytes`, `Snippets`, ...)
This change is breaking but migration is trivial.
- Added a Histogram collector. (@fulmicoton) #994
- Added support for Option<TCollector>. (@fulmicoton)
- DocAddress is now a struct (@scampi) #987
- Bugfix consistent tie break handling in facet's topk (@hardikpnsp) #357
- Date field support for range queries (@rihardsk) #516
Tantivy 0.14.0
=========================
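One of the 0.15.0 entries above is `Collector` support for `Option<TCollector>` (the `impl<TCollector: Collector> Collector for Option<TCollector>` hunk appears further down). A minimal sketch of how that could be used to toggle collection at runtime; the schema is made up, and the `Option`-wrapped fruit type is an assumption based on that hunk, not confirmed here:

```rust
use tantivy::collector::Count;
use tantivy::query::AllQuery;
use tantivy::schema::{Schema, TEXT};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("text", TEXT); // hypothetical field
    let index = Index::create_in_ram(schema_builder.build());
    let searcher = index.reader()?.searcher();

    // Only count matches when the caller asked for it.
    let want_count = true;
    let maybe_count = if want_count { Some(Count) } else { None };

    // Presumably the fruit comes back wrapped in an Option as well.
    let count: Option<usize> = searcher.search(&AllQuery, &maybe_count)?;
    println!("count: {:?}", count);
    Ok(())
}
```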


@@ -1,11 +1,11 @@
use super::user_input_ast::{UserInputAST, UserInputBound, UserInputLeaf, UserInputLiteral};
use crate::Occur;
use combine::error::StringStreamError;
use combine::parser::char::{char, digit, letter, space, spaces, string};
use combine::parser::Parser;
use combine::{
attempt, choice, eof, many, many1, one_of, optional, parser, satisfy, skip_many1, value,
};
use combine::{error::StringStreamError, parser::combinator::recognize};
fn field<'a>() -> impl Parser<&'a str, Output = String> {
(
@@ -35,6 +35,62 @@ fn word<'a>() -> impl Parser<&'a str, Output = String> {
})
}
/// Parses a date time according to rfc3339
/// 2015-08-02T18:54:42+02
/// 2021-04-13T19:46:26.266051969+00:00
///
/// NOTE: also accepts 999999-99-99T99:99:99.266051969+99:99
/// We delegate rejecting such invalid dates to the logical AST computation code,
/// which invokes chrono::DateTime::parse_from_rfc3339 on the value to actually parse
/// it (instead of merely extracting the datetime value as a string, as done here).
fn date_time<'a>() -> impl Parser<&'a str, Output = String> {
let two_digits = || recognize::<String, _, _>((digit(), digit()));
// Parses a time zone
// -06:30
// Z
let time_zone = {
let utc = recognize::<String, _, _>(char('Z'));
let offset = recognize((
choice([char('-'), char('+')]),
two_digits(),
char(':'),
two_digits(),
));
utc.or(offset)
};
// Parses a date
// 2010-01-30
let date = {
recognize::<String, _, _>((
many1::<String, _, _>(digit()),
char('-'),
two_digits(),
char('-'),
two_digits(),
))
};
// Parses a time
// 12:30:02
// 19:46:26.266051969
let time = {
recognize::<String, _, _>((
two_digits(),
char(':'),
two_digits(),
char(':'),
two_digits(),
optional((char('.'), many1::<String, _, _>(digit()))),
time_zone,
))
};
recognize((date, char('T'), time))
}
fn term_val<'a>() -> impl Parser<&'a str, Output = String> {
let phrase = char('"').with(many1(satisfy(|c| c != '"'))).skip(char('"'));
phrase.or(word())
@@ -83,7 +139,8 @@ fn spaces1<'a>() -> impl Parser<&'a str, Output = ()> {
/// [a TO *], [a TO c], [abc TO bcd}
fn range<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
let range_term_val = || {
word()
attempt(date_time())
.or(word())
.or(negative_number())
.or(char('*').with(value("*".to_string())))
};
@@ -324,6 +381,22 @@ mod test {
error_parse("-1.");
}
#[test]
fn test_date_time() {
let (val, remaining) = date_time()
.parse("2015-08-02T18:54:42+02:30")
.expect("cannot parse date");
assert_eq!(val, "2015-08-02T18:54:42+02:30");
assert_eq!(remaining, "");
assert!(date_time().parse("2015-08-02T18:54:42+02").is_err());
let (val, remaining) = date_time()
.parse("2021-04-13T19:46:26.266051969+00:00")
.expect("cannot parse fractional date");
assert_eq!(val, "2021-04-13T19:46:26.266051969+00:00");
assert_eq!(remaining, "");
}
fn test_parse_query_to_ast_helper(query: &str, expected: &str) {
let query = parse_to_ast().parse(query).unwrap().0;
let query_str = format!("{:?}", query);
@@ -437,25 +510,60 @@ mod test {
#[test]
fn test_range_parser() {
// testing the range() parser separately
let res = range().parse("title: <hello").unwrap().0;
let res = range()
.parse("title: <hello")
.expect("Cannot parse felxible bound word")
.0;
let expected = UserInputLeaf::Range {
field: Some("title".to_string()),
lower: UserInputBound::Unbounded,
upper: UserInputBound::Exclusive("hello".to_string()),
};
let res2 = range().parse("title:{* TO hello}").unwrap().0;
let res2 = range()
.parse("title:{* TO hello}")
.expect("Cannot parse ununbounded to word")
.0;
assert_eq!(res, expected);
assert_eq!(res2, expected);
let expected_weight = UserInputLeaf::Range {
field: Some("weight".to_string()),
lower: UserInputBound::Inclusive("71.2".to_string()),
upper: UserInputBound::Unbounded,
};
let res3 = range().parse("weight: >=71.2").unwrap().0;
let res4 = range().parse("weight:[71.2 TO *}").unwrap().0;
let res3 = range()
.parse("weight: >=71.2")
.expect("Cannot parse flexible bound float")
.0;
let res4 = range()
.parse("weight:[71.2 TO *}")
.expect("Cannot parse float to unbounded")
.0;
assert_eq!(res3, expected_weight);
assert_eq!(res4, expected_weight);
let expected_dates = UserInputLeaf::Range {
field: Some("date_field".to_string()),
lower: UserInputBound::Exclusive("2015-08-02T18:54:42Z".to_string()),
upper: UserInputBound::Inclusive("2021-08-02T18:54:42+02:30".to_string()),
};
let res5 = range()
.parse("date_field:{2015-08-02T18:54:42Z TO 2021-08-02T18:54:42+02:30]")
.expect("Cannot parse date range")
.0;
assert_eq!(res5, expected_dates);
let expected_flexible_dates = UserInputLeaf::Range {
field: Some("date_field".to_string()),
lower: UserInputBound::Unbounded,
upper: UserInputBound::Inclusive("2021-08-02T18:54:42.12345+02:30".to_string()),
};
let res6 = range()
.parse("date_field: <=2021-08-02T18:54:42.12345+02:30")
.expect("Cannot parse date range")
.0;
assert_eq!(res6, expected_flexible_dates);
}
#[test]
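As the doc comment on `date_time()` notes, the grammar only checks the shape of the datetime; rejecting impossible dates is delegated to `chrono::DateTime::parse_from_rfc3339` in the logical AST computation. A standalone sketch of that second stage (plain chrono, outside tantivy):

```rust
use chrono::DateTime;

fn main() {
    // Shape-valid and a real instant: accepted.
    assert!(DateTime::parse_from_rfc3339("2015-08-02T18:54:42+02:30").is_ok());
    // Passes the grammar's shape check, but chrono rejects the impossible date.
    assert!(DateTime::parse_from_rfc3339("999999-99-99T99:99:99.266051969+99:99").is_err());
}
```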


@@ -2,7 +2,7 @@ use super::Collector;
use crate::collector::SegmentCollector;
use crate::DocId;
use crate::Score;
use crate::SegmentLocalId;
use crate::SegmentOrdinal;
use crate::SegmentReader;
/// `CountCollector` collector only counts how many
@@ -45,7 +45,7 @@ impl Collector for Count {
fn for_segment(
&self,
_: SegmentLocalId,
_: SegmentOrdinal,
_: &SegmentReader,
) -> crate::Result<SegmentCountCollector> {
Ok(SegmentCountCollector::default())


@@ -15,7 +15,7 @@ impl Collector for DocSetCollector {
fn for_segment(
&self,
segment_local_id: crate::SegmentLocalId,
segment_local_id: crate::SegmentOrdinal,
_segment: &crate::SegmentReader,
) -> crate::Result<Self::Child> {
Ok(DocSetChildCollector {
@@ -36,7 +36,7 @@ impl Collector for DocSetCollector {
let mut result = HashSet::with_capacity(len);
for (segment_local_id, docs) in segment_fruits {
for doc in docs {
result.insert(DocAddress(segment_local_id, doc));
result.insert(DocAddress::new(segment_local_id, doc));
}
}
Ok(result)


@@ -5,7 +5,7 @@ use crate::schema::Facet;
use crate::schema::Field;
use crate::DocId;
use crate::Score;
use crate::SegmentLocalId;
use crate::SegmentOrdinal;
use crate::SegmentReader;
use std::cmp::Ordering;
use std::collections::btree_map;
@@ -37,7 +37,10 @@ impl<'a> PartialOrd<Hit<'a>> for Hit<'a> {
impl<'a> Ord for Hit<'a> {
fn cmp(&self, other: &Self) -> Ordering {
other.count.cmp(&self.count)
other
.count
.cmp(&self.count)
.then(self.facet.cmp(other.facet))
}
}
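The new `cmp` orders hits by descending count and breaks ties lexicographically on the facet, which is what makes the `BinaryHeap` pops deterministic. A toy reproduction of the comparator (a simplified `Hit` with a `&str` facet, not tantivy's type):

```rust
use std::cmp::Ordering;

// Count descending, then facet ascending as a tie-breaker.
#[derive(PartialEq, Eq)]
struct Hit<'a> {
    count: u64,
    facet: &'a str,
}

impl<'a> Ord for Hit<'a> {
    fn cmp(&self, other: &Self) -> Ordering {
        other
            .count
            .cmp(&self.count)
            .then(self.facet.cmp(&other.facet))
    }
}

impl<'a> PartialOrd for Hit<'a> {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

fn main() {
    let a = Hit { count: 2, facet: "/facet/a" };
    let b = Hit { count: 2, facet: "/facet/b" };
    // Equal counts: "/facet/a" wins the tie.
    assert_eq!(a.cmp(&b), Ordering::Less);
}
```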
@@ -262,7 +265,7 @@ impl Collector for FacetCollector {
fn for_segment(
&self,
_: SegmentLocalId,
_: SegmentOrdinal,
reader: &SegmentReader,
) -> crate::Result<FacetSegmentCollector> {
let facet_reader = reader.facet_reader(self.field)?;
@@ -657,6 +660,41 @@ mod tests {
);
}
}
#[test]
fn test_facet_collector_topk_tie_break() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let facet_field = schema_builder.add_facet_field("facet", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let docs: Vec<Document> = vec![("b", 2), ("a", 2), ("c", 4)]
.into_iter()
.flat_map(|(c, count)| {
let facet = Facet::from(&format!("/facet/{}", c));
let doc = doc!(facet_field => facet);
iter::repeat(doc).take(count)
})
.collect();
let mut index_writer = index.writer_for_tests()?;
for doc in docs {
index_writer.add_document(doc);
}
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let mut facet_collector = FacetCollector::for_field(facet_field);
facet_collector.add_facet("/facet");
let counts: FacetCounts = searcher.search(&AllQuery, &facet_collector)?;
let facets: Vec<(&Facet, u64)> = counts.top_k("/facet", 2);
assert_eq!(
facets,
vec![(&Facet::from("/facet/c"), 4), (&Facet::from("/facet/a"), 2)]
);
Ok(())
}
}
#[cfg(all(test, feature = "unstable"))]


@@ -47,7 +47,7 @@ use crate::{Score, SegmentReader, TantivyError};
/// let top_docs = searcher.search(&query, &no_filter_collector).unwrap();
///
/// assert_eq!(top_docs.len(), 1);
/// assert_eq!(top_docs[0].1, DocAddress(0, 1));
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
///
/// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new(price, &|value| value < 5u64, TopDocs::with_limit(2));
/// let filtered_top_docs = searcher.search(&query, &filter_all_collector).unwrap();


@@ -106,7 +106,7 @@ impl Collector for HistogramCollector {
fn for_segment(
&self,
_segment_local_id: crate::SegmentLocalId,
_segment_local_id: crate::SegmentOrdinal,
segment: &crate::SegmentReader,
) -> crate::Result<Self::Child> {
let ff_reader = segment.fast_fields().u64_lenient(self.field)?;


@@ -86,7 +86,7 @@ See the `custom_collector` example.
use crate::DocId;
use crate::Score;
use crate::SegmentLocalId;
use crate::SegmentOrdinal;
use crate::SegmentReader;
use downcast_rs::impl_downcast;
@@ -155,7 +155,7 @@ pub trait Collector: Sync + Send {
/// on this segment.
fn for_segment(
&self,
segment_local_id: SegmentLocalId,
segment_local_id: SegmentOrdinal,
segment: &SegmentReader,
) -> crate::Result<Self::Child>;
@@ -214,7 +214,7 @@ impl<TCollector: Collector> Collector for Option<TCollector> {
fn for_segment(
&self,
segment_local_id: SegmentLocalId,
segment_local_id: SegmentOrdinal,
segment: &SegmentReader,
) -> crate::Result<Self::Child> {
Ok(if let Some(inner) = self {


@@ -3,7 +3,7 @@ use super::SegmentCollector;
use crate::collector::Fruit;
use crate::DocId;
use crate::Score;
use crate::SegmentLocalId;
use crate::SegmentOrdinal;
use crate::SegmentReader;
use crate::TantivyError;
use std::marker::PhantomData;
@@ -175,7 +175,7 @@ impl<'a> Collector for MultiCollector<'a> {
fn for_segment(
&self,
segment_local_id: SegmentLocalId,
segment_local_id: SegmentOrdinal,
segment: &SegmentReader,
) -> crate::Result<MultiCollectorChild> {
let children = self


@@ -5,7 +5,7 @@ use crate::fastfield::FastFieldReader;
use crate::schema::Field;
use crate::DocId;
use crate::Score;
use crate::SegmentLocalId;
use crate::SegmentOrdinal;
use crate::{DocAddress, Document, Searcher};
use crate::collector::{Count, FilterCollector, TopDocs};
@@ -53,7 +53,7 @@ pub fn test_filter_collector() {
let top_docs = searcher.search(&query, &filter_some_collector).unwrap();
assert_eq!(top_docs.len(), 1);
assert_eq!(top_docs[0].1, DocAddress(0, 1));
assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
let filter_all_collector: FilterCollector<_, _, u64> =
FilterCollector::new(price, &|value| value < 5u64, TopDocs::with_limit(2));
@@ -82,7 +82,7 @@ pub struct TestCollector {
}
pub struct TestSegmentCollector {
segment_id: SegmentLocalId,
segment_id: SegmentOrdinal,
fruit: TestFruit,
}
@@ -108,7 +108,7 @@ impl Collector for TestCollector {
fn for_segment(
&self,
segment_id: SegmentLocalId,
segment_id: SegmentOrdinal,
_reader: &SegmentReader,
) -> crate::Result<TestSegmentCollector> {
Ok(TestSegmentCollector {
@@ -126,7 +126,7 @@ impl Collector for TestCollector {
if fruit.docs().is_empty() {
0
} else {
fruit.docs()[0].segment_ord()
fruit.docs()[0].segment_ord
}
});
let mut docs = vec![];
@@ -143,7 +143,7 @@ impl SegmentCollector for TestSegmentCollector {
type Fruit = TestFruit;
fn collect(&mut self, doc: DocId, score: Score) {
self.fruit.docs.push(DocAddress(self.segment_id, doc));
self.fruit.docs.push(DocAddress::new(self.segment_id, doc));
self.fruit.scores.push(score);
}
@@ -177,7 +177,7 @@ impl Collector for FastFieldTestCollector {
fn for_segment(
&self,
_: SegmentLocalId,
_: SegmentOrdinal,
segment_reader: &SegmentReader,
) -> crate::Result<FastFieldSegmentCollector> {
let reader = segment_reader


@@ -1,6 +1,6 @@
use crate::DocAddress;
use crate::DocId;
use crate::SegmentLocalId;
use crate::SegmentOrdinal;
use crate::SegmentReader;
use std::cmp::Ordering;
use std::collections::BinaryHeap;
@@ -118,7 +118,7 @@ where
pub(crate) fn for_segment<F: PartialOrd>(
&self,
segment_id: SegmentLocalId,
segment_id: SegmentOrdinal,
_: &SegmentReader,
) -> TopSegmentCollector<F> {
TopSegmentCollector::new(segment_id, self.limit + self.offset)
@@ -147,29 +147,32 @@ where
pub(crate) struct TopSegmentCollector<T> {
limit: usize,
heap: BinaryHeap<ComparableDoc<T, DocId>>,
segment_id: u32,
segment_ord: u32,
}
impl<T: PartialOrd> TopSegmentCollector<T> {
fn new(segment_id: SegmentLocalId, limit: usize) -> TopSegmentCollector<T> {
fn new(segment_ord: SegmentOrdinal, limit: usize) -> TopSegmentCollector<T> {
TopSegmentCollector {
limit,
heap: BinaryHeap::with_capacity(limit),
segment_id,
segment_ord,
}
}
}
impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
pub fn harvest(self) -> Vec<(T, DocAddress)> {
let segment_id = self.segment_id;
let segment_ord = self.segment_ord;
self.heap
.into_sorted_vec()
.into_iter()
.map(|comparable_doc| {
(
comparable_doc.feature,
DocAddress(segment_id, comparable_doc.doc),
DocAddress {
segment_ord,
doc_id: comparable_doc.doc,
},
)
})
.collect()
@@ -220,9 +223,9 @@ mod tests {
assert_eq!(
top_collector.harvest(),
vec![
(0.8, DocAddress(0, 1)),
(0.3, DocAddress(0, 5)),
(0.2, DocAddress(0, 3))
(0.8, DocAddress::new(0, 1)),
(0.3, DocAddress::new(0, 5)),
(0.2, DocAddress::new(0, 3))
]
);
}
@@ -238,10 +241,10 @@ mod tests {
assert_eq!(
top_collector.harvest(),
vec![
(0.9, DocAddress(0, 7)),
(0.8, DocAddress(0, 1)),
(0.3, DocAddress(0, 5)),
(0.2, DocAddress(0, 3))
(0.9, DocAddress::new(0, 7)),
(0.8, DocAddress::new(0, 1)),
(0.3, DocAddress::new(0, 5)),
(0.2, DocAddress::new(0, 3))
]
);
}
@@ -276,17 +279,17 @@ mod tests {
let results = collector
.merge_fruits(vec![vec![
(0.9, DocAddress(0, 1)),
(0.8, DocAddress(0, 2)),
(0.7, DocAddress(0, 3)),
(0.6, DocAddress(0, 4)),
(0.5, DocAddress(0, 5)),
(0.9, DocAddress::new(0, 1)),
(0.8, DocAddress::new(0, 2)),
(0.7, DocAddress::new(0, 3)),
(0.6, DocAddress::new(0, 4)),
(0.5, DocAddress::new(0, 5)),
]])
.unwrap();
assert_eq!(
results,
vec![(0.8, DocAddress(0, 2)), (0.7, DocAddress(0, 3)),]
vec![(0.8, DocAddress::new(0, 2)), (0.7, DocAddress::new(0, 3)),]
);
}
@@ -295,10 +298,13 @@ mod tests {
let collector = TopCollector::with_limit(2).and_offset(1);
let results = collector
.merge_fruits(vec![vec![(0.9, DocAddress(0, 1)), (0.8, DocAddress(0, 2))]])
.merge_fruits(vec![vec![
(0.9, DocAddress::new(0, 1)),
(0.8, DocAddress::new(0, 2)),
]])
.unwrap();
assert_eq!(results, vec![(0.8, DocAddress(0, 2)),]);
assert_eq!(results, vec![(0.8, DocAddress::new(0, 2)),]);
}
#[test]
@@ -306,7 +312,10 @@ mod tests {
let collector = TopCollector::with_limit(2).and_offset(20);
let results = collector
.merge_fruits(vec![vec![(0.9, DocAddress(0, 1)), (0.8, DocAddress(0, 2))]])
.merge_fruits(vec![vec![
(0.9, DocAddress::new(0, 1)),
(0.8, DocAddress::new(0, 2)),
]])
.unwrap();
assert_eq!(results, vec![]);


@@ -10,7 +10,7 @@ use crate::schema::Field;
use crate::DocAddress;
use crate::DocId;
use crate::Score;
use crate::SegmentLocalId;
use crate::SegmentOrdinal;
use crate::SegmentReader;
use crate::{collector::custom_score_top_collector::CustomScoreTopCollector, fastfield::FastValue};
use crate::{collector::top_collector::TopSegmentCollector, TantivyError};
@@ -37,7 +37,7 @@ where
fn for_segment(
&self,
segment_local_id: crate::SegmentLocalId,
segment_local_id: crate::SegmentOrdinal,
segment: &SegmentReader,
) -> crate::Result<Self::Child> {
let schema = segment.schema();
@@ -113,8 +113,8 @@ where
/// let query = query_parser.parse_query("diary").unwrap();
/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2)).unwrap();
///
/// assert_eq!(top_docs[0].1, DocAddress(0, 1));
/// assert_eq!(top_docs[1].1, DocAddress(0, 3));
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
/// assert_eq!(top_docs[1].1, DocAddress::new(0, 3));
/// ```
pub struct TopDocs(TopCollector<Score>);
@@ -201,8 +201,8 @@ impl TopDocs {
/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2).and_offset(1)).unwrap();
///
/// assert_eq!(top_docs.len(), 2);
/// assert_eq!(top_docs[0].1, DocAddress(0, 4));
/// assert_eq!(top_docs[1].1, DocAddress(0, 3));
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 4));
/// assert_eq!(top_docs[1].1, DocAddress::new(0, 3));
/// ```
pub fn and_offset(self, offset: usize) -> TopDocs {
TopDocs(self.0.and_offset(offset))
@@ -243,8 +243,8 @@ impl TopDocs {
/// # let query = QueryParser::for_index(&index, vec![title]).parse_query("diary")?;
/// # let top_docs = docs_sorted_by_rating(&reader.searcher(), &query, rating)?;
/// # assert_eq!(top_docs,
/// # vec![(97u64, DocAddress(0u32, 1)),
/// # (80u64, DocAddress(0u32, 3))]);
/// # vec![(97u64, DocAddress::new(0u32, 1)),
/// # (80u64, DocAddress::new(0u32, 3))]);
/// # Ok(())
/// # }
/// /// Searches the document matching the given query, and
@@ -323,8 +323,8 @@ impl TopDocs {
/// # let reader = index.reader()?;
/// # let top_docs = docs_sorted_by_revenue(&reader.searcher(), &AllQuery, rating)?;
/// # assert_eq!(top_docs,
/// # vec![(119_000_000i64, DocAddress(0, 1)),
/// # (92_000_000i64, DocAddress(0, 0))]);
/// # vec![(119_000_000i64, DocAddress::new(0, 1)),
/// # (92_000_000i64, DocAddress::new(0, 0))]);
/// # Ok(())
/// # }
/// /// Searches the document matching the given query, and
@@ -600,7 +600,7 @@ impl Collector for TopDocs {
fn for_segment(
&self,
segment_local_id: SegmentLocalId,
segment_local_id: SegmentOrdinal,
reader: &SegmentReader,
) -> crate::Result<Self::Child> {
let collector = self.0.for_segment(segment_local_id, reader);
@@ -671,7 +671,15 @@ impl Collector for TopDocs {
let fruit = heap
.into_sorted_vec()
.into_iter()
.map(|cid| (cid.feature, DocAddress(segment_ord, cid.doc)))
.map(|cid| {
(
cid.feature,
DocAddress {
segment_ord,
doc_id: cid.doc,
},
)
})
.collect();
Ok(fruit)
}
@@ -741,9 +749,9 @@ mod tests {
assert_results_equals(
&score_docs,
&[
(0.81221175, DocAddress(0u32, 1)),
(0.5376842, DocAddress(0u32, 2)),
(0.48527452, DocAddress(0, 0)),
(0.81221175, DocAddress::new(0u32, 1)),
(0.5376842, DocAddress::new(0u32, 2)),
(0.48527452, DocAddress::new(0, 0)),
],
);
}
@@ -760,7 +768,7 @@ mod tests {
.searcher()
.search(&text_query, &TopDocs::with_limit(4).and_offset(2))
.unwrap();
assert_results_equals(&score_docs[..], &[(0.48527452, DocAddress(0, 0))]);
assert_results_equals(&score_docs[..], &[(0.48527452, DocAddress::new(0, 0))]);
}
#[test]
@@ -778,8 +786,8 @@ mod tests {
assert_results_equals(
&score_docs,
&[
(0.81221175, DocAddress(0u32, 1)),
(0.5376842, DocAddress(0u32, 2)),
(0.81221175, DocAddress::new(0u32, 1)),
(0.5376842, DocAddress::new(0u32, 2)),
],
);
}
@@ -799,8 +807,8 @@ mod tests {
assert_results_equals(
&score_docs[..],
&[
(0.5376842, DocAddress(0u32, 2)),
(0.48527452, DocAddress(0, 0)),
(0.5376842, DocAddress::new(0u32, 2)),
(0.48527452, DocAddress::new(0, 0)),
],
);
}
@@ -864,9 +872,9 @@ mod tests {
assert_eq!(
&top_docs[..],
&[
(64, DocAddress(0, 1)),
(16, DocAddress(0, 2)),
(12, DocAddress(0, 0))
(64, DocAddress::new(0, 1)),
(16, DocAddress::new(0, 2)),
(12, DocAddress::new(0, 0))
]
);
}
@@ -898,8 +906,8 @@ mod tests {
assert_eq!(
&top_docs[..],
&[
(mr_birthday, DocAddress(0, 1)),
(pr_birthday, DocAddress(0, 0)),
(mr_birthday, DocAddress::new(0, 1)),
(pr_birthday, DocAddress::new(0, 0)),
]
);
Ok(())
@@ -927,7 +935,10 @@ mod tests {
let top_docs: Vec<(i64, DocAddress)> = searcher.search(&AllQuery, &top_collector)?;
assert_eq!(
&top_docs[..],
&[(40i64, DocAddress(0, 1)), (-1i64, DocAddress(0, 0)),]
&[
(40i64, DocAddress::new(0, 1)),
(-1i64, DocAddress::new(0, 0)),
]
);
Ok(())
}
@@ -954,7 +965,10 @@ mod tests {
let top_docs: Vec<(f64, DocAddress)> = searcher.search(&AllQuery, &top_collector)?;
assert_eq!(
&top_docs[..],
&[(40f64, DocAddress(0, 1)), (-1.0f64, DocAddress(0, 0)),]
&[
(40f64, DocAddress::new(0, 1)),
(-1.0f64, DocAddress::new(0, 0)),
]
);
Ok(())
}
@@ -1034,7 +1048,7 @@ mod tests {
assert_eq!(
score_docs,
vec![(1, DocAddress(0, 1)), (0, DocAddress(0, 0)),]
vec![(1, DocAddress::new(0, 1)), (0, DocAddress::new(0, 0)),]
);
}
@@ -1056,7 +1070,7 @@ mod tests {
assert_eq!(
score_docs,
vec![(1, DocAddress(0, 1)), (0, DocAddress(0, 0)),]
vec![(1, DocAddress::new(0, 1)), (0, DocAddress::new(0, 0)),]
);
}


@@ -239,7 +239,7 @@ impl Index {
/// Such segments can of course be part of the index,
/// but also they could be segments being currently built or in the middle of a merge
/// operation.
pub fn list_all_segment_metas(&self) -> Vec<SegmentMeta> {
pub(crate) fn list_all_segment_metas(&self) -> Vec<SegmentMeta> {
self.inventory.all()
}


@@ -54,9 +54,8 @@ impl Searcher {
/// The searcher uses the segment ordinal to route
/// the request to the right `Segment`.
pub fn doc(&self, doc_address: DocAddress) -> crate::Result<Document> {
let DocAddress(segment_local_id, doc_id) = doc_address;
let store_reader = &self.store_readers[segment_local_id as usize];
store_reader.get(doc_id)
let store_reader = &self.store_readers[doc_address.segment_ord as usize];
store_reader.get(doc_address.doc_id)
}
/// Access the schema associated to the index of this searcher.


@@ -56,7 +56,7 @@ mod tests {
fn test_stored_bytes() -> crate::Result<()> {
let searcher = create_index_for_test(STORED)?;
assert_eq!(searcher.num_docs(), 1);
let retrieved_doc = searcher.doc(DocAddress(0u32, 0u32))?;
let retrieved_doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
let field = searcher.schema().get_field("string_bytes").unwrap();
let values: Vec<&Value> = retrieved_doc.get_all(field).collect();
assert_eq!(values.len(), 2);
@@ -72,7 +72,7 @@ mod tests {
fn test_non_stored_bytes() -> crate::Result<()> {
let searcher = create_index_for_test(INDEXED)?;
assert_eq!(searcher.num_docs(), 1);
let retrieved_doc = searcher.doc(DocAddress(0u32, 0u32))?;
let retrieved_doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
let field = searcher.schema().get_field("string_bytes").unwrap();
assert!(retrieved_doc.get_first(field).is_none());
Ok(())


@@ -105,7 +105,7 @@ mod tests {
let mut facet_ords = Vec::new();
facet_reader.facet_ords(0u32, &mut facet_ords);
assert_eq!(&facet_ords, &[2u64]);
let doc = searcher.doc(DocAddress(0u32, 0u32))?;
let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
let value = doc.get_first(facet_field).and_then(Value::path);
assert_eq!(value, None);
Ok(())
@@ -128,7 +128,7 @@ mod tests {
let mut facet_ords = Vec::new();
facet_reader.facet_ords(0u32, &mut facet_ords);
assert!(facet_ords.is_empty());
let doc = searcher.doc(DocAddress(0u32, 0u32))?;
let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
let value = doc.get_first(facet_field).and_then(Value::path);
assert_eq!(value, Some("/a/b".to_string()));
Ok(())
@@ -151,7 +151,7 @@ mod tests {
let mut facet_ords = Vec::new();
facet_reader.facet_ords(0u32, &mut facet_ords);
assert_eq!(&facet_ords, &[2u64]);
let doc = searcher.doc(DocAddress(0u32, 0u32))?;
let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
let value = doc.get_first(facet_field).and_then(Value::path);
assert_eq!(value, Some("/a/b".to_string()));
Ok(())
@@ -174,7 +174,7 @@ mod tests {
let mut facet_ords = Vec::new();
facet_reader.facet_ords(0u32, &mut facet_ords);
assert!(facet_ords.is_empty());
let doc = searcher.doc(DocAddress(0u32, 0u32))?;
let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
let value = doc.get_first(facet_field).and_then(Value::path);
assert_eq!(value, None);
Ok(())


@@ -147,37 +147,50 @@ mod tests {
}
}
// TODO: support Date range queries
// {
// let parser = QueryParser::for_index(&index, vec![date_field]);
// let range_q = format!("\"{}\"..\"{}\"",
// (first_time_stamp + Duration::seconds(1)).to_rfc3339(),
// (first_time_stamp + Duration::seconds(3)).to_rfc3339()
// );
// let query = parser.parse_query(&range_q)
// .expect("could not parse query");
// let results = searcher.search(&query, &TopDocs::with_limit(5))
// .expect("could not query index");
//
//
// assert_eq!(results.len(), 2);
// for (i, doc_pair) in results.iter().enumerate() {
// let retrieved_doc = searcher.doc(doc_pair.1).expect("cannot fetch doc");
// let offset_sec = match i {
// 0 => 1,
// 1 => 3,
// _ => panic!("should not have more than 2 docs")
// };
// let time_i_val = match i {
// 0 => 2,
// 1 => 3,
// _ => panic!("should not have more than 2 docs")
// };
// assert_eq!(retrieved_doc.get_first(date_field).expect("cannot find value").date_value().timestamp(),
// (first_time_stamp + Duration::seconds(offset_sec)).timestamp());
// assert_eq!(retrieved_doc.get_first(time_i).expect("cannot find value").i64_value(), time_i_val);
// }
// }
{
let parser = QueryParser::for_index(&index, vec![date_field]);
let range_q = format!(
"[{} TO {}]",
(first_time_stamp + Duration::seconds(1)).to_rfc3339(),
(first_time_stamp + Duration::seconds(3)).to_rfc3339()
);
let query = parser.parse_query(&range_q).expect("could not parse query");
let results = searcher
.search(&query, &TopDocs::with_limit(5))
.expect("could not query index");
assert_eq!(results.len(), 2);
for (i, doc_pair) in results.iter().enumerate() {
let retrieved_doc = searcher.doc(doc_pair.1).expect("cannot fetch doc");
let offset_sec = match i {
0 => 1,
1 => 2,
_ => panic!("should not have more than 2 docs"),
};
let time_i_val = match i {
0 => 2,
1 => 3,
_ => panic!("should not have more than 2 docs"),
};
assert_eq!(
retrieved_doc
.get_first(date_field)
.expect("cannot find value")
.date_value()
.expect("value not of Date type")
.timestamp(),
(first_time_stamp + Duration::seconds(offset_sec)).timestamp()
);
assert_eq!(
retrieved_doc
.get_first(time_i)
.expect("cannot find value")
.i64_value()
.expect("value not of i64 type"),
time_i_val
);
}
}
}
#[test]


@@ -798,49 +798,53 @@ mod tests {
{
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "a")])?,
vec![DocAddress(0, 1), DocAddress(0, 2), DocAddress(0, 4)]
vec![
DocAddress::new(0, 1),
DocAddress::new(0, 2),
DocAddress::new(0, 4)
]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "af")])?,
vec![DocAddress(0, 0), DocAddress(0, 3)]
vec![DocAddress::new(0, 0), DocAddress::new(0, 3)]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "g")])?,
vec![DocAddress(0, 4)]
vec![DocAddress::new(0, 4)]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "b")])?,
vec![
DocAddress(0, 0),
DocAddress(0, 1),
DocAddress(0, 2),
DocAddress(0, 3),
DocAddress(0, 4)
DocAddress::new(0, 0),
DocAddress::new(0, 1),
DocAddress::new(0, 2),
DocAddress::new(0, 3),
DocAddress::new(0, 4)
]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_date(date_field, &curr_time)])?,
vec![DocAddress(0, 0), DocAddress(0, 3)]
vec![DocAddress::new(0, 0), DocAddress::new(0, 3)]
);
}
{
let doc = searcher.doc(DocAddress(0, 0))?;
let doc = searcher.doc(DocAddress::new(0, 0))?;
assert_eq!(doc.get_first(text_field).unwrap().text(), Some("af b"));
}
{
let doc = searcher.doc(DocAddress(0, 1))?;
let doc = searcher.doc(DocAddress::new(0, 1))?;
assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c"));
}
{
let doc = searcher.doc(DocAddress(0, 2))?;
let doc = searcher.doc(DocAddress::new(0, 2))?;
assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c d"));
}
{
let doc = searcher.doc(DocAddress(0, 3))?;
let doc = searcher.doc(DocAddress::new(0, 3))?;
assert_eq!(doc.get_first(text_field).unwrap().text(), Some("af b"));
}
{
let doc = searcher.doc(DocAddress(0, 4))?;
let doc = searcher.doc(DocAddress::new(0, 4))?;
assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c g"));
}
{


@@ -24,6 +24,7 @@ pub use self::prepared_commit::PreparedCommit;
pub use self::segment_entry::SegmentEntry;
pub use self::segment_manager::SegmentManager;
pub use self::segment_serializer::SegmentSerializer;
pub use self::segment_updater::merge_segments;
pub use self::segment_writer::SegmentWriter;
/// Alias for the default merge policy, which is the `LogMergePolicy`.


@@ -114,7 +114,7 @@ fn merge(
// first we need to apply deletes to our segment.
let merged_segment = index.new_segment();
// First we apply all of the delet to the merged segment, up to the target opstamp.
// First we apply all of the delete to the merged segment, up to the target opstamp.
for segment_entry in &mut segment_entries {
let segment = index.segment(segment_entry.meta().clone());
advance_deletes(segment, segment_entry, target_opstamp)?;
@@ -141,6 +141,81 @@ fn merge(
Ok(SegmentEntry::new(segment_meta, delete_cursor, None))
}
/// Advanced: Merges a list of segments from different indices into a new index.
///
/// Returns `TantivyError` if the indices list is empty or their
/// schemas don't match.
///
/// `output_directory` is assumed to be empty.
///
/// # Warning
/// This function does NOT check whether an `IndexWriter` is running. It is not
/// meant to be used while an `IndexWriter` is running for the origin indices or
/// the destination `Index`.
#[doc(hidden)]
pub fn merge_segments<Dir: Directory>(
indices: &[Index],
output_directory: Dir,
) -> crate::Result<Index> {
if indices.is_empty() {
// If there are no indices to merge, there is no need to do anything.
return Err(crate::TantivyError::InvalidArgument(
"No indices given to marge".to_string(),
));
}
let target_schema = indices[0].schema();
// let's check that all of the indices have the same schema
if indices
.iter()
.skip(1)
.any(|index| index.schema() != target_schema)
{
return Err(crate::TantivyError::InvalidArgument(
"Attempt to merge different schema indices".to_string(),
));
}
let mut segments: Vec<Segment> = Vec::new();
for index in indices {
segments.extend(index.searchable_segments()?);
}
let mut merged_index = Index::create(output_directory, target_schema.clone())?;
let merged_segment = merged_index.new_segment();
let merged_segment_id = merged_segment.id();
let merger: IndexMerger = IndexMerger::open(merged_index.schema(), &segments[..])?;
let segment_serializer = SegmentSerializer::for_segment(merged_segment)?;
let num_docs = merger.write(segment_serializer)?;
let segment_meta = merged_index.new_segment_meta(merged_segment_id, num_docs);
let stats = format!(
"Segments Merge: [{}]",
segments
.iter()
.fold(String::new(), |sum, current| format!(
"{}{} ",
sum,
current.meta().id().uuid_string()
))
.trim_end()
);
let index_meta = IndexMeta {
segments: vec![segment_meta],
schema: target_schema,
opstamp: 0u64,
payload: Some(stats),
};
// save the meta.json
save_metas(&index_meta, merged_index.directory_mut())?;
Ok(merged_index)
}
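A sketch of calling the new helper, mirroring the `test_merge_segments` test below; the indices are assumed to share a schema and to have no `IndexWriter` running:

```rust
use tantivy::directory::RAMDirectory;
use tantivy::{merge_segments, Index};

// Merge every searchable segment of `indices` into a brand-new index
// backed by a fresh (empty) RAMDirectory.
fn merge_all(indices: &[Index]) -> tantivy::Result<Index> {
    merge_segments(indices, RAMDirectory::default())
}
```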
pub(crate) struct InnerSegmentUpdater {
// we keep a copy of the current active IndexMeta to
// avoid loading the file every time we need it in the
@@ -479,7 +554,7 @@ impl SegmentUpdater {
if delete_operation.opstamp < committed_opstamp {
let index = &segment_updater.index;
let segment = index.segment(after_merge_segment_entry.meta().clone());
if let Err(e) = advance_deletes(
if let Err(advance_deletes_err) = advance_deletes(
segment,
&mut after_merge_segment_entry,
committed_opstamp,
@@ -487,7 +562,7 @@ impl SegmentUpdater {
error!(
"Merge of {:?} was cancelled (advancing deletes failed): {:?}",
merge_operation.segment_ids(),
e
advance_deletes_err
);
if cfg!(test) {
panic!("Merge failed.");
@@ -495,7 +570,7 @@ impl SegmentUpdater {
// ... cancel merge
// `merge_operations` are tracked. As it is dropped, the
// the segment_ids will be available again for merge.
return Err(e);
return Err(advance_deletes_err);
}
}
}
@@ -540,158 +615,201 @@ impl SegmentUpdater {
#[cfg(test)]
mod tests {
use super::merge_segments;
use crate::directory::RAMDirectory;
use crate::indexer::merge_policy::tests::MergeWheneverPossible;
use crate::schema::*;
use crate::Index;
#[test]
fn test_delete_during_merge() {
fn test_delete_during_merge() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let index = Index::create_in_ram(schema_builder.build());
// writing the segment
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(MergeWheneverPossible));
{
for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"b"));
}
assert!(index_writer.commit().is_ok());
for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"b"));
}
index_writer.commit()?;
{
for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"c"));
index_writer.add_document(doc!(text_field=>"d"));
}
assert!(index_writer.commit().is_ok());
for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"c"));
index_writer.add_document(doc!(text_field=>"d"));
}
index_writer.commit()?;
{
index_writer.add_document(doc!(text_field=>"e"));
index_writer.add_document(doc!(text_field=>"f"));
assert!(index_writer.commit().is_ok());
}
index_writer.add_document(doc!(text_field=>"e"));
index_writer.add_document(doc!(text_field=>"f"));
index_writer.commit()?;
{
let term = Term::from_field_text(text_field, "a");
index_writer.delete_term(term);
assert!(index_writer.commit().is_ok());
}
let reader = index.reader().unwrap();
let term = Term::from_field_text(text_field, "a");
index_writer.delete_term(term);
index_writer.commit()?;
let reader = index.reader()?;
assert_eq!(reader.searcher().num_docs(), 302);
{
index_writer
.wait_merging_threads()
.expect("waiting for merging threads");
}
index_writer.wait_merging_threads()?;
reader.reload().unwrap();
reader.reload()?;
assert_eq!(reader.searcher().segment_readers().len(), 1);
assert_eq!(reader.searcher().num_docs(), 302);
Ok(())
}
#[test]
fn delete_all_docs() {
fn delete_all_docs() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let index = Index::create_in_ram(schema_builder.build());
// writing the segment
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests()?;
{
for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"b"));
}
assert!(index_writer.commit().is_ok());
for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"b"));
}
index_writer.commit()?;
for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"c"));
index_writer.add_document(doc!(text_field=>"d"));
}
index_writer.commit()?;
index_writer.add_document(doc!(text_field=>"e"));
index_writer.add_document(doc!(text_field=>"f"));
index_writer.commit()?;
let seg_ids = index.searchable_segment_ids()?;
// docs exist, should have at least 1 segment
assert!(seg_ids.len() > 0);
let term_vals = vec!["a", "b", "c", "d", "e", "f"];
for term_val in term_vals {
let term = Term::from_field_text(text_field, term_val);
index_writer.delete_term(term);
index_writer.commit()?;
}
{
for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"c"));
index_writer.add_document(doc!(text_field=>"d"));
}
assert!(index_writer.commit().is_ok());
}
index_writer.wait_merging_threads()?;
{
index_writer.add_document(doc!(text_field=>"e"));
index_writer.add_document(doc!(text_field=>"f"));
assert!(index_writer.commit().is_ok());
}
{
let seg_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
// docs exist, should have at least 1 segment
assert!(seg_ids.len() > 0);
}
{
let term_vals = vec!["a", "b", "c", "d", "e", "f"];
for term_val in term_vals {
let term = Term::from_field_text(text_field, term_val);
index_writer.delete_term(term);
assert!(index_writer.commit().is_ok());
}
}
{
index_writer
.wait_merging_threads()
.expect("waiting for merging threads");
}
let reader = index.reader().unwrap();
let reader = index.reader()?;
assert_eq!(reader.searcher().num_docs(), 0);
let seg_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let seg_ids = index.searchable_segment_ids()?;
assert!(seg_ids.is_empty());
reader.reload().unwrap();
reader.reload()?;
assert_eq!(reader.searcher().num_docs(), 0);
// empty segments should be erased
assert!(index.searchable_segment_metas().unwrap().is_empty());
assert!(index.searchable_segment_metas()?.is_empty());
assert!(reader.searcher().segment_readers().is_empty());
Ok(())
}
#[test]
fn test_remove_all_segments() {
fn test_remove_all_segments() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let index = Index::create_in_ram(schema_builder.build());
// writing the segment
let mut index_writer = index.writer_for_tests().unwrap();
{
for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"b"));
}
assert!(index_writer.commit().is_ok());
let mut index_writer = index.writer_for_tests()?;
for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"b"));
}
index_writer.commit()?;
index_writer.segment_updater().remove_all_segments();
let seg_vec = index_writer
.segment_updater()
.segment_manager
.segment_entries();
assert!(seg_vec.is_empty());
Ok(())
}
#[test]
fn test_merge_segments() -> crate::Result<()> {
let mut indices = vec![];
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
for _ in 0..3 {
let index = Index::create_in_ram(schema.clone());
// writing two segments
let mut index_writer = index.writer_for_tests()?;
for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"fizz"));
index_writer.add_document(doc!(text_field=>"buzz"));
}
index_writer.commit()?;
for _ in 0..1000 {
index_writer.add_document(doc!(text_field=>"foo"));
index_writer.add_document(doc!(text_field=>"bar"));
}
index_writer.commit()?;
indices.push(index);
}
assert_eq!(indices.len(), 3);
let output_directory = RAMDirectory::default();
let index = merge_segments(&indices, output_directory)?;
assert_eq!(index.schema(), schema);
let segments = index.searchable_segments()?;
assert_eq!(segments.len(), 1);
let segment_metas = segments[0].meta();
assert_eq!(segment_metas.num_deleted_docs(), 0);
assert_eq!(segment_metas.num_docs(), 6600);
Ok(())
}
#[test]
fn test_merge_empty_indices_array() {
let merge_result = merge_segments(&[], RAMDirectory::default());
assert!(merge_result.is_err());
}
#[test]
fn test_merge_mismatched_schema() -> crate::Result<()> {
let first_index = {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"some text"));
index_writer.commit()?;
index
};
let second_index = {
let mut schema_builder = Schema::builder();
let body_field = schema_builder.add_text_field("body", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(body_field=>"some body"));
index_writer.commit()?;
index
};
// mismatched schema index list
let result = merge_segments(&[first_index, second_index], RAMDirectory::default());
assert!(result.is_err());
Ok(())
}
}


@@ -163,6 +163,7 @@ pub use crate::core::{Executor, SegmentComponent};
pub use crate::core::{Index, IndexMeta, Searcher, Segment, SegmentId, SegmentMeta};
pub use crate::core::{InvertedIndexReader, SegmentReader};
pub use crate::directory::Directory;
pub use crate::indexer::merge_segments;
pub use crate::indexer::operation::UserOperation;
pub use crate::indexer::IndexWriter;
pub use crate::postings::Postings;
@@ -254,20 +255,16 @@ pub type Opstamp = u64;
/// the document to the search query.
pub type Score = f32;
/// A `SegmentLocalId` identifies a segment.
/// It only makes sense for a given searcher.
pub type SegmentLocalId = u32;
/// A `SegmentOrdinal` identifies a segment, within a `Searcher`.
pub type SegmentOrdinal = u32;
impl DocAddress {
/// Return the segment ordinal id that identifies the segment
/// hosting the document in the `Searcher` it is called from.
pub fn segment_ord(self) -> SegmentLocalId {
self.0
}
/// Return the segment-local `DocId`
pub fn doc(self) -> DocId {
self.1
}
/// Creates a new DocAddress from the segment/docId pair.
pub fn new(segment_ord: SegmentOrdinal, doc_id: DocId) -> DocAddress {
DocAddress {
segment_ord,
doc_id,
}
}
}
@@ -280,7 +277,13 @@ impl DocAddress {
/// The id used for the segment is actually an ordinal
/// in the list of `Segment`s held by a `Searcher`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct DocAddress(pub SegmentLocalId, pub DocId);
pub struct DocAddress {
/// The segment ordinal id that identifies the segment
/// hosting the document in the `Searcher` it is called from.
pub segment_ord: SegmentOrdinal,
/// The segment-local `DocId`.
pub doc_id: DocId,
}
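With the tuple struct gone, call sites move from `DocAddress(seg, doc)` plus `.0`/`.1` to the constructor and named fields, as the many mechanical replacements above show. A small before/after sketch:

```rust
use tantivy::{DocAddress, DocId, SegmentOrdinal};

fn main() {
    // 0.14: let addr = DocAddress(0, 1); let doc = addr.1;
    // 0.15: named fields with a constructor.
    let addr = DocAddress::new(0, 1);
    let segment: SegmentOrdinal = addr.segment_ord;
    let doc_id: DocId = addr.doc_id;
    assert_eq!((segment, doc_id), (0, 1));
}
```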
#[cfg(test)]
mod tests {
@@ -778,30 +781,38 @@ mod tests {
};
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "a")])?,
vec![DocAddress(0, 1), DocAddress(0, 2)]
vec![DocAddress::new(0, 1), DocAddress::new(0, 2)]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "af")])?,
vec![DocAddress(0, 0)]
vec![DocAddress::new(0, 0)]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "b")])?,
vec![DocAddress(0, 0), DocAddress(0, 1), DocAddress(0, 2)]
vec![
DocAddress::new(0, 0),
DocAddress::new(0, 1),
DocAddress::new(0, 2)
]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "c")])?,
vec![DocAddress(0, 1), DocAddress(0, 2)]
vec![DocAddress::new(0, 1), DocAddress::new(0, 2)]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "d")])?,
vec![DocAddress(0, 2)]
vec![DocAddress::new(0, 2)]
);
assert_eq!(
get_doc_ids(vec![
Term::from_field_text(text_field, "b"),
Term::from_field_text(text_field, "a"),
])?,
vec![DocAddress(0, 0), DocAddress(0, 1), DocAddress(0, 2)]
vec![
DocAddress::new(0, 0),
DocAddress::new(0, 1),
DocAddress::new(0, 2)
]
);
Ok(())
}


@@ -238,9 +238,9 @@ mod tests {
assert_eq!(
docs,
vec![
DocAddress(0u32, 1u32),
DocAddress(0u32, 2u32),
DocAddress(0u32, 3u32)
DocAddress::new(0u32, 1u32),
DocAddress::new(0u32, 2u32),
DocAddress::new(0u32, 3u32)
]
.into_iter()
.collect()
@@ -264,15 +264,24 @@ mod tests {
BooleanQuery::intersection(vec![term_b.box_clone(), term_c.box_clone()]);
{
let docs = searcher.search(&intersection_ab, &DocSetCollector)?;
assert_eq!(docs, vec![DocAddress(0u32, 2u32)].into_iter().collect());
assert_eq!(
docs,
vec![DocAddress::new(0u32, 2u32)].into_iter().collect()
);
}
{
let docs = searcher.search(&intersection_ac, &DocSetCollector)?;
assert_eq!(docs, vec![DocAddress(0u32, 1u32)].into_iter().collect());
assert_eq!(
docs,
vec![DocAddress::new(0u32, 1u32)].into_iter().collect()
);
}
{
let docs = searcher.search(&intersection_bc, &DocSetCollector)?;
assert_eq!(docs, vec![DocAddress(0u32, 0u32)].into_iter().collect());
assert_eq!(
docs,
vec![DocAddress::new(0u32, 0u32)].into_iter().collect()
);
}
Ok(())
}


@@ -128,7 +128,7 @@ mod tests {
.docs()
.iter()
.cloned()
.map(|doc| doc.1)
.map(|doc| doc.doc_id)
.collect::<Vec<DocId>>()
};
{
@@ -196,8 +196,8 @@ mod tests {
let topdocs_no_excluded = matching_topdocs(&boolean_query_no_excluded);
assert_eq!(topdocs_no_excluded.len(), 2);
let (top_score, top_doc) = topdocs_no_excluded[0];
assert_eq!(top_doc, DocAddress(0, 4));
assert_eq!(topdocs_no_excluded[1].1, DocAddress(0, 3)); // ignore score of doc 3.
assert_eq!(top_doc, DocAddress::new(0, 4));
assert_eq!(topdocs_no_excluded[1].1, DocAddress::new(0, 3)); // ignore score of doc 3.
score_doc_4 = top_score;
}
@@ -210,7 +210,7 @@ mod tests {
let topdocs_excluded = matching_topdocs(&boolean_query_two_excluded);
assert_eq!(topdocs_excluded.len(), 1);
let (top_score, top_doc) = topdocs_excluded[0];
assert_eq!(top_doc, DocAddress(0, 4));
assert_eq!(top_doc, DocAddress::new(0, 4));
assert_eq!(top_score, score_doc_4);
}
}
@@ -309,7 +309,7 @@ mod tests {
IndexRecordOption::Basic,
));
let query = BooleanQuery::from(vec![(Occur::Should, term_a), (Occur::Should, term_b)]);
let explanation = query.explain(&searcher, DocAddress(0, 0u32))?;
let explanation = query.explain(&searcher, DocAddress::new(0, 0u32))?;
assert_nearly_equals!(explanation.value(), 0.6931472);
Ok(())
}


@@ -150,7 +150,7 @@ mod tests {
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let query = BoostQuery::new(Box::new(AllQuery), 0.2);
let explanation = query.explain(&searcher, DocAddress(0, 0u32)).unwrap();
let explanation = query.explain(&searcher, DocAddress::new(0, 0u32)).unwrap();
assert_eq!(
explanation.to_pretty_json(),
"{\n \"value\": 0.2,\n \"description\": \"Boost x0.2 of ...\",\n \"details\": [\n {\n \"value\": 1.0,\n \"description\": \"AllQuery\",\n \"context\": []\n }\n ],\n \"context\": []\n}"


@@ -58,7 +58,7 @@ pub mod tests {
test_fruits
.docs()
.iter()
.map(|docaddr| docaddr.1)
.map(|docaddr| docaddr.doc_id)
.collect::<Vec<_>>()
};
assert_eq!(test_query(vec!["a", "b"]), vec![1, 2, 3, 4]);
@@ -109,7 +109,7 @@ pub mod tests {
test_fruits
.docs()
.iter()
.map(|docaddr| docaddr.1)
.map(|docaddr| docaddr.doc_id)
.collect::<Vec<_>>()
};
assert_eq!(test_query(vec!["a", "b", "c"]), vec![2, 4]);
@@ -206,8 +206,8 @@ pub mod tests {
.docs()
.to_vec()
};
assert_eq!(test_query(vec!["a", "b"]), vec![DocAddress(0, 1)]);
assert_eq!(test_query(vec!["b", "a"]), vec![DocAddress(0, 2)]);
assert_eq!(test_query(vec!["a", "b"]), vec![DocAddress::new(0, 1)]);
assert_eq!(test_query(vec!["b", "a"]), vec![DocAddress::new(0, 2)]);
}
#[test] // motivated by #234
@@ -233,7 +233,7 @@ pub mod tests {
.expect("search should succeed")
.docs()
.iter()
.map(|doc_address| doc_address.1)
.map(|doc_address| doc_address.doc_id)
.collect::<Vec<DocId>>()
};
assert_eq!(test_query(vec![(0, "a"), (1, "b")]), vec![0]);


@@ -51,9 +51,9 @@ pub trait Query: QueryClone + Send + Sync + downcast_rs::Downcast + fmt::Debug {
/// Returns an `Explanation` for the score of the document.
fn explain(&self, searcher: &Searcher, doc_address: DocAddress) -> crate::Result<Explanation> {
let reader = searcher.segment_reader(doc_address.segment_ord());
let reader = searcher.segment_reader(doc_address.segment_ord);
let weight = self.weight(searcher, true)?;
weight.explain(reader, doc_address.doc())
weight.explain(reader, doc_address.doc_id)
}
/// Returns the number of documents matching the query.


@@ -157,7 +157,8 @@ fn trim_ast(logical_ast: LogicalAST) -> Option<LogicalAST> {
/// a word lexicographically between `a` and `c` (inclusive lower bound, exclusive upper bound).
/// Inclusive bounds are `[]`, exclusive are `{}`.
///
/// * date values: The query parser supports rfc3339 formatted dates. For example "2002-10-02T15:00:00.05Z"
/// * date values: The query parser supports rfc3339 formatted dates. For example `"2002-10-02T15:00:00.05Z"`
/// or `some_date_field:[2002-10-02T15:00:00Z TO 2002-10-02T18:00:00Z}`
///
/// * all docs query: A plain `*` will match all documents in the index.
///
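A sketch of the date-range syntax the parser now accepts, assembled from the grammar and tests in this changeset; the schema and field name (`timestamp`) are made up for illustration:

```rust
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, INDEXED};
use tantivy::Index;

fn main() {
    let mut schema_builder = Schema::builder();
    // Hypothetical indexed date field.
    let timestamp = schema_builder.add_date_field("timestamp", INDEXED);
    let index = Index::create_in_ram(schema_builder.build());

    let parser = QueryParser::for_index(&index, vec![timestamp]);
    // Inclusive lower bound ([), exclusive upper bound (}).
    let query = parser
        .parse_query("timestamp:[2002-10-02T15:00:00Z TO 2002-10-02T18:00:00Z}")
        .expect("could not parse query");
    println!("{:?}", query);
}
```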


@@ -196,18 +196,18 @@ mod tests {
let term_query = TermQuery::new(term_a, IndexRecordOption::Basic);
let searcher = index.reader()?.searcher();
{
let explanation = term_query.explain(&searcher, DocAddress(0u32, 1u32))?;
let explanation = term_query.explain(&searcher, DocAddress::new(0u32, 1u32))?;
assert_nearly_equals!(explanation.value(), 0.6931472);
}
{
let explanation_err = term_query.explain(&searcher, DocAddress(0u32, 0u32));
let explanation_err = term_query.explain(&searcher, DocAddress::new(0u32, 0u32));
assert!(matches!(
explanation_err,
Err(crate::TantivyError::InvalidArgument(_msg))
));
}
{
let explanation_err = term_query.explain(&searcher, DocAddress(0u32, 3u32));
let explanation_err = term_query.explain(&searcher, DocAddress::new(0u32, 3u32));
assert!(matches!(
explanation_err,
Err(crate::TantivyError::InvalidArgument(_msg))


@@ -192,7 +192,7 @@ impl<'de> Deserialize<'de> for FieldEntry {
Name,
Type,
Options,
};
}
const FIELDS: &[&str] = &["name", "type", "options"];


@@ -192,7 +192,7 @@ impl SchemaBuilder {
}))
}
}
#[derive(Debug)]
struct InnerSchema {
fields: Vec<FieldEntry>,
fields_map: HashMap<String, Field>, // transient
@@ -226,7 +226,7 @@ impl Eq for InnerSchema {}
/// let schema = schema_builder.build();
///
/// ```
#[derive(Clone, Eq, PartialEq)]
#[derive(Clone, Eq, PartialEq, Debug)]
pub struct Schema(Arc<InnerSchema>);
impl Schema {


@@ -154,7 +154,7 @@ mod tests {
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 30);
for i in 0..searcher.num_docs() as u32 {
let _doc = searcher.doc(DocAddress(0u32, i))?;
let _doc = searcher.doc(DocAddress::new(0u32, i))?;
}
Ok(())
}


@@ -1,4 +1,3 @@
use crate::schema::Term;
use crate::termdict::TermOrdinal;
use crate::termdict::TermStreamer;
use std::cmp::Ordering;
@@ -114,14 +113,4 @@ impl<'a> TermMerger<'a> {
pub fn current_kvs(&self) -> &[HeapItem<'a>] {
&self.current_streamers[..]
}
/// Iterates through terms
#[cfg_attr(feature = "cargo-clippy", allow(clippy::should_implement_trait))]
pub fn next(&mut self) -> Option<Term<&[u8]>> {
if self.advance() {
Some(Term::wrap(self.current_streamers[0].streamer.key()))
} else {
None
}
}
}