diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs index e77783300..7ac74fbeb 100644 --- a/src/termdict/mod.rs +++ b/src/termdict/mod.rs @@ -1,7 +1,29 @@ +/*! +The term dictionary main role is to associate the sorted [`Term`s](../struct.Term.html) to +a [`TermInfo`](../postings/struct.TermInfo.html) struct that contains some meta-information +about the term. + +Internally, the term dictionary relies on the `fst` crate to store +a sorted mapping that associate each term to its rank in the lexicographical order. +For instance, in a dictionary containing the sorted terms "abba", "bjork", "blur" and "donovan", +the `TermOrdinal` are respectively `0`, `1`, `2`, and `3`. + +For `u64`-terms, tantivy explicitely uses a `BigEndian` representation to ensure that the +lexicographical order matches the natural order of integers. + +`i64`-terms are transformed to `u64` using a continuous mapping `val ⟶ val - i64::min_value()` +and then treated as a `u64`. + +`f64`-terms are transformed to `u64` using a mapping that preserve order, and are then treated +as `u64`. + +A second datastructure makes it possible to access a [`TermInfo`](../postings/struct.TermInfo.html). +*/ + use tantivy_fst::automaton::AlwaysMatch; mod fst_termdict; -// mod traits; +use fst_termdict as termdict; #[cfg(test)] mod tests; @@ -11,12 +33,12 @@ pub type TermOrdinal = u64; /// The term dictionary contains all of the terms in /// `tantivy index` in a sorted manner. -pub type TermDictionary = self::fst_termdict::TermDictionary; +pub type TermDictionary = self::termdict::TermDictionary; /// Builder for the new term dictionary. /// /// Inserting must be done in the order of the `keys`. -pub type TermDictionaryBuilder = self::fst_termdict::TermDictionaryBuilder; +pub type TermDictionaryBuilder = self::termdict::TermDictionaryBuilder; /// Given a list of sorted term streams, /// returns an iterator over sorted unique terms. @@ -25,8 +47,8 @@ pub type TermDictionaryBuilder = self::fst_termdict::TermDictionaryBuilder /// - the term /// - a slice with the ordinal of the segments containing /// the terms. -pub type TermMerger<'a> = self::fst_termdict::TermMerger<'a>; +pub type TermMerger<'a> = self::termdict::TermMerger<'a>; /// `TermStreamer` acts as a cursor over a range of terms of a segment. /// Terms are guaranteed to be sorted. -pub type TermStreamer<'a, A = AlwaysMatch> = self::fst_termdict::TermStreamer<'a, A>; +pub type TermStreamer<'a, A = AlwaysMatch> = self::termdict::TermStreamer<'a, A>; diff --git a/src/termdict/tests.rs b/src/termdict/tests.rs index 1a58e43b6..c7d3afc7f 100644 --- a/src/termdict/tests.rs +++ b/src/termdict/tests.rs @@ -1,421 +1,420 @@ +use super::{TermDictionary, TermDictionaryBuilder, TermStreamer}; +use crate::core::Index; +use crate::directory::{Directory, FileSlice, RAMDirectory}; +use crate::postings::TermInfo; +use crate::schema::{Schema, TEXT}; +use std::path::PathBuf; +use std::str; - use super::{TermDictionary, TermDictionaryBuilder, TermStreamer}; - use crate::core::Index; - use crate::directory::{Directory, FileSlice, RAMDirectory}; - use crate::postings::TermInfo; - use crate::schema::{Schema, TEXT}; - use std::path::PathBuf; - use std::str; +const BLOCK_SIZE: usize = 1_500; - const BLOCK_SIZE: usize = 1_500; - - fn make_term_info(term_ord: u64) -> TermInfo { - let offset = |term_ord: u64| term_ord * 100 + term_ord * term_ord; - TermInfo { - doc_freq: term_ord as u32, - postings_start_offset: offset(term_ord), - postings_stop_offset: offset(term_ord + 1), - positions_idx: offset(term_ord) * 2u64, - } +fn make_term_info(term_ord: u64) -> TermInfo { + let offset = |term_ord: u64| term_ord * 100 + term_ord * term_ord; + TermInfo { + doc_freq: term_ord as u32, + postings_start_offset: offset(term_ord), + postings_stop_offset: offset(term_ord + 1), + positions_idx: offset(term_ord) * 2u64, } +} - #[test] - fn test_empty_term_dictionary() { - let empty = TermDictionary::empty(); - assert!(empty.stream().next().is_none()); +#[test] +fn test_empty_term_dictionary() { + let empty = TermDictionary::empty(); + assert!(empty.stream().next().is_none()); +} + +#[test] +fn test_term_ordinals() -> crate::Result<()> { + const COUNTRIES: [&'static str; 7] = [ + "San Marino", + "Serbia", + "Slovakia", + "Slovenia", + "Spain", + "Sweden", + "Switzerland", + ]; + let directory = RAMDirectory::create(); + let path = PathBuf::from("TermDictionary"); + { + let write = directory.open_write(&path)?; + let mut term_dictionary_builder = TermDictionaryBuilder::create(write)?; + for term in COUNTRIES.iter() { + term_dictionary_builder.insert(term.as_bytes(), &make_term_info(0u64))?; + } + term_dictionary_builder.finish()?; } - - #[test] - fn test_term_ordinals() -> crate::Result<()> { - const COUNTRIES: [&'static str; 7] = [ - "San Marino", - "Serbia", - "Slovakia", - "Slovenia", - "Spain", - "Sweden", - "Switzerland", - ]; - let directory = RAMDirectory::create(); - let path = PathBuf::from("TermDictionary"); - { - let write = directory.open_write(&path)?; - let mut term_dictionary_builder = TermDictionaryBuilder::create(write)?; - for term in COUNTRIES.iter() { - term_dictionary_builder.insert(term.as_bytes(), &make_term_info(0u64))?; - } - term_dictionary_builder.finish()?; - } - let term_file = directory.open_read(&path)?; - let term_dict: TermDictionary = TermDictionary::open(term_file)?; - for (term_ord, term) in COUNTRIES.iter().enumerate() { - assert_eq!(term_dict.term_ord(term).unwrap(), term_ord as u64); - let mut bytes = vec![]; - assert!(term_dict.ord_to_term(term_ord as u64, &mut bytes)); - assert_eq!(bytes, term.as_bytes()); - } - Ok(()) + let term_file = directory.open_read(&path)?; + let term_dict: TermDictionary = TermDictionary::open(term_file)?; + for (term_ord, term) in COUNTRIES.iter().enumerate() { + assert_eq!(term_dict.term_ord(term).unwrap(), term_ord as u64); + let mut bytes = vec![]; + assert!(term_dict.ord_to_term(term_ord as u64, &mut bytes)); + assert_eq!(bytes, term.as_bytes()); } + Ok(()) +} - #[test] - fn test_term_dictionary_simple() -> crate::Result<()> { - let directory = RAMDirectory::create(); - let path = PathBuf::from("TermDictionary"); - { - let write = directory.open_write(&path)?; - let mut term_dictionary_builder = TermDictionaryBuilder::create(write)?; - term_dictionary_builder.insert("abc".as_bytes(), &make_term_info(34u64))?; - term_dictionary_builder.insert("abcd".as_bytes(), &make_term_info(346u64))?; - term_dictionary_builder.finish()?; - } - let file = directory.open_read(&path)?; - let term_dict: TermDictionary = TermDictionary::open(file)?; - assert_eq!(term_dict.get("abc").unwrap().doc_freq, 34u32); - assert_eq!(term_dict.get("abcd").unwrap().doc_freq, 346u32); - let mut stream = term_dict.stream(); - { - { - let (k, v) = stream.next().unwrap(); - assert_eq!(k.as_ref(), "abc".as_bytes()); - assert_eq!(v.doc_freq, 34u32); - } - assert_eq!(stream.key(), "abc".as_bytes()); - assert_eq!(stream.value().doc_freq, 34u32); - } - { - { - let (k, v) = stream.next().unwrap(); - assert_eq!(k, "abcd".as_bytes()); - assert_eq!(v.doc_freq, 346u32); - } - assert_eq!(stream.key(), "abcd".as_bytes()); - assert_eq!(stream.value().doc_freq, 346u32); - } - assert!(!stream.advance()); - Ok(()) +#[test] +fn test_term_dictionary_simple() -> crate::Result<()> { + let directory = RAMDirectory::create(); + let path = PathBuf::from("TermDictionary"); + { + let write = directory.open_write(&path)?; + let mut term_dictionary_builder = TermDictionaryBuilder::create(write)?; + term_dictionary_builder.insert("abc".as_bytes(), &make_term_info(34u64))?; + term_dictionary_builder.insert("abcd".as_bytes(), &make_term_info(346u64))?; + term_dictionary_builder.finish()?; } - - #[test] - fn test_term_iterator() -> crate::Result<()> { - let mut schema_builder = Schema::builder(); - let text_field = schema_builder.add_text_field("text", TEXT); - let index = Index::create_in_ram(schema_builder.build()); + let file = directory.open_read(&path)?; + let term_dict: TermDictionary = TermDictionary::open(file)?; + assert_eq!(term_dict.get("abc").unwrap().doc_freq, 34u32); + assert_eq!(term_dict.get("abcd").unwrap().doc_freq, 346u32); + let mut stream = term_dict.stream(); + { { - let mut index_writer = index.writer_for_tests()?; - index_writer.add_document(doc!(text_field=>"a b d f")); - index_writer.commit()?; - index_writer.add_document(doc!(text_field=>"a b c d f")); - index_writer.commit()?; - index_writer.add_document(doc!(text_field => "e f")); - index_writer.commit()?; + let (k, v) = stream.next().unwrap(); + assert_eq!(k.as_ref(), "abc".as_bytes()); + assert_eq!(v.doc_freq, 34u32); } - let searcher = index.reader()?.searcher(); - - let field_searcher = searcher.field(text_field)?; - let mut term_it = field_searcher.terms(); - let mut term_string = String::new(); - while term_it.advance() { - //let term = Term::from_bytes(term_it.key()); - term_string.push_str(str::from_utf8(term_it.key()).expect("test")); - } - assert_eq!(&*term_string, "abcdef"); - Ok(()) + assert_eq!(stream.key(), "abc".as_bytes()); + assert_eq!(stream.value().doc_freq, 34u32); } - - #[test] - fn test_term_dictionary_stream() -> crate::Result<()> { - let ids: Vec<_> = (0u32..10_000u32) - .map(|i| (format!("doc{:0>6}", i), i)) - .collect(); - let buffer: Vec = { - let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap(); - for &(ref id, ref i) in &ids { - term_dictionary_builder - .insert(id.as_bytes(), &make_term_info(*i as u64)) - .unwrap(); - } - term_dictionary_builder.finish().unwrap() - }; - let term_file = FileSlice::from(buffer); - let term_dictionary: TermDictionary = TermDictionary::open(term_file)?; + { { - let mut streamer = term_dictionary.stream(); - let mut i = 0; - while let Some((streamer_k, streamer_v)) = streamer.next() { - let &(ref key, ref v) = &ids[i]; - assert_eq!(streamer_k.as_ref(), key.as_bytes()); - assert_eq!(streamer_v, &make_term_info(*v as u64)); - i += 1; - } + let (k, v) = stream.next().unwrap(); + assert_eq!(k, "abcd".as_bytes()); + assert_eq!(v.doc_freq, 346u32); } - - let &(ref key, ref val) = &ids[2047]; - assert_eq!( - term_dictionary.get(key.as_bytes()), - Some(make_term_info(*val as u64)) - ); - Ok(()) + assert_eq!(stream.key(), "abcd".as_bytes()); + assert_eq!(stream.value().doc_freq, 346u32); } + assert!(!stream.advance()); + Ok(()) +} - #[test] - fn test_stream_high_range_prefix_suffix() -> crate::Result<()> { - let buffer: Vec = { - let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap(); - // term requires more than 16bits - term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1))?; - term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxyz", &make_term_info(2))?; - term_dictionary_builder.insert("abr", &make_term_info(3))?; - term_dictionary_builder.finish()? - }; - let term_dict_file = FileSlice::from(buffer); - let term_dictionary: TermDictionary = TermDictionary::open(term_dict_file)?; - let mut kv_stream = term_dictionary.stream(); - assert!(kv_stream.advance()); - assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxy".as_bytes()); - assert_eq!(kv_stream.value(), &make_term_info(1)); - assert!(kv_stream.advance()); - assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxyz".as_bytes()); - assert_eq!(kv_stream.value(), &make_term_info(2)); - assert!(kv_stream.advance()); - assert_eq!(kv_stream.key(), "abr".as_bytes()); - assert_eq!(kv_stream.value(), &make_term_info(3)); - assert!(!kv_stream.advance()); - Ok(()) +#[test] +fn test_term_iterator() -> crate::Result<()> { + let mut schema_builder = Schema::builder(); + let text_field = schema_builder.add_text_field("text", TEXT); + let index = Index::create_in_ram(schema_builder.build()); + { + let mut index_writer = index.writer_for_tests()?; + index_writer.add_document(doc!(text_field=>"a b d f")); + index_writer.commit()?; + index_writer.add_document(doc!(text_field=>"a b c d f")); + index_writer.commit()?; + index_writer.add_document(doc!(text_field => "e f")); + index_writer.commit()?; } + let searcher = index.reader()?.searcher(); - #[test] - fn test_stream_range() -> crate::Result<()> { - let ids: Vec<_> = (0u32..10_000u32) - .map(|i| (format!("doc{:0>6}", i), i)) - .collect(); - let buffer: Vec = { - let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap(); - for &(ref id, ref i) in &ids { - term_dictionary_builder - .insert(id.as_bytes(), &make_term_info(*i as u64)) - .unwrap(); - } - term_dictionary_builder.finish().unwrap() - }; - - let file = FileSlice::from(buffer); - - let term_dictionary: TermDictionary = TermDictionary::open(file)?; - { - for i in (0..20).chain(6000..8_000) { - let &(ref target_key, _) = &ids[i]; - let mut streamer = term_dictionary - .range() - .ge(target_key.as_bytes()) - .into_stream(); - for j in 0..3 { - let (streamer_k, streamer_v) = streamer.next().unwrap(); - let &(ref key, ref v) = &ids[i + j]; - assert_eq!(str::from_utf8(streamer_k.as_ref()).unwrap(), key); - assert_eq!(streamer_v.doc_freq, *v); - assert_eq!(streamer_v, &make_term_info(*v as u64)); - } - } - } - - { - for i in (0..20).chain(BLOCK_SIZE - 10..BLOCK_SIZE + 10) { - let &(ref target_key, _) = &ids[i]; - let mut streamer = term_dictionary - .range() - .gt(target_key.as_bytes()) - .into_stream(); - for j in 0..3 { - let (streamer_k, streamer_v) = streamer.next().unwrap(); - let &(ref key, ref v) = &ids[i + j + 1]; - assert_eq!(streamer_k.as_ref(), key.as_bytes()); - assert_eq!(streamer_v.doc_freq, *v); - } - } - } - - { - for i in (0..20).chain(BLOCK_SIZE - 10..BLOCK_SIZE + 10) { - for j in 0..3 { - let &(ref fst_key, _) = &ids[i]; - let &(ref last_key, _) = &ids[i + j]; - let mut streamer = term_dictionary - .range() - .ge(fst_key.as_bytes()) - .lt(last_key.as_bytes()) - .into_stream(); - for _ in 0..j { - assert!(streamer.next().is_some()); - } - assert!(streamer.next().is_none()); - } - } - } - Ok(()) + let field_searcher = searcher.field(text_field)?; + let mut term_it = field_searcher.terms(); + let mut term_string = String::new(); + while term_it.advance() { + //let term = Term::from_bytes(term_it.key()); + term_string.push_str(str::from_utf8(term_it.key()).expect("test")); } + assert_eq!(&*term_string, "abcdef"); + Ok(()) +} - #[test] - fn test_empty_string() -> crate::Result<()> { - let buffer: Vec = { - let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap(); +#[test] +fn test_term_dictionary_stream() -> crate::Result<()> { + let ids: Vec<_> = (0u32..10_000u32) + .map(|i| (format!("doc{:0>6}", i), i)) + .collect(); + let buffer: Vec = { + let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap(); + for &(ref id, ref i) in &ids { term_dictionary_builder - .insert(&[], &make_term_info(1 as u64)) + .insert(id.as_bytes(), &make_term_info(*i as u64)) .unwrap(); - term_dictionary_builder - .insert(&[1u8], &make_term_info(2 as u64)) - .unwrap(); - term_dictionary_builder.finish().unwrap() - }; - let file = FileSlice::from(buffer); - let term_dictionary: TermDictionary = TermDictionary::open(file)?; - let mut stream = term_dictionary.stream(); - assert!(stream.advance()); - assert!(stream.key().is_empty()); - assert!(stream.advance()); - assert_eq!(stream.key(), &[1u8]); - assert!(!stream.advance()); - Ok(()) + } + term_dictionary_builder.finish().unwrap() + }; + let term_file = FileSlice::from(buffer); + let term_dictionary: TermDictionary = TermDictionary::open(term_file)?; + { + let mut streamer = term_dictionary.stream(); + let mut i = 0; + while let Some((streamer_k, streamer_v)) = streamer.next() { + let &(ref key, ref v) = &ids[i]; + assert_eq!(streamer_k.as_ref(), key.as_bytes()); + assert_eq!(streamer_v, &make_term_info(*v as u64)); + i += 1; + } } - #[test] - fn test_stream_range_boundaries() -> crate::Result<()> { - let buffer: Vec = { - let mut term_dictionary_builder = TermDictionaryBuilder::create(Vec::new())?; - for i in 0u8..10u8 { - let number_arr = [i; 1]; - term_dictionary_builder.insert(&number_arr, &make_term_info(i as u64))?; - } - term_dictionary_builder.finish()? - }; - let file = FileSlice::from(buffer); - let term_dictionary: TermDictionary = TermDictionary::open(file)?; + let &(ref key, ref val) = &ids[2047]; + assert_eq!( + term_dictionary.get(key.as_bytes()), + Some(make_term_info(*val as u64)) + ); + Ok(()) +} - let value_list = |mut streamer: TermStreamer<'_>, backwards: bool| { - let mut res: Vec = vec![]; - while let Some((_, ref v)) = streamer.next() { - res.push(v.doc_freq); - } - if backwards { - res.reverse(); - } - res - }; - { - let range = term_dictionary.range().backward().into_stream(); - assert_eq!( - value_list(range, true), - vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32] - ); +#[test] +fn test_stream_high_range_prefix_suffix() -> crate::Result<()> { + let buffer: Vec = { + let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap(); + // term requires more than 16bits + term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1))?; + term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxyz", &make_term_info(2))?; + term_dictionary_builder.insert("abr", &make_term_info(3))?; + term_dictionary_builder.finish()? + }; + let term_dict_file = FileSlice::from(buffer); + let term_dictionary: TermDictionary = TermDictionary::open(term_dict_file)?; + let mut kv_stream = term_dictionary.stream(); + assert!(kv_stream.advance()); + assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxy".as_bytes()); + assert_eq!(kv_stream.value(), &make_term_info(1)); + assert!(kv_stream.advance()); + assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxyz".as_bytes()); + assert_eq!(kv_stream.value(), &make_term_info(2)); + assert!(kv_stream.advance()); + assert_eq!(kv_stream.key(), "abr".as_bytes()); + assert_eq!(kv_stream.value(), &make_term_info(3)); + assert!(!kv_stream.advance()); + Ok(()) +} + +#[test] +fn test_stream_range() -> crate::Result<()> { + let ids: Vec<_> = (0u32..10_000u32) + .map(|i| (format!("doc{:0>6}", i), i)) + .collect(); + let buffer: Vec = { + let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap(); + for &(ref id, ref i) in &ids { + term_dictionary_builder + .insert(id.as_bytes(), &make_term_info(*i as u64)) + .unwrap(); } - { - let range = term_dictionary.range().ge([2u8]).into_stream(); - assert_eq!( - value_list(range, false), - vec![2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32] - ); - } - { - let range = term_dictionary.range().ge([2u8]).backward().into_stream(); - assert_eq!( - value_list(range, true), - vec![2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32] - ); - } - { - let range = term_dictionary.range().gt([2u8]).into_stream(); - assert_eq!( - value_list(range, false), - vec![3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32] - ); - } - { - let range = term_dictionary.range().gt([2u8]).backward().into_stream(); - assert_eq!( - value_list(range, true), - vec![3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32] - ); - } - { - let range = term_dictionary.range().lt([6u8]).into_stream(); - assert_eq!( - value_list(range, false), - vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32] - ); - } - { - let range = term_dictionary.range().lt([6u8]).backward().into_stream(); - assert_eq!( - value_list(range, true), - vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32] - ); - } - { - let range = term_dictionary.range().le([6u8]).into_stream(); - assert_eq!( - value_list(range, false), - vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32] - ); - } - { - let range = term_dictionary.range().le([6u8]).backward().into_stream(); - assert_eq!( - value_list(range, true), - vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32] - ); - } - { - let range = term_dictionary.range().ge([0u8]).lt([5u8]).into_stream(); - assert_eq!(value_list(range, false), vec![0u32, 1u32, 2u32, 3u32, 4u32]); - } - { - let range = term_dictionary + term_dictionary_builder.finish().unwrap() + }; + + let file = FileSlice::from(buffer); + + let term_dictionary: TermDictionary = TermDictionary::open(file)?; + { + for i in (0..20).chain(6000..8_000) { + let &(ref target_key, _) = &ids[i]; + let mut streamer = term_dictionary .range() - .ge([0u8]) - .lt([5u8]) - .backward() + .ge(target_key.as_bytes()) .into_stream(); - assert_eq!(value_list(range, true), vec![0u32, 1u32, 2u32, 3u32, 4u32]); - } - Ok(()) - } - - #[test] - fn test_automaton_search() -> crate::Result<()> { - use crate::query::DFAWrapper; - use levenshtein_automata::LevenshteinAutomatonBuilder; - - const COUNTRIES: [&'static str; 7] = [ - "San Marino", - "Serbia", - "Slovakia", - "Slovenia", - "Spain", - "Sweden", - "Switzerland", - ]; - - let directory = RAMDirectory::create(); - let path = PathBuf::from("TermDictionary"); - { - let write = directory.open_write(&path)?; - let mut term_dictionary_builder = TermDictionaryBuilder::create(write)?; - for term in COUNTRIES.iter() { - term_dictionary_builder.insert(term.as_bytes(), &make_term_info(0u64))?; + for j in 0..3 { + let (streamer_k, streamer_v) = streamer.next().unwrap(); + let &(ref key, ref v) = &ids[i + j]; + assert_eq!(str::from_utf8(streamer_k.as_ref()).unwrap(), key); + assert_eq!(streamer_v.doc_freq, *v); + assert_eq!(streamer_v, &make_term_info(*v as u64)); } - term_dictionary_builder.finish()?; } - let file = directory.open_read(&path)?; - let term_dict: TermDictionary = TermDictionary::open(file)?; - - // We can now build an entire dfa. - let lev_automaton_builder = LevenshteinAutomatonBuilder::new(2, true); - let automaton = DFAWrapper(lev_automaton_builder.build_dfa("Spaen")); - - let mut range = term_dict.search(automaton).into_stream(); - - // get the first finding - assert!(range.advance()); - assert_eq!("Spain".as_bytes(), range.key()); - assert!(!range.advance()); - Ok(()) } + + { + for i in (0..20).chain(BLOCK_SIZE - 10..BLOCK_SIZE + 10) { + let &(ref target_key, _) = &ids[i]; + let mut streamer = term_dictionary + .range() + .gt(target_key.as_bytes()) + .into_stream(); + for j in 0..3 { + let (streamer_k, streamer_v) = streamer.next().unwrap(); + let &(ref key, ref v) = &ids[i + j + 1]; + assert_eq!(streamer_k.as_ref(), key.as_bytes()); + assert_eq!(streamer_v.doc_freq, *v); + } + } + } + + { + for i in (0..20).chain(BLOCK_SIZE - 10..BLOCK_SIZE + 10) { + for j in 0..3 { + let &(ref fst_key, _) = &ids[i]; + let &(ref last_key, _) = &ids[i + j]; + let mut streamer = term_dictionary + .range() + .ge(fst_key.as_bytes()) + .lt(last_key.as_bytes()) + .into_stream(); + for _ in 0..j { + assert!(streamer.next().is_some()); + } + assert!(streamer.next().is_none()); + } + } + } + Ok(()) +} + +#[test] +fn test_empty_string() -> crate::Result<()> { + let buffer: Vec = { + let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap(); + term_dictionary_builder + .insert(&[], &make_term_info(1 as u64)) + .unwrap(); + term_dictionary_builder + .insert(&[1u8], &make_term_info(2 as u64)) + .unwrap(); + term_dictionary_builder.finish().unwrap() + }; + let file = FileSlice::from(buffer); + let term_dictionary: TermDictionary = TermDictionary::open(file)?; + let mut stream = term_dictionary.stream(); + assert!(stream.advance()); + assert!(stream.key().is_empty()); + assert!(stream.advance()); + assert_eq!(stream.key(), &[1u8]); + assert!(!stream.advance()); + Ok(()) +} + +#[test] +fn test_stream_range_boundaries() -> crate::Result<()> { + let buffer: Vec = { + let mut term_dictionary_builder = TermDictionaryBuilder::create(Vec::new())?; + for i in 0u8..10u8 { + let number_arr = [i; 1]; + term_dictionary_builder.insert(&number_arr, &make_term_info(i as u64))?; + } + term_dictionary_builder.finish()? + }; + let file = FileSlice::from(buffer); + let term_dictionary: TermDictionary = TermDictionary::open(file)?; + + let value_list = |mut streamer: TermStreamer<'_>, backwards: bool| { + let mut res: Vec = vec![]; + while let Some((_, ref v)) = streamer.next() { + res.push(v.doc_freq); + } + if backwards { + res.reverse(); + } + res + }; + { + let range = term_dictionary.range().backward().into_stream(); + assert_eq!( + value_list(range, true), + vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32] + ); + } + { + let range = term_dictionary.range().ge([2u8]).into_stream(); + assert_eq!( + value_list(range, false), + vec![2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32] + ); + } + { + let range = term_dictionary.range().ge([2u8]).backward().into_stream(); + assert_eq!( + value_list(range, true), + vec![2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32] + ); + } + { + let range = term_dictionary.range().gt([2u8]).into_stream(); + assert_eq!( + value_list(range, false), + vec![3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32] + ); + } + { + let range = term_dictionary.range().gt([2u8]).backward().into_stream(); + assert_eq!( + value_list(range, true), + vec![3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32] + ); + } + { + let range = term_dictionary.range().lt([6u8]).into_stream(); + assert_eq!( + value_list(range, false), + vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32] + ); + } + { + let range = term_dictionary.range().lt([6u8]).backward().into_stream(); + assert_eq!( + value_list(range, true), + vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32] + ); + } + { + let range = term_dictionary.range().le([6u8]).into_stream(); + assert_eq!( + value_list(range, false), + vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32] + ); + } + { + let range = term_dictionary.range().le([6u8]).backward().into_stream(); + assert_eq!( + value_list(range, true), + vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32] + ); + } + { + let range = term_dictionary.range().ge([0u8]).lt([5u8]).into_stream(); + assert_eq!(value_list(range, false), vec![0u32, 1u32, 2u32, 3u32, 4u32]); + } + { + let range = term_dictionary + .range() + .ge([0u8]) + .lt([5u8]) + .backward() + .into_stream(); + assert_eq!(value_list(range, true), vec![0u32, 1u32, 2u32, 3u32, 4u32]); + } + Ok(()) +} + +#[test] +fn test_automaton_search() -> crate::Result<()> { + use crate::query::DFAWrapper; + use levenshtein_automata::LevenshteinAutomatonBuilder; + + const COUNTRIES: [&'static str; 7] = [ + "San Marino", + "Serbia", + "Slovakia", + "Slovenia", + "Spain", + "Sweden", + "Switzerland", + ]; + + let directory = RAMDirectory::create(); + let path = PathBuf::from("TermDictionary"); + { + let write = directory.open_write(&path)?; + let mut term_dictionary_builder = TermDictionaryBuilder::create(write)?; + for term in COUNTRIES.iter() { + term_dictionary_builder.insert(term.as_bytes(), &make_term_info(0u64))?; + } + term_dictionary_builder.finish()?; + } + let file = directory.open_read(&path)?; + let term_dict: TermDictionary = TermDictionary::open(file)?; + + // We can now build an entire dfa. + let lev_automaton_builder = LevenshteinAutomatonBuilder::new(2, true); + let automaton = DFAWrapper(lev_automaton_builder.build_dfa("Spaen")); + + let mut range = term_dict.search(automaton).into_stream(); + + // get the first finding + assert!(range.advance()); + assert_eq!("Spain".as_bytes(), range.key()); + assert!(!range.advance()); + Ok(()) +}