tantivy/src/termdict/mod.rs

/*!
The term dictionary main role is to associate the sorted [`Term`s](../struct.Term.html) to
a [`TermInfo`](../postings/struct.TermInfo.html) struct that contains some meta-information
about the term.

Internally, the term dictionary relies on the `fst` crate to store
a sorted mapping that associate each term to its rank in the lexicographical order.
For instance, in a dictionary containing the sorted terms "abba", "bjork", "blur" and "donovan",
the `TermOrdinal` are respectively `0`, `1`, `2`, and `3`.

For `u64`-terms, tantivy explicitely uses a `BigEndian` representation to ensure that the
lexicographical order matches the natural order of integers.

`i64`-terms are transformed to `u64` using a continuous mapping `val ⟶ val - i64::min_value()`
and then treated as a `u64`.

`f64`-terms are transformed to `u64` using a mapping that preserve order, and are then treated
as `u64`.

A second datastructure makes it possible to access a [`TermInfo`](../postings/struct.TermInfo.html).
*/

/// Position of the term in the sorted list of terms.
pub type TermOrdinal = u64;

mod merger;
mod streamer;
mod term_info_store;
mod termdict;

pub use self::merger::TermMerger;
pub use self::streamer::{TermStreamer, TermStreamerBuilder};
pub use self::termdict::{TermDictionary, TermDictionaryBuilder};

#[cfg(test)]
mod tests {
    use super::{TermDictionary, TermDictionaryBuilder, TermStreamer};
    use crate::core::Index;
    use crate::directory::{Directory, RAMDirectory, ReadOnlySource};
    use crate::postings::TermInfo;
    use crate::schema::{Document, Schema, TEXT};
    use std::path::PathBuf;
    use std::str;

    const BLOCK_SIZE: usize = 1_500;

    fn make_term_info(val: u64) -> TermInfo {
        TermInfo {
            doc_freq: val as u32,
            positions_idx: val * 2u64,
            postings_offset: val * 3u64,
        }
    }

    #[test]
    fn test_empty_term_dictionary() {
        let empty = TermDictionary::empty();
        assert!(empty.stream().next().is_none());
    }

    #[test]
    fn test_term_ordinals() {
        const COUNTRIES: [&'static str; 7] = [
            "San Marino",
            "Serbia",
            "Slovakia",
            "Slovenia",
            "Spain",
            "Sweden",
            "Switzerland",
        ];
        let mut directory = RAMDirectory::create();
        let path = PathBuf::from("TermDictionary");
        {
            let write = directory.open_write(&path).unwrap();
            let mut term_dictionary_builder = TermDictionaryBuilder::create(write).unwrap();
            for term in COUNTRIES.iter() {
                term_dictionary_builder
                    .insert(term.as_bytes(), &make_term_info(0u64))
                    .unwrap();
            }
            term_dictionary_builder.finish().unwrap();
        }
        let source = directory.open_read(&path).unwrap();
        let term_dict: TermDictionary = TermDictionary::from_source(&source);
        for (term_ord, term) in COUNTRIES.iter().enumerate() {
            assert_eq!(term_dict.term_ord(term).unwrap(), term_ord as u64);
            let mut bytes = vec![];
            assert!(term_dict.ord_to_term(term_ord as u64, &mut bytes));
            assert_eq!(bytes, term.as_bytes());
        }
    }

    #[test]
    fn test_term_dictionary_simple() {
        let mut directory = RAMDirectory::create();
        let path = PathBuf::from("TermDictionary");
        {
            let write = directory.open_write(&path).unwrap();
            let mut term_dictionary_builder = TermDictionaryBuilder::create(write).unwrap();
            term_dictionary_builder
                .insert("abc".as_bytes(), &make_term_info(34u64))
                .unwrap();
            term_dictionary_builder
                .insert("abcd".as_bytes(), &make_term_info(346u64))
                .unwrap();
            term_dictionary_builder.finish().unwrap();
        }
        let source = directory.open_read(&path).unwrap();
        let term_dict: TermDictionary = TermDictionary::from_source(&source);
        assert_eq!(term_dict.get("abc").unwrap().doc_freq, 34u32);
        assert_eq!(term_dict.get("abcd").unwrap().doc_freq, 346u32);
        let mut stream = term_dict.stream();
        {
            {
                let (k, v) = stream.next().unwrap();
                assert_eq!(k.as_ref(), "abc".as_bytes());
                assert_eq!(v.doc_freq, 34u32);
            }
            assert_eq!(stream.key(), "abc".as_bytes());
            assert_eq!(stream.value().doc_freq, 34u32);
        }
        {
            {
                let (k, v) = stream.next().unwrap();
                assert_eq!(k, "abcd".as_bytes());
                assert_eq!(v.doc_freq, 346u32);
            }
            assert_eq!(stream.key(), "abcd".as_bytes());
            assert_eq!(stream.value().doc_freq, 346u32);
        }
        assert!(!stream.advance());
    }

    #[test]
    fn test_term_iterator() {
        let mut schema_builder = Schema::builder();
        let text_field = schema_builder.add_text_field("text", TEXT);
        let index = Index::create_in_ram(schema_builder.build());
        {
            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
            {
                {
                    let mut doc = Document::default();
                    doc.add_text(text_field, "a b d f");
                    index_writer.add_document(doc);
                }
                index_writer.commit().unwrap();
            }
            {
                {
                    let mut doc = Document::default();
                    doc.add_text(text_field, "a b c d f");
                    index_writer.add_document(doc);
                }
                index_writer.commit().unwrap();
            }
            {
                {
                    let mut doc = Document::default();
                    doc.add_text(text_field, "e f");
                    index_writer.add_document(doc);
                }
                index_writer.commit().unwrap();
            }
        }
        let searcher = index.reader().unwrap().searcher();

        let field_searcher = searcher.field(text_field);
        let mut term_it = field_searcher.terms();
        let mut term_string = String::new();
        while term_it.advance() {
            //let term = Term::from_bytes(term_it.key());
            term_string.push_str(str::from_utf8(term_it.key()).expect("test"));
        }
        assert_eq!(&*term_string, "abcdef");
    }

    #[test]
    fn test_term_dictionary_stream() {
        let ids: Vec<_> = (0u32..10_000u32)
            .map(|i| (format!("doc{:0>6}", i), i))
            .collect();
        let buffer: Vec<u8> = {
            let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
            for &(ref id, ref i) in &ids {
                term_dictionary_builder
                    .insert(id.as_bytes(), &make_term_info(*i as u64))
                    .unwrap();
            }
            term_dictionary_builder.finish().unwrap()
        };
        let source = ReadOnlySource::from(buffer);
        let term_dictionary: TermDictionary = TermDictionary::from_source(&source);
        {
            let mut streamer = term_dictionary.stream();
            let mut i = 0;
            while let Some((streamer_k, streamer_v)) = streamer.next() {
                let &(ref key, ref v) = &ids[i];
                assert_eq!(streamer_k.as_ref(), key.as_bytes());
                assert_eq!(streamer_v, &make_term_info(*v as u64));
                i += 1;
            }
        }

        let &(ref key, ref _v) = &ids[2047];
        term_dictionary.get(key.as_bytes());
    }

    #[test]
    fn test_stream_high_range_prefix_suffix() {
        let buffer: Vec<u8> = {
            let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
            // term requires more than 16bits
            term_dictionary_builder
                .insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1))
                .unwrap();
            term_dictionary_builder
                .insert("abcdefghijklmnopqrstuvwxyz", &make_term_info(2))
                .unwrap();
            term_dictionary_builder
                .insert("abr", &make_term_info(2))
                .unwrap();
            term_dictionary_builder.finish().unwrap()
        };
        let source = ReadOnlySource::from(buffer);
        let term_dictionary: TermDictionary = TermDictionary::from_source(&source);
        let mut kv_stream = term_dictionary.stream();
        assert!(kv_stream.advance());
        assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxy".as_bytes());
        assert_eq!(kv_stream.value(), &make_term_info(1));
        assert!(kv_stream.advance());
        assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxyz".as_bytes());
        assert_eq!(kv_stream.value(), &make_term_info(2));
        assert!(kv_stream.advance());
        assert_eq!(kv_stream.key(), "abr".as_bytes());
        assert!(!kv_stream.advance());
    }

    #[test]
    fn test_stream_range() {
        let ids: Vec<_> = (0u32..10_000u32)
            .map(|i| (format!("doc{:0>6}", i), i))
            .collect();
        let buffer: Vec<u8> = {
            let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
            for &(ref id, ref i) in &ids {
                term_dictionary_builder
                    .insert(id.as_bytes(), &make_term_info(*i as u64))
                    .unwrap();
            }
            term_dictionary_builder.finish().unwrap()
        };

        let source = ReadOnlySource::from(buffer);

        let term_dictionary: TermDictionary = TermDictionary::from_source(&source);
        {
            for i in (0..20).chain(6000..8_000) {
                let &(ref target_key, _) = &ids[i];
                let mut streamer = term_dictionary
                    .range()
                    .ge(target_key.as_bytes())
                    .into_stream();
                for j in 0..3 {
                    let (streamer_k, streamer_v) = streamer.next().unwrap();
                    let &(ref key, ref v) = &ids[i + j];
                    assert_eq!(str::from_utf8(streamer_k.as_ref()).unwrap(), key);
                    assert_eq!(streamer_v.doc_freq, *v);
                    assert_eq!(streamer_v, &make_term_info(*v as u64));
                }
            }
        }

        {
            for i in (0..20).chain(BLOCK_SIZE - 10..BLOCK_SIZE + 10) {
                let &(ref target_key, _) = &ids[i];
                let mut streamer = term_dictionary
                    .range()
                    .gt(target_key.as_bytes())
                    .into_stream();
                for j in 0..3 {
                    let (streamer_k, streamer_v) = streamer.next().unwrap();
                    let &(ref key, ref v) = &ids[i + j + 1];
                    assert_eq!(streamer_k.as_ref(), key.as_bytes());
                    assert_eq!(streamer_v.doc_freq, *v);
                }
            }
        }

        {
            for i in (0..20).chain(BLOCK_SIZE - 10..BLOCK_SIZE + 10) {
                for j in 0..3 {
                    let &(ref fst_key, _) = &ids[i];
                    let &(ref last_key, _) = &ids[i + j];
                    let mut streamer = term_dictionary
                        .range()
                        .ge(fst_key.as_bytes())
                        .lt(last_key.as_bytes())
                        .into_stream();
                    for _ in 0..j {
                        assert!(streamer.next().is_some());
                    }
                    assert!(streamer.next().is_none());
                }
            }
        }
    }

    #[test]
    fn test_empty_string() {
        let buffer: Vec<u8> = {
            let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
            term_dictionary_builder
                .insert(&[], &make_term_info(1 as u64))
                .unwrap();
            term_dictionary_builder
                .insert(&[1u8], &make_term_info(2 as u64))
                .unwrap();
            term_dictionary_builder.finish().unwrap()
        };
        let source = ReadOnlySource::from(buffer);
        let term_dictionary: TermDictionary = TermDictionary::from_source(&source);
        let mut stream = term_dictionary.stream();
        assert!(stream.advance());
        assert!(stream.key().is_empty());
        assert!(stream.advance());
        assert_eq!(stream.key(), &[1u8]);
        assert!(!stream.advance());
    }

    #[test]
    fn test_stream_range_boundaries() {
        let buffer: Vec<u8> = {
            let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
            for i in 0u8..10u8 {
                let number_arr = [i; 1];
                term_dictionary_builder
                    .insert(&number_arr, &make_term_info(i as u64))
                    .unwrap();
            }
            term_dictionary_builder.finish().unwrap()
        };
        let source = ReadOnlySource::from(buffer);
        let term_dictionary: TermDictionary = TermDictionary::from_source(&source);

        let value_list = |mut streamer: TermStreamer<'_>, backwards: bool| {
            let mut res: Vec<u32> = vec![];
            while let Some((_, ref v)) = streamer.next() {
                res.push(v.doc_freq);
            }
            if backwards {
                res.reverse();
            }
            res
        };
        {
            let range = term_dictionary.range().backward().into_stream();
            assert_eq!(
                value_list(range, true),
                vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
            );
        }
        {
            let range = term_dictionary.range().ge([2u8]).into_stream();
            assert_eq!(
                value_list(range, false),
                vec![2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
            );
        }
        {
            let range = term_dictionary.range().ge([2u8]).backward().into_stream();
            assert_eq!(
                value_list(range, true),
                vec![2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
            );
        }
        {
            let range = term_dictionary.range().gt([2u8]).into_stream();
            assert_eq!(
                value_list(range, false),
                vec![3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
            );
        }
        {
            let range = term_dictionary.range().gt([2u8]).backward().into_stream();
            assert_eq!(
                value_list(range, true),
                vec![3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
            );
        }
        {
            let range = term_dictionary.range().lt([6u8]).into_stream();
            assert_eq!(
                value_list(range, false),
                vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32]
            );
        }
        {
            let range = term_dictionary.range().lt([6u8]).backward().into_stream();
            assert_eq!(
                value_list(range, true),
                vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32]
            );
        }
        {
            let range = term_dictionary.range().le([6u8]).into_stream();
            assert_eq!(
                value_list(range, false),
                vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32]
            );
        }
        {
            let range = term_dictionary.range().le([6u8]).backward().into_stream();
            assert_eq!(
                value_list(range, true),
                vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32]
            );
        }
        {
            let range = term_dictionary.range().ge([0u8]).lt([5u8]).into_stream();
            assert_eq!(value_list(range, false), vec![0u32, 1u32, 2u32, 3u32, 4u32]);
        }
        {
            let range = term_dictionary
                .range()
                .ge([0u8])
                .lt([5u8])
                .backward()
                .into_stream();
            assert_eq!(value_list(range, true), vec![0u32, 1u32, 2u32, 3u32, 4u32]);
        }
    }

    #[test]
    fn test_automaton_search() {
        use crate::query::DFAWrapper;
        use levenshtein_automata::LevenshteinAutomatonBuilder;

        const COUNTRIES: [&'static str; 7] = [
            "San Marino",
            "Serbia",
            "Slovakia",
            "Slovenia",
            "Spain",
            "Sweden",
            "Switzerland",
        ];

        let mut directory = RAMDirectory::create();
        let path = PathBuf::from("TermDictionary");
        {
            let write = directory.open_write(&path).unwrap();
            let mut term_dictionary_builder = TermDictionaryBuilder::create(write).unwrap();
            for term in COUNTRIES.iter() {
                term_dictionary_builder
                    .insert(term.as_bytes(), &make_term_info(0u64))
                    .unwrap();
            }
            term_dictionary_builder.finish().unwrap();
        }
        let source = directory.open_read(&path).unwrap();
        let term_dict: TermDictionary = TermDictionary::from_source(&source);

        // We can now build an entire dfa.
        let lev_automaton_builder = LevenshteinAutomatonBuilder::new(2, true);
        let automaton = DFAWrapper(lev_automaton_builder.build_dfa("Spaen"));

        let mut range = term_dict.search(automaton).into_stream();

        // get the first finding
        assert!(range.advance());
        assert_eq!("Spain".as_bytes(), range.key());
        assert!(!range.advance());
    }
}