Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2026-01-09 02:22:54 +00:00)

Compare commits: 0.7.1 ... issue/weba (2 commits)

| Author | SHA1 | Date |
|---|---|---|
|  | 507e46f814 |  |
|  | 3d3da2d66f |  |
@@ -1,7 +1,3 @@
-Tantivy 0.7.1
-=====================
-- Bugfix: NGramTokenizer panics on non ascii chars
-- Added a space usage API
 
 Tantivy 0.7
 =====================
@@ -1,6 +1,6 @@
 [package]
 name = "tantivy"
-version = "0.7.1"
+version = "0.7.0"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
@@ -12,12 +12,12 @@ readme = "README.md"
 keywords = ["search", "information", "retrieval"]
 
 [dependencies]
-base64 = "0.10.0"
+base64 = "0.9.1"
 byteorder = "1.0"
 lazy_static = "1"
 regex = "1.0"
 fst = {version="0.3", default-features=false}
-fst-regex = { version="0.2" }
+fst-regex = { version="0.2", optional=true}
 lz4 = {version="1.20", optional=true}
 snap = {version="0.2"}
 atomicwrites = {version="0.2.2", optional=true}
@@ -68,8 +68,9 @@ overflow-checks = true
 
 [features]
 # by default no-fail is disabled. We manually enable it when running test.
-default = ["mmap", "no_fail"]
+default = ["mmap", "no_fail", "regex_query"]
 mmap = ["fst/mmap", "atomicwrites"]
+regex_query = ["fst-regex"]
 lz4-compression = ["lz4"]
 no_fail = ["fail/no_fail"]
 
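For context, a hedged sketch (not code from the repository) of the mechanics this Cargo.toml change relies on: marking fst-regex `optional=true` means the dependency is only compiled when a feature pulls it in, and the new `regex_query = ["fst-regex"]` feature is that switch; code that needs the dependency is then gated with `#[cfg(feature = "regex_query")]`, as the query-module hunks further down in this diff show.

// Sketch only: items below are compiled only when the crate is built
// with the `regex_query` feature (here enabled through default features).
#[cfg(feature = "regex_query")]
pub mod regex_query {
    // The real module would pull in the optional fst-regex dependency;
    // this placeholder just marks the feature-gated surface.
    pub struct RegexQuery;
}

#[cfg(feature = "regex_query")]
pub use self::regex_query::RegexQuery;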
@@ -21,7 +21,7 @@
 
 **Tantivy** is a **full text search engine library** written in rust.
 
-It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elastic Search](https://www.elastic.co/products/elasticsearch) and [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
+It is closer to Lucene than to Elastic Search and Solr in the sense it is not
 an off-the-shelf search engine server, but rather a crate that can be used
 to build such a search engine.
 
@@ -4,8 +4,6 @@ use common::VInt;
 use directory::ReadOnlySource;
 use directory::WritePtr;
 use schema::Field;
-use space_usage::PerFieldSpaceUsage;
-use space_usage::FieldUsage;
 use std::collections::HashMap;
 use std::io::Write;
 use std::io::{self, Read};
@@ -168,16 +166,6 @@ impl CompositeFile {
             .get(&FileAddr { field, idx })
             .map(|&(from, to)| self.data.slice(from, to))
     }
 
-    pub fn space_usage(&self) -> PerFieldSpaceUsage {
-        let mut fields = HashMap::new();
-        for (&field_addr, &(start, end)) in self.offsets_index.iter() {
-            fields.entry(field_addr.field)
-                .or_insert_with(|| FieldUsage::empty(field_addr.field))
-                .add_field_idx(field_addr.idx, end - start);
-        }
-        PerFieldSpaceUsage::new(fields)
-    }
 }
 
 #[cfg(test)]
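The 0.7.1-side `space_usage` method removed above follows a common accumulation pattern: group byte ranges by field, creating an empty entry on first sight. A minimal self-contained sketch of that pattern, with tantivy's Field and FieldUsage types replaced by simplified stand-ins:

use std::collections::HashMap;

// Simplified stand-ins for tantivy's Field / FieldUsage types.
type Field = u32;

#[derive(Default)]
struct FieldUsage {
    num_bytes: usize,
}

fn accumulate(ranges: &[(Field, usize, usize)]) -> HashMap<Field, FieldUsage> {
    let mut fields: HashMap<Field, FieldUsage> = HashMap::new();
    for &(field, start, end) in ranges {
        // Create the entry lazily, then add this (start, end) slice's size.
        fields
            .entry(field)
            .or_insert_with(FieldUsage::default)
            .num_bytes += end - start;
    }
    fields
}

fn main() {
    let usage = accumulate(&[(1, 0, 10), (1, 10, 30), (2, 30, 34)]);
    assert_eq!(usage[&1].num_bytes, 30);
    assert_eq!(usage[&2].num_bytes, 4);
}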
@@ -49,11 +49,6 @@ pub struct Index {
 }
 
 impl Index {
-    /// Examines the director to see if it contains an index
-    pub fn exists<Dir: Directory>(dir: &Dir) -> bool {
-        dir.exists(&META_FILEPATH)
-    }
-
     /// Creates a new index using the `RAMDirectory`.
     ///
     /// The index will be allocated in anonymous memory.
@@ -70,28 +65,9 @@ impl Index {
     #[cfg(feature = "mmap")]
     pub fn create_in_dir<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> {
         let mmap_directory = MmapDirectory::open(directory_path)?;
-        if Index::exists(&mmap_directory) {
-            return Err(TantivyError::IndexAlreadyExists);
-        }
-
         Index::create(mmap_directory, schema)
     }
-
-    /// Opens or creates a new index in the provided directory
-    #[cfg(feature = "mmap")]
-    pub fn open_or_create<Dir: Directory>(dir: Dir, schema: Schema) -> Result<Index> {
-        if Index::exists(&dir) {
-            let index = Index::open(dir)?;
-            if index.schema() == schema {
-                Ok(index)
-            } else {
-                Err(TantivyError::SchemaError("An index exists but the schema does not match.".to_string()))
-            }
-        } else {
-            Index::create(dir, schema)
-        }
-    }
 
     /// Creates a new index in a temp directory.
     ///
     /// The index will use the `MMapDirectory` in a newly created directory.
@@ -113,8 +89,6 @@ impl Index {
     }
 
     /// Create a new index from a directory.
-    ///
-    /// This will overwrite existing meta.json
     fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
         save_new_metas(schema.clone(), 0, directory.borrow_mut())?;
         let metas = IndexMeta::with_schema(schema);
@@ -354,9 +328,8 @@ impl Clone for Index {
 
 #[cfg(test)]
 mod tests {
-    use schema::{Schema, SchemaBuilder, INT_INDEXED, TEXT};
+    use schema::{SchemaBuilder, INT_INDEXED, TEXT};
     use Index;
-    use directory::RAMDirectory;
 
     #[test]
     fn test_indexer_for_field() {
@@ -372,52 +345,4 @@ mod tests {
         );
     }
 
-    #[test]
-    fn test_index_exists() {
-        let directory = RAMDirectory::create();
-        assert!(!Index::exists(&directory));
-        assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
-        assert!(Index::exists(&directory));
-    }
-
-    #[test]
-    fn open_or_create_should_create() {
-        let directory = RAMDirectory::create();
-        assert!(!Index::exists(&directory));
-        assert!(Index::open_or_create(directory.clone(), throw_away_schema()).is_ok());
-        assert!(Index::exists(&directory));
-    }
-
-
-    #[test]
-    fn open_or_create_should_open() {
-        let directory = RAMDirectory::create();
-        assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
-        assert!(Index::exists(&directory));
-        assert!(Index::open_or_create(directory, throw_away_schema()).is_ok());
-    }
-
-    #[test]
-    fn create_should_wipeoff_existing() {
-        let directory = RAMDirectory::create();
-        assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
-        assert!(Index::exists(&directory));
-        assert!(Index::create(directory.clone(), SchemaBuilder::default().build()).is_ok());
-    }
-
-    #[test]
-    fn open_or_create_exists_but_schema_does_not_match() {
-        let directory = RAMDirectory::create();
-        assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
-        assert!(Index::exists(&directory));
-        assert!(Index::open_or_create(directory.clone(), throw_away_schema()).is_ok());
-        let err = Index::open_or_create(directory, SchemaBuilder::default().build());
-        assert_eq!(format!("{:?}", err.unwrap_err()), "SchemaError(\"An index exists but the schema does not match.\")");
-    }
-
-    fn throw_away_schema() -> Schema {
-        let mut schema_builder = SchemaBuilder::default();
-        let _ = schema_builder.add_u64_field("num_likes", INT_INDEXED);
-        schema_builder.build()
-    }
 }
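The removed tests already demonstrate how the 0.7.1-side API is meant to be used. The following condensed sketch assumes the left-hand side of this diff, i.e. a tantivy build that still has `Index::exists` and `Index::open_or_create`:

use tantivy::schema::{SchemaBuilder, INT_INDEXED};
use tantivy::directory::RAMDirectory;
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    let mut schema_builder = SchemaBuilder::default();
    let _ = schema_builder.add_u64_field("num_likes", INT_INDEXED);
    let schema = schema_builder.build();

    let directory = RAMDirectory::create();
    assert!(!Index::exists(&directory));

    // Creates the index on first use, opens it afterwards, and fails
    // if an index with a different schema already lives in the directory.
    let _index = Index::open_or_create(directory.clone(), schema.clone())?;
    assert!(Index::exists(&directory));

    let _same_index = Index::open_or_create(directory, schema)?;
    Ok(())
}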
@@ -5,7 +5,6 @@ use query::Query;
 use schema::Document;
 use schema::Schema;
 use schema::{Field, Term};
-use space_usage::SearcherSpaceUsage;
 use std::fmt;
 use std::sync::Arc;
 use termdict::TermMerger;
@@ -100,15 +99,6 @@ impl Searcher {
             .collect::<Vec<_>>();
         FieldSearcher::new(inv_index_readers)
     }
-
-    /// Summarize total space usage of this searcher.
-    pub fn space_usage(&self) -> SearcherSpaceUsage {
-        let mut space_usage = SearcherSpaceUsage::new();
-        for segment_reader in self.segment_readers.iter() {
-            space_usage.add_segment(segment_reader.space_usage());
-        }
-        space_usage
-    }
 }
 
 pub struct FieldSearcher {
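On the 0.7.1 side, the space usage of a searcher is simply the sum over its segments. A hedged usage sketch of that API with an in-RAM index (left-hand side of this diff; the schema and text are illustrative):

#[macro_use]
extern crate tantivy;

use tantivy::schema::{SchemaBuilder, TEXT};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    let mut schema_builder = SchemaBuilder::default();
    let body = schema_builder.add_text_field("body", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer = index.writer_with_num_threads(1, 3_000_000)?;
    writer.add_document(doc!(body => "hello tantivy"));
    writer.commit()?;

    index.load_searchers()?;
    let searcher = index.searcher();

    // One SegmentSpaceUsage per segment; total() sums the large components.
    let usage = searcher.space_usage();
    for segment in usage.segments() {
        println!("segment: {} docs, {} bytes", segment.num_docs(), segment.total());
    }
    println!("total: {} bytes", usage.total());
    Ok(())
}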
@@ -16,7 +16,6 @@ use schema::Document;
 use schema::Field;
 use schema::FieldType;
 use schema::Schema;
-use space_usage::SegmentSpaceUsage;
 use std::collections::HashMap;
 use std::fmt;
 use std::sync::Arc;
@@ -382,21 +381,6 @@ impl SegmentReader {
     pub fn doc_ids_alive(&self) -> SegmentReaderAliveDocsIterator {
         SegmentReaderAliveDocsIterator::new(&self)
     }
-
-    /// Summarize total space usage of this segment.
-    pub fn space_usage(&self) -> SegmentSpaceUsage {
-        SegmentSpaceUsage::new(
-            self.num_docs(),
-            self.termdict_composite.space_usage(),
-            self.postings_composite.space_usage(),
-            self.positions_composite.space_usage(),
-            self.positions_idx_composite.space_usage(),
-            self.fast_fields_composite.space_usage(),
-            self.fieldnorms_composite.space_usage(),
-            self.store_reader.space_usage(),
-            self.delete_bitset_opt.as_ref().map(|x| x.space_usage()).unwrap_or(0),
-        )
-    }
 }
 
 impl fmt::Debug for SegmentReader {
@@ -364,11 +364,6 @@ mod tests {
 
     use super::*;
 
-    #[test]
-    fn test_open_non_existant_path() {
-        assert!(MmapDirectory::open(PathBuf::from("./nowhere")).is_err());
-    }
-
     #[test]
     fn test_open_empty() {
         // empty file is actually an edge case because those
@@ -12,6 +12,7 @@ mod managed_directory;
 mod ram_directory;
 mod read_only_source;
 mod shared_vec_slice;
+mod static_dictionnary;
 
 /// Errors specific to the directory module.
 pub mod error;
@@ -21,6 +22,7 @@ use std::io::{BufWriter, Seek, Write};
 pub use self::directory::{Directory, DirectoryClone};
 pub use self::ram_directory::RAMDirectory;
 pub use self::read_only_source::ReadOnlySource;
+pub use self::static_dictionnary::StaticDirectory;
 
 #[cfg(feature = "mmap")]
 pub use self::mmap_directory::MmapDirectory;
@@ -5,6 +5,9 @@ use fst::raw::MmapReadOnly;
 use stable_deref_trait::{CloneStableDeref, StableDeref};
 use std::ops::Deref;
 
+
+const EMPTY_SLICE: [u8; 0] = [];
+
 /// Read object that represents files in tantivy.
 ///
 /// These read objects are only in charge to deliver
@@ -17,6 +20,8 @@ pub enum ReadOnlySource {
     Mmap(MmapReadOnly),
     /// Wrapping a `Vec<u8>`
     Anonymous(SharedVecSlice),
+    /// Wrapping a static slice
+    Static(&'static [u8])
 }
 
 unsafe impl StableDeref for ReadOnlySource {}
@@ -33,7 +38,7 @@ impl Deref for ReadOnlySource {
 impl ReadOnlySource {
     /// Creates an empty ReadOnlySource
     pub fn empty() -> ReadOnlySource {
-        ReadOnlySource::Anonymous(SharedVecSlice::empty())
+        ReadOnlySource::Static(&EMPTY_SLICE)
     }
 
     /// Returns the data underlying the ReadOnlySource object.
@@ -42,6 +47,7 @@ impl ReadOnlySource {
             #[cfg(feature = "mmap")]
             ReadOnlySource::Mmap(ref mmap_read_only) => mmap_read_only.as_slice(),
            ReadOnlySource::Anonymous(ref shared_vec) => shared_vec.as_slice(),
+            ReadOnlySource::Static(data) => data,
         }
     }
 
@@ -79,6 +85,9 @@ impl ReadOnlySource {
             ReadOnlySource::Anonymous(ref shared_vec) => {
                 ReadOnlySource::Anonymous(shared_vec.slice(from_offset, to_offset))
             }
+            ReadOnlySource::Static(data) => {
+                ReadOnlySource::Static(&data[from_offset..to_offset])
+            }
         }
     }
 
@@ -118,3 +127,9 @@ impl From<Vec<u8>> for ReadOnlySource {
         ReadOnlySource::Anonymous(shared_data)
     }
 }
+
+impl From<&'static [u8]> for ReadOnlySource {
+    fn from(data: &'static [u8]) -> ReadOnlySource {
+        ReadOnlySource::Static(data)
+    }
+}
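The right-hand side of this hunk adds a `Static` variant plus a `From<&'static [u8]>` impl, so a ReadOnlySource can be backed by data baked into the binary. A hedged sketch of that conversion (it assumes a build from the right-hand side of the diff; the byte string is just one way to obtain a `&'static [u8]`):

use tantivy::directory::ReadOnlySource;

// Any &'static [u8] works; a byte-string literal keeps the sketch self-contained.
static DICTIONARY_BYTES: &[u8] = b"\x00\x01\x02\x03\x04\x05\x06\x07";

fn main() {
    // Uses the From<&'static [u8]> impl added in this diff.
    let source = ReadOnlySource::from(DICTIONARY_BYTES);

    // slice() re-borrows the same static data without copying;
    // Deref<Target = [u8]> lets us compare the bytes directly.
    let middle = source.slice(2, 6);
    assert_eq!(&middle[..], &DICTIONARY_BYTES[2..6]);
}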
@@ -20,9 +20,6 @@ pub enum TantivyError {
     /// File already exists, this is a problem when we try to write into a new file.
     #[fail(display = "file already exists: '{:?}'", _0)]
     FileAlreadyExists(PathBuf),
-    /// Index already exists in this directory
-    #[fail(display = "index already exists")]
-    IndexAlreadyExists,
     /// Failed to acquire file lock
     #[fail(
         display = "Failed to acquire Lockfile: {:?}. Possible causes: another IndexWriter instance or panic during previous lock drop.",
@@ -2,7 +2,6 @@ use bit_set::BitSet;
 use common::HasLen;
 use directory::ReadOnlySource;
 use directory::WritePtr;
-use space_usage::ByteCount;
 use std::io;
 use std::io::Write;
 use DocId;
@@ -64,11 +63,6 @@ impl DeleteBitSet {
             b & (1u8 << shift) != 0
         }
     }
-
-    /// Summarize total space usage of this bitset.
-    pub fn space_usage(&self) -> ByteCount {
-        self.data.len()
-    }
 }
 
 impl HasLen for DeleteBitSet {
@@ -136,7 +136,7 @@ extern crate crossbeam;
 extern crate crossbeam_channel;
 extern crate fnv;
 extern crate fst;
-extern crate fst_regex;
 extern crate futures;
 extern crate futures_cpupool;
 extern crate htmlescape;
@@ -213,7 +213,6 @@ pub(crate) mod positions;
 pub mod postings;
 pub mod query;
 pub mod schema;
-pub mod space_usage;
 pub mod store;
 pub mod termdict;
 
@@ -16,7 +16,10 @@ mod phrase_query;
 mod query;
 mod query_parser;
 mod range_query;
+
+#[cfg(feature="regex_query")]
 mod regex_query;
+
 mod reqopt_scorer;
 mod scorer;
 mod term_query;
@@ -47,7 +50,10 @@ pub use self::query::Query;
 pub use self::query_parser::QueryParser;
 pub use self::query_parser::QueryParserError;
 pub use self::range_query::RangeQuery;
+
+#[cfg(feature="regex_query")]
 pub use self::regex_query::RegexQuery;
+
 pub use self::reqopt_scorer::RequiredOptionalScorer;
 pub use self::scorer::ConstScorer;
 pub use self::scorer::Scorer;
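Because both the module and the re-export are now behind `#[cfg(feature = "regex_query")]`, downstream code that wants to stay buildable with or without regex support has to mirror the gate. A hedged sketch, assuming the downstream crate declares its own `regex_query` feature that forwards to tantivy's (`regex_query = ["tantivy/regex_query"]` in its Cargo.toml):

// Compiled only when this crate (and therefore tantivy) enables regex_query.
#[cfg(feature = "regex_query")]
pub fn regex_queries_available() -> bool {
    use tantivy::query::RegexQuery;
    // The type only exists in gated builds; referencing it is enough here.
    let _witness: Option<RegexQuery> = None;
    true
}

#[cfg(not(feature = "regex_query"))]
pub fn regex_queries_available() -> bool {
    false
}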
@@ -1,5 +1,7 @@
+extern crate fst_regex;
+
 use error::TantivyError;
-use fst_regex::Regex;
+use self::fst_regex::Regex;
 use query::{AutomatonWeight, Query, Weight};
 use schema::Field;
 use std::clone::Clone;
@@ -14,7 +14,7 @@ use std::fmt;
 /// - a field name
 /// - a field type, itself wrapping up options describing
 ///   how the field should be indexed.
-#[derive(Clone, Debug, Eq, PartialEq)]
+#[derive(Clone, Debug)]
 pub struct FieldEntry {
     name: String,
     field_type: FieldType,
@@ -134,15 +134,6 @@ struct InnerSchema {
     fields_map: HashMap<String, Field>, // transient
 }
 
-impl PartialEq for InnerSchema {
-    fn eq(&self, other: &InnerSchema) -> bool {
-        self.fields == other.fields
-    }
-}
-
-impl Eq for InnerSchema {}
-
-
 /// Tantivy has a very strict schema.
 /// You need to specify in advance, whether a field is indexed or not,
 /// stored or not, and RAM-based or not.
@@ -163,7 +154,7 @@ impl Eq for InnerSchema {}
 /// let schema = schema_builder.build();
 ///
 /// ```
-#[derive(Clone, Eq, PartialEq)]
+#[derive(Clone)]
 pub struct Schema(Arc<InnerSchema>);
 
 impl Schema {
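The Eq/PartialEq derives removed here are what allow 0.7.1's `Index::open_or_create` to compare the schema stored on disk with the one passed by the caller (`index.schema() == schema` in the index.rs hunk above). A small sketch of that check, assuming the left-hand side of the diff where `Schema: PartialEq`:

use tantivy::schema::{SchemaBuilder, TEXT};

fn main() {
    let build = || {
        let mut schema_builder = SchemaBuilder::default();
        schema_builder.add_text_field("body", TEXT);
        schema_builder.build()
    };

    // Two schemas built from the same field definitions compare equal,
    // which is exactly the check open_or_create relies on.
    let schema_a = build();
    let schema_b = build();
    assert!(schema_a == schema_b);

    // A schema with a different field set does not.
    let mut other_builder = SchemaBuilder::default();
    other_builder.add_text_field("title", TEXT);
    assert!(schema_a != other_builder.build());
}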
@@ -1,484 +0,0 @@
-/*!
-Representations for the space usage of various parts of a Tantivy index.
-
-This can be used programmatically, and will also be exposed in a human readable fashion in
-tantivy-cli.
-
-One important caveat for all of this functionality is that none of it currently takes storage-level
-details into consideration. For example, if your file system block size is 4096 bytes, we can
-under-count actual resultant space usage by up to 4095 bytes per file.
-*/
-
-use schema::Field;
-use std::collections::HashMap;
-use SegmentComponent;
-
-/// Indicates space usage in bytes
-pub type ByteCount = usize;
-
-/// Enum containing any of the possible space usage results for segment components.
-pub enum ComponentSpaceUsage {
-    /// Data is stored per field in a uniform way
-    PerField(PerFieldSpaceUsage),
-    /// Data is stored in separate pieces in the store
-    Store(StoreSpaceUsage),
-    /// Some sort of raw byte count
-    Basic(ByteCount),
-}
-
-/// Represents combined space usage of an entire searcher and its component segments.
-#[derive(Clone, Debug, Serialize, Deserialize)]
-pub struct SearcherSpaceUsage {
-    segments: Vec<SegmentSpaceUsage>,
-    total: ByteCount,
-}
-
-impl SearcherSpaceUsage {
-    pub(crate) fn new() -> SearcherSpaceUsage {
-        SearcherSpaceUsage {
-            segments: Vec::new(),
-            total: 0,
-        }
-    }
-
-    /// Add a segment, to `self`.
-    /// Performs no deduplication or other intelligence.
-    pub(crate) fn add_segment(&mut self, segment: SegmentSpaceUsage) {
-        self.total += segment.total();
-        self.segments.push(segment);
-    }
-
-    /// Per segment space usage
-    pub fn segments(&self) -> &[SegmentSpaceUsage] {
-        &self.segments[..]
-    }
-
-    /// Returns total byte usage of this searcher, including all large subcomponents.
-    /// Does not account for smaller things like `meta.json`.
-    pub fn total(&self) -> ByteCount {
-        self.total
-    }
-}
-
-/// Represents combined space usage for all of the large components comprising a segment.
-#[derive(Clone, Debug, Serialize, Deserialize)]
-pub struct SegmentSpaceUsage {
-    num_docs: u32,
-
-    termdict: PerFieldSpaceUsage,
-    postings: PerFieldSpaceUsage,
-    positions: PerFieldSpaceUsage,
-    positions_idx: PerFieldSpaceUsage,
-    fast_fields: PerFieldSpaceUsage,
-    fieldnorms: PerFieldSpaceUsage,
-
-    store: StoreSpaceUsage,
-
-    deletes: ByteCount,
-
-    total: ByteCount,
-}
-
-impl SegmentSpaceUsage {
-    pub(crate) fn new(
-        num_docs: u32,
-        termdict: PerFieldSpaceUsage,
-        postings: PerFieldSpaceUsage,
-        positions: PerFieldSpaceUsage,
-        positions_idx: PerFieldSpaceUsage,
-        fast_fields: PerFieldSpaceUsage,
-        fieldnorms: PerFieldSpaceUsage,
-        store: StoreSpaceUsage,
-        deletes: ByteCount,
-    ) -> SegmentSpaceUsage {
-        let total = termdict.total()
-            + postings.total()
-            + positions.total()
-            + fast_fields.total()
-            + fieldnorms.total()
-            + store.total()
-            + deletes;
-        SegmentSpaceUsage {
-            num_docs,
-            termdict,
-            postings,
-            positions,
-            positions_idx,
-            fast_fields,
-            fieldnorms,
-            store,
-            deletes,
-            total,
-        }
-    }
-
-    /// Space usage for the given component
-    ///
-    /// Clones the underlying data.
-    /// Use the components directly if this is somehow in performance critical code.
-    pub fn component(&self, component: SegmentComponent) -> ComponentSpaceUsage {
-        use SegmentComponent::*;
-        use self::ComponentSpaceUsage::*;
-        match component {
-            POSTINGS => PerField(self.postings().clone()),
-            POSITIONS => PerField(self.positions().clone()),
-            POSITIONSSKIP => PerField(self.positions_skip_idx().clone()),
-            FASTFIELDS => PerField(self.fast_fields().clone()),
-            FIELDNORMS => PerField(self.fieldnorms().clone()),
-            TERMS => PerField(self.termdict().clone()),
-            STORE => Store(self.store().clone()),
-            DELETE => Basic(self.deletes()),
-        }
-    }
-
-    /// Num docs in segment
-    pub fn num_docs(&self) -> u32 {
-        self.num_docs
-    }
-
-    /// Space usage for term dictionary
-    pub fn termdict(&self) -> &PerFieldSpaceUsage {
-        &self.termdict
-    }
-
-    /// Space usage for postings list
-    pub fn postings(&self) -> &PerFieldSpaceUsage {
-        &self.postings
-    }
-
-    /// Space usage for positions
-    pub fn positions(&self) -> &PerFieldSpaceUsage {
-        &self.positions
-    }
-
-    /// Space usage for positions skip idx
-    pub fn positions_skip_idx(&self) -> &PerFieldSpaceUsage {
-        &self.positions_idx
-    }
-
-    /// Space usage for fast fields
-    pub fn fast_fields(&self) -> &PerFieldSpaceUsage {
-        &self.fast_fields
-    }
-
-    /// Space usage for field norms
-    pub fn fieldnorms(&self) -> &PerFieldSpaceUsage {
-        &self.fieldnorms
-    }
-
-    /// Space usage for stored documents
-    pub fn store(&self) -> &StoreSpaceUsage {
-        &self.store
-    }
-
-    /// Space usage for document deletions
-    pub fn deletes(&self) -> ByteCount {
-        self.deletes
-    }
-
-    /// Total space usage in bytes for this segment.
-    pub fn total(&self) -> ByteCount {
-        self.total
-    }
-}
-
-/// Represents space usage for the Store for this segment.
-///
-/// This is composed of two parts.
-/// `data` represents the compressed data itself.
-/// `offsets` represents a lookup to find the start of a block
-#[derive(Clone, Debug, Serialize, Deserialize)]
-pub struct StoreSpaceUsage {
-    data: ByteCount,
-    offsets: ByteCount,
-}
-
-impl StoreSpaceUsage {
-    pub(crate) fn new(data: ByteCount, offsets: ByteCount) -> StoreSpaceUsage {
-        StoreSpaceUsage { data, offsets }
-    }
-
-    /// Space usage for the data part of the store
-    pub fn data_usage(&self) -> ByteCount {
-        self.data
-    }
-
-    /// Space usage for the offsets part of the store (doc ID -> offset)
-    pub fn offsets_usage(&self) -> ByteCount {
-        self.offsets
-    }
-
-    /// Total space usage in bytes for this Store
-    pub fn total(&self) -> ByteCount {
-        self.data + self.offsets
-    }
-}
-
-/// Represents space usage for all of the (field, index) pairs that appear in a CompositeFile.
-///
-/// A field can appear with a single index (typically 0) or with multiple indexes.
-/// Multiple indexes are used to handle variable length things, where
-#[derive(Clone, Debug, Serialize, Deserialize)]
-pub struct PerFieldSpaceUsage {
-    fields: HashMap<Field, FieldUsage>,
-    total: ByteCount
-}
-
-impl PerFieldSpaceUsage {
-    pub(crate) fn new(fields: HashMap<Field, FieldUsage>) -> PerFieldSpaceUsage {
-        let total = fields.values().map(|x| x.total()).sum();
-        PerFieldSpaceUsage { fields, total }
-    }
-
-    /// Per field space usage
-    pub fn fields(&self) -> impl Iterator<Item = (&Field, &FieldUsage)> {
-        self.fields.iter()
-    }
-
-    /// Bytes used by the represented file
-    pub fn total(&self) -> ByteCount {
-        self.total
-    }
-}
-
-/// Represents space usage of a given field, breaking it down into the (field, index) pairs that
-/// comprise it.
-///
-/// See documentation for PerFieldSpaceUsage for slightly more information.
-#[derive(Clone, Debug, Serialize, Deserialize)]
-pub struct FieldUsage {
-    field: Field,
-    num_bytes: ByteCount,
-    /// A field can be composed of more than one piece.
-    /// These pieces are indexed by arbitrary numbers starting at zero.
-    /// `self.num_bytes` includes all of `self.sub_num_bytes`.
-    sub_num_bytes: Vec<Option<ByteCount>>,
-}
-
-impl FieldUsage {
-    pub(crate) fn empty(field: Field) -> FieldUsage {
-        FieldUsage {
-            field,
-            num_bytes: 0,
-            sub_num_bytes: Vec::new(),
-        }
-    }
-
-    pub(crate) fn add_field_idx(&mut self, idx: usize, size: ByteCount) {
-        if self.sub_num_bytes.len() < idx + 1{
-            self.sub_num_bytes.resize(idx + 1, None);
-        }
-        assert!(self.sub_num_bytes[idx].is_none());
-        self.sub_num_bytes[idx] = Some(size);
-        self.num_bytes += size
-    }
-
-    /// Field
-    pub fn field(&self) -> Field {
-        self.field
-    }
-
-    /// Space usage for each index
-    pub fn sub_num_bytes(&self) -> &[Option<ByteCount>] {
-        &self.sub_num_bytes[..]
-    }
-
-    /// Total bytes used for this field in this context
-    pub fn total(&self) -> ByteCount {
-        self.num_bytes
-    }
-}
-
-#[cfg(test)]
-mod test {
-    use core::Index;
-    use schema::SchemaBuilder;
-    use schema::{FAST, INT_INDEXED, TEXT};
-    use schema::Field;
-    use space_usage::ByteCount;
-    use space_usage::PerFieldSpaceUsage;
-    use schema::STORED;
-    use Term;
-
-    #[test]
-    fn test_empty() {
-        let schema = SchemaBuilder::new().build();
-        let index = Index::create_in_ram(schema.clone());
-
-        index.load_searchers().unwrap();
-        let searcher = index.searcher();
-        let searcher_space_usage = searcher.space_usage();
-        assert_eq!(0, searcher_space_usage.total());
-    }
-
-    fn expect_single_field(field_space: &PerFieldSpaceUsage, field: &Field, min_size: ByteCount, max_size: ByteCount) {
-        assert!(field_space.total() >= min_size);
-        assert!(field_space.total() <= max_size);
-        assert_eq!(
-            vec![(field, field_space.total())],
-            field_space.fields().map(|(x,y)| (x, y.total())).collect::<Vec<_>>()
-        );
-    }
-
-    #[test]
-    fn test_fast_indexed() {
-        let mut schema_builder = SchemaBuilder::new();
-        let name = schema_builder.add_u64_field("name", FAST | INT_INDEXED);
-        let schema = schema_builder.build();
-        let index = Index::create_in_ram(schema.clone());
-
-        {
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
-            index_writer.add_document(doc!(name => 1u64));
-            index_writer.add_document(doc!(name => 2u64));
-            index_writer.add_document(doc!(name => 10u64));
-            index_writer.add_document(doc!(name => 20u64));
-            index_writer.commit().unwrap();
-        }
-
-        index.load_searchers().unwrap();
-        let searcher = index.searcher();
-        let searcher_space_usage = searcher.space_usage();
-        assert!(searcher_space_usage.total() > 0);
-        assert_eq!(1, searcher_space_usage.segments().len());
-
-        let segment = &searcher_space_usage.segments()[0];
-        assert!(segment.total() > 0);
-
-        assert_eq!(4, segment.num_docs());
-
-        expect_single_field(segment.termdict(), &name, 1, 512);
-        expect_single_field(segment.postings(), &name, 1, 512);
-        assert_eq!(0, segment.positions().total());
-        assert_eq!(0, segment.positions_skip_idx().total());
-        expect_single_field(segment.fast_fields(), &name, 1, 512);
-        expect_single_field(segment.fieldnorms(), &name, 1, 512);
-        // TODO: understand why the following fails
-        // assert_eq!(0, segment.store().total());
-        assert_eq!(0, segment.deletes());
-    }
-
-    #[test]
-    fn test_text() {
-        let mut schema_builder = SchemaBuilder::new();
-        let name = schema_builder.add_text_field("name", TEXT);
-        let schema = schema_builder.build();
-        let index = Index::create_in_ram(schema.clone());
-
-        {
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
-            index_writer.add_document(doc!(name => "hi"));
-            index_writer.add_document(doc!(name => "this is a test"));
-            index_writer.add_document(doc!(name => "some more documents with some word overlap with the other test"));
-            index_writer.add_document(doc!(name => "hello hi goodbye"));
-            index_writer.commit().unwrap();
-        }
-
-        index.load_searchers().unwrap();
-        let searcher = index.searcher();
-        let searcher_space_usage = searcher.space_usage();
-        assert!(searcher_space_usage.total() > 0);
-        assert_eq!(1, searcher_space_usage.segments().len());
-
-        let segment = &searcher_space_usage.segments()[0];
-        assert!(segment.total() > 0);
-
-        assert_eq!(4, segment.num_docs());
-
-        expect_single_field(segment.termdict(), &name, 1, 512);
-        expect_single_field(segment.postings(), &name, 1, 512);
-        expect_single_field(segment.positions(), &name, 1, 512);
-        expect_single_field(segment.positions_skip_idx(), &name, 1, 512);
-        assert_eq!(0, segment.fast_fields().total());
-        expect_single_field(segment.fieldnorms(), &name, 1, 512);
-        // TODO: understand why the following fails
-        // assert_eq!(0, segment.store().total());
-        assert_eq!(0, segment.deletes());
-    }
-
-    #[test]
-    fn test_store() {
-        let mut schema_builder = SchemaBuilder::new();
-        let name = schema_builder.add_text_field("name", STORED);
-        let schema = schema_builder.build();
-        let index = Index::create_in_ram(schema.clone());
-
-        {
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
-            index_writer.add_document(doc!(name => "hi"));
-            index_writer.add_document(doc!(name => "this is a test"));
-            index_writer.add_document(doc!(name => "some more documents with some word overlap with the other test"));
-            index_writer.add_document(doc!(name => "hello hi goodbye"));
-            index_writer.commit().unwrap();
-        }
-
-        index.load_searchers().unwrap();
-        let searcher = index.searcher();
-        let searcher_space_usage = searcher.space_usage();
-        assert!(searcher_space_usage.total() > 0);
-        assert_eq!(1, searcher_space_usage.segments().len());
-
-        let segment = &searcher_space_usage.segments()[0];
-        assert!(segment.total() > 0);
-
-        assert_eq!(4, segment.num_docs());
-
-        assert_eq!(0, segment.termdict().total());
-        assert_eq!(0, segment.postings().total());
-        assert_eq!(0, segment.positions().total());
-        assert_eq!(0, segment.positions_skip_idx().total());
-        assert_eq!(0, segment.fast_fields().total());
-        assert_eq!(0, segment.fieldnorms().total());
-        assert!(segment.store().total() > 0);
-        assert!(segment.store().total() < 512);
-        assert_eq!(0, segment.deletes());
-    }
-
-    #[test]
-    fn test_deletes() {
-        let mut schema_builder = SchemaBuilder::new();
-        let name = schema_builder.add_u64_field("name", INT_INDEXED);
-        let schema = schema_builder.build();
-        let index = Index::create_in_ram(schema.clone());
-
-        {
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
-            index_writer.add_document(doc!(name => 1u64));
-            index_writer.add_document(doc!(name => 2u64));
-            index_writer.add_document(doc!(name => 3u64));
-            index_writer.add_document(doc!(name => 4u64));
-            index_writer.commit().unwrap();
-        }
-
-        {
-            let mut index_writer2 = index.writer(50_000_000).unwrap();
-            index_writer2.delete_term(Term::from_field_u64(name, 2u64));
-            index_writer2.delete_term(Term::from_field_u64(name, 3u64));
-
-            // ok, now we should have a deleted doc
-            index_writer2.commit().unwrap();
-        }
-
-        index.load_searchers().unwrap();
-
-        let searcher = index.searcher();
-        let searcher_space_usage = searcher.space_usage();
-        assert!(searcher_space_usage.total() > 0);
-        assert_eq!(1, searcher_space_usage.segments().len());
-
-        let segment = &searcher_space_usage.segments()[0];
-        assert!(segment.total() > 0);
-
-        assert_eq!(2, segment.num_docs());
-
-        expect_single_field(segment.termdict(), &name, 1, 512);
-        expect_single_field(segment.postings(), &name, 1, 512);
-        assert_eq!(0, segment.positions().total());
-        assert_eq!(0, segment.positions_skip_idx().total());
-        assert_eq!(0, segment.fast_fields().total());
-        expect_single_field(segment.fieldnorms(), &name, 1, 512);
-        // TODO: understand why the following fails
-        // assert_eq!(0, segment.store().total());
-        assert!(segment.deletes() > 0);
-    }
-}
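Given the module removed above, a per-field breakdown can be printed from any SegmentSpaceUsage using only the accessors it defines. A hedged helper sketch (0.7.1-side API; it relies on Field implementing Debug, which the module's own tests also assume):

use tantivy::space_usage::{PerFieldSpaceUsage, SegmentSpaceUsage};

// Print every (field, bytes) pair of one per-field component.
fn print_per_field(label: &str, usage: &PerFieldSpaceUsage) {
    for (field, field_usage) in usage.fields() {
        println!("{}: field {:?} uses {} bytes", label, field, field_usage.total());
    }
}

fn print_segment(segment: &SegmentSpaceUsage) {
    println!("{} docs, {} bytes total", segment.num_docs(), segment.total());
    print_per_field("termdict", segment.termdict());
    print_per_field("postings", segment.postings());
    print_per_field("fast fields", segment.fast_fields());
    println!("store: {} bytes, deletes: {} bytes", segment.store().total(), segment.deletes());
}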
@@ -6,7 +6,6 @@ use common::BinarySerializable;
 use common::VInt;
 use directory::ReadOnlySource;
 use schema::Document;
-use space_usage::StoreSpaceUsage;
 use std::cell::RefCell;
 use std::io;
 use std::mem::size_of;
@@ -88,11 +87,6 @@ impl StoreReader {
         cursor = &cursor[..doc_length];
         Ok(Document::deserialize(&mut cursor)?)
     }
-
-    /// Summarize total space usage of this store reader.
-    pub fn space_usage(&self) -> StoreSpaceUsage {
-        StoreSpaceUsage::new(self.data.len(), self.offset_index_source.len())
-    }
 }
 
 #[cfg_attr(
@@ -96,6 +96,9 @@ fn open_fst_index(source: ReadOnlySource) -> fst::Map {
         ReadOnlySource::Mmap(mmap_readonly) => {
             Fst::from_mmap(mmap_readonly).expect("FST data is corrupted")
         }
+        ReadOnlySource::Static(data) => {
+            Fst::from_static_slice(data).expect("FST data is corrupted")
+        }
     };
     fst::Map::from(fst)
 }
@@ -157,34 +157,35 @@ pub use self::tokenizer::BoxedTokenizer;
 pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
 pub use self::tokenizer_manager::TokenizerManager;
 
+/// This is a function that can be used in tests and doc tests
+/// to assert a token's correctness.
+/// TODO: can this be wrapped in #[cfg(test)] so as not to be in the
+/// public api?
+pub fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
+    assert_eq!(
+        token.position, position,
+        "expected position {} but {:?}",
+        position, token
+    );
+    assert_eq!(token.text, text, "expected text {} but {:?}", text, token);
+    assert_eq!(
+        token.offset_from, from,
+        "expected offset_from {} but {:?}",
+        from, token
+    );
+    assert_eq!(
+        token.offset_to, to,
+        "expected offset_to {} but {:?}",
+        to, token
+    );
+}
 
 #[cfg(test)]
-pub mod tests {
+pub mod test {
+    use super::assert_token;
     use super::Token;
     use super::TokenizerManager;
 
-
-    /// This is a function that can be used in tests and doc tests
-    /// to assert a token's correctness.
-    pub fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
-        assert_eq!(
-            token.position, position,
-            "expected position {} but {:?}",
-            position, token
-        );
-        assert_eq!(token.text, text, "expected text {} but {:?}", text, token);
-        assert_eq!(
-            token.offset_from, from,
-            "expected offset_from {} but {:?}",
-            from, token
-        );
-        assert_eq!(
-            token.offset_to, to,
-            "expected offset_to {} but {:?}",
-            to, token
-        );
-    }
-
     #[test]
     fn test_raw_tokenizer() {
         let tokenizer_manager = TokenizerManager::default();
@@ -223,6 +224,72 @@ pub mod tests {
         assert_token(&tokens[3], 3, "payer", 17, 22);
     }
 
+    #[test]
+    fn test_ngram_tokenizer() {
+        use super::{LowerCaser, NgramTokenizer};
+        use tokenizer::tokenizer::TokenStream;
+        use tokenizer::tokenizer::Tokenizer;
+
+        let tokenizer_manager = TokenizerManager::default();
+        tokenizer_manager.register("ngram12", NgramTokenizer::new(1, 2, false));
+        tokenizer_manager.register(
+            "ngram3",
+            NgramTokenizer::new(3, 3, false).filter(LowerCaser),
+        );
+        tokenizer_manager.register(
+            "edgegram5",
+            NgramTokenizer::new(2, 5, true).filter(LowerCaser),
+        );
+
+        let tokenizer = NgramTokenizer::new(1, 2, false);
+        let mut tokens: Vec<Token> = vec![];
+        {
+            let mut add_token = |token: &Token| {
+                tokens.push(token.clone());
+            };
+            tokenizer.token_stream("hello").process(&mut add_token);
+        }
+        assert_eq!(tokens.len(), 9);
+        assert_token(&tokens[0], 0, "h", 0, 1);
+        assert_token(&tokens[1], 0, "he", 0, 2);
+        assert_token(&tokens[2], 1, "e", 1, 2);
+        assert_token(&tokens[3], 1, "el", 1, 3);
+        assert_token(&tokens[4], 2, "l", 2, 3);
+        assert_token(&tokens[5], 2, "ll", 2, 4);
+        assert_token(&tokens[6], 3, "l", 3, 4);
+        assert_token(&tokens[7], 3, "lo", 3, 5);
+        assert_token(&tokens[8], 4, "o", 4, 5);
+
+        let tokenizer = tokenizer_manager.get("ngram3").unwrap();
+        let mut tokens: Vec<Token> = vec![];
+        {
+            let mut add_token = |token: &Token| {
+                tokens.push(token.clone());
+            };
+            tokenizer.token_stream("Hello").process(&mut add_token);
+        }
+        assert_eq!(tokens.len(), 3);
+        assert_token(&tokens[0], 0, "hel", 0, 3);
+        assert_token(&tokens[1], 1, "ell", 1, 4);
+        assert_token(&tokens[2], 2, "llo", 2, 5);
+
+        let tokenizer = tokenizer_manager.get("edgegram5").unwrap();
+        let mut tokens: Vec<Token> = vec![];
+        {
+            let mut add_token = |token: &Token| {
+                tokens.push(token.clone());
+            };
+            tokenizer
+                .token_stream("Frankenstein")
+                .process(&mut add_token);
+        }
+        assert_eq!(tokens.len(), 4);
+        assert_token(&tokens[0], 0, "fr", 0, 2);
+        assert_token(&tokens[1], 0, "fra", 0, 3);
+        assert_token(&tokens[2], 0, "fran", 0, 4);
+        assert_token(&tokens[3], 0, "frank", 0, 5);
+    }
+
     #[test]
     fn test_tokenizer_empty() {
         let tokenizer_manager = TokenizerManager::default();
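Moving assert_token out of the test module and into the crate's public tokenizer API (right-hand side) is what lets the NgramTokenizer doc test in the next file call it. A short example of the helper, mirroring the assertions in that doc test (it assumes the right-hand side of the diff, where assert_token is exported and ngram positions track the starting character):

extern crate tantivy;
use tantivy::tokenizer::*;
use tantivy::tokenizer::assert_token;

fn main() {
    let tokenizer = NgramTokenizer::new(2, 3, false);
    let mut stream = tokenizer.token_stream("hello");
    // assert_token(token, position, text, offset_from, offset_to)
    assert_token(stream.next().unwrap(), 0, "he", 0, 2);
    assert_token(stream.next().unwrap(), 0, "hel", 0, 3);
    assert_token(stream.next().unwrap(), 1, "el", 1, 3);
}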
@@ -2,15 +2,14 @@ use super::{Token, TokenStream, Tokenizer};
|
|||||||
|
|
||||||
/// Tokenize the text by splitting words into n-grams of the given size(s)
|
/// Tokenize the text by splitting words into n-grams of the given size(s)
|
||||||
///
|
///
|
||||||
/// With this tokenizer, the `position` is always 0.
|
/// With this tokenizer, the `position` field expresses the starting offset of the ngram
|
||||||
/// Beware however, in presence of multiple value for the same field,
|
/// rather than the `token` offset.
|
||||||
/// the position will be `POSITION_GAP * index of value`.
|
|
||||||
///
|
///
|
||||||
/// Example 1: `hello` would be tokenized as (min_gram: 2, max_gram: 3, prefix_only: false)
|
/// Example 1: `hello` would be tokenized as (min_gram: 2, max_gram: 3, prefix_only: false)
|
||||||
///
|
///
|
||||||
/// | Term | he | hel | el | ell | ll | llo | lo |
|
/// | Term | he | hel | el | ell | ll | llo | lo |
|
||||||
/// |----------|-----|-----|-----|-----|-----|-----|----|
|
/// |----------|-----|-----|-----|-----|-----|-----|----|
|
||||||
/// | Position | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
|
/// | Position | 0 | 0 | 1 | 1 | 2 | 2 | 3 |
|
||||||
/// | Offsets | 0,2 | 0,3 | 1,3 | 1,4 | 2,4 | 2,5 | 3,5|
|
/// | Offsets | 0,2 | 0,3 | 1,3 | 1,4 | 2,4 | 2,5 | 3,5|
|
||||||
///
|
///
|
||||||
/// Example 2: `hello` would be tokenized as (min_gram: 2, max_gram: 5, prefix_only: **true**)
|
/// Example 2: `hello` would be tokenized as (min_gram: 2, max_gram: 5, prefix_only: **true**)
|
||||||
@@ -20,63 +19,24 @@ use super::{Token, TokenStream, Tokenizer};
|
|||||||
/// | Position | 0 | 0 | 0 | 0 |
|
/// | Position | 0 | 0 | 0 | 0 |
|
||||||
/// | Offsets | 0,2 | 0,3 | 0,4 | 0,5 |
|
/// | Offsets | 0,2 | 0,3 | 0,4 | 0,5 |
|
||||||
///
|
///
|
||||||
/// Example 3: `hεllo` (non-ascii) would be tokenized as (min_gram: 2, max_gram: 5, prefix_only: **true**)
|
|
||||||
///
|
|
||||||
/// | Term | hε | hεl | hεll | hεllo |
|
|
||||||
/// |----------|-----|-----|-------|-------|
|
|
||||||
/// | Position | 0 | 0 | 0 | 0 |
|
|
||||||
/// | Offsets | 0,3 | 0,4 | 0,5 | 0,6 |
|
|
||||||
///
|
|
||||||
/// # Example
|
/// # Example
|
||||||
///
|
///
|
||||||
/// ```
|
/// ```
|
||||||
/// # extern crate tantivy;
|
/// extern crate tantivy;
|
||||||
/// use tantivy::tokenizer::*;
|
/// use tantivy::tokenizer::*;
|
||||||
|
/// use tantivy::tokenizer::assert_token;
|
||||||
|
///
|
||||||
/// # fn main() {
|
/// # fn main() {
|
||||||
/// let tokenizer = NgramTokenizer::new(2, 3, false);
|
/// let tokenizer = NgramTokenizer::new(2, 3, false);
|
||||||
/// let mut stream = tokenizer.token_stream("hello");
|
/// let mut stream = tokenizer.token_stream("hello");
|
||||||
/// {
|
///
|
||||||
/// let token = stream.next().unwrap();
|
/// assert_token(stream.next().unwrap(), 0, "he", 0, 2);
|
||||||
/// assert_eq!(token.text, "he");
|
/// assert_token(stream.next().unwrap(), 0, "hel", 0, 3);
|
||||||
/// assert_eq!(token.offset_from, 0);
|
/// assert_token(stream.next().unwrap(), 1, "el", 1, 3);
|
||||||
/// assert_eq!(token.offset_to, 2);
|
/// assert_token(stream.next().unwrap(), 1, "ell", 1, 4);
|
||||||
/// }
|
/// assert_token(stream.next().unwrap(), 2, "ll", 2, 4);
|
||||||
/// {
|
/// assert_token(stream.next().unwrap(), 2, "llo", 2, 5);
|
||||||
/// let token = stream.next().unwrap();
|
/// assert_token(stream.next().unwrap(), 3, "lo", 3, 5);
|
||||||
/// assert_eq!(token.text, "hel");
|
|
||||||
/// assert_eq!(token.offset_from, 0);
|
|
||||||
/// assert_eq!(token.offset_to, 3);
|
|
||||||
/// }
|
|
||||||
/// {
|
|
||||||
/// let token = stream.next().unwrap();
|
|
||||||
/// assert_eq!(token.text, "el");
|
|
||||||
/// assert_eq!(token.offset_from, 1);
|
|
||||||
/// assert_eq!(token.offset_to, 3);
|
|
||||||
/// }
|
|
||||||
/// {
|
|
||||||
/// let token = stream.next().unwrap();
|
|
||||||
/// assert_eq!(token.text, "ell");
|
|
||||||
/// assert_eq!(token.offset_from, 1);
|
|
||||||
/// assert_eq!(token.offset_to, 4);
|
|
||||||
/// }
|
|
||||||
/// {
|
|
||||||
/// let token = stream.next().unwrap();
|
|
||||||
/// assert_eq!(token.text, "ll");
|
|
||||||
/// assert_eq!(token.offset_from, 2);
|
|
||||||
/// assert_eq!(token.offset_to, 4);
|
|
||||||
/// }
|
|
||||||
/// {
|
|
||||||
/// let token = stream.next().unwrap();
|
|
||||||
/// assert_eq!(token.text, "llo");
|
|
||||||
/// assert_eq!(token.offset_from, 2);
|
|
||||||
/// assert_eq!(token.offset_to, 5);
|
|
||||||
/// }
|
|
||||||
/// {
|
|
||||||
/// let token = stream.next().unwrap();
|
|
||||||
/// assert_eq!(token.text, "lo");
|
|
||||||
/// assert_eq!(token.offset_from, 3);
|
|
||||||
/// assert_eq!(token.offset_to, 5);
|
|
||||||
/// }
|
|
||||||
/// assert!(stream.next().is_none());
|
/// assert!(stream.next().is_none());
|
||||||
/// # }
|
/// # }
|
||||||
/// ```
|
/// ```
|
||||||
@@ -98,37 +58,23 @@ impl NgramTokenizer {
|
|||||||
min_gram <= max_gram,
|
min_gram <= max_gram,
|
||||||
"min_gram must not be greater than max_gram"
|
"min_gram must not be greater than max_gram"
|
||||||
);
|
);
|
||||||
|
|
||||||
NgramTokenizer {
|
NgramTokenizer {
|
||||||
min_gram,
|
min_gram,
|
||||||
max_gram,
|
max_gram,
|
||||||
prefix_only,
|
prefix_only,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Create a `NGramTokenizer` which generates tokens for all inner ngrams.
|
|
||||||
///
|
|
||||||
/// This is as opposed to only prefix ngrams .
|
|
||||||
pub fn all_ngrams(min_gram: usize, max_gram:usize) -> NgramTokenizer {
|
|
||||||
Self::new(min_gram, max_gram, false)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Create a `NGramTokenizer` which only generates tokens for the
|
|
||||||
/// prefix ngrams.
|
|
||||||
pub fn prefix_only(min_gram: usize, max_gram: usize) -> NgramTokenizer {
|
|
||||||
Self::new(min_gram, max_gram, true)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// TokenStream associate to the `NgramTokenizer`
|
|
||||||
pub struct NgramTokenStream<'a> {
|
pub struct NgramTokenStream<'a> {
|
||||||
/// parameters
|
|
||||||
ngram_charidx_iterator: StutteringIterator<CodepointFrontiers<'a>>,
|
|
||||||
/// true if the NgramTokenStream is in prefix mode.
|
|
||||||
prefix_only: bool,
|
|
||||||
/// input
|
|
||||||
text: &'a str,
|
text: &'a str,
|
||||||
/// output
|
position: usize,
|
||||||
|
text_length: usize,
|
||||||
token: Token,
|
token: Token,
|
||||||
|
min_gram: usize,
|
||||||
|
max_gram: usize,
|
||||||
|
gram_size: usize,
|
||||||
|
prefix_only: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Tokenizer<'a> for NgramTokenizer {
|
impl<'a> Tokenizer<'a> for NgramTokenizer {
|
||||||
@@ -136,28 +82,65 @@ impl<'a> Tokenizer<'a> for NgramTokenizer {

     fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl {
         NgramTokenStream {
-            ngram_charidx_iterator: StutteringIterator::new(
-                CodepointFrontiers::for_str(text),
-                self.min_gram,
-                self.max_gram),
-            prefix_only: self.prefix_only,
             text,
+            position: 0,
+            text_length: text.len(),
             token: Token::default(),
+            min_gram: self.min_gram,
+            max_gram: self.max_gram,
+            prefix_only: self.prefix_only,
+            gram_size: self.min_gram,
         }
     }
 }

+impl<'a> NgramTokenStream<'a> {
+    /// Get the next set of token options
+    /// cycle through 1,2 (min..=max)
+    /// returning None if processing should stop
+    fn chomp(&mut self) -> Option<(usize, usize)> {
+        // Have we exceeded the bounds of the text we are indexing?
+        if self.gram_size > self.max_gram {
+            if self.prefix_only {
+                return None;
+            }
+
+            // since we aren't just processing edges
+            // we need to reset the gram size
+            self.gram_size = self.min_gram;
+
+            // and move down the chain of letters
+            self.position += 1;
+        }
+
+        let result = if (self.position + self.gram_size) <= self.text_length {
+            Some((self.position, self.gram_size))
+        } else {
+            None
+        };
+
+        // increase the gram size for the next pass
+        self.gram_size += 1;
+
+        result
+    }
+}
+
 impl<'a> TokenStream for NgramTokenStream<'a> {
     fn advance(&mut self) -> bool {
-        if let Some((offset_from, offset_to)) = self.ngram_charidx_iterator.next() {
-            if self.prefix_only && offset_from > 0 {
-                return false;
-            }
-            self.token.position = 0;
+        // clear out working token text
+        self.token.text.clear();
+
+        if let Some((position, size)) = self.chomp() {
+            self.token.position = position;
+            let offset_from = position;
+            let offset_to = offset_from + size;
+
             self.token.offset_from = offset_from;
             self.token.offset_to = offset_to;
-            self.token.text.clear();
+
             self.token.text.push_str(&self.text[offset_from..offset_to]);
+
             true
         } else {
             false
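The `chomp` helper introduced in the hunk above enumerates `(position, gram_size)` pairs in a fixed order: every gram size at the current position, then the next position, stopping at the first pair that runs past `text_length`. The standalone sketch below is my paraphrase of that control flow for the non-prefix case (the names are not from the branch) and makes the order concrete:

```rust
// Mirrors the (position, size) order of `chomp` for min_gram = 1, max_gram = 2
// over a 5-byte text such as "hello": (0,1) (0,2) (1,1) (1,2) ... (4,1).
fn main() {
    let (min_gram, max_gram, text_length) = (1usize, 2usize, 5usize);
    let mut pairs = Vec::new();
    let mut position = 0;
    'stream: loop {
        for gram_size in min_gram..=max_gram {
            if position + gram_size > text_length {
                // chomp() returns None here, which ends the token stream.
                break 'stream;
            }
            pairs.push((position, gram_size));
        }
        position += 1;
    }
    println!("{:?}", pairs); // 9 pairs, matching the 9 tokens asserted for "hello"
}
```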
@@ -167,307 +150,8 @@ impl<'a> TokenStream for NgramTokenStream<'a> {
     fn token(&self) -> &Token {
         &self.token
     }

     fn token_mut(&mut self) -> &mut Token {
         &mut self.token
     }
 }
-
-/// This iterator takes an underlying Iterator
-/// and emits all of the pairs `(a,b)` such that
-/// a and b are items emitted by the iterator at
-/// an interval between `min_gram` and `max_gram`.
-///
-/// The elements are emitted in the order of appearance
-/// of `a` first, `b` then.
-///
-/// See `test_stutterring_iterator` for an example of its
-/// output.
-struct StutteringIterator<T> {
-    underlying: T,
-    min_gram: usize,
-    max_gram: usize,
-
-    memory: Vec<usize>,
-    cursor: usize,
-    gram_len: usize
-}
-
-impl<T> StutteringIterator<T>
-    where T: Iterator<Item=usize> {
-    pub fn new(mut underlying: T, min_gram: usize, max_gram: usize) -> StutteringIterator<T> {
-        assert!(min_gram > 0);
-        let memory: Vec<usize> = (&mut underlying).take(max_gram + 1).collect();
-        if memory.len() <= min_gram {
-            // returns an empty iterator
-            StutteringIterator {
-                underlying,
-                min_gram: 1,
-                max_gram: 0,
-                memory,
-                cursor: 0,
-                gram_len: 0,
-            }
-        } else {
-            StutteringIterator {
-                underlying,
-                min_gram,
-                max_gram: memory.len() - 1,
-                memory,
-                cursor: 0,
-                gram_len: min_gram,
-            }
-        }
-    }
-}
-
-impl<T> Iterator for StutteringIterator<T>
-    where T: Iterator<Item=usize> {
-    type Item = (usize, usize);
-
-    fn next(&mut self) -> Option<(usize, usize)> {
-        if self.gram_len > self.max_gram {
-            // we have exhausted all options
-            // starting at `self.memory[self.cursor]`.
-            //
-            // Time to advance.
-            self.gram_len = self.min_gram;
-            if let Some(next_val) = self.underlying.next() {
-                self.memory[self.cursor] = next_val;
-            } else {
-                self.max_gram -= 1;
-            }
-            self.cursor += 1;
-            if self.cursor >= self.memory.len() {
-                self.cursor = 0;
-            }
-        }
-        if self.max_gram < self.min_gram {
-            return None;
-        }
-        let start = self.memory[self.cursor % self.memory.len()];
-        let stop = self.memory[(self.cursor + self.gram_len) % self.memory.len()];
-        self.gram_len += 1;
-        Some((start, stop))
-    }
-}
-
-/// Emits all of the offsets where a codepoint starts
-/// or a codepoint ends.
-///
-/// By convention, we emit [0] for the empty string.
-struct CodepointFrontiers<'a> {
-    s: &'a str,
-    next_el: Option<usize>
-}
-
-impl<'a> CodepointFrontiers<'a> {
-    fn for_str(s: &'a str) -> Self {
-        CodepointFrontiers {
-            s,
-            next_el: Some(0)
-        }
-    }
-}
-
-impl<'a> Iterator for CodepointFrontiers<'a> {
-    type Item = usize;
-
-    fn next(&mut self) -> Option<usize> {
-        self.next_el
-            .map(|offset| {
-                if self.s.is_empty() {
-                    self.next_el = None;
-                } else {
-                    let first_codepoint_width = utf8_codepoint_width(self.s.as_bytes()[0]);
-                    self.s = &self.s[first_codepoint_width..];
-                    self.next_el = Some(offset + first_codepoint_width);
-                }
-                offset
-            })
-    }
-}
-
-const CODEPOINT_UTF8_WIDTH: [u8; 16] = [
-    1, 1, 1, 1,
-    1, 1, 1, 1,
-    2, 2, 2, 2,
-    2, 2, 3, 4,
-];
-
-// Number of bytes to encode a codepoint in UTF-8 given
-// the first byte.
-//
-// To do that we count the number of higher significant bits set to `1`.
-fn utf8_codepoint_width(b: u8) -> usize {
-    let higher_4_bits = (b as usize) >> 4;
-    CODEPOINT_UTF8_WIDTH[higher_4_bits] as usize
-}
-
-#[cfg(test)]
-mod tests {
-
-    use tokenizer::tokenizer::{TokenStream, Tokenizer};
-    use super::NgramTokenizer;
-    use tokenizer::Token;
-    use tokenizer::tests::assert_token;
-    use super::CodepointFrontiers;
-    use super::StutteringIterator;
-    use super::utf8_codepoint_width;
-
-    fn test_helper<T: TokenStream>(mut tokenizer: T) -> Vec<Token> {
-        let mut tokens: Vec<Token> = vec![];
-        tokenizer.process(&mut |token: &Token| tokens.push(token.clone()));
-        tokens
-    }
-
-    #[test]
-    fn test_utf8_codepoint_width() {
-        // 0xxx
-        for i in 0..128 {
-            assert_eq!(utf8_codepoint_width(i), 1);
-        }
-        // 110xx
-        for i in (128 | 64)..(128 | 64 | 32) {
-            assert_eq!(utf8_codepoint_width(i), 2);
-        }
-        // 1110xx
-        for i in (128 | 64 | 32)..(128 | 64 | 32 | 16) {
-            assert_eq!(utf8_codepoint_width(i), 3);
-        }
-        // 1111xx
-        for i in (128 | 64 | 32 | 16)..256 {
-            assert_eq!(utf8_codepoint_width(i as u8), 4);
-        }
-    }
-
-    #[test]
-    fn test_codepoint_frontiers() {
-        assert_eq!(CodepointFrontiers::for_str("").collect::<Vec<_>>(), vec![0]);
-        assert_eq!(
-            CodepointFrontiers::for_str("abcd").collect::<Vec<_>>(),
-            vec![0, 1, 2, 3, 4]
-        );
-        assert_eq!(
-            CodepointFrontiers::for_str("aあ").collect::<Vec<_>>(),
-            vec![0, 1, 4]
-        );
-    }
-
-    #[test]
-    fn test_ngram_tokenizer_1_2_false() {
-        let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hello"));
-        assert_eq!(tokens.len(), 9);
-        assert_token(&tokens[0], 0, "h", 0, 1);
-        assert_token(&tokens[1], 0, "he", 0, 2);
-        assert_token(&tokens[2], 0, "e", 1, 2);
-        assert_token(&tokens[3], 0, "el", 1, 3);
-        assert_token(&tokens[4], 0, "l", 2, 3);
-        assert_token(&tokens[5], 0, "ll", 2, 4);
-        assert_token(&tokens[6], 0, "l", 3, 4);
-        assert_token(&tokens[7], 0, "lo", 3, 5);
-        assert_token(&tokens[8], 0, "o", 4, 5);
-    }
-
-    #[test]
-    fn test_ngram_tokenizer_min_max_equal() {
-        let tokens = test_helper(NgramTokenizer::all_ngrams(3, 3).token_stream("hello"));
-        assert_eq!(tokens.len(), 3);
-        assert_token(&tokens[0], 0, "hel", 0, 3);
-        assert_token(&tokens[1], 0, "ell", 1, 4);
-        assert_token(&tokens[2], 0, "llo", 2, 5);
-    }
-
-    #[test]
-    fn test_ngram_tokenizer_2_5_prefix() {
-        let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("frankenstein"));
-        assert_eq!(tokens.len(), 4);
-        assert_token(&tokens[0], 0, "fr", 0, 2);
-        assert_token(&tokens[1], 0, "fra", 0, 3);
-        assert_token(&tokens[2], 0, "fran", 0, 4);
-        assert_token(&tokens[3], 0, "frank", 0, 5);
-    }
-
-    #[test]
-    fn test_ngram_non_ascii_1_2() {
-        let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hεllo"));
-        assert_eq!(tokens.len(), 9);
-        assert_token(&tokens[0], 0, "h", 0, 1);
-        assert_token(&tokens[1], 0, "hε", 0, 3);
-        assert_token(&tokens[2], 0, "ε", 1, 3);
-        assert_token(&tokens[3], 0, "εl", 1, 4);
-        assert_token(&tokens[4], 0, "l", 3, 4);
-        assert_token(&tokens[5], 0, "ll", 3, 5);
-        assert_token(&tokens[6], 0, "l", 4, 5);
-        assert_token(&tokens[7], 0, "lo", 4, 6);
-        assert_token(&tokens[8], 0, "o", 5, 6);
-    }
-
-    #[test]
-    fn test_ngram_non_ascii_2_5_prefix() {
-        let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("hεllo"));
-        assert_eq!(tokens.len(), 4);
-        assert_token(&tokens[0], 0, "hε", 0, 3);
-        assert_token(&tokens[1], 0, "hεl", 0, 4);
-        assert_token(&tokens[2], 0, "hεll", 0, 5);
-        assert_token(&tokens[3], 0, "hεllo", 0, 6);
-    }
-
-    #[test]
-    fn test_ngram_empty() {
-        let tokens = test_helper(NgramTokenizer::all_ngrams(1, 5).token_stream(""));
-        assert!(tokens.is_empty());
-        let tokens = test_helper(NgramTokenizer::all_ngrams(2, 5).token_stream(""));
-        assert!(tokens.is_empty());
-    }
-
-    #[test]
-    #[should_panic(expected = "min_gram must be greater than 0")]
-    fn test_ngram_min_max_interval_empty() {
-        test_helper(NgramTokenizer::all_ngrams(0, 2).token_stream("hellossss"));
-    }
-
-    #[test]
-    #[should_panic(expected = "min_gram must not be greater than max_gram")]
-    fn test_invalid_interval_should_panic_if_smaller() {
-        NgramTokenizer::all_ngrams(2, 1);
-    }
-
-    #[test]
-    fn test_stutterring_iterator_empty() {
-        let rg: Vec<usize> = vec![0];
-        let mut it = StutteringIterator::new(rg.into_iter(), 1, 2);
-        assert_eq!(it.next(), None);
-    }
-
-    #[test]
-    fn test_stutterring_iterator() {
-        let rg: Vec<usize> = (0..10).collect();
-        let mut it = StutteringIterator::new(rg.into_iter(), 1, 2);
-        assert_eq!(it.next(), Some((0, 1)));
-        assert_eq!(it.next(), Some((0, 2)));
-        assert_eq!(it.next(), Some((1, 2)));
-        assert_eq!(it.next(), Some((1, 3)));
-        assert_eq!(it.next(), Some((2, 3)));
-        assert_eq!(it.next(), Some((2, 4)));
-        assert_eq!(it.next(), Some((3, 4)));
-        assert_eq!(it.next(), Some((3, 5)));
-        assert_eq!(it.next(), Some((4, 5)));
-        assert_eq!(it.next(), Some((4, 6)));
-        assert_eq!(it.next(), Some((5, 6)));
-        assert_eq!(it.next(), Some((5, 7)));
-        assert_eq!(it.next(), Some((6, 7)));
-        assert_eq!(it.next(), Some((6, 8)));
-        assert_eq!(it.next(), Some((7, 8)));
-        assert_eq!(it.next(), Some((7, 9)));
-        assert_eq!(it.next(), Some((8, 9)));
-        assert_eq!(it.next(), None);
-    }
-}
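For orientation, the removed test module above drives the tokenizer through the public surface shown on the left side of this diff: build an `NgramTokenizer` with `all_ngrams` or `prefix_only`, ask it for a token stream, and pull tokens with `advance`/`token`. The sketch below illustrates that pattern, assuming those items are re-exported from `tantivy::tokenizer` as in 0.7.1; it is not code from either branch.

```rust
use tantivy::tokenizer::{NgramTokenizer, Token, TokenStream, Tokenizer};

// Collect every 2- and 3-gram of `text`, the way the removed tests do via
// `test_helper`, but using the `advance`/`token` loop shown in the diff.
fn collect_ngrams(text: &str) -> Vec<Token> {
    let mut stream = NgramTokenizer::all_ngrams(2, 3).token_stream(text);
    let mut tokens = Vec::new();
    while stream.advance() {
        tokens.push(stream.token().clone());
    }
    tokens
}

fn main() {
    for token in collect_ngrams("hεllo") {
        println!("{:?} [{}..{}]", token.text, token.offset_from, token.offset_to);
    }
}
```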