Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2026-01-09 02:22:54 +00:00)

Compare commits: 0.7.1 ... issue/weba (2 commits)

| Author | SHA1 | Date |
|---|---|---|
|  | 507e46f814 |  |
|  | 3d3da2d66f |  |
@@ -1,7 +1,3 @@
-Tantivy 0.7.1
-=====================
-- Bugfix: NGramTokenizer panics on non ascii chars
-- Added a space usage API
 
 Tantivy 0.7
 =====================
@@ -1,6 +1,6 @@
 [package]
 name = "tantivy"
-version = "0.7.1"
+version = "0.7.0"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
@@ -12,12 +12,12 @@ readme = "README.md"
 keywords = ["search", "information", "retrieval"]
 
 [dependencies]
-base64 = "0.10.0"
+base64 = "0.9.1"
 byteorder = "1.0"
 lazy_static = "1"
 regex = "1.0"
 fst = {version="0.3", default-features=false}
-fst-regex = { version="0.2" }
+fst-regex = { version="0.2", optional=true}
 lz4 = {version="1.20", optional=true}
 snap = {version="0.2"}
 atomicwrites = {version="0.2.2", optional=true}
@@ -68,8 +68,9 @@ overflow-checks = true
 
 [features]
 # by default no-fail is disabled. We manually enable it when running test.
-default = ["mmap", "no_fail"]
+default = ["mmap", "no_fail", "regex_query"]
 mmap = ["fst/mmap", "atomicwrites"]
+regex_query = ["fst-regex"]
 lz4-compression = ["lz4"]
 no_fail = ["fail/no_fail"]
 
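For context, a hedged sketch (not code from the repository) of the mechanics this Cargo.toml change relies on: marking fst-regex `optional=true` means the dependency is only compiled when a feature pulls it in, and the new `regex_query = ["fst-regex"]` feature is that switch; code that needs the dependency is then gated with `#[cfg(feature = "regex_query")]`, as the query-module hunks further down in this diff show.

// Sketch only: items below are compiled only when the crate is built
// with the `regex_query` feature (here enabled through default features).
#[cfg(feature = "regex_query")]
pub mod regex_query {
    // The real module would pull in the optional fst-regex dependency;
    // this placeholder just marks the feature-gated surface.
    pub struct RegexQuery;
}

#[cfg(feature = "regex_query")]
pub use self::regex_query::RegexQuery;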
@@ -21,7 +21,7 @@
 
 **Tantivy** is a **full text search engine library** written in rust.
 
-It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elastic Search](https://www.elastic.co/products/elasticsearch) and [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
+It is closer to Lucene than to Elastic Search and Solr in the sense it is not
 an off-the-shelf search engine server, but rather a crate that can be used
 to build such a search engine.
 
@@ -4,8 +4,6 @@ use common::VInt;
 use directory::ReadOnlySource;
 use directory::WritePtr;
 use schema::Field;
-use space_usage::PerFieldSpaceUsage;
-use space_usage::FieldUsage;
 use std::collections::HashMap;
 use std::io::Write;
 use std::io::{self, Read};
@@ -168,16 +166,6 @@ impl CompositeFile {
             .get(&FileAddr { field, idx })
             .map(|&(from, to)| self.data.slice(from, to))
     }
 
-    pub fn space_usage(&self) -> PerFieldSpaceUsage {
-        let mut fields = HashMap::new();
-        for (&field_addr, &(start, end)) in self.offsets_index.iter() {
-            fields.entry(field_addr.field)
-                .or_insert_with(|| FieldUsage::empty(field_addr.field))
-                .add_field_idx(field_addr.idx, end - start);
-        }
-        PerFieldSpaceUsage::new(fields)
-    }
 }
 
 #[cfg(test)]
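The 0.7.1-side `space_usage` method removed above follows a common accumulation pattern: group byte ranges by field, creating an empty entry on first sight. A minimal self-contained sketch of that pattern, with tantivy's Field and FieldUsage types replaced by simplified stand-ins:

use std::collections::HashMap;

// Simplified stand-ins for tantivy's Field / FieldUsage types.
type Field = u32;

#[derive(Default)]
struct FieldUsage {
    num_bytes: usize,
}

fn accumulate(ranges: &[(Field, usize, usize)]) -> HashMap<Field, FieldUsage> {
    let mut fields: HashMap<Field, FieldUsage> = HashMap::new();
    for &(field, start, end) in ranges {
        // Create the entry lazily, then add this (start, end) slice's size.
        fields
            .entry(field)
            .or_insert_with(FieldUsage::default)
            .num_bytes += end - start;
    }
    fields
}

fn main() {
    let usage = accumulate(&[(1, 0, 10), (1, 10, 30), (2, 30, 34)]);
    assert_eq!(usage[&1].num_bytes, 30);
    assert_eq!(usage[&2].num_bytes, 4);
}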
@@ -49,11 +49,6 @@ pub struct Index {
 }
 
 impl Index {
-    /// Examines the director to see if it contains an index
-    pub fn exists<Dir: Directory>(dir: &Dir) -> bool {
-        dir.exists(&META_FILEPATH)
-    }
-
     /// Creates a new index using the `RAMDirectory`.
     ///
     /// The index will be allocated in anonymous memory.
@@ -70,28 +65,9 @@ impl Index {
     #[cfg(feature = "mmap")]
     pub fn create_in_dir<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> {
         let mmap_directory = MmapDirectory::open(directory_path)?;
-        if Index::exists(&mmap_directory) {
-            return Err(TantivyError::IndexAlreadyExists);
-        }
-
         Index::create(mmap_directory, schema)
     }
-
-    /// Opens or creates a new index in the provided directory
-    #[cfg(feature = "mmap")]
-    pub fn open_or_create<Dir: Directory>(dir: Dir, schema: Schema) -> Result<Index> {
-        if Index::exists(&dir) {
-            let index = Index::open(dir)?;
-            if index.schema() == schema {
-                Ok(index)
-            } else {
-                Err(TantivyError::SchemaError("An index exists but the schema does not match.".to_string()))
-            }
-        } else {
-            Index::create(dir, schema)
-        }
-    }
 
     /// Creates a new index in a temp directory.
     ///
     /// The index will use the `MMapDirectory` in a newly created directory.
@@ -113,8 +89,6 @@ impl Index {
     }
 
     /// Create a new index from a directory.
-    ///
-    /// This will overwrite existing meta.json
     fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
         save_new_metas(schema.clone(), 0, directory.borrow_mut())?;
         let metas = IndexMeta::with_schema(schema);
@@ -354,9 +328,8 @@ impl Clone for Index {
 
 #[cfg(test)]
 mod tests {
-    use schema::{Schema, SchemaBuilder, INT_INDEXED, TEXT};
+    use schema::{SchemaBuilder, INT_INDEXED, TEXT};
     use Index;
-    use directory::RAMDirectory;
 
     #[test]
     fn test_indexer_for_field() {
@@ -372,52 +345,4 @@ mod tests {
         );
     }
 
-    #[test]
-    fn test_index_exists() {
-        let directory = RAMDirectory::create();
-        assert!(!Index::exists(&directory));
-        assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
-        assert!(Index::exists(&directory));
-    }
-
-    #[test]
-    fn open_or_create_should_create() {
-        let directory = RAMDirectory::create();
-        assert!(!Index::exists(&directory));
-        assert!(Index::open_or_create(directory.clone(), throw_away_schema()).is_ok());
-        assert!(Index::exists(&directory));
-    }
-
-
-    #[test]
-    fn open_or_create_should_open() {
-        let directory = RAMDirectory::create();
-        assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
-        assert!(Index::exists(&directory));
-        assert!(Index::open_or_create(directory, throw_away_schema()).is_ok());
-    }
-
-    #[test]
-    fn create_should_wipeoff_existing() {
-        let directory = RAMDirectory::create();
-        assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
-        assert!(Index::exists(&directory));
-        assert!(Index::create(directory.clone(), SchemaBuilder::default().build()).is_ok());
-    }
-
-    #[test]
-    fn open_or_create_exists_but_schema_does_not_match() {
-        let directory = RAMDirectory::create();
-        assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
-        assert!(Index::exists(&directory));
-        assert!(Index::open_or_create(directory.clone(), throw_away_schema()).is_ok());
-        let err = Index::open_or_create(directory, SchemaBuilder::default().build());
-        assert_eq!(format!("{:?}", err.unwrap_err()), "SchemaError(\"An index exists but the schema does not match.\")");
-    }
-
-    fn throw_away_schema() -> Schema {
-        let mut schema_builder = SchemaBuilder::default();
-        let _ = schema_builder.add_u64_field("num_likes", INT_INDEXED);
-        schema_builder.build()
-    }
 }
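The removed tests already demonstrate how the 0.7.1-side API is meant to be used. The following condensed sketch assumes the left-hand side of this diff, i.e. a tantivy build that still has `Index::exists` and `Index::open_or_create`:

use tantivy::schema::{SchemaBuilder, INT_INDEXED};
use tantivy::directory::RAMDirectory;
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    let mut schema_builder = SchemaBuilder::default();
    let _ = schema_builder.add_u64_field("num_likes", INT_INDEXED);
    let schema = schema_builder.build();

    let directory = RAMDirectory::create();
    assert!(!Index::exists(&directory));

    // Creates the index on first use, opens it afterwards, and fails
    // if an index with a different schema already lives in the directory.
    let _index = Index::open_or_create(directory.clone(), schema.clone())?;
    assert!(Index::exists(&directory));

    let _same_index = Index::open_or_create(directory, schema)?;
    Ok(())
}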
@@ -5,7 +5,6 @@ use query::Query;
 use schema::Document;
 use schema::Schema;
 use schema::{Field, Term};
-use space_usage::SearcherSpaceUsage;
 use std::fmt;
 use std::sync::Arc;
 use termdict::TermMerger;
@@ -100,15 +99,6 @@ impl Searcher {
             .collect::<Vec<_>>();
         FieldSearcher::new(inv_index_readers)
     }
-
-    /// Summarize total space usage of this searcher.
-    pub fn space_usage(&self) -> SearcherSpaceUsage {
-        let mut space_usage = SearcherSpaceUsage::new();
-        for segment_reader in self.segment_readers.iter() {
-            space_usage.add_segment(segment_reader.space_usage());
-        }
-        space_usage
-    }
 }
 
 pub struct FieldSearcher {
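On the 0.7.1 side, the space usage of a searcher is simply the sum over its segments. A hedged usage sketch of that API with an in-RAM index (left-hand side of this diff; the schema and text are illustrative):

#[macro_use]
extern crate tantivy;

use tantivy::schema::{SchemaBuilder, TEXT};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    let mut schema_builder = SchemaBuilder::default();
    let body = schema_builder.add_text_field("body", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer = index.writer_with_num_threads(1, 3_000_000)?;
    writer.add_document(doc!(body => "hello tantivy"));
    writer.commit()?;

    index.load_searchers()?;
    let searcher = index.searcher();

    // One SegmentSpaceUsage per segment; total() sums the large components.
    let usage = searcher.space_usage();
    for segment in usage.segments() {
        println!("segment: {} docs, {} bytes", segment.num_docs(), segment.total());
    }
    println!("total: {} bytes", usage.total());
    Ok(())
}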
@@ -16,7 +16,6 @@ use schema::Document;
 use schema::Field;
 use schema::FieldType;
 use schema::Schema;
-use space_usage::SegmentSpaceUsage;
 use std::collections::HashMap;
 use std::fmt;
 use std::sync::Arc;
@@ -382,21 +381,6 @@ impl SegmentReader {
     pub fn doc_ids_alive(&self) -> SegmentReaderAliveDocsIterator {
         SegmentReaderAliveDocsIterator::new(&self)
     }
-
-    /// Summarize total space usage of this segment.
-    pub fn space_usage(&self) -> SegmentSpaceUsage {
-        SegmentSpaceUsage::new(
-            self.num_docs(),
-            self.termdict_composite.space_usage(),
-            self.postings_composite.space_usage(),
-            self.positions_composite.space_usage(),
-            self.positions_idx_composite.space_usage(),
-            self.fast_fields_composite.space_usage(),
-            self.fieldnorms_composite.space_usage(),
-            self.store_reader.space_usage(),
-            self.delete_bitset_opt.as_ref().map(|x| x.space_usage()).unwrap_or(0),
-        )
-    }
 }
 
 impl fmt::Debug for SegmentReader {
@@ -364,11 +364,6 @@ mod tests {
 
     use super::*;
 
-    #[test]
-    fn test_open_non_existant_path() {
-        assert!(MmapDirectory::open(PathBuf::from("./nowhere")).is_err());
-    }
-
     #[test]
     fn test_open_empty() {
         // empty file is actually an edge case because those
@@ -12,6 +12,7 @@ mod managed_directory;
 mod ram_directory;
 mod read_only_source;
 mod shared_vec_slice;
+mod static_dictionnary;
 
 /// Errors specific to the directory module.
 pub mod error;
@@ -21,6 +22,7 @@ use std::io::{BufWriter, Seek, Write};
 pub use self::directory::{Directory, DirectoryClone};
 pub use self::ram_directory::RAMDirectory;
 pub use self::read_only_source::ReadOnlySource;
+pub use self::static_dictionnary::StaticDirectory;
 
 #[cfg(feature = "mmap")]
 pub use self::mmap_directory::MmapDirectory;
@@ -5,6 +5,9 @@ use fst::raw::MmapReadOnly;
 use stable_deref_trait::{CloneStableDeref, StableDeref};
 use std::ops::Deref;
 
+
+const EMPTY_SLICE: [u8; 0] = [];
+
 /// Read object that represents files in tantivy.
 ///
 /// These read objects are only in charge to deliver
@@ -17,6 +20,8 @@ pub enum ReadOnlySource {
     Mmap(MmapReadOnly),
     /// Wrapping a `Vec<u8>`
     Anonymous(SharedVecSlice),
+    /// Wrapping a static slice
+    Static(&'static [u8])
 }
 
 unsafe impl StableDeref for ReadOnlySource {}
@@ -33,7 +38,7 @@ impl Deref for ReadOnlySource {
 impl ReadOnlySource {
     /// Creates an empty ReadOnlySource
     pub fn empty() -> ReadOnlySource {
-        ReadOnlySource::Anonymous(SharedVecSlice::empty())
+        ReadOnlySource::Static(&EMPTY_SLICE)
     }
 
     /// Returns the data underlying the ReadOnlySource object.
@@ -42,6 +47,7 @@ impl ReadOnlySource {
             #[cfg(feature = "mmap")]
             ReadOnlySource::Mmap(ref mmap_read_only) => mmap_read_only.as_slice(),
            ReadOnlySource::Anonymous(ref shared_vec) => shared_vec.as_slice(),
+            ReadOnlySource::Static(data) => data,
         }
     }
 
@@ -79,6 +85,9 @@ impl ReadOnlySource {
             ReadOnlySource::Anonymous(ref shared_vec) => {
                 ReadOnlySource::Anonymous(shared_vec.slice(from_offset, to_offset))
             }
+            ReadOnlySource::Static(data) => {
+                ReadOnlySource::Static(&data[from_offset..to_offset])
+            }
         }
     }
 
@@ -118,3 +127,9 @@ impl From<Vec<u8>> for ReadOnlySource {
         ReadOnlySource::Anonymous(shared_data)
     }
 }
+
+impl From<&'static [u8]> for ReadOnlySource {
+    fn from(data: &'static [u8]) -> ReadOnlySource {
+        ReadOnlySource::Static(data)
+    }
+}
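The right-hand side of this hunk adds a `Static` variant plus a `From<&'static [u8]>` impl, so a ReadOnlySource can be backed by data baked into the binary. A hedged sketch of that conversion (it assumes a build from the right-hand side of the diff; the byte string is just one way to obtain a `&'static [u8]`):

use tantivy::directory::ReadOnlySource;

// Any &'static [u8] works; a byte-string literal keeps the sketch self-contained.
static DICTIONARY_BYTES: &[u8] = b"\x00\x01\x02\x03\x04\x05\x06\x07";

fn main() {
    // Uses the From<&'static [u8]> impl added in this diff.
    let source = ReadOnlySource::from(DICTIONARY_BYTES);

    // slice() re-borrows the same static data without copying;
    // Deref<Target = [u8]> lets us compare the bytes directly.
    let middle = source.slice(2, 6);
    assert_eq!(&middle[..], &DICTIONARY_BYTES[2..6]);
}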
@@ -20,9 +20,6 @@ pub enum TantivyError {
     /// File already exists, this is a problem when we try to write into a new file.
     #[fail(display = "file already exists: '{:?}'", _0)]
     FileAlreadyExists(PathBuf),
-    /// Index already exists in this directory
-    #[fail(display = "index already exists")]
-    IndexAlreadyExists,
     /// Failed to acquire file lock
     #[fail(
         display = "Failed to acquire Lockfile: {:?}. Possible causes: another IndexWriter instance or panic during previous lock drop.",
@@ -2,7 +2,6 @@ use bit_set::BitSet;
 use common::HasLen;
 use directory::ReadOnlySource;
 use directory::WritePtr;
-use space_usage::ByteCount;
 use std::io;
 use std::io::Write;
 use DocId;
@@ -64,11 +63,6 @@ impl DeleteBitSet {
             b & (1u8 << shift) != 0
         }
     }
-
-    /// Summarize total space usage of this bitset.
-    pub fn space_usage(&self) -> ByteCount {
-        self.data.len()
-    }
 }
 
 impl HasLen for DeleteBitSet {
@@ -136,7 +136,7 @@ extern crate crossbeam;
 extern crate crossbeam_channel;
 extern crate fnv;
 extern crate fst;
-extern crate fst_regex;
 extern crate futures;
 extern crate futures_cpupool;
 extern crate htmlescape;
@@ -213,7 +213,6 @@ pub(crate) mod positions;
 pub mod postings;
 pub mod query;
 pub mod schema;
-pub mod space_usage;
 pub mod store;
 pub mod termdict;
 
@@ -16,7 +16,10 @@ mod phrase_query;
 mod query;
 mod query_parser;
 mod range_query;
+
+#[cfg(feature="regex_query")]
 mod regex_query;
+
 mod reqopt_scorer;
 mod scorer;
 mod term_query;
@@ -47,7 +50,10 @@ pub use self::query::Query;
 pub use self::query_parser::QueryParser;
 pub use self::query_parser::QueryParserError;
 pub use self::range_query::RangeQuery;
+
+#[cfg(feature="regex_query")]
 pub use self::regex_query::RegexQuery;
+
 pub use self::reqopt_scorer::RequiredOptionalScorer;
 pub use self::scorer::ConstScorer;
 pub use self::scorer::Scorer;
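Because both the module and the re-export are now behind `#[cfg(feature = "regex_query")]`, downstream code that wants to stay buildable with or without regex support has to mirror the gate. A hedged sketch, assuming the downstream crate declares its own `regex_query` feature that forwards to tantivy's (`regex_query = ["tantivy/regex_query"]` in its Cargo.toml):

// Compiled only when this crate (and therefore tantivy) enables regex_query.
#[cfg(feature = "regex_query")]
pub fn regex_queries_available() -> bool {
    use tantivy::query::RegexQuery;
    // The type only exists in gated builds; referencing it is enough here.
    let _witness: Option<RegexQuery> = None;
    true
}

#[cfg(not(feature = "regex_query"))]
pub fn regex_queries_available() -> bool {
    false
}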
@@ -1,5 +1,7 @@
+extern crate fst_regex;
+
 use error::TantivyError;
-use fst_regex::Regex;
+use self::fst_regex::Regex;
 use query::{AutomatonWeight, Query, Weight};
 use schema::Field;
 use std::clone::Clone;
@@ -14,7 +14,7 @@ use std::fmt;
 /// - a field name
 /// - a field type, itself wrapping up options describing
 ///   how the field should be indexed.
-#[derive(Clone, Debug, Eq, PartialEq)]
+#[derive(Clone, Debug)]
 pub struct FieldEntry {
     name: String,
     field_type: FieldType,
@@ -134,15 +134,6 @@ struct InnerSchema {
     fields_map: HashMap<String, Field>, // transient
 }
 
-impl PartialEq for InnerSchema {
-    fn eq(&self, other: &InnerSchema) -> bool {
-        self.fields == other.fields
-    }
-}
-
-impl Eq for InnerSchema {}
-
-
 /// Tantivy has a very strict schema.
 /// You need to specify in advance, whether a field is indexed or not,
 /// stored or not, and RAM-based or not.
@@ -163,7 +154,7 @@ impl Eq for InnerSchema {}
 /// let schema = schema_builder.build();
 ///
 /// ```
-#[derive(Clone, Eq, PartialEq)]
+#[derive(Clone)]
 pub struct Schema(Arc<InnerSchema>);
 
 impl Schema {
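The Eq/PartialEq derives removed here are what allow 0.7.1's `Index::open_or_create` to compare the schema stored on disk with the one passed by the caller (`index.schema() == schema` in the index.rs hunk above). A small sketch of that check, assuming the left-hand side of the diff where `Schema: PartialEq`:

use tantivy::schema::{SchemaBuilder, TEXT};

fn main() {
    let build = || {
        let mut schema_builder = SchemaBuilder::default();
        schema_builder.add_text_field("body", TEXT);
        schema_builder.build()
    };

    // Two schemas built from the same field definitions compare equal,
    // which is exactly the check open_or_create relies on.
    let schema_a = build();
    let schema_b = build();
    assert!(schema_a == schema_b);

    // A schema with a different field set does not.
    let mut other_builder = SchemaBuilder::default();
    other_builder.add_text_field("title", TEXT);
    assert!(schema_a != other_builder.build());
}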
@@ -1,484 +0,0 @@
-/*!
-Representations for the space usage of various parts of a Tantivy index.
-
-This can be used programmatically, and will also be exposed in a human readable fashion in
-tantivy-cli.
-
-One important caveat for all of this functionality is that none of it currently takes storage-level
-details into consideration. For example, if your file system block size is 4096 bytes, we can
-under-count actual resultant space usage by up to 4095 bytes per file.
-*/
-
-use schema::Field;
-use std::collections::HashMap;
-use SegmentComponent;
-
-/// Indicates space usage in bytes
-pub type ByteCount = usize;
-
-/// Enum containing any of the possible space usage results for segment components.
-pub enum ComponentSpaceUsage {
-    /// Data is stored per field in a uniform way
-    PerField(PerFieldSpaceUsage),
-    /// Data is stored in separate pieces in the store
-    Store(StoreSpaceUsage),
-    /// Some sort of raw byte count
-    Basic(ByteCount),
-}
-
-/// Represents combined space usage of an entire searcher and its component segments.
-#[derive(Clone, Debug, Serialize, Deserialize)]
-pub struct SearcherSpaceUsage {
-    segments: Vec<SegmentSpaceUsage>,
-    total: ByteCount,
-}
-
-impl SearcherSpaceUsage {
-    pub(crate) fn new() -> SearcherSpaceUsage {
-        SearcherSpaceUsage {
-            segments: Vec::new(),
-            total: 0,
-        }
-    }
-
-    /// Add a segment, to `self`.
-    /// Performs no deduplication or other intelligence.
-    pub(crate) fn add_segment(&mut self, segment: SegmentSpaceUsage) {
-        self.total += segment.total();
-        self.segments.push(segment);
-    }
-
-    /// Per segment space usage
-    pub fn segments(&self) -> &[SegmentSpaceUsage] {
-        &self.segments[..]
-    }
-
-    /// Returns total byte usage of this searcher, including all large subcomponents.
-    /// Does not account for smaller things like `meta.json`.
-    pub fn total(&self) -> ByteCount {
-        self.total
-    }
-}
-
-/// Represents combined space usage for all of the large components comprising a segment.
-#[derive(Clone, Debug, Serialize, Deserialize)]
-pub struct SegmentSpaceUsage {
-    num_docs: u32,
-
-    termdict: PerFieldSpaceUsage,
-    postings: PerFieldSpaceUsage,
-    positions: PerFieldSpaceUsage,
-    positions_idx: PerFieldSpaceUsage,
-    fast_fields: PerFieldSpaceUsage,
-    fieldnorms: PerFieldSpaceUsage,
-
-    store: StoreSpaceUsage,
-
-    deletes: ByteCount,
-
-    total: ByteCount,
-}
-
-impl SegmentSpaceUsage {
-    pub(crate) fn new(
-        num_docs: u32,
-        termdict: PerFieldSpaceUsage,
-        postings: PerFieldSpaceUsage,
-        positions: PerFieldSpaceUsage,
-        positions_idx: PerFieldSpaceUsage,
-        fast_fields: PerFieldSpaceUsage,
-        fieldnorms: PerFieldSpaceUsage,
-        store: StoreSpaceUsage,
-        deletes: ByteCount,
-    ) -> SegmentSpaceUsage {
-        let total = termdict.total()
-            + postings.total()
-            + positions.total()
-            + fast_fields.total()
-            + fieldnorms.total()
-            + store.total()
-            + deletes;
-        SegmentSpaceUsage {
-            num_docs,
-            termdict,
-            postings,
-            positions,
-            positions_idx,
-            fast_fields,
-            fieldnorms,
-            store,
-            deletes,
-            total,
-        }
-    }
-
-    /// Space usage for the given component
-    ///
-    /// Clones the underlying data.
-    /// Use the components directly if this is somehow in performance critical code.
-    pub fn component(&self, component: SegmentComponent) -> ComponentSpaceUsage {
-        use SegmentComponent::*;
-        use self::ComponentSpaceUsage::*;
-        match component {
-            POSTINGS => PerField(self.postings().clone()),
-            POSITIONS => PerField(self.positions().clone()),
-            POSITIONSSKIP => PerField(self.positions_skip_idx().clone()),
-            FASTFIELDS => PerField(self.fast_fields().clone()),
-            FIELDNORMS => PerField(self.fieldnorms().clone()),
-            TERMS => PerField(self.termdict().clone()),
-            STORE => Store(self.store().clone()),
-            DELETE => Basic(self.deletes()),
-        }
-    }
-
-    /// Num docs in segment
-    pub fn num_docs(&self) -> u32 {
-        self.num_docs
-    }
-
-    /// Space usage for term dictionary
-    pub fn termdict(&self) -> &PerFieldSpaceUsage {
-        &self.termdict
-    }
-
-    /// Space usage for postings list
-    pub fn postings(&self) -> &PerFieldSpaceUsage {
-        &self.postings
-    }
-
-    /// Space usage for positions
-    pub fn positions(&self) -> &PerFieldSpaceUsage {
-        &self.positions
-    }
-
-    /// Space usage for positions skip idx
-    pub fn positions_skip_idx(&self) -> &PerFieldSpaceUsage {
-        &self.positions_idx
-    }
-
-    /// Space usage for fast fields
-    pub fn fast_fields(&self) -> &PerFieldSpaceUsage {
-        &self.fast_fields
-    }
-
-    /// Space usage for field norms
-    pub fn fieldnorms(&self) -> &PerFieldSpaceUsage {
-        &self.fieldnorms
-    }
-
-    /// Space usage for stored documents
-    pub fn store(&self) -> &StoreSpaceUsage {
-        &self.store
-    }
-
-    /// Space usage for document deletions
-    pub fn deletes(&self) -> ByteCount {
-        self.deletes
-    }
-
-    /// Total space usage in bytes for this segment.
-    pub fn total(&self) -> ByteCount {
-        self.total
-    }
-}
-
-/// Represents space usage for the Store for this segment.
-///
-/// This is composed of two parts.
-/// `data` represents the compressed data itself.
-/// `offsets` represents a lookup to find the start of a block
-#[derive(Clone, Debug, Serialize, Deserialize)]
-pub struct StoreSpaceUsage {
-    data: ByteCount,
-    offsets: ByteCount,
-}
-
-impl StoreSpaceUsage {
-    pub(crate) fn new(data: ByteCount, offsets: ByteCount) -> StoreSpaceUsage {
-        StoreSpaceUsage { data, offsets }
-    }
-
-    /// Space usage for the data part of the store
-    pub fn data_usage(&self) -> ByteCount {
-        self.data
-    }
-
-    /// Space usage for the offsets part of the store (doc ID -> offset)
-    pub fn offsets_usage(&self) -> ByteCount {
-        self.offsets
-    }
-
-    /// Total space usage in bytes for this Store
-    pub fn total(&self) -> ByteCount {
-        self.data + self.offsets
-    }
-}
-
-/// Represents space usage for all of the (field, index) pairs that appear in a CompositeFile.
-///
-/// A field can appear with a single index (typically 0) or with multiple indexes.
-/// Multiple indexes are used to handle variable length things, where
-#[derive(Clone, Debug, Serialize, Deserialize)]
-pub struct PerFieldSpaceUsage {
-    fields: HashMap<Field, FieldUsage>,
-    total: ByteCount
-}
-
-impl PerFieldSpaceUsage {
-    pub(crate) fn new(fields: HashMap<Field, FieldUsage>) -> PerFieldSpaceUsage {
-        let total = fields.values().map(|x| x.total()).sum();
-        PerFieldSpaceUsage { fields, total }
-    }
-
-    /// Per field space usage
-    pub fn fields(&self) -> impl Iterator<Item = (&Field, &FieldUsage)> {
-        self.fields.iter()
-    }
-
-    /// Bytes used by the represented file
-    pub fn total(&self) -> ByteCount {
-        self.total
-    }
-}
-
-/// Represents space usage of a given field, breaking it down into the (field, index) pairs that
-/// comprise it.
-///
-/// See documentation for PerFieldSpaceUsage for slightly more information.
-#[derive(Clone, Debug, Serialize, Deserialize)]
-pub struct FieldUsage {
-    field: Field,
-    num_bytes: ByteCount,
-    /// A field can be composed of more than one piece.
-    /// These pieces are indexed by arbitrary numbers starting at zero.
-    /// `self.num_bytes` includes all of `self.sub_num_bytes`.
-    sub_num_bytes: Vec<Option<ByteCount>>,
-}
-
-impl FieldUsage {
-    pub(crate) fn empty(field: Field) -> FieldUsage {
-        FieldUsage {
-            field,
-            num_bytes: 0,
-            sub_num_bytes: Vec::new(),
-        }
-    }
-
-    pub(crate) fn add_field_idx(&mut self, idx: usize, size: ByteCount) {
-        if self.sub_num_bytes.len() < idx + 1{
-            self.sub_num_bytes.resize(idx + 1, None);
-        }
-        assert!(self.sub_num_bytes[idx].is_none());
-        self.sub_num_bytes[idx] = Some(size);
-        self.num_bytes += size
-    }
-
-    /// Field
-    pub fn field(&self) -> Field {
-        self.field
-    }
-
-    /// Space usage for each index
-    pub fn sub_num_bytes(&self) -> &[Option<ByteCount>] {
-        &self.sub_num_bytes[..]
-    }
-
-    /// Total bytes used for this field in this context
-    pub fn total(&self) -> ByteCount {
-        self.num_bytes
-    }
-}
-
-#[cfg(test)]
-mod test {
-    use core::Index;
-    use schema::SchemaBuilder;
-    use schema::{FAST, INT_INDEXED, TEXT};
-    use schema::Field;
-    use space_usage::ByteCount;
-    use space_usage::PerFieldSpaceUsage;
-    use schema::STORED;
-    use Term;
-
-    #[test]
-    fn test_empty() {
-        let schema = SchemaBuilder::new().build();
-        let index = Index::create_in_ram(schema.clone());
-
-        index.load_searchers().unwrap();
-        let searcher = index.searcher();
-        let searcher_space_usage = searcher.space_usage();
-        assert_eq!(0, searcher_space_usage.total());
-    }
-
-    fn expect_single_field(field_space: &PerFieldSpaceUsage, field: &Field, min_size: ByteCount, max_size: ByteCount) {
-        assert!(field_space.total() >= min_size);
-        assert!(field_space.total() <= max_size);
-        assert_eq!(
-            vec![(field, field_space.total())],
-            field_space.fields().map(|(x,y)| (x, y.total())).collect::<Vec<_>>()
-        );
-    }
-
-    #[test]
-    fn test_fast_indexed() {
-        let mut schema_builder = SchemaBuilder::new();
-        let name = schema_builder.add_u64_field("name", FAST | INT_INDEXED);
-        let schema = schema_builder.build();
-        let index = Index::create_in_ram(schema.clone());
-
-        {
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
-            index_writer.add_document(doc!(name => 1u64));
-            index_writer.add_document(doc!(name => 2u64));
-            index_writer.add_document(doc!(name => 10u64));
-            index_writer.add_document(doc!(name => 20u64));
-            index_writer.commit().unwrap();
-        }
-
-        index.load_searchers().unwrap();
-        let searcher = index.searcher();
-        let searcher_space_usage = searcher.space_usage();
-        assert!(searcher_space_usage.total() > 0);
-        assert_eq!(1, searcher_space_usage.segments().len());
-
-        let segment = &searcher_space_usage.segments()[0];
-        assert!(segment.total() > 0);
-
-        assert_eq!(4, segment.num_docs());
-
-        expect_single_field(segment.termdict(), &name, 1, 512);
-        expect_single_field(segment.postings(), &name, 1, 512);
-        assert_eq!(0, segment.positions().total());
-        assert_eq!(0, segment.positions_skip_idx().total());
-        expect_single_field(segment.fast_fields(), &name, 1, 512);
-        expect_single_field(segment.fieldnorms(), &name, 1, 512);
-        // TODO: understand why the following fails
-        // assert_eq!(0, segment.store().total());
-        assert_eq!(0, segment.deletes());
-    }
-
-    #[test]
-    fn test_text() {
-        let mut schema_builder = SchemaBuilder::new();
-        let name = schema_builder.add_text_field("name", TEXT);
-        let schema = schema_builder.build();
-        let index = Index::create_in_ram(schema.clone());
-
-        {
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
-            index_writer.add_document(doc!(name => "hi"));
-            index_writer.add_document(doc!(name => "this is a test"));
-            index_writer.add_document(doc!(name => "some more documents with some word overlap with the other test"));
-            index_writer.add_document(doc!(name => "hello hi goodbye"));
-            index_writer.commit().unwrap();
-        }
-
-        index.load_searchers().unwrap();
-        let searcher = index.searcher();
-        let searcher_space_usage = searcher.space_usage();
-        assert!(searcher_space_usage.total() > 0);
-        assert_eq!(1, searcher_space_usage.segments().len());
-
-        let segment = &searcher_space_usage.segments()[0];
-        assert!(segment.total() > 0);
-
-        assert_eq!(4, segment.num_docs());
-
-        expect_single_field(segment.termdict(), &name, 1, 512);
-        expect_single_field(segment.postings(), &name, 1, 512);
-        expect_single_field(segment.positions(), &name, 1, 512);
-        expect_single_field(segment.positions_skip_idx(), &name, 1, 512);
-        assert_eq!(0, segment.fast_fields().total());
-        expect_single_field(segment.fieldnorms(), &name, 1, 512);
-        // TODO: understand why the following fails
-        // assert_eq!(0, segment.store().total());
-        assert_eq!(0, segment.deletes());
-    }
-
-    #[test]
-    fn test_store() {
-        let mut schema_builder = SchemaBuilder::new();
-        let name = schema_builder.add_text_field("name", STORED);
-        let schema = schema_builder.build();
-        let index = Index::create_in_ram(schema.clone());
-
-        {
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
-            index_writer.add_document(doc!(name => "hi"));
-            index_writer.add_document(doc!(name => "this is a test"));
-            index_writer.add_document(doc!(name => "some more documents with some word overlap with the other test"));
-            index_writer.add_document(doc!(name => "hello hi goodbye"));
-            index_writer.commit().unwrap();
-        }
-
-        index.load_searchers().unwrap();
-        let searcher = index.searcher();
-        let searcher_space_usage = searcher.space_usage();
-        assert!(searcher_space_usage.total() > 0);
-        assert_eq!(1, searcher_space_usage.segments().len());
-
-        let segment = &searcher_space_usage.segments()[0];
-        assert!(segment.total() > 0);
-
-        assert_eq!(4, segment.num_docs());
-
-        assert_eq!(0, segment.termdict().total());
-        assert_eq!(0, segment.postings().total());
-        assert_eq!(0, segment.positions().total());
-        assert_eq!(0, segment.positions_skip_idx().total());
-        assert_eq!(0, segment.fast_fields().total());
-        assert_eq!(0, segment.fieldnorms().total());
-        assert!(segment.store().total() > 0);
-        assert!(segment.store().total() < 512);
-        assert_eq!(0, segment.deletes());
-    }
-
-    #[test]
-    fn test_deletes() {
-        let mut schema_builder = SchemaBuilder::new();
-        let name = schema_builder.add_u64_field("name", INT_INDEXED);
-        let schema = schema_builder.build();
-        let index = Index::create_in_ram(schema.clone());
-
-        {
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
-            index_writer.add_document(doc!(name => 1u64));
-            index_writer.add_document(doc!(name => 2u64));
-            index_writer.add_document(doc!(name => 3u64));
-            index_writer.add_document(doc!(name => 4u64));
-            index_writer.commit().unwrap();
-        }
-
-        {
-            let mut index_writer2 = index.writer(50_000_000).unwrap();
-            index_writer2.delete_term(Term::from_field_u64(name, 2u64));
-            index_writer2.delete_term(Term::from_field_u64(name, 3u64));
-
-            // ok, now we should have a deleted doc
-            index_writer2.commit().unwrap();
-        }
-
-        index.load_searchers().unwrap();
-
-        let searcher = index.searcher();
-        let searcher_space_usage = searcher.space_usage();
-        assert!(searcher_space_usage.total() > 0);
-        assert_eq!(1, searcher_space_usage.segments().len());
-
-        let segment = &searcher_space_usage.segments()[0];
-        assert!(segment.total() > 0);
-
-        assert_eq!(2, segment.num_docs());
-
-        expect_single_field(segment.termdict(), &name, 1, 512);
-        expect_single_field(segment.postings(), &name, 1, 512);
-        assert_eq!(0, segment.positions().total());
-        assert_eq!(0, segment.positions_skip_idx().total());
-        assert_eq!(0, segment.fast_fields().total());
-        expect_single_field(segment.fieldnorms(), &name, 1, 512);
-        // TODO: understand why the following fails
-        // assert_eq!(0, segment.store().total());
-        assert!(segment.deletes() > 0);
-    }
-}
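Given the module removed above, a per-field breakdown can be printed from any SegmentSpaceUsage using only the accessors it defines. A hedged helper sketch (0.7.1-side API; it relies on Field implementing Debug, which the module's own tests also assume):

use tantivy::space_usage::{PerFieldSpaceUsage, SegmentSpaceUsage};

// Print every (field, bytes) pair of one per-field component.
fn print_per_field(label: &str, usage: &PerFieldSpaceUsage) {
    for (field, field_usage) in usage.fields() {
        println!("{}: field {:?} uses {} bytes", label, field, field_usage.total());
    }
}

fn print_segment(segment: &SegmentSpaceUsage) {
    println!("{} docs, {} bytes total", segment.num_docs(), segment.total());
    print_per_field("termdict", segment.termdict());
    print_per_field("postings", segment.postings());
    print_per_field("fast fields", segment.fast_fields());
    println!("store: {} bytes, deletes: {} bytes", segment.store().total(), segment.deletes());
}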
@@ -6,7 +6,6 @@ use common::BinarySerializable;
 use common::VInt;
 use directory::ReadOnlySource;
 use schema::Document;
-use space_usage::StoreSpaceUsage;
 use std::cell::RefCell;
 use std::io;
 use std::mem::size_of;
@@ -88,11 +87,6 @@ impl StoreReader {
         cursor = &cursor[..doc_length];
         Ok(Document::deserialize(&mut cursor)?)
     }
-
-    /// Summarize total space usage of this store reader.
-    pub fn space_usage(&self) -> StoreSpaceUsage {
-        StoreSpaceUsage::new(self.data.len(), self.offset_index_source.len())
-    }
 }
 
 #[cfg_attr(
@@ -96,6 +96,9 @@ fn open_fst_index(source: ReadOnlySource) -> fst::Map {
         ReadOnlySource::Mmap(mmap_readonly) => {
             Fst::from_mmap(mmap_readonly).expect("FST data is corrupted")
         }
+        ReadOnlySource::Static(data) => {
+            Fst::from_static_slice(data).expect("FST data is corrupted")
+        }
     };
     fst::Map::from(fst)
 }
@@ -157,34 +157,35 @@ pub use self::tokenizer::BoxedTokenizer;
 pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
 pub use self::tokenizer_manager::TokenizerManager;
 
+/// This is a function that can be used in tests and doc tests
+/// to assert a token's correctness.
+/// TODO: can this be wrapped in #[cfg(test)] so as not to be in the
+/// public api?
+pub fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
+    assert_eq!(
+        token.position, position,
+        "expected position {} but {:?}",
+        position, token
+    );
+    assert_eq!(token.text, text, "expected text {} but {:?}", text, token);
+    assert_eq!(
+        token.offset_from, from,
+        "expected offset_from {} but {:?}",
+        from, token
+    );
+    assert_eq!(
+        token.offset_to, to,
+        "expected offset_to {} but {:?}",
+        to, token
+    );
+}
 
 #[cfg(test)]
-pub mod tests {
+pub mod test {
+    use super::assert_token;
     use super::Token;
     use super::TokenizerManager;
 
-
-    /// This is a function that can be used in tests and doc tests
-    /// to assert a token's correctness.
-    pub fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
-        assert_eq!(
-            token.position, position,
-            "expected position {} but {:?}",
-            position, token
-        );
-        assert_eq!(token.text, text, "expected text {} but {:?}", text, token);
-        assert_eq!(
-            token.offset_from, from,
-            "expected offset_from {} but {:?}",
-            from, token
-        );
-        assert_eq!(
-            token.offset_to, to,
-            "expected offset_to {} but {:?}",
-            to, token
-        );
-    }
-
     #[test]
     fn test_raw_tokenizer() {
         let tokenizer_manager = TokenizerManager::default();
@@ -223,6 +224,72 @@ pub mod tests {
         assert_token(&tokens[3], 3, "payer", 17, 22);
     }
 
+    #[test]
+    fn test_ngram_tokenizer() {
+        use super::{LowerCaser, NgramTokenizer};
+        use tokenizer::tokenizer::TokenStream;
+        use tokenizer::tokenizer::Tokenizer;
+
+        let tokenizer_manager = TokenizerManager::default();
+        tokenizer_manager.register("ngram12", NgramTokenizer::new(1, 2, false));
+        tokenizer_manager.register(
+            "ngram3",
+            NgramTokenizer::new(3, 3, false).filter(LowerCaser),
+        );
+        tokenizer_manager.register(
+            "edgegram5",
+            NgramTokenizer::new(2, 5, true).filter(LowerCaser),
+        );
+
+        let tokenizer = NgramTokenizer::new(1, 2, false);
+        let mut tokens: Vec<Token> = vec![];
+        {
+            let mut add_token = |token: &Token| {
+                tokens.push(token.clone());
+            };
+            tokenizer.token_stream("hello").process(&mut add_token);
+        }
+        assert_eq!(tokens.len(), 9);
+        assert_token(&tokens[0], 0, "h", 0, 1);
+        assert_token(&tokens[1], 0, "he", 0, 2);
+        assert_token(&tokens[2], 1, "e", 1, 2);
+        assert_token(&tokens[3], 1, "el", 1, 3);
+        assert_token(&tokens[4], 2, "l", 2, 3);
+        assert_token(&tokens[5], 2, "ll", 2, 4);
+        assert_token(&tokens[6], 3, "l", 3, 4);
+        assert_token(&tokens[7], 3, "lo", 3, 5);
+        assert_token(&tokens[8], 4, "o", 4, 5);
+
+        let tokenizer = tokenizer_manager.get("ngram3").unwrap();
+        let mut tokens: Vec<Token> = vec![];
+        {
+            let mut add_token = |token: &Token| {
+                tokens.push(token.clone());
+            };
+            tokenizer.token_stream("Hello").process(&mut add_token);
+        }
+        assert_eq!(tokens.len(), 3);
+        assert_token(&tokens[0], 0, "hel", 0, 3);
+        assert_token(&tokens[1], 1, "ell", 1, 4);
+        assert_token(&tokens[2], 2, "llo", 2, 5);
+
+        let tokenizer = tokenizer_manager.get("edgegram5").unwrap();
+        let mut tokens: Vec<Token> = vec![];
+        {
+            let mut add_token = |token: &Token| {
+                tokens.push(token.clone());
+            };
+            tokenizer
+                .token_stream("Frankenstein")
+                .process(&mut add_token);
+        }
+        assert_eq!(tokens.len(), 4);
+        assert_token(&tokens[0], 0, "fr", 0, 2);
+        assert_token(&tokens[1], 0, "fra", 0, 3);
+        assert_token(&tokens[2], 0, "fran", 0, 4);
+        assert_token(&tokens[3], 0, "frank", 0, 5);
+    }
+
     #[test]
     fn test_tokenizer_empty() {
         let tokenizer_manager = TokenizerManager::default();
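Moving assert_token out of the test module and into the crate's public tokenizer API (right-hand side) is what lets the NgramTokenizer doc test in the next file call it. A short example of the helper, mirroring the assertions in that doc test (it assumes the right-hand side of the diff, where assert_token is exported and ngram positions track the starting character):

extern crate tantivy;
use tantivy::tokenizer::*;
use tantivy::tokenizer::assert_token;

fn main() {
    let tokenizer = NgramTokenizer::new(2, 3, false);
    let mut stream = tokenizer.token_stream("hello");
    // assert_token(token, position, text, offset_from, offset_to)
    assert_token(stream.next().unwrap(), 0, "he", 0, 2);
    assert_token(stream.next().unwrap(), 0, "hel", 0, 3);
    assert_token(stream.next().unwrap(), 1, "el", 1, 3);
}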
@@ -2,15 +2,14 @@ use super::{Token, TokenStream, Tokenizer};
|
|||||||
|
|
||||||
/// Tokenize the text by splitting words into n-grams of the given size(s)
|
/// Tokenize the text by splitting words into n-grams of the given size(s)
|
||||||
///
|
///
|
||||||
/// With this tokenizer, the `position` is always 0.
|
/// With this tokenizer, the `position` field expresses the starting offset of the ngram
|
||||||
/// Beware however, in presence of multiple value for the same field,
|
/// rather than the `token` offset.
|
||||||
/// the position will be `POSITION_GAP * index of value`.
|
|
||||||
///
|
///
|
||||||
/// Example 1: `hello` would be tokenized as (min_gram: 2, max_gram: 3, prefix_only: false)
|
/// Example 1: `hello` would be tokenized as (min_gram: 2, max_gram: 3, prefix_only: false)
|
||||||
///
|
///
|
||||||
/// | Term | he | hel | el | ell | ll | llo | lo |
|
/// | Term | he | hel | el | ell | ll | llo | lo |
|
||||||
/// |----------|-----|-----|-----|-----|-----|-----|----|
|
/// |----------|-----|-----|-----|-----|-----|-----|----|
|
||||||
/// | Position | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
|
/// | Position | 0 | 0 | 1 | 1 | 2 | 2 | 3 |
|
||||||
/// | Offsets | 0,2 | 0,3 | 1,3 | 1,4 | 2,4 | 2,5 | 3,5|
|
/// | Offsets | 0,2 | 0,3 | 1,3 | 1,4 | 2,4 | 2,5 | 3,5|
|
||||||
///
|
///
|
||||||
/// Example 2: `hello` would be tokenized as (min_gram: 2, max_gram: 5, prefix_only: **true**)
|
/// Example 2: `hello` would be tokenized as (min_gram: 2, max_gram: 5, prefix_only: **true**)
|
||||||
@@ -20,63 +19,24 @@ use super::{Token, TokenStream, Tokenizer};
|
|||||||
/// | Position | 0 | 0 | 0 | 0 |
|
/// | Position | 0 | 0 | 0 | 0 |
|
||||||
/// | Offsets | 0,2 | 0,3 | 0,4 | 0,5 |
|
/// | Offsets | 0,2 | 0,3 | 0,4 | 0,5 |
|
||||||
///
|
///
|
||||||
/// Example 3: `hεllo` (non-ascii) would be tokenized as (min_gram: 2, max_gram: 5, prefix_only: **true**)
|
|
||||||
///
|
|
||||||
/// | Term | hε | hεl | hεll | hεllo |
|
|
||||||
/// |----------|-----|-----|-------|-------|
|
|
||||||
/// | Position | 0 | 0 | 0 | 0 |
|
|
||||||
/// | Offsets | 0,3 | 0,4 | 0,5 | 0,6 |
|
|
||||||
///
|
|
||||||
/// # Example
|
/// # Example
|
||||||
///
|
///
|
||||||
/// ```
|
/// ```
|
||||||
/// # extern crate tantivy;
|
/// extern crate tantivy;
|
||||||
/// use tantivy::tokenizer::*;
|
/// use tantivy::tokenizer::*;
|
||||||
|
/// use tantivy::tokenizer::assert_token;
|
||||||
|
///
|
||||||
/// # fn main() {
|
/// # fn main() {
|
||||||
/// let tokenizer = NgramTokenizer::new(2, 3, false);
|
/// let tokenizer = NgramTokenizer::new(2, 3, false);
|
||||||
/// let mut stream = tokenizer.token_stream("hello");
|
/// let mut stream = tokenizer.token_stream("hello");
|
||||||
/// {
|
///
|
||||||
/// let token = stream.next().unwrap();
|
/// assert_token(stream.next().unwrap(), 0, "he", 0, 2);
|
||||||
/// assert_eq!(token.text, "he");
|
/// assert_token(stream.next().unwrap(), 0, "hel", 0, 3);
|
||||||
/// assert_eq!(token.offset_from, 0);
|
/// assert_token(stream.next().unwrap(), 1, "el", 1, 3);
|
||||||
/// assert_eq!(token.offset_to, 2);
|
/// assert_token(stream.next().unwrap(), 1, "ell", 1, 4);
|
||||||
/// }
|
/// assert_token(stream.next().unwrap(), 2, "ll", 2, 4);
|
||||||
/// {
|
/// assert_token(stream.next().unwrap(), 2, "llo", 2, 5);
|
||||||
/// let token = stream.next().unwrap();
|
/// assert_token(stream.next().unwrap(), 3, "lo", 3, 5);
|
||||||
/// assert_eq!(token.text, "hel");
|
|
||||||
/// assert_eq!(token.offset_from, 0);
|
|
||||||
/// assert_eq!(token.offset_to, 3);
|
|
||||||
/// }
|
|
||||||
/// {
|
|
||||||
/// let token = stream.next().unwrap();
|
|
||||||
/// assert_eq!(token.text, "el");
|
|
||||||
/// assert_eq!(token.offset_from, 1);
|
|
||||||
/// assert_eq!(token.offset_to, 3);
|
|
||||||
/// }
|
|
||||||
/// {
|
|
||||||
/// let token = stream.next().unwrap();
|
|
||||||
/// assert_eq!(token.text, "ell");
|
|
||||||
/// assert_eq!(token.offset_from, 1);
|
|
||||||
/// assert_eq!(token.offset_to, 4);
|
|
||||||
/// }
|
|
||||||
/// {
|
|
||||||
/// let token = stream.next().unwrap();
|
|
||||||
/// assert_eq!(token.text, "ll");
|
|
||||||
/// assert_eq!(token.offset_from, 2);
|
|
||||||
/// assert_eq!(token.offset_to, 4);
|
|
||||||
/// }
|
|
||||||
/// {
|
|
||||||
/// let token = stream.next().unwrap();
|
|
||||||
/// assert_eq!(token.text, "llo");
|
|
||||||
/// assert_eq!(token.offset_from, 2);
|
|
||||||
/// assert_eq!(token.offset_to, 5);
|
|
||||||
/// }
|
|
||||||
/// {
|
|
||||||
/// let token = stream.next().unwrap();
|
|
||||||
/// assert_eq!(token.text, "lo");
|
|
||||||
/// assert_eq!(token.offset_from, 3);
|
|
||||||
/// assert_eq!(token.offset_to, 5);
|
|
||||||
/// }
|
|
||||||
/// assert!(stream.next().is_none());
|
/// assert!(stream.next().is_none());
|
||||||
/// # }
|
/// # }
|
||||||
/// ```
|
/// ```
|
||||||
@@ -98,37 +58,23 @@ impl NgramTokenizer {
|
|||||||
min_gram <= max_gram,
|
min_gram <= max_gram,
|
||||||
"min_gram must not be greater than max_gram"
|
"min_gram must not be greater than max_gram"
|
||||||
);
|
);
|
||||||
|
|
||||||
NgramTokenizer {
|
NgramTokenizer {
|
||||||
min_gram,
|
min_gram,
|
||||||
max_gram,
|
max_gram,
|
||||||
prefix_only,
|
prefix_only,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Create a `NGramTokenizer` which generates tokens for all inner ngrams.
|
|
||||||
///
|
|
||||||
/// This is as opposed to only prefix ngrams .
|
|
||||||
pub fn all_ngrams(min_gram: usize, max_gram:usize) -> NgramTokenizer {
|
|
||||||
Self::new(min_gram, max_gram, false)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Create a `NGramTokenizer` which only generates tokens for the
|
|
||||||
/// prefix ngrams.
|
|
||||||
pub fn prefix_only(min_gram: usize, max_gram: usize) -> NgramTokenizer {
|
|
||||||
Self::new(min_gram, max_gram, true)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// TokenStream associate to the `NgramTokenizer`
|
|
||||||
pub struct NgramTokenStream<'a> {
|
pub struct NgramTokenStream<'a> {
|
||||||
/// parameters
|
|
||||||
ngram_charidx_iterator: StutteringIterator<CodepointFrontiers<'a>>,
|
|
||||||
/// true if the NgramTokenStream is in prefix mode.
|
|
||||||
prefix_only: bool,
|
|
||||||
/// input
|
|
||||||
text: &'a str,
|
text: &'a str,
|
||||||
/// output
|
position: usize,
|
||||||
|
text_length: usize,
|
||||||
token: Token,
|
token: Token,
|
||||||
|
min_gram: usize,
|
||||||
|
max_gram: usize,
|
||||||
|
gram_size: usize,
|
||||||
|
prefix_only: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Tokenizer<'a> for NgramTokenizer {
|
impl<'a> Tokenizer<'a> for NgramTokenizer {
|
||||||
@@ -136,28 +82,65 @@ impl<'a> Tokenizer<'a> for NgramTokenizer {

     fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl {
         NgramTokenStream {
-            ngram_charidx_iterator: StutteringIterator::new(
-                CodepointFrontiers::for_str(text),
-                self.min_gram,
-                self.max_gram),
-            prefix_only: self.prefix_only,
             text,
+            position: 0,
+            text_length: text.len(),
             token: Token::default(),
+            min_gram: self.min_gram,
+            max_gram: self.max_gram,
+            prefix_only: self.prefix_only,
+            gram_size: self.min_gram,
         }
     }
 }

+impl<'a> NgramTokenStream<'a> {
+    /// Get the next set of token options
+    /// cycle through 1,2 (min..=max)
+    /// returning None if processing should stop
+    fn chomp(&mut self) -> Option<(usize, usize)> {
+        // Have we exceeded the bounds of the text we are indexing?
+        if self.gram_size > self.max_gram {
+            if self.prefix_only {
+                return None;
+            }
+
+            // since we aren't just processing edges
+            // we need to reset the gram size
+            self.gram_size = self.min_gram;
+
+            // and move down the chain of letters
+            self.position += 1;
+        }
+
+        let result = if (self.position + self.gram_size) <= self.text_length {
+            Some((self.position, self.gram_size))
+        } else {
+            None
+        };
+
+        // increase the gram size for the next pass
+        self.gram_size += 1;
+
+        result
+    }
+}
+
 impl<'a> TokenStream for NgramTokenStream<'a> {
     fn advance(&mut self) -> bool {
-        if let Some((offset_from, offset_to)) = self.ngram_charidx_iterator.next() {
-            if self.prefix_only && offset_from > 0 {
-                return false;
-            }
-            self.token.position = 0;
+        // clear out working token text
+        self.token.text.clear();
+
+        if let Some((position, size)) = self.chomp() {
+            self.token.position = position;
+            let offset_from = position;
+            let offset_to = offset_from + size;
+
             self.token.offset_from = offset_from;
             self.token.offset_to = offset_to;
-            self.token.text.clear();
+
             self.token.text.push_str(&self.text[offset_from..offset_to]);
+
             true
         } else {
             false
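The `chomp` helper introduced in the hunk above enumerates `(position, gram_size)` pairs in a fixed order: every gram size at the current position, then the next position, stopping at the first pair that runs past `text_length`. The standalone sketch below is my paraphrase of that control flow for the non-prefix case (the names are not from the branch) and makes the order concrete:

```rust
// Mirrors the (position, size) order of `chomp` for min_gram = 1, max_gram = 2
// over a 5-byte text such as "hello": (0,1) (0,2) (1,1) (1,2) ... (4,1).
fn main() {
    let (min_gram, max_gram, text_length) = (1usize, 2usize, 5usize);
    let mut pairs = Vec::new();
    let mut position = 0;
    'stream: loop {
        for gram_size in min_gram..=max_gram {
            if position + gram_size > text_length {
                // chomp() returns None here, which ends the token stream.
                break 'stream;
            }
            pairs.push((position, gram_size));
        }
        position += 1;
    }
    println!("{:?}", pairs); // 9 pairs, matching the 9 tokens asserted for "hello"
}
```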
@@ -167,307 +150,8 @@ impl<'a> TokenStream for NgramTokenStream<'a> {
     fn token(&self) -> &Token {
         &self.token
     }

     fn token_mut(&mut self) -> &mut Token {
         &mut self.token
     }
 }
-
-/// This iterator takes an underlying Iterator
-/// and emits all of the pairs `(a,b)` such that
-/// a and b are items emitted by the iterator at
-/// an interval between `min_gram` and `max_gram`.
-///
-/// The elements are emitted in the order of appearance
-/// of `a` first, `b` then.
-///
-/// See `test_stutterring_iterator` for an example of its
-/// output.
-struct StutteringIterator<T> {
-    underlying: T,
-    min_gram: usize,
-    max_gram: usize,
-
-    memory: Vec<usize>,
-    cursor: usize,
-    gram_len: usize
-}
-
-impl<T> StutteringIterator<T>
-    where T: Iterator<Item=usize> {
-    pub fn new(mut underlying: T, min_gram: usize, max_gram: usize) -> StutteringIterator<T> {
-        assert!(min_gram > 0);
-        let memory: Vec<usize> = (&mut underlying).take(max_gram + 1).collect();
-        if memory.len() <= min_gram {
-            // returns an empty iterator
-            StutteringIterator {
-                underlying,
-                min_gram: 1,
-                max_gram: 0,
-                memory,
-                cursor: 0,
-                gram_len: 0,
-            }
-        } else {
-            StutteringIterator {
-                underlying,
-                min_gram,
-                max_gram: memory.len() - 1,
-                memory,
-                cursor: 0,
-                gram_len: min_gram,
-            }
-        }
-    }
-}
-
-impl<T> Iterator for StutteringIterator<T>
-    where T: Iterator<Item=usize> {
-    type Item = (usize, usize);
-
-    fn next(&mut self) -> Option<(usize, usize)> {
-        if self.gram_len > self.max_gram {
-            // we have exhausted all options
-            // starting at `self.memory[self.cursor]`.
-            //
-            // Time to advance.
-            self.gram_len = self.min_gram;
-            if let Some(next_val) = self.underlying.next() {
-                self.memory[self.cursor] = next_val;
-            } else {
-                self.max_gram -= 1;
-            }
-            self.cursor += 1;
-            if self.cursor >= self.memory.len() {
-                self.cursor = 0;
-            }
-        }
-        if self.max_gram < self.min_gram {
-            return None;
-        }
-        let start = self.memory[self.cursor % self.memory.len()];
-        let stop = self.memory[(self.cursor + self.gram_len) % self.memory.len()];
-        self.gram_len += 1;
-        Some((start, stop))
-    }
-}
-
-/// Emits all of the offsets where a codepoint starts
-/// or a codepoint ends.
-///
-/// By convention, we emit [0] for the empty string.
-struct CodepointFrontiers<'a> {
-    s: &'a str,
-    next_el: Option<usize>
-}
-
-impl<'a> CodepointFrontiers<'a> {
-    fn for_str(s: &'a str) -> Self {
-        CodepointFrontiers {
-            s,
-            next_el: Some(0)
-        }
-    }
-}
-
-impl<'a> Iterator for CodepointFrontiers<'a> {
-    type Item = usize;
-
-    fn next(&mut self) -> Option<usize> {
-        self.next_el
-            .map(|offset| {
-                if self.s.is_empty() {
-                    self.next_el = None;
-                } else {
-                    let first_codepoint_width = utf8_codepoint_width(self.s.as_bytes()[0]);
-                    self.s = &self.s[first_codepoint_width..];
-                    self.next_el = Some(offset + first_codepoint_width);
-                }
-                offset
-            })
-    }
-}
-
-const CODEPOINT_UTF8_WIDTH: [u8; 16] = [
-    1, 1, 1, 1,
-    1, 1, 1, 1,
-    2, 2, 2, 2,
-    2, 2, 3, 4,
-];
-
-// Number of bytes to encode a codepoint in UTF-8 given
-// the first byte.
-//
-// To do that we count the number of higher significant bits set to `1`.
-fn utf8_codepoint_width(b: u8) -> usize {
-    let higher_4_bits = (b as usize) >> 4;
-    CODEPOINT_UTF8_WIDTH[higher_4_bits] as usize
-}
-
-#[cfg(test)]
-mod tests {
-
-    use tokenizer::tokenizer::{TokenStream, Tokenizer};
-    use super::NgramTokenizer;
-    use tokenizer::Token;
-    use tokenizer::tests::assert_token;
-    use super::CodepointFrontiers;
-    use super::StutteringIterator;
-    use super::utf8_codepoint_width;
-
-    fn test_helper<T: TokenStream>(mut tokenizer: T) -> Vec<Token> {
-        let mut tokens: Vec<Token> = vec![];
-        tokenizer.process(&mut |token: &Token| tokens.push(token.clone()));
-        tokens
-    }
-
-    #[test]
-    fn test_utf8_codepoint_width() {
-        // 0xxx
-        for i in 0..128 {
-            assert_eq!(utf8_codepoint_width(i), 1);
-        }
-        // 110xx
-        for i in (128 | 64)..(128 | 64 | 32) {
-            assert_eq!(utf8_codepoint_width(i), 2);
-        }
-        // 1110xx
-        for i in (128 | 64 | 32)..(128 | 64 | 32 | 16) {
-            assert_eq!(utf8_codepoint_width(i), 3);
-        }
-        // 1111xx
-        for i in (128 | 64 | 32 | 16)..256 {
-            assert_eq!(utf8_codepoint_width(i as u8), 4);
-        }
-    }
-
-    #[test]
-    fn test_codepoint_frontiers() {
-        assert_eq!(CodepointFrontiers::for_str("").collect::<Vec<_>>(), vec![0]);
-        assert_eq!(
-            CodepointFrontiers::for_str("abcd").collect::<Vec<_>>(),
-            vec![0, 1, 2, 3, 4]
-        );
-        assert_eq!(
-            CodepointFrontiers::for_str("aあ").collect::<Vec<_>>(),
-            vec![0, 1, 4]
-        );
-    }
-
-    #[test]
-    fn test_ngram_tokenizer_1_2_false() {
-        let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hello"));
-        assert_eq!(tokens.len(), 9);
-        assert_token(&tokens[0], 0, "h", 0, 1);
-        assert_token(&tokens[1], 0, "he", 0, 2);
-        assert_token(&tokens[2], 0, "e", 1, 2);
-        assert_token(&tokens[3], 0, "el", 1, 3);
-        assert_token(&tokens[4], 0, "l", 2, 3);
-        assert_token(&tokens[5], 0, "ll", 2, 4);
-        assert_token(&tokens[6], 0, "l", 3, 4);
-        assert_token(&tokens[7], 0, "lo", 3, 5);
-        assert_token(&tokens[8], 0, "o", 4, 5);
-    }
-
-    #[test]
-    fn test_ngram_tokenizer_min_max_equal() {
-        let tokens = test_helper(NgramTokenizer::all_ngrams(3, 3).token_stream("hello"));
-        assert_eq!(tokens.len(), 3);
-        assert_token(&tokens[0], 0, "hel", 0, 3);
-        assert_token(&tokens[1], 0, "ell", 1, 4);
-        assert_token(&tokens[2], 0, "llo", 2, 5);
-    }
-
-    #[test]
-    fn test_ngram_tokenizer_2_5_prefix() {
-        let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("frankenstein"));
-        assert_eq!(tokens.len(), 4);
-        assert_token(&tokens[0], 0, "fr", 0, 2);
-        assert_token(&tokens[1], 0, "fra", 0, 3);
-        assert_token(&tokens[2], 0, "fran", 0, 4);
-        assert_token(&tokens[3], 0, "frank", 0, 5);
-    }
-
-    #[test]
-    fn test_ngram_non_ascii_1_2() {
-        let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hεllo"));
-        assert_eq!(tokens.len(), 9);
-        assert_token(&tokens[0], 0, "h", 0, 1);
-        assert_token(&tokens[1], 0, "hε", 0, 3);
-        assert_token(&tokens[2], 0, "ε", 1, 3);
-        assert_token(&tokens[3], 0, "εl", 1, 4);
-        assert_token(&tokens[4], 0, "l", 3, 4);
-        assert_token(&tokens[5], 0, "ll", 3, 5);
-        assert_token(&tokens[6], 0, "l", 4, 5);
-        assert_token(&tokens[7], 0, "lo", 4, 6);
-        assert_token(&tokens[8], 0, "o", 5, 6);
-    }
-
-    #[test]
-    fn test_ngram_non_ascii_2_5_prefix() {
-        let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("hεllo"));
-        assert_eq!(tokens.len(), 4);
-        assert_token(&tokens[0], 0, "hε", 0, 3);
-        assert_token(&tokens[1], 0, "hεl", 0, 4);
-        assert_token(&tokens[2], 0, "hεll", 0, 5);
-        assert_token(&tokens[3], 0, "hεllo", 0, 6);
-    }
-
-    #[test]
-    fn test_ngram_empty() {
-        let tokens = test_helper(NgramTokenizer::all_ngrams(1, 5).token_stream(""));
-        assert!(tokens.is_empty());
-        let tokens = test_helper(NgramTokenizer::all_ngrams(2, 5).token_stream(""));
-        assert!(tokens.is_empty());
-    }
-
-    #[test]
-    #[should_panic(expected = "min_gram must be greater than 0")]
-    fn test_ngram_min_max_interval_empty() {
-        test_helper(NgramTokenizer::all_ngrams(0, 2).token_stream("hellossss"));
-    }
-
-    #[test]
-    #[should_panic(expected = "min_gram must not be greater than max_gram")]
-    fn test_invalid_interval_should_panic_if_smaller() {
-        NgramTokenizer::all_ngrams(2, 1);
-    }
-
-    #[test]
-    fn test_stutterring_iterator_empty() {
-        let rg: Vec<usize> = vec![0];
-        let mut it = StutteringIterator::new(rg.into_iter(), 1, 2);
-        assert_eq!(it.next(), None);
-    }
-
-    #[test]
-    fn test_stutterring_iterator() {
-        let rg: Vec<usize> = (0..10).collect();
-        let mut it = StutteringIterator::new(rg.into_iter(), 1, 2);
-        assert_eq!(it.next(), Some((0, 1)));
-        assert_eq!(it.next(), Some((0, 2)));
-        assert_eq!(it.next(), Some((1, 2)));
-        assert_eq!(it.next(), Some((1, 3)));
-        assert_eq!(it.next(), Some((2, 3)));
-        assert_eq!(it.next(), Some((2, 4)));
-        assert_eq!(it.next(), Some((3, 4)));
-        assert_eq!(it.next(), Some((3, 5)));
-        assert_eq!(it.next(), Some((4, 5)));
-        assert_eq!(it.next(), Some((4, 6)));
-        assert_eq!(it.next(), Some((5, 6)));
-        assert_eq!(it.next(), Some((5, 7)));
-        assert_eq!(it.next(), Some((6, 7)));
-        assert_eq!(it.next(), Some((6, 8)));
-        assert_eq!(it.next(), Some((7, 8)));
-        assert_eq!(it.next(), Some((7, 9)));
-        assert_eq!(it.next(), Some((8, 9)));
-        assert_eq!(it.next(), None);
-    }
-}
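For orientation, the removed test module above drives the tokenizer through the public surface shown on the left side of this diff: build an `NgramTokenizer` with `all_ngrams` or `prefix_only`, ask it for a token stream, and pull tokens with `advance`/`token`. The sketch below illustrates that pattern, assuming those items are re-exported from `tantivy::tokenizer` as in 0.7.1; it is not code from either branch.

```rust
use tantivy::tokenizer::{NgramTokenizer, Token, TokenStream, Tokenizer};

// Collect every 2- and 3-gram of `text`, the way the removed tests do via
// `test_helper`, but using the `advance`/`token` loop shown in the diff.
fn collect_ngrams(text: &str) -> Vec<Token> {
    let mut stream = NgramTokenizer::all_ngrams(2, 3).token_stream(text);
    let mut tokens = Vec::new();
    while stream.advance() {
        tokens.push(stream.token().clone());
    }
    tokens
}

fn main() {
    for token in collect_ngrams("hεllo") {
        println!("{:?} [{}..{}]", token.text, token.offset_from, token.offset_to);
    }
}
```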