mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-28 13:02:55 +00:00
Compare commits
9 Commits
issue/weba
...
issue/457
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
bb21d12a70 | ||
|
|
4565aba62a | ||
|
|
545a7ec8dd | ||
|
|
14908479d5 | ||
|
|
ab4593eeb7 | ||
|
|
e75bb1d6a1 | ||
|
|
63b9d62237 | ||
|
|
0098e3d428 | ||
|
|
69d5e4b9b1 |
@@ -1,3 +1,7 @@
|
||||
Tantivy 0.7.1
|
||||
=====================
|
||||
- Bugfix: NGramTokenizer panics on non ascii chars
|
||||
- Added a space usage API
|
||||
|
||||
Tantivy 0.7
|
||||
=====================
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tantivy"
|
||||
version = "0.7.0"
|
||||
version = "0.7.2"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = ["database-implementations", "data-structures"]
|
||||
@@ -12,7 +12,7 @@ readme = "README.md"
|
||||
keywords = ["search", "information", "retrieval"]
|
||||
|
||||
[dependencies]
|
||||
base64 = "0.9.1"
|
||||
base64 = "0.10.0"
|
||||
byteorder = "1.0"
|
||||
lazy_static = "1"
|
||||
regex = "1.0"
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
|
||||
**Tantivy** is a **full text search engine library** written in rust.
|
||||
|
||||
It is closer to Lucene than to Elastic Search and Solr in the sense it is not
|
||||
It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elastic Search](https://www.elastic.co/products/elasticsearch) and [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
|
||||
an off-the-shelf search engine server, but rather a crate that can be used
|
||||
to build such a search engine.
|
||||
|
||||
|
||||
@@ -4,6 +4,8 @@ use common::VInt;
|
||||
use directory::ReadOnlySource;
|
||||
use directory::WritePtr;
|
||||
use schema::Field;
|
||||
use space_usage::PerFieldSpaceUsage;
|
||||
use space_usage::FieldUsage;
|
||||
use std::collections::HashMap;
|
||||
use std::io::Write;
|
||||
use std::io::{self, Read};
|
||||
@@ -166,6 +168,16 @@ impl CompositeFile {
|
||||
.get(&FileAddr { field, idx })
|
||||
.map(|&(from, to)| self.data.slice(from, to))
|
||||
}
|
||||
|
||||
pub fn space_usage(&self) -> PerFieldSpaceUsage {
|
||||
let mut fields = HashMap::new();
|
||||
for (&field_addr, &(start, end)) in self.offsets_index.iter() {
|
||||
fields.entry(field_addr.field)
|
||||
.or_insert_with(|| FieldUsage::empty(field_addr.field))
|
||||
.add_field_idx(field_addr.idx, end - start);
|
||||
}
|
||||
PerFieldSpaceUsage::new(fields)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -49,6 +49,11 @@ pub struct Index {
|
||||
}
|
||||
|
||||
impl Index {
|
||||
/// Examines the directory to see if it contains an index.
///
/// Returns the result of `dir.exists(&META_FILEPATH)`.
|
||||
pub fn exists<Dir: Directory>(dir: &Dir) -> bool {
|
||||
dir.exists(&META_FILEPATH)
|
||||
}
|
||||
|
||||
/// Creates a new index using the `RAMDirectory`.
|
||||
///
|
||||
/// The index will be allocated in anonymous memory.
|
||||
@@ -65,9 +70,28 @@ impl Index {
|
||||
#[cfg(feature = "mmap")]
pub fn create_in_dir<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> {
    let directory = MmapDirectory::open(directory_path)?;
    // Refuse to create an index on top of one that is already there.
    if !Index::exists(&directory) {
        Index::create(directory, schema)
    } else {
        Err(TantivyError::IndexAlreadyExists)
    }
}
|
||||
|
||||
/// Opens or creates a new index in the provided directory
|
||||
#[cfg(feature = "mmap")]
|
||||
pub fn open_or_create<Dir: Directory>(dir: Dir, schema: Schema) -> Result<Index> {
|
||||
if Index::exists(&dir) {
|
||||
let index = Index::open(dir)?;
|
||||
if index.schema() == schema {
|
||||
Ok(index)
|
||||
} else {
|
||||
Err(TantivyError::SchemaError("An index exists but the schema does not match.".to_string()))
|
||||
}
|
||||
} else {
|
||||
Index::create(dir, schema)
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new index in a temp directory.
|
||||
///
|
||||
/// The index will use the `MMapDirectory` in a newly created directory.
|
||||
@@ -89,6 +113,8 @@ impl Index {
|
||||
}
|
||||
|
||||
/// Create a new index from a directory.
|
||||
///
|
||||
/// This will overwrite existing meta.json
|
||||
fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
|
||||
save_new_metas(schema.clone(), 0, directory.borrow_mut())?;
|
||||
let metas = IndexMeta::with_schema(schema);
|
||||
@@ -328,8 +354,9 @@ impl Clone for Index {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use schema::{SchemaBuilder, INT_INDEXED, TEXT};
|
||||
use schema::{Schema, SchemaBuilder, INT_INDEXED, TEXT};
|
||||
use Index;
|
||||
use directory::RAMDirectory;
|
||||
|
||||
#[test]
|
||||
fn test_indexer_for_field() {
|
||||
@@ -345,4 +372,52 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
fn test_index_exists() {
    // A directory reports an index only once one has been created in it.
    let ram_dir = RAMDirectory::create();
    assert!(!Index::exists(&ram_dir));
    let created_ok = Index::create(ram_dir.clone(), throw_away_schema()).is_ok();
    assert!(created_ok);
    assert!(Index::exists(&ram_dir));
}
|
||||
|
||||
#[test]
fn open_or_create_should_create() {
    // On an empty directory, `open_or_create` must create the index.
    let ram_dir = RAMDirectory::create();
    assert!(!Index::exists(&ram_dir));
    let created_ok = Index::open_or_create(ram_dir.clone(), throw_away_schema()).is_ok();
    assert!(created_ok);
    assert!(Index::exists(&ram_dir));
}
|
||||
|
||||
|
||||
#[test]
fn open_or_create_should_open() {
    // With an existing index and the same schema, `open_or_create` must succeed.
    let ram_dir = RAMDirectory::create();
    let created_ok = Index::create(ram_dir.clone(), throw_away_schema()).is_ok();
    assert!(created_ok);
    assert!(Index::exists(&ram_dir));
    let opened_ok = Index::open_or_create(ram_dir, throw_away_schema()).is_ok();
    assert!(opened_ok);
}
|
||||
|
||||
#[test]
fn create_should_wipeoff_existing() {
    // `Index::create` succeeds even if the directory already holds an index
    // built with a different schema.
    let ram_dir = RAMDirectory::create();
    let first_ok = Index::create(ram_dir.clone(), throw_away_schema()).is_ok();
    assert!(first_ok);
    assert!(Index::exists(&ram_dir));
    let empty_schema = SchemaBuilder::default().build();
    let second_ok = Index::create(ram_dir.clone(), empty_schema).is_ok();
    assert!(second_ok);
}
|
||||
|
||||
#[test]
fn open_or_create_exists_but_schema_does_not_match() {
    let ram_dir = RAMDirectory::create();
    let created_ok = Index::create(ram_dir.clone(), throw_away_schema()).is_ok();
    assert!(created_ok);
    assert!(Index::exists(&ram_dir));

    // Same schema: the existing index is opened.
    let reopened_ok = Index::open_or_create(ram_dir.clone(), throw_away_schema()).is_ok();
    assert!(reopened_ok);

    // Different schema: the call is rejected with a schema error.
    let mismatch = Index::open_or_create(ram_dir, SchemaBuilder::default().build()).unwrap_err();
    assert_eq!(
        format!("{:?}", mismatch),
        "SchemaError(\"An index exists but the schema does not match.\")"
    );
}
|
||||
|
||||
fn throw_away_schema() -> Schema {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let _ = schema_builder.add_u64_field("num_likes", INT_INDEXED);
|
||||
schema_builder.build()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@ use query::Query;
|
||||
use schema::Document;
|
||||
use schema::Schema;
|
||||
use schema::{Field, Term};
|
||||
use space_usage::SearcherSpaceUsage;
|
||||
use std::fmt;
|
||||
use std::sync::Arc;
|
||||
use termdict::TermMerger;
|
||||
@@ -99,6 +100,15 @@ impl Searcher {
|
||||
.collect::<Vec<_>>();
|
||||
FieldSearcher::new(inv_index_readers)
|
||||
}
|
||||
|
||||
/// Summarize total space usage of this searcher.
|
||||
pub fn space_usage(&self) -> SearcherSpaceUsage {
|
||||
let mut space_usage = SearcherSpaceUsage::new();
|
||||
for segment_reader in self.segment_readers.iter() {
|
||||
space_usage.add_segment(segment_reader.space_usage());
|
||||
}
|
||||
space_usage
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FieldSearcher {
|
||||
|
||||
@@ -16,6 +16,7 @@ use schema::Document;
|
||||
use schema::Field;
|
||||
use schema::FieldType;
|
||||
use schema::Schema;
|
||||
use space_usage::SegmentSpaceUsage;
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::sync::Arc;
|
||||
@@ -381,6 +382,21 @@ impl SegmentReader {
|
||||
pub fn doc_ids_alive(&self) -> SegmentReaderAliveDocsIterator {
|
||||
SegmentReaderAliveDocsIterator::new(&self)
|
||||
}
|
||||
|
||||
/// Summarize total space usage of this segment.
|
||||
pub fn space_usage(&self) -> SegmentSpaceUsage {
|
||||
SegmentSpaceUsage::new(
|
||||
self.num_docs(),
|
||||
self.termdict_composite.space_usage(),
|
||||
self.postings_composite.space_usage(),
|
||||
self.positions_composite.space_usage(),
|
||||
self.positions_idx_composite.space_usage(),
|
||||
self.fast_fields_composite.space_usage(),
|
||||
self.fieldnorms_composite.space_usage(),
|
||||
self.store_reader.space_usage(),
|
||||
self.delete_bitset_opt.as_ref().map(|x| x.space_usage()).unwrap_or(0),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for SegmentReader {
|
||||
|
||||
@@ -364,6 +364,11 @@ mod tests {
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
fn test_open_non_existant_path() {
    // Opening a directory that does not exist must fail rather than create it.
    let missing_path = PathBuf::from("./nowhere");
    let open_result = MmapDirectory::open(missing_path);
    assert!(open_result.is_err());
}
|
||||
|
||||
#[test]
|
||||
fn test_open_empty() {
|
||||
// empty file is actually an edge case because those
|
||||
|
||||
@@ -20,6 +20,9 @@ pub enum TantivyError {
|
||||
/// File already exists, this is a problem when we try to write into a new file.
|
||||
#[fail(display = "file already exists: '{:?}'", _0)]
|
||||
FileAlreadyExists(PathBuf),
|
||||
/// Index already exists in this directory
|
||||
#[fail(display = "index already exists")]
|
||||
IndexAlreadyExists,
|
||||
/// Failed to acquire file lock
|
||||
#[fail(
|
||||
display = "Failed to acquire Lockfile: {:?}. Possible causes: another IndexWriter instance or panic during previous lock drop.",
|
||||
|
||||
@@ -2,6 +2,7 @@ use bit_set::BitSet;
|
||||
use common::HasLen;
|
||||
use directory::ReadOnlySource;
|
||||
use directory::WritePtr;
|
||||
use space_usage::ByteCount;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use DocId;
|
||||
@@ -63,6 +64,11 @@ impl DeleteBitSet {
|
||||
b & (1u8 << shift) != 0
|
||||
}
|
||||
}
|
||||
|
||||
/// Summarize total space usage of this bitset.
///
/// Returns the length of the underlying `data`.
|
||||
pub fn space_usage(&self) -> ByteCount {
|
||||
self.data.len()
|
||||
}
|
||||
}
|
||||
|
||||
impl HasLen for DeleteBitSet {
|
||||
|
||||
@@ -213,6 +213,7 @@ pub(crate) mod positions;
|
||||
pub mod postings;
|
||||
pub mod query;
|
||||
pub mod schema;
|
||||
pub mod space_usage;
|
||||
pub mod store;
|
||||
pub mod termdict;
|
||||
|
||||
|
||||
@@ -126,7 +126,6 @@ impl SegmentPostings {
|
||||
fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) {
|
||||
let mut start = 0;
|
||||
let end = arr.len();
|
||||
debug_assert!(target >= arr[start]);
|
||||
debug_assert!(target <= arr[end - 1]);
|
||||
let mut jump = 1;
|
||||
loop {
|
||||
@@ -216,11 +215,10 @@ impl DocSet for SegmentPostings {
|
||||
|
||||
// we're in the right block now, start with an exponential search
|
||||
let block_docs = self.block_cursor.docs();
|
||||
|
||||
debug_assert!(target >= self.doc());
|
||||
let new_cur = self
|
||||
.cur
|
||||
.wrapping_add(search_within_block(&block_docs[self.cur..], target));
|
||||
|
||||
if need_positions {
|
||||
sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur]
|
||||
.iter()
|
||||
@@ -632,8 +630,10 @@ mod tests {
|
||||
use schema::IndexRecordOption;
|
||||
use schema::SchemaBuilder;
|
||||
use schema::Term;
|
||||
use super::exponential_search;
|
||||
use schema::INT_INDEXED;
|
||||
use DocId;
|
||||
use SkipResult;
|
||||
|
||||
#[test]
|
||||
fn test_empty_segment_postings() {
|
||||
@@ -661,6 +661,13 @@ mod tests {
|
||||
.0
|
||||
}
|
||||
|
||||
#[test]
fn test_exponentiel_search() {
    let short: &[u32] = &[1, 2];
    let long: Vec<u32> = (1..12).collect();

    // A target below the first element still yields the first range.
    assert_eq!(exponential_search(0, short), (0, 1));
    assert_eq!(exponential_search(1, short), (0, 1));
    assert_eq!(exponential_search(7, &long[..]), (3, 7));
}
|
||||
|
||||
fn util_test_search_within_block(block: &[u32], target: u32) {
|
||||
assert_eq!(
|
||||
search_within_block(block, target),
|
||||
@@ -692,7 +699,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_block_segment_postings() {
|
||||
let mut block_segments = build_block_postings((0..100_000).collect::<Vec<u32>>());
|
||||
let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>());
|
||||
let mut offset: u32 = 0u32;
|
||||
// checking that the block before calling advance is empty
|
||||
assert!(block_segments.docs().is_empty());
|
||||
@@ -706,14 +713,45 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
fn build_block_postings(docs: Vec<DocId>) -> BlockSegmentPostings {
|
||||
|
||||
#[test]
fn test_skip_right_at_new_block() {
    // Doc ids 0..=127 followed by 129 and 130; doc 128 is deliberately absent.
    let doc_ids: Vec<u32> = (0..128).chain(129..131).collect();
    let new_docset =
        || SegmentPostings::from_block_postings(build_block_postings(&doc_ids), None);

    {
        // Skipping to the absent doc 128 oversteps to 129.
        let mut docset = new_docset();
        assert_eq!(docset.skip_next(128), SkipResult::OverStep);
        assert_eq!(docset.doc(), 129);
        assert!(docset.advance());
        assert_eq!(docset.doc(), 130);
        assert!(!docset.advance());
    }
    {
        // Skipping to 129 reaches it exactly.
        let mut docset = new_docset();
        assert_eq!(docset.skip_next(129), SkipResult::Reached);
        assert_eq!(docset.doc(), 129);
        assert!(docset.advance());
        assert_eq!(docset.doc(), 130);
        assert!(!docset.advance());
    }
    {
        // Skipping past the last doc ends the docset.
        let mut docset = new_docset();
        assert_eq!(docset.skip_next(131), SkipResult::End);
    }
}
|
||||
|
||||
fn build_block_postings(docs: &[DocId]) -> BlockSegmentPostings {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
let mut last_doc = 0u32;
|
||||
for doc in docs {
|
||||
for &doc in docs {
|
||||
for _ in last_doc..doc {
|
||||
index_writer.add_document(doc!(int_field=>1u64));
|
||||
}
|
||||
@@ -733,7 +771,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_block_segment_postings_skip() {
|
||||
for i in 0..4 {
|
||||
let mut block_postings = build_block_postings(vec![3]);
|
||||
let mut block_postings = build_block_postings(&[3]);
|
||||
assert_eq!(
|
||||
block_postings.skip_to(i),
|
||||
BlockSegmentPostingsSkipResult::Success(0u32)
|
||||
@@ -743,7 +781,7 @@ mod tests {
|
||||
BlockSegmentPostingsSkipResult::Terminated
|
||||
);
|
||||
}
|
||||
let mut block_postings = build_block_postings(vec![3]);
|
||||
let mut block_postings = build_block_postings(&[3]);
|
||||
assert_eq!(
|
||||
block_postings.skip_to(4u32),
|
||||
BlockSegmentPostingsSkipResult::Terminated
|
||||
@@ -756,7 +794,7 @@ mod tests {
|
||||
for i in 0..1300 {
|
||||
docs.push((i * i / 100) + i);
|
||||
}
|
||||
let mut block_postings = build_block_postings(docs.clone());
|
||||
let mut block_postings = build_block_postings(&docs[..]);
|
||||
for i in vec![0, 424, 10000] {
|
||||
assert_eq!(
|
||||
block_postings.skip_to(i),
|
||||
|
||||
@@ -14,7 +14,7 @@ use std::fmt;
|
||||
/// - a field name
|
||||
/// - a field type, itself wrapping up options describing
|
||||
/// how the field should be indexed.
|
||||
#[derive(Clone, Debug)]
|
||||
#[derive(Clone, Debug, Eq, PartialEq)]
|
||||
pub struct FieldEntry {
|
||||
name: String,
|
||||
field_type: FieldType,
|
||||
|
||||
@@ -134,6 +134,15 @@ struct InnerSchema {
|
||||
fields_map: HashMap<String, Field>, // transient
|
||||
}
|
||||
|
||||
impl PartialEq for InnerSchema {
|
||||
fn eq(&self, other: &InnerSchema) -> bool {
|
||||
self.fields == other.fields
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for InnerSchema {}
|
||||
|
||||
|
||||
/// Tantivy has a very strict schema.
|
||||
/// You need to specify in advance, whether a field is indexed or not,
|
||||
/// stored or not, and RAM-based or not.
|
||||
@@ -154,7 +163,7 @@ struct InnerSchema {
|
||||
/// let schema = schema_builder.build();
|
||||
///
|
||||
/// ```
|
||||
#[derive(Clone)]
|
||||
#[derive(Clone, Eq, PartialEq)]
|
||||
pub struct Schema(Arc<InnerSchema>);
|
||||
|
||||
impl Schema {
|
||||
|
||||
484
src/space_usage/mod.rs
Normal file
484
src/space_usage/mod.rs
Normal file
@@ -0,0 +1,484 @@
|
||||
/*!
|
||||
Representations for the space usage of various parts of a Tantivy index.
|
||||
|
||||
This can be used programmatically, and will also be exposed in a human readable fashion in
|
||||
tantivy-cli.
|
||||
|
||||
One important caveat for all of this functionality is that none of it currently takes storage-level
|
||||
details into consideration. For example, if your file system block size is 4096 bytes, we can
|
||||
under-count actual resultant space usage by up to 4095 bytes per file.
|
||||
*/
|
||||
|
||||
use schema::Field;
|
||||
use std::collections::HashMap;
|
||||
use SegmentComponent;
|
||||
|
||||
/// Indicates space usage in bytes
|
||||
pub type ByteCount = usize;
|
||||
|
||||
/// Enum containing any of the possible space usage results for segment components.
|
||||
pub enum ComponentSpaceUsage {
|
||||
/// Data is stored per field in a uniform way
|
||||
PerField(PerFieldSpaceUsage),
|
||||
/// Data is stored in separate pieces in the store
|
||||
Store(StoreSpaceUsage),
|
||||
/// Some sort of raw byte count
|
||||
Basic(ByteCount),
|
||||
}
|
||||
|
||||
/// Represents combined space usage of an entire searcher and its component segments.
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct SearcherSpaceUsage {
|
||||
segments: Vec<SegmentSpaceUsage>,
|
||||
total: ByteCount,
|
||||
}
|
||||
|
||||
impl SearcherSpaceUsage {
|
||||
pub(crate) fn new() -> SearcherSpaceUsage {
|
||||
SearcherSpaceUsage {
|
||||
segments: Vec::new(),
|
||||
total: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Add a segment, to `self`.
|
||||
/// Performs no deduplication or other intelligence.
|
||||
pub(crate) fn add_segment(&mut self, segment: SegmentSpaceUsage) {
|
||||
self.total += segment.total();
|
||||
self.segments.push(segment);
|
||||
}
|
||||
|
||||
/// Per segment space usage
|
||||
pub fn segments(&self) -> &[SegmentSpaceUsage] {
|
||||
&self.segments[..]
|
||||
}
|
||||
|
||||
/// Returns total byte usage of this searcher, including all large subcomponents.
|
||||
/// Does not account for smaller things like `meta.json`.
|
||||
pub fn total(&self) -> ByteCount {
|
||||
self.total
|
||||
}
|
||||
}
|
||||
|
||||
/// Represents combined space usage for all of the large components comprising a segment.
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct SegmentSpaceUsage {
|
||||
num_docs: u32,
|
||||
|
||||
termdict: PerFieldSpaceUsage,
|
||||
postings: PerFieldSpaceUsage,
|
||||
positions: PerFieldSpaceUsage,
|
||||
positions_idx: PerFieldSpaceUsage,
|
||||
fast_fields: PerFieldSpaceUsage,
|
||||
fieldnorms: PerFieldSpaceUsage,
|
||||
|
||||
store: StoreSpaceUsage,
|
||||
|
||||
deletes: ByteCount,
|
||||
|
||||
total: ByteCount,
|
||||
}
|
||||
|
||||
impl SegmentSpaceUsage {
|
||||
pub(crate) fn new(
|
||||
num_docs: u32,
|
||||
termdict: PerFieldSpaceUsage,
|
||||
postings: PerFieldSpaceUsage,
|
||||
positions: PerFieldSpaceUsage,
|
||||
positions_idx: PerFieldSpaceUsage,
|
||||
fast_fields: PerFieldSpaceUsage,
|
||||
fieldnorms: PerFieldSpaceUsage,
|
||||
store: StoreSpaceUsage,
|
||||
deletes: ByteCount,
|
||||
) -> SegmentSpaceUsage {
|
||||
let total = termdict.total()
|
||||
+ postings.total()
|
||||
+ positions.total()
|
||||
+ fast_fields.total()
|
||||
+ fieldnorms.total()
|
||||
+ store.total()
|
||||
+ deletes;
|
||||
SegmentSpaceUsage {
|
||||
num_docs,
|
||||
termdict,
|
||||
postings,
|
||||
positions,
|
||||
positions_idx,
|
||||
fast_fields,
|
||||
fieldnorms,
|
||||
store,
|
||||
deletes,
|
||||
total,
|
||||
}
|
||||
}
|
||||
|
||||
/// Space usage for the given component
|
||||
///
|
||||
/// Clones the underlying data.
|
||||
/// Use the components directly if this is somehow in performance critical code.
|
||||
pub fn component(&self, component: SegmentComponent) -> ComponentSpaceUsage {
|
||||
use SegmentComponent::*;
|
||||
use self::ComponentSpaceUsage::*;
|
||||
match component {
|
||||
POSTINGS => PerField(self.postings().clone()),
|
||||
POSITIONS => PerField(self.positions().clone()),
|
||||
POSITIONSSKIP => PerField(self.positions_skip_idx().clone()),
|
||||
FASTFIELDS => PerField(self.fast_fields().clone()),
|
||||
FIELDNORMS => PerField(self.fieldnorms().clone()),
|
||||
TERMS => PerField(self.termdict().clone()),
|
||||
STORE => Store(self.store().clone()),
|
||||
DELETE => Basic(self.deletes()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Num docs in segment
|
||||
pub fn num_docs(&self) -> u32 {
|
||||
self.num_docs
|
||||
}
|
||||
|
||||
/// Space usage for term dictionary
|
||||
pub fn termdict(&self) -> &PerFieldSpaceUsage {
|
||||
&self.termdict
|
||||
}
|
||||
|
||||
/// Space usage for postings list
|
||||
pub fn postings(&self) -> &PerFieldSpaceUsage {
|
||||
&self.postings
|
||||
}
|
||||
|
||||
/// Space usage for positions
|
||||
pub fn positions(&self) -> &PerFieldSpaceUsage {
|
||||
&self.positions
|
||||
}
|
||||
|
||||
/// Space usage for positions skip idx
|
||||
pub fn positions_skip_idx(&self) -> &PerFieldSpaceUsage {
|
||||
&self.positions_idx
|
||||
}
|
||||
|
||||
/// Space usage for fast fields
|
||||
pub fn fast_fields(&self) -> &PerFieldSpaceUsage {
|
||||
&self.fast_fields
|
||||
}
|
||||
|
||||
/// Space usage for field norms
|
||||
pub fn fieldnorms(&self) -> &PerFieldSpaceUsage {
|
||||
&self.fieldnorms
|
||||
}
|
||||
|
||||
/// Space usage for stored documents
|
||||
pub fn store(&self) -> &StoreSpaceUsage {
|
||||
&self.store
|
||||
}
|
||||
|
||||
/// Space usage for document deletions
|
||||
pub fn deletes(&self) -> ByteCount {
|
||||
self.deletes
|
||||
}
|
||||
|
||||
/// Total space usage in bytes for this segment.
|
||||
pub fn total(&self) -> ByteCount {
|
||||
self.total
|
||||
}
|
||||
}
|
||||
|
||||
/// Represents space usage for the Store for this segment.
|
||||
///
|
||||
/// This is composed of two parts.
|
||||
/// `data` represents the compressed data itself.
|
||||
/// `offsets` represents a lookup to find the start of a block
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct StoreSpaceUsage {
|
||||
data: ByteCount,
|
||||
offsets: ByteCount,
|
||||
}
|
||||
|
||||
impl StoreSpaceUsage {
|
||||
pub(crate) fn new(data: ByteCount, offsets: ByteCount) -> StoreSpaceUsage {
|
||||
StoreSpaceUsage { data, offsets }
|
||||
}
|
||||
|
||||
/// Space usage for the data part of the store
|
||||
pub fn data_usage(&self) -> ByteCount {
|
||||
self.data
|
||||
}
|
||||
|
||||
/// Space usage for the offsets part of the store (doc ID -> offset)
|
||||
pub fn offsets_usage(&self) -> ByteCount {
|
||||
self.offsets
|
||||
}
|
||||
|
||||
/// Total space usage in bytes for this Store
|
||||
pub fn total(&self) -> ByteCount {
|
||||
self.data + self.offsets
|
||||
}
|
||||
}
|
||||
|
||||
/// Represents space usage for all of the (field, index) pairs that appear in a CompositeFile.
|
||||
///
|
||||
/// A field can appear with a single index (typically 0) or with multiple indexes.
|
||||
/// Multiple indexes are used to handle variable length things, where
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct PerFieldSpaceUsage {
|
||||
fields: HashMap<Field, FieldUsage>,
|
||||
total: ByteCount
|
||||
}
|
||||
|
||||
impl PerFieldSpaceUsage {
|
||||
pub(crate) fn new(fields: HashMap<Field, FieldUsage>) -> PerFieldSpaceUsage {
|
||||
let total = fields.values().map(|x| x.total()).sum();
|
||||
PerFieldSpaceUsage { fields, total }
|
||||
}
|
||||
|
||||
/// Per field space usage
|
||||
pub fn fields(&self) -> impl Iterator<Item = (&Field, &FieldUsage)> {
|
||||
self.fields.iter()
|
||||
}
|
||||
|
||||
/// Bytes used by the represented file
|
||||
pub fn total(&self) -> ByteCount {
|
||||
self.total
|
||||
}
|
||||
}
|
||||
|
||||
/// Represents space usage of a given field, breaking it down into the (field, index) pairs that
|
||||
/// comprise it.
|
||||
///
|
||||
/// See documentation for PerFieldSpaceUsage for slightly more information.
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct FieldUsage {
|
||||
field: Field,
|
||||
num_bytes: ByteCount,
|
||||
/// A field can be composed of more than one piece.
|
||||
/// These pieces are indexed by arbitrary numbers starting at zero.
|
||||
/// `self.num_bytes` includes all of `self.sub_num_bytes`.
|
||||
sub_num_bytes: Vec<Option<ByteCount>>,
|
||||
}
|
||||
|
||||
impl FieldUsage {
|
||||
pub(crate) fn empty(field: Field) -> FieldUsage {
|
||||
FieldUsage {
|
||||
field,
|
||||
num_bytes: 0,
|
||||
sub_num_bytes: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn add_field_idx(&mut self, idx: usize, size: ByteCount) {
|
||||
if self.sub_num_bytes.len() < idx + 1{
|
||||
self.sub_num_bytes.resize(idx + 1, None);
|
||||
}
|
||||
assert!(self.sub_num_bytes[idx].is_none());
|
||||
self.sub_num_bytes[idx] = Some(size);
|
||||
self.num_bytes += size
|
||||
}
|
||||
|
||||
/// Field
|
||||
pub fn field(&self) -> Field {
|
||||
self.field
|
||||
}
|
||||
|
||||
/// Space usage for each index
|
||||
pub fn sub_num_bytes(&self) -> &[Option<ByteCount>] {
|
||||
&self.sub_num_bytes[..]
|
||||
}
|
||||
|
||||
/// Total bytes used for this field in this context
|
||||
pub fn total(&self) -> ByteCount {
|
||||
self.num_bytes
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use core::Index;
|
||||
use schema::SchemaBuilder;
|
||||
use schema::{FAST, INT_INDEXED, TEXT};
|
||||
use schema::Field;
|
||||
use space_usage::ByteCount;
|
||||
use space_usage::PerFieldSpaceUsage;
|
||||
use schema::STORED;
|
||||
use Term;
|
||||
|
||||
#[test]
fn test_empty() {
    // An index in which nothing was committed reports zero bytes.
    let index = Index::create_in_ram(SchemaBuilder::new().build());

    index.load_searchers().unwrap();
    let usage = index.searcher().space_usage();
    assert_eq!(0, usage.total());
}
|
||||
|
||||
fn expect_single_field(field_space: &PerFieldSpaceUsage, field: &Field, min_size: ByteCount, max_size: ByteCount) {
|
||||
assert!(field_space.total() >= min_size);
|
||||
assert!(field_space.total() <= max_size);
|
||||
assert_eq!(
|
||||
vec![(field, field_space.total())],
|
||||
field_space.fields().map(|(x,y)| (x, y.total())).collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
fn test_fast_indexed() {
    let mut schema_builder = SchemaBuilder::new();
    let name = schema_builder.add_u64_field("name", FAST | INT_INDEXED);
    let index = Index::create_in_ram(schema_builder.build());

    {
        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
        for &value in &[1u64, 2u64, 10u64, 20u64] {
            index_writer.add_document(doc!(name => value));
        }
        index_writer.commit().unwrap();
    }

    index.load_searchers().unwrap();
    let usage = index.searcher().space_usage();
    assert!(usage.total() > 0);
    assert_eq!(1, usage.segments().len());

    let segment = &usage.segments()[0];
    assert!(segment.total() > 0);
    assert_eq!(4, segment.num_docs());

    // An indexed fast u64 field uses every component except the positions.
    expect_single_field(segment.termdict(), &name, 1, 512);
    expect_single_field(segment.postings(), &name, 1, 512);
    assert_eq!(0, segment.positions().total());
    assert_eq!(0, segment.positions_skip_idx().total());
    expect_single_field(segment.fast_fields(), &name, 1, 512);
    expect_single_field(segment.fieldnorms(), &name, 1, 512);
    // TODO: understand why the following fails
    // assert_eq!(0, segment.store().total());
    assert_eq!(0, segment.deletes());
}
|
||||
|
||||
#[test]
fn test_text() {
    let mut schema_builder = SchemaBuilder::new();
    let name = schema_builder.add_text_field("name", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    {
        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
        let texts = [
            "hi",
            "this is a test",
            "some more documents with some word overlap with the other test",
            "hello hi goodbye",
        ];
        for &text in &texts {
            index_writer.add_document(doc!(name => text));
        }
        index_writer.commit().unwrap();
    }

    index.load_searchers().unwrap();
    let usage = index.searcher().space_usage();
    assert!(usage.total() > 0);
    assert_eq!(1, usage.segments().len());

    let segment = &usage.segments()[0];
    assert!(segment.total() > 0);
    assert_eq!(4, segment.num_docs());

    // A `TEXT` field uses every component except the fast fields.
    expect_single_field(segment.termdict(), &name, 1, 512);
    expect_single_field(segment.postings(), &name, 1, 512);
    expect_single_field(segment.positions(), &name, 1, 512);
    expect_single_field(segment.positions_skip_idx(), &name, 1, 512);
    assert_eq!(0, segment.fast_fields().total());
    expect_single_field(segment.fieldnorms(), &name, 1, 512);
    // TODO: understand why the following fails
    // assert_eq!(0, segment.store().total());
    assert_eq!(0, segment.deletes());
}
|
||||
|
||||
#[test]
fn test_store() {
    let mut schema_builder = SchemaBuilder::new();
    let name = schema_builder.add_text_field("name", STORED);
    let index = Index::create_in_ram(schema_builder.build());

    {
        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
        let texts = [
            "hi",
            "this is a test",
            "some more documents with some word overlap with the other test",
            "hello hi goodbye",
        ];
        for &text in &texts {
            index_writer.add_document(doc!(name => text));
        }
        index_writer.commit().unwrap();
    }

    index.load_searchers().unwrap();
    let usage = index.searcher().space_usage();
    assert!(usage.total() > 0);
    assert_eq!(1, usage.segments().len());

    let segment = &usage.segments()[0];
    assert!(segment.total() > 0);
    assert_eq!(4, segment.num_docs());

    // A field that is only stored uses nothing but the store.
    assert_eq!(0, segment.termdict().total());
    assert_eq!(0, segment.postings().total());
    assert_eq!(0, segment.positions().total());
    assert_eq!(0, segment.positions_skip_idx().total());
    assert_eq!(0, segment.fast_fields().total());
    assert_eq!(0, segment.fieldnorms().total());
    let store_total = segment.store().total();
    assert!(store_total > 0);
    assert!(store_total < 512);
    assert_eq!(0, segment.deletes());
}
|
||||
|
||||
#[test]
fn test_deletes() {
    let mut schema_builder = SchemaBuilder::new();
    let name = schema_builder.add_u64_field("name", INT_INDEXED);
    let index = Index::create_in_ram(schema_builder.build());

    {
        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
        for value in 1u64..5u64 {
            index_writer.add_document(doc!(name => value));
        }
        index_writer.commit().unwrap();
    }

    {
        let mut index_writer2 = index.writer(50_000_000).unwrap();
        for &deleted in &[2u64, 3u64] {
            index_writer2.delete_term(Term::from_field_u64(name, deleted));
        }

        // ok, now we should have a deleted doc
        index_writer2.commit().unwrap();
    }

    index.load_searchers().unwrap();

    let usage = index.searcher().space_usage();
    assert!(usage.total() > 0);
    assert_eq!(1, usage.segments().len());

    let segment = &usage.segments()[0];
    assert!(segment.total() > 0);

    // Two of the four documents were deleted.
    assert_eq!(2, segment.num_docs());

    expect_single_field(segment.termdict(), &name, 1, 512);
    expect_single_field(segment.postings(), &name, 1, 512);
    assert_eq!(0, segment.positions().total());
    assert_eq!(0, segment.positions_skip_idx().total());
    assert_eq!(0, segment.fast_fields().total());
    expect_single_field(segment.fieldnorms(), &name, 1, 512);
    // TODO: understand why the following fails
    // assert_eq!(0, segment.store().total());
    assert!(segment.deletes() > 0);
}
|
||||
}
|
||||
@@ -6,6 +6,7 @@ use common::BinarySerializable;
|
||||
use common::VInt;
|
||||
use directory::ReadOnlySource;
|
||||
use schema::Document;
|
||||
use space_usage::StoreSpaceUsage;
|
||||
use std::cell::RefCell;
|
||||
use std::io;
|
||||
use std::mem::size_of;
|
||||
@@ -87,6 +88,11 @@ impl StoreReader {
|
||||
cursor = &cursor[..doc_length];
|
||||
Ok(Document::deserialize(&mut cursor)?)
|
||||
}
|
||||
|
||||
/// Summarize total space usage of this store reader.
|
||||
pub fn space_usage(&self) -> StoreSpaceUsage {
|
||||
StoreSpaceUsage::new(self.data.len(), self.offset_index_source.len())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg_attr(
|
||||
|
||||
@@ -157,35 +157,34 @@ pub use self::tokenizer::BoxedTokenizer;
|
||||
pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
|
||||
pub use self::tokenizer_manager::TokenizerManager;
|
||||
|
||||
/// This is a function that can be used in tests and doc tests
|
||||
/// to assert a token's correctness.
|
||||
/// TODO: can this be wrapped in #[cfg(test)] so as not to be in the
|
||||
/// public api?
|
||||
pub fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
|
||||
assert_eq!(
|
||||
token.position, position,
|
||||
"expected position {} but {:?}",
|
||||
position, token
|
||||
);
|
||||
assert_eq!(token.text, text, "expected text {} but {:?}", text, token);
|
||||
assert_eq!(
|
||||
token.offset_from, from,
|
||||
"expected offset_from {} but {:?}",
|
||||
from, token
|
||||
);
|
||||
assert_eq!(
|
||||
token.offset_to, to,
|
||||
"expected offset_to {} but {:?}",
|
||||
to, token
|
||||
);
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod test {
|
||||
use super::assert_token;
|
||||
pub mod tests {
|
||||
use super::Token;
|
||||
use super::TokenizerManager;
|
||||
|
||||
|
||||
/// This is a function that can be used in tests and doc tests
|
||||
/// to assert a token's correctness.
|
||||
pub fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
|
||||
assert_eq!(
|
||||
token.position, position,
|
||||
"expected position {} but {:?}",
|
||||
position, token
|
||||
);
|
||||
assert_eq!(token.text, text, "expected text {} but {:?}", text, token);
|
||||
assert_eq!(
|
||||
token.offset_from, from,
|
||||
"expected offset_from {} but {:?}",
|
||||
from, token
|
||||
);
|
||||
assert_eq!(
|
||||
token.offset_to, to,
|
||||
"expected offset_to {} but {:?}",
|
||||
to, token
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_raw_tokenizer() {
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
@@ -224,72 +223,6 @@ pub mod test {
|
||||
assert_token(&tokens[3], 3, "payer", 17, 22);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ngram_tokenizer() {
|
||||
use super::{LowerCaser, NgramTokenizer};
|
||||
use tokenizer::tokenizer::TokenStream;
|
||||
use tokenizer::tokenizer::Tokenizer;
|
||||
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
tokenizer_manager.register("ngram12", NgramTokenizer::new(1, 2, false));
|
||||
tokenizer_manager.register(
|
||||
"ngram3",
|
||||
NgramTokenizer::new(3, 3, false).filter(LowerCaser),
|
||||
);
|
||||
tokenizer_manager.register(
|
||||
"edgegram5",
|
||||
NgramTokenizer::new(2, 5, true).filter(LowerCaser),
|
||||
);
|
||||
|
||||
let tokenizer = NgramTokenizer::new(1, 2, false);
|
||||
let mut tokens: Vec<Token> = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| {
|
||||
tokens.push(token.clone());
|
||||
};
|
||||
tokenizer.token_stream("hello").process(&mut add_token);
|
||||
}
|
||||
assert_eq!(tokens.len(), 9);
|
||||
assert_token(&tokens[0], 0, "h", 0, 1);
|
||||
assert_token(&tokens[1], 0, "he", 0, 2);
|
||||
assert_token(&tokens[2], 1, "e", 1, 2);
|
||||
assert_token(&tokens[3], 1, "el", 1, 3);
|
||||
assert_token(&tokens[4], 2, "l", 2, 3);
|
||||
assert_token(&tokens[5], 2, "ll", 2, 4);
|
||||
assert_token(&tokens[6], 3, "l", 3, 4);
|
||||
assert_token(&tokens[7], 3, "lo", 3, 5);
|
||||
assert_token(&tokens[8], 4, "o", 4, 5);
|
||||
|
||||
let tokenizer = tokenizer_manager.get("ngram3").unwrap();
|
||||
let mut tokens: Vec<Token> = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| {
|
||||
tokens.push(token.clone());
|
||||
};
|
||||
tokenizer.token_stream("Hello").process(&mut add_token);
|
||||
}
|
||||
assert_eq!(tokens.len(), 3);
|
||||
assert_token(&tokens[0], 0, "hel", 0, 3);
|
||||
assert_token(&tokens[1], 1, "ell", 1, 4);
|
||||
assert_token(&tokens[2], 2, "llo", 2, 5);
|
||||
|
||||
let tokenizer = tokenizer_manager.get("edgegram5").unwrap();
|
||||
let mut tokens: Vec<Token> = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| {
|
||||
tokens.push(token.clone());
|
||||
};
|
||||
tokenizer
|
||||
.token_stream("Frankenstein")
|
||||
.process(&mut add_token);
|
||||
}
|
||||
assert_eq!(tokens.len(), 4);
|
||||
assert_token(&tokens[0], 0, "fr", 0, 2);
|
||||
assert_token(&tokens[1], 0, "fra", 0, 3);
|
||||
assert_token(&tokens[2], 0, "fran", 0, 4);
|
||||
assert_token(&tokens[3], 0, "frank", 0, 5);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tokenizer_empty() {
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
|
||||
@@ -2,14 +2,15 @@ use super::{Token, TokenStream, Tokenizer};
|
||||
|
||||
/// Tokenize the text by splitting words into n-grams of the given size(s)
|
||||
///
|
||||
/// With this tokenizer, the `position` field expresses the starting offset of the ngram
|
||||
/// rather than the `token` offset.
|
||||
/// With this tokenizer, the `position` is always 0.
|
||||
/// Beware however, in presence of multiple value for the same field,
|
||||
/// the position will be `POSITION_GAP * index of value`.
|
||||
///
|
||||
/// Example 1: `hello` would be tokenized as (min_gram: 2, max_gram: 3, prefix_only: false)
|
||||
///
|
||||
/// | Term | he | hel | el | ell | ll | llo | lo |
|
||||
/// |----------|-----|-----|-----|-----|-----|-----|----|
|
||||
/// | Position | 0 | 0 | 1 | 1 | 2 | 2 | 3 |
|
||||
/// | Position | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
|
||||
/// | Offsets | 0,2 | 0,3 | 1,3 | 1,4 | 2,4 | 2,5 | 3,5|
|
||||
///
|
||||
/// Example 2: `hello` would be tokenized as (min_gram: 2, max_gram: 5, prefix_only: **true**)
|
||||
@@ -19,24 +20,63 @@ use super::{Token, TokenStream, Tokenizer};
|
||||
/// | Position | 0 | 0 | 0 | 0 |
|
||||
/// | Offsets | 0,2 | 0,3 | 0,4 | 0,5 |
|
||||
///
|
||||
/// Example 3: `hεllo` (non-ascii) would be tokenized as (min_gram: 2, max_gram: 5, prefix_only: **true**)
|
||||
///
|
||||
/// | Term | hε | hεl | hεll | hεllo |
|
||||
/// |----------|-----|-----|-------|-------|
|
||||
/// | Position | 0 | 0 | 0 | 0 |
|
||||
/// | Offsets | 0,3 | 0,4 | 0,5 | 0,6 |
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// extern crate tantivy;
|
||||
/// # extern crate tantivy;
|
||||
/// use tantivy::tokenizer::*;
|
||||
/// use tantivy::tokenizer::assert_token;
|
||||
///
|
||||
/// # fn main() {
|
||||
/// let tokenizer = NgramTokenizer::new(2, 3, false);
|
||||
/// let mut stream = tokenizer.token_stream("hello");
|
||||
///
|
||||
/// assert_token(stream.next().unwrap(), 0, "he", 0, 2);
|
||||
/// assert_token(stream.next().unwrap(), 0, "hel", 0, 3);
|
||||
/// assert_token(stream.next().unwrap(), 1, "el", 1, 3);
|
||||
/// assert_token(stream.next().unwrap(), 1, "ell", 1, 4);
|
||||
/// assert_token(stream.next().unwrap(), 2, "ll", 2, 4);
|
||||
/// assert_token(stream.next().unwrap(), 2, "llo", 2, 5);
|
||||
/// assert_token(stream.next().unwrap(), 3, "lo", 3, 5);
|
||||
/// {
|
||||
/// let token = stream.next().unwrap();
|
||||
/// assert_eq!(token.text, "he");
|
||||
/// assert_eq!(token.offset_from, 0);
|
||||
/// assert_eq!(token.offset_to, 2);
|
||||
/// }
|
||||
/// {
|
||||
/// let token = stream.next().unwrap();
|
||||
/// assert_eq!(token.text, "hel");
|
||||
/// assert_eq!(token.offset_from, 0);
|
||||
/// assert_eq!(token.offset_to, 3);
|
||||
/// }
|
||||
/// {
|
||||
/// let token = stream.next().unwrap();
|
||||
/// assert_eq!(token.text, "el");
|
||||
/// assert_eq!(token.offset_from, 1);
|
||||
/// assert_eq!(token.offset_to, 3);
|
||||
/// }
|
||||
/// {
|
||||
/// let token = stream.next().unwrap();
|
||||
/// assert_eq!(token.text, "ell");
|
||||
/// assert_eq!(token.offset_from, 1);
|
||||
/// assert_eq!(token.offset_to, 4);
|
||||
/// }
|
||||
/// {
|
||||
/// let token = stream.next().unwrap();
|
||||
/// assert_eq!(token.text, "ll");
|
||||
/// assert_eq!(token.offset_from, 2);
|
||||
/// assert_eq!(token.offset_to, 4);
|
||||
/// }
|
||||
/// {
|
||||
/// let token = stream.next().unwrap();
|
||||
/// assert_eq!(token.text, "llo");
|
||||
/// assert_eq!(token.offset_from, 2);
|
||||
/// assert_eq!(token.offset_to, 5);
|
||||
/// }
|
||||
/// {
|
||||
/// let token = stream.next().unwrap();
|
||||
/// assert_eq!(token.text, "lo");
|
||||
/// assert_eq!(token.offset_from, 3);
|
||||
/// assert_eq!(token.offset_to, 5);
|
||||
/// }
|
||||
/// assert!(stream.next().is_none());
|
||||
/// # }
|
||||
/// ```
|
||||
@@ -58,23 +98,37 @@ impl NgramTokenizer {
|
||||
min_gram <= max_gram,
|
||||
"min_gram must not be greater than max_gram"
|
||||
);
|
||||
|
||||
NgramTokenizer {
|
||||
min_gram,
|
||||
max_gram,
|
||||
prefix_only,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a `NGramTokenizer` which generates tokens for all inner ngrams.
|
||||
///
|
||||
/// This is as opposed to only prefix ngrams .
|
||||
pub fn all_ngrams(min_gram: usize, max_gram:usize) -> NgramTokenizer {
|
||||
Self::new(min_gram, max_gram, false)
|
||||
}
|
||||
|
||||
/// Create a `NGramTokenizer` which only generates tokens for the
|
||||
/// prefix ngrams.
|
||||
pub fn prefix_only(min_gram: usize, max_gram: usize) -> NgramTokenizer {
|
||||
Self::new(min_gram, max_gram, true)
|
||||
}
|
||||
}
|
||||
|
||||
/// TokenStream associated with the `NgramTokenizer`
|
||||
pub struct NgramTokenStream<'a> {
|
||||
text: &'a str,
|
||||
position: usize,
|
||||
text_length: usize,
|
||||
token: Token,
|
||||
min_gram: usize,
|
||||
max_gram: usize,
|
||||
gram_size: usize,
|
||||
/// parameters
|
||||
ngram_charidx_iterator: StutteringIterator<CodepointFrontiers<'a>>,
|
||||
/// true if the NgramTokenStream is in prefix mode.
|
||||
prefix_only: bool,
|
||||
/// input
|
||||
text: &'a str,
|
||||
/// output
|
||||
token: Token,
|
||||
}
|
||||
|
||||
impl<'a> Tokenizer<'a> for NgramTokenizer {
|
||||
@@ -82,65 +136,28 @@ impl<'a> Tokenizer<'a> for NgramTokenizer {
|
||||
|
||||
fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl {
|
||||
NgramTokenStream {
|
||||
text,
|
||||
position: 0,
|
||||
text_length: text.len(),
|
||||
token: Token::default(),
|
||||
min_gram: self.min_gram,
|
||||
max_gram: self.max_gram,
|
||||
ngram_charidx_iterator: StutteringIterator::new(
|
||||
CodepointFrontiers::for_str(text),
|
||||
self.min_gram,
|
||||
self.max_gram),
|
||||
prefix_only: self.prefix_only,
|
||||
gram_size: self.min_gram,
|
||||
text,
|
||||
token: Token::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> NgramTokenStream<'a> {
|
||||
/// Get the next set of token options
|
||||
/// cycle through 1,2 (min..=max)
|
||||
/// returning None if processing should stop
|
||||
fn chomp(&mut self) -> Option<(usize, usize)> {
|
||||
// Have we exceeded the bounds of the text we are indexing?
|
||||
if self.gram_size > self.max_gram {
|
||||
if self.prefix_only {
|
||||
return None;
|
||||
}
|
||||
|
||||
// since we aren't just processing edges
|
||||
// we need to reset the gram size
|
||||
self.gram_size = self.min_gram;
|
||||
|
||||
// and move down the chain of letters
|
||||
self.position += 1;
|
||||
}
|
||||
|
||||
let result = if (self.position + self.gram_size) <= self.text_length {
|
||||
Some((self.position, self.gram_size))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// increase the gram size for the next pass
|
||||
self.gram_size += 1;
|
||||
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TokenStream for NgramTokenStream<'a> {
|
||||
fn advance(&mut self) -> bool {
|
||||
// clear out working token text
|
||||
self.token.text.clear();
|
||||
|
||||
if let Some((position, size)) = self.chomp() {
|
||||
self.token.position = position;
|
||||
let offset_from = position;
|
||||
let offset_to = offset_from + size;
|
||||
|
||||
if let Some((offset_from, offset_to)) = self.ngram_charidx_iterator.next() {
|
||||
if self.prefix_only && offset_from > 0 {
|
||||
return false;
|
||||
}
|
||||
self.token.position = 0;
|
||||
self.token.offset_from = offset_from;
|
||||
self.token.offset_to = offset_to;
|
||||
|
||||
self.token.text.clear();
|
||||
self.token.text.push_str(&self.text[offset_from..offset_to]);
|
||||
|
||||
true
|
||||
} else {
|
||||
false
|
||||
@@ -150,8 +167,307 @@ impl<'a> TokenStream for NgramTokenStream<'a> {
|
||||
fn token(&self) -> &Token {
|
||||
&self.token
|
||||
}
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
&mut self.token
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// This iterator takes an underlying Iterator
/// and emits all of the pairs `(a,b)` such that
/// a and b are items emitted by the iterator at
/// an interval between `min_gram` and `max_gram`.
///
/// The elements are emitted in the order of appearance
/// of `a` first, `b` then.
///
/// Once the iterator has returned `None`, every subsequent call to
/// `next()` returns `None` as well.
///
/// See `test_stutterring_iterator` for an example of its
/// output.
struct StutteringIterator<T> {
    underlying: T,
    min_gram: usize,
    max_gram: usize,

    // Ring buffer holding the last items read from `underlying`.
    memory: Vec<usize>,
    // Index in `memory` of the current pair's first element.
    cursor: usize,
    // Distance, in items, between the two elements of the next pair.
    gram_len: usize,
}

impl<T> StutteringIterator<T>
where
    T: Iterator<Item = usize>,
{
    /// Creates the iterator, eagerly reading up to `max_gram + 1` items
    /// from `underlying`.
    ///
    /// If `underlying` yields `min_gram` items or fewer, no pair can be
    /// formed and the returned iterator is empty.
    ///
    /// # Panics
    ///
    /// Panics if `min_gram` is 0.
    pub fn new(mut underlying: T, min_gram: usize, max_gram: usize) -> StutteringIterator<T> {
        assert!(min_gram > 0);
        let memory: Vec<usize> = underlying.by_ref().take(max_gram + 1).collect();
        if memory.len() <= min_gram {
            // returns an empty iterator
            // (`max_gram < min_gram` is the "exhausted" state).
            StutteringIterator {
                underlying,
                min_gram: 1,
                max_gram: 0,
                memory,
                cursor: 0,
                gram_len: 0,
            }
        } else {
            StutteringIterator {
                underlying,
                min_gram,
                // Fewer than `max_gram + 1` items may have been available.
                max_gram: memory.len() - 1,
                memory,
                cursor: 0,
                gram_len: min_gram,
            }
        }
    }
}

impl<T> Iterator for StutteringIterator<T>
where
    T: Iterator<Item = usize>,
{
    type Item = (usize, usize);

    fn next(&mut self) -> Option<(usize, usize)> {
        // Exhausted: bail out before touching any state. Without this
        // guard, calling `next()` again after `None` would decrement
        // `max_gram` below zero and poll the underlying iterator again.
        if self.max_gram < self.min_gram {
            return None;
        }
        if self.gram_len > self.max_gram {
            // we have exhausted all options
            // starting at `self.memory[self.cursor]`.
            //
            // Time to advance.
            self.gram_len = self.min_gram;
            if let Some(next_val) = self.underlying.next() {
                // The slot of the pair start we are leaving is recycled.
                self.memory[self.cursor] = next_val;
            } else {
                // No more input: the window shrinks by one.
                // `max_gram >= min_gram >= 1` here, so this cannot underflow.
                self.max_gram -= 1;
            }
            self.cursor += 1;
            if self.cursor >= self.memory.len() {
                self.cursor = 0;
            }
            if self.max_gram < self.min_gram {
                return None;
            }
        }
        let len = self.memory.len();
        let start = self.memory[self.cursor % len];
        let stop = self.memory[(self.cursor + self.gram_len) % len];
        self.gram_len += 1;
        Some((start, stop))
    }
}
|
||||
|
||||
|
||||
|
||||
/// Emits all of the offsets where a codepoint starts
|
||||
/// or a codepoint ends.
|
||||
///
|
||||
/// By convention, we emit [0] for the empty string.
|
||||
struct CodepointFrontiers<'a> {
|
||||
s: &'a str,
|
||||
next_el: Option<usize>
|
||||
}
|
||||
|
||||
impl<'a> CodepointFrontiers<'a> {
|
||||
fn for_str(s: &'a str) -> Self {
|
||||
CodepointFrontiers {
|
||||
s,
|
||||
next_el: Some(0)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for CodepointFrontiers<'a> {
|
||||
type Item = usize;
|
||||
|
||||
fn next(&mut self) -> Option<usize> {
|
||||
self.next_el
|
||||
.map(|offset| {
|
||||
if self.s.is_empty() {
|
||||
self.next_el = None;
|
||||
} else {
|
||||
let first_codepoint_width = utf8_codepoint_width(self.s.as_bytes()[0]);
|
||||
self.s = &self.s[first_codepoint_width..];
|
||||
self.next_el = Some(offset + first_codepoint_width);
|
||||
}
|
||||
offset
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// Width in bytes of a UTF-8 encoded codepoint, indexed by the
// four most significant bits of its first byte.
const CODEPOINT_UTF8_WIDTH: [u8; 16] = [
    1, 1, 1, 1, 1, 1, 1, 1, // 0xxx
    2, 2, 2, 2, // 10xx (continuation byte, not a valid first byte)
    2, 2, // 110x
    3, // 1110
    4, // 1111
];

// Number of bytes used to encode a codepoint in UTF-8, given
// its first byte.
//
// The width is looked up from the upper four bits of `b`.
fn utf8_codepoint_width(b: u8) -> usize {
    let high_nibble = usize::from(b >> 4);
    usize::from(CODEPOINT_UTF8_WIDTH[high_nibble])
}
|
||||
|
||||
#[cfg(test)]
mod tests {

    use tokenizer::tokenizer::{TokenStream, Tokenizer};
    use super::NgramTokenizer;
    use tokenizer::Token;
    use tokenizer::tests::assert_token;
    use super::CodepointFrontiers;
    use super::StutteringIterator;
    use super::utf8_codepoint_width;

    /// Drains a token stream into a vector of owned tokens.
    fn test_helper<T: TokenStream>(mut tokenizer: T) -> Vec<Token> {
        let mut collected: Vec<Token> = Vec::new();
        tokenizer.process(&mut |token: &Token| collected.push(token.clone()));
        collected
    }

    /// Checks that `tokens` matches `expected`, given as
    /// `(text, offset_from, offset_to)` triples; every position must be 0.
    fn assert_tokens(tokens: &[Token], expected: &[(&str, usize, usize)]) {
        assert_eq!(tokens.len(), expected.len());
        for (token, &(text, from, to)) in tokens.iter().zip(expected.iter()) {
            assert_token(token, 0, text, from, to);
        }
    }

    #[test]
    fn test_utf8_codepoint_width() {
        // 0xxx
        for b in 0u8..0x80 {
            assert_eq!(utf8_codepoint_width(b), 1);
        }
        // 110xx
        for b in 0xC0u8..0xE0 {
            assert_eq!(utf8_codepoint_width(b), 2);
        }
        // 1110xx
        for b in 0xE0u8..0xF0 {
            assert_eq!(utf8_codepoint_width(b), 3);
        }
        // 1111xx
        for b in 0xF0u32..0x100 {
            assert_eq!(utf8_codepoint_width(b as u8), 4);
        }
    }

    #[test]
    fn test_codepoint_frontiers() {
        let frontiers = |text| CodepointFrontiers::for_str(text).collect::<Vec<_>>();
        assert_eq!(frontiers(""), vec![0]);
        assert_eq!(frontiers("abcd"), vec![0, 1, 2, 3, 4]);
        assert_eq!(frontiers("aあ"), vec![0, 1, 4]);
    }

    #[test]
    fn test_ngram_tokenizer_1_2_false() {
        let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hello"));
        assert_tokens(
            &tokens,
            &[
                ("h", 0, 1),
                ("he", 0, 2),
                ("e", 1, 2),
                ("el", 1, 3),
                ("l", 2, 3),
                ("ll", 2, 4),
                ("l", 3, 4),
                ("lo", 3, 5),
                ("o", 4, 5),
            ],
        );
    }

    #[test]
    fn test_ngram_tokenizer_min_max_equal() {
        let tokens = test_helper(NgramTokenizer::all_ngrams(3, 3).token_stream("hello"));
        assert_tokens(&tokens, &[("hel", 0, 3), ("ell", 1, 4), ("llo", 2, 5)]);
    }

    #[test]
    fn test_ngram_tokenizer_2_5_prefix() {
        let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("frankenstein"));
        assert_tokens(
            &tokens,
            &[("fr", 0, 2), ("fra", 0, 3), ("fran", 0, 4), ("frank", 0, 5)],
        );
    }

    #[test]
    fn test_ngram_non_ascii_1_2() {
        let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hεllo"));
        assert_tokens(
            &tokens,
            &[
                ("h", 0, 1),
                ("hε", 0, 3),
                ("ε", 1, 3),
                ("εl", 1, 4),
                ("l", 3, 4),
                ("ll", 3, 5),
                ("l", 4, 5),
                ("lo", 4, 6),
                ("o", 5, 6),
            ],
        );
    }

    #[test]
    fn test_ngram_non_ascii_2_5_prefix() {
        let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("hεllo"));
        assert_tokens(
            &tokens,
            &[("hε", 0, 3), ("hεl", 0, 4), ("hεll", 0, 5), ("hεllo", 0, 6)],
        );
    }

    #[test]
    fn test_ngram_empty() {
        for &min_gram in [1usize, 2].iter() {
            let tokens = test_helper(NgramTokenizer::all_ngrams(min_gram, 5).token_stream(""));
            assert!(tokens.is_empty());
        }
    }

    #[test]
    #[should_panic(expected = "min_gram must be greater than 0")]
    fn test_ngram_min_max_interval_empty() {
        test_helper(NgramTokenizer::all_ngrams(0, 2).token_stream("hellossss"));
    }

    #[test]
    #[should_panic(expected = "min_gram must not be greater than max_gram")]
    fn test_invalid_interval_should_panic_if_smaller() {
        NgramTokenizer::all_ngrams(2, 1);
    }

    #[test]
    fn test_stutterring_iterator_empty() {
        let single: Vec<usize> = vec![0];
        let mut pairs = StutteringIterator::new(single.into_iter(), 1, 2);
        assert_eq!(pairs.next(), None);
    }

    #[test]
    fn test_stutterring_iterator() {
        let input: Vec<usize> = (0..10).collect();
        let pairs: Vec<(usize, usize)> = StutteringIterator::new(input.into_iter(), 1, 2).collect();
        let expected: Vec<(usize, usize)> = vec![
            (0, 1),
            (0, 2),
            (1, 2),
            (1, 3),
            (2, 3),
            (2, 4),
            (3, 4),
            (3, 5),
            (4, 5),
            (4, 6),
            (5, 6),
            (5, 7),
            (6, 7),
            (6, 8),
            (7, 8),
            (7, 9),
            (8, 9),
        ];
        assert_eq!(pairs, expected);
    }

}
|
||||
Reference in New Issue
Block a user