create index

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
This commit is contained in:
Ruihang Xia
2024-04-16 21:16:17 +08:00
parent 64941d848e
commit 182e22dda9
12 changed files with 567 additions and 1 deletions

300
Cargo.lock generated
View File

@@ -957,6 +957,15 @@ version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
[[package]]
name = "bitpacking"
version = "0.9.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c1d3e2bfd8d06048a179f7b17afc3188effa10385e7b00dc65af6aae732ea92"
dependencies = [
"crunchy",
]
[[package]]
name = "bitvec"
version = "1.0.1"
@@ -1303,6 +1312,12 @@ version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "060303ef31ef4a522737e1b1ab68c67916f2a787bb2f4f54f383279adba962b5"
[[package]]
name = "census"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4f4c707c6a209cbe82d10abd08e1ea8995e9ea937d2550646e02798948992be0"
[[package]]
name = "cesu8"
version = "1.1.0"
@@ -3088,6 +3103,12 @@ version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1435fa1053d8b2fbbe9be7e97eca7f33d37b28409959813daefc1446a14247f1"
[[package]]
name = "downcast-rs"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75b325c5dbd37f80359721ad39aca5a29fb04c89279657cffdda8736d0c0b9d2"
[[package]]
name = "dyn-clone"
version = "1.0.16"
@@ -3257,6 +3278,12 @@ version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a"
[[package]]
name = "fastdivide"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59668941c55e5c186b8b58c391629af56774ec768f73c08bbcd56f09348eb00b"
[[package]]
name = "fastrand"
version = "1.9.0"
@@ -3566,6 +3593,16 @@ dependencies = [
"winapi",
]
[[package]]
name = "fs4"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "21dabded2e32cd57ded879041205c60a4a4c4bab47bd0fd2fa8b01f30849f02b"
dependencies = [
"rustix 0.38.28",
"windows-sys 0.52.0",
]
[[package]]
name = "fsevent-sys"
version = "4.1.0"
@@ -3693,6 +3730,19 @@ dependencies = [
"slab",
]
[[package]]
name = "generator"
version = "0.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5cc16584ff22b460a382b7feec54b23d2908d858152e5739a120b949293bd74e"
dependencies = [
"cc",
"libc",
"log",
"rustversion",
"windows 0.48.0",
]
[[package]]
name = "generic-array"
version = "0.14.7"
@@ -3979,6 +4029,12 @@ dependencies = [
"utf8-width",
]
[[package]]
name = "htmlescape"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e9025058dae765dee5070ec375f591e2ba14638c63feff74f13805a72e523163"
[[package]]
name = "http"
version = "0.2.11"
@@ -4279,6 +4335,7 @@ dependencies = [
"regex",
"regex-automata 0.4.3",
"snafu",
"tantivy",
"tempfile",
"tokio",
"tokio-util",
@@ -4658,6 +4715,12 @@ version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
[[package]]
name = "levenshtein_automata"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c2cdeb66e45e9f36bfad5bbdb4d2384e70936afbee843c6f6543f0c551ebb25"
[[package]]
name = "lexical-core"
version = "0.8.5"
@@ -4870,6 +4933,20 @@ dependencies = [
"uuid",
]
[[package]]
name = "loom"
version = "0.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff50ecb28bb86013e935fb6683ab1f6d3a20016f123c76fd4c27470076ac30f5"
dependencies = [
"cfg-if 1.0.0",
"generator",
"pin-utils",
"scoped-tls",
"tracing",
"tracing-subscriber",
]
[[package]]
name = "lrlex"
version = "0.12.0"
@@ -4961,6 +5038,12 @@ dependencies = [
"twox-hash",
]
[[package]]
name = "lz4_flex"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5"
[[package]]
name = "lzma-sys"
version = "0.1.20"
@@ -5059,6 +5142,16 @@ version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
[[package]]
name = "measure_time"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56220900f1a0923789ecd6bf25fbae8af3b2f1ff3e9e297fc9b6b8674dd4d852"
dependencies = [
"instant",
"log",
]
[[package]]
name = "memchr"
version = "2.6.4"
@@ -5427,6 +5520,12 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97af489e1e21b68de4c390ecca6703318bc1aa16e9733bcb62c089b73c6fbb1b"
[[package]]
name = "murmurhash32"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2195bf6aa996a481483b29d62a7663eed3fe39600c460e323f8ff41e90bdd89b"
[[package]]
name = "mysql-common-derive"
version = "0.30.2"
@@ -5948,6 +6047,15 @@ version = "1.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
[[package]]
name = "oneshot"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f6640c6bda7731b1fdbab747981a0f896dd1fedaf9f4a53fa237a04a84431f4"
dependencies = [
"loom",
]
[[package]]
name = "oorandom"
version = "11.1.3"
@@ -6293,6 +6401,15 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
[[package]]
name = "ownedbytes"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3a059efb063b8f425b948e042e6b9bd85edfe60e913630ed727b23e2dfcc558"
dependencies = [
"stable_deref_trait",
]
[[package]]
name = "packedvec"
version = "1.2.4"
@@ -8102,6 +8219,16 @@ dependencies = [
"tree-sitter-cli",
]
[[package]]
name = "rust-stemmers"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e46a2036019fdb888131db7a4c847a1063a7493f971ed94ea82c67eada63ca54"
dependencies = [
"serde",
"serde_derive",
]
[[package]]
name = "rust_decimal"
version = "1.33.1"
@@ -8332,7 +8459,7 @@ dependencies = [
"bitflags 1.3.2",
"bstr",
"itertools 0.10.5",
"lz4_flex",
"lz4_flex 0.9.5",
"num-bigint",
"num-complex",
]
@@ -8723,6 +8850,12 @@ dependencies = [
"syn 1.0.109",
]
[[package]]
name = "scoped-tls"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e1cf6437eb19a8f4a6cc0f7dca544973b0b78843adbfeb3683d1a94a0024a294"
[[package]]
name = "scopeguard"
version = "1.2.0"
@@ -9259,6 +9392,15 @@ dependencies = [
"walkdir",
]
[[package]]
name = "sketches-ddsketch"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85636c14b73d81f541e525f585c0a2109e6744e1565b5c1668e31c70c10ed65c"
dependencies = [
"serde",
]
[[package]]
name = "slab"
version = "0.4.9"
@@ -9992,6 +10134,147 @@ version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417"
[[package]]
name = "tantivy"
version = "0.22.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8d0582f186c0a6d55655d24543f15e43607299425c5ad8352c242b914b31856"
dependencies = [
"aho-corasick",
"arc-swap",
"base64 0.22.0",
"bitpacking",
"byteorder",
"census",
"crc32fast",
"crossbeam-channel",
"downcast-rs",
"fastdivide",
"fnv",
"fs4",
"htmlescape",
"itertools 0.12.0",
"levenshtein_automata",
"log",
"lru",
"lz4_flex 0.11.3",
"measure_time",
"memmap2 0.9.3",
"num_cpus",
"once_cell",
"oneshot",
"rayon",
"regex",
"rust-stemmers",
"rustc-hash",
"serde",
"serde_json",
"sketches-ddsketch",
"smallvec",
"tantivy-bitpacker",
"tantivy-columnar",
"tantivy-common",
"tantivy-fst",
"tantivy-query-grammar",
"tantivy-stacker",
"tantivy-tokenizer-api",
"tempfile",
"thiserror",
"time",
"uuid",
"winapi",
]
[[package]]
name = "tantivy-bitpacker"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "284899c2325d6832203ac6ff5891b297fc5239c3dc754c5bc1977855b23c10df"
dependencies = [
"bitpacking",
]
[[package]]
name = "tantivy-columnar"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "12722224ffbe346c7fec3275c699e508fd0d4710e629e933d5736ec524a1f44e"
dependencies = [
"downcast-rs",
"fastdivide",
"itertools 0.12.0",
"serde",
"tantivy-bitpacker",
"tantivy-common",
"tantivy-sstable",
"tantivy-stacker",
]
[[package]]
name = "tantivy-common"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8019e3cabcfd20a1380b491e13ff42f57bb38bf97c3d5fa5c07e50816e0621f4"
dependencies = [
"async-trait",
"byteorder",
"ownedbytes",
"serde",
"time",
]
[[package]]
name = "tantivy-fst"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d60769b80ad7953d8a7b2c70cdfe722bbcdcac6bccc8ac934c40c034d866fc18"
dependencies = [
"byteorder",
"regex-syntax 0.8.2",
"utf8-ranges",
]
[[package]]
name = "tantivy-query-grammar"
version = "0.22.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "847434d4af57b32e309f4ab1b4f1707a6c566656264caa427ff4285c4d9d0b82"
dependencies = [
"nom",
]
[[package]]
name = "tantivy-sstable"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c69578242e8e9fc989119f522ba5b49a38ac20f576fc778035b96cc94f41f98e"
dependencies = [
"tantivy-bitpacker",
"tantivy-common",
"tantivy-fst",
"zstd 0.13.0",
]
[[package]]
name = "tantivy-stacker"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c56d6ff5591fc332739b3ce7035b57995a3ce29a93ffd6012660e0949c956ea8"
dependencies = [
"murmurhash32",
"rand_distr",
"tantivy-common",
]
[[package]]
name = "tantivy-tokenizer-api"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2a0dcade25819a89cfe6f17d932c9cedff11989936bf6dd4f336d50392053b04"
dependencies = [
"serde",
]
[[package]]
name = "tap"
version = "1.0.1"
@@ -11311,6 +11594,12 @@ version = "2.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da"
[[package]]
name = "utf8-ranges"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7fcfc827f90e53a02eaef5e535ee14266c1d569214c6aa70133a624d8a3164ba"
[[package]]
name = "utf8-width"
version = "0.1.7"
@@ -11666,6 +11955,15 @@ dependencies = [
"windows_x86_64_msvc 0.39.0",
]
[[package]]
name = "windows"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f"
dependencies = [
"windows-targets 0.48.5",
]
[[package]]
name = "windows"
version = "0.52.0"

View File

@@ -25,6 +25,7 @@ prost.workspace = true
regex.workspace = true
regex-automata.workspace = true
snafu.workspace = true
tantivy = "0.22"
[dev-dependencies]
rand.workspace = true

View File

@@ -0,0 +1,17 @@
// Copyright 2024 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/// Index construction; exposes `create::FullTextIndexCreater`.
pub mod create;
/// Error enum and `Result` alias shared by the full-text index modules.
pub mod error;
/// Index querying (presumably home of `FullTextIndexSearcher` — confirm against the module file).
pub mod search;

View File

@@ -0,0 +1,79 @@
// Copyright 2024 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::BTreeMap;
use std::path::Path;
use snafu::ResultExt;
use tantivy::schema::{OwnedValue, Schema, INDEXED, STORED, TEXT};
use tantivy::{Document, Index, IndexWriter, TantivyDocument};
use super::error::TantivySnafu;
use crate::full_text_index::error::Result;
/// Builds a tantivy-backed full-text index from a stream of text rows.
///
/// Every pushed row becomes one document holding the row text and the number
/// of the row segment it falls into (`row index / segment_size`).
pub struct FullTextIndexCreater {
    /// Index the writer was created from.
    index: Index,
    /// Writer used to add documents and to commit them.
    writer: IndexWriter,
    /// Schema field `seg_count`: segment number of the row (indexed and stored).
    count_field: tantivy::schema::Field,
    /// Schema field `text`: the row content.
    text_field: tantivy::schema::Field,
    /// Number of rows pushed since construction or since the last `finish`.
    row_count: usize,
    /// Number of consecutive rows that share one segment number.
    segment_size: usize,
}

impl FullTextIndexCreater {
    /// Creates a new index in the directory `path` and opens a writer on it.
    ///
    /// `segment_size` is the number of consecutive rows mapped to the same
    /// segment number; it must be non-zero (see [`Self::push_string`]).
    ///
    /// # Errors
    ///
    /// Returns the `Tantivy` error variant if the index cannot be created in
    /// `path` or the writer cannot be opened.
    pub fn new<P>(segment_size: usize, path: P) -> Result<Self>
    where
        P: AsRef<Path>,
    {
        /// Memory budget handed to the tantivy writer (100 MB).
        const WRITER_MEMORY_BUDGET_BYTES: usize = 100_000_000;

        // build schema
        let mut schema_builder = Schema::builder();
        let count_field = schema_builder.add_i64_field("seg_count", INDEXED | STORED);
        let text_field = schema_builder.add_text_field("text", TEXT);
        let schema = schema_builder.build();

        // build index
        let index = Index::create_in_dir(path, schema).context(TantivySnafu)?;

        // build writer
        let writer = index
            .writer(WRITER_MEMORY_BUDGET_BYTES)
            .context(TantivySnafu)?;

        Ok(Self {
            index,
            writer,
            count_field,
            text_field,
            row_count: 0,
            segment_size,
        })
    }

    /// Adds one row to the index and advances the row counter.
    ///
    /// The document is only queued in the writer; it is persisted when
    /// [`Self::finish`] commits. Committing per document is avoided on
    /// purpose: every commit is a full flush and produces its own segment,
    /// which makes bulk ingestion extremely slow.
    ///
    /// # Errors
    ///
    /// Returns the `Tantivy` error variant if the writer rejects the document.
    ///
    /// # Panics
    ///
    /// Panics with a division by zero if the creator was built with a
    /// `segment_size` of `0`.
    pub fn push_string(&mut self, content: String) -> Result<()> {
        let segment = (self.row_count / self.segment_size) as i64;

        let mut doc = TantivyDocument::new();
        doc.add_text(self.text_field, content);
        doc.add_i64(self.count_field, segment);
        self.writer.add_document(doc).context(TantivySnafu)?;

        self.row_count += 1;
        Ok(())
    }

    /// Resets the row counter and commits all queued documents.
    ///
    /// # Errors
    ///
    /// Returns the `Tantivy` error variant if the commit fails.
    pub fn finish(&mut self) -> Result<()> {
        self.row_count = 0;
        self.writer.commit().context(TantivySnafu)?;
        Ok(())
    }
}

View File

@@ -0,0 +1,45 @@
// Copyright 2024 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use common_macro::stack_trace_debug;
use snafu::{Location, Snafu};
use tantivy::directory::error::OpenDirectoryError;
/// Errors produced by the full-text index modules.
///
/// Every variant wraps the underlying tantivy error as its snafu source and
/// carries a `location` field.
#[derive(Snafu)]
#[snafu(visibility(pub))]
#[stack_trace_debug]
pub enum Error {
// Generic failure reported by tantivy.
#[snafu(display("Tantivy error"))]
Tantivy {
#[snafu(source)]
error: tantivy::TantivyError,
location: Location,
},
// Wraps tantivy's `OpenDirectoryError`.
#[snafu(display("Failed to open directory"))]
OpenDirectory {
#[snafu(source)]
error: OpenDirectoryError,
location: Location,
},
// Wraps tantivy's `QueryParserError`.
#[snafu(display("Failed to parse tantivy query"))]
ParseQuery {
#[snafu(source)]
error: tantivy::query::QueryParserError,
location: Location,
},
}
/// Result alias whose error type is this module's [`Error`].
pub type Result<T> = std::result::Result<T, Error>;

View File

@@ -0,0 +1,74 @@
// Copyright 2024 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashSet;
use std::path::Path;
use snafu::ResultExt;
use tantivy::directory::MmapDirectory;
use tantivy::query::QueryParser;
use tantivy::schema::Value;
use tantivy::{Index, IndexReader, TantivyDocument, TantivyError};
use super::error::ParseQuerySnafu;
use crate::full_text_index::error::{OpenDirectorySnafu, Result, TantivySnafu};
pub struct FullTextIndexSearcher {
index: Index,
count_field: tantivy::schema::Field,
text_field: tantivy::schema::Field,
reader: IndexReader,
}
impl FullTextIndexSearcher {
pub fn open<P>(path: P) -> Result<Self>
where
P: AsRef<Path>,
{
let index = Index::open_in_dir(path).context(TantivySnafu)?;
let schema = index.schema();
let count_field = schema.get_field("seg_count").unwrap();
let text_field = schema.get_field("text").unwrap();
let reader = index.reader().context(TantivySnafu)?;
Ok(Self {
index,
count_field,
text_field,
reader,
})
}
pub fn search(&self, query: &str) -> Result<Vec<usize>> {
let searcher = self.reader.searcher();
let query_parser = QueryParser::for_index(&self.index, vec![self.text_field]);
let query = query_parser.parse_query(query).context(ParseQuerySnafu)?;
let top_docs = searcher
.search(&query, &tantivy::collector::TopDocs::with_limit(100))
.context(TantivySnafu)?;
let mut result = HashSet::new();
for (_score, doc_address) in top_docs {
let retrieved_doc = searcher
.doc::<TantivyDocument>(doc_address)
.context(TantivySnafu)?;
let seg_count = retrieved_doc
.get_first(self.count_field)
.unwrap()
.as_i64()
.unwrap();
result.insert(seg_count);
}
Ok(result.into_iter().map(|x| x as _).collect())
}
}

View File

@@ -14,4 +14,5 @@
#![feature(iter_partition_in_place)]
pub mod full_text_index;
pub mod inverted_index;

View File

@@ -336,6 +336,11 @@ impl ScanRegion {
.flatten()
.map(Arc::new)
}
/// Placeholder for building the applier of the full-text index.
///
/// Not implemented yet: the body is `todo!()`, so calling this method panics.
fn build_full_text_index_applier(&self) -> Option<SstIndexApplierRef> {
// Entry point for the upcoming implementation.
todo!()
}
}
/// Config for parallel scan.

View File

@@ -189,6 +189,9 @@ impl<'a> IndexerBuilder<'a> {
segment_row_count = row_group_size;
}
// find a column named "log"
let log_column_id = self.metadata.column_by_name("log").map(|c| c.column_id);
let creator = SstIndexCreator::new(
self.file_path,
self.file_id,
@@ -197,6 +200,7 @@ impl<'a> IndexerBuilder<'a> {
self.intermediate_manager,
self.mem_threshold_index_create,
segment_row_count,
log_column_id,
)
.with_buffer_size(self.write_buffer_size)
.with_ignore_column_ids(

View File

@@ -21,6 +21,9 @@ use std::sync::atomic::AtomicUsize;
use std::sync::Arc;
use common_telemetry::warn;
use datatypes::scalars::ScalarVector;
use datatypes::vectors::StringVector;
use index::full_text_index::create::FullTextIndexCreater;
use index::inverted_index::create::sort::external_sort::ExternalSorter;
use index::inverted_index::create::sort_create::SortIndexCreator;
use index::inverted_index::create::InvertedIndexCreator;
@@ -29,6 +32,7 @@ use object_store::ObjectStore;
use puffin::file_format::writer::{Blob, PuffinAsyncWriter, PuffinFileWriter};
use snafu::{ensure, ResultExt};
use store_api::metadata::RegionMetadataRef;
use store_api::storage::ConcreteDataType;
use tokio::io::duplex;
use tokio_util::compat::{TokioAsyncReadCompatExt, TokioAsyncWriteCompatExt};
@@ -83,6 +87,10 @@ pub struct SstIndexCreator {
/// The memory usage of the index creator.
memory_usage: Arc<AtomicUsize>,
// experimental full text index
full_text_index_creater: FullTextIndexCreater,
log_column_id: Option<u32>,
}
impl SstIndexCreator {
@@ -96,6 +104,7 @@ impl SstIndexCreator {
intermediate_manager: IntermediateManager,
memory_usage_threshold: Option<usize>,
segment_row_count: NonZeroUsize,
log_column_id: Option<u32>,
) -> Self {
let temp_file_provider = Arc::new(TempFileProvider::new(
IntermediateLocation::new(&metadata.region_id, &sst_file_id),
@@ -112,6 +121,10 @@ impl SstIndexCreator {
);
let index_creator = Box::new(SortIndexCreator::new(sorter, segment_row_count));
let full_text_index_path = format!("{file_path}/full_text_index");
let full_text_index_creater =
FullTextIndexCreater::new(segment_row_count.get(), full_text_index_path).unwrap();
let codec = IndexValuesCodec::from_tag_columns(metadata.primary_key_columns());
Self {
file_path,
@@ -127,6 +140,9 @@ impl SstIndexCreator {
ignore_column_ids: HashSet::default(),
memory_usage,
full_text_index_creater,
log_column_id,
}
}
@@ -233,6 +249,23 @@ impl SstIndexCreator {
.context(PushIndexValueSnafu)?;
}
// try find column named "log" and update it into full text index
if let Some(log_column_id) = self.log_column_id {
for col in batch.fields() {
if col.column_id == log_column_id {
let vector = &col.data;
if vector.data_type() == ConcreteDataType::string_datatype() {
let vector = vector.as_any().downcast_ref::<StringVector>().unwrap();
for content in vector.iter_data() {
self.full_text_index_creater
.push_string(content.unwrap_or_default().to_string())
.unwrap();
}
}
}
}
}
Ok(())
}
@@ -296,6 +329,8 @@ impl SstIndexCreator {
_ => {}
}
self.full_text_index_creater.finish().unwrap();
let byte_count = puffin_writer.finish().await.context(PuffinFinishSnafu)?;
guard.inc_byte_count(byte_count);
Ok(())
@@ -421,6 +456,7 @@ mod tests {
intm_mgr,
memory_threshold,
NonZeroUsize::new(segment_row_count).unwrap(),
None,
);
for (str_tag, i32_tag) in &tags {

View File

@@ -29,6 +29,10 @@ pub fn index_file_path(region_dir: &str, sst_file_id: FileId) -> String {
util::join_path(&dir, &sst_file_id.as_puffin())
}
/// Returns the result of joining `region_dir` with the `full_text_index`
/// directory name via `util::join_dir`.
pub fn full_text_index_path(region_dir: &str) -> String {
    const FULL_TEXT_INDEX_DIR: &str = "full_text_index";
    util::join_dir(region_dir, FULL_TEXT_INDEX_DIR)
}
#[cfg(test)]
mod tests {
use super::*;

View File

@@ -284,6 +284,8 @@ impl ParquetReaderBuilder {
.await
.or_else(|| self.prune_row_groups_by_minmax(read_format, parquet_meta, metrics))
.unwrap_or_else(|| (0..num_row_groups).map(|i| (i, None)).collect())
// todo: change here
}
/// Applies index to prune row groups.