diff --git a/Cargo.lock b/Cargo.lock index d4f44c1d83..5dcdc036f9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -957,6 +957,15 @@ version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" +[[package]] +name = "bitpacking" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c1d3e2bfd8d06048a179f7b17afc3188effa10385e7b00dc65af6aae732ea92" +dependencies = [ + "crunchy", +] + [[package]] name = "bitvec" version = "1.0.1" @@ -1303,6 +1312,12 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "060303ef31ef4a522737e1b1ab68c67916f2a787bb2f4f54f383279adba962b5" +[[package]] +name = "census" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f4c707c6a209cbe82d10abd08e1ea8995e9ea937d2550646e02798948992be0" + [[package]] name = "cesu8" version = "1.1.0" @@ -3088,6 +3103,12 @@ version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1435fa1053d8b2fbbe9be7e97eca7f33d37b28409959813daefc1446a14247f1" +[[package]] +name = "downcast-rs" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75b325c5dbd37f80359721ad39aca5a29fb04c89279657cffdda8736d0c0b9d2" + [[package]] name = "dyn-clone" version = "1.0.16" @@ -3257,6 +3278,12 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" +[[package]] +name = "fastdivide" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59668941c55e5c186b8b58c391629af56774ec768f73c08bbcd56f09348eb00b" + [[package]] name = "fastrand" version = "1.9.0" @@ -3566,6 +3593,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "fs4" +version = "0.8.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "21dabded2e32cd57ded879041205c60a4a4c4bab47bd0fd2fa8b01f30849f02b" +dependencies = [ + "rustix 0.38.28", + "windows-sys 0.52.0", +] + [[package]] name = "fsevent-sys" version = "4.1.0" @@ -3693,6 +3730,19 @@ dependencies = [ "slab", ] +[[package]] +name = "generator" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cc16584ff22b460a382b7feec54b23d2908d858152e5739a120b949293bd74e" +dependencies = [ + "cc", + "libc", + "log", + "rustversion", + "windows 0.48.0", +] + [[package]] name = "generic-array" version = "0.14.7" @@ -3979,6 +4029,12 @@ dependencies = [ "utf8-width", ] +[[package]] +name = "htmlescape" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9025058dae765dee5070ec375f591e2ba14638c63feff74f13805a72e523163" + [[package]] name = "http" version = "0.2.11" @@ -4279,6 +4335,7 @@ dependencies = [ "regex", "regex-automata 0.4.3", "snafu", + "tantivy", "tempfile", "tokio", "tokio-util", @@ -4658,6 +4715,12 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" +[[package]] +name = "levenshtein_automata" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c2cdeb66e45e9f36bfad5bbdb4d2384e70936afbee843c6f6543f0c551ebb25" + [[package]] name = "lexical-core" version = "0.8.5" @@ -4870,6 +4933,20 @@ dependencies = [ "uuid", ] +[[package]] +name = "loom" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff50ecb28bb86013e935fb6683ab1f6d3a20016f123c76fd4c27470076ac30f5" +dependencies = [ + "cfg-if 1.0.0", + "generator", + "pin-utils", + "scoped-tls", + "tracing", + "tracing-subscriber", +] + [[package]] name = "lrlex" version = "0.12.0" @@ -4961,6 +5038,12 @@ dependencies = [ "twox-hash", ] +[[package]] 
+name = "lz4_flex" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5" + [[package]] name = "lzma-sys" version = "0.1.20" @@ -5059,6 +5142,16 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" +[[package]] +name = "measure_time" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56220900f1a0923789ecd6bf25fbae8af3b2f1ff3e9e297fc9b6b8674dd4d852" +dependencies = [ + "instant", + "log", +] + [[package]] name = "memchr" version = "2.6.4" @@ -5427,6 +5520,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "97af489e1e21b68de4c390ecca6703318bc1aa16e9733bcb62c089b73c6fbb1b" +[[package]] +name = "murmurhash32" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2195bf6aa996a481483b29d62a7663eed3fe39600c460e323f8ff41e90bdd89b" + [[package]] name = "mysql-common-derive" version = "0.30.2" @@ -5948,6 +6047,15 @@ version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +[[package]] +name = "oneshot" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f6640c6bda7731b1fdbab747981a0f896dd1fedaf9f4a53fa237a04a84431f4" +dependencies = [ + "loom", +] + [[package]] name = "oorandom" version = "11.1.3" @@ -6293,6 +6401,15 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" +[[package]] +name = "ownedbytes" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3a059efb063b8f425b948e042e6b9bd85edfe60e913630ed727b23e2dfcc558" 
+dependencies = [ + "stable_deref_trait", +] + [[package]] name = "packedvec" version = "1.2.4" @@ -8102,6 +8219,16 @@ dependencies = [ "tree-sitter-cli", ] +[[package]] +name = "rust-stemmers" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e46a2036019fdb888131db7a4c847a1063a7493f971ed94ea82c67eada63ca54" +dependencies = [ + "serde", + "serde_derive", +] + [[package]] name = "rust_decimal" version = "1.33.1" @@ -8332,7 +8459,7 @@ dependencies = [ "bitflags 1.3.2", "bstr", "itertools 0.10.5", - "lz4_flex", + "lz4_flex 0.9.5", "num-bigint", "num-complex", ] @@ -8723,6 +8850,12 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "scoped-tls" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1cf6437eb19a8f4a6cc0f7dca544973b0b78843adbfeb3683d1a94a0024a294" + [[package]] name = "scopeguard" version = "1.2.0" @@ -9259,6 +9392,15 @@ dependencies = [ "walkdir", ] +[[package]] +name = "sketches-ddsketch" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85636c14b73d81f541e525f585c0a2109e6744e1565b5c1668e31c70c10ed65c" +dependencies = [ + "serde", +] + [[package]] name = "slab" version = "0.4.9" @@ -9992,6 +10134,147 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" +[[package]] +name = "tantivy" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8d0582f186c0a6d55655d24543f15e43607299425c5ad8352c242b914b31856" +dependencies = [ + "aho-corasick", + "arc-swap", + "base64 0.22.0", + "bitpacking", + "byteorder", + "census", + "crc32fast", + "crossbeam-channel", + "downcast-rs", + "fastdivide", + "fnv", + "fs4", + "htmlescape", + "itertools 0.12.0", + "levenshtein_automata", + "log", + "lru", + "lz4_flex 0.11.3", + "measure_time", + "memmap2 0.9.3", + "num_cpus", + 
"once_cell", + "oneshot", + "rayon", + "regex", + "rust-stemmers", + "rustc-hash", + "serde", + "serde_json", + "sketches-ddsketch", + "smallvec", + "tantivy-bitpacker", + "tantivy-columnar", + "tantivy-common", + "tantivy-fst", + "tantivy-query-grammar", + "tantivy-stacker", + "tantivy-tokenizer-api", + "tempfile", + "thiserror", + "time", + "uuid", + "winapi", +] + +[[package]] +name = "tantivy-bitpacker" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "284899c2325d6832203ac6ff5891b297fc5239c3dc754c5bc1977855b23c10df" +dependencies = [ + "bitpacking", +] + +[[package]] +name = "tantivy-columnar" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12722224ffbe346c7fec3275c699e508fd0d4710e629e933d5736ec524a1f44e" +dependencies = [ + "downcast-rs", + "fastdivide", + "itertools 0.12.0", + "serde", + "tantivy-bitpacker", + "tantivy-common", + "tantivy-sstable", + "tantivy-stacker", +] + +[[package]] +name = "tantivy-common" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8019e3cabcfd20a1380b491e13ff42f57bb38bf97c3d5fa5c07e50816e0621f4" +dependencies = [ + "async-trait", + "byteorder", + "ownedbytes", + "serde", + "time", +] + +[[package]] +name = "tantivy-fst" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d60769b80ad7953d8a7b2c70cdfe722bbcdcac6bccc8ac934c40c034d866fc18" +dependencies = [ + "byteorder", + "regex-syntax 0.8.2", + "utf8-ranges", +] + +[[package]] +name = "tantivy-query-grammar" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "847434d4af57b32e309f4ab1b4f1707a6c566656264caa427ff4285c4d9d0b82" +dependencies = [ + "nom", +] + +[[package]] +name = "tantivy-sstable" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c69578242e8e9fc989119f522ba5b49a38ac20f576fc778035b96cc94f41f98e" +dependencies = [ + "tantivy-bitpacker", + "tantivy-common", + "tantivy-fst", + "zstd 0.13.0", +] + +[[package]] +name = "tantivy-stacker" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c56d6ff5591fc332739b3ce7035b57995a3ce29a93ffd6012660e0949c956ea8" +dependencies = [ + "murmurhash32", + "rand_distr", + "tantivy-common", +] + +[[package]] +name = "tantivy-tokenizer-api" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a0dcade25819a89cfe6f17d932c9cedff11989936bf6dd4f336d50392053b04" +dependencies = [ + "serde", +] + [[package]] name = "tap" version = "1.0.1" @@ -11311,6 +11594,12 @@ version = "2.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" +[[package]] +name = "utf8-ranges" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fcfc827f90e53a02eaef5e535ee14266c1d569214c6aa70133a624d8a3164ba" + [[package]] name = "utf8-width" version = "0.1.7" @@ -11666,6 +11955,15 @@ dependencies = [ "windows_x86_64_msvc 0.39.0", ] +[[package]] +name = "windows" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" +dependencies = [ + "windows-targets 0.48.5", +] + [[package]] name = "windows" version = "0.52.0" diff --git a/src/index/Cargo.toml b/src/index/Cargo.toml index effa0a79e6..65df20b2fc 100644 --- a/src/index/Cargo.toml +++ b/src/index/Cargo.toml @@ -25,6 +25,7 @@ prost.workspace = true regex.workspace = true regex-automata.workspace = true snafu.workspace = true +tantivy = "0.22" [dev-dependencies] rand.workspace = true diff --git a/src/index/src/full_text_index.rs b/src/index/src/full_text_index.rs new file mode 100644 index 0000000000..10d79a9092 --- 
/dev/null +++ b/src/index/src/full_text_index.rs @@ -0,0 +1,17 @@ +// Copyright 2024 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod create; +pub mod error; +pub mod search; 
diff --git a/src/index/src/full_text_index/create.rs b/src/index/src/full_text_index/create.rs
new file mode 100644
index 0000000000..1a913f8621
--- /dev/null
+++ b/src/index/src/full_text_index/create.rs
@@ -0,0 +1,113 @@
+// Copyright 2024 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::collections::BTreeMap;
+use std::path::Path;
+
+use snafu::ResultExt;
+use tantivy::schema::{OwnedValue, Schema, INDEXED, STORED, TEXT};
+use tantivy::{Document, Index, IndexWriter, TantivyDocument};
+
+use super::error::TantivySnafu;
+use crate::full_text_index::error::Result;
+
+/// Memory budget, in bytes, handed to the tantivy index writer (100 MB).
+const WRITER_MEMORY_BUDGET_BYTES: usize = 100_000_000;
+
+/// Builds an on-disk tantivy index in which every row is one document that
+/// carries the row text and the number of the segment the row belongs to.
+pub struct FullTextIndexCreater {
+    index: Index,
+    writer: IndexWriter,
+    count_field: tantivy::schema::Field,
+    text_field: tantivy::schema::Field,
+
+    /// Number of rows pushed since creation or the last `finish`.
+    row_count: usize,
+    /// Number of consecutive rows that share one segment number; never zero.
+    segment_size: usize,
+}
+
+impl FullTextIndexCreater {
+    /// Creates a new index in the directory `path`.
+    ///
+    /// Rows are numbered in push order and row `n` is tagged with segment
+    /// `n / segment_size`.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if `segment_size` is zero, or if tantivy fails to
+    /// create the index or its writer.
+    pub fn new<P>(segment_size: usize, path: P) -> Result<Self>
+    where
+        P: AsRef<Path>,
+    {
+        // Reject a zero segment size up front: `push_string` divides by it.
+        if segment_size == 0 {
+            return Err(tantivy::TantivyError::InvalidArgument(
+                "segment_size must be greater than zero".to_string(),
+            ))
+            .context(TantivySnafu);
+        }
+
+        // build schema
+        let mut schema_builder = Schema::builder();
+        let count_field = schema_builder.add_i64_field("seg_count", INDEXED | STORED);
+        let text_field = schema_builder.add_text_field("text", TEXT);
+        let schema = schema_builder.build();
+
+        // build index
+        let index = Index::create_in_dir(path, schema).context(TantivySnafu)?;
+
+        // build writer
+        let writer = index
+            .writer(WRITER_MEMORY_BUDGET_BYTES)
+            .context(TantivySnafu)?;
+
+        Ok(Self {
+            index,
+            writer,
+            count_field,
+            text_field,
+            row_count: 0,
+            segment_size,
+        })
+    }
+
+    /// Adds one row to the index.
+    ///
+    /// The document is only queued in the writer; it is committed by
+    /// [`Self::finish`]. Committing after every row would run a full tantivy
+    /// commit per row.
+    pub fn push_string(&mut self, content: String) -> Result<()> {
+        let segment = (self.row_count / self.segment_size) as i64;
+
+        let mut doc = TantivyDocument::new();
+        doc.add_text(self.text_field, content);
+        doc.add_i64(self.count_field, segment);
+        self.writer.add_document(doc).context(TantivySnafu)?;
+        self.row_count += 1;
+
+        Ok(())
+    }
+
+    /// Commits every queued document and resets the row counter.
+    ///
+    /// The counter is reset only after the commit succeeded.
+    pub fn finish(&mut self) -> Result<()> {
+        self.writer.commit().context(TantivySnafu)?;
+        self.row_count = 0;
+        Ok(())
+    }
+}
diff --git a/src/index/src/full_text_index/error.rs b/src/index/src/full_text_index/error.rs new file mode 100644 index 0000000000..7328101e4d --- /dev/null +++ b/src/index/src/full_text_index/error.rs @@ -0,0 +1,45 @@ +// Copyright 2024 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+use common_macro::stack_trace_debug;
+use snafu::{Location, Snafu};
+use tantivy::directory::error::OpenDirectoryError;
+
+#[derive(Snafu)]
+#[snafu(visibility(pub))]
+#[stack_trace_debug]
+pub enum Error {
+    #[snafu(display("Tantivy error"))]
+    Tantivy {
+        #[snafu(source)]
+        error: tantivy::TantivyError,
+        location: Location,
+    },
+
+    #[snafu(display("Failed to open directory"))]
+    OpenDirectory {
+        #[snafu(source)]
+        error: OpenDirectoryError,
+        location: Location,
+    },
+
+    #[snafu(display("Failed to parse tantivy query"))]
+    ParseQuery {
+        #[snafu(source)]
+        error: tantivy::query::QueryParserError,
+        location: Location,
+    },
+}
+
+pub type Result<T> = std::result::Result<T, Error>;
diff --git a/src/index/src/full_text_index/search.rs b/src/index/src/full_text_index/search.rs
new file mode 100644
index 0000000000..6b75cabe3b
--- /dev/null
+++ b/src/index/src/full_text_index/search.rs
@@ -0,0 +1,100 @@
+// Copyright 2024 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::collections::HashSet;
+use std::path::Path;
+
+use snafu::ResultExt;
+use tantivy::directory::MmapDirectory;
+use tantivy::query::QueryParser;
+use tantivy::schema::Value;
+use tantivy::{Index, IndexReader, TantivyDocument, TantivyError};
+
+use super::error::ParseQuerySnafu;
+use crate::full_text_index::error::{OpenDirectorySnafu, Result, TantivySnafu};
+
+/// Searches an on-disk tantivy index that has a `seg_count` i64 field and a
+/// `text` field, and reports which segment numbers contain matching rows.
+pub struct FullTextIndexSearcher {
+    index: Index,
+    count_field: tantivy::schema::Field,
+    text_field: tantivy::schema::Field,
+    reader: IndexReader,
+}
+
+impl FullTextIndexSearcher {
+    /// Opens the index stored in the directory `path`.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the index cannot be opened, if it has no
+    /// `seg_count` or `text` field, or if the reader cannot be created.
+    pub fn open<P>(path: P) -> Result<Self>
+    where
+        P: AsRef<Path>,
+    {
+        let index = Index::open_in_dir(path).context(TantivySnafu)?;
+        let schema = index.schema();
+        // A directory holding some other index must not panic the caller.
+        let count_field = schema.get_field("seg_count").context(TantivySnafu)?;
+        let text_field = schema.get_field("text").context(TantivySnafu)?;
+        let reader = index.reader().context(TantivySnafu)?;
+
+        Ok(Self {
+            index,
+            count_field,
+            text_field,
+            reader,
+        })
+    }
+
+    /// Runs `query` against the `text` field and returns the distinct segment
+    /// numbers of all matching rows, in ascending order.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the query cannot be parsed, if the search fails, or
+    /// if a matching document has no usable `seg_count` value.
+    pub fn search(&self, query: &str) -> Result<Vec<usize>> {
+        let searcher = self.reader.searcher();
+        let query_parser = QueryParser::for_index(&self.index, vec![self.text_field]);
+        let query = query_parser.parse_query(query).context(ParseQuerySnafu)?;
+
+        // Collect every match rather than a top-N ranking: a capped result
+        // would silently drop segments that do contain matching rows.
+        let doc_addresses = searcher
+            .search(&query, &tantivy::collector::DocSetCollector)
+            .context(TantivySnafu)?;
+
+        let mut segments = HashSet::new();
+        for doc_address in doc_addresses {
+            let retrieved_doc = searcher
+                .doc::<TantivyDocument>(doc_address)
+                .context(TantivySnafu)?;
+            let segment = retrieved_doc
+                .get_first(self.count_field)
+                .and_then(|value| value.as_i64())
+                .and_then(|count| usize::try_from(count).ok())
+                .ok_or_else(|| {
+                    TantivyError::SchemaError("document has no valid `seg_count`".to_string())
+                })
+                .context(TantivySnafu)?;
+            segments.insert(segment);
+        }
+
+        let mut segments: Vec<usize> = segments.into_iter().collect();
+        segments.sort_unstable();
+        Ok(segments)
+    }
+}
diff --git a/src/index/src/lib.rs b/src/index/src/lib.rs index e7f448c398..bb57f2db27 100644 --- a/src/index/src/lib.rs +++ b/src/index/src/lib.rs @@ -14,4 +14,5 @@ #![feature(iter_partition_in_place)] +pub mod full_text_index; pub mod inverted_index; diff --git a/src/mito2/src/read/scan_region.rs b/src/mito2/src/read/scan_region.rs index 0ba6290c69..92d1d6b424 100644 --- a/src/mito2/src/read/scan_region.rs +++ b/src/mito2/src/read/scan_region.rs @@ -336,6 +336,11 @@ impl ScanRegion { .flatten() .map(Arc::new) } + + fn build_full_text_index_applier(&self) -> Option { + // start here + todo!() + } } /// Config for parallel scan. 
diff --git a/src/mito2/src/sst/index.rs b/src/mito2/src/sst/index.rs index cb10e7fc91..af35dcddfe 100644 --- a/src/mito2/src/sst/index.rs +++ b/src/mito2/src/sst/index.rs @@ -189,6 +189,9 @@ impl<'a> IndexerBuilder<'a> { segment_row_count = row_group_size; } + // find a column named "log" + let log_column_id = self.metadata.column_by_name("log").map(|c| c.column_id); + let creator = SstIndexCreator::new( self.file_path, self.file_id, @@ -197,6 +200,7 @@ impl<'a> IndexerBuilder<'a> { self.intermediate_manager, self.mem_threshold_index_create, segment_row_count, + log_column_id, ) .with_buffer_size(self.write_buffer_size) .with_ignore_column_ids( diff --git a/src/mito2/src/sst/index/creator.rs b/src/mito2/src/sst/index/creator.rs index 0e6fdc6125..893540e186 100644 --- a/src/mito2/src/sst/index/creator.rs +++ b/src/mito2/src/sst/index/creator.rs @@ -21,6 +21,9 @@ use std::sync::atomic::AtomicUsize; use std::sync::Arc; use common_telemetry::warn; +use datatypes::scalars::ScalarVector; +use datatypes::vectors::StringVector; +use index::full_text_index::create::FullTextIndexCreater; use index::inverted_index::create::sort::external_sort::ExternalSorter; use index::inverted_index::create::sort_create::SortIndexCreator; use index::inverted_index::create::InvertedIndexCreator; @@ -29,6 +32,7 @@ use object_store::ObjectStore; use puffin::file_format::writer::{Blob, PuffinAsyncWriter, PuffinFileWriter}; use snafu::{ensure, ResultExt}; use store_api::metadata::RegionMetadataRef; +use store_api::storage::ConcreteDataType; use tokio::io::duplex; use tokio_util::compat::{TokioAsyncReadCompatExt, TokioAsyncWriteCompatExt}; @@ -83,6 +87,10 @@ pub struct SstIndexCreator { /// The memory usage of the index creator. 
memory_usage: Arc, + + // experimental full text index + full_text_index_creater: FullTextIndexCreater, + log_column_id: Option, } impl SstIndexCreator { @@ -96,6 +104,7 @@ impl SstIndexCreator { intermediate_manager: IntermediateManager, memory_usage_threshold: Option, segment_row_count: NonZeroUsize, + log_column_id: Option, ) -> Self { let temp_file_provider = Arc::new(TempFileProvider::new( IntermediateLocation::new(&metadata.region_id, &sst_file_id), @@ -112,6 +121,10 @@ impl SstIndexCreator { ); let index_creator = Box::new(SortIndexCreator::new(sorter, segment_row_count)); + let full_text_index_path = format!("{file_path}/full_text_index"); + let full_text_index_creater = + FullTextIndexCreater::new(segment_row_count.get(), full_text_index_path).unwrap(); + let codec = IndexValuesCodec::from_tag_columns(metadata.primary_key_columns()); Self { file_path, @@ -127,6 +140,9 @@ impl SstIndexCreator { ignore_column_ids: HashSet::default(), memory_usage, + + full_text_index_creater, + log_column_id, } } @@ -233,6 +249,23 @@ impl SstIndexCreator { .context(PushIndexValueSnafu)?; } + // try find column named "log" and update it into full text index + if let Some(log_column_id) = self.log_column_id { + for col in batch.fields() { + if col.column_id == log_column_id { + let vector = &col.data; + if vector.data_type() == ConcreteDataType::string_datatype() { + let vector = vector.as_any().downcast_ref::().unwrap(); + for content in vector.iter_data() { + self.full_text_index_creater + .push_string(content.unwrap_or_default().to_string()) + .unwrap(); + } + } + } + } + } + Ok(()) } @@ -296,6 +329,8 @@ impl SstIndexCreator { _ => {} } + self.full_text_index_creater.finish().unwrap(); + let byte_count = puffin_writer.finish().await.context(PuffinFinishSnafu)?; guard.inc_byte_count(byte_count); Ok(()) @@ -421,6 +456,7 @@ mod tests { intm_mgr, memory_threshold, NonZeroUsize::new(segment_row_count).unwrap(), + None, ); for (str_tag, i32_tag) in &tags { diff --git 
a/src/mito2/src/sst/location.rs b/src/mito2/src/sst/location.rs index 179e9159c9..1d488fa0aa 100644 --- a/src/mito2/src/sst/location.rs +++ b/src/mito2/src/sst/location.rs @@ -29,6 +29,10 @@ pub fn index_file_path(region_dir: &str, sst_file_id: FileId) -> String { util::join_path(&dir, &sst_file_id.as_puffin()) } +pub fn full_text_index_path(region_dir: &str) -> String { + util::join_dir(region_dir, "full_text_index") +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/mito2/src/sst/parquet/reader.rs b/src/mito2/src/sst/parquet/reader.rs index 60aa0afa54..4afaf3ca39 100644 --- a/src/mito2/src/sst/parquet/reader.rs +++ b/src/mito2/src/sst/parquet/reader.rs @@ -284,6 +284,8 @@ impl ParquetReaderBuilder { .await .or_else(|| self.prune_row_groups_by_minmax(read_format, parquet_meta, metrics)) .unwrap_or_else(|| (0..num_row_groups).map(|i| (i, None)).collect()) + + // todo: change here } /// Applies index to prune row groups.