mirror of
https://github.com/GreptimeTeam/greptimedb.git
synced 2025-12-22 22:20:02 +00:00
Compare commits
2 Commits
feat/bulk-
...
tantivy-po
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9e1e4a5181 | ||
|
|
182e22dda9 |
312
Cargo.lock
generated
312
Cargo.lock
generated
@@ -957,6 +957,15 @@ version = "2.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
|
||||
|
||||
[[package]]
|
||||
name = "bitpacking"
|
||||
version = "0.9.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4c1d3e2bfd8d06048a179f7b17afc3188effa10385e7b00dc65af6aae732ea92"
|
||||
dependencies = [
|
||||
"crunchy",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bitvec"
|
||||
version = "1.0.1"
|
||||
@@ -1303,6 +1312,12 @@ version = "2.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "060303ef31ef4a522737e1b1ab68c67916f2a787bb2f4f54f383279adba962b5"
|
||||
|
||||
[[package]]
|
||||
name = "census"
|
||||
version = "0.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4f4c707c6a209cbe82d10abd08e1ea8995e9ea937d2550646e02798948992be0"
|
||||
|
||||
[[package]]
|
||||
name = "cesu8"
|
||||
version = "1.1.0"
|
||||
@@ -2535,7 +2550,6 @@ checksum = "7e962a19be5cfc3f3bf6dd8f61eb50107f356ad6270fbb3ed41476571db78be5"
|
||||
[[package]]
|
||||
name = "datafusion"
|
||||
version = "32.0.0"
|
||||
source = "git+https://github.com/apache/arrow-datafusion.git?rev=26e43acac3a96cec8dd4c8365f22dfb1a84306e9#26e43acac3a96cec8dd4c8365f22dfb1a84306e9"
|
||||
dependencies = [
|
||||
"ahash 0.8.6",
|
||||
"arrow",
|
||||
@@ -2582,7 +2596,6 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-common"
|
||||
version = "32.0.0"
|
||||
source = "git+https://github.com/apache/arrow-datafusion.git?rev=26e43acac3a96cec8dd4c8365f22dfb1a84306e9#26e43acac3a96cec8dd4c8365f22dfb1a84306e9"
|
||||
dependencies = [
|
||||
"ahash 0.8.6",
|
||||
"arrow",
|
||||
@@ -2600,7 +2613,6 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-execution"
|
||||
version = "32.0.0"
|
||||
source = "git+https://github.com/apache/arrow-datafusion.git?rev=26e43acac3a96cec8dd4c8365f22dfb1a84306e9#26e43acac3a96cec8dd4c8365f22dfb1a84306e9"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"chrono",
|
||||
@@ -2620,7 +2632,6 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-expr"
|
||||
version = "32.0.0"
|
||||
source = "git+https://github.com/apache/arrow-datafusion.git?rev=26e43acac3a96cec8dd4c8365f22dfb1a84306e9#26e43acac3a96cec8dd4c8365f22dfb1a84306e9"
|
||||
dependencies = [
|
||||
"ahash 0.8.6",
|
||||
"arrow",
|
||||
@@ -2634,7 +2645,6 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-optimizer"
|
||||
version = "32.0.0"
|
||||
source = "git+https://github.com/apache/arrow-datafusion.git?rev=26e43acac3a96cec8dd4c8365f22dfb1a84306e9#26e43acac3a96cec8dd4c8365f22dfb1a84306e9"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"async-trait",
|
||||
@@ -2651,7 +2661,6 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-physical-expr"
|
||||
version = "32.0.0"
|
||||
source = "git+https://github.com/apache/arrow-datafusion.git?rev=26e43acac3a96cec8dd4c8365f22dfb1a84306e9#26e43acac3a96cec8dd4c8365f22dfb1a84306e9"
|
||||
dependencies = [
|
||||
"ahash 0.8.6",
|
||||
"arrow",
|
||||
@@ -2684,7 +2693,6 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-physical-plan"
|
||||
version = "32.0.0"
|
||||
source = "git+https://github.com/apache/arrow-datafusion.git?rev=26e43acac3a96cec8dd4c8365f22dfb1a84306e9#26e43acac3a96cec8dd4c8365f22dfb1a84306e9"
|
||||
dependencies = [
|
||||
"ahash 0.8.6",
|
||||
"arrow",
|
||||
@@ -2714,7 +2722,6 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-sql"
|
||||
version = "32.0.0"
|
||||
source = "git+https://github.com/apache/arrow-datafusion.git?rev=26e43acac3a96cec8dd4c8365f22dfb1a84306e9#26e43acac3a96cec8dd4c8365f22dfb1a84306e9"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-schema",
|
||||
@@ -2727,7 +2734,6 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-substrait"
|
||||
version = "32.0.0"
|
||||
source = "git+https://github.com/apache/arrow-datafusion.git?rev=26e43acac3a96cec8dd4c8365f22dfb1a84306e9#26e43acac3a96cec8dd4c8365f22dfb1a84306e9"
|
||||
dependencies = [
|
||||
"async-recursion",
|
||||
"chrono",
|
||||
@@ -3088,6 +3094,12 @@ version = "0.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1435fa1053d8b2fbbe9be7e97eca7f33d37b28409959813daefc1446a14247f1"
|
||||
|
||||
[[package]]
|
||||
name = "downcast-rs"
|
||||
version = "1.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "75b325c5dbd37f80359721ad39aca5a29fb04c89279657cffdda8736d0c0b9d2"
|
||||
|
||||
[[package]]
|
||||
name = "dyn-clone"
|
||||
version = "1.0.16"
|
||||
@@ -3257,6 +3269,12 @@ version = "0.1.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a"
|
||||
|
||||
[[package]]
|
||||
name = "fastdivide"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "59668941c55e5c186b8b58c391629af56774ec768f73c08bbcd56f09348eb00b"
|
||||
|
||||
[[package]]
|
||||
name = "fastrand"
|
||||
version = "1.9.0"
|
||||
@@ -3566,6 +3584,16 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fs4"
|
||||
version = "0.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "21dabded2e32cd57ded879041205c60a4a4c4bab47bd0fd2fa8b01f30849f02b"
|
||||
dependencies = [
|
||||
"rustix 0.38.28",
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fsevent-sys"
|
||||
version = "4.1.0"
|
||||
@@ -3693,6 +3721,19 @@ dependencies = [
|
||||
"slab",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "generator"
|
||||
version = "0.7.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5cc16584ff22b460a382b7feec54b23d2908d858152e5739a120b949293bd74e"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
"log",
|
||||
"rustversion",
|
||||
"windows 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "generic-array"
|
||||
version = "0.14.7"
|
||||
@@ -3979,6 +4020,12 @@ dependencies = [
|
||||
"utf8-width",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "htmlescape"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e9025058dae765dee5070ec375f591e2ba14638c63feff74f13805a72e523163"
|
||||
|
||||
[[package]]
|
||||
name = "http"
|
||||
version = "0.2.11"
|
||||
@@ -4279,6 +4326,7 @@ dependencies = [
|
||||
"regex",
|
||||
"regex-automata 0.4.3",
|
||||
"snafu",
|
||||
"tantivy",
|
||||
"tempfile",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
@@ -4658,6 +4706,12 @@ version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
|
||||
|
||||
[[package]]
|
||||
name = "levenshtein_automata"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0c2cdeb66e45e9f36bfad5bbdb4d2384e70936afbee843c6f6543f0c551ebb25"
|
||||
|
||||
[[package]]
|
||||
name = "lexical-core"
|
||||
version = "0.8.5"
|
||||
@@ -4870,6 +4924,20 @@ dependencies = [
|
||||
"uuid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "loom"
|
||||
version = "0.5.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ff50ecb28bb86013e935fb6683ab1f6d3a20016f123c76fd4c27470076ac30f5"
|
||||
dependencies = [
|
||||
"cfg-if 1.0.0",
|
||||
"generator",
|
||||
"pin-utils",
|
||||
"scoped-tls",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lrlex"
|
||||
version = "0.12.0"
|
||||
@@ -4961,6 +5029,12 @@ dependencies = [
|
||||
"twox-hash",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lz4_flex"
|
||||
version = "0.11.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5"
|
||||
|
||||
[[package]]
|
||||
name = "lzma-sys"
|
||||
version = "0.1.20"
|
||||
@@ -5059,6 +5133,16 @@ version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
|
||||
|
||||
[[package]]
|
||||
name = "measure_time"
|
||||
version = "0.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "56220900f1a0923789ecd6bf25fbae8af3b2f1ff3e9e297fc9b6b8674dd4d852"
|
||||
dependencies = [
|
||||
"instant",
|
||||
"log",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.6.4"
|
||||
@@ -5427,6 +5511,12 @@ version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "97af489e1e21b68de4c390ecca6703318bc1aa16e9733bcb62c089b73c6fbb1b"
|
||||
|
||||
[[package]]
|
||||
name = "murmurhash32"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2195bf6aa996a481483b29d62a7663eed3fe39600c460e323f8ff41e90bdd89b"
|
||||
|
||||
[[package]]
|
||||
name = "mysql-common-derive"
|
||||
version = "0.30.2"
|
||||
@@ -5948,6 +6038,15 @@ version = "1.19.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
|
||||
|
||||
[[package]]
|
||||
name = "oneshot"
|
||||
version = "0.1.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6f6640c6bda7731b1fdbab747981a0f896dd1fedaf9f4a53fa237a04a84431f4"
|
||||
dependencies = [
|
||||
"loom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "oorandom"
|
||||
version = "11.1.3"
|
||||
@@ -6293,6 +6392,15 @@ version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
|
||||
|
||||
[[package]]
|
||||
name = "ownedbytes"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c3a059efb063b8f425b948e042e6b9bd85edfe60e913630ed727b23e2dfcc558"
|
||||
dependencies = [
|
||||
"stable_deref_trait",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "packedvec"
|
||||
version = "1.2.4"
|
||||
@@ -8102,6 +8210,16 @@ dependencies = [
|
||||
"tree-sitter-cli",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rust-stemmers"
|
||||
version = "1.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e46a2036019fdb888131db7a4c847a1063a7493f971ed94ea82c67eada63ca54"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"serde_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rust_decimal"
|
||||
version = "1.33.1"
|
||||
@@ -8332,7 +8450,7 @@ dependencies = [
|
||||
"bitflags 1.3.2",
|
||||
"bstr",
|
||||
"itertools 0.10.5",
|
||||
"lz4_flex",
|
||||
"lz4_flex 0.9.5",
|
||||
"num-bigint",
|
||||
"num-complex",
|
||||
]
|
||||
@@ -8723,6 +8841,12 @@ dependencies = [
|
||||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "scoped-tls"
|
||||
version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e1cf6437eb19a8f4a6cc0f7dca544973b0b78843adbfeb3683d1a94a0024a294"
|
||||
|
||||
[[package]]
|
||||
name = "scopeguard"
|
||||
version = "1.2.0"
|
||||
@@ -9259,6 +9383,15 @@ dependencies = [
|
||||
"walkdir",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sketches-ddsketch"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "85636c14b73d81f541e525f585c0a2109e6744e1565b5c1668e31c70c10ed65c"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "slab"
|
||||
version = "0.4.9"
|
||||
@@ -9792,6 +9925,7 @@ dependencies = [
|
||||
"bytes",
|
||||
"catalog",
|
||||
"common-error",
|
||||
"common-function",
|
||||
"common-macro",
|
||||
"datafusion",
|
||||
"datafusion-common",
|
||||
@@ -9800,6 +9934,7 @@ dependencies = [
|
||||
"datatypes",
|
||||
"promql",
|
||||
"prost 0.12.3",
|
||||
"session",
|
||||
"snafu",
|
||||
"substrait 0.17.1",
|
||||
"tokio",
|
||||
@@ -9992,6 +10127,148 @@ version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417"
|
||||
|
||||
[[package]]
|
||||
name = "tantivy"
|
||||
version = "0.22.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f8d0582f186c0a6d55655d24543f15e43607299425c5ad8352c242b914b31856"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"arc-swap",
|
||||
"base64 0.22.0",
|
||||
"bitpacking",
|
||||
"byteorder",
|
||||
"census",
|
||||
"crc32fast",
|
||||
"crossbeam-channel",
|
||||
"downcast-rs",
|
||||
"fastdivide",
|
||||
"fnv",
|
||||
"fs4",
|
||||
"htmlescape",
|
||||
"itertools 0.12.0",
|
||||
"levenshtein_automata",
|
||||
"log",
|
||||
"lru",
|
||||
"lz4_flex 0.11.3",
|
||||
"measure_time",
|
||||
"memmap2 0.9.3",
|
||||
"num_cpus",
|
||||
"once_cell",
|
||||
"oneshot",
|
||||
"rayon",
|
||||
"regex",
|
||||
"rust-stemmers",
|
||||
"rustc-hash",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sketches-ddsketch",
|
||||
"smallvec",
|
||||
"tantivy-bitpacker",
|
||||
"tantivy-columnar",
|
||||
"tantivy-common",
|
||||
"tantivy-fst",
|
||||
"tantivy-query-grammar",
|
||||
"tantivy-stacker",
|
||||
"tantivy-tokenizer-api",
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
"time",
|
||||
"uuid",
|
||||
"winapi",
|
||||
"zstd 0.13.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tantivy-bitpacker"
|
||||
version = "0.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "284899c2325d6832203ac6ff5891b297fc5239c3dc754c5bc1977855b23c10df"
|
||||
dependencies = [
|
||||
"bitpacking",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tantivy-columnar"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "12722224ffbe346c7fec3275c699e508fd0d4710e629e933d5736ec524a1f44e"
|
||||
dependencies = [
|
||||
"downcast-rs",
|
||||
"fastdivide",
|
||||
"itertools 0.12.0",
|
||||
"serde",
|
||||
"tantivy-bitpacker",
|
||||
"tantivy-common",
|
||||
"tantivy-sstable",
|
||||
"tantivy-stacker",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tantivy-common"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8019e3cabcfd20a1380b491e13ff42f57bb38bf97c3d5fa5c07e50816e0621f4"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
"ownedbytes",
|
||||
"serde",
|
||||
"time",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tantivy-fst"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d60769b80ad7953d8a7b2c70cdfe722bbcdcac6bccc8ac934c40c034d866fc18"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"regex-syntax 0.8.2",
|
||||
"utf8-ranges",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tantivy-query-grammar"
|
||||
version = "0.22.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "847434d4af57b32e309f4ab1b4f1707a6c566656264caa427ff4285c4d9d0b82"
|
||||
dependencies = [
|
||||
"nom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tantivy-sstable"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c69578242e8e9fc989119f522ba5b49a38ac20f576fc778035b96cc94f41f98e"
|
||||
dependencies = [
|
||||
"tantivy-bitpacker",
|
||||
"tantivy-common",
|
||||
"tantivy-fst",
|
||||
"zstd 0.13.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tantivy-stacker"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c56d6ff5591fc332739b3ce7035b57995a3ce29a93ffd6012660e0949c956ea8"
|
||||
dependencies = [
|
||||
"murmurhash32",
|
||||
"rand_distr",
|
||||
"tantivy-common",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tantivy-tokenizer-api"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2a0dcade25819a89cfe6f17d932c9cedff11989936bf6dd4f336d50392053b04"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tap"
|
||||
version = "1.0.1"
|
||||
@@ -11311,6 +11588,12 @@ version = "2.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da"
|
||||
|
||||
[[package]]
|
||||
name = "utf8-ranges"
|
||||
version = "1.0.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7fcfc827f90e53a02eaef5e535ee14266c1d569214c6aa70133a624d8a3164ba"
|
||||
|
||||
[[package]]
|
||||
name = "utf8-width"
|
||||
version = "0.1.7"
|
||||
@@ -11666,6 +11949,15 @@ dependencies = [
|
||||
"windows_x86_64_msvc 0.39.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows"
|
||||
version = "0.48.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f"
|
||||
dependencies = [
|
||||
"windows-targets 0.48.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows"
|
||||
version = "0.52.0"
|
||||
|
||||
21
Cargo.toml
21
Cargo.toml
@@ -91,13 +91,20 @@ bytes = { version = "1.5", features = ["serde"] }
|
||||
chrono = { version = "0.4", features = ["serde"] }
|
||||
clap = { version = "4.4", features = ["derive"] }
|
||||
dashmap = "5.4"
|
||||
datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev = "26e43acac3a96cec8dd4c8365f22dfb1a84306e9" }
|
||||
datafusion-common = { git = "https://github.com/apache/arrow-datafusion.git", rev = "26e43acac3a96cec8dd4c8365f22dfb1a84306e9" }
|
||||
datafusion-expr = { git = "https://github.com/apache/arrow-datafusion.git", rev = "26e43acac3a96cec8dd4c8365f22dfb1a84306e9" }
|
||||
datafusion-optimizer = { git = "https://github.com/apache/arrow-datafusion.git", rev = "26e43acac3a96cec8dd4c8365f22dfb1a84306e9" }
|
||||
datafusion-physical-expr = { git = "https://github.com/apache/arrow-datafusion.git", rev = "26e43acac3a96cec8dd4c8365f22dfb1a84306e9" }
|
||||
datafusion-sql = { git = "https://github.com/apache/arrow-datafusion.git", rev = "26e43acac3a96cec8dd4c8365f22dfb1a84306e9" }
|
||||
datafusion-substrait = { git = "https://github.com/apache/arrow-datafusion.git", rev = "26e43acac3a96cec8dd4c8365f22dfb1a84306e9" }
|
||||
# datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev = "26e43acac3a96cec8dd4c8365f22dfb1a84306e9" }
|
||||
# datafusion-common = { git = "https://github.com/apache/arrow-datafusion.git", rev = "26e43acac3a96cec8dd4c8365f22dfb1a84306e9" }
|
||||
# datafusion-expr = { git = "https://github.com/apache/arrow-datafusion.git", rev = "26e43acac3a96cec8dd4c8365f22dfb1a84306e9" }
|
||||
# datafusion-optimizer = { git = "https://github.com/apache/arrow-datafusion.git", rev = "26e43acac3a96cec8dd4c8365f22dfb1a84306e9" }
|
||||
# datafusion-physical-expr = { git = "https://github.com/apache/arrow-datafusion.git", rev = "26e43acac3a96cec8dd4c8365f22dfb1a84306e9" }
|
||||
# datafusion-sql = { git = "https://github.com/apache/arrow-datafusion.git", rev = "26e43acac3a96cec8dd4c8365f22dfb1a84306e9" }
|
||||
# datafusion-substrait = { git = "https://github.com/apache/arrow-datafusion.git", rev = "26e43acac3a96cec8dd4c8365f22dfb1a84306e9" }
|
||||
datafusion = { path = "../arrow-datafusion/datafusion/core" }
|
||||
datafusion-common = { path = "../arrow-datafusion/datafusion/common" }
|
||||
datafusion-expr = { path = "../arrow-datafusion/datafusion/expr" }
|
||||
datafusion-optimizer = { path = "../arrow-datafusion/datafusion/optimizer" }
|
||||
datafusion-physical-expr = { path = "../arrow-datafusion/datafusion/physical-expr" }
|
||||
datafusion-sql = { path = "../arrow-datafusion/datafusion/sql" }
|
||||
datafusion-substrait = { path = "../arrow-datafusion/datafusion/substrait" }
|
||||
derive_builder = "0.12"
|
||||
dotenv = "0.15"
|
||||
etcd-client = "0.12"
|
||||
|
||||
54
feed.py
Normal file
54
feed.py
Normal file
@@ -0,0 +1,54 @@
|
||||
# read line from log-1000.txt and POST it to http://localhost:4000/v1/influxdb/write?db=public&precision=ms
|
||||
# POST data format: "many_logs,host=1 log=<FILE CONTENT> <INCREMENT ID>"
|
||||
|
||||
import requests
|
||||
from tqdm import tqdm
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
batch_size = 3000
|
||||
worker = 8
|
||||
|
||||
# Define the URL
|
||||
url = "http://localhost:4000/v1/influxdb/write?db=public&precision=ms"
|
||||
|
||||
|
||||
def send_data(start, data):
|
||||
# Send the POST request
|
||||
response = requests.post(url, data=data)
|
||||
# Check the response
|
||||
if response.status_code >= 300:
|
||||
print(
|
||||
f"Failed to send log line {start}: {response.status_code} {response.text}"
|
||||
)
|
||||
|
||||
|
||||
# Open the file
|
||||
with open("target/log-1000.txt", "r") as file:
|
||||
lines = file.readlines()
|
||||
|
||||
# Create a progress bar
|
||||
with tqdm(
|
||||
total=len(lines),
|
||||
desc="Processing lines",
|
||||
bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt}",
|
||||
) as pbar:
|
||||
data = ""
|
||||
with ThreadPoolExecutor(max_workers=worker) as executor:
|
||||
for i, line in enumerate(lines):
|
||||
# Prepare the POST data
|
||||
content = line.strip()
|
||||
content = content.replace('"', " ")
|
||||
content = content.replace("'", " ")
|
||||
content = content.replace("=", " ")
|
||||
content = content.replace(".", " ")
|
||||
|
||||
data = data + f'many_logs,host=1 log="{content}" {i}\n'
|
||||
|
||||
if i % batch_size == 0:
|
||||
executor.submit(send_data, i, data)
|
||||
data = ""
|
||||
# Update the progress bar
|
||||
pbar.update(batch_size)
|
||||
|
||||
# close the executor
|
||||
executor.shutdown(wait=True)
|
||||
@@ -15,6 +15,7 @@
|
||||
pub mod aggregate;
|
||||
pub(crate) mod date;
|
||||
pub mod expression;
|
||||
pub mod matches;
|
||||
pub mod math;
|
||||
pub mod numpy;
|
||||
#[cfg(test)]
|
||||
|
||||
61
src/common/function/src/scalars/matches.rs
Normal file
61
src/common/function/src/scalars/matches.rs
Normal file
@@ -0,0 +1,61 @@
|
||||
// Copyright 2024 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::fmt;
|
||||
use std::fmt::Display;
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_query::error::Result;
|
||||
use common_query::prelude::{Signature, Volatility};
|
||||
use datatypes::prelude::ConcreteDataType;
|
||||
use datatypes::vectors::{BooleanVector, VectorRef};
|
||||
|
||||
use crate::function::{Function, FunctionContext};
|
||||
|
||||
const NAME: &str = "matches";
|
||||
|
||||
/// The function to find remainders
|
||||
#[derive(Clone, Debug, Default)]
|
||||
pub struct MatchesFunction;
|
||||
|
||||
impl Display for MatchesFunction {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "{}", NAME.to_ascii_uppercase())
|
||||
}
|
||||
}
|
||||
|
||||
impl Function for MatchesFunction {
|
||||
fn name(&self) -> &str {
|
||||
NAME
|
||||
}
|
||||
|
||||
fn return_type(&self, _input_types: &[ConcreteDataType]) -> Result<ConcreteDataType> {
|
||||
Ok(ConcreteDataType::boolean_datatype())
|
||||
}
|
||||
|
||||
fn signature(&self) -> Signature {
|
||||
Signature::exact(
|
||||
vec![
|
||||
ConcreteDataType::string_datatype(),
|
||||
ConcreteDataType::string_datatype(),
|
||||
],
|
||||
Volatility::Immutable,
|
||||
)
|
||||
}
|
||||
|
||||
fn eval(&self, _func_ctx: FunctionContext, columns: &[VectorRef]) -> Result<VectorRef> {
|
||||
let num_rows = columns[1].len();
|
||||
Ok(Arc::new(BooleanVector::from(vec![true; num_rows])))
|
||||
}
|
||||
}
|
||||
@@ -31,6 +31,7 @@ pub use pow::PowFunction;
|
||||
pub use rate::RateFunction;
|
||||
use snafu::ResultExt;
|
||||
|
||||
use super::matches::MatchesFunction;
|
||||
use crate::function::{Function, FunctionContext};
|
||||
use crate::function_registry::FunctionRegistry;
|
||||
use crate::scalars::math::modulo::ModuloFunction;
|
||||
@@ -44,6 +45,7 @@ impl MathFunction {
|
||||
registry.register(Arc::new(RateFunction));
|
||||
registry.register(Arc::new(RangeFunction));
|
||||
registry.register(Arc::new(ClampFunction));
|
||||
registry.register(Arc::new(MatchesFunction));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -12,14 +12,16 @@ async-trait.workspace = true
|
||||
bytes.workspace = true
|
||||
catalog.workspace = true
|
||||
common-error.workspace = true
|
||||
common-function.workspace = true
|
||||
common-macro.workspace = true
|
||||
datafusion.workspace = true
|
||||
datafusion-common.workspace = true
|
||||
datafusion-expr.workspace = true
|
||||
datafusion-substrait.workspace = true
|
||||
datafusion.workspace = true
|
||||
datatypes.workspace = true
|
||||
promql.workspace = true
|
||||
prost.workspace = true
|
||||
session.workspace = true
|
||||
snafu.workspace = true
|
||||
|
||||
[dependencies.substrait_proto]
|
||||
|
||||
@@ -16,6 +16,9 @@ use std::sync::Arc;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use common_function::scalars::matches::MatchesFunction;
|
||||
use common_function::scalars::udf::create_udf;
|
||||
use common_function::state::FunctionState;
|
||||
use datafusion::catalog::CatalogList;
|
||||
use datafusion::execution::context::SessionState;
|
||||
use datafusion::execution::runtime_env::RuntimeEnv;
|
||||
@@ -24,6 +27,7 @@ use datafusion_expr::LogicalPlan;
|
||||
use datafusion_substrait::logical_plan::consumer::from_substrait_plan;
|
||||
use datafusion_substrait::logical_plan::producer::to_substrait_plan;
|
||||
use prost::Message;
|
||||
use session::context::QueryContext;
|
||||
use snafu::ResultExt;
|
||||
use substrait_proto::proto::Plan;
|
||||
|
||||
@@ -50,6 +54,13 @@ impl SubstraitPlan for DFLogicalSubstraitConvertor {
|
||||
let state = SessionState::new_with_config_rt(state_config, Arc::new(RuntimeEnv::default()))
|
||||
.with_serializer_registry(Arc::new(ExtensionSerializer));
|
||||
let mut context = SessionContext::new_with_state(state);
|
||||
|
||||
let udf = create_udf(
|
||||
Arc::new(MatchesFunction),
|
||||
QueryContext::arc(),
|
||||
Arc::new(FunctionState::default()),
|
||||
);
|
||||
context.register_udf(udf.into());
|
||||
context.register_catalog_list(catalog_list);
|
||||
let plan = Plan::decode(message).context(DecodeRelSnafu)?;
|
||||
let df_plan = from_substrait_plan(&mut context, &plan)
|
||||
@@ -65,6 +76,13 @@ impl SubstraitPlan for DFLogicalSubstraitConvertor {
|
||||
.with_serializer_registry(Arc::new(ExtensionSerializer));
|
||||
let context = SessionContext::new_with_state(session_state);
|
||||
|
||||
let udf = create_udf(
|
||||
Arc::new(MatchesFunction),
|
||||
QueryContext::arc(),
|
||||
Arc::new(FunctionState::default()),
|
||||
);
|
||||
context.register_udf(udf.into());
|
||||
|
||||
let substrait_plan = to_substrait_plan(plan, &context).context(EncodeDfPlanSnafu)?;
|
||||
substrait_plan.encode(&mut buf).context(EncodeRelSnafu)?;
|
||||
|
||||
|
||||
@@ -22,12 +22,13 @@ greptime-proto.workspace = true
|
||||
mockall.workspace = true
|
||||
pin-project.workspace = true
|
||||
prost.workspace = true
|
||||
regex.workspace = true
|
||||
regex-automata.workspace = true
|
||||
regex.workspace = true
|
||||
snafu.workspace = true
|
||||
tantivy = { version = "0.22", features = ["zstd-compression"] }
|
||||
|
||||
[dev-dependencies]
|
||||
rand.workspace = true
|
||||
tempfile.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-util.workspace = true
|
||||
tokio.workspace = true
|
||||
|
||||
17
src/index/src/full_text_index.rs
Normal file
17
src/index/src/full_text_index.rs
Normal file
@@ -0,0 +1,17 @@
|
||||
// Copyright 2024 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
pub mod create;
|
||||
pub mod error;
|
||||
pub mod search;
|
||||
90
src/index/src/full_text_index/create.rs
Normal file
90
src/index/src/full_text_index/create.rs
Normal file
@@ -0,0 +1,90 @@
|
||||
// Copyright 2024 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::path::Path;
|
||||
|
||||
use snafu::ResultExt;
|
||||
use tantivy::schema::{Schema, INDEXED, STORED, TEXT};
|
||||
use tantivy::store::{Compressor, ZstdCompressor};
|
||||
use tantivy::{Index, IndexWriter, TantivyDocument};
|
||||
|
||||
use super::error::TantivySnafu;
|
||||
use crate::full_text_index::error::Result;
|
||||
|
||||
pub struct FullTextIndexCreater {
|
||||
index: Index,
|
||||
writer: IndexWriter,
|
||||
count_field: tantivy::schema::Field,
|
||||
text_field: tantivy::schema::Field,
|
||||
|
||||
row_count: usize,
|
||||
segment_size: usize,
|
||||
}
|
||||
|
||||
impl FullTextIndexCreater {
|
||||
pub fn new<P>(segment_size: usize, path: P) -> Result<Self>
|
||||
where
|
||||
P: AsRef<Path>,
|
||||
{
|
||||
// build schema
|
||||
let mut schema_builder = Schema::builder();
|
||||
let count_field = schema_builder.add_i64_field("seg_count", INDEXED | STORED);
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
// create path
|
||||
std::fs::create_dir_all(&path).unwrap();
|
||||
common_telemetry::info!("[DEBUG] create full text index in {:?}", path.as_ref());
|
||||
|
||||
// build index
|
||||
let mut index = Index::create_in_dir(path, schema).context(TantivySnafu)?;
|
||||
|
||||
// tune
|
||||
index.settings_mut().docstore_compression = Compressor::Zstd(ZstdCompressor::default());
|
||||
index.settings_mut().docstore_blocksize = 65_536;
|
||||
|
||||
// build writer
|
||||
// 100 MB
|
||||
let writer = index.writer(400_000_000).context(TantivySnafu)?;
|
||||
|
||||
Ok(Self {
|
||||
index,
|
||||
writer,
|
||||
count_field,
|
||||
text_field,
|
||||
row_count: 0,
|
||||
segment_size,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn push_string(&mut self, content: String) -> Result<()> {
|
||||
let mut doc = TantivyDocument::new();
|
||||
doc.add_text(self.text_field, content);
|
||||
doc.add_i64(self.count_field, (self.row_count / self.segment_size) as _);
|
||||
self.writer.add_document(doc).context(TantivySnafu)?;
|
||||
self.row_count += 1;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn finish(&mut self) -> Result<()> {
|
||||
common_telemetry::info!(
|
||||
"[DEBUG] full text index finish with {} entries",
|
||||
self.row_count
|
||||
);
|
||||
self.row_count = 0;
|
||||
self.writer.commit().context(TantivySnafu)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
45
src/index/src/full_text_index/error.rs
Normal file
45
src/index/src/full_text_index/error.rs
Normal file
@@ -0,0 +1,45 @@
|
||||
// Copyright 2024 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use common_macro::stack_trace_debug;
|
||||
use snafu::{Location, Snafu};
|
||||
use tantivy::directory::error::OpenDirectoryError;
|
||||
|
||||
/// Errors produced by the experimental full text index.
#[derive(Snafu)]
#[snafu(visibility(pub))]
#[stack_trace_debug]
pub enum Error {
    /// A generic error bubbled up from the tantivy crate.
    #[snafu(display("Tantivy error"))]
    Tantivy {
        #[snafu(source)]
        error: tantivy::TantivyError,
        location: Location,
    },

    /// The on-disk index directory could not be opened.
    #[snafu(display("Failed to open directory"))]
    OpenDirectory {
        #[snafu(source)]
        error: OpenDirectoryError,
        location: Location,
    },

    /// The user-supplied query string could not be parsed by tantivy.
    #[snafu(display("Failed to parse tantivy query"))]
    ParseQuery {
        #[snafu(source)]
        error: tantivy::query::QueryParserError,
        location: Location,
    },
}

/// Result type for full text index operations.
pub type Result<T> = std::result::Result<T, Error>;
|
||||
74
src/index/src/full_text_index/search.rs
Normal file
74
src/index/src/full_text_index/search.rs
Normal file
@@ -0,0 +1,74 @@
|
||||
// Copyright 2024 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::path::Path;
|
||||
|
||||
use snafu::ResultExt;
|
||||
use tantivy::directory::MmapDirectory;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::Value;
|
||||
use tantivy::{Index, IndexReader, TantivyDocument, TantivyError};
|
||||
|
||||
use super::error::ParseQuerySnafu;
|
||||
use crate::full_text_index::error::{OpenDirectorySnafu, Result, TantivySnafu};
|
||||
|
||||
/// Read-side companion of the creator: opens a tantivy full text index and
/// answers match queries with the segment ids of matching rows.
pub struct FullTextIndexSearcher {
    // The opened tantivy index; needed to build query parsers.
    index: Index,
    // Stored `seg_count` field (segment id) written by the creator.
    count_field: tantivy::schema::Field,
    // Tokenized `text` field that queries are parsed against.
    text_field: tantivy::schema::Field,
    // Reader used to obtain searchers.
    reader: IndexReader,
}
|
||||
|
||||
impl FullTextIndexSearcher {
|
||||
pub fn open<P>(path: P) -> Result<Self>
|
||||
where
|
||||
P: AsRef<Path>,
|
||||
{
|
||||
let index = Index::open_in_dir(path).context(TantivySnafu)?;
|
||||
let schema = index.schema();
|
||||
let count_field = schema.get_field("seg_count").unwrap();
|
||||
let text_field = schema.get_field("text").unwrap();
|
||||
let reader = index.reader().context(TantivySnafu)?;
|
||||
|
||||
Ok(Self {
|
||||
index,
|
||||
count_field,
|
||||
text_field,
|
||||
reader,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn search(&self, query: &str) -> Result<Vec<usize>> {
|
||||
let searcher = self.reader.searcher();
|
||||
let query_parser = QueryParser::for_index(&self.index, vec![self.text_field]);
|
||||
let query = query_parser.parse_query(query).context(ParseQuerySnafu)?;
|
||||
let top_docs = searcher
|
||||
.search(&query, &tantivy::collector::TopDocs::with_limit(1000_0000))
|
||||
.context(TantivySnafu)?;
|
||||
let mut result = HashSet::new();
|
||||
for (_score, doc_address) in top_docs {
|
||||
let retrieved_doc = searcher
|
||||
.doc::<TantivyDocument>(doc_address)
|
||||
.context(TantivySnafu)?;
|
||||
let seg_count = retrieved_doc
|
||||
.get_first(self.count_field)
|
||||
.unwrap()
|
||||
.as_i64()
|
||||
.unwrap();
|
||||
result.insert(seg_count);
|
||||
}
|
||||
Ok(result.into_iter().map(|x| x as _).collect())
|
||||
}
|
||||
}
|
||||
@@ -14,4 +14,5 @@
|
||||
|
||||
#![feature(iter_partition_in_place)]
|
||||
|
||||
pub mod full_text_index;
|
||||
pub mod inverted_index;
|
||||
|
||||
@@ -96,6 +96,13 @@ impl AccessLayer {
|
||||
})?;
|
||||
}
|
||||
|
||||
let full_text_index_dir = format!(
|
||||
"/tmp/greptimedb/{}index/{}/full_text_index",
|
||||
self.region_dir, file_meta.file_id
|
||||
);
|
||||
common_telemetry::info!("[DEBUG] removing {}", full_text_index_dir);
|
||||
tokio::fs::remove_dir(full_text_index_dir).await.unwrap();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -39,7 +39,7 @@ use crate::read::{compat, Batch, Source};
|
||||
use crate::region::version::VersionRef;
|
||||
use crate::sst::file::FileHandle;
|
||||
use crate::sst::index::applier::builder::SstIndexApplierBuilder;
|
||||
use crate::sst::index::applier::SstIndexApplierRef;
|
||||
use crate::sst::index::applier::{FullTextIndexApplier, SstIndexApplierRef};
|
||||
|
||||
/// A scanner scans a region and returns a [SendableRecordBatchStream].
|
||||
pub(crate) enum Scanner {
|
||||
@@ -269,6 +269,7 @@ impl ScanRegion {
|
||||
);
|
||||
|
||||
let index_applier = self.build_index_applier();
|
||||
let full_text_index_applier = self.build_full_text_index_applier();
|
||||
let predicate = Predicate::new(self.request.filters.clone());
|
||||
// The mapper always computes projected column ids as the schema of SSTs may change.
|
||||
let mapper = match &self.request.projection {
|
||||
@@ -283,6 +284,7 @@ impl ScanRegion {
|
||||
.with_files(files)
|
||||
.with_cache(self.cache_manager)
|
||||
.with_index_applier(index_applier)
|
||||
.with_full_index_applier(full_text_index_applier)
|
||||
.with_parallelism(self.parallelism)
|
||||
.with_start_time(self.start_time)
|
||||
.with_append_mode(self.version.options.append_mode)
|
||||
@@ -336,6 +338,14 @@ impl ScanRegion {
|
||||
.flatten()
|
||||
.map(Arc::new)
|
||||
}
|
||||
|
||||
    /// Builds a `FullTextIndexApplier` from the request filters; returns
    /// `None` when the filters contain no full text (`matches`) predicate.
    fn build_full_text_index_applier(&self) -> Option<FullTextIndexApplier> {
        FullTextIndexApplier::new(
            self.access_layer.region_dir().to_string(),
            self.version.metadata.region_id,
            &self.request.filters,
        )
    }
|
||||
}
|
||||
|
||||
/// Config for parallel scan.
|
||||
@@ -393,6 +403,8 @@ pub(crate) struct ScanInput {
|
||||
pub(crate) append_mode: bool,
|
||||
/// Whether to remove deletion markers.
|
||||
pub(crate) filter_deleted: bool,
|
||||
|
||||
full_text_index_applier: Option<FullTextIndexApplier>,
|
||||
}
|
||||
|
||||
impl ScanInput {
|
||||
@@ -413,6 +425,7 @@ impl ScanInput {
|
||||
query_start: None,
|
||||
append_mode: false,
|
||||
filter_deleted: true,
|
||||
full_text_index_applier: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -472,6 +485,15 @@ impl ScanInput {
|
||||
self
|
||||
}
|
||||
|
||||
    /// Sets the optional full text index applier.
    #[must_use]
    pub(crate) fn with_full_index_applier(
        mut self,
        index_applier: Option<FullTextIndexApplier>,
    ) -> Self {
        self.full_text_index_applier = index_applier;
        self
    }
|
||||
|
||||
/// Sets start time of the query.
|
||||
#[must_use]
|
||||
pub(crate) fn with_start_time(mut self, now: Option<Instant>) -> Self {
|
||||
@@ -509,6 +531,7 @@ impl ScanInput {
|
||||
.projection(Some(self.mapper.column_ids().to_vec()))
|
||||
.cache(self.cache_manager.clone())
|
||||
.index_applier(self.index_applier.clone())
|
||||
.full_text_index_applier(self.full_text_index_applier.clone())
|
||||
.build()
|
||||
.await;
|
||||
let reader = match maybe_reader {
|
||||
|
||||
@@ -63,6 +63,8 @@ impl Indexer {
|
||||
// Skip index creation if error occurs.
|
||||
self.inner = None;
|
||||
}
|
||||
} else {
|
||||
common_telemetry::info!("[DEBUG] Indexer::update: inner is None");
|
||||
}
|
||||
|
||||
if let Some(creator) = self.inner.as_ref() {
|
||||
@@ -189,6 +191,9 @@ impl<'a> IndexerBuilder<'a> {
|
||||
segment_row_count = row_group_size;
|
||||
}
|
||||
|
||||
// find a column named "log"
|
||||
let log_column_id = self.metadata.column_by_name("log").map(|c| c.column_id);
|
||||
|
||||
let creator = SstIndexCreator::new(
|
||||
self.file_path,
|
||||
self.file_id,
|
||||
@@ -197,6 +202,7 @@ impl<'a> IndexerBuilder<'a> {
|
||||
self.intermediate_manager,
|
||||
self.mem_threshold_index_create,
|
||||
segment_row_count,
|
||||
log_column_id,
|
||||
)
|
||||
.with_buffer_size(self.write_buffer_size)
|
||||
.with_ignore_column_ids(
|
||||
|
||||
@@ -16,7 +16,10 @@ pub mod builder;
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_query::logical_plan::Expr;
|
||||
use datafusion_expr::Expr as DfExpr;
|
||||
use futures::{AsyncRead, AsyncSeek};
|
||||
use index::full_text_index::search::FullTextIndexSearcher;
|
||||
use index::inverted_index::format::reader::InvertedIndexBlobReader;
|
||||
use index::inverted_index::search::index_apply::{
|
||||
ApplyOutput, IndexApplier, IndexNotFoundStrategy, SearchContext,
|
||||
@@ -172,6 +175,53 @@ impl Drop for SstIndexApplier {
|
||||
}
|
||||
}
|
||||
|
||||
/// Applier that runs a full text query against the experimental tantivy
/// index of an SST file and yields the matching row numbers.
#[derive(Debug, Clone)]
pub(crate) struct FullTextIndexApplier {
    // Region directory; used to derive the on-disk index path.
    region_dir: String,
    // NOTE(review): stored but never read in the visible code — confirm
    // whether it is still needed.
    region_id: RegionId,
    // The literal pattern extracted from a `matches(...)` filter.
    query: String,
}
|
||||
|
||||
impl FullTextIndexApplier {
|
||||
pub fn new(region_dir: String, region_id: RegionId, filters: &[Expr]) -> Option<Self> {
|
||||
let query = Self::extract_from_filter(filters)?;
|
||||
Some(Self {
|
||||
region_dir,
|
||||
region_id,
|
||||
query,
|
||||
})
|
||||
}
|
||||
|
||||
fn extract_from_filter(filters: &[Expr]) -> Option<String> {
|
||||
common_telemetry::info!("[DEBUG] filters in scan request: {:?}", filters);
|
||||
for filter in filters {
|
||||
if let DfExpr::ScalarUDF(udf) = filter.df_expr()
|
||||
&& udf.fun.name == "matches"
|
||||
{
|
||||
let pattern = &udf.args[0];
|
||||
if let DfExpr::Literal(literal) = pattern {
|
||||
return Some(literal.to_string());
|
||||
} else {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Returns the selected row number
|
||||
pub fn apply(&self, file_id: FileId) -> Result<Vec<usize>> {
|
||||
let index_path = format!(
|
||||
"/tmp/greptimedb/{}index/{}/full_text_index",
|
||||
self.region_dir, file_id
|
||||
);
|
||||
common_telemetry::info!("[DEBUG] open index at {index_path}");
|
||||
|
||||
let searcher = FullTextIndexSearcher::open(index_path).unwrap();
|
||||
Ok(searcher.search(&self.query).unwrap())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use common_base::BitVec;
|
||||
|
||||
@@ -21,6 +21,9 @@ use std::sync::atomic::AtomicUsize;
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_telemetry::warn;
|
||||
use datatypes::scalars::ScalarVector;
|
||||
use datatypes::vectors::StringVector;
|
||||
use index::full_text_index::create::FullTextIndexCreater;
|
||||
use index::inverted_index::create::sort::external_sort::ExternalSorter;
|
||||
use index::inverted_index::create::sort_create::SortIndexCreator;
|
||||
use index::inverted_index::create::InvertedIndexCreator;
|
||||
@@ -29,6 +32,7 @@ use object_store::ObjectStore;
|
||||
use puffin::file_format::writer::{Blob, PuffinAsyncWriter, PuffinFileWriter};
|
||||
use snafu::{ensure, ResultExt};
|
||||
use store_api::metadata::RegionMetadataRef;
|
||||
use store_api::storage::ConcreteDataType;
|
||||
use tokio::io::duplex;
|
||||
use tokio_util::compat::{TokioAsyncReadCompatExt, TokioAsyncWriteCompatExt};
|
||||
|
||||
@@ -83,6 +87,10 @@ pub struct SstIndexCreator {
|
||||
|
||||
/// The memory usage of the index creator.
|
||||
memory_usage: Arc<AtomicUsize>,
|
||||
|
||||
// experimental full text index
|
||||
full_text_index_creater: FullTextIndexCreater,
|
||||
log_column_id: Option<u32>,
|
||||
}
|
||||
|
||||
impl SstIndexCreator {
|
||||
@@ -96,6 +104,7 @@ impl SstIndexCreator {
|
||||
intermediate_manager: IntermediateManager,
|
||||
memory_usage_threshold: Option<usize>,
|
||||
segment_row_count: NonZeroUsize,
|
||||
log_column_id: Option<u32>,
|
||||
) -> Self {
|
||||
let temp_file_provider = Arc::new(TempFileProvider::new(
|
||||
IntermediateLocation::new(&metadata.region_id, &sst_file_id),
|
||||
@@ -112,6 +121,12 @@ impl SstIndexCreator {
|
||||
);
|
||||
let index_creator = Box::new(SortIndexCreator::new(sorter, segment_row_count));
|
||||
|
||||
let file_id = file_path.trim_end_matches(".puffin");
|
||||
let full_text_index_path = format!("/tmp/greptimedb/{file_id}/full_text_index");
|
||||
// let full_text_index_creater =
|
||||
// FullTextIndexCreater::new(segment_row_count.get(), full_text_index_path).unwrap();
|
||||
let full_text_index_creater = FullTextIndexCreater::new(1, full_text_index_path).unwrap();
|
||||
|
||||
let codec = IndexValuesCodec::from_tag_columns(metadata.primary_key_columns());
|
||||
Self {
|
||||
file_path,
|
||||
@@ -127,6 +142,9 @@ impl SstIndexCreator {
|
||||
|
||||
ignore_column_ids: HashSet::default(),
|
||||
memory_usage,
|
||||
|
||||
full_text_index_creater,
|
||||
log_column_id,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -233,6 +251,24 @@ impl SstIndexCreator {
|
||||
.context(PushIndexValueSnafu)?;
|
||||
}
|
||||
|
||||
// try find column named "log" and update it into full text index
|
||||
common_telemetry::info!("[DEBUG] do_update: log_column_id: {:?}", self.log_column_id);
|
||||
if let Some(log_column_id) = self.log_column_id {
|
||||
for col in batch.fields() {
|
||||
if col.column_id == log_column_id {
|
||||
let vector = &col.data;
|
||||
if vector.data_type() == ConcreteDataType::string_datatype() {
|
||||
let vector = vector.as_any().downcast_ref::<StringVector>().unwrap();
|
||||
for content in vector.iter_data() {
|
||||
self.full_text_index_creater
|
||||
.push_string(content.unwrap_or_default().to_string())
|
||||
.unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -296,6 +332,8 @@ impl SstIndexCreator {
|
||||
_ => {}
|
||||
}
|
||||
|
||||
self.full_text_index_creater.finish().unwrap();
|
||||
|
||||
let byte_count = puffin_writer.finish().await.context(PuffinFinishSnafu)?;
|
||||
guard.inc_byte_count(byte_count);
|
||||
Ok(())
|
||||
@@ -421,6 +459,7 @@ mod tests {
|
||||
intm_mgr,
|
||||
memory_threshold,
|
||||
NonZeroUsize::new(segment_row_count).unwrap(),
|
||||
None,
|
||||
);
|
||||
|
||||
for (str_tag, i32_tag) in &tags {
|
||||
|
||||
@@ -29,6 +29,10 @@ pub fn index_file_path(region_dir: &str, sst_file_id: FileId) -> String {
|
||||
util::join_path(&dir, &sst_file_id.as_puffin())
|
||||
}
|
||||
|
||||
/// Returns the directory of the full text index under `region_dir`.
pub fn full_text_index_path(region_dir: &str) -> String {
    util::join_dir(region_dir, "full_text_index")
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
@@ -29,7 +29,7 @@ use datafusion_common::arrow::buffer::BooleanBuffer;
|
||||
use datatypes::arrow::record_batch::RecordBatch;
|
||||
use itertools::Itertools;
|
||||
use object_store::ObjectStore;
|
||||
use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, RowSelection};
|
||||
use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, RowSelection, RowSelector};
|
||||
use parquet::arrow::{parquet_to_arrow_field_levels, FieldLevels, ProjectionMask};
|
||||
use parquet::file::metadata::ParquetMetaData;
|
||||
use parquet::format::KeyValue;
|
||||
@@ -50,7 +50,7 @@ use crate::metrics::{
|
||||
use crate::read::{Batch, BatchReader};
|
||||
use crate::row_converter::{McmpRowCodec, RowCodec, SortField};
|
||||
use crate::sst::file::FileHandle;
|
||||
use crate::sst::index::applier::SstIndexApplierRef;
|
||||
use crate::sst::index::applier::{FullTextIndexApplier, SstIndexApplierRef};
|
||||
use crate::sst::parquet::format::ReadFormat;
|
||||
use crate::sst::parquet::metadata::MetadataLoader;
|
||||
use crate::sst::parquet::row_group::InMemoryRowGroup;
|
||||
@@ -77,6 +77,8 @@ pub(crate) struct ParquetReaderBuilder {
|
||||
cache_manager: Option<CacheManagerRef>,
|
||||
/// Index applier.
|
||||
index_applier: Option<SstIndexApplierRef>,
|
||||
|
||||
full_text_index_applier: Option<FullTextIndexApplier>,
|
||||
}
|
||||
|
||||
impl ParquetReaderBuilder {
|
||||
@@ -95,6 +97,7 @@ impl ParquetReaderBuilder {
|
||||
projection: None,
|
||||
cache_manager: None,
|
||||
index_applier: None,
|
||||
full_text_index_applier: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -131,6 +134,15 @@ impl ParquetReaderBuilder {
|
||||
self
|
||||
}
|
||||
|
||||
    /// Sets the optional full text index applier for the reader.
    #[must_use]
    pub fn full_text_index_applier(
        mut self,
        full_text_index_applier: Option<FullTextIndexApplier>,
    ) -> Self {
        self.full_text_index_applier = full_text_index_applier;
        self
    }
|
||||
|
||||
/// Builds and initializes a [ParquetReader].
|
||||
///
|
||||
/// This needs to perform IO operation.
|
||||
@@ -280,10 +292,17 @@ impl ParquetReaderBuilder {
|
||||
}
|
||||
metrics.num_row_groups_before_filtering += num_row_groups;
|
||||
|
||||
if let Some(full_text_index_result) = self.prune_row_groups_by_full_text_index(parquet_meta)
|
||||
{
|
||||
return full_text_index_result;
|
||||
}
|
||||
|
||||
self.prune_row_groups_by_inverted_index(parquet_meta, metrics)
|
||||
.await
|
||||
.or_else(|| self.prune_row_groups_by_minmax(read_format, parquet_meta, metrics))
|
||||
.unwrap_or_else(|| (0..num_row_groups).map(|i| (i, None)).collect())
|
||||
|
||||
// todo: change here
|
||||
}
|
||||
|
||||
/// Applies index to prune row groups.
|
||||
@@ -406,6 +425,59 @@ impl ParquetReaderBuilder {
|
||||
|
||||
Some(row_groups)
|
||||
}
|
||||
|
||||
    /// Prunes row groups using the experimental full text index.
    ///
    /// Returns `None` when no full text applier is configured or the file is
    /// empty; otherwise maps each matching row number to its row group and
    /// builds a per-group `RowSelection` selecting exactly those rows.
    fn prune_row_groups_by_full_text_index(
        &self,
        parquet_meta: &ParquetMetaData,
    ) -> Option<BTreeMap<usize, Option<RowSelection>>> {
        let applier = self.full_text_index_applier.as_ref()?;
        let file_id = self.file_handle.file_id();
        // NOTE(review): an applier failure panics here — consider propagating
        // it instead of unwrapping.
        let mut selected_row = applier.apply(file_id).unwrap();

        common_telemetry::info!("[DEBUG] selected_row: {:?}", selected_row.len());

        // Let's assume that the number of rows in the first row group
        // can represent the `row_group_size` of the Parquet file.
        //
        // If the file contains only one row group, i.e. the number of rows
        // less than the `row_group_size`, the calculation of `row_group_id`
        // and `rg_begin_row_id` is still correct.
        let row_group_size = parquet_meta.row_group(0).num_rows() as usize;
        if row_group_size == 0 {
            return None;
        }

        // translate `selected_row` into row groups selection
        // Sorting guarantees each group's in-group row ids are ascending, so
        // the skip/select arithmetic below never underflows.
        selected_row.sort_unstable();
        let mut row_groups_selected = BTreeMap::new();
        for row_id in selected_row.iter() {
            // Global row number -> (row group, offset within the group).
            let row_group_id = row_id / row_group_size;
            let rg_row_id = row_id % row_group_size;

            row_groups_selected
                .entry(row_group_id)
                .or_insert_with(Vec::new)
                .push(rg_row_id);
        }
        let row_group = row_groups_selected
            .into_iter()
            .map(|(row_group_id, row_ids)| {
                // Emit skip(gap) + select(1) pairs covering each matching row.
                let mut current_row = 0;
                let mut selection = vec![];
                for row_id in row_ids {
                    selection.push(RowSelector::skip(row_id - current_row));
                    selection.push(RowSelector::select(1));
                    current_row = row_id + 1;
                }

                (row_group_id, Some(RowSelection::from(selection)))
            })
            .collect();

        // common_telemetry::info!("[DEBUG] row_group: {:?}", row_group);

        Some(row_group)
    }
|
||||
}
|
||||
|
||||
/// Parquet reader metrics.
|
||||
|
||||
Reference in New Issue
Block a user