Mirror of https://github.com/GreptimeTeam/greptimedb.git, synced 2026-01-07 05:42:57 +00:00

Compare commits: v0.12.0-ni ... windows_pd (9 commits)

| Author | SHA1 | Date |
|---|---|---|
| | 334dbee590 | |
| | f33b378e45 | |
| | 267941bbb5 | |
| | 074846bbc2 | |
| | 88d46a38ae | |
| | de0beabf34 | |
| | 68dd2916fb | |
| | d51b65a8bf | |
| | 2082c4b6e4 | |
@@ -54,7 +54,7 @@ runs:
PROFILE_TARGET: ${{ inputs.cargo-profile == 'dev' && 'debug' || inputs.cargo-profile }}
with:
artifacts-dir: ${{ inputs.artifacts-dir }}
target-file: ./target/$PROFILE_TARGET/greptime
target-files: ./target/$PROFILE_TARGET/greptime
version: ${{ inputs.version }}
working-dir: ${{ inputs.working-dir }}

@@ -72,6 +72,6 @@ runs:
if: ${{ inputs.build-android-artifacts == 'true' }}
with:
artifacts-dir: ${{ inputs.artifacts-dir }}
target-file: ./target/aarch64-linux-android/release/greptime
target-files: ./target/aarch64-linux-android/release/greptime
version: ${{ inputs.version }}
working-dir: ${{ inputs.working-dir }}

@@ -90,5 +90,5 @@ runs:
uses: ./.github/actions/upload-artifacts
with:
artifacts-dir: ${{ inputs.artifacts-dir }}
target-file: target/${{ inputs.arch }}/${{ inputs.cargo-profile }}/greptime
target-files: target/${{ inputs.arch }}/${{ inputs.cargo-profile }}/greptime
version: ${{ inputs.version }}

@@ -76,5 +76,5 @@ runs:
uses: ./.github/actions/upload-artifacts
with:
artifacts-dir: ${{ inputs.artifacts-dir }}
target-file: target/${{ inputs.arch }}/${{ inputs.cargo-profile }}/greptime
target-files: target/${{ inputs.arch }}/${{ inputs.cargo-profile }}/greptime,target/${{ inputs.arch }}/${{ inputs.cargo-profile }}/greptime.pdb
version: ${{ inputs.version }}
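The `PROFILE_TARGET` expression above maps the cargo profile to the directory name under `target/`: the `dev` profile compiles into `target/debug`, while any other profile compiles into a directory of the same name. A tiny Rust sketch of that mapping, for illustration only:

```rust
/// Mirrors the GitHub Actions expression
/// `inputs.cargo-profile == 'dev' && 'debug' || inputs.cargo-profile`.
fn profile_target_dir(cargo_profile: &str) -> &str {
    if cargo_profile == "dev" {
        "debug"
    } else {
        cargo_profile
    }
}

fn main() {
    assert_eq!(profile_target_dir("dev"), "debug");
    assert_eq!(profile_target_dir("release"), "release");
}
```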
14 .github/actions/upload-artifacts/action.yml vendored
@@ -4,8 +4,8 @@ inputs:
artifacts-dir:
description: Directory to store artifacts
required: true
target-file:
description: The path of the target artifact
target-files:
description: The multiple target files to upload, separated by comma
required: false
version:
description: Version of the artifact

@@ -18,12 +18,16 @@ runs:
using: composite
steps:
- name: Create artifacts directory
if: ${{ inputs.target-file != '' }}
if: ${{ inputs.target-files != '' }}
working-directory: ${{ inputs.working-dir }}
shell: bash
run: |
mkdir -p ${{ inputs.artifacts-dir }} && \
cp ${{ inputs.target-file }} ${{ inputs.artifacts-dir }}
set -e
mkdir -p ${{ inputs.artifacts-dir }}
IFS=',' read -ra FILES <<< "${{ inputs.target-files }}"
for file in "${FILES[@]}"; do
cp "$file" ${{ inputs.artifacts-dir }}/
done

# The compressed artifacts will use the following layout:
# greptime-linux-amd64-pyo3-v0.3.0sha256sum
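The rewritten step splits the comma-separated `target-files` input and copies each entry into the artifacts directory. A minimal Rust sketch of the same split-and-copy behaviour (the paths in `main` are hypothetical and only illustrate the real action inputs):

```rust
use std::fs;
use std::io;
use std::path::Path;

/// Copy every comma-separated entry of `target_files` into `artifacts_dir`,
/// mirroring the `IFS=',' read -ra FILES` loop in the composite action above.
fn copy_artifacts(target_files: &str, artifacts_dir: &str) -> io::Result<()> {
    fs::create_dir_all(artifacts_dir)?;
    for file in target_files.split(',').map(str::trim).filter(|f| !f.is_empty()) {
        let name = Path::new(file)
            .file_name()
            .expect("a file path, not a directory");
        fs::copy(file, Path::new(artifacts_dir).join(name))?;
    }
    Ok(())
}

fn main() -> io::Result<()> {
    // Hypothetical inputs; in the action they come from `inputs.target-files`
    // and `inputs.artifacts-dir`.
    copy_artifacts(
        "./target/release/greptime,./target/release/greptime.pdb",
        "artifacts",
    )
}
```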
19 Cargo.lock generated
@@ -4152,6 +4152,7 @@ dependencies = [
|
||||
"futures",
|
||||
"humantime-serde",
|
||||
"lazy_static",
|
||||
"log-query",
|
||||
"log-store",
|
||||
"meta-client",
|
||||
"opentelemetry-proto 0.5.0",
|
||||
@@ -6122,6 +6123,7 @@ dependencies = [
|
||||
"chrono",
|
||||
"common-error",
|
||||
"common-macro",
|
||||
"serde",
|
||||
"snafu 0.8.5",
|
||||
"table",
|
||||
]
|
||||
@@ -7470,8 +7472,7 @@ checksum = "b410bbe7e14ab526a0e86877eb47c6996a2bd7746f027ba551028c925390e4e9"
|
||||
[[package]]
|
||||
name = "opendal"
|
||||
version = "0.50.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cb28bb6c64e116ceaf8dd4e87099d3cfea4a58e85e62b104fef74c91afba0f44"
|
||||
source = "git+https://github.com/GreptimeTeam/opendal.git?rev=c82605177f2feec83e49dcaa537c505639d94024#c82605177f2feec83e49dcaa537c505639d94024"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
@@ -8160,7 +8161,7 @@ dependencies = [
|
||||
"rand",
|
||||
"ring 0.17.8",
|
||||
"rust_decimal",
|
||||
"thiserror 2.0.4",
|
||||
"thiserror 2.0.6",
|
||||
"tokio",
|
||||
"tokio-rustls 0.26.0",
|
||||
"tokio-util",
|
||||
@@ -9098,6 +9099,7 @@ dependencies = [
|
||||
"humantime",
|
||||
"itertools 0.10.5",
|
||||
"lazy_static",
|
||||
"log-query",
|
||||
"meter-core",
|
||||
"meter-macros",
|
||||
"num",
|
||||
@@ -10952,6 +10954,7 @@ dependencies = [
|
||||
"json5",
|
||||
"jsonb",
|
||||
"lazy_static",
|
||||
"log-query",
|
||||
"loki-api",
|
||||
"mime_guess",
|
||||
"mysql_async",
|
||||
@@ -12434,11 +12437,11 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "2.0.4"
|
||||
version = "2.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2f49a1853cf82743e3b7950f77e0f4d622ca36cf4317cba00c767838bac8d490"
|
||||
checksum = "8fec2a1820ebd077e2b90c4df007bebf344cd394098a13c563957d0afc83ea47"
|
||||
dependencies = [
|
||||
"thiserror-impl 2.0.4",
|
||||
"thiserror-impl 2.0.6",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -12454,9 +12457,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "thiserror-impl"
|
||||
version = "2.0.4"
|
||||
version = "2.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8381894bb3efe0c4acac3ded651301ceee58a15d47c2e34885ed1908ad667061"
|
||||
checksum = "d65750cab40f4ff1929fb1ba509e9914eb756131cef4210da8d5d700d26f6312"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
|
||||
@@ -238,6 +238,7 @@ file-engine = { path = "src/file-engine" }
flow = { path = "src/flow" }
frontend = { path = "src/frontend", default-features = false }
index = { path = "src/index" }
log-query = { path = "src/log-query" }
log-store = { path = "src/log-store" }
meta-client = { path = "src/meta-client" }
meta-srv = { path = "src/meta-srv" }
16 README.md
@@ -70,23 +70,23 @@ Our core developers have been building time-series data platforms for years. Bas

* **Unified Processing of Metrics, Logs, and Events**

  GreptimeDB unifies time series data processing by treating all data - whether metrics, logs, or events - as timestamped events with context. Users can analyze this data using either [SQL](https://docs.greptime.com/user-guide/query-data/sql) or [PromQL](https://docs.greptime.com/user-guide/query-data/promql) and leverage stream processing ([Flow](https://docs.greptime.com/user-guide/flow-computation/overview)) to enable continuous aggregation. [Read more](https://docs.greptime.com/user-guide/concepts/data-model).

* **Cloud-native Distributed Database**

  Built for [Kubernetes](https://docs.greptime.com/user-guide/deployments/deploy-on-kubernetes/greptimedb-operator-management). GreptimeDB achieves seamless scalability with its [cloud-native architecture](https://docs.greptime.com/user-guide/concepts/architecture) of separated compute and storage, built on object storage (AWS S3, Azure Blob Storage, etc.) while enabling cross-cloud deployment through a unified data access layer.

* **Performance and Cost-effective**

  Written in pure Rust for superior performance and reliability. GreptimeDB features a distributed query engine with intelligent indexing to handle high cardinality data efficiently. Its optimized columnar storage achieves 50x cost efficiency on cloud object storage through advanced compression. [Benchmark reports](https://www.greptime.com/blogs/2024-09-09-report-summary).

* **Cloud-Edge Collaboration**

  GreptimeDB seamlessly operates across cloud and edge (ARM/Android/Linux), providing consistent APIs and control plane for unified data management and efficient synchronization. [Learn how to run on Android](https://docs.greptime.com/user-guide/deployments/run-on-android/).

* **Multi-protocol Ingestion, SQL & PromQL Ready**

  Widely adopted database protocols and APIs, including MySQL, PostgreSQL, InfluxDB, OpenTelemetry, Loki and Prometheus, etc. Effortless Adoption & Seamless Migration. [Supported Protocols Overview](https://docs.greptime.com/user-guide/protocols/overview).

For more detailed info please read [Why GreptimeDB](https://docs.greptime.com/user-guide/concepts/why-greptimedb).

@@ -138,7 +138,7 @@ Check the prerequisite:

* [Rust toolchain](https://www.rust-lang.org/tools/install) (nightly)
* [Protobuf compiler](https://grpc.io/docs/protoc-installation/) (>= 3.15)
* Python toolchain (optional): Required only if built with PyO3 backend. More detail for compiling with PyO3 can be found in its [documentation](https://pyo3.rs/v0.18.1/building_and_distribution#configuring-the-python-version).
* Python toolchain (optional): Required only if built with PyO3 backend. More details for compiling with PyO3 can be found in its [documentation](https://pyo3.rs/v0.18.1/building_and_distribution#configuring-the-python-version).

Build GreptimeDB binary:

@@ -154,6 +154,10 @@ cargo run -- standalone start

## Tools & Extensions

### Kubernetes

- [GreptimeDB Operator](https://github.com/GrepTimeTeam/greptimedb-operator)

### Dashboard

- [The dashboard UI for GreptimeDB](https://github.com/GreptimeTeam/dashboard)
@@ -25,6 +25,7 @@ pub enum PermissionReq<'a> {
GrpcRequest(&'a Request),
SqlStatement(&'a Statement),
PromQuery,
LogQuery,
Opentsdb,
LineProtocol,
PromStoreWrite,

@@ -38,7 +38,7 @@ pub fn new_table_cache(
) -> TableCache {
let init = init_factory(table_info_cache, table_name_cache);

CacheContainer::new(name, cache, Box::new(invalidator), init, Box::new(filter))
CacheContainer::new(name, cache, Box::new(invalidator), init, filter)
}

fn init_factory(

@@ -26,3 +26,4 @@ pub mod function_registry;
pub mod handlers;
pub mod helper;
pub mod state;
pub mod utils;
@@ -204,20 +204,10 @@ impl PatternAst {
fn convert_literal(column: &str, pattern: &str) -> Expr {
logical_expr::col(column).like(logical_expr::lit(format!(
"%{}%",
Self::escape_pattern(pattern)
crate::utils::escape_like_pattern(pattern)
)))
}

fn escape_pattern(pattern: &str) -> String {
pattern
.chars()
.flat_map(|c| match c {
'\\' | '%' | '_' => vec!['\\', c],
_ => vec![c],
})
.collect::<String>()
}

/// Transform this AST with preset rules to make it correct.
fn transform_ast(self) -> Result<Self> {
self.transform_up(Self::collapse_binary_branch_fn)
58 src/common/function/src/utils.rs Normal file
@@ -0,0 +1,58 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/// Escapes special characters in the provided pattern string for `LIKE`.
///
/// Specifically, it prefixes the backslash (`\`), percent (`%`), and underscore (`_`)
/// characters with an additional backslash to ensure they are treated literally.
///
/// # Examples
///
/// ```rust
/// let escaped = escape_like_pattern("100%_some\\path");
/// assert_eq!(escaped, "100\\%\\_some\\\\path");
/// ```
pub fn escape_like_pattern(pattern: &str) -> String {
    pattern
        .chars()
        .flat_map(|c| match c {
            '\\' | '%' | '_' => vec!['\\', c],
            _ => vec![c],
        })
        .collect::<String>()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_escape_like_pattern() {
        assert_eq!(
            escape_like_pattern("100%_some\\path"),
            "100\\%\\_some\\\\path"
        );
        assert_eq!(escape_like_pattern(""), "");
        assert_eq!(escape_like_pattern("hello"), "hello");
        assert_eq!(escape_like_pattern("\\%_"), "\\\\\\%\\_");
        assert_eq!(escape_like_pattern("%%__\\\\"), "\\%\\%\\_\\_\\\\\\\\");
        assert_eq!(escape_like_pattern("abc123"), "abc123");
        assert_eq!(escape_like_pattern("%_\\"), "\\%\\_\\\\");
        assert_eq!(
            escape_like_pattern("%%__\\\\another%string"),
            "\\%\\%\\_\\_\\\\\\\\another\\%string"
        );
        assert_eq!(escape_like_pattern("foo%bar_"), "foo\\%bar\\_");
        assert_eq!(escape_like_pattern("\\_\\%"), "\\\\\\_\\\\\\%");
    }
}
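For context on how this helper is consumed, here is a small sketch mirroring `PatternAst::convert_literal` above. It assumes `escape_like_pattern` is in scope and that DataFusion's expression builders are available, as in the surrounding code; the helper name `like_contains` is made up for illustration.

```rust
use datafusion_expr::{col, lit, Expr};

// Hypothetical helper mirroring `convert_literal`: escape the user-supplied
// pattern, wrap it in `%...%`, and build a LIKE expression on the column.
fn like_contains(column: &str, pattern: &str) -> Expr {
    col(column).like(lit(format!("%{}%", escape_like_pattern(pattern))))
}
```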
44 src/common/meta/src/cache/container.rs vendored
@@ -43,7 +43,7 @@ pub struct CacheContainer<K, V, CacheToken> {
cache: Cache<K, V>,
invalidator: Invalidator<K, V, CacheToken>,
initializer: Initializer<K, V>,
token_filter: TokenFilter<CacheToken>,
token_filter: fn(&CacheToken) -> bool,
}

impl<K, V, CacheToken> CacheContainer<K, V, CacheToken>

@@ -58,7 +58,7 @@ where
cache: Cache<K, V>,
invalidator: Invalidator<K, V, CacheToken>,
initializer: Initializer<K, V>,
token_filter: TokenFilter<CacheToken>,
token_filter: fn(&CacheToken) -> bool,
) -> Self {
Self {
name,
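The change above replaces the boxed `TokenFilter<CacheToken>` closure with a plain `fn(&CacheToken) -> bool` pointer, so call sites pass a named function such as `always_true_filter` instead of `Box::new(|_| true)`. A minimal, self-contained sketch of the difference (the struct names here are illustrative stand-ins, not the real GreptimeDB types):

```rust
// Before: a heap-allocated trait object, invoked through dynamic dispatch.
struct BoxedFilter<T> {
    token_filter: Box<dyn Fn(&T) -> bool + Send + Sync>,
}

// After: a plain function pointer; no allocation and trivially `Copy`.
struct FnPtrFilter<T> {
    token_filter: fn(&T) -> bool,
}

fn always_true_filter(_: &String) -> bool {
    true
}

fn main() {
    let boxed = BoxedFilter::<String> {
        token_filter: Box::new(|_| true),
    };
    let plain = FnPtrFilter::<String> {
        token_filter: always_true_filter,
    };
    assert!((boxed.token_filter)(&"token".to_string()));
    assert!((plain.token_filter)(&"token".to_string()));
}
```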
@@ -206,10 +206,13 @@ mod tests {
|
||||
name: &'a str,
|
||||
}
|
||||
|
||||
fn always_true_filter(_: &String) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_get() {
|
||||
let cache: Cache<NameKey, String> = CacheBuilder::new(128).build();
|
||||
let filter: TokenFilter<String> = Box::new(|_| true);
|
||||
let counter = Arc::new(AtomicI32::new(0));
|
||||
let moved_counter = counter.clone();
|
||||
let init: Initializer<NameKey, String> = Arc::new(move |_| {
|
||||
@@ -219,7 +222,13 @@ mod tests {
|
||||
let invalidator: Invalidator<NameKey, String, String> =
|
||||
Box::new(|_, _| Box::pin(async { Ok(()) }));
|
||||
|
||||
let adv_cache = CacheContainer::new("test".to_string(), cache, invalidator, init, filter);
|
||||
let adv_cache = CacheContainer::new(
|
||||
"test".to_string(),
|
||||
cache,
|
||||
invalidator,
|
||||
init,
|
||||
always_true_filter,
|
||||
);
|
||||
let key = NameKey { name: "key" };
|
||||
let value = adv_cache.get(key).await.unwrap().unwrap();
|
||||
assert_eq!(value, "hi");
|
||||
@@ -233,7 +242,6 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn test_get_by_ref() {
|
||||
let cache: Cache<String, String> = CacheBuilder::new(128).build();
|
||||
let filter: TokenFilter<String> = Box::new(|_| true);
|
||||
let counter = Arc::new(AtomicI32::new(0));
|
||||
let moved_counter = counter.clone();
|
||||
let init: Initializer<String, String> = Arc::new(move |_| {
|
||||
@@ -243,7 +251,13 @@ mod tests {
|
||||
let invalidator: Invalidator<String, String, String> =
|
||||
Box::new(|_, _| Box::pin(async { Ok(()) }));
|
||||
|
||||
let adv_cache = CacheContainer::new("test".to_string(), cache, invalidator, init, filter);
|
||||
let adv_cache = CacheContainer::new(
|
||||
"test".to_string(),
|
||||
cache,
|
||||
invalidator,
|
||||
init,
|
||||
always_true_filter,
|
||||
);
|
||||
let value = adv_cache.get_by_ref("foo").await.unwrap().unwrap();
|
||||
assert_eq!(value, "hi");
|
||||
let value = adv_cache.get_by_ref("foo").await.unwrap().unwrap();
|
||||
@@ -257,13 +271,18 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn test_get_value_not_exits() {
|
||||
let cache: Cache<String, String> = CacheBuilder::new(128).build();
|
||||
let filter: TokenFilter<String> = Box::new(|_| true);
|
||||
let init: Initializer<String, String> =
|
||||
Arc::new(move |_| Box::pin(async { error::ValueNotExistSnafu {}.fail() }));
|
||||
let invalidator: Invalidator<String, String, String> =
|
||||
Box::new(|_, _| Box::pin(async { Ok(()) }));
|
||||
|
||||
let adv_cache = CacheContainer::new("test".to_string(), cache, invalidator, init, filter);
|
||||
let adv_cache = CacheContainer::new(
|
||||
"test".to_string(),
|
||||
cache,
|
||||
invalidator,
|
||||
init,
|
||||
always_true_filter,
|
||||
);
|
||||
let value = adv_cache.get_by_ref("foo").await.unwrap();
|
||||
assert!(value.is_none());
|
||||
}
|
||||
@@ -271,7 +290,6 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn test_invalidate() {
|
||||
let cache: Cache<String, String> = CacheBuilder::new(128).build();
|
||||
let filter: TokenFilter<String> = Box::new(|_| true);
|
||||
let counter = Arc::new(AtomicI32::new(0));
|
||||
let moved_counter = counter.clone();
|
||||
let init: Initializer<String, String> = Arc::new(move |_| {
|
||||
@@ -285,7 +303,13 @@ mod tests {
|
||||
})
|
||||
});
|
||||
|
||||
let adv_cache = CacheContainer::new("test".to_string(), cache, invalidator, init, filter);
|
||||
let adv_cache = CacheContainer::new(
|
||||
"test".to_string(),
|
||||
cache,
|
||||
invalidator,
|
||||
init,
|
||||
always_true_filter,
|
||||
);
|
||||
let value = adv_cache.get_by_ref("foo").await.unwrap().unwrap();
|
||||
assert_eq!(value, "hi");
|
||||
let value = adv_cache.get_by_ref("foo").await.unwrap().unwrap();
|
||||
|
||||
@@ -45,7 +45,7 @@ pub fn new_table_flownode_set_cache(
|
||||
let table_flow_manager = Arc::new(TableFlowManager::new(kv_backend));
|
||||
let init = init_factory(table_flow_manager);
|
||||
|
||||
CacheContainer::new(name, cache, Box::new(invalidator), init, Box::new(filter))
|
||||
CacheContainer::new(name, cache, Box::new(invalidator), init, filter)
|
||||
}
|
||||
|
||||
fn init_factory(table_flow_manager: TableFlowManagerRef) -> Initializer<TableId, FlownodeSet> {
|
||||
|
||||
22 src/common/meta/src/cache/registry.rs vendored
@@ -151,12 +151,15 @@ mod tests {
|
||||
use crate::cache::*;
|
||||
use crate::instruction::CacheIdent;
|
||||
|
||||
fn always_true_filter(_: &CacheIdent) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
fn test_cache(
|
||||
name: &str,
|
||||
invalidator: Invalidator<String, String, CacheIdent>,
|
||||
) -> CacheContainer<String, String, CacheIdent> {
|
||||
let cache: Cache<String, String> = CacheBuilder::new(128).build();
|
||||
let filter: TokenFilter<CacheIdent> = Box::new(|_| true);
|
||||
let counter = Arc::new(AtomicI32::new(0));
|
||||
let moved_counter = counter.clone();
|
||||
let init: Initializer<String, String> = Arc::new(move |_| {
|
||||
@@ -164,7 +167,13 @@ mod tests {
|
||||
Box::pin(async { Ok(Some("hi".to_string())) })
|
||||
});
|
||||
|
||||
CacheContainer::new(name.to_string(), cache, invalidator, init, filter)
|
||||
CacheContainer::new(
|
||||
name.to_string(),
|
||||
cache,
|
||||
invalidator,
|
||||
init,
|
||||
always_true_filter,
|
||||
)
|
||||
}
|
||||
|
||||
fn test_i32_cache(
|
||||
@@ -172,7 +181,6 @@ mod tests {
|
||||
invalidator: Invalidator<i32, String, CacheIdent>,
|
||||
) -> CacheContainer<i32, String, CacheIdent> {
|
||||
let cache: Cache<i32, String> = CacheBuilder::new(128).build();
|
||||
let filter: TokenFilter<CacheIdent> = Box::new(|_| true);
|
||||
let counter = Arc::new(AtomicI32::new(0));
|
||||
let moved_counter = counter.clone();
|
||||
let init: Initializer<i32, String> = Arc::new(move |_| {
|
||||
@@ -180,7 +188,13 @@ mod tests {
|
||||
Box::pin(async { Ok(Some("foo".to_string())) })
|
||||
});
|
||||
|
||||
CacheContainer::new(name.to_string(), cache, invalidator, init, filter)
|
||||
CacheContainer::new(
|
||||
name.to_string(),
|
||||
cache,
|
||||
invalidator,
|
||||
init,
|
||||
always_true_filter,
|
||||
)
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
||||
2 src/common/meta/src/cache/table/schema.rs vendored
@@ -36,7 +36,7 @@ pub fn new_schema_cache(
|
||||
let schema_manager = SchemaManager::new(kv_backend.clone());
|
||||
let init = init_factory(schema_manager);
|
||||
|
||||
CacheContainer::new(name, cache, Box::new(invalidator), init, Box::new(filter))
|
||||
CacheContainer::new(name, cache, Box::new(invalidator), init, filter)
|
||||
}
|
||||
|
||||
fn init_factory(schema_manager: SchemaManager) -> Initializer<SchemaName, Arc<SchemaNameValue>> {
|
||||
|
||||
@@ -41,7 +41,7 @@ pub fn new_table_info_cache(
|
||||
let table_info_manager = Arc::new(TableInfoManager::new(kv_backend));
|
||||
let init = init_factory(table_info_manager);
|
||||
|
||||
CacheContainer::new(name, cache, Box::new(invalidator), init, Box::new(filter))
|
||||
CacheContainer::new(name, cache, Box::new(invalidator), init, filter)
|
||||
}
|
||||
|
||||
fn init_factory(table_info_manager: TableInfoManagerRef) -> Initializer<TableId, Arc<TableInfo>> {
|
||||
|
||||
@@ -41,7 +41,7 @@ pub fn new_table_name_cache(
|
||||
let table_name_manager = Arc::new(TableNameManager::new(kv_backend));
|
||||
let init = init_factory(table_name_manager);
|
||||
|
||||
CacheContainer::new(name, cache, Box::new(invalidator), init, Box::new(filter))
|
||||
CacheContainer::new(name, cache, Box::new(invalidator), init, filter)
|
||||
}
|
||||
|
||||
fn init_factory(table_name_manager: TableNameManagerRef) -> Initializer<TableName, TableId> {
|
||||
|
||||
@@ -65,7 +65,7 @@ pub fn new_table_route_cache(
|
||||
let table_info_manager = Arc::new(TableRouteManager::new(kv_backend));
|
||||
let init = init_factory(table_info_manager);
|
||||
|
||||
CacheContainer::new(name, cache, Box::new(invalidator), init, Box::new(filter))
|
||||
CacheContainer::new(name, cache, Box::new(invalidator), init, filter)
|
||||
}
|
||||
|
||||
fn init_factory(
|
||||
|
||||
@@ -40,7 +40,7 @@ pub fn new_table_schema_cache(
|
||||
let table_info_manager = TableInfoManager::new(kv_backend);
|
||||
let init = init_factory(table_info_manager);
|
||||
|
||||
CacheContainer::new(name, cache, Box::new(invalidator), init, Box::new(filter))
|
||||
CacheContainer::new(name, cache, Box::new(invalidator), init, filter)
|
||||
}
|
||||
|
||||
fn init_factory(table_info_manager: TableInfoManager) -> Initializer<TableId, Arc<SchemaName>> {
|
||||
|
||||
2 src/common/meta/src/cache/table/view_info.rs vendored
@@ -40,7 +40,7 @@ pub fn new_view_info_cache(
|
||||
let view_info_manager = Arc::new(ViewInfoManager::new(kv_backend));
|
||||
let init = init_factory(view_info_manager);
|
||||
|
||||
CacheContainer::new(name, cache, Box::new(invalidator), init, Box::new(filter))
|
||||
CacheContainer::new(name, cache, Box::new(invalidator), init, filter)
|
||||
}
|
||||
|
||||
fn init_factory(view_info_manager: ViewInfoManagerRef) -> Initializer<TableId, Arc<ViewInfoValue>> {
|
||||
|
||||
@@ -13,7 +13,6 @@
|
||||
// limitations under the License.
|
||||
|
||||
use std::fmt::Display;
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_catalog::consts::DEFAULT_CATALOG_NAME;
|
||||
use futures::stream::BoxStream;
|
||||
@@ -146,7 +145,7 @@ impl CatalogManager {
|
||||
self.kv_backend.clone(),
|
||||
req,
|
||||
DEFAULT_PAGE_SIZE,
|
||||
Arc::new(catalog_decoder),
|
||||
catalog_decoder,
|
||||
)
|
||||
.into_stream();
|
||||
|
||||
@@ -156,6 +155,8 @@ impl CatalogManager {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use super::*;
|
||||
use crate::kv_backend::memory::MemoryKvBackend;
|
||||
|
||||
|
||||
@@ -14,7 +14,6 @@
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::fmt::Display;
|
||||
use std::sync::Arc;
|
||||
|
||||
use futures::stream::BoxStream;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -166,7 +165,7 @@ impl DatanodeTableManager {
|
||||
self.kv_backend.clone(),
|
||||
req,
|
||||
DEFAULT_PAGE_SIZE,
|
||||
Arc::new(datanode_table_value_decoder),
|
||||
datanode_table_value_decoder,
|
||||
)
|
||||
.into_stream();
|
||||
|
||||
|
||||
@@ -12,8 +12,6 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use futures::stream::BoxStream;
|
||||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
@@ -201,7 +199,7 @@ impl FlowNameManager {
|
||||
self.kv_backend.clone(),
|
||||
req,
|
||||
DEFAULT_PAGE_SIZE,
|
||||
Arc::new(flow_name_decoder),
|
||||
flow_name_decoder,
|
||||
)
|
||||
.into_stream();
|
||||
|
||||
|
||||
@@ -12,8 +12,6 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use futures::stream::BoxStream;
|
||||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
@@ -179,7 +177,7 @@ impl FlowRouteManager {
|
||||
self.kv_backend.clone(),
|
||||
req,
|
||||
DEFAULT_PAGE_SIZE,
|
||||
Arc::new(flow_route_decoder),
|
||||
flow_route_decoder,
|
||||
)
|
||||
.into_stream();
|
||||
|
||||
|
||||
@@ -12,8 +12,6 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use futures::stream::BoxStream;
|
||||
use futures::TryStreamExt;
|
||||
use lazy_static::lazy_static;
|
||||
@@ -179,7 +177,7 @@ impl FlownodeFlowManager {
|
||||
self.kv_backend.clone(),
|
||||
req,
|
||||
DEFAULT_PAGE_SIZE,
|
||||
Arc::new(flownode_flow_key_decoder),
|
||||
flownode_flow_key_decoder,
|
||||
)
|
||||
.into_stream();
|
||||
|
||||
|
||||
@@ -206,7 +206,7 @@ impl TableFlowManager {
|
||||
self.kv_backend.clone(),
|
||||
req,
|
||||
DEFAULT_PAGE_SIZE,
|
||||
Arc::new(table_flow_decoder),
|
||||
table_flow_decoder,
|
||||
)
|
||||
.into_stream();
|
||||
|
||||
|
||||
@@ -14,7 +14,6 @@
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::fmt::Display;
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
|
||||
use common_time::DatabaseTimeToLive;
|
||||
@@ -283,7 +282,7 @@ impl SchemaManager {
|
||||
self.kv_backend.clone(),
|
||||
req,
|
||||
DEFAULT_PAGE_SIZE,
|
||||
Arc::new(schema_decoder),
|
||||
schema_decoder,
|
||||
)
|
||||
.into_stream();
|
||||
|
||||
@@ -308,6 +307,7 @@ impl<'a> From<&'a SchemaName> for SchemaNameKey<'a> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use super::*;
|
||||
|
||||
@@ -269,7 +269,7 @@ impl TableNameManager {
|
||||
self.kv_backend.clone(),
|
||||
req,
|
||||
DEFAULT_PAGE_SIZE,
|
||||
Arc::new(table_decoder),
|
||||
table_decoder,
|
||||
)
|
||||
.into_stream();
|
||||
|
||||
|
||||
@@ -12,8 +12,6 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use async_stream::try_stream;
|
||||
use common_telemetry::debug;
|
||||
use futures::Stream;
|
||||
@@ -148,7 +146,7 @@ impl PaginationStreamFactory {
|
||||
}
|
||||
|
||||
pub struct PaginationStream<T> {
|
||||
decoder_fn: Arc<KeyValueDecoderFn<T>>,
|
||||
decoder_fn: fn(KeyValue) -> Result<T>,
|
||||
factory: PaginationStreamFactory,
|
||||
}
|
||||
|
||||
@@ -158,7 +156,7 @@ impl<T> PaginationStream<T> {
|
||||
kv: KvBackendRef,
|
||||
req: RangeRequest,
|
||||
page_size: usize,
|
||||
decoder_fn: Arc<KeyValueDecoderFn<T>>,
|
||||
decoder_fn: fn(KeyValue) -> Result<T>,
|
||||
) -> Self {
|
||||
Self {
|
||||
decoder_fn,
|
||||
@@ -191,6 +189,7 @@ mod tests {
|
||||
|
||||
use std::assert_matches::assert_matches;
|
||||
use std::collections::BTreeMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
use futures::TryStreamExt;
|
||||
|
||||
@@ -250,7 +249,7 @@ mod tests {
|
||||
..Default::default()
|
||||
},
|
||||
DEFAULT_PAGE_SIZE,
|
||||
Arc::new(decoder),
|
||||
decoder,
|
||||
)
|
||||
.into_stream();
|
||||
let kv = stream.try_collect::<Vec<_>>().await.unwrap();
|
||||
@@ -290,7 +289,7 @@ mod tests {
|
||||
..Default::default()
|
||||
},
|
||||
2,
|
||||
Arc::new(decoder),
|
||||
decoder,
|
||||
);
|
||||
let kv = stream
|
||||
.into_stream()
|
||||
|
||||
@@ -12,8 +12,6 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use common_error::ext::BoxedError;
|
||||
use common_procedure::error::{DeleteStatesSnafu, ListStateSnafu, PutStateSnafu};
|
||||
@@ -171,7 +169,7 @@ impl StateStore for KvStateStore {
|
||||
self.kv_backend.clone(),
|
||||
req,
|
||||
self.max_num_per_range_request.unwrap_or_default(),
|
||||
Arc::new(decode_kv),
|
||||
decode_kv,
|
||||
)
|
||||
.into_stream();
|
||||
|
||||
|
||||
@@ -41,6 +41,7 @@ datafusion-expr.workspace = true
|
||||
datanode.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
lazy_static.workspace = true
|
||||
log-query.workspace = true
|
||||
log-store.workspace = true
|
||||
meta-client.workspace = true
|
||||
opentelemetry-proto.workspace = true
|
||||
|
||||
@@ -16,6 +16,7 @@ pub mod builder;
|
||||
mod grpc;
|
||||
mod influxdb;
|
||||
mod log_handler;
|
||||
mod logs;
|
||||
mod opentsdb;
|
||||
mod otlp;
|
||||
mod prom_store;
|
||||
@@ -64,8 +65,8 @@ use servers::prometheus_handler::PrometheusHandler;
|
||||
use servers::query_handler::grpc::GrpcQueryHandler;
|
||||
use servers::query_handler::sql::SqlQueryHandler;
|
||||
use servers::query_handler::{
|
||||
InfluxdbLineProtocolHandler, OpenTelemetryProtocolHandler, OpentsdbProtocolHandler,
|
||||
PipelineHandler, PromStoreProtocolHandler, ScriptHandler,
|
||||
InfluxdbLineProtocolHandler, LogQueryHandler, OpenTelemetryProtocolHandler,
|
||||
OpentsdbProtocolHandler, PipelineHandler, PromStoreProtocolHandler, ScriptHandler,
|
||||
};
|
||||
use servers::server::ServerHandlers;
|
||||
use session::context::QueryContextRef;
|
||||
@@ -99,6 +100,7 @@ pub trait FrontendInstance:
|
||||
+ ScriptHandler
|
||||
+ PrometheusHandler
|
||||
+ PipelineHandler
|
||||
+ LogQueryHandler
|
||||
+ Send
|
||||
+ Sync
|
||||
+ 'static
|
||||
|
||||
67 src/frontend/src/instance/logs.rs Normal file
@@ -0,0 +1,67 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
|
||||
use client::Output;
|
||||
use common_error::ext::BoxedError;
|
||||
use log_query::LogQuery;
|
||||
use server_error::Result as ServerResult;
|
||||
use servers::error::{self as server_error, AuthSnafu, ExecuteQuerySnafu};
|
||||
use servers::interceptor::{LogQueryInterceptor, LogQueryInterceptorRef};
|
||||
use servers::query_handler::LogQueryHandler;
|
||||
use session::context::QueryContextRef;
|
||||
use snafu::ResultExt;
|
||||
use tonic::async_trait;
|
||||
|
||||
use super::Instance;
|
||||
|
||||
#[async_trait]
|
||||
impl LogQueryHandler for Instance {
|
||||
async fn query(&self, mut request: LogQuery, ctx: QueryContextRef) -> ServerResult<Output> {
|
||||
let interceptor = self
|
||||
.plugins
|
||||
.get::<LogQueryInterceptorRef<server_error::Error>>();
|
||||
|
||||
self.plugins
|
||||
.get::<PermissionCheckerRef>()
|
||||
.as_ref()
|
||||
.check_permission(ctx.current_user(), PermissionReq::LogQuery)
|
||||
.context(AuthSnafu)?;
|
||||
|
||||
interceptor.as_ref().pre_query(&request, ctx.clone())?;
|
||||
|
||||
request
|
||||
.time_filter
|
||||
.canonicalize()
|
||||
.map_err(BoxedError::new)
|
||||
.context(ExecuteQuerySnafu)?;
|
||||
|
||||
let plan = self
|
||||
.query_engine
|
||||
.planner()
|
||||
.plan_logs_query(request, ctx.clone())
|
||||
.await
|
||||
.map_err(BoxedError::new)
|
||||
.context(ExecuteQuerySnafu)?;
|
||||
|
||||
let output = self
|
||||
.statement_executor
|
||||
.exec_plan(plan, ctx.clone())
|
||||
.await
|
||||
.map_err(BoxedError::new)
|
||||
.context(ExecuteQuerySnafu)?;
|
||||
|
||||
Ok(interceptor.as_ref().post_query(output, ctx.clone())?)
|
||||
}
|
||||
}
|
||||
@@ -87,6 +87,7 @@ where
|
||||
let ingest_interceptor = self.plugins.get::<LogIngestInterceptorRef<ServerError>>();
|
||||
builder =
|
||||
builder.with_log_ingest_handler(self.instance.clone(), validator, ingest_interceptor);
|
||||
builder = builder.with_logs_handler(self.instance.clone());
|
||||
|
||||
if let Some(user_provider) = self.plugins.get::<UserProviderRef>() {
|
||||
builder = builder.with_user_provider(user_provider);
|
||||
|
||||
@@ -31,12 +31,21 @@ mod footer;
/// InvertedIndexReader defines an asynchronous reader of inverted index data
#[mockall::automock]
#[async_trait]
pub trait InvertedIndexReader: Send {
pub trait InvertedIndexReader: Send + Sync {
    /// Seeks to given offset and reads data with exact size as provided.
    async fn range_read(&mut self, offset: u64, size: u32) -> Result<Vec<u8>>;

    /// Reads the bytes in the given ranges.
    async fn read_vec(&mut self, ranges: &[Range<u64>]) -> Result<Vec<Bytes>>;
    async fn read_vec(&mut self, ranges: &[Range<u64>]) -> Result<Vec<Bytes>> {
        let mut result = Vec::with_capacity(ranges.len());
        for range in ranges {
            let data = self
                .range_read(range.start, (range.end - range.start) as u32)
                .await?;
            result.push(Bytes::from(data));
        }
        Ok(result)
    }

    /// Retrieves metadata of all inverted indices stored within the blob.
    async fn metadata(&mut self) -> Result<Arc<InvertedIndexMetas>>;
@@ -51,7 +51,7 @@ impl<R> InvertedIndexBlobReader<R> {
}

#[async_trait]
impl<R: RangeReader> InvertedIndexReader for InvertedIndexBlobReader<R> {
impl<R: RangeReader + Sync> InvertedIndexReader for InvertedIndexBlobReader<R> {
    async fn range_read(&mut self, offset: u64, size: u32) -> Result<Vec<u8>> {
        let buf = self
            .source
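With the default body above, `read_vec` becomes a provided method built on the required `range_read`, so an implementor only overrides it when it has a genuinely vectored read path. A self-contained sketch of that pattern (synchronous and using plain `Vec<u8>` instead of the real async trait and `Bytes`, purely for illustration):

```rust
use std::ops::Range;

// Illustrative stand-in for the trait above: `read_vec` has a default
// implementation expressed in terms of the required `range_read`.
trait RangeRead {
    fn range_read(&mut self, offset: u64, size: u32) -> Vec<u8>;

    fn read_vec(&mut self, ranges: &[Range<u64>]) -> Vec<Vec<u8>> {
        ranges
            .iter()
            .map(|r| self.range_read(r.start, (r.end - r.start) as u32))
            .collect()
    }
}

struct InMemory(Vec<u8>);

impl RangeRead for InMemory {
    fn range_read(&mut self, offset: u64, size: u32) -> Vec<u8> {
        let start = offset as usize;
        self.0[start..start + size as usize].to_vec()
    }
}

fn main() {
    let mut reader = InMemory((0u8..16).collect());
    // The default `read_vec` stitches together two `range_read` calls.
    assert_eq!(reader.read_vec(&[0..2, 4..6]), vec![vec![0, 1], vec![4, 5]]);
}
```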
@@ -11,5 +11,6 @@ workspace = true
|
||||
chrono.workspace = true
|
||||
common-error.workspace = true
|
||||
common-macro.workspace = true
|
||||
serde.workspace = true
|
||||
snafu.workspace = true
|
||||
table.workspace = true
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
use std::any::Any;
|
||||
|
||||
use common_error::ext::ErrorExt;
|
||||
use common_error::status_code::StatusCode;
|
||||
use common_macro::stack_trace_debug;
|
||||
use snafu::Snafu;
|
||||
|
||||
@@ -41,6 +42,15 @@ impl ErrorExt for Error {
|
||||
fn as_any(&self) -> &dyn Any {
|
||||
self
|
||||
}
|
||||
|
||||
fn status_code(&self) -> StatusCode {
|
||||
match self {
|
||||
Error::InvalidTimeFilter { .. }
|
||||
| Error::InvalidDateFormat { .. }
|
||||
| Error::InvalidSpanFormat { .. }
|
||||
| Error::EndBeforeStart { .. } => StatusCode::InvalidArguments,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub type Result<T> = std::result::Result<T, Error>;
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
// limitations under the License.
|
||||
|
||||
use chrono::{DateTime, Datelike, Duration, NaiveDate, NaiveTime, TimeZone, Utc};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use table::table_name::TableName;
|
||||
|
||||
use crate::error::{
|
||||
@@ -21,9 +22,10 @@ use crate::error::{
|
||||
};
|
||||
|
||||
/// GreptimeDB's log query request.
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct LogQuery {
|
||||
/// A fully qualified table name to query logs from.
|
||||
pub table_name: TableName,
|
||||
pub table: TableName,
|
||||
/// Specifies the time range for the log query. See [`TimeFilter`] for more details.
|
||||
pub time_filter: TimeFilter,
|
||||
/// Columns with filters to query.
|
||||
@@ -34,6 +36,18 @@ pub struct LogQuery {
|
||||
pub context: Context,
|
||||
}
|
||||
|
||||
impl Default for LogQuery {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
table: TableName::new("", "", ""),
|
||||
time_filter: Default::default(),
|
||||
columns: vec![],
|
||||
limit: None,
|
||||
context: Default::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Represents a time range for log query.
|
||||
///
|
||||
/// This struct allows various formats to express a time range from the user side
|
||||
@@ -58,7 +72,7 @@ pub struct LogQuery {
|
||||
///
|
||||
/// This struct doesn't require a timezone to be presented. When the timezone is not
|
||||
/// provided, it will fill the default timezone with the same rules akin to other queries.
|
||||
#[derive(Debug, Clone)]
|
||||
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
|
||||
pub struct TimeFilter {
|
||||
pub start: Option<String>,
|
||||
pub end: Option<String>,
|
||||
@@ -69,8 +83,7 @@ impl TimeFilter {
|
||||
/// Validate and canonicalize the time filter.
|
||||
///
|
||||
/// This function will try to fill the missing fields and convert all dates to timestamps
|
||||
// false positive
|
||||
#[allow(unused_assignments)]
|
||||
#[allow(unused_assignments)] // false positive
|
||||
pub fn canonicalize(&mut self) -> Result<()> {
|
||||
let mut start_dt = None;
|
||||
let mut end_dt = None;
|
||||
@@ -209,6 +222,7 @@ impl TimeFilter {
|
||||
}
|
||||
|
||||
/// Represents a column with filters to query.
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct ColumnFilters {
|
||||
/// Case-sensitive column name to query.
|
||||
pub column_name: String,
|
||||
@@ -216,6 +230,7 @@ pub struct ColumnFilters {
|
||||
pub filters: Vec<ContentFilter>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub enum ContentFilter {
|
||||
/// Only match the exact content.
|
||||
///
|
||||
@@ -234,13 +249,16 @@ pub enum ContentFilter {
|
||||
Compound(Vec<ContentFilter>, BinaryOperator),
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub enum BinaryOperator {
|
||||
And,
|
||||
Or,
|
||||
}
|
||||
|
||||
/// Controls how many adjacent lines to return.
|
||||
#[derive(Debug, Default, Serialize, Deserialize)]
|
||||
pub enum Context {
|
||||
#[default]
|
||||
None,
|
||||
/// Specify the number of lines before and after the matched line separately.
|
||||
Lines(usize, usize),
|
||||
|
||||
@@ -326,8 +326,8 @@ impl ClusterInfo for MetaClient {
|
||||
let cluster_kv_backend = Arc::new(self.cluster_client()?);
|
||||
let range_prefix = DatanodeStatKey::key_prefix_with_cluster_id(self.id.0);
|
||||
let req = RangeRequest::new().with_prefix(range_prefix);
|
||||
let stream = PaginationStream::new(cluster_kv_backend, req, 256, Arc::new(decode_stats))
|
||||
.into_stream();
|
||||
let stream =
|
||||
PaginationStream::new(cluster_kv_backend, req, 256, decode_stats).into_stream();
|
||||
let mut datanode_stats = stream
|
||||
.try_collect::<Vec<_>>()
|
||||
.await
|
||||
@@ -994,8 +994,7 @@ mod tests {
|
||||
|
||||
let req = RangeRequest::new().with_prefix(b"__prefix/");
|
||||
let stream =
|
||||
PaginationStream::new(Arc::new(cluster_client), req, 10, Arc::new(mock_decoder))
|
||||
.into_stream();
|
||||
PaginationStream::new(Arc::new(cluster_client), req, 10, mock_decoder).into_stream();
|
||||
|
||||
let res = stream.try_collect::<Vec<_>>().await.unwrap();
|
||||
assert_eq!(10, res.len());
|
||||
|
||||
@@ -102,7 +102,7 @@ impl LeaderCachedKvBackend {
|
||||
self.store.clone(),
|
||||
RangeRequest::new().with_prefix(prefix.as_bytes()),
|
||||
DEFAULT_PAGE_SIZE,
|
||||
Arc::new(Ok),
|
||||
Ok,
|
||||
)
|
||||
.into_stream();
|
||||
|
||||
|
||||
@@ -37,7 +37,7 @@ use store_api::storage::{ConcreteDataType, RegionId, TimeSeriesRowSelector};
|
||||
|
||||
use crate::cache::cache_size::parquet_meta_size;
|
||||
use crate::cache::file_cache::{FileType, IndexKey};
|
||||
use crate::cache::index::{InvertedIndexCache, InvertedIndexCacheRef};
|
||||
use crate::cache::index::inverted_index::{InvertedIndexCache, InvertedIndexCacheRef};
|
||||
use crate::cache::write_cache::WriteCacheRef;
|
||||
use crate::metrics::{CACHE_BYTES, CACHE_EVICTION, CACHE_HIT, CACHE_MISS};
|
||||
use crate::read::Batch;
|
||||
|
||||
530 src/mito2/src/cache/index.rs vendored
@@ -12,168 +12,29 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
pub mod inverted_index;
|
||||
|
||||
use std::future::Future;
|
||||
use std::hash::Hash;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
|
||||
use api::v1::index::InvertedIndexMetas;
|
||||
use async_trait::async_trait;
|
||||
use bytes::Bytes;
|
||||
use common_base::BitVec;
|
||||
use index::inverted_index::error::DecodeFstSnafu;
|
||||
use index::inverted_index::format::reader::InvertedIndexReader;
|
||||
use index::inverted_index::FstMap;
|
||||
use object_store::Buffer;
|
||||
use prost::Message;
|
||||
use snafu::ResultExt;
|
||||
|
||||
use crate::metrics::{CACHE_BYTES, CACHE_HIT, CACHE_MISS};
|
||||
use crate::sst::file::FileId;
|
||||
|
||||
/// Metrics for index metadata.
|
||||
const INDEX_METADATA_TYPE: &str = "index_metadata";
|
||||
/// Metrics for index content.
|
||||
const INDEX_CONTENT_TYPE: &str = "index_content";
|
||||
|
||||
/// Inverted index blob reader with cache.
|
||||
pub struct CachedInvertedIndexBlobReader<R> {
|
||||
file_id: FileId,
|
||||
file_size: u64,
|
||||
inner: R,
|
||||
cache: InvertedIndexCacheRef,
|
||||
}
|
||||
|
||||
impl<R> CachedInvertedIndexBlobReader<R> {
|
||||
pub fn new(file_id: FileId, file_size: u64, inner: R, cache: InvertedIndexCacheRef) -> Self {
|
||||
Self {
|
||||
file_id,
|
||||
file_size,
|
||||
inner,
|
||||
cache,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<R> CachedInvertedIndexBlobReader<R>
|
||||
where
|
||||
R: InvertedIndexReader,
|
||||
{
|
||||
/// Gets given range of index data from cache, and loads from source if the file
|
||||
/// is not already cached.
|
||||
async fn get_or_load(
|
||||
&mut self,
|
||||
offset: u64,
|
||||
size: u32,
|
||||
) -> index::inverted_index::error::Result<Vec<u8>> {
|
||||
let keys =
|
||||
IndexDataPageKey::generate_page_keys(self.file_id, offset, size, self.cache.page_size);
|
||||
// Size is 0, return empty data.
|
||||
if keys.is_empty() {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
let mut data = Vec::with_capacity(keys.len());
|
||||
data.resize(keys.len(), Bytes::new());
|
||||
let mut cache_miss_range = vec![];
|
||||
let mut cache_miss_idx = vec![];
|
||||
let last_index = keys.len() - 1;
|
||||
// TODO: Avoid copy as much as possible.
|
||||
for (i, index) in keys.iter().enumerate() {
|
||||
match self.cache.get_index(index) {
|
||||
Some(page) => {
|
||||
CACHE_HIT.with_label_values(&[INDEX_CONTENT_TYPE]).inc();
|
||||
data[i] = page;
|
||||
}
|
||||
None => {
|
||||
CACHE_MISS.with_label_values(&[INDEX_CONTENT_TYPE]).inc();
|
||||
let base_offset = index.page_id * self.cache.page_size;
|
||||
let pruned_size = if i == last_index {
|
||||
prune_size(&keys, self.file_size, self.cache.page_size)
|
||||
} else {
|
||||
self.cache.page_size
|
||||
};
|
||||
cache_miss_range.push(base_offset..base_offset + pruned_size);
|
||||
cache_miss_idx.push(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
if !cache_miss_range.is_empty() {
|
||||
let pages = self.inner.read_vec(&cache_miss_range).await?;
|
||||
for (i, page) in cache_miss_idx.into_iter().zip(pages.into_iter()) {
|
||||
let key = keys[i].clone();
|
||||
data[i] = page.clone();
|
||||
self.cache.put_index(key, page.clone());
|
||||
}
|
||||
}
|
||||
let buffer = Buffer::from_iter(data.into_iter());
|
||||
Ok(buffer
|
||||
.slice(IndexDataPageKey::calculate_range(
|
||||
offset,
|
||||
size,
|
||||
self.cache.page_size,
|
||||
))
|
||||
.to_vec())
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl<R: InvertedIndexReader> InvertedIndexReader for CachedInvertedIndexBlobReader<R> {
|
||||
async fn range_read(
|
||||
&mut self,
|
||||
offset: u64,
|
||||
size: u32,
|
||||
) -> index::inverted_index::error::Result<Vec<u8>> {
|
||||
self.inner.range_read(offset, size).await
|
||||
}
|
||||
|
||||
async fn read_vec(
|
||||
&mut self,
|
||||
ranges: &[Range<u64>],
|
||||
) -> index::inverted_index::error::Result<Vec<Bytes>> {
|
||||
self.inner.read_vec(ranges).await
|
||||
}
|
||||
|
||||
async fn metadata(&mut self) -> index::inverted_index::error::Result<Arc<InvertedIndexMetas>> {
|
||||
if let Some(cached) = self.cache.get_index_metadata(self.file_id) {
|
||||
CACHE_HIT.with_label_values(&[INDEX_METADATA_TYPE]).inc();
|
||||
Ok(cached)
|
||||
} else {
|
||||
let meta = self.inner.metadata().await?;
|
||||
self.cache.put_index_metadata(self.file_id, meta.clone());
|
||||
CACHE_MISS.with_label_values(&[INDEX_METADATA_TYPE]).inc();
|
||||
Ok(meta)
|
||||
}
|
||||
}
|
||||
|
||||
async fn fst(
|
||||
&mut self,
|
||||
offset: u64,
|
||||
size: u32,
|
||||
) -> index::inverted_index::error::Result<FstMap> {
|
||||
self.get_or_load(offset, size)
|
||||
.await
|
||||
.and_then(|r| FstMap::new(r).context(DecodeFstSnafu))
|
||||
}
|
||||
|
||||
async fn bitmap(
|
||||
&mut self,
|
||||
offset: u64,
|
||||
size: u32,
|
||||
) -> index::inverted_index::error::Result<BitVec> {
|
||||
self.get_or_load(offset, size).await.map(BitVec::from_vec)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
pub struct IndexMetadataKey {
|
||||
file_id: FileId,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
pub struct IndexDataPageKey {
|
||||
file_id: FileId,
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub struct PageKey {
|
||||
page_id: u64,
|
||||
}
|
||||
|
||||
impl IndexDataPageKey {
|
||||
impl PageKey {
|
||||
/// Converts an offset to a page ID based on the page size.
|
||||
fn calculate_page_id(offset: u64, page_size: u64) -> u64 {
|
||||
offset / page_size
|
||||
@@ -199,49 +60,60 @@ impl IndexDataPageKey {
|
||||
start..end
|
||||
}
|
||||
|
||||
/// Generates a vector of IndexKey instances for the pages that a given offset and size span.
|
||||
fn generate_page_keys(file_id: FileId, offset: u64, size: u32, page_size: u64) -> Vec<Self> {
|
||||
/// Generates a iterator of `IndexKey` for the pages that a given offset and size span.
|
||||
fn generate_page_keys(offset: u64, size: u32, page_size: u64) -> impl Iterator<Item = Self> {
|
||||
let start_page = Self::calculate_page_id(offset, page_size);
|
||||
let total_pages = Self::calculate_page_count(offset, size, page_size);
|
||||
(0..total_pages)
|
||||
.map(|i| Self {
|
||||
file_id,
|
||||
page_id: start_page + i as u64,
|
||||
})
|
||||
.collect()
|
||||
(0..total_pages).map(move |i| Self {
|
||||
page_id: start_page + i as u64,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub type InvertedIndexCacheRef = Arc<InvertedIndexCache>;
|
||||
|
||||
pub struct InvertedIndexCache {
|
||||
/// Cache for inverted index metadata
|
||||
index_metadata: moka::sync::Cache<IndexMetadataKey, Arc<InvertedIndexMetas>>,
|
||||
/// Cache for inverted index content.
|
||||
index: moka::sync::Cache<IndexDataPageKey, Bytes>,
|
||||
/// Cache for index metadata and content.
|
||||
pub struct IndexCache<K, M> {
|
||||
/// Cache for index metadata
|
||||
index_metadata: moka::sync::Cache<K, Arc<M>>,
|
||||
/// Cache for index content.
|
||||
index: moka::sync::Cache<(K, PageKey), Bytes>,
|
||||
// Page size for index content.
|
||||
page_size: u64,
|
||||
|
||||
/// Weighter for metadata.
|
||||
weight_of_metadata: fn(&K, &Arc<M>) -> u32,
|
||||
/// Weighter for content.
|
||||
weight_of_content: fn(&(K, PageKey), &Bytes) -> u32,
|
||||
}
|
||||
|
||||
impl InvertedIndexCache {
|
||||
/// Creates `InvertedIndexCache` with provided `index_metadata_cap` and `index_content_cap`.
|
||||
pub fn new(index_metadata_cap: u64, index_content_cap: u64, page_size: u64) -> Self {
|
||||
common_telemetry::debug!("Building InvertedIndexCache with metadata size: {index_metadata_cap}, content size: {index_content_cap}");
|
||||
impl<K, M> IndexCache<K, M>
|
||||
where
|
||||
K: Hash + Eq + Send + Sync + 'static,
|
||||
M: Send + Sync + 'static,
|
||||
{
|
||||
pub fn new_with_weighter(
|
||||
index_metadata_cap: u64,
|
||||
index_content_cap: u64,
|
||||
page_size: u64,
|
||||
index_type: &'static str,
|
||||
weight_of_metadata: fn(&K, &Arc<M>) -> u32,
|
||||
weight_of_content: fn(&(K, PageKey), &Bytes) -> u32,
|
||||
) -> Self {
|
||||
common_telemetry::debug!("Building IndexCache with metadata size: {index_metadata_cap}, content size: {index_content_cap}, page size: {page_size}, index type: {index_type}");
|
||||
let index_metadata = moka::sync::CacheBuilder::new(index_metadata_cap)
|
||||
.name("inverted_index_metadata")
|
||||
.weigher(index_metadata_weight)
|
||||
.eviction_listener(|k, v, _cause| {
|
||||
let size = index_metadata_weight(&k, &v);
|
||||
.name(&format!("index_metadata_{}", index_type))
|
||||
.weigher(weight_of_metadata)
|
||||
.eviction_listener(move |k, v, _cause| {
|
||||
let size = weight_of_metadata(&k, &v);
|
||||
CACHE_BYTES
|
||||
.with_label_values(&[INDEX_METADATA_TYPE])
|
||||
.sub(size.into());
|
||||
})
|
||||
.build();
|
||||
let index_cache = moka::sync::CacheBuilder::new(index_content_cap)
|
||||
.name("inverted_index_content")
|
||||
.weigher(index_content_weight)
|
||||
.eviction_listener(|k, v, _cause| {
|
||||
let size = index_content_weight(&k, &v);
|
||||
.name(&format!("index_content_{}", index_type))
|
||||
.weigher(weight_of_content)
|
||||
.eviction_listener(move |k, v, _cause| {
|
||||
let size = weight_of_content(&k, &v);
|
||||
CACHE_BYTES
|
||||
.with_label_values(&[INDEX_CONTENT_TYPE])
|
||||
.sub(size.into());
|
||||
@@ -251,259 +123,109 @@ impl InvertedIndexCache {
|
||||
index_metadata,
|
||||
index: index_cache,
|
||||
page_size,
|
||||
weight_of_content,
|
||||
weight_of_metadata,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl InvertedIndexCache {
|
||||
pub fn get_index_metadata(&self, file_id: FileId) -> Option<Arc<InvertedIndexMetas>> {
|
||||
self.index_metadata.get(&IndexMetadataKey { file_id })
|
||||
impl<K, M> IndexCache<K, M>
|
||||
where
|
||||
K: Hash + Eq + Clone + Copy + Send + Sync + 'static,
|
||||
M: Send + Sync + 'static,
|
||||
{
|
||||
pub fn get_metadata(&self, key: K) -> Option<Arc<M>> {
|
||||
self.index_metadata.get(&key)
|
||||
}
|
||||
|
||||
pub fn put_index_metadata(&self, file_id: FileId, metadata: Arc<InvertedIndexMetas>) {
|
||||
let key = IndexMetadataKey { file_id };
|
||||
pub fn put_metadata(&self, key: K, metadata: Arc<M>) {
|
||||
CACHE_BYTES
|
||||
.with_label_values(&[INDEX_METADATA_TYPE])
|
||||
.add(index_metadata_weight(&key, &metadata).into());
|
||||
.add((self.weight_of_metadata)(&key, &metadata).into());
|
||||
self.index_metadata.insert(key, metadata)
|
||||
}
|
||||
|
||||
pub fn get_index(&self, key: &IndexDataPageKey) -> Option<Bytes> {
|
||||
self.index.get(key)
|
||||
/// Gets given range of index data from cache, and loads from source if the file
|
||||
/// is not already cached.
|
||||
async fn get_or_load<F, Fut, E>(
|
||||
&self,
|
||||
key: K,
|
||||
file_size: u64,
|
||||
offset: u64,
|
||||
size: u32,
|
||||
load: F,
|
||||
) -> Result<Vec<u8>, E>
|
||||
where
|
||||
F: FnOnce(Vec<Range<u64>>) -> Fut,
|
||||
Fut: Future<Output = Result<Vec<Bytes>, E>>,
|
||||
E: std::error::Error,
|
||||
{
|
||||
let page_keys =
|
||||
PageKey::generate_page_keys(offset, size, self.page_size).collect::<Vec<_>>();
|
||||
// Size is 0, return empty data.
|
||||
if page_keys.is_empty() {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
let mut data = Vec::with_capacity(page_keys.len());
|
||||
data.resize(page_keys.len(), Bytes::new());
|
||||
let mut cache_miss_range = vec![];
|
||||
let mut cache_miss_idx = vec![];
|
||||
let last_index = page_keys.len() - 1;
|
||||
// TODO: Avoid copy as much as possible.
|
||||
for (i, page_key) in page_keys.iter().enumerate() {
|
||||
match self.get_page(key, *page_key) {
|
||||
Some(page) => {
|
||||
CACHE_HIT.with_label_values(&[INDEX_CONTENT_TYPE]).inc();
|
||||
data[i] = page;
|
||||
}
|
||||
None => {
|
||||
CACHE_MISS.with_label_values(&[INDEX_CONTENT_TYPE]).inc();
|
||||
let base_offset = page_key.page_id * self.page_size;
|
||||
let pruned_size = if i == last_index {
|
||||
prune_size(page_keys.iter(), file_size, self.page_size)
|
||||
} else {
|
||||
self.page_size
|
||||
};
|
||||
cache_miss_range.push(base_offset..base_offset + pruned_size);
|
||||
cache_miss_idx.push(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
if !cache_miss_range.is_empty() {
|
||||
let pages = load(cache_miss_range).await?;
|
||||
for (i, page) in cache_miss_idx.into_iter().zip(pages.into_iter()) {
|
||||
let page_key = page_keys[i];
|
||||
data[i] = page.clone();
|
||||
self.put_page(key, page_key, page.clone());
|
||||
}
|
||||
}
|
||||
let buffer = Buffer::from_iter(data.into_iter());
|
||||
Ok(buffer
|
||||
.slice(PageKey::calculate_range(offset, size, self.page_size))
|
||||
.to_vec())
|
||||
}
|
||||
|
||||
pub fn put_index(&self, key: IndexDataPageKey, value: Bytes) {
|
||||
fn get_page(&self, key: K, page_key: PageKey) -> Option<Bytes> {
|
||||
self.index.get(&(key, page_key))
|
||||
}
|
||||
|
||||
fn put_page(&self, key: K, page_key: PageKey, value: Bytes) {
|
||||
CACHE_BYTES
|
||||
.with_label_values(&[INDEX_CONTENT_TYPE])
|
||||
.add(index_content_weight(&key, &value).into());
|
||||
self.index.insert(key, value);
|
||||
.add((self.weight_of_content)(&(key, page_key), &value).into());
|
||||
self.index.insert((key, page_key), value);
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculates weight for index metadata.
|
||||
fn index_metadata_weight(k: &IndexMetadataKey, v: &Arc<InvertedIndexMetas>) -> u32 {
|
||||
(k.file_id.as_bytes().len() + v.encoded_len()) as u32
|
||||
}
|
||||
|
||||
/// Calculates weight for index content.
|
||||
fn index_content_weight(k: &IndexDataPageKey, v: &Bytes) -> u32 {
|
||||
(k.file_id.as_bytes().len() + v.len()) as u32
|
||||
}
|
||||
|
||||
/// Prunes the size of the last page based on the indexes.
|
||||
/// We have the following cases:
/// 1. If the remaining file size is less than the page size, read to the end of the file.
/// 2. Otherwise, read a full page.
|
||||
fn prune_size(indexes: &[IndexDataPageKey], file_size: u64, page_size: u64) -> u64 {
|
||||
fn prune_size<'a>(
|
||||
indexes: impl Iterator<Item = &'a PageKey>,
|
||||
file_size: u64,
|
||||
page_size: u64,
|
||||
) -> u64 {
|
||||
let last_page_start = indexes.last().map(|i| i.page_id * page_size).unwrap_or(0);
|
||||
page_size.min(file_size - last_page_start)
|
||||
}
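// A minimal sketch of how an arbitrary (offset, size) read maps to page-aligned ranges,
// assuming the `PageKey` helpers and the `prune_size` function defined in this module;
// illustrative only, with arbitrary parameter names.
fn sketch_read_plan(offset: u64, size: u32, page_size: u64, file_size: u64) -> Vec<Range<u64>> {
    let page_keys: Vec<_> = PageKey::generate_page_keys(offset, size, page_size).collect();
    let last_index = page_keys.len().saturating_sub(1);
    page_keys
        .iter()
        .enumerate()
        .map(|(i, key)| {
            // Every page starts at a page-aligned offset.
            let start = key.page_id * page_size;
            // The last page is truncated so the range never runs past the end of the file.
            let len = if i == last_index {
                prune_size(page_keys.iter(), file_size, page_size)
            } else {
                page_size
            };
            start..start + len
        })
        .collect()
}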
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use std::num::NonZeroUsize;
|
||||
|
||||
use common_base::BitVec;
|
||||
use futures::stream;
|
||||
use index::inverted_index::format::reader::{InvertedIndexBlobReader, InvertedIndexReader};
|
||||
use index::inverted_index::format::writer::{InvertedIndexBlobWriter, InvertedIndexWriter};
|
||||
use index::inverted_index::Bytes;
|
||||
use prometheus::register_int_counter_vec;
|
||||
use rand::{Rng, RngCore};
|
||||
|
||||
use super::*;
|
||||
use crate::sst::index::store::InstrumentedStore;
|
||||
use crate::test_util::TestEnv;
|
||||
|
||||
// Repeat times for following little fuzz tests.
|
||||
const FUZZ_REPEAT_TIMES: usize = 100;
|
||||
|
||||
// Fuzz test for index data page key
|
||||
#[test]
|
||||
fn fuzz_index_calculation() {
|
||||
// randomly generate a large u8 array
|
||||
let mut rng = rand::thread_rng();
|
||||
let mut data = vec![0u8; 1024 * 1024];
|
||||
rng.fill_bytes(&mut data);
|
||||
let file_id = FileId::random();
|
||||
|
||||
for _ in 0..FUZZ_REPEAT_TIMES {
|
||||
let offset = rng.gen_range(0..data.len() as u64);
|
||||
let size = rng.gen_range(0..data.len() as u32 - offset as u32);
|
||||
let page_size: usize = rng.gen_range(1..1024);
|
||||
|
||||
let indexes =
|
||||
IndexDataPageKey::generate_page_keys(file_id, offset, size, page_size as u64);
|
||||
let page_num = indexes.len();
|
||||
let mut read = Vec::with_capacity(size as usize);
|
||||
for key in indexes.into_iter() {
|
||||
let start = key.page_id as usize * page_size;
|
||||
let page = if start + page_size < data.len() {
|
||||
&data[start..start + page_size]
|
||||
} else {
|
||||
&data[start..]
|
||||
};
|
||||
read.extend_from_slice(page);
|
||||
}
|
||||
let expected_range = offset as usize..(offset + size as u64) as usize;
|
||||
let read =
|
||||
read[IndexDataPageKey::calculate_range(offset, size, page_size as u64)].to_vec();
|
||||
if read != data.get(expected_range).unwrap() {
|
||||
panic!(
|
||||
"fuzz_read_index failed, offset: {}, size: {}, page_size: {}\nread len: {}, expected len: {}\nrange: {:?}, page num: {}",
|
||||
offset, size, page_size, read.len(), size as usize,
|
||||
IndexDataPageKey::calculate_range(offset, size, page_size as u64),
|
||||
page_num
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn unpack(fst_value: u64) -> [u32; 2] {
|
||||
bytemuck::cast::<u64, [u32; 2]>(fst_value)
|
||||
}
|
||||
|
||||
async fn create_inverted_index_blob() -> Vec<u8> {
|
||||
let mut blob = Vec::new();
|
||||
let mut writer = InvertedIndexBlobWriter::new(&mut blob);
|
||||
writer
|
||||
.add_index(
|
||||
"tag0".to_string(),
|
||||
BitVec::from_slice(&[0b0000_0001, 0b0000_0000]),
|
||||
Box::new(stream::iter(vec![
|
||||
Ok((Bytes::from("a"), BitVec::from_slice(&[0b0000_0001]))),
|
||||
Ok((Bytes::from("b"), BitVec::from_slice(&[0b0010_0000]))),
|
||||
Ok((Bytes::from("c"), BitVec::from_slice(&[0b0000_0001]))),
|
||||
])),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
writer
|
||||
.add_index(
|
||||
"tag1".to_string(),
|
||||
BitVec::from_slice(&[0b0000_0001, 0b0000_0000]),
|
||||
Box::new(stream::iter(vec![
|
||||
Ok((Bytes::from("x"), BitVec::from_slice(&[0b0000_0001]))),
|
||||
Ok((Bytes::from("y"), BitVec::from_slice(&[0b0010_0000]))),
|
||||
Ok((Bytes::from("z"), BitVec::from_slice(&[0b0000_0001]))),
|
||||
])),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
writer
|
||||
.finish(8, NonZeroUsize::new(1).unwrap())
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
blob
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_inverted_index_cache() {
|
||||
let blob = create_inverted_index_blob().await;
|
||||
|
||||
// Init a test range reader in local fs.
|
||||
let mut env = TestEnv::new();
|
||||
let file_size = blob.len() as u64;
|
||||
let store = env.init_object_store_manager();
|
||||
let temp_path = "data";
|
||||
store.write(temp_path, blob).await.unwrap();
|
||||
let store = InstrumentedStore::new(store);
|
||||
let metric =
|
||||
register_int_counter_vec!("test_bytes", "a counter for test", &["test"]).unwrap();
|
||||
let counter = metric.with_label_values(&["test"]);
|
||||
let range_reader = store
|
||||
.range_reader("data", &counter, &counter)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let reader = InvertedIndexBlobReader::new(range_reader);
|
||||
let mut cached_reader = CachedInvertedIndexBlobReader::new(
|
||||
FileId::random(),
|
||||
file_size,
|
||||
reader,
|
||||
Arc::new(InvertedIndexCache::new(8192, 8192, 50)),
|
||||
);
|
||||
let metadata = cached_reader.metadata().await.unwrap();
|
||||
assert_eq!(metadata.total_row_count, 8);
|
||||
assert_eq!(metadata.segment_row_count, 1);
|
||||
assert_eq!(metadata.metas.len(), 2);
|
||||
// tag0
|
||||
let tag0 = metadata.metas.get("tag0").unwrap();
|
||||
let stats0 = tag0.stats.as_ref().unwrap();
|
||||
assert_eq!(stats0.distinct_count, 3);
|
||||
assert_eq!(stats0.null_count, 1);
|
||||
assert_eq!(stats0.min_value, Bytes::from("a"));
|
||||
assert_eq!(stats0.max_value, Bytes::from("c"));
|
||||
let fst0 = cached_reader
|
||||
.fst(
|
||||
tag0.base_offset + tag0.relative_fst_offset as u64,
|
||||
tag0.fst_size,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(fst0.len(), 3);
|
||||
let [offset, size] = unpack(fst0.get(b"a").unwrap());
|
||||
let bitmap = cached_reader
|
||||
.bitmap(tag0.base_offset + offset as u64, size)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
|
||||
let [offset, size] = unpack(fst0.get(b"b").unwrap());
|
||||
let bitmap = cached_reader
|
||||
.bitmap(tag0.base_offset + offset as u64, size)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(bitmap, BitVec::from_slice(&[0b0010_0000]));
|
||||
let [offset, size] = unpack(fst0.get(b"c").unwrap());
|
||||
let bitmap = cached_reader
|
||||
.bitmap(tag0.base_offset + offset as u64, size)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
|
||||
|
||||
// tag1
|
||||
let tag1 = metadata.metas.get("tag1").unwrap();
|
||||
let stats1 = tag1.stats.as_ref().unwrap();
|
||||
assert_eq!(stats1.distinct_count, 3);
|
||||
assert_eq!(stats1.null_count, 1);
|
||||
assert_eq!(stats1.min_value, Bytes::from("x"));
|
||||
assert_eq!(stats1.max_value, Bytes::from("z"));
|
||||
let fst1 = cached_reader
|
||||
.fst(
|
||||
tag1.base_offset + tag1.relative_fst_offset as u64,
|
||||
tag1.fst_size,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(fst1.len(), 3);
|
||||
let [offset, size] = unpack(fst1.get(b"x").unwrap());
|
||||
let bitmap = cached_reader
|
||||
.bitmap(tag1.base_offset + offset as u64, size)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
|
||||
let [offset, size] = unpack(fst1.get(b"y").unwrap());
|
||||
let bitmap = cached_reader
|
||||
.bitmap(tag1.base_offset + offset as u64, size)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(bitmap, BitVec::from_slice(&[0b0010_0000]));
|
||||
let [offset, size] = unpack(fst1.get(b"z").unwrap());
|
||||
let bitmap = cached_reader
|
||||
.bitmap(tag1.base_offset + offset as u64, size)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
|
||||
|
||||
// fuzz test
|
||||
let mut rng = rand::thread_rng();
|
||||
for _ in 0..FUZZ_REPEAT_TIMES {
|
||||
let offset = rng.gen_range(0..file_size);
|
||||
let size = rng.gen_range(0..file_size as u32 - offset as u32);
|
||||
let expected = cached_reader.range_read(offset, size).await.unwrap();
|
||||
let read = cached_reader.get_or_load(offset, size).await.unwrap();
|
||||
assert_eq!(read, expected);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
src/mito2/src/cache/index/inverted_index.rs (new file, 322 lines)
@@ -0,0 +1,322 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use api::v1::index::InvertedIndexMetas;
|
||||
use async_trait::async_trait;
|
||||
use bytes::Bytes;
|
||||
use index::inverted_index::error::Result;
|
||||
use index::inverted_index::format::reader::InvertedIndexReader;
|
||||
use prost::Message;
|
||||
|
||||
use crate::cache::index::{IndexCache, PageKey, INDEX_METADATA_TYPE};
|
||||
use crate::metrics::{CACHE_HIT, CACHE_MISS};
|
||||
use crate::sst::file::FileId;
|
||||
|
||||
const INDEX_TYPE_INVERTED_INDEX: &str = "inverted_index";
|
||||
|
||||
/// Cache for inverted index.
|
||||
pub type InvertedIndexCache = IndexCache<FileId, InvertedIndexMetas>;
|
||||
pub type InvertedIndexCacheRef = Arc<InvertedIndexCache>;
|
||||
|
||||
impl InvertedIndexCache {
|
||||
/// Creates a new inverted index cache.
|
||||
pub fn new(index_metadata_cap: u64, index_content_cap: u64, page_size: u64) -> Self {
|
||||
Self::new_with_weighter(
|
||||
index_metadata_cap,
|
||||
index_content_cap,
|
||||
page_size,
|
||||
INDEX_TYPE_INVERTED_INDEX,
|
||||
inverted_index_metadata_weight,
|
||||
inverted_index_content_weight,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculates weight for inverted index metadata.
|
||||
fn inverted_index_metadata_weight(k: &FileId, v: &Arc<InvertedIndexMetas>) -> u32 {
|
||||
(k.as_bytes().len() + v.encoded_len()) as u32
|
||||
}
|
||||
|
||||
/// Calculates weight for inverted index content.
|
||||
fn inverted_index_content_weight((k, _): &(FileId, PageKey), v: &Bytes) -> u32 {
|
||||
(k.as_bytes().len() + v.len()) as u32
|
||||
}
|
||||
|
||||
/// Inverted index blob reader with cache.
|
||||
pub struct CachedInvertedIndexBlobReader<R> {
|
||||
file_id: FileId,
|
||||
file_size: u64,
|
||||
inner: R,
|
||||
cache: InvertedIndexCacheRef,
|
||||
}
|
||||
|
||||
impl<R> CachedInvertedIndexBlobReader<R> {
|
||||
/// Creates a new inverted index blob reader with cache.
|
||||
pub fn new(file_id: FileId, file_size: u64, inner: R, cache: InvertedIndexCacheRef) -> Self {
|
||||
Self {
|
||||
file_id,
|
||||
file_size,
|
||||
inner,
|
||||
cache,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl<R: InvertedIndexReader> InvertedIndexReader for CachedInvertedIndexBlobReader<R> {
|
||||
async fn range_read(&mut self, offset: u64, size: u32) -> Result<Vec<u8>> {
|
||||
let inner = &mut self.inner;
|
||||
self.cache
|
||||
.get_or_load(
|
||||
self.file_id,
|
||||
self.file_size,
|
||||
offset,
|
||||
size,
|
||||
move |ranges| async move { inner.read_vec(&ranges).await },
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn metadata(&mut self) -> Result<Arc<InvertedIndexMetas>> {
|
||||
if let Some(cached) = self.cache.get_metadata(self.file_id) {
|
||||
CACHE_HIT.with_label_values(&[INDEX_METADATA_TYPE]).inc();
|
||||
Ok(cached)
|
||||
} else {
|
||||
let meta = self.inner.metadata().await?;
|
||||
self.cache.put_metadata(self.file_id, meta.clone());
|
||||
CACHE_MISS.with_label_values(&[INDEX_METADATA_TYPE]).inc();
|
||||
Ok(meta)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use std::num::NonZeroUsize;
|
||||
|
||||
use common_base::BitVec;
|
||||
use futures::stream;
|
||||
use index::inverted_index::format::reader::{InvertedIndexBlobReader, InvertedIndexReader};
|
||||
use index::inverted_index::format::writer::{InvertedIndexBlobWriter, InvertedIndexWriter};
|
||||
use index::inverted_index::Bytes;
|
||||
use prometheus::register_int_counter_vec;
|
||||
use rand::{Rng, RngCore};
|
||||
|
||||
use super::*;
|
||||
use crate::sst::index::store::InstrumentedStore;
|
||||
use crate::test_util::TestEnv;
|
||||
|
||||
// Repeat times for following little fuzz tests.
|
||||
const FUZZ_REPEAT_TIMES: usize = 100;
|
||||
|
||||
// Fuzz test for index data page key
|
||||
#[test]
|
||||
fn fuzz_index_calculation() {
|
||||
// randomly generate a large u8 array
|
||||
let mut rng = rand::thread_rng();
|
||||
let mut data = vec![0u8; 1024 * 1024];
|
||||
rng.fill_bytes(&mut data);
|
||||
|
||||
for _ in 0..FUZZ_REPEAT_TIMES {
|
||||
let offset = rng.gen_range(0..data.len() as u64);
|
||||
let size = rng.gen_range(0..data.len() as u32 - offset as u32);
|
||||
let page_size: usize = rng.gen_range(1..1024);
|
||||
|
||||
let indexes =
|
||||
PageKey::generate_page_keys(offset, size, page_size as u64).collect::<Vec<_>>();
|
||||
let page_num = indexes.len();
|
||||
let mut read = Vec::with_capacity(size as usize);
|
||||
for key in indexes.into_iter() {
|
||||
let start = key.page_id as usize * page_size;
|
||||
let page = if start + page_size < data.len() {
|
||||
&data[start..start + page_size]
|
||||
} else {
|
||||
&data[start..]
|
||||
};
|
||||
read.extend_from_slice(page);
|
||||
}
|
||||
let expected_range = offset as usize..(offset + size as u64) as usize;
|
||||
let read = read[PageKey::calculate_range(offset, size, page_size as u64)].to_vec();
|
||||
if read != data.get(expected_range).unwrap() {
|
||||
panic!(
|
||||
"fuzz_read_index failed, offset: {}, size: {}, page_size: {}\nread len: {}, expected len: {}\nrange: {:?}, page num: {}",
|
||||
offset, size, page_size, read.len(), size as usize,
|
||||
PageKey::calculate_range(offset, size, page_size as u64),
|
||||
page_num
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn unpack(fst_value: u64) -> [u32; 2] {
|
||||
bytemuck::cast::<u64, [u32; 2]>(fst_value)
|
||||
}
|
||||
|
||||
async fn create_inverted_index_blob() -> Vec<u8> {
|
||||
let mut blob = Vec::new();
|
||||
let mut writer = InvertedIndexBlobWriter::new(&mut blob);
|
||||
writer
|
||||
.add_index(
|
||||
"tag0".to_string(),
|
||||
BitVec::from_slice(&[0b0000_0001, 0b0000_0000]),
|
||||
Box::new(stream::iter(vec![
|
||||
Ok((Bytes::from("a"), BitVec::from_slice(&[0b0000_0001]))),
|
||||
Ok((Bytes::from("b"), BitVec::from_slice(&[0b0010_0000]))),
|
||||
Ok((Bytes::from("c"), BitVec::from_slice(&[0b0000_0001]))),
|
||||
])),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
writer
|
||||
.add_index(
|
||||
"tag1".to_string(),
|
||||
BitVec::from_slice(&[0b0000_0001, 0b0000_0000]),
|
||||
Box::new(stream::iter(vec![
|
||||
Ok((Bytes::from("x"), BitVec::from_slice(&[0b0000_0001]))),
|
||||
Ok((Bytes::from("y"), BitVec::from_slice(&[0b0010_0000]))),
|
||||
Ok((Bytes::from("z"), BitVec::from_slice(&[0b0000_0001]))),
|
||||
])),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
writer
|
||||
.finish(8, NonZeroUsize::new(1).unwrap())
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
blob
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_inverted_index_cache() {
|
||||
let blob = create_inverted_index_blob().await;
|
||||
|
||||
// Init a test range reader in local fs.
|
||||
let mut env = TestEnv::new();
|
||||
let file_size = blob.len() as u64;
|
||||
let store = env.init_object_store_manager();
|
||||
let temp_path = "data";
|
||||
store.write(temp_path, blob).await.unwrap();
|
||||
let store = InstrumentedStore::new(store);
|
||||
let metric =
|
||||
register_int_counter_vec!("test_bytes", "a counter for test", &["test"]).unwrap();
|
||||
let counter = metric.with_label_values(&["test"]);
|
||||
let range_reader = store
|
||||
.range_reader("data", &counter, &counter)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let reader = InvertedIndexBlobReader::new(range_reader);
|
||||
let mut cached_reader = CachedInvertedIndexBlobReader::new(
|
||||
FileId::random(),
|
||||
file_size,
|
||||
reader,
|
||||
Arc::new(InvertedIndexCache::new(8192, 8192, 50)),
|
||||
);
|
||||
let metadata = cached_reader.metadata().await.unwrap();
|
||||
assert_eq!(metadata.total_row_count, 8);
|
||||
assert_eq!(metadata.segment_row_count, 1);
|
||||
assert_eq!(metadata.metas.len(), 2);
|
||||
// tag0
|
||||
let tag0 = metadata.metas.get("tag0").unwrap();
|
||||
let stats0 = tag0.stats.as_ref().unwrap();
|
||||
assert_eq!(stats0.distinct_count, 3);
|
||||
assert_eq!(stats0.null_count, 1);
|
||||
assert_eq!(stats0.min_value, Bytes::from("a"));
|
||||
assert_eq!(stats0.max_value, Bytes::from("c"));
|
||||
let fst0 = cached_reader
|
||||
.fst(
|
||||
tag0.base_offset + tag0.relative_fst_offset as u64,
|
||||
tag0.fst_size,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(fst0.len(), 3);
|
||||
let [offset, size] = unpack(fst0.get(b"a").unwrap());
|
||||
let bitmap = cached_reader
|
||||
.bitmap(tag0.base_offset + offset as u64, size)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
|
||||
let [offset, size] = unpack(fst0.get(b"b").unwrap());
|
||||
let bitmap = cached_reader
|
||||
.bitmap(tag0.base_offset + offset as u64, size)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(bitmap, BitVec::from_slice(&[0b0010_0000]));
|
||||
let [offset, size] = unpack(fst0.get(b"c").unwrap());
|
||||
let bitmap = cached_reader
|
||||
.bitmap(tag0.base_offset + offset as u64, size)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
|
||||
|
||||
// tag1
|
||||
let tag1 = metadata.metas.get("tag1").unwrap();
|
||||
let stats1 = tag1.stats.as_ref().unwrap();
|
||||
assert_eq!(stats1.distinct_count, 3);
|
||||
assert_eq!(stats1.null_count, 1);
|
||||
assert_eq!(stats1.min_value, Bytes::from("x"));
|
||||
assert_eq!(stats1.max_value, Bytes::from("z"));
|
||||
let fst1 = cached_reader
|
||||
.fst(
|
||||
tag1.base_offset + tag1.relative_fst_offset as u64,
|
||||
tag1.fst_size,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(fst1.len(), 3);
|
||||
let [offset, size] = unpack(fst1.get(b"x").unwrap());
|
||||
let bitmap = cached_reader
|
||||
.bitmap(tag1.base_offset + offset as u64, size)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
|
||||
let [offset, size] = unpack(fst1.get(b"y").unwrap());
|
||||
let bitmap = cached_reader
|
||||
.bitmap(tag1.base_offset + offset as u64, size)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(bitmap, BitVec::from_slice(&[0b0010_0000]));
|
||||
let [offset, size] = unpack(fst1.get(b"z").unwrap());
|
||||
let bitmap = cached_reader
|
||||
.bitmap(tag1.base_offset + offset as u64, size)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
|
||||
|
||||
// fuzz test
|
||||
let mut rng = rand::thread_rng();
|
||||
for _ in 0..FUZZ_REPEAT_TIMES {
|
||||
let offset = rng.gen_range(0..file_size);
|
||||
let size = rng.gen_range(0..file_size as u32 - offset as u32);
|
||||
let expected = cached_reader.range_read(offset, size).await.unwrap();
|
||||
let inner = &mut cached_reader.inner;
|
||||
let read = cached_reader
|
||||
.cache
|
||||
.get_or_load(
|
||||
cached_reader.file_id,
|
||||
file_size,
|
||||
offset,
|
||||
size,
|
||||
|ranges| async move { inner.read_vec(&ranges).await },
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(read, expected);
|
||||
}
|
||||
}
|
||||
}
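// A minimal usage sketch for the cached reader introduced in this file, assuming it
// lives alongside these types; `blob_reader`, `file_id`, `file_size` and the cache
// capacities below are placeholders chosen for illustration.
async fn read_metadata_with_cache<R: InvertedIndexReader>(
    file_id: FileId,
    file_size: u64,
    blob_reader: R,
) -> Result<Arc<InvertedIndexMetas>> {
    // 8 KiB metadata cache, 8 KiB content cache, 4 KiB pages (capacities are arbitrary here).
    let cache = Arc::new(InvertedIndexCache::new(8192, 8192, 4096));
    let mut reader = CachedInvertedIndexBlobReader::new(file_id, file_size, blob_reader, cache);
    // The first call goes to the inner reader and fills the cache; later calls hit memory.
    reader.metadata().await
}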
|
||||
@@ -723,10 +723,20 @@ pub enum Error {
|
||||
|
||||
#[snafu(display("Failed to iter data part"))]
|
||||
ReadDataPart {
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
#[snafu(source)]
|
||||
error: parquet::errors::ParquetError,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to read row group in memtable"))]
|
||||
DecodeArrowRowGroup {
|
||||
#[snafu(source)]
|
||||
error: ArrowError,
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Invalid region options, {}", reason))]
|
||||
InvalidRegionOptions {
|
||||
reason: String,
|
||||
@@ -1029,6 +1039,7 @@ impl ErrorExt for Error {
|
||||
RegionBusy { .. } => StatusCode::RegionBusy,
|
||||
GetSchemaMetadata { source, .. } => source.status_code(),
|
||||
Timeout { .. } => StatusCode::Cancelled,
|
||||
DecodeArrowRowGroup { .. } => StatusCode::Internal,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -27,8 +27,12 @@ use crate::memtable::{
|
||||
BoxedBatchIterator, KeyValues, Memtable, MemtableId, MemtableRanges, MemtableRef, MemtableStats,
|
||||
};
|
||||
|
||||
#[allow(unused)]
|
||||
mod context;
|
||||
#[allow(unused)]
|
||||
pub(crate) mod part;
|
||||
mod part_reader;
|
||||
mod row_group_reader;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct BulkMemtable {
|
||||
|
||||
src/mito2/src/memtable/bulk/context.rs (new file, 117 lines)
@@ -0,0 +1,117 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//! Context for iterating bulk memtable.
|
||||
|
||||
use std::collections::VecDeque;
|
||||
use std::sync::Arc;
|
||||
|
||||
use parquet::file::metadata::ParquetMetaData;
|
||||
use store_api::metadata::RegionMetadataRef;
|
||||
use store_api::storage::ColumnId;
|
||||
use table::predicate::Predicate;
|
||||
|
||||
use crate::row_converter::McmpRowCodec;
|
||||
use crate::sst::parquet::file_range::RangeBase;
|
||||
use crate::sst::parquet::format::ReadFormat;
|
||||
use crate::sst::parquet::reader::SimpleFilterContext;
|
||||
use crate::sst::parquet::stats::RowGroupPruningStats;
|
||||
|
||||
pub(crate) type BulkIterContextRef = Arc<BulkIterContext>;
|
||||
|
||||
pub(crate) struct BulkIterContext {
|
||||
pub(crate) base: RangeBase,
|
||||
pub(crate) predicate: Option<Predicate>,
|
||||
}
|
||||
|
||||
impl BulkIterContext {
|
||||
pub(crate) fn new(
|
||||
region_metadata: RegionMetadataRef,
|
||||
projection: &Option<&[ColumnId]>,
|
||||
predicate: Option<Predicate>,
|
||||
) -> Self {
|
||||
let codec = McmpRowCodec::new_with_primary_keys(®ion_metadata);
|
||||
|
||||
let simple_filters = predicate
|
||||
.as_ref()
|
||||
.iter()
|
||||
.flat_map(|predicate| {
|
||||
predicate
|
||||
.exprs()
|
||||
.iter()
|
||||
.filter_map(|expr| SimpleFilterContext::new_opt(®ion_metadata, None, expr))
|
||||
})
|
||||
.collect();
|
||||
|
||||
let read_format = build_read_format(region_metadata, projection);
|
||||
|
||||
Self {
|
||||
base: RangeBase {
|
||||
filters: simple_filters,
|
||||
read_format,
|
||||
codec,
|
||||
// We don't need to compat batches since all batches in the memtable have the same schema.
|
||||
compat_batch: None,
|
||||
},
|
||||
predicate,
|
||||
}
|
||||
}
|
||||
|
||||
/// Prunes row groups by stats.
|
||||
pub(crate) fn row_groups_to_read(&self, file_meta: &Arc<ParquetMetaData>) -> VecDeque<usize> {
|
||||
let region_meta = self.base.read_format.metadata();
|
||||
let row_groups = file_meta.row_groups();
|
||||
// expected_metadata is set to None since we always expect the region metadata of the memtable to be up-to-date.
|
||||
let stats = RowGroupPruningStats::new(row_groups, &self.base.read_format, None);
|
||||
if let Some(predicate) = self.predicate.as_ref() {
|
||||
predicate
|
||||
.prune_with_stats(&stats, region_meta.schema.arrow_schema())
|
||||
.iter()
|
||||
.zip(0..file_meta.num_row_groups())
|
||||
.filter_map(|(selected, row_group)| {
|
||||
if !*selected {
|
||||
return None;
|
||||
}
|
||||
Some(row_group)
|
||||
})
|
||||
.collect::<VecDeque<_>>()
|
||||
} else {
|
||||
(0..file_meta.num_row_groups()).collect()
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn read_format(&self) -> &ReadFormat {
|
||||
&self.base.read_format
|
||||
}
|
||||
}
|
||||
|
||||
fn build_read_format(
|
||||
region_metadata: RegionMetadataRef,
|
||||
projection: &Option<&[ColumnId]>,
|
||||
) -> ReadFormat {
|
||||
let read_format = if let Some(column_ids) = &projection {
|
||||
ReadFormat::new(region_metadata, column_ids.iter().copied())
|
||||
} else {
|
||||
// No projection, lists all column ids to read.
|
||||
ReadFormat::new(
|
||||
region_metadata.clone(),
|
||||
region_metadata
|
||||
.column_metadatas
|
||||
.iter()
|
||||
.map(|col| col.column_id),
|
||||
)
|
||||
};
|
||||
|
||||
read_format
|
||||
}
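// A minimal sketch of how this context is intended to be used inside the crate:
// build it once per scan, then let it prune row groups by statistics.
// `region_metadata`, `parquet_meta` and `predicate` are placeholders here.
fn plan_bulk_scan(
    region_metadata: RegionMetadataRef,
    parquet_meta: &Arc<ParquetMetaData>,
    predicate: Option<Predicate>,
) -> (BulkIterContextRef, VecDeque<usize>) {
    // No projection: every column of the region is read.
    let context = Arc::new(BulkIterContext::new(region_metadata, &None, predicate));
    // Row groups whose statistics cannot satisfy the predicate are skipped.
    let row_groups = context.row_groups_to_read(parquet_meta);
    (context, row_groups)
}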
|
||||
@@ -13,10 +13,12 @@
|
||||
// limitations under the License.
|
||||
|
||||
//! Bulk part encoder/decoder.
|
||||
|
||||
use std::collections::VecDeque;
|
||||
use std::sync::Arc;
|
||||
|
||||
use api::v1::Mutation;
|
||||
use bytes::Bytes;
|
||||
use common_time::timestamp::TimeUnit;
|
||||
use datafusion::arrow::array::{TimestampNanosecondArray, UInt64Builder};
|
||||
use datatypes::arrow;
|
||||
@@ -26,93 +28,145 @@ use datatypes::arrow::array::{
|
||||
UInt8Builder,
|
||||
};
|
||||
use datatypes::arrow::compute::TakeOptions;
|
||||
use datatypes::arrow::datatypes::{DataType as ArrowDataType, SchemaRef};
|
||||
use datatypes::arrow::datatypes::SchemaRef;
|
||||
use datatypes::arrow_array::BinaryArray;
|
||||
use datatypes::data_type::DataType;
|
||||
use datatypes::prelude::{MutableVector, ScalarVectorBuilder, Vector};
|
||||
use datatypes::types::TimestampType;
|
||||
use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions};
|
||||
use parquet::arrow::ArrowWriter;
|
||||
use parquet::data_type::AsBytes;
|
||||
use parquet::file::metadata::ParquetMetaData;
|
||||
use parquet::file::properties::WriterProperties;
|
||||
use snafu::ResultExt;
|
||||
use store_api::metadata::RegionMetadataRef;
|
||||
use store_api::storage::ColumnId;
|
||||
use table::predicate::Predicate;
|
||||
|
||||
use crate::error;
|
||||
use crate::error::{ComputeArrowSnafu, EncodeMemtableSnafu, NewRecordBatchSnafu, Result};
|
||||
use crate::memtable::bulk::context::BulkIterContextRef;
|
||||
use crate::memtable::bulk::part_reader::BulkPartIter;
|
||||
use crate::memtable::key_values::KeyValuesRef;
|
||||
use crate::read::Batch;
|
||||
use crate::memtable::BoxedBatchIterator;
|
||||
use crate::row_converter::{McmpRowCodec, RowCodec};
|
||||
use crate::sst::parquet::format::PrimaryKeyArray;
|
||||
use crate::sst::parquet::format::{PrimaryKeyArray, ReadFormat};
|
||||
use crate::sst::parquet::helper::parse_parquet_metadata;
|
||||
use crate::sst::to_sst_arrow_schema;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct BulkPart {
|
||||
data: Vec<u8>,
|
||||
data: Bytes,
|
||||
metadata: BulkPartMeta,
|
||||
}
|
||||
|
||||
impl BulkPart {
|
||||
pub fn new(data: Vec<u8>, metadata: BulkPartMeta) -> Self {
|
||||
pub fn new(data: Bytes, metadata: BulkPartMeta) -> Self {
|
||||
Self { data, metadata }
|
||||
}
|
||||
|
||||
pub(crate) fn metadata(&self) -> &BulkPartMeta {
|
||||
&self.metadata
|
||||
}
|
||||
|
||||
pub(crate) fn read(&self, context: BulkIterContextRef) -> Result<Option<BoxedBatchIterator>> {
|
||||
// use predicate to find row groups to read.
|
||||
let row_groups_to_read = context.row_groups_to_read(&self.metadata.parquet_metadata);
|
||||
|
||||
if row_groups_to_read.is_empty() {
|
||||
// All row groups are filtered.
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let iter = BulkPartIter::try_new(
|
||||
context,
|
||||
row_groups_to_read,
|
||||
self.metadata.parquet_metadata.clone(),
|
||||
self.data.clone(),
|
||||
)?;
|
||||
Ok(Some(Box::new(iter) as BoxedBatchIterator))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct BulkPartMeta {
|
||||
/// Total rows in part.
|
||||
pub num_rows: usize,
|
||||
/// Max timestamp in part.
|
||||
pub max_timestamp: i64,
|
||||
/// Min timestamp in part.
|
||||
pub min_timestamp: i64,
|
||||
}
|
||||
|
||||
impl Default for BulkPartMeta {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
num_rows: 0,
|
||||
max_timestamp: i64::MIN,
|
||||
min_timestamp: i64::MAX,
|
||||
}
|
||||
}
|
||||
/// Part file metadata.
|
||||
pub parquet_metadata: Arc<ParquetMetaData>,
|
||||
/// Part region schema.
|
||||
pub region_metadata: RegionMetadataRef,
|
||||
}
|
||||
|
||||
pub struct BulkPartEncoder {
|
||||
metadata: RegionMetadataRef,
|
||||
arrow_schema: SchemaRef,
|
||||
pk_encoder: McmpRowCodec,
|
||||
row_group_size: usize,
|
||||
dedup: bool,
|
||||
writer_props: Option<WriterProperties>,
|
||||
}
|
||||
|
||||
impl BulkPartEncoder {
|
||||
pub(crate) fn new(
|
||||
metadata: RegionMetadataRef,
|
||||
dedup: bool,
|
||||
row_group_size: usize,
|
||||
) -> BulkPartEncoder {
|
||||
let codec = McmpRowCodec::new_with_primary_keys(&metadata);
|
||||
let writer_props = Some(
|
||||
WriterProperties::builder()
|
||||
.set_write_batch_size(row_group_size)
|
||||
.set_max_row_group_size(row_group_size)
|
||||
.build(),
|
||||
);
|
||||
Self {
|
||||
metadata,
|
||||
pk_encoder: codec,
|
||||
row_group_size,
|
||||
dedup,
|
||||
writer_props,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl BulkPartEncoder {
|
||||
/// Encodes mutations to a [BulkPart]. Returns `None` if the mutations produce no rows to encode.
|
||||
fn encode_mutations(&self, mutations: &[Mutation], dest: &mut BulkPart) -> Result<bool> {
|
||||
fn encode_mutations(&self, mutations: &[Mutation]) -> Result<Option<BulkPart>> {
|
||||
let Some((arrow_record_batch, min_ts, max_ts)) =
|
||||
mutations_to_record_batch(mutations, &self.metadata, &self.pk_encoder, false)?
|
||||
mutations_to_record_batch(mutations, &self.metadata, &self.pk_encoder, self.dedup)?
|
||||
else {
|
||||
return Ok(false);
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
let mut buf = Vec::with_capacity(4096);
|
||||
let arrow_schema = arrow_record_batch.schema();
|
||||
{
|
||||
let mut writer = ArrowWriter::try_new(&mut dest.data, arrow_schema, None)
|
||||
.context(EncodeMemtableSnafu)?;
|
||||
|
||||
let file_metadata = {
|
||||
let mut writer =
|
||||
ArrowWriter::try_new(&mut buf, arrow_schema, self.writer_props.clone())
|
||||
.context(EncodeMemtableSnafu)?;
|
||||
writer
|
||||
.write(&arrow_record_batch)
|
||||
.context(EncodeMemtableSnafu)?;
|
||||
let _metadata = writer.finish().context(EncodeMemtableSnafu)?;
|
||||
}
|
||||
|
||||
dest.metadata = BulkPartMeta {
|
||||
num_rows: arrow_record_batch.num_rows(),
|
||||
max_timestamp: max_ts,
|
||||
min_timestamp: min_ts,
|
||||
writer.finish().context(EncodeMemtableSnafu)?
|
||||
};
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
/// Decodes [BulkPart] to [Batch]es.
|
||||
fn decode_to_batches(&self, _part: &BulkPart, _dest: &mut VecDeque<Batch>) -> Result<()> {
|
||||
todo!()
|
||||
let buf = Bytes::from(buf);
|
||||
let parquet_metadata = Arc::new(parse_parquet_metadata(file_metadata)?);
|
||||
|
||||
Ok(Some(BulkPart {
|
||||
data: buf,
|
||||
metadata: BulkPartMeta {
|
||||
num_rows: arrow_record_batch.num_rows(),
|
||||
max_timestamp: max_ts,
|
||||
min_timestamp: min_ts,
|
||||
parquet_metadata,
|
||||
region_metadata: self.metadata.clone(),
|
||||
},
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -379,10 +433,12 @@ fn binary_array_to_dictionary(input: &BinaryArray) -> Result<PrimaryKeyArray> {
|
||||
mod tests {
|
||||
use std::collections::VecDeque;
|
||||
|
||||
use datafusion_common::ScalarValue;
|
||||
use datatypes::prelude::{ScalarVector, Value};
|
||||
use datatypes::vectors::{Float64Vector, TimestampMillisecondVector};
|
||||
|
||||
use super::*;
|
||||
use crate::memtable::bulk::context::BulkIterContext;
|
||||
use crate::sst::parquet::format::ReadFormat;
|
||||
use crate::test_util::memtable_util::{build_key_values_with_ts_seq_values, metadata_for_test};
|
||||
|
||||
@@ -444,7 +500,7 @@ mod tests {
|
||||
k0: &'a str,
|
||||
k1: u32,
|
||||
timestamps: &'a [i64],
|
||||
v0: &'a [Option<f64>],
|
||||
v1: &'a [Option<f64>],
|
||||
sequence: u64,
|
||||
}
|
||||
|
||||
@@ -452,7 +508,7 @@ mod tests {
|
||||
struct BatchOutput<'a> {
|
||||
pk_values: &'a [Value],
|
||||
timestamps: &'a [i64],
|
||||
v0: &'a [Option<f64>],
|
||||
v1: &'a [Option<f64>],
|
||||
}
|
||||
|
||||
fn check_mutations_to_record_batches(
|
||||
@@ -470,7 +526,7 @@ mod tests {
|
||||
m.k0.to_string(),
|
||||
m.k1,
|
||||
m.timestamps.iter().copied(),
|
||||
m.v0.iter().copied(),
|
||||
m.v1.iter().copied(),
|
||||
m.sequence,
|
||||
)
|
||||
.mutation
|
||||
@@ -526,7 +582,7 @@ mod tests {
|
||||
for idx in 0..expected.len() {
|
||||
assert_eq!(expected[idx].pk_values, &batch_values[idx].0);
|
||||
assert_eq!(expected[idx].timestamps, &batch_values[idx].1);
|
||||
assert_eq!(expected[idx].v0, &batch_values[idx].2);
|
||||
assert_eq!(expected[idx].v1, &batch_values[idx].2);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -537,13 +593,13 @@ mod tests {
|
||||
k0: "a",
|
||||
k1: 0,
|
||||
timestamps: &[0],
|
||||
v0: &[Some(0.1)],
|
||||
v1: &[Some(0.1)],
|
||||
sequence: 0,
|
||||
}],
|
||||
&[BatchOutput {
|
||||
pk_values: &[Value::String("a".into()), Value::UInt32(0)],
|
||||
timestamps: &[0],
|
||||
v0: &[Some(0.1)],
|
||||
v1: &[Some(0.1)],
|
||||
}],
|
||||
(0, 0),
|
||||
true,
|
||||
@@ -555,28 +611,28 @@ mod tests {
|
||||
k0: "a",
|
||||
k1: 0,
|
||||
timestamps: &[0],
|
||||
v0: &[Some(0.1)],
|
||||
v1: &[Some(0.1)],
|
||||
sequence: 0,
|
||||
},
|
||||
MutationInput {
|
||||
k0: "b",
|
||||
k1: 0,
|
||||
timestamps: &[0],
|
||||
v0: &[Some(0.0)],
|
||||
v1: &[Some(0.0)],
|
||||
sequence: 0,
|
||||
},
|
||||
MutationInput {
|
||||
k0: "a",
|
||||
k1: 0,
|
||||
timestamps: &[1],
|
||||
v0: &[Some(0.2)],
|
||||
v1: &[Some(0.2)],
|
||||
sequence: 1,
|
||||
},
|
||||
MutationInput {
|
||||
k0: "a",
|
||||
k1: 1,
|
||||
timestamps: &[1],
|
||||
v0: &[Some(0.3)],
|
||||
v1: &[Some(0.3)],
|
||||
sequence: 2,
|
||||
},
|
||||
],
|
||||
@@ -584,17 +640,17 @@ mod tests {
|
||||
BatchOutput {
|
||||
pk_values: &[Value::String("a".into()), Value::UInt32(0)],
|
||||
timestamps: &[0, 1],
|
||||
v0: &[Some(0.1), Some(0.2)],
|
||||
v1: &[Some(0.1), Some(0.2)],
|
||||
},
|
||||
BatchOutput {
|
||||
pk_values: &[Value::String("a".into()), Value::UInt32(1)],
|
||||
timestamps: &[1],
|
||||
v0: &[Some(0.3)],
|
||||
v1: &[Some(0.3)],
|
||||
},
|
||||
BatchOutput {
|
||||
pk_values: &[Value::String("b".into()), Value::UInt32(0)],
|
||||
timestamps: &[0],
|
||||
v0: &[Some(0.0)],
|
||||
v1: &[Some(0.0)],
|
||||
},
|
||||
],
|
||||
(0, 1),
|
||||
@@ -607,21 +663,21 @@ mod tests {
|
||||
k0: "a",
|
||||
k1: 0,
|
||||
timestamps: &[0],
|
||||
v0: &[Some(0.1)],
|
||||
v1: &[Some(0.1)],
|
||||
sequence: 0,
|
||||
},
|
||||
MutationInput {
|
||||
k0: "b",
|
||||
k1: 0,
|
||||
timestamps: &[0],
|
||||
v0: &[Some(0.0)],
|
||||
v1: &[Some(0.0)],
|
||||
sequence: 0,
|
||||
},
|
||||
MutationInput {
|
||||
k0: "a",
|
||||
k1: 0,
|
||||
timestamps: &[0],
|
||||
v0: &[Some(0.2)],
|
||||
v1: &[Some(0.2)],
|
||||
sequence: 1,
|
||||
},
|
||||
],
|
||||
@@ -629,12 +685,12 @@ mod tests {
|
||||
BatchOutput {
|
||||
pk_values: &[Value::String("a".into()), Value::UInt32(0)],
|
||||
timestamps: &[0],
|
||||
v0: &[Some(0.2)],
|
||||
v1: &[Some(0.2)],
|
||||
},
|
||||
BatchOutput {
|
||||
pk_values: &[Value::String("b".into()), Value::UInt32(0)],
|
||||
timestamps: &[0],
|
||||
v0: &[Some(0.0)],
|
||||
v1: &[Some(0.0)],
|
||||
},
|
||||
],
|
||||
(0, 0),
|
||||
@@ -646,21 +702,21 @@ mod tests {
|
||||
k0: "a",
|
||||
k1: 0,
|
||||
timestamps: &[0],
|
||||
v0: &[Some(0.1)],
|
||||
v1: &[Some(0.1)],
|
||||
sequence: 0,
|
||||
},
|
||||
MutationInput {
|
||||
k0: "b",
|
||||
k1: 0,
|
||||
timestamps: &[0],
|
||||
v0: &[Some(0.0)],
|
||||
v1: &[Some(0.0)],
|
||||
sequence: 0,
|
||||
},
|
||||
MutationInput {
|
||||
k0: "a",
|
||||
k1: 0,
|
||||
timestamps: &[0],
|
||||
v0: &[Some(0.2)],
|
||||
v1: &[Some(0.2)],
|
||||
sequence: 1,
|
||||
},
|
||||
],
|
||||
@@ -668,16 +724,194 @@ mod tests {
|
||||
BatchOutput {
|
||||
pk_values: &[Value::String("a".into()), Value::UInt32(0)],
|
||||
timestamps: &[0, 0],
|
||||
v0: &[Some(0.2), Some(0.1)],
|
||||
v1: &[Some(0.2), Some(0.1)],
|
||||
},
|
||||
BatchOutput {
|
||||
pk_values: &[Value::String("b".into()), Value::UInt32(0)],
|
||||
timestamps: &[0],
|
||||
v0: &[Some(0.0)],
|
||||
v1: &[Some(0.0)],
|
||||
},
|
||||
],
|
||||
(0, 0),
|
||||
false,
|
||||
);
|
||||
}
|
||||
|
||||
fn encode(input: &[MutationInput]) -> BulkPart {
|
||||
let metadata = metadata_for_test();
|
||||
let mutations = input
|
||||
.iter()
|
||||
.map(|m| {
|
||||
build_key_values_with_ts_seq_values(
|
||||
&metadata,
|
||||
m.k0.to_string(),
|
||||
m.k1,
|
||||
m.timestamps.iter().copied(),
|
||||
m.v1.iter().copied(),
|
||||
m.sequence,
|
||||
)
|
||||
.mutation
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
let encoder = BulkPartEncoder::new(metadata, true, 1024);
|
||||
encoder.encode_mutations(&mutations).unwrap().unwrap()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_write_and_read_part_projection() {
|
||||
let part = encode(&[
|
||||
MutationInput {
|
||||
k0: "a",
|
||||
k1: 0,
|
||||
timestamps: &[1],
|
||||
v1: &[Some(0.1)],
|
||||
sequence: 0,
|
||||
},
|
||||
MutationInput {
|
||||
k0: "b",
|
||||
k1: 0,
|
||||
timestamps: &[1],
|
||||
v1: &[Some(0.0)],
|
||||
sequence: 0,
|
||||
},
|
||||
MutationInput {
|
||||
k0: "a",
|
||||
k1: 0,
|
||||
timestamps: &[2],
|
||||
v1: &[Some(0.2)],
|
||||
sequence: 1,
|
||||
},
|
||||
]);
|
||||
|
||||
let projection = &[4u32];
|
||||
|
||||
let mut reader = part
|
||||
.read(Arc::new(BulkIterContext::new(
|
||||
part.metadata.region_metadata.clone(),
|
||||
&Some(projection.as_slice()),
|
||||
None,
|
||||
)))
|
||||
.unwrap()
|
||||
.expect("expect at least one row group");
|
||||
|
||||
let mut total_rows_read = 0;
|
||||
let mut field = vec![];
|
||||
for res in reader {
|
||||
let batch = res.unwrap();
|
||||
assert_eq!(1, batch.fields().len());
|
||||
assert_eq!(4, batch.fields()[0].column_id);
|
||||
field.extend(
|
||||
batch.fields()[0]
|
||||
.data
|
||||
.as_any()
|
||||
.downcast_ref::<Float64Vector>()
|
||||
.unwrap()
|
||||
.iter_data()
|
||||
.map(|v| v.unwrap()),
|
||||
);
|
||||
total_rows_read += batch.num_rows();
|
||||
}
|
||||
assert_eq!(3, total_rows_read);
|
||||
assert_eq!(vec![0.1, 0.2, 0.0], field);
|
||||
}
|
||||
|
||||
fn prepare(key_values: Vec<(&str, u32, (i64, i64), u64)>) -> BulkPart {
|
||||
let metadata = metadata_for_test();
|
||||
let mutations = key_values
|
||||
.into_iter()
|
||||
.map(|(k0, k1, (start, end), sequence)| {
|
||||
let ts = (start..end);
|
||||
let v1 = (start..end).map(|_| None);
|
||||
build_key_values_with_ts_seq_values(&metadata, k0.to_string(), k1, ts, v1, sequence)
|
||||
.mutation
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
let encoder = BulkPartEncoder::new(metadata, true, 100);
|
||||
encoder.encode_mutations(&mutations).unwrap().unwrap()
|
||||
}
|
||||
|
||||
fn check_prune_row_group(part: &BulkPart, predicate: Option<Predicate>, expected_rows: usize) {
|
||||
let context = Arc::new(BulkIterContext::new(
|
||||
part.metadata.region_metadata.clone(),
|
||||
&None,
|
||||
predicate,
|
||||
));
|
||||
let mut reader = part
|
||||
.read(context)
|
||||
.unwrap()
|
||||
.expect("expect at least one row group");
|
||||
let mut total_rows_read = 0;
|
||||
for res in reader {
|
||||
let batch = res.unwrap();
|
||||
total_rows_read += batch.num_rows();
|
||||
}
|
||||
// Verify that only the expected rows are read after pruning.
|
||||
assert_eq!(expected_rows, total_rows_read);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_prune_row_groups() {
|
||||
let part = prepare(vec![
|
||||
("a", 0, (0, 40), 1),
|
||||
("a", 1, (0, 60), 1),
|
||||
("b", 0, (0, 100), 2),
|
||||
("b", 1, (100, 180), 3),
|
||||
("b", 1, (180, 210), 4),
|
||||
]);
|
||||
|
||||
let context = Arc::new(BulkIterContext::new(
|
||||
part.metadata.region_metadata.clone(),
|
||||
&None,
|
||||
Some(Predicate::new(vec![datafusion_expr::col("ts").eq(
|
||||
datafusion_expr::lit(ScalarValue::TimestampMillisecond(Some(300), None)),
|
||||
)])),
|
||||
));
|
||||
assert!(part.read(context).unwrap().is_none());
|
||||
|
||||
check_prune_row_group(&part, None, 310);
|
||||
|
||||
check_prune_row_group(
|
||||
&part,
|
||||
Some(Predicate::new(vec![
|
||||
datafusion_expr::col("k0").eq(datafusion_expr::lit("a")),
|
||||
datafusion_expr::col("k1").eq(datafusion_expr::lit(0u32)),
|
||||
])),
|
||||
40,
|
||||
);
|
||||
|
||||
check_prune_row_group(
|
||||
&part,
|
||||
Some(Predicate::new(vec![
|
||||
datafusion_expr::col("k0").eq(datafusion_expr::lit("a")),
|
||||
datafusion_expr::col("k1").eq(datafusion_expr::lit(1u32)),
|
||||
])),
|
||||
60,
|
||||
);
|
||||
|
||||
check_prune_row_group(
|
||||
&part,
|
||||
Some(Predicate::new(vec![
|
||||
datafusion_expr::col("k0").eq(datafusion_expr::lit("a"))
|
||||
])),
|
||||
100,
|
||||
);
|
||||
|
||||
check_prune_row_group(
|
||||
&part,
|
||||
Some(Predicate::new(vec![
|
||||
datafusion_expr::col("k0").eq(datafusion_expr::lit("b")),
|
||||
datafusion_expr::col("k1").eq(datafusion_expr::lit(0u32)),
|
||||
])),
|
||||
100,
|
||||
);
|
||||
|
||||
// Predicates over field columns can do precise filtering.
|
||||
check_prune_row_group(
|
||||
&part,
|
||||
Some(Predicate::new(vec![
|
||||
datafusion_expr::col("v0").eq(datafusion_expr::lit(150i64))
|
||||
])),
|
||||
1,
|
||||
);
|
||||
}
|
||||
}
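// A minimal sketch of the encode-then-read round trip these tests exercise, assuming
// the `BulkPartEncoder`, `BulkPart` and `BulkIterContext` types above and that
// `crate::read::Batch` is in scope; `metadata` and `mutations` are placeholders.
fn encode_and_scan(metadata: RegionMetadataRef, mutations: &[Mutation]) -> Result<Vec<Batch>> {
    // Dedup enabled, 1024 rows per row group (arbitrary values for the sketch).
    let encoder = BulkPartEncoder::new(metadata.clone(), true, 1024);
    let mut batches = Vec::new();
    if let Some(part) = encoder.encode_mutations(mutations)? {
        // Read everything back: no projection and no predicate.
        let context = Arc::new(BulkIterContext::new(
            part.metadata().region_metadata.clone(),
            &None,
            None,
        ));
        if let Some(iter) = part.read(context)? {
            for batch in iter {
                batches.push(batch?);
            }
        }
    }
    Ok(batches)
}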
|
||||
|
||||
src/mito2/src/memtable/bulk/part_reader.rs (new file, 149 lines)
@@ -0,0 +1,149 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::VecDeque;
|
||||
use std::sync::Arc;
|
||||
|
||||
use bytes::Bytes;
|
||||
use parquet::arrow::ProjectionMask;
|
||||
use parquet::file::metadata::ParquetMetaData;
|
||||
|
||||
use crate::error;
|
||||
use crate::memtable::bulk::context::BulkIterContextRef;
|
||||
use crate::memtable::bulk::row_group_reader::{
|
||||
MemtableRowGroupReader, MemtableRowGroupReaderBuilder,
|
||||
};
|
||||
use crate::read::Batch;
|
||||
|
||||
/// Iterator for reading data inside a bulk part.
|
||||
pub struct BulkPartIter {
|
||||
row_groups_to_read: VecDeque<usize>,
|
||||
current_reader: Option<PruneReader>,
|
||||
builder: MemtableRowGroupReaderBuilder,
|
||||
}
|
||||
|
||||
impl BulkPartIter {
|
||||
/// Creates a new [BulkPartIter].
|
||||
pub(crate) fn try_new(
|
||||
context: BulkIterContextRef,
|
||||
mut row_groups_to_read: VecDeque<usize>,
|
||||
parquet_meta: Arc<ParquetMetaData>,
|
||||
data: Bytes,
|
||||
) -> error::Result<Self> {
|
||||
let projection_mask = ProjectionMask::roots(
|
||||
parquet_meta.file_metadata().schema_descr(),
|
||||
context.read_format().projection_indices().iter().copied(),
|
||||
);
|
||||
|
||||
let builder = MemtableRowGroupReaderBuilder::try_new(
|
||||
context.clone(),
|
||||
projection_mask,
|
||||
parquet_meta,
|
||||
data,
|
||||
)?;
|
||||
|
||||
let init_reader = row_groups_to_read
|
||||
.pop_front()
|
||||
.map(|first_row_group| builder.build_row_group_reader(first_row_group, None))
|
||||
.transpose()?
|
||||
.map(|r| PruneReader::new(context, r));
|
||||
Ok(Self {
|
||||
row_groups_to_read,
|
||||
current_reader: init_reader,
|
||||
builder,
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn next_batch(&mut self) -> error::Result<Option<Batch>> {
|
||||
let Some(current) = &mut self.current_reader else {
|
||||
// All row group exhausted.
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
if let Some(batch) = current.next_batch()? {
|
||||
return Ok(Some(batch));
|
||||
}
|
||||
|
||||
// Previous row group is exhausted, read the next row group.
|
||||
while let Some(next_row_group) = self.row_groups_to_read.pop_front() {
|
||||
current.reset(self.builder.build_row_group_reader(next_row_group, None)?);
|
||||
if let Some(next_batch) = current.next_batch()? {
|
||||
return Ok(Some(next_batch));
|
||||
}
|
||||
}
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
impl Iterator for BulkPartIter {
|
||||
type Item = error::Result<Batch>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.next_batch().transpose()
|
||||
}
|
||||
}
|
||||
|
||||
struct PruneReader {
|
||||
context: BulkIterContextRef,
|
||||
row_group_reader: MemtableRowGroupReader,
|
||||
}
|
||||
|
||||
//todo(hl): maybe we also need to support lastrow mode here.
|
||||
impl PruneReader {
|
||||
fn new(context: BulkIterContextRef, reader: MemtableRowGroupReader) -> Self {
|
||||
Self {
|
||||
context,
|
||||
row_group_reader: reader,
|
||||
}
|
||||
}
|
||||
|
||||
/// Iterates current inner reader until exhausted.
|
||||
fn next_batch(&mut self) -> error::Result<Option<Batch>> {
|
||||
while let Some(b) = self.row_group_reader.next_inner()? {
|
||||
match self.prune(b)? {
|
||||
Some(b) => {
|
||||
return Ok(Some(b));
|
||||
}
|
||||
None => {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
/// Prunes batch according to filters.
|
||||
fn prune(&mut self, batch: Batch) -> error::Result<Option<Batch>> {
|
||||
//todo(hl): add metrics.
|
||||
|
||||
// fast path
|
||||
if self.context.base.filters.is_empty() {
|
||||
return Ok(Some(batch));
|
||||
}
|
||||
|
||||
let Some(batch_filtered) = self.context.base.precise_filter(batch)? else {
|
||||
// the entire batch is filtered out
|
||||
return Ok(None);
|
||||
};
|
||||
if !batch_filtered.is_empty() {
|
||||
Ok(Some(batch_filtered))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
fn reset(&mut self, reader: MemtableRowGroupReader) {
|
||||
self.row_group_reader = reader;
|
||||
}
|
||||
}
|
||||
src/mito2/src/memtable/bulk/row_group_reader.rs (new file, 189 lines)
@@ -0,0 +1,189 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
|
||||
use bytes::Bytes;
|
||||
use datatypes::arrow::array::RecordBatch;
|
||||
use datatypes::arrow::error::ArrowError;
|
||||
use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, RowGroups, RowSelection};
|
||||
use parquet::arrow::{parquet_to_arrow_field_levels, FieldLevels, ProjectionMask};
|
||||
use parquet::column::page::{PageIterator, PageReader};
|
||||
use parquet::file::metadata::ParquetMetaData;
|
||||
use snafu::ResultExt;
|
||||
|
||||
use crate::error;
|
||||
use crate::error::ReadDataPartSnafu;
|
||||
use crate::memtable::bulk::context::BulkIterContextRef;
|
||||
use crate::sst::parquet::format::ReadFormat;
|
||||
use crate::sst::parquet::reader::{RowGroupReaderBase, RowGroupReaderContext};
|
||||
use crate::sst::parquet::row_group::{ColumnChunkIterator, RowGroupBase};
|
||||
use crate::sst::parquet::DEFAULT_READ_BATCH_SIZE;
|
||||
|
||||
/// Helper for reading a specific row group inside memtable Parquet parts.
|
||||
// This is similar to [mito2::sst::parquet::row_group::InMemoryRowGroup] since
|
||||
// it's a workaround for the lack of keyword generics.
|
||||
pub struct MemtableRowGroupPageFetcher<'a> {
|
||||
/// Shared structs for reading row group.
|
||||
base: RowGroupBase<'a>,
|
||||
bytes: Bytes,
|
||||
}
|
||||
|
||||
impl<'a> MemtableRowGroupPageFetcher<'a> {
|
||||
pub(crate) fn create(
|
||||
row_group_idx: usize,
|
||||
parquet_meta: &'a ParquetMetaData,
|
||||
bytes: Bytes,
|
||||
) -> Self {
|
||||
let metadata = parquet_meta.row_group(row_group_idx);
|
||||
let row_count = metadata.num_rows() as usize;
|
||||
let page_locations = parquet_meta
|
||||
.offset_index()
|
||||
.map(|x| x[row_group_idx].as_slice());
|
||||
|
||||
Self {
|
||||
base: RowGroupBase {
|
||||
metadata,
|
||||
page_locations,
|
||||
row_count,
|
||||
column_chunks: vec![None; metadata.columns().len()],
|
||||
// the cached `column_uncompressed_pages` are never used in memtable readers.
|
||||
column_uncompressed_pages: vec![None; metadata.columns().len()],
|
||||
},
|
||||
bytes,
|
||||
}
|
||||
}
|
||||
|
||||
/// Fetches column pages from memory file.
|
||||
pub(crate) fn fetch(&mut self, projection: &ProjectionMask, selection: Option<&RowSelection>) {
|
||||
if let Some((selection, page_locations)) = selection.zip(self.base.page_locations) {
|
||||
// Selection provided.
|
||||
let (fetch_ranges, page_start_offsets) =
|
||||
self.base
|
||||
.calc_sparse_read_ranges(projection, page_locations, selection);
|
||||
if fetch_ranges.is_empty() {
|
||||
return;
|
||||
}
|
||||
let chunk_data = self.fetch_bytes(&fetch_ranges);
|
||||
|
||||
self.base
|
||||
.assign_sparse_chunk(projection, chunk_data, page_start_offsets);
|
||||
} else {
|
||||
let fetch_ranges = self.base.calc_dense_read_ranges(projection);
|
||||
if fetch_ranges.is_empty() {
|
||||
// Nothing to fetch.
|
||||
return;
|
||||
}
|
||||
let chunk_data = self.fetch_bytes(&fetch_ranges);
|
||||
self.base.assign_dense_chunk(projection, chunk_data);
|
||||
}
|
||||
}
|
||||
|
||||
fn fetch_bytes(&self, ranges: &[Range<u64>]) -> Vec<Bytes> {
|
||||
ranges
|
||||
.iter()
|
||||
.map(|range| self.bytes.slice(range.start as usize..range.end as usize))
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Creates a page reader to read column at `i`.
|
||||
fn column_page_reader(&self, i: usize) -> parquet::errors::Result<Box<dyn PageReader>> {
|
||||
let reader = self.base.column_reader(i)?;
|
||||
Ok(Box::new(reader))
|
||||
}
|
||||
}
|
||||
|
||||
impl RowGroups for MemtableRowGroupPageFetcher<'_> {
|
||||
fn num_rows(&self) -> usize {
|
||||
self.base.row_count
|
||||
}
|
||||
|
||||
fn column_chunks(&self, i: usize) -> parquet::errors::Result<Box<dyn PageIterator>> {
|
||||
Ok(Box::new(ColumnChunkIterator {
|
||||
reader: Some(self.column_page_reader(i)),
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
impl RowGroupReaderContext for BulkIterContextRef {
|
||||
fn map_result(
|
||||
&self,
|
||||
result: Result<Option<RecordBatch>, ArrowError>,
|
||||
) -> error::Result<Option<RecordBatch>> {
|
||||
result.context(error::DecodeArrowRowGroupSnafu)
|
||||
}
|
||||
|
||||
fn read_format(&self) -> &ReadFormat {
|
||||
self.as_ref().read_format()
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) type MemtableRowGroupReader = RowGroupReaderBase<BulkIterContextRef>;
|
||||
|
||||
pub(crate) struct MemtableRowGroupReaderBuilder {
|
||||
context: BulkIterContextRef,
|
||||
projection: ProjectionMask,
|
||||
parquet_metadata: Arc<ParquetMetaData>,
|
||||
field_levels: FieldLevels,
|
||||
data: Bytes,
|
||||
}
|
||||
|
||||
impl MemtableRowGroupReaderBuilder {
|
||||
pub(crate) fn try_new(
|
||||
context: BulkIterContextRef,
|
||||
projection: ProjectionMask,
|
||||
parquet_metadata: Arc<ParquetMetaData>,
|
||||
data: Bytes,
|
||||
) -> error::Result<Self> {
|
||||
let parquet_schema_desc = parquet_metadata.file_metadata().schema_descr();
|
||||
let hint = Some(context.read_format().arrow_schema().fields());
|
||||
let field_levels =
|
||||
parquet_to_arrow_field_levels(parquet_schema_desc, projection.clone(), hint)
|
||||
.context(ReadDataPartSnafu)?;
|
||||
Ok(Self {
|
||||
context,
|
||||
projection,
|
||||
parquet_metadata,
|
||||
field_levels,
|
||||
data,
|
||||
})
|
||||
}
|
||||
|
||||
/// Builds a reader to read the row group at `row_group_idx` from memory.
|
||||
pub(crate) fn build_row_group_reader(
|
||||
&self,
|
||||
row_group_idx: usize,
|
||||
row_selection: Option<RowSelection>,
|
||||
) -> error::Result<MemtableRowGroupReader> {
|
||||
let mut row_group = MemtableRowGroupPageFetcher::create(
|
||||
row_group_idx,
|
||||
&self.parquet_metadata,
|
||||
self.data.clone(),
|
||||
);
|
||||
// Fetches data from the in-memory part. Currently, row selection is not supported.
|
||||
row_group.fetch(&self.projection, row_selection.as_ref());
|
||||
|
||||
// Builds the parquet reader.
|
||||
// The row selection is always None for now.
|
||||
let reader = ParquetRecordBatchReader::try_new_with_row_groups(
|
||||
&self.field_levels,
|
||||
&row_group,
|
||||
DEFAULT_READ_BATCH_SIZE,
|
||||
row_selection,
|
||||
)
|
||||
.context(ReadDataPartSnafu)?;
|
||||
Ok(MemtableRowGroupReader::create(self.context.clone(), reader))
|
||||
}
|
||||
}
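// A minimal sketch of driving the builder above: decode one row group of the
// in-memory part with no row selection. Assumes `crate::read::Batch` is in scope
// and that `builder` comes from `MemtableRowGroupReaderBuilder::try_new`.
fn read_one_row_group(
    builder: &MemtableRowGroupReaderBuilder,
    row_group_idx: usize,
) -> error::Result<Vec<Batch>> {
    let mut reader = builder.build_row_group_reader(row_group_idx, None)?;
    let mut batches = Vec::new();
    // `next_inner` yields decoded batches until the row group is exhausted.
    while let Some(batch) = reader.next_inner()? {
        batches.push(batch);
    }
    Ok(batches)
}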
|
||||
@@ -99,11 +99,8 @@ impl RowGroupLastRowCachedReader {
|
||||
return Self::new_miss(key, row_group_reader, None);
|
||||
};
|
||||
if let Some(value) = cache_manager.get_selector_result(&key) {
|
||||
let schema_matches = value.projection
|
||||
== row_group_reader
|
||||
.context()
|
||||
.read_format()
|
||||
.projection_indices();
|
||||
let schema_matches =
|
||||
value.projection == row_group_reader.read_format().projection_indices();
|
||||
if schema_matches {
|
||||
// Schema matches, use cache batches.
|
||||
Self::new_hit(value)
|
||||
@@ -218,29 +215,23 @@ impl RowGroupLastRowReader {
|
||||
};
|
||||
|
||||
// All last rows in row group are yielded, update cache.
|
||||
self.update_cache();
|
||||
self.maybe_update_cache();
|
||||
Ok(last_batch)
|
||||
}
|
||||
|
||||
/// Updates row group's last row cache if cache manager is present.
|
||||
fn update_cache(&mut self) {
|
||||
if self.yielded_batches.is_empty() {
|
||||
// we always expect that row groups yield batches.
|
||||
return;
|
||||
fn maybe_update_cache(&mut self) {
|
||||
if let Some(cache) = &self.cache_manager {
|
||||
if self.yielded_batches.is_empty() {
|
||||
// we always expect that row groups yield batches.
|
||||
return;
|
||||
}
|
||||
let value = Arc::new(SelectorResultValue {
|
||||
result: std::mem::take(&mut self.yielded_batches),
|
||||
projection: self.reader.read_format().projection_indices().to_vec(),
|
||||
});
|
||||
cache.put_selector_result(self.key, value)
|
||||
}
|
||||
let Some(cache) = &self.cache_manager else {
|
||||
return;
|
||||
};
|
||||
let value = Arc::new(SelectorResultValue {
|
||||
result: std::mem::take(&mut self.yielded_batches),
|
||||
projection: self
|
||||
.reader
|
||||
.context()
|
||||
.read_format()
|
||||
.projection_indices()
|
||||
.to_vec(),
|
||||
});
|
||||
cache.put_selector_result(self.key, value);
|
||||
}
|
||||
|
||||
fn metrics(&self) -> &ReaderMetrics {
|
||||
|
||||
@@ -399,7 +399,7 @@ impl ScanRegion {
|
||||
});
|
||||
}
|
||||
|
||||
/// Use the latest schema to build the inveretd index applier.
|
||||
/// Use the latest schema to build the inverted index applier.
|
||||
fn build_invereted_index_applier(&self) -> Option<InvertedIndexApplierRef> {
|
||||
if self.ignore_inverted_index {
|
||||
return None;
|
||||
|
||||
@@ -447,7 +447,26 @@ impl ManifestContext {
|
||||
pub(crate) fn set_role(&self, next_role: RegionRole, region_id: RegionId) {
|
||||
match next_role {
|
||||
RegionRole::Follower => {
|
||||
self.state.store(RegionRoleState::Follower);
|
||||
match self.state.fetch_update(|state| {
|
||||
if !matches!(state, RegionRoleState::Follower) {
|
||||
Some(RegionRoleState::Follower)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}) {
|
||||
Ok(state) => info!(
|
||||
"Convert region {} to follower, previous role state: {:?}",
|
||||
region_id, state
|
||||
),
|
||||
Err(state) => {
|
||||
if state != RegionRoleState::Follower {
|
||||
warn!(
|
||||
"Failed to convert region {} to follower, current role state: {:?}",
|
||||
region_id, state
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
RegionRole::Leader => {
|
||||
match self.state.fetch_update(|state| {
|
||||
|
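The Follower branch above swaps a plain store for a conditional fetch_update, so the transition (and its log line) only fires when the region is not already a follower. A minimal sketch of that pattern with a std atomic, assuming a two-state role for brevity (the real RegionRoleState has more variants):

// Sketch, not part of the diff: conditional state transition via fetch_update.
use std::sync::atomic::{AtomicU8, Ordering};

const FOLLOWER: u8 = 1;

fn set_follower(state: &AtomicU8) {
    match state.fetch_update(Ordering::SeqCst, Ordering::SeqCst, |s| {
        // Only write when the state actually changes; returning None leaves it untouched.
        (s != FOLLOWER).then_some(FOLLOWER)
    }) {
        Ok(prev) => println!("converted to follower, previous state: {prev}"),
        Err(prev) => println!("already a follower ({prev}), nothing to do"),
    }
}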
||||
@@ -29,7 +29,7 @@ use snafu::ResultExt;
|
||||
use store_api::storage::RegionId;
|
||||
|
||||
use crate::cache::file_cache::{FileCacheRef, FileType, IndexKey};
|
||||
use crate::cache::index::{CachedInvertedIndexBlobReader, InvertedIndexCacheRef};
|
||||
use crate::cache::index::inverted_index::{CachedInvertedIndexBlobReader, InvertedIndexCacheRef};
|
||||
use crate::error::{
|
||||
ApplyInvertedIndexSnafu, MetadataSnafu, PuffinBuildReaderSnafu, PuffinReadBlobSnafu, Result,
|
||||
};
|
||||
|
||||
@@ -34,7 +34,7 @@ use store_api::metadata::RegionMetadata;
|
||||
use store_api::storage::ColumnId;
|
||||
|
||||
use crate::cache::file_cache::FileCacheRef;
|
||||
use crate::cache::index::InvertedIndexCacheRef;
|
||||
use crate::cache::index::inverted_index::InvertedIndexCacheRef;
|
||||
use crate::error::{BuildIndexApplierSnafu, ColumnNotFoundSnafu, ConvertValueSnafu, Result};
|
||||
use crate::row_converter::SortField;
|
||||
use crate::sst::index::inverted_index::applier::InvertedIndexApplier;
|
||||
|
||||
@@ -316,7 +316,7 @@ mod tests {
|
||||
use store_api::storage::RegionId;
|
||||
|
||||
use super::*;
|
||||
use crate::cache::index::InvertedIndexCache;
|
||||
use crate::cache::index::inverted_index::InvertedIndexCache;
|
||||
use crate::metrics::CACHE_BYTES;
|
||||
use crate::read::BatchColumn;
|
||||
use crate::row_converter::{McmpRowCodec, RowCodec, SortField};
|
||||
|
||||
@@ -27,11 +27,11 @@ pub(crate) mod file_range;
|
||||
pub mod format;
|
||||
pub(crate) mod helper;
|
||||
pub(crate) mod metadata;
|
||||
mod page_reader;
|
||||
pub(crate) mod page_reader;
|
||||
pub mod reader;
|
||||
pub mod row_group;
|
||||
mod row_selection;
|
||||
mod stats;
|
||||
pub(crate) mod stats;
|
||||
pub mod writer;
|
||||
|
||||
/// Key of metadata in parquet SST.
|
||||
|
||||
@@ -24,6 +24,7 @@ use async_trait::async_trait;
|
||||
use common_recordbatch::filter::SimpleFilterEvaluator;
|
||||
use common_telemetry::{debug, warn};
|
||||
use datafusion_expr::Expr;
|
||||
use datatypes::arrow::error::ArrowError;
|
||||
use datatypes::arrow::record_batch::RecordBatch;
|
||||
use datatypes::data_type::ConcreteDataType;
|
||||
use itertools::Itertools;
|
||||
@@ -39,7 +40,8 @@ use table::predicate::Predicate;
|
||||
|
||||
use crate::cache::CacheManagerRef;
|
||||
use crate::error::{
|
||||
ArrowReaderSnafu, InvalidMetadataSnafu, InvalidParquetSnafu, ReadParquetSnafu, Result,
|
||||
ArrowReaderSnafu, InvalidMetadataSnafu, InvalidParquetSnafu, ReadDataPartSnafu,
|
||||
ReadParquetSnafu, Result,
|
||||
};
|
||||
use crate::metrics::{
|
||||
PRECISE_FILTER_ROWS_TOTAL, READ_ROWS_IN_ROW_GROUP_TOTAL, READ_ROWS_TOTAL,
|
||||
@@ -207,8 +209,7 @@ impl ParquetReaderBuilder {
|
||||
let hint = Some(read_format.arrow_schema().fields());
|
||||
let field_levels =
|
||||
parquet_to_arrow_field_levels(parquet_schema_desc, projection_mask.clone(), hint)
|
||||
.context(ReadParquetSnafu { path: &file_path })?;
|
||||
|
||||
.context(ReadDataPartSnafu)?;
|
||||
let row_groups = self
|
||||
.row_groups_to_read(&read_format, &parquet_meta, &mut metrics.filter_metrics)
|
||||
.await;
|
||||
@@ -871,7 +872,7 @@ impl SimpleFilterContext {
|
||||
///
|
||||
/// Returns None if the column to filter doesn't exist in the SST metadata or the
|
||||
/// expected metadata.
|
||||
fn new_opt(
|
||||
pub(crate) fn new_opt(
|
||||
sst_meta: &RegionMetadataRef,
|
||||
expected_meta: Option<&RegionMetadata>,
|
||||
expr: &Expr,
|
||||
@@ -1035,10 +1036,51 @@ impl ParquetReader {
|
||||
}
|
||||
}
|
||||
|
||||
/// RowGroupReaderContext represents the fields that cannot be shared
|
||||
/// between different `RowGroupReader`s.
|
||||
pub(crate) trait RowGroupReaderContext: Send {
|
||||
fn map_result(
|
||||
&self,
|
||||
result: std::result::Result<Option<RecordBatch>, ArrowError>,
|
||||
) -> Result<Option<RecordBatch>>;
|
||||
|
||||
fn read_format(&self) -> &ReadFormat;
|
||||
}
|
||||
|
||||
impl RowGroupReaderContext for FileRangeContextRef {
|
||||
fn map_result(
|
||||
&self,
|
||||
result: std::result::Result<Option<RecordBatch>, ArrowError>,
|
||||
) -> Result<Option<RecordBatch>> {
|
||||
result.context(ArrowReaderSnafu {
|
||||
path: self.file_path(),
|
||||
})
|
||||
}
|
||||
|
||||
fn read_format(&self) -> &ReadFormat {
|
||||
self.as_ref().read_format()
|
||||
}
|
||||
}
|
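RowGroupReaderContext is the seam that lets one reader body serve both the file-based path (FileRangeContextRef above) and the memtable path added in this change: the context supplies error mapping and the ReadFormat, while batching, metrics and iteration stay shared in RowGroupReaderBase<T>. A minimal sketch of the same shape, with placeholder types standing in for the real context and reader:

// Sketch, not part of the diff: one reader body shared across contexts via a trait.
trait ReaderContext: Send {
    fn source(&self) -> String;
    fn map_err(&self, e: std::io::Error) -> String;
}

struct FileContext {
    path: String,
}

impl ReaderContext for FileContext {
    fn source(&self) -> String {
        format!("file {}", self.path)
    }
    fn map_err(&self, e: std::io::Error) -> String {
        format!("{e} while reading {}", self.path)
    }
}

struct GenericReader<T> {
    context: T,
}

impl<T: ReaderContext> GenericReader<T> {
    // The shared logic lives here once; only error mapping and metadata differ per context.
    fn read(&self) -> Result<String, String> {
        Ok(format!("read a batch from {}", self.context.source()))
    }
}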
||||
|
||||
/// [RowGroupReader] that reads from [FileRange].
|
||||
pub(crate) type RowGroupReader = RowGroupReaderBase<FileRangeContextRef>;
|
||||
|
||||
impl RowGroupReader {
|
||||
/// Creates a new reader from file range.
|
||||
pub(crate) fn new(context: FileRangeContextRef, reader: ParquetRecordBatchReader) -> Self {
|
||||
Self {
|
||||
context,
|
||||
reader,
|
||||
batches: VecDeque::new(),
|
||||
metrics: ReaderMetrics::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Reader to read a row group of a parquet file.
|
||||
pub struct RowGroupReader {
|
||||
/// Context for file ranges.
|
||||
context: FileRangeContextRef,
|
||||
pub(crate) struct RowGroupReaderBase<T> {
|
||||
/// Context of [RowGroupReader] so adapts to different underlying implementation.
|
||||
context: T,
|
||||
/// Inner parquet reader.
|
||||
reader: ParquetRecordBatchReader,
|
||||
/// Buffered batches to return.
|
||||
@@ -1047,9 +1089,12 @@ pub struct RowGroupReader {
|
||||
metrics: ReaderMetrics,
|
||||
}
|
||||
|
||||
impl RowGroupReader {
|
||||
impl<T> RowGroupReaderBase<T>
|
||||
where
|
||||
T: RowGroupReaderContext,
|
||||
{
|
||||
/// Creates a new reader.
|
||||
pub(crate) fn new(context: FileRangeContextRef, reader: ParquetRecordBatchReader) -> Self {
|
||||
pub(crate) fn create(context: T, reader: ParquetRecordBatchReader) -> Self {
|
||||
Self {
|
||||
context,
|
||||
reader,
|
||||
@@ -1062,21 +1107,19 @@ impl RowGroupReader {
|
||||
pub(crate) fn metrics(&self) -> &ReaderMetrics {
|
||||
&self.metrics
|
||||
}
|
||||
pub(crate) fn context(&self) -> &FileRangeContextRef {
|
||||
&self.context
|
||||
|
||||
/// Gets [ReadFormat] of underlying reader.
|
||||
pub(crate) fn read_format(&self) -> &ReadFormat {
|
||||
self.context.read_format()
|
||||
}
|
||||
|
||||
/// Tries to fetch next [RecordBatch] from the reader.
|
||||
fn fetch_next_record_batch(&mut self) -> Result<Option<RecordBatch>> {
|
||||
self.reader.next().transpose().context(ArrowReaderSnafu {
|
||||
path: self.context.file_path(),
|
||||
})
|
||||
self.context.map_result(self.reader.next().transpose())
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl BatchReader for RowGroupReader {
|
||||
async fn next_batch(&mut self) -> Result<Option<Batch>> {
|
||||
/// Returns the next [Batch].
|
||||
pub(crate) fn next_inner(&mut self) -> Result<Option<Batch>> {
|
||||
let scan_start = Instant::now();
|
||||
if let Some(batch) = self.batches.pop_front() {
|
||||
self.metrics.num_rows += batch.num_rows();
|
||||
@@ -1104,6 +1147,16 @@ impl BatchReader for RowGroupReader {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl<T> BatchReader for RowGroupReaderBase<T>
|
||||
where
|
||||
T: RowGroupReaderContext,
|
||||
{
|
||||
async fn next_batch(&mut self) -> Result<Option<Batch>> {
|
||||
self.next_inner()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use parquet::arrow::arrow_reader::RowSelector;
|
||||
|
||||
@@ -38,25 +38,196 @@ use crate::sst::file::FileId;
|
||||
use crate::sst::parquet::helper::fetch_byte_ranges;
|
||||
use crate::sst::parquet::page_reader::RowGroupCachedReader;
|
||||
|
||||
/// An in-memory collection of column chunks
|
||||
pub struct InMemoryRowGroup<'a> {
|
||||
metadata: &'a RowGroupMetaData,
|
||||
page_locations: Option<&'a [Vec<PageLocation>]>,
|
||||
pub(crate) struct RowGroupBase<'a> {
|
||||
pub(crate) metadata: &'a RowGroupMetaData,
|
||||
pub(crate) page_locations: Option<&'a [Vec<PageLocation>]>,
|
||||
/// Compressed page of each column.
|
||||
column_chunks: Vec<Option<Arc<ColumnChunkData>>>,
|
||||
row_count: usize,
|
||||
region_id: RegionId,
|
||||
file_id: FileId,
|
||||
row_group_idx: usize,
|
||||
cache_manager: Option<CacheManagerRef>,
|
||||
pub(crate) column_chunks: Vec<Option<Arc<ColumnChunkData>>>,
|
||||
pub(crate) row_count: usize,
|
||||
/// Row group level cached pages for each column.
|
||||
///
|
||||
/// These pages are uncompressed pages of a row group.
|
||||
/// `column_uncompressed_pages.len()` equals to `column_chunks.len()`.
|
||||
column_uncompressed_pages: Vec<Option<Arc<PageValue>>>,
|
||||
pub(crate) column_uncompressed_pages: Vec<Option<Arc<PageValue>>>,
|
||||
}
|
||||
|
||||
impl<'a> RowGroupBase<'a> {
|
||||
pub(crate) fn new(parquet_meta: &'a ParquetMetaData, row_group_idx: usize) -> Self {
|
||||
let metadata = parquet_meta.row_group(row_group_idx);
|
||||
// `page_locations` is always `None` if we don't set
|
||||
// [with_page_index()](https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index)
|
||||
// to `true`.
|
||||
let page_locations = parquet_meta
|
||||
.offset_index()
|
||||
.map(|x| x[row_group_idx].as_slice());
|
||||
|
||||
Self {
|
||||
metadata,
|
||||
page_locations,
|
||||
column_chunks: vec![None; metadata.columns().len()],
|
||||
row_count: metadata.num_rows() as usize,
|
||||
column_uncompressed_pages: vec![None; metadata.columns().len()],
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn calc_sparse_read_ranges(
|
||||
&self,
|
||||
projection: &ProjectionMask,
|
||||
page_locations: &[Vec<PageLocation>],
|
||||
selection: &RowSelection,
|
||||
) -> (Vec<Range<u64>>, Vec<Vec<usize>>) {
|
||||
// If we have a `RowSelection` and an `OffsetIndex` then only fetch pages required for the
|
||||
// `RowSelection`
|
||||
let mut page_start_offsets: Vec<Vec<usize>> = vec![];
|
||||
let ranges = self
|
||||
.column_chunks
|
||||
.iter()
|
||||
.zip(self.metadata.columns())
|
||||
.enumerate()
|
||||
.filter(|&(idx, (chunk, _chunk_meta))| chunk.is_none() && projection.leaf_included(idx))
|
||||
.flat_map(|(idx, (_chunk, chunk_meta))| {
|
||||
// If the first page does not start at the beginning of the column,
|
||||
// then we need to also fetch a dictionary page.
|
||||
let mut ranges = vec![];
|
||||
let (start, _len) = chunk_meta.byte_range();
|
||||
match page_locations[idx].first() {
|
||||
Some(first) if first.offset as u64 != start => {
|
||||
ranges.push(start..first.offset as u64);
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
|
||||
ranges.extend(
|
||||
selection
|
||||
.scan_ranges(&page_locations[idx])
|
||||
.iter()
|
||||
.map(|range| range.start as u64..range.end as u64),
|
||||
);
|
||||
page_start_offsets.push(ranges.iter().map(|range| range.start as usize).collect());
|
||||
|
||||
ranges
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
(ranges, page_start_offsets)
|
||||
}
|
||||
|
||||
pub(crate) fn assign_sparse_chunk(
|
||||
&mut self,
|
||||
projection: &ProjectionMask,
|
||||
data: Vec<Bytes>,
|
||||
page_start_offsets: Vec<Vec<usize>>,
|
||||
) {
|
||||
let mut page_start_offsets = page_start_offsets.into_iter();
|
||||
let mut chunk_data = data.into_iter();
|
||||
|
||||
for (idx, chunk) in self.column_chunks.iter_mut().enumerate() {
|
||||
if chunk.is_some() || !projection.leaf_included(idx) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Some(offsets) = page_start_offsets.next() {
|
||||
let mut chunks = Vec::with_capacity(offsets.len());
|
||||
for _ in 0..offsets.len() {
|
||||
chunks.push(chunk_data.next().unwrap());
|
||||
}
|
||||
|
||||
*chunk = Some(Arc::new(ColumnChunkData::Sparse {
|
||||
length: self.metadata.column(idx).byte_range().1 as usize,
|
||||
data: offsets.into_iter().zip(chunks).collect(),
|
||||
}))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn calc_dense_read_ranges(&self, projection: &ProjectionMask) -> Vec<Range<u64>> {
|
||||
self.column_chunks
|
||||
.iter()
|
||||
.zip(&self.column_uncompressed_pages)
|
||||
.enumerate()
|
||||
.filter(|&(idx, (chunk, uncompressed_pages))| {
|
||||
// Don't need to fetch column data if we already cache the column's pages.
|
||||
chunk.is_none() && projection.leaf_included(idx) && uncompressed_pages.is_none()
|
||||
})
|
||||
.map(|(idx, (_chunk, _pages))| {
|
||||
let column = self.metadata.column(idx);
|
||||
let (start, length) = column.byte_range();
|
||||
start..(start + length)
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
}
|
||||
|
||||
/// Assigns uncompressed chunk binary data to [RowGroupBase::column_chunks]
|
||||
/// and returns the chunk offset and binary data assigned.
|
||||
pub(crate) fn assign_dense_chunk(
|
||||
&mut self,
|
||||
projection: &ProjectionMask,
|
||||
chunk_data: Vec<Bytes>,
|
||||
) -> Vec<(usize, Bytes)> {
|
||||
let mut chunk_data = chunk_data.into_iter();
|
||||
let mut res = vec![];
|
||||
|
||||
for (idx, (chunk, row_group_pages)) in self
|
||||
.column_chunks
|
||||
.iter_mut()
|
||||
.zip(&self.column_uncompressed_pages)
|
||||
.enumerate()
|
||||
{
|
||||
if chunk.is_some() || !projection.leaf_included(idx) || row_group_pages.is_some() {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Get the fetched page.
|
||||
let Some(data) = chunk_data.next() else {
|
||||
continue;
|
||||
};
|
||||
|
||||
let column = self.metadata.column(idx);
|
||||
res.push((idx, data.clone()));
|
||||
*chunk = Some(Arc::new(ColumnChunkData::Dense {
|
||||
offset: column.byte_range().0 as usize,
|
||||
data,
|
||||
}));
|
||||
}
|
||||
res
|
||||
}
|
||||
|
||||
/// Create [PageReader] from [RowGroupBase::column_chunks]
|
||||
pub(crate) fn column_reader(
|
||||
&self,
|
||||
col_idx: usize,
|
||||
) -> Result<SerializedPageReader<ColumnChunkData>> {
|
||||
let page_reader = match &self.column_chunks[col_idx] {
|
||||
None => {
|
||||
return Err(ParquetError::General(format!(
|
||||
"Invalid column index {col_idx}, column was not fetched"
|
||||
)))
|
||||
}
|
||||
Some(data) => {
|
||||
let page_locations = self.page_locations.map(|index| index[col_idx].clone());
|
||||
SerializedPageReader::new(
|
||||
data.clone(),
|
||||
self.metadata.column(col_idx),
|
||||
self.row_count,
|
||||
page_locations,
|
||||
)?
|
||||
}
|
||||
};
|
||||
|
||||
// This column don't cache uncompressed pages.
|
||||
Ok(page_reader)
|
||||
}
|
||||
}
|
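calc_dense_read_ranges boils down to: for every projected column whose chunk has not been loaded and whose pages are not already in the row-group cache, fetch the column chunk's full byte range. A standalone sketch of that selection, with plain (start, length) tuples standing in for the parquet column metadata:

// Sketch, not part of the diff: choose byte ranges for columns that still need fetching.
use std::ops::Range;

/// `byte_ranges` holds (start, length) per column, as returned by
/// ColumnChunkMetaData::byte_range() in the code above.
fn dense_read_ranges(
    byte_ranges: &[(u64, u64)],
    projected: &[bool],
    chunk_loaded: &[bool],
    pages_cached: &[bool],
) -> Vec<Range<u64>> {
    byte_ranges
        .iter()
        .enumerate()
        .filter(|&(i, _)| projected[i] && !chunk_loaded[i] && !pages_cached[i])
        .map(|(_, &(start, len))| start..start + len)
        .collect()
}

#[test]
fn skips_loaded_and_cached_columns() {
    let ranges = dense_read_ranges(
        &[(0, 100), (100, 50), (150, 25)],
        &[true, true, true],
        &[false, true, false],  // column 1 already has its chunk
        &[false, false, true],  // column 2 is served from the page cache
    );
    assert_eq!(ranges, vec![0..100]);
}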
||||
|
||||
/// An in-memory collection of column chunks
|
||||
pub struct InMemoryRowGroup<'a> {
|
||||
region_id: RegionId,
|
||||
file_id: FileId,
|
||||
row_group_idx: usize,
|
||||
cache_manager: Option<CacheManagerRef>,
|
||||
file_path: &'a str,
|
||||
/// Object store.
|
||||
object_store: ObjectStore,
|
||||
base: RowGroupBase<'a>,
|
||||
}
|
||||
|
||||
impl<'a> InMemoryRowGroup<'a> {
|
||||
@@ -73,24 +244,12 @@ impl<'a> InMemoryRowGroup<'a> {
|
||||
file_path: &'a str,
|
||||
object_store: ObjectStore,
|
||||
) -> Self {
|
||||
let metadata = parquet_meta.row_group(row_group_idx);
|
||||
// `page_locations` is always `None` if we don't set
|
||||
// [with_page_index()](https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index)
|
||||
// to `true`.
|
||||
let page_locations = parquet_meta
|
||||
.offset_index()
|
||||
.map(|x| x[row_group_idx].as_slice());
|
||||
|
||||
Self {
|
||||
metadata,
|
||||
row_count: metadata.num_rows() as usize,
|
||||
column_chunks: vec![None; metadata.columns().len()],
|
||||
page_locations,
|
||||
base: RowGroupBase::new(parquet_meta, row_group_idx),
|
||||
region_id,
|
||||
file_id,
|
||||
row_group_idx,
|
||||
cache_manager,
|
||||
column_uncompressed_pages: vec![None; metadata.columns().len()],
|
||||
file_path,
|
||||
object_store,
|
||||
}
|
||||
@@ -102,65 +261,15 @@ impl<'a> InMemoryRowGroup<'a> {
|
||||
projection: &ProjectionMask,
|
||||
selection: Option<&RowSelection>,
|
||||
) -> Result<()> {
|
||||
if let Some((selection, page_locations)) = selection.zip(self.page_locations) {
|
||||
// If we have a `RowSelection` and an `OffsetIndex` then only fetch pages required for the
|
||||
// `RowSelection`
|
||||
let mut page_start_offsets: Vec<Vec<usize>> = vec![];
|
||||
if let Some((selection, page_locations)) = selection.zip(self.base.page_locations) {
|
||||
let (fetch_ranges, page_start_offsets) =
|
||||
self.base
|
||||
.calc_sparse_read_ranges(projection, page_locations, selection);
|
||||
|
||||
let fetch_ranges = self
|
||||
.column_chunks
|
||||
.iter()
|
||||
.zip(self.metadata.columns())
|
||||
.enumerate()
|
||||
.filter(|&(idx, (chunk, _chunk_meta))| {
|
||||
chunk.is_none() && projection.leaf_included(idx)
|
||||
})
|
||||
.flat_map(|(idx, (_chunk, chunk_meta))| {
|
||||
// If the first page does not start at the beginning of the column,
|
||||
// then we need to also fetch a dictionary page.
|
||||
let mut ranges = vec![];
|
||||
let (start, _len) = chunk_meta.byte_range();
|
||||
match page_locations[idx].first() {
|
||||
Some(first) if first.offset as u64 != start => {
|
||||
ranges.push(start..first.offset as u64);
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
|
||||
ranges.extend(
|
||||
selection
|
||||
.scan_ranges(&page_locations[idx])
|
||||
.iter()
|
||||
.map(|range| range.start as u64..range.end as u64),
|
||||
);
|
||||
page_start_offsets
|
||||
.push(ranges.iter().map(|range| range.start as usize).collect());
|
||||
|
||||
ranges
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let mut chunk_data = self.fetch_bytes(&fetch_ranges).await?.into_iter();
|
||||
|
||||
let mut page_start_offsets = page_start_offsets.into_iter();
|
||||
|
||||
for (idx, chunk) in self.column_chunks.iter_mut().enumerate() {
|
||||
if chunk.is_some() || !projection.leaf_included(idx) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Some(offsets) = page_start_offsets.next() {
|
||||
let mut chunks = Vec::with_capacity(offsets.len());
|
||||
for _ in 0..offsets.len() {
|
||||
chunks.push(chunk_data.next().unwrap());
|
||||
}
|
||||
|
||||
*chunk = Some(Arc::new(ColumnChunkData::Sparse {
|
||||
length: self.metadata.column(idx).byte_range().1 as usize,
|
||||
data: offsets.into_iter().zip(chunks).collect(),
|
||||
}))
|
||||
}
|
||||
}
|
||||
let chunk_data = self.fetch_bytes(&fetch_ranges).await?;
|
||||
// Assign sparse chunk data to base.
|
||||
self.base
|
||||
.assign_sparse_chunk(projection, chunk_data, page_start_offsets);
|
||||
} else {
|
||||
// Now we only use cache in dense chunk data.
|
||||
self.fetch_pages_from_cache(projection);
|
||||
@@ -169,46 +278,24 @@ impl<'a> InMemoryRowGroup<'a> {
|
||||
// is a synchronous, CPU-bound operation.
|
||||
yield_now().await;
|
||||
|
||||
let fetch_ranges = self
|
||||
.column_chunks
|
||||
.iter()
|
||||
.zip(&self.column_uncompressed_pages)
|
||||
.enumerate()
|
||||
.filter(|&(idx, (chunk, uncompressed_pages))| {
|
||||
// Don't need to fetch column data if we already cache the column's pages.
|
||||
chunk.is_none() && projection.leaf_included(idx) && uncompressed_pages.is_none()
|
||||
})
|
||||
.map(|(idx, (_chunk, _pages))| {
|
||||
let column = self.metadata.column(idx);
|
||||
let (start, length) = column.byte_range();
|
||||
start..(start + length)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
// Calculate ranges to read.
|
||||
let fetch_ranges = self.base.calc_dense_read_ranges(projection);
|
||||
|
||||
if fetch_ranges.is_empty() {
|
||||
// Nothing to fetch.
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut chunk_data = self.fetch_bytes(&fetch_ranges).await?.into_iter();
|
||||
// Fetch data with ranges
|
||||
let chunk_data = self.fetch_bytes(&fetch_ranges).await?;
|
||||
|
||||
for (idx, (chunk, row_group_pages)) in self
|
||||
.column_chunks
|
||||
.iter_mut()
|
||||
.zip(&self.column_uncompressed_pages)
|
||||
.enumerate()
|
||||
{
|
||||
if chunk.is_some() || !projection.leaf_included(idx) || row_group_pages.is_some() {
|
||||
continue;
|
||||
}
|
||||
// Assigns fetched data to base.
|
||||
let assigned_columns = self.base.assign_dense_chunk(projection, chunk_data);
|
||||
|
||||
// Get the fetched page.
|
||||
let Some(data) = chunk_data.next() else {
|
||||
continue;
|
||||
};
|
||||
|
||||
let column = self.metadata.column(idx);
|
||||
if let Some(cache) = &self.cache_manager {
|
||||
// Put fetched data to cache if necessary.
|
||||
if let Some(cache) = &self.cache_manager {
|
||||
for (col_idx, data) in assigned_columns {
|
||||
let column = self.base.metadata.column(col_idx);
|
||||
if !cache_uncompressed_pages(column) {
|
||||
// For columns that have multiple uncompressed pages, we only cache the compressed page
|
||||
// to save memory.
|
||||
@@ -216,17 +303,12 @@ impl<'a> InMemoryRowGroup<'a> {
|
||||
self.region_id,
|
||||
self.file_id,
|
||||
self.row_group_idx,
|
||||
idx,
|
||||
col_idx,
|
||||
);
|
||||
cache
|
||||
.put_pages(page_key, Arc::new(PageValue::new_compressed(data.clone())));
|
||||
}
|
||||
}
|
||||
|
||||
*chunk = Some(Arc::new(ColumnChunkData::Dense {
|
||||
offset: column.byte_range().0 as usize,
|
||||
data,
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
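On the sparse path the reader only fetches the pages a RowSelection needs, plus one extra range when the column's first data page does not start at the chunk's start offset: the leading bytes hold the dictionary page, which decoding still requires. A sketch of that range calculation for a single column, assuming page locations are already reduced to byte ranges:

// Sketch, not part of the diff: byte ranges for a sparse (row-selection) fetch of one column.
use std::ops::Range;

fn sparse_ranges(
    column_start: u64,
    first_data_page_offset: u64,
    selected_page_ranges: &[Range<u64>],
) -> Vec<Range<u64>> {
    let mut ranges = Vec::new();
    // Bytes before the first data page belong to the dictionary page; fetch them too.
    if first_data_page_offset != column_start {
        ranges.push(column_start..first_data_page_offset);
    }
    ranges.extend(selected_page_ranges.iter().cloned());
    ranges
}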
||||
@@ -237,7 +319,8 @@ impl<'a> InMemoryRowGroup<'a> {
|
||||
/// If the page is in the cache, sets the column chunk or `column_uncompressed_pages` for the column.
|
||||
fn fetch_pages_from_cache(&mut self, projection: &ProjectionMask) {
|
||||
let _timer = READ_STAGE_FETCH_PAGES.start_timer();
|
||||
self.column_chunks
|
||||
self.base
|
||||
.column_chunks
|
||||
.iter_mut()
|
||||
.enumerate()
|
||||
.filter(|(idx, chunk)| chunk.is_none() && projection.leaf_included(*idx))
|
||||
@@ -245,7 +328,7 @@ impl<'a> InMemoryRowGroup<'a> {
|
||||
let Some(cache) = &self.cache_manager else {
|
||||
return;
|
||||
};
|
||||
let column = self.metadata.column(idx);
|
||||
let column = self.base.metadata.column(idx);
|
||||
if cache_uncompressed_pages(column) {
|
||||
// Fetches uncompressed pages for the row group.
|
||||
let page_key = PageKey::new_uncompressed(
|
||||
@@ -254,7 +337,7 @@ impl<'a> InMemoryRowGroup<'a> {
|
||||
self.row_group_idx,
|
||||
idx,
|
||||
);
|
||||
self.column_uncompressed_pages[idx] = cache.get_pages(&page_key);
|
||||
self.base.column_uncompressed_pages[idx] = cache.get_pages(&page_key);
|
||||
} else {
|
||||
// Fetches the compressed page from the cache.
|
||||
let page_key = PageKey::new_compressed(
|
||||
@@ -308,34 +391,19 @@ impl<'a> InMemoryRowGroup<'a> {
|
||||
|
||||
/// Creates a page reader to read column at `i`.
|
||||
fn column_page_reader(&self, i: usize) -> Result<Box<dyn PageReader>> {
|
||||
if let Some(cached_pages) = &self.column_uncompressed_pages[i] {
|
||||
if let Some(cached_pages) = &self.base.column_uncompressed_pages[i] {
|
||||
debug_assert!(!cached_pages.row_group.is_empty());
|
||||
// Hits the row group level page cache.
|
||||
return Ok(Box::new(RowGroupCachedReader::new(&cached_pages.row_group)));
|
||||
}
|
||||
|
||||
let page_reader = match &self.column_chunks[i] {
|
||||
None => {
|
||||
return Err(ParquetError::General(format!(
|
||||
"Invalid column index {i}, column was not fetched"
|
||||
)))
|
||||
}
|
||||
Some(data) => {
|
||||
let page_locations = self.page_locations.map(|index| index[i].clone());
|
||||
SerializedPageReader::new(
|
||||
data.clone(),
|
||||
self.metadata.column(i),
|
||||
self.row_count,
|
||||
page_locations,
|
||||
)?
|
||||
}
|
||||
};
|
||||
let page_reader = self.base.column_reader(i)?;
|
||||
|
||||
let Some(cache) = &self.cache_manager else {
|
||||
return Ok(Box::new(page_reader));
|
||||
};
|
||||
|
||||
let column = self.metadata.column(i);
|
||||
let column = self.base.metadata.column(i);
|
||||
if cache_uncompressed_pages(column) {
|
||||
// This column use row group level page cache.
|
||||
// We collect all pages and put them into the cache.
|
||||
@@ -362,7 +430,7 @@ fn cache_uncompressed_pages(column: &ColumnChunkMetaData) -> bool {
|
||||
|
||||
impl RowGroups for InMemoryRowGroup<'_> {
|
||||
fn num_rows(&self) -> usize {
|
||||
self.row_count
|
||||
self.base.row_count
|
||||
}
|
||||
|
||||
fn column_chunks(&self, i: usize) -> Result<Box<dyn PageIterator>> {
|
||||
@@ -430,8 +498,8 @@ impl ChunkReader for ColumnChunkData {
|
||||
}
|
||||
|
||||
/// Implements [`PageIterator`] for a single column chunk, yielding a single [`PageReader`]
|
||||
struct ColumnChunkIterator {
|
||||
reader: Option<Result<Box<dyn PageReader>>>,
|
||||
pub(crate) struct ColumnChunkIterator {
|
||||
pub(crate) reader: Option<Result<Box<dyn PageReader>>>,
|
||||
}
|
||||
|
||||
impl Iterator for ColumnChunkIterator {
|
||||
|
||||
@@ -14,7 +14,6 @@
|
||||
|
||||
//! Memtable test utilities.
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
use api::helper::ColumnDataTypeWrapper;
|
||||
@@ -34,8 +33,8 @@ use crate::error::Result;
|
||||
use crate::memtable::key_values::KeyValue;
|
||||
use crate::memtable::partition_tree::data::{timestamp_array_to_i64_slice, DataBatch, DataBuffer};
|
||||
use crate::memtable::{
|
||||
BoxedBatchIterator, BulkPart, KeyValues, Memtable, MemtableBuilder, MemtableId, MemtableRange,
|
||||
MemtableRanges, MemtableRef, MemtableStats,
|
||||
BoxedBatchIterator, BulkPart, KeyValues, Memtable, MemtableBuilder, MemtableId, MemtableRanges,
|
||||
MemtableRef, MemtableStats,
|
||||
};
|
||||
use crate::row_converter::{McmpRowCodec, RowCodec, SortField};
|
||||
|
||||
|
||||
@@ -17,7 +17,7 @@ futures.workspace = true
|
||||
lazy_static.workspace = true
|
||||
md5 = "0.7"
|
||||
moka = { workspace = true, features = ["future"] }
|
||||
opendal = { version = "0.50", features = [
|
||||
opendal = { git = "https://github.com/GreptimeTeam/opendal.git", rev = "c82605177f2feec83e49dcaa537c505639d94024", features = [
|
||||
"layers-tracing",
|
||||
"layers-prometheus",
|
||||
"services-azblob",
|
||||
|
||||
@@ -46,6 +46,7 @@ greptime-proto.workspace = true
|
||||
humantime.workspace = true
|
||||
itertools.workspace = true
|
||||
lazy_static.workspace = true
|
||||
log-query.workspace = true
|
||||
meter-core.workspace = true
|
||||
meter-macros.workspace = true
|
||||
object-store.workspace = true
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
#![feature(trait_upcasting)]
|
||||
#![feature(try_blocks)]
|
||||
#![feature(stmt_expr_attributes)]
|
||||
#![feature(iterator_try_collect)]
|
||||
|
||||
mod analyze;
|
||||
pub mod dataframe;
|
||||
@@ -25,6 +26,7 @@ pub mod dist_plan;
|
||||
pub mod dummy_catalog;
|
||||
pub mod error;
|
||||
pub mod executor;
|
||||
pub mod log_query;
|
||||
pub mod metrics;
|
||||
mod optimizer;
|
||||
pub mod parser;
|
||||
|
||||
src/query/src/log_query.rs (new file, 16 lines)
@@ -0,0 +1,16 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
pub mod error;
|
||||
pub mod planner;
|
||||
src/query/src/log_query/error.rs (new file, 84 lines)
@@ -0,0 +1,84 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::any::Any;
|
||||
|
||||
use common_error::ext::ErrorExt;
|
||||
use common_error::status_code::StatusCode;
|
||||
use common_macro::stack_trace_debug;
|
||||
use datafusion::error::DataFusionError;
|
||||
use snafu::{Location, Snafu};
|
||||
|
||||
#[derive(Snafu)]
|
||||
#[snafu(visibility(pub))]
|
||||
#[stack_trace_debug]
|
||||
pub enum Error {
|
||||
#[snafu(display("General catalog error"))]
|
||||
Catalog {
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
source: catalog::error::Error,
|
||||
},
|
||||
|
||||
#[snafu(display("Internal error during building DataFusion plan"))]
|
||||
DataFusionPlanning {
|
||||
#[snafu(source)]
|
||||
error: datafusion::error::DataFusionError,
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Unknown table type, downcast failed"))]
|
||||
UnknownTable {
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Cannot find time index column"))]
|
||||
TimeIndexNotFound {
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Unimplemented feature: {}", feature))]
|
||||
Unimplemented {
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
feature: String,
|
||||
},
|
||||
}
|
||||
|
||||
impl ErrorExt for Error {
|
||||
fn status_code(&self) -> StatusCode {
|
||||
use Error::*;
|
||||
match self {
|
||||
Catalog { source, .. } => source.status_code(),
|
||||
DataFusionPlanning { .. } => StatusCode::External,
|
||||
UnknownTable { .. } | TimeIndexNotFound { .. } => StatusCode::Internal,
|
||||
Unimplemented { .. } => StatusCode::Unsupported,
|
||||
}
|
||||
}
|
||||
|
||||
fn as_any(&self) -> &dyn Any {
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
pub type Result<T> = std::result::Result<T, Error>;
|
||||
|
||||
impl From<Error> for DataFusionError {
|
||||
fn from(err: Error) -> Self {
|
||||
DataFusionError::External(Box::new(err))
|
||||
}
|
||||
}
|
||||
src/query/src/log_query/planner.rs (new file, 371 lines)
@@ -0,0 +1,371 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use catalog::table_source::DfTableSourceProvider;
|
||||
use common_function::utils::escape_like_pattern;
|
||||
use datafusion::datasource::DefaultTableSource;
|
||||
use datafusion_common::ScalarValue;
|
||||
use datafusion_expr::utils::conjunction;
|
||||
use datafusion_expr::{col, lit, Expr, LogicalPlan, LogicalPlanBuilder};
|
||||
use datafusion_sql::TableReference;
|
||||
use datatypes::schema::Schema;
|
||||
use log_query::{ColumnFilters, LogQuery, TimeFilter};
|
||||
use snafu::{OptionExt, ResultExt};
|
||||
use table::table::adapter::DfTableProviderAdapter;
|
||||
|
||||
use crate::log_query::error::{
|
||||
CatalogSnafu, DataFusionPlanningSnafu, Result, TimeIndexNotFoundSnafu, UnimplementedSnafu,
|
||||
UnknownTableSnafu,
|
||||
};
|
||||
|
||||
const DEFAULT_LIMIT: usize = 1000;
|
||||
|
||||
pub struct LogQueryPlanner {
|
||||
table_provider: DfTableSourceProvider,
|
||||
}
|
||||
|
||||
impl LogQueryPlanner {
|
||||
pub fn new(table_provider: DfTableSourceProvider) -> Self {
|
||||
Self { table_provider }
|
||||
}
|
||||
|
||||
pub async fn query_to_plan(&mut self, query: LogQuery) -> Result<LogicalPlan> {
|
||||
// Resolve table
|
||||
let table_ref: TableReference = query.table.table_ref().into();
|
||||
let table_source = self
|
||||
.table_provider
|
||||
.resolve_table(table_ref.clone())
|
||||
.await
|
||||
.context(CatalogSnafu)?;
|
||||
let schema = table_source
|
||||
.as_any()
|
||||
.downcast_ref::<DefaultTableSource>()
|
||||
.context(UnknownTableSnafu)?
|
||||
.table_provider
|
||||
.as_any()
|
||||
.downcast_ref::<DfTableProviderAdapter>()
|
||||
.context(UnknownTableSnafu)?
|
||||
.table()
|
||||
.schema();
|
||||
|
||||
// Build the initial scan plan
|
||||
let mut plan_builder = LogicalPlanBuilder::scan(table_ref, table_source, None)
|
||||
.context(DataFusionPlanningSnafu)?;
|
||||
|
||||
// Collect filter expressions
|
||||
let mut filters = Vec::new();
|
||||
|
||||
// Time filter
|
||||
filters.push(self.build_time_filter(&query.time_filter, &schema)?);
|
||||
|
||||
// Column filters and projections
|
||||
let mut projected_columns = Vec::new();
|
||||
for column_filter in &query.columns {
|
||||
if let Some(expr) = self.build_column_filter(column_filter)? {
|
||||
filters.push(expr);
|
||||
}
|
||||
projected_columns.push(col(&column_filter.column_name));
|
||||
}
|
||||
|
||||
// Apply filters
|
||||
if !filters.is_empty() {
|
||||
let filter_expr = filters.into_iter().reduce(|a, b| a.and(b)).unwrap();
|
||||
plan_builder = plan_builder
|
||||
.filter(filter_expr)
|
||||
.context(DataFusionPlanningSnafu)?;
|
||||
}
|
||||
|
||||
// Apply projections
|
||||
plan_builder = plan_builder
|
||||
.project(projected_columns)
|
||||
.context(DataFusionPlanningSnafu)?;
|
||||
|
||||
// Apply limit
|
||||
plan_builder = plan_builder
|
||||
.limit(0, query.limit.or(Some(DEFAULT_LIMIT)))
|
||||
.context(DataFusionPlanningSnafu)?;
|
||||
|
||||
// Build the final plan
|
||||
let plan = plan_builder.build().context(DataFusionPlanningSnafu)?;
|
||||
|
||||
Ok(plan)
|
||||
}
|
||||
|
||||
fn build_time_filter(&self, time_filter: &TimeFilter, schema: &Schema) -> Result<Expr> {
|
||||
let timestamp_col = schema
|
||||
.timestamp_column()
|
||||
.with_context(|| TimeIndexNotFoundSnafu {})?
|
||||
.name
|
||||
.clone();
|
||||
|
||||
let start_time = ScalarValue::Utf8(time_filter.start.clone());
|
||||
let end_time = ScalarValue::Utf8(
|
||||
time_filter
|
||||
.end
|
||||
.clone()
|
||||
.or(Some("9999-12-31T23:59:59Z".to_string())),
|
||||
);
|
||||
let expr = col(timestamp_col.clone())
|
||||
.gt_eq(lit(start_time))
|
||||
.and(col(timestamp_col).lt_eq(lit(end_time)));
|
||||
|
||||
Ok(expr)
|
||||
}
|
||||
|
||||
/// Returns filter expressions
|
||||
fn build_column_filter(&self, column_filter: &ColumnFilters) -> Result<Option<Expr>> {
|
||||
if column_filter.filters.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let exprs = column_filter
|
||||
.filters
|
||||
.iter()
|
||||
.map(|filter| match filter {
|
||||
log_query::ContentFilter::Exact(pattern) => Ok(col(&column_filter.column_name)
|
||||
.like(lit(ScalarValue::Utf8(Some(escape_like_pattern(pattern)))))),
|
||||
log_query::ContentFilter::Prefix(pattern) => Ok(col(&column_filter.column_name)
|
||||
.like(lit(ScalarValue::Utf8(Some(format!(
|
||||
"{}%",
|
||||
escape_like_pattern(pattern)
|
||||
)))))),
|
||||
log_query::ContentFilter::Postfix(pattern) => Ok(col(&column_filter.column_name)
|
||||
.like(lit(ScalarValue::Utf8(Some(format!(
|
||||
"%{}",
|
||||
escape_like_pattern(pattern)
|
||||
)))))),
|
||||
log_query::ContentFilter::Contains(pattern) => Ok(col(&column_filter.column_name)
|
||||
.like(lit(ScalarValue::Utf8(Some(format!(
|
||||
"%{}%",
|
||||
escape_like_pattern(pattern)
|
||||
)))))),
|
||||
log_query::ContentFilter::Regex(..) => Err::<Expr, _>(
|
||||
UnimplementedSnafu {
|
||||
feature: "regex filter",
|
||||
}
|
||||
.build(),
|
||||
),
|
||||
log_query::ContentFilter::Compound(..) => Err::<Expr, _>(
|
||||
UnimplementedSnafu {
|
||||
feature: "compound filter",
|
||||
}
|
||||
.build(),
|
||||
),
|
||||
})
|
||||
.try_collect::<Vec<_>>()?;
|
||||
|
||||
Ok(conjunction(exprs))
|
||||
}
|
||||
}
|
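build_column_filter maps each content filter onto a LIKE pattern: Exact matches the escaped pattern as-is, Prefix appends %, Postfix prepends %, Contains wraps the pattern with % on both sides, and Regex/Compound are rejected as unimplemented. A standalone sketch of that mapping, with a local enum and escape helper standing in for log_query::ContentFilter and escape_like_pattern (the escape rules mirror what test_escape_pattern further down checks):

// Sketch, not part of the diff: content filter -> SQL LIKE pattern.
enum Filter {
    Exact(String),
    Prefix(String),
    Postfix(String),
    Contains(String),
}

/// Escape `%`, `_` and `\` so user input matches literally.
fn escape(pattern: &str) -> String {
    pattern
        .chars()
        .flat_map(|c| match c {
            '%' | '_' | '\\' => vec!['\\', c],
            _ => vec![c],
        })
        .collect()
}

fn like_pattern(filter: &Filter) -> String {
    match filter {
        Filter::Exact(p) => escape(p),
        Filter::Prefix(p) => format!("{}%", escape(p)),
        Filter::Postfix(p) => format!("%{}", escape(p)),
        Filter::Contains(p) => format!("%{}%", escape(p)),
    }
}

#[test]
fn contains_wraps_and_escapes() {
    assert_eq!(like_pattern(&Filter::Contains("er%ror".into())), "%er\\%ror%");
}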
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use catalog::memory::MemoryCatalogManager;
|
||||
use catalog::RegisterTableRequest;
|
||||
use common_catalog::consts::DEFAULT_CATALOG_NAME;
|
||||
use common_query::test_util::DummyDecoder;
|
||||
use datatypes::prelude::ConcreteDataType;
|
||||
use datatypes::schema::{ColumnSchema, SchemaRef};
|
||||
use log_query::{ContentFilter, Context};
|
||||
use session::context::QueryContext;
|
||||
use table::metadata::{TableInfoBuilder, TableMetaBuilder};
|
||||
use table::table_name::TableName;
|
||||
use table::test_util::EmptyTable;
|
||||
|
||||
use super::*;
|
||||
|
||||
fn mock_schema() -> SchemaRef {
|
||||
let columns = vec![
|
||||
ColumnSchema::new(
|
||||
"message".to_string(),
|
||||
ConcreteDataType::string_datatype(),
|
||||
false,
|
||||
),
|
||||
ColumnSchema::new(
|
||||
"timestamp".to_string(),
|
||||
ConcreteDataType::timestamp_millisecond_datatype(),
|
||||
false,
|
||||
)
|
||||
.with_time_index(true),
|
||||
ColumnSchema::new(
|
||||
"host".to_string(),
|
||||
ConcreteDataType::string_datatype(),
|
||||
true,
|
||||
),
|
||||
];
|
||||
|
||||
Arc::new(Schema::new(columns))
|
||||
}
|
||||
|
||||
/// Registers table under `greptime`, with `message` and `timestamp` and `host` columns.
|
||||
async fn build_test_table_provider(
|
||||
table_name_tuples: &[(String, String)],
|
||||
) -> DfTableSourceProvider {
|
||||
let catalog_list = MemoryCatalogManager::with_default_setup();
|
||||
for (schema_name, table_name) in table_name_tuples {
|
||||
let schema = mock_schema();
|
||||
let table_meta = TableMetaBuilder::default()
|
||||
.schema(schema)
|
||||
.primary_key_indices(vec![2])
|
||||
.value_indices(vec![0])
|
||||
.next_column_id(1024)
|
||||
.build()
|
||||
.unwrap();
|
||||
let table_info = TableInfoBuilder::default()
|
||||
.name(table_name.to_string())
|
||||
.meta(table_meta)
|
||||
.build()
|
||||
.unwrap();
|
||||
let table = EmptyTable::from_table_info(&table_info);
|
||||
|
||||
catalog_list
|
||||
.register_table_sync(RegisterTableRequest {
|
||||
catalog: DEFAULT_CATALOG_NAME.to_string(),
|
||||
schema: schema_name.to_string(),
|
||||
table_name: table_name.to_string(),
|
||||
table_id: 1024,
|
||||
table,
|
||||
})
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
DfTableSourceProvider::new(
|
||||
catalog_list,
|
||||
false,
|
||||
QueryContext::arc(),
|
||||
DummyDecoder::arc(),
|
||||
false,
|
||||
)
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_query_to_plan() {
|
||||
let table_provider =
|
||||
build_test_table_provider(&[("public".to_string(), "test_table".to_string())]).await;
|
||||
let mut planner = LogQueryPlanner::new(table_provider);
|
||||
|
||||
let log_query = LogQuery {
|
||||
table: TableName::new(DEFAULT_CATALOG_NAME, "public", "test_table"),
|
||||
time_filter: TimeFilter {
|
||||
start: Some("2021-01-01T00:00:00Z".to_string()),
|
||||
end: Some("2021-01-02T00:00:00Z".to_string()),
|
||||
span: None,
|
||||
},
|
||||
columns: vec![ColumnFilters {
|
||||
column_name: "message".to_string(),
|
||||
filters: vec![ContentFilter::Contains("error".to_string())],
|
||||
}],
|
||||
limit: Some(100),
|
||||
context: Context::None,
|
||||
};
|
||||
|
||||
let plan = planner.query_to_plan(log_query).await.unwrap();
|
||||
let expected = "Limit: skip=0, fetch=100 [message:Utf8]\
|
||||
\n Projection: greptime.public.test_table.message [message:Utf8]\
|
||||
\n Filter: greptime.public.test_table.timestamp >= Utf8(\"2021-01-01T00:00:00Z\") AND greptime.public.test_table.timestamp <= Utf8(\"2021-01-02T00:00:00Z\") AND greptime.public.test_table.message LIKE Utf8(\"%error%\") [message:Utf8, timestamp:Timestamp(Millisecond, None), host:Utf8;N]\
|
||||
\n TableScan: greptime.public.test_table [message:Utf8, timestamp:Timestamp(Millisecond, None), host:Utf8;N]";
|
||||
|
||||
assert_eq!(plan.display_indent_schema().to_string(), expected);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_build_time_filter() {
|
||||
let table_provider =
|
||||
build_test_table_provider(&[("public".to_string(), "test_table".to_string())]).await;
|
||||
let planner = LogQueryPlanner::new(table_provider);
|
||||
|
||||
let time_filter = TimeFilter {
|
||||
start: Some("2021-01-01T00:00:00Z".to_string()),
|
||||
end: Some("2021-01-02T00:00:00Z".to_string()),
|
||||
span: None,
|
||||
};
|
||||
|
||||
let expr = planner
|
||||
.build_time_filter(&time_filter, &mock_schema())
|
||||
.unwrap();
|
||||
|
||||
let expected_expr = col("timestamp")
|
||||
.gt_eq(lit(ScalarValue::Utf8(Some(
|
||||
"2021-01-01T00:00:00Z".to_string(),
|
||||
))))
|
||||
.and(col("timestamp").lt_eq(lit(ScalarValue::Utf8(Some(
|
||||
"2021-01-02T00:00:00Z".to_string(),
|
||||
)))));
|
||||
|
||||
assert_eq!(format!("{:?}", expr), format!("{:?}", expected_expr));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_build_time_filter_without_end() {
|
||||
let table_provider =
|
||||
build_test_table_provider(&[("public".to_string(), "test_table".to_string())]).await;
|
||||
let planner = LogQueryPlanner::new(table_provider);
|
||||
|
||||
let time_filter = TimeFilter {
|
||||
start: Some("2021-01-01T00:00:00Z".to_string()),
|
||||
end: None,
|
||||
span: None,
|
||||
};
|
||||
|
||||
let expr = planner
|
||||
.build_time_filter(&time_filter, &mock_schema())
|
||||
.unwrap();
|
||||
|
||||
let expected_expr = col("timestamp")
|
||||
.gt_eq(lit(ScalarValue::Utf8(Some(
|
||||
"2021-01-01T00:00:00Z".to_string(),
|
||||
))))
|
||||
.and(col("timestamp").lt_eq(lit(ScalarValue::Utf8(Some(
|
||||
"9999-12-31T23:59:59Z".to_string(),
|
||||
)))));
|
||||
|
||||
assert_eq!(format!("{:?}", expr), format!("{:?}", expected_expr));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_build_column_filter() {
|
||||
let table_provider =
|
||||
build_test_table_provider(&[("public".to_string(), "test_table".to_string())]).await;
|
||||
let planner = LogQueryPlanner::new(table_provider);
|
||||
|
||||
let column_filter = ColumnFilters {
|
||||
column_name: "message".to_string(),
|
||||
filters: vec![
|
||||
ContentFilter::Contains("error".to_string()),
|
||||
ContentFilter::Prefix("WARN".to_string()),
|
||||
],
|
||||
};
|
||||
|
||||
let expr_option = planner.build_column_filter(&column_filter).unwrap();
|
||||
assert!(expr_option.is_some());
|
||||
|
||||
let expr = expr_option.unwrap();
|
||||
|
||||
let expected_expr = col("message")
|
||||
.like(lit(ScalarValue::Utf8(Some("%error%".to_string()))))
|
||||
.and(col("message").like(lit(ScalarValue::Utf8(Some("WARN%".to_string())))));
|
||||
|
||||
assert_eq!(format!("{:?}", expr), format!("{:?}", expected_expr));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_escape_pattern() {
|
||||
assert_eq!(escape_like_pattern("test"), "test");
|
||||
assert_eq!(escape_like_pattern("te%st"), "te\\%st");
|
||||
assert_eq!(escape_like_pattern("te_st"), "te\\_st");
|
||||
assert_eq!(escape_like_pattern("te\\st"), "te\\\\st");
|
||||
}
|
||||
}
|
||||
@@ -24,6 +24,7 @@ use datafusion::execution::context::SessionState;
|
||||
use datafusion::sql::planner::PlannerContext;
|
||||
use datafusion_expr::{Expr as DfExpr, LogicalPlan};
|
||||
use datafusion_sql::planner::{ParserOptions, SqlToRel};
|
||||
use log_query::LogQuery;
|
||||
use promql_parser::parser::EvalStmt;
|
||||
use session::context::QueryContextRef;
|
||||
use snafu::ResultExt;
|
||||
@@ -31,6 +32,7 @@ use sql::ast::Expr as SqlExpr;
|
||||
use sql::statements::statement::Statement;
|
||||
|
||||
use crate::error::{DataFusionSnafu, PlanSqlSnafu, QueryPlanSnafu, Result, SqlSnafu};
|
||||
use crate::log_query::planner::LogQueryPlanner;
|
||||
use crate::parser::QueryStatement;
|
||||
use crate::promql::planner::PromPlanner;
|
||||
use crate::query_engine::{DefaultPlanDecoder, QueryEngineState};
|
||||
@@ -41,6 +43,12 @@ use crate::{DfContextProviderAdapter, QueryEngineContext};
|
||||
pub trait LogicalPlanner: Send + Sync {
|
||||
async fn plan(&self, stmt: &QueryStatement, query_ctx: QueryContextRef) -> Result<LogicalPlan>;
|
||||
|
||||
async fn plan_logs_query(
|
||||
&self,
|
||||
query: LogQuery,
|
||||
query_ctx: QueryContextRef,
|
||||
) -> Result<LogicalPlan>;
|
||||
|
||||
fn optimize(&self, plan: LogicalPlan) -> Result<LogicalPlan>;
|
||||
|
||||
fn as_any(&self) -> &dyn Any;
|
||||
@@ -182,6 +190,34 @@ impl LogicalPlanner for DfLogicalPlanner {
|
||||
}
|
||||
}
|
||||
|
||||
async fn plan_logs_query(
|
||||
&self,
|
||||
query: LogQuery,
|
||||
query_ctx: QueryContextRef,
|
||||
) -> Result<LogicalPlan> {
|
||||
let plan_decoder = Arc::new(DefaultPlanDecoder::new(
|
||||
self.session_state.clone(),
|
||||
&query_ctx,
|
||||
)?);
|
||||
let table_provider = DfTableSourceProvider::new(
|
||||
self.engine_state.catalog_manager().clone(),
|
||||
self.engine_state.disallow_cross_catalog_query(),
|
||||
query_ctx,
|
||||
plan_decoder,
|
||||
self.session_state
|
||||
.config_options()
|
||||
.sql_parser
|
||||
.enable_ident_normalization,
|
||||
);
|
||||
|
||||
let mut planner = LogQueryPlanner::new(table_provider);
|
||||
planner
|
||||
.query_to_plan(query)
|
||||
.await
|
||||
.map_err(BoxedError::new)
|
||||
.context(QueryPlanSnafu)
|
||||
}
|
||||
|
||||
fn optimize(&self, plan: LogicalPlan) -> Result<LogicalPlan> {
|
||||
self.optimize_logical_plan(plan)
|
||||
}
|
||||
|
||||
@@ -66,6 +66,7 @@ itertools.workspace = true
|
||||
json5 = "0.4"
|
||||
jsonb.workspace = true
|
||||
lazy_static.workspace = true
|
||||
log-query.workspace = true
|
||||
loki-api = "0.1"
|
||||
mime_guess = "2.0"
|
||||
notify.workspace = true
|
||||
|
||||
@@ -66,8 +66,8 @@ use crate::metrics_handler::MetricsHandler;
|
||||
use crate::prometheus_handler::PrometheusHandlerRef;
|
||||
use crate::query_handler::sql::ServerSqlQueryHandlerRef;
|
||||
use crate::query_handler::{
|
||||
InfluxdbLineProtocolHandlerRef, OpenTelemetryProtocolHandlerRef, OpentsdbProtocolHandlerRef,
|
||||
PipelineHandlerRef, PromStoreProtocolHandlerRef, ScriptHandlerRef,
|
||||
InfluxdbLineProtocolHandlerRef, LogQueryHandlerRef, OpenTelemetryProtocolHandlerRef,
|
||||
OpentsdbProtocolHandlerRef, PipelineHandlerRef, PromStoreProtocolHandlerRef, ScriptHandlerRef,
|
||||
};
|
||||
use crate::server::Server;
|
||||
|
||||
@@ -80,6 +80,7 @@ mod extractor;
|
||||
pub mod handler;
|
||||
pub mod header;
|
||||
pub mod influxdb;
|
||||
pub mod logs;
|
||||
pub mod mem_prof;
|
||||
pub mod opentsdb;
|
||||
pub mod otlp;
|
||||
@@ -506,6 +507,17 @@ impl HttpServerBuilder {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_logs_handler(self, logs_handler: LogQueryHandlerRef) -> Self {
|
||||
let logs_router = HttpServer::route_logs(logs_handler);
|
||||
|
||||
Self {
|
||||
router: self
|
||||
.router
|
||||
.nest(&format!("/{HTTP_API_VERSION}"), logs_router),
|
||||
..self
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_opentsdb_handler(self, handler: OpentsdbProtocolHandlerRef) -> Self {
|
||||
Self {
|
||||
router: self.router.nest(
|
||||
@@ -770,6 +782,12 @@ impl HttpServer {
|
||||
.with_state(api_state)
|
||||
}
|
||||
|
||||
fn route_logs<S>(log_handler: LogQueryHandlerRef) -> Router<S> {
|
||||
Router::new()
|
||||
.route("/logs", routing::get(logs::logs).post(logs::logs))
|
||||
.with_state(log_handler)
|
||||
}
|
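route_logs mounts the handler at /logs and with_logs_handler nests that router under the API version prefix, so the endpoint ends up at /{HTTP_API_VERSION}/logs for both GET and POST. A minimal sketch of the same composition with a bare axum router, assuming the version prefix resolves to v1:

// Sketch, not part of the diff: how nesting yields the final /v1/logs path.
use axum::routing::get;
use axum::Router;

async fn logs() -> &'static str {
    // Placeholder handler; the real one takes a JSON LogQuery body plus the query context.
    "ok"
}

fn app() -> Router {
    let logs_router = Router::new().route("/logs", get(logs).post(logs));
    // With HTTP_API_VERSION = "v1", requests land on GET/POST /v1/logs.
    Router::new().nest("/v1", logs_router)
}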
||||
|
||||
/// Route Prometheus [HTTP API].
|
||||
///
|
||||
/// [HTTP API]: https://prometheus.io/docs/prometheus/latest/querying/api/
|
||||
|
||||
src/servers/src/http/logs.rs (new file, 50 lines)
@@ -0,0 +1,50 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
|
||||
use axum::extract::State;
|
||||
use axum::response::{IntoResponse, Response};
|
||||
use axum::{Extension, Json};
|
||||
use common_telemetry::tracing;
|
||||
use log_query::LogQuery;
|
||||
use session::context::{Channel, QueryContext};
|
||||
|
||||
use crate::http::result::greptime_result_v1::GreptimedbV1Response;
|
||||
use crate::query_handler::LogQueryHandlerRef;
|
||||
|
||||
#[axum_macros::debug_handler]
|
||||
#[tracing::instrument(skip_all, fields(protocol = "http", request_type = "logs"))]
|
||||
pub async fn logs(
|
||||
State(handler): State<LogQueryHandlerRef>,
|
||||
Extension(mut query_ctx): Extension<QueryContext>,
|
||||
Json(params): Json<LogQuery>,
|
||||
) -> Response {
|
||||
let exec_start = Instant::now();
|
||||
let db = query_ctx.get_db_string();
|
||||
|
||||
query_ctx.set_channel(Channel::Http);
|
||||
let query_ctx = Arc::new(query_ctx);
|
||||
|
||||
let _timer = crate::metrics::METRIC_HTTP_LOGS_INGESTION_ELAPSED
|
||||
.with_label_values(&[db.as_str()])
|
||||
.start_timer();
|
||||
|
||||
let output = handler.query(params, query_ctx).await;
|
||||
let resp = GreptimedbV1Response::from_output(vec![output]).await;
|
||||
|
||||
resp.with_execution_time(exec_start.elapsed().as_millis() as u64)
|
||||
.into_response()
|
||||
}
|
||||
@@ -22,6 +22,7 @@ use async_trait::async_trait;
|
||||
use common_error::ext::ErrorExt;
|
||||
use common_query::Output;
|
||||
use datafusion_expr::LogicalPlan;
|
||||
use log_query::LogQuery;
|
||||
use query::parser::PromQuery;
|
||||
use serde_json::Value;
|
||||
use session::context::QueryContextRef;
|
||||
@@ -458,3 +459,54 @@ where
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// LogQueryInterceptor can track life cycle of a log query request
|
||||
/// and customize or abort its execution at given point.
|
||||
pub trait LogQueryInterceptor {
|
||||
type Error: ErrorExt;
|
||||
|
||||
/// Called before query is actually executed.
|
||||
fn pre_query(&self, _query: &LogQuery, _query_ctx: QueryContextRef) -> Result<(), Self::Error> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Called after execution finished. The implementation can modify the
|
||||
/// output if needed.
|
||||
fn post_query(
|
||||
&self,
|
||||
output: Output,
|
||||
_query_ctx: QueryContextRef,
|
||||
) -> Result<Output, Self::Error> {
|
||||
Ok(output)
|
||||
}
|
||||
}
|
||||
|
||||
pub type LogQueryInterceptorRef<E> =
|
||||
Arc<dyn LogQueryInterceptor<Error = E> + Send + Sync + 'static>;
|
||||
|
||||
impl<E> LogQueryInterceptor for Option<&LogQueryInterceptorRef<E>>
|
||||
where
|
||||
E: ErrorExt,
|
||||
{
|
||||
type Error = E;
|
||||
|
||||
fn pre_query(&self, query: &LogQuery, query_ctx: QueryContextRef) -> Result<(), Self::Error> {
|
||||
if let Some(this) = self {
|
||||
this.pre_query(query, query_ctx)
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn post_query(
|
||||
&self,
|
||||
output: Output,
|
||||
query_ctx: QueryContextRef,
|
||||
) -> Result<Output, Self::Error> {
|
||||
if let Some(this) = self {
|
||||
this.post_query(output, query_ctx)
|
||||
} else {
|
||||
Ok(output)
|
||||
}
|
||||
}
|
||||
}
|
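Both LogQueryInterceptor hooks ship default implementations, so an implementor only overrides the stage it cares about, and the Option-wrapped blanket impl above keeps call sites free of None checks. A hedged sketch of a pass-through interceptor that only records the incoming query; the crate::error::Error associated type and the Debug-printable table field on LogQuery are assumptions, not taken from this change:

// Sketch, not part of the diff: a minimal interceptor that only inspects the query.
struct AuditInterceptor;

impl LogQueryInterceptor for AuditInterceptor {
    // Assumed error type; anything implementing ErrorExt works.
    type Error = crate::error::Error;

    fn pre_query(&self, query: &LogQuery, _ctx: QueryContextRef) -> Result<(), Self::Error> {
        // Assumes LogQuery exposes its target table; adjust to the real field or getter.
        common_telemetry::debug!("log query received for table {:?}", query.table);
        Ok(())
    }

    // post_query falls back to the default pass-through implementation.
}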
||||
|
||||
@@ -72,6 +72,14 @@ lazy_static! {
|
||||
vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0, 300.0]
|
||||
)
|
||||
.unwrap();
|
||||
/// Http logs query duration per database.
|
||||
pub static ref METRIC_HTTP_LOGS_ELAPSED: HistogramVec = register_histogram_vec!(
|
||||
"greptime_servers_http_logs_elapsed",
|
||||
"servers http logs elapsed",
|
||||
&[METRIC_DB_LABEL],
|
||||
vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0, 300.0]
|
||||
)
|
||||
.unwrap();
|
||||
pub static ref METRIC_AUTH_FAILURE: IntCounterVec = register_int_counter_vec!(
|
||||
"greptime_servers_auth_failure_count",
|
||||
"servers auth failure count",
|
||||
|
||||
@@ -34,6 +34,7 @@ use api::v1::RowInsertRequests;
|
||||
use async_trait::async_trait;
|
||||
use common_query::Output;
|
||||
use headers::HeaderValue;
|
||||
use log_query::LogQuery;
|
||||
use opentelemetry_proto::tonic::collector::logs::v1::ExportLogsServiceRequest;
|
||||
use opentelemetry_proto::tonic::collector::metrics::v1::ExportMetricsServiceRequest;
|
||||
use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest;
|
||||
@@ -52,6 +53,7 @@ pub type PromStoreProtocolHandlerRef = Arc<dyn PromStoreProtocolHandler + Send +
|
||||
pub type OpenTelemetryProtocolHandlerRef = Arc<dyn OpenTelemetryProtocolHandler + Send + Sync>;
|
||||
pub type ScriptHandlerRef = Arc<dyn ScriptHandler + Send + Sync>;
|
||||
pub type PipelineHandlerRef = Arc<dyn PipelineHandler + Send + Sync>;
|
||||
pub type LogQueryHandlerRef = Arc<dyn LogQueryHandler + Send + Sync>;
|
||||
|
||||
#[async_trait]
|
||||
pub trait ScriptHandler {
|
||||
@@ -174,3 +176,9 @@ pub trait PipelineHandler {
|
||||
//// Build a pipeline from a string.
|
||||
fn build_pipeline(&self, pipeline: &str) -> Result<Pipeline<GreptimeTransformer>>;
|
||||
}
|
||||
|
||||
/// Handle log query requests.
|
||||
#[async_trait]
|
||||
pub trait LogQueryHandler {
|
||||
async fn query(&self, query: LogQuery, ctx: QueryContextRef) -> Result<Output>;
|
||||
}
|
||||
|
||||