Mirror of https://github.com/GreptimeTeam/greptimedb.git, synced 2026-01-05 12:52:57 +00:00

Compare commits: v0.12.0-ni ... chore/debu

15 Commits

| Author | SHA1 | Date |
|---|---|---|
|  | 1e37847f48 |  |
|  | 2b89970d45 |  |
|  | 53d006292d |  |
|  | d18c8b5e16 |  |
|  | e0949c4a11 |  |
|  | 5cf931c417 |  |
|  | cc5b1d42b0 |  |
|  | 55b7656956 |  |
|  | 75e4f307c9 |  |
|  | 89f2e15ffb |  |
|  | 13ed10556a |  |
|  | d1108ab581 |  |
|  | 1287d4cb9f |  |
|  | 109fe04d17 |  |
|  | f1eb76f489 |  |
4  .github/actions/build-images/action.yml (vendored)

@@ -41,8 +41,8 @@ runs:
image-name: ${{ inputs.image-name }}
image-tag: ${{ inputs.version }}
docker-file: docker/ci/ubuntu/Dockerfile
amd64-artifact-name: greptime-linux-amd64-pyo3-${{ inputs.version }}
arm64-artifact-name: greptime-linux-arm64-pyo3-${{ inputs.version }}
amd64-artifact-name: greptime-linux-amd64-${{ inputs.version }}
arm64-artifact-name: greptime-linux-arm64-${{ inputs.version }}
platforms: linux/amd64,linux/arm64
push-latest-tag: ${{ inputs.push-latest-tag }}
14  .github/actions/build-linux-artifacts/action.yml (vendored)
@@ -48,19 +48,7 @@ runs:
|
||||
path: /tmp/greptime-*.log
|
||||
retention-days: 3
|
||||
|
||||
- name: Build standard greptime
|
||||
uses: ./.github/actions/build-greptime-binary
|
||||
with:
|
||||
base-image: ubuntu
|
||||
features: pyo3_backend,servers/dashboard
|
||||
cargo-profile: ${{ inputs.cargo-profile }}
|
||||
artifacts-dir: greptime-linux-${{ inputs.arch }}-pyo3-${{ inputs.version }}
|
||||
version: ${{ inputs.version }}
|
||||
working-dir: ${{ inputs.working-dir }}
|
||||
image-registry: ${{ inputs.image-registry }}
|
||||
image-namespace: ${{ inputs.image-namespace }}
|
||||
|
||||
- name: Build greptime without pyo3
|
||||
- name: Build greptime
|
||||
if: ${{ inputs.dev-mode == 'false' }}
|
||||
uses: ./.github/actions/build-greptime-binary
|
||||
with:
|
||||
|
||||
@@ -33,15 +33,6 @@ runs:
|
||||
- name: Rust Cache
|
||||
uses: Swatinem/rust-cache@v2
|
||||
|
||||
- name: Install Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
|
||||
- name: Install PyArrow Package
|
||||
shell: pwsh
|
||||
run: pip install pyarrow numpy
|
||||
|
||||
- name: Install WSL distribution
|
||||
uses: Vampire/setup-wsl@v2
|
||||
with:
|
||||
|
||||
34  .github/workflows/develop.yml (vendored)
@@ -10,17 +10,6 @@ on:
|
||||
- 'docker/**'
|
||||
- '.gitignore'
|
||||
- 'grafana/**'
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
paths-ignore:
|
||||
- 'docs/**'
|
||||
- 'config/**'
|
||||
- '**.md'
|
||||
- '.dockerignore'
|
||||
- 'docker/**'
|
||||
- '.gitignore'
|
||||
- 'grafana/**'
|
||||
workflow_dispatch:
|
||||
|
||||
name: CI
|
||||
@@ -84,7 +73,7 @@ jobs:
|
||||
# Shares across multiple jobs
|
||||
shared-key: "check-toml"
|
||||
- name: Install taplo
|
||||
run: cargo +stable install taplo-cli --version ^0.9 --locked
|
||||
run: cargo +stable install taplo-cli --version ^0.9 --locked --force
|
||||
- name: Run taplo
|
||||
run: taplo format --check
|
||||
|
||||
@@ -107,7 +96,7 @@ jobs:
|
||||
shared-key: "build-binaries"
|
||||
- name: Install cargo-gc-bin
|
||||
shell: bash
|
||||
run: cargo install cargo-gc-bin
|
||||
run: cargo install cargo-gc-bin --force
|
||||
- name: Build greptime binaries
|
||||
shell: bash
|
||||
# `cargo gc` will invoke `cargo build` with specified args
|
||||
@@ -163,7 +152,7 @@ jobs:
|
||||
run: |
|
||||
sudo apt-get install -y libfuzzer-14-dev
|
||||
rustup install nightly
|
||||
cargo +nightly install cargo-fuzz cargo-gc-bin
|
||||
cargo +nightly install cargo-fuzz cargo-gc-bin --force
|
||||
- name: Download pre-built binaries
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
@@ -220,7 +209,7 @@ jobs:
|
||||
shell: bash
|
||||
run: |
|
||||
sudo apt update && sudo apt install -y libfuzzer-14-dev
|
||||
cargo install cargo-fuzz cargo-gc-bin
|
||||
cargo install cargo-fuzz cargo-gc-bin --force
|
||||
- name: Download pre-built binary
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
@@ -268,7 +257,7 @@ jobs:
|
||||
shared-key: "build-greptime-ci"
|
||||
- name: Install cargo-gc-bin
|
||||
shell: bash
|
||||
run: cargo install cargo-gc-bin
|
||||
run: cargo install cargo-gc-bin --force
|
||||
- name: Build greptime binary
|
||||
shell: bash
|
||||
# `cargo gc` will invoke `cargo build` with specified args
|
||||
@@ -338,7 +327,7 @@ jobs:
|
||||
run: |
|
||||
sudo apt-get install -y libfuzzer-14-dev
|
||||
rustup install nightly
|
||||
cargo +nightly install cargo-fuzz cargo-gc-bin
|
||||
cargo +nightly install cargo-fuzz cargo-gc-bin --force
|
||||
# Downloads ci image
|
||||
- name: Download pre-built binary
|
||||
uses: actions/download-artifact@v4
|
||||
@@ -487,7 +476,7 @@ jobs:
|
||||
run: |
|
||||
sudo apt-get install -y libfuzzer-14-dev
|
||||
rustup install nightly
|
||||
cargo +nightly install cargo-fuzz cargo-gc-bin
|
||||
cargo +nightly install cargo-fuzz cargo-gc-bin --force
|
||||
# Downloads ci image
|
||||
- name: Download pre-built binary
|
||||
uses: actions/download-artifact@v4
|
||||
@@ -653,6 +642,7 @@ jobs:
|
||||
if: github.event.pull_request.draft == false
|
||||
runs-on: ubuntu-20.04-8-cores
|
||||
timeout-minutes: 60
|
||||
needs: [clippy, fmt]
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: arduino/setup-protoc@v3
|
||||
@@ -678,12 +668,6 @@ jobs:
|
||||
uses: taiki-e/install-action@nextest
|
||||
- name: Install cargo-llvm-cov
|
||||
uses: taiki-e/install-action@cargo-llvm-cov
|
||||
- name: Install Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.10'
|
||||
- name: Install PyArrow Package
|
||||
run: pip install pyarrow numpy
|
||||
- name: Setup etcd server
|
||||
working-directory: tests-integration/fixtures/etcd
|
||||
run: docker compose -f docker-compose-standalone.yml up -d --wait
|
||||
@@ -697,7 +681,7 @@ jobs:
|
||||
working-directory: tests-integration/fixtures/postgres
|
||||
run: docker compose -f docker-compose-standalone.yml up -d --wait
|
||||
- name: Run nextest cases
|
||||
run: cargo llvm-cov nextest --workspace --lcov --output-path lcov.info -F pyo3_backend -F dashboard -F pg_kvbackend
|
||||
run: cargo llvm-cov nextest --workspace --lcov --output-path lcov.info -F dashboard -F pg_kvbackend
|
||||
env:
|
||||
CARGO_BUILD_RUSTFLAGS: "-C link-arg=-fuse-ld=lld"
|
||||
RUST_BACKTRACE: 1
|
||||
|
||||
10  .github/workflows/nightly-ci.yml (vendored)
@@ -1,6 +1,6 @@
|
||||
on:
|
||||
schedule:
|
||||
- cron: "0 23 * * 1-5"
|
||||
- cron: "0 23 * * 1-4"
|
||||
workflow_dispatch:
|
||||
|
||||
name: Nightly CI
|
||||
@@ -91,18 +91,12 @@ jobs:
|
||||
uses: Swatinem/rust-cache@v2
|
||||
- name: Install Cargo Nextest
|
||||
uses: taiki-e/install-action@nextest
|
||||
- name: Install Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
- name: Install PyArrow Package
|
||||
run: pip install pyarrow numpy
|
||||
- name: Install WSL distribution
|
||||
uses: Vampire/setup-wsl@v2
|
||||
with:
|
||||
distribution: Ubuntu-22.04
|
||||
- name: Running tests
|
||||
run: cargo nextest run -F pyo3_backend,dashboard
|
||||
run: cargo nextest run -F dashboard
|
||||
env:
|
||||
CARGO_BUILD_RUSTFLAGS: "-C linker=lld-link"
|
||||
RUST_BACKTRACE: 1
|
||||
|
||||
12  .github/workflows/release.yml (vendored)
@@ -222,18 +222,10 @@ jobs:
|
||||
arch: aarch64-apple-darwin
|
||||
features: servers/dashboard
|
||||
artifacts-dir-prefix: greptime-darwin-arm64
|
||||
- os: ${{ needs.allocate-runners.outputs.macos-runner }}
|
||||
arch: aarch64-apple-darwin
|
||||
features: pyo3_backend,servers/dashboard
|
||||
artifacts-dir-prefix: greptime-darwin-arm64-pyo3
|
||||
- os: ${{ needs.allocate-runners.outputs.macos-runner }}
|
||||
features: servers/dashboard
|
||||
arch: x86_64-apple-darwin
|
||||
artifacts-dir-prefix: greptime-darwin-amd64
|
||||
- os: ${{ needs.allocate-runners.outputs.macos-runner }}
|
||||
features: pyo3_backend,servers/dashboard
|
||||
arch: x86_64-apple-darwin
|
||||
artifacts-dir-prefix: greptime-darwin-amd64-pyo3
|
||||
runs-on: ${{ matrix.os }}
|
||||
outputs:
|
||||
build-macos-result: ${{ steps.set-build-macos-result.outputs.build-macos-result }}
|
||||
@@ -271,10 +263,6 @@ jobs:
|
||||
arch: x86_64-pc-windows-msvc
|
||||
features: servers/dashboard
|
||||
artifacts-dir-prefix: greptime-windows-amd64
|
||||
- os: ${{ needs.allocate-runners.outputs.windows-runner }}
|
||||
arch: x86_64-pc-windows-msvc
|
||||
features: pyo3_backend,servers/dashboard
|
||||
artifacts-dir-prefix: greptime-windows-amd64-pyo3
|
||||
runs-on: ${{ matrix.os }}
|
||||
outputs:
|
||||
build-windows-result: ${{ steps.set-build-windows-result.outputs.build-windows-result }}
|
||||
|
||||
31  Cargo.lock (generated)
@@ -2351,6 +2351,8 @@ dependencies = [
|
||||
"snafu 0.8.5",
|
||||
"tempfile",
|
||||
"tokio",
|
||||
"tokio-metrics",
|
||||
"tokio-metrics-collector",
|
||||
"tokio-test",
|
||||
"tokio-util",
|
||||
]
|
||||
@@ -12619,9 +12621,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
|
||||
|
||||
[[package]]
|
||||
name = "tokio"
|
||||
version = "1.40.0"
|
||||
version = "1.42.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e2b070231665d27ad9ec9b8df639893f46727666c6767db40317fbe920a5d998"
|
||||
checksum = "5cec9b21b0450273377fc97bd4c33a8acffc8c996c987a7c5b319a0083707551"
|
||||
dependencies = [
|
||||
"backtrace",
|
||||
"bytes",
|
||||
@@ -12657,6 +12659,31 @@ dependencies = [
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-metrics"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eace09241d62c98b7eeb1107d4c5c64ca3bd7da92e8c218c153ab3a78f9be112"
|
||||
dependencies = [
|
||||
"futures-util",
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-metrics-collector"
|
||||
version = "0.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a8092b7a97ed5dac2f44892db190eca8f476ede0fa585bc87664de4151cd0b64"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
"parking_lot 0.12.3",
|
||||
"prometheus",
|
||||
"tokio",
|
||||
"tokio-metrics",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.12"
|
||||
|
||||
@@ -13,8 +13,6 @@ RUN yum install -y epel-release \
|
||||
openssl \
|
||||
openssl-devel \
|
||||
centos-release-scl \
|
||||
rh-python38 \
|
||||
rh-python38-python-devel \
|
||||
which
|
||||
|
||||
# Install protoc
|
||||
@@ -43,8 +41,6 @@ RUN yum install -y epel-release \
|
||||
openssl \
|
||||
openssl-devel \
|
||||
centos-release-scl \
|
||||
rh-python38 \
|
||||
rh-python38-python-devel \
|
||||
which
|
||||
|
||||
WORKDIR /greptime
|
||||
|
||||
@@ -20,10 +20,7 @@ RUN --mount=type=cache,target=/var/cache/apt \
|
||||
curl \
|
||||
git \
|
||||
build-essential \
|
||||
pkg-config \
|
||||
python3.10 \
|
||||
python3.10-dev \
|
||||
python3-pip
|
||||
pkg-config
|
||||
|
||||
# Install Rust.
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
@@ -46,15 +43,8 @@ ARG OUTPUT_DIR
|
||||
|
||||
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get \
|
||||
-y install ca-certificates \
|
||||
python3.10 \
|
||||
python3.10-dev \
|
||||
python3-pip \
|
||||
curl
|
||||
|
||||
COPY ./docker/python/requirements.txt /etc/greptime/requirements.txt
|
||||
|
||||
RUN python3 -m pip install -r /etc/greptime/requirements.txt
|
||||
|
||||
WORKDIR /greptime
|
||||
COPY --from=builder /out/target/${OUTPUT_DIR}/greptime /greptime/bin/
|
||||
ENV PATH /greptime/bin/:$PATH
|
||||
|
||||
@@ -7,9 +7,7 @@ RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
|
||||
RUN yum install -y epel-release \
|
||||
openssl \
|
||||
openssl-devel \
|
||||
centos-release-scl \
|
||||
rh-python38 \
|
||||
rh-python38-python-devel
|
||||
centos-release-scl
|
||||
|
||||
ARG TARGETARCH
|
||||
|
||||
|
||||
@@ -8,15 +8,8 @@ ARG TARGET_BIN=greptime
|
||||
|
||||
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
|
||||
ca-certificates \
|
||||
python3.10 \
|
||||
python3.10-dev \
|
||||
python3-pip \
|
||||
curl
|
||||
|
||||
COPY $DOCKER_BUILD_ROOT/docker/python/requirements.txt /etc/greptime/requirements.txt
|
||||
|
||||
RUN python3 -m pip install -r /etc/greptime/requirements.txt
|
||||
|
||||
ARG TARGETARCH
|
||||
|
||||
ADD $TARGETARCH/$TARGET_BIN /greptime/bin/
|
||||
|
||||
@@ -20,3 +20,31 @@ Sample at 49 Hertz, for 10 seconds, output report in text format.

```bash
curl -X POST -s '0:4000/debug/prof/cpu?seconds=10&frequency=49&output=text' > /tmp/pprof.txt
```

## Using `perf`

First find the pid of GreptimeDB:
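A quick way to do that, assuming the server process is simply named `greptime`, is `pgrep`:

```bash
# Hypothetical lookup; adjust the pattern if the binary has a different name.
pgrep greptime
```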
Use `perf record` to profile GreptimeDB at a sampling frequency of 99 Hz for a duration of 60 seconds:

```bash
perf record -p <pid> --call-graph dwarf -F 99 -- sleep 60
```

The result will be saved to the file `perf.data`.

Then convert it to readable text:

```bash
perf script --no-inline > perf.out
```

Produce a flame graph out of it:

```bash
git clone https://github.com/brendangregg/FlameGraph

FlameGraph/stackcollapse-perf.pl perf.out > perf.folded

FlameGraph/flamegraph.pl perf.folded > perf.svg
```

@@ -18,7 +18,7 @@ use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::Duration;
|
||||
|
||||
use common_error::ext::BoxedError;
|
||||
use common_error::ext::{BoxedError, ErrorExt};
|
||||
use common_meta::cache_invalidator::KvCacheInvalidator;
|
||||
use common_meta::error::Error::CacheNotGet;
|
||||
use common_meta::error::{CacheNotGetSnafu, Error, ExternalSnafu, GetKvCacheSnafu, Result};
|
||||
@@ -37,6 +37,7 @@ use snafu::{OptionExt, ResultExt};
|
||||
|
||||
use crate::metrics::{
|
||||
METRIC_CATALOG_KV_BATCH_GET, METRIC_CATALOG_KV_GET, METRIC_CATALOG_KV_REMOTE_GET,
|
||||
METRIC_META_CLIENT_GET,
|
||||
};
|
||||
|
||||
const DEFAULT_CACHE_MAX_CAPACITY: u64 = 10000;
|
||||
@@ -292,7 +293,7 @@ impl KvBackend for CachedKvBackend {
|
||||
}
|
||||
.map_err(|e| {
|
||||
GetKvCacheSnafu {
|
||||
err_msg: e.to_string(),
|
||||
err_msg: e.output_msg(),
|
||||
}
|
||||
.build()
|
||||
});
|
||||
@@ -445,6 +446,8 @@ impl KvBackend for MetaKvBackend {
|
||||
}
|
||||
|
||||
async fn get(&self, key: &[u8]) -> Result<Option<KeyValue>> {
|
||||
let _timer = METRIC_META_CLIENT_GET.start_timer();
|
||||
|
||||
let mut response = self
|
||||
.client
|
||||
.range(RangeRequest::new().with_key(key))
|
||||
|
||||
@@ -34,4 +34,6 @@ lazy_static! {
|
||||
register_histogram!("greptime_catalog_kv_get", "catalog kv get").unwrap();
|
||||
pub static ref METRIC_CATALOG_KV_BATCH_GET: Histogram =
|
||||
register_histogram!("greptime_catalog_kv_batch_get", "catalog kv batch get").unwrap();
|
||||
pub static ref METRIC_META_CLIENT_GET: Histogram =
|
||||
register_histogram!("greptime_meta_client_get", "meta client get").unwrap();
|
||||
}
|
||||
|
||||
@@ -15,7 +15,7 @@ cache.workspace = true
|
||||
catalog.workspace = true
|
||||
chrono.workspace = true
|
||||
clap.workspace = true
|
||||
client.workspace = true
|
||||
client = { workspace = true, features = ["testing"] }
|
||||
common-base.workspace = true
|
||||
common-catalog.workspace = true
|
||||
common-config.workspace = true
|
||||
@@ -56,7 +56,6 @@ tokio.workspace = true
|
||||
tracing-appender.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
client = { workspace = true, features = ["testing"] }
|
||||
common-test-util.workspace = true
|
||||
common-version.workspace = true
|
||||
serde.workspace = true
|
||||
|
||||
@@ -62,6 +62,13 @@ impl Instance {
|
||||
pub fn datanode(&self) -> &Datanode {
|
||||
&self.datanode
|
||||
}
|
||||
|
||||
/// Get mutable Datanode instance for changing some internal state, before starting it.
|
||||
// Useful for wrapping Datanode instance. Please do not remove this method even if you find
|
||||
// nowhere it is called.
|
||||
pub fn datanode_mut(&mut self) -> &mut Datanode {
|
||||
&mut self.datanode
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
|
||||
@@ -20,6 +20,7 @@ mod scalar_add;
|
||||
mod scalar_mul;
|
||||
mod sub;
|
||||
pub(crate) mod sum;
|
||||
mod vector_div;
|
||||
mod vector_mul;
|
||||
|
||||
use std::sync::Arc;
|
||||
@@ -45,6 +46,7 @@ impl VectorFunction {
|
||||
|
||||
// vector calculation
|
||||
registry.register(Arc::new(vector_mul::VectorMulFunction));
|
||||
registry.register(Arc::new(vector_div::VectorDivFunction));
|
||||
registry.register(Arc::new(sub::SubFunction));
|
||||
registry.register(Arc::new(elem_sum::ElemSumFunction));
|
||||
}
|
||||
|
||||
218  src/common/function/src/scalars/vector/vector_div.rs (Normal file)
@@ -0,0 +1,218 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::borrow::Cow;
|
||||
use std::fmt::Display;
|
||||
|
||||
use common_query::error::{InvalidFuncArgsSnafu, Result};
|
||||
use common_query::prelude::Signature;
|
||||
use datatypes::prelude::ConcreteDataType;
|
||||
use datatypes::scalars::ScalarVectorBuilder;
|
||||
use datatypes::vectors::{BinaryVectorBuilder, MutableVector, VectorRef};
|
||||
use nalgebra::DVectorView;
|
||||
use snafu::ensure;
|
||||
|
||||
use crate::function::{Function, FunctionContext};
|
||||
use crate::helper;
|
||||
use crate::scalars::vector::impl_conv::{as_veclit, as_veclit_if_const, veclit_to_binlit};
|
||||
|
||||
const NAME: &str = "vec_div";
|
||||
|
||||
/// Divides corresponding elements of two vectors.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```sql
|
||||
/// SELECT vec_to_string(vec_div("[2, 4, 6]", "[2, 2, 2]")) as result;
|
||||
///
|
||||
/// +---------+
|
||||
/// | result |
|
||||
/// +---------+
|
||||
/// | [1,2,3] |
|
||||
/// +---------+
|
||||
///
|
||||
/// ```
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct VectorDivFunction;
|
||||
|
||||
impl Function for VectorDivFunction {
|
||||
fn name(&self) -> &str {
|
||||
NAME
|
||||
}
|
||||
|
||||
fn return_type(&self, _input_types: &[ConcreteDataType]) -> Result<ConcreteDataType> {
|
||||
Ok(ConcreteDataType::binary_datatype())
|
||||
}
|
||||
|
||||
fn signature(&self) -> Signature {
|
||||
helper::one_of_sigs2(
|
||||
vec![
|
||||
ConcreteDataType::string_datatype(),
|
||||
ConcreteDataType::binary_datatype(),
|
||||
],
|
||||
vec![
|
||||
ConcreteDataType::string_datatype(),
|
||||
ConcreteDataType::binary_datatype(),
|
||||
],
|
||||
)
|
||||
}
|
||||
|
||||
fn eval(&self, _func_ctx: FunctionContext, columns: &[VectorRef]) -> Result<VectorRef> {
|
||||
ensure!(
|
||||
columns.len() == 2,
|
||||
InvalidFuncArgsSnafu {
|
||||
err_msg: format!(
|
||||
"The length of the args is not correct, expect exactly two, have: {}",
|
||||
columns.len()
|
||||
),
|
||||
}
|
||||
);
|
||||
|
||||
let arg0 = &columns[0];
|
||||
let arg1 = &columns[1];
|
||||
|
||||
let len = arg0.len();
|
||||
let mut result = BinaryVectorBuilder::with_capacity(len);
|
||||
if len == 0 {
|
||||
return Ok(result.to_vector());
|
||||
}
|
||||
|
||||
let arg0_const = as_veclit_if_const(arg0)?;
|
||||
let arg1_const = as_veclit_if_const(arg1)?;
|
||||
|
||||
for i in 0..len {
|
||||
let arg0 = match arg0_const.as_ref() {
|
||||
Some(arg0) => Some(Cow::Borrowed(arg0.as_ref())),
|
||||
None => as_veclit(arg0.get_ref(i))?,
|
||||
};
|
||||
|
||||
let arg1 = match arg1_const.as_ref() {
|
||||
Some(arg1) => Some(Cow::Borrowed(arg1.as_ref())),
|
||||
None => as_veclit(arg1.get_ref(i))?,
|
||||
};
|
||||
|
||||
if let (Some(arg0), Some(arg1)) = (arg0, arg1) {
|
||||
ensure!(
|
||||
arg0.len() == arg1.len(),
|
||||
InvalidFuncArgsSnafu {
|
||||
err_msg: format!(
|
||||
"The length of the vectors must match for division, have: {} vs {}",
|
||||
arg0.len(),
|
||||
arg1.len()
|
||||
),
|
||||
}
|
||||
);
|
||||
let vec0 = DVectorView::from_slice(&arg0, arg0.len());
|
||||
let vec1 = DVectorView::from_slice(&arg1, arg1.len());
|
||||
let vec_res = vec0.component_div(&vec1);
|
||||
|
||||
let veclit = vec_res.as_slice();
|
||||
let binlit = veclit_to_binlit(veclit);
|
||||
result.push(Some(&binlit));
|
||||
} else {
|
||||
result.push_null();
|
||||
}
|
||||
}
|
||||
|
||||
Ok(result.to_vector())
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for VectorDivFunction {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", NAME.to_ascii_uppercase())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_query::error;
|
||||
use datatypes::vectors::StringVector;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_vector_div() {
|
||||
let func = VectorDivFunction;
|
||||
|
||||
let vec0 = vec![1.0, 2.0, 3.0];
|
||||
let vec1 = vec![1.0, 1.0];
|
||||
let (len0, len1) = (vec0.len(), vec1.len());
|
||||
let input0 = Arc::new(StringVector::from(vec![Some(format!("{vec0:?}"))]));
|
||||
let input1 = Arc::new(StringVector::from(vec![Some(format!("{vec1:?}"))]));
|
||||
|
||||
let err = func
|
||||
.eval(FunctionContext::default(), &[input0, input1])
|
||||
.unwrap_err();
|
||||
|
||||
match err {
|
||||
error::Error::InvalidFuncArgs { err_msg, .. } => {
|
||||
assert_eq!(
|
||||
err_msg,
|
||||
format!(
|
||||
"The length of the vectors must match for division, have: {} vs {}",
|
||||
len0, len1
|
||||
)
|
||||
)
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
|
||||
let input0 = Arc::new(StringVector::from(vec![
|
||||
Some("[1.0,2.0,3.0]".to_string()),
|
||||
Some("[8.0,10.0,12.0]".to_string()),
|
||||
Some("[7.0,8.0,9.0]".to_string()),
|
||||
None,
|
||||
]));
|
||||
|
||||
let input1 = Arc::new(StringVector::from(vec![
|
||||
Some("[1.0,1.0,1.0]".to_string()),
|
||||
Some("[2.0,2.0,2.0]".to_string()),
|
||||
None,
|
||||
Some("[3.0,3.0,3.0]".to_string()),
|
||||
]));
|
||||
|
||||
let result = func
|
||||
.eval(FunctionContext::default(), &[input0, input1])
|
||||
.unwrap();
|
||||
|
||||
let result = result.as_ref();
|
||||
assert_eq!(result.len(), 4);
|
||||
assert_eq!(
|
||||
result.get_ref(0).as_binary().unwrap(),
|
||||
Some(veclit_to_binlit(&[1.0, 2.0, 3.0]).as_slice())
|
||||
);
|
||||
assert_eq!(
|
||||
result.get_ref(1).as_binary().unwrap(),
|
||||
Some(veclit_to_binlit(&[4.0, 5.0, 6.0]).as_slice())
|
||||
);
|
||||
assert!(result.get_ref(2).is_null());
|
||||
assert!(result.get_ref(3).is_null());
|
||||
|
||||
let input0 = Arc::new(StringVector::from(vec![Some("[1.0,-2.0]".to_string())]));
|
||||
let input1 = Arc::new(StringVector::from(vec![Some("[0.0,0.0]".to_string())]));
|
||||
|
||||
let result = func
|
||||
.eval(FunctionContext::default(), &[input0, input1])
|
||||
.unwrap();
|
||||
|
||||
let result = result.as_ref();
|
||||
assert_eq!(
|
||||
result.get_ref(0).as_binary().unwrap(),
|
||||
Some(veclit_to_binlit(&[f64::INFINITY as f32, f64::NEG_INFINITY as f32]).as_slice())
|
||||
);
|
||||
}
|
||||
}
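As the final assertion in the test above shows, element-wise division by zero follows IEEE-754 float semantics and yields ±infinity rather than an error; a hedged SQL sketch of the same case (the exact textual rendering of the infinities may differ):

```sql
-- Mirrors the [1.0, -2.0] / [0.0, 0.0] case from the unit test.
SELECT vec_to_string(vec_div("[1.0, -2.0]", "[0.0, 0.0]"));
```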
|
||||
@@ -28,13 +28,10 @@ pub type SchemaMetadataManagerRef = Arc<SchemaMetadataManager>;
|
||||
pub struct SchemaMetadataManager {
|
||||
table_id_schema_cache: TableSchemaCacheRef,
|
||||
schema_cache: SchemaCacheRef,
|
||||
#[cfg(any(test, feature = "testing"))]
|
||||
kv_backend: crate::kv_backend::KvBackendRef,
|
||||
}
|
||||
|
||||
impl SchemaMetadataManager {
|
||||
/// Creates a new database meta
|
||||
#[cfg(not(any(test, feature = "testing")))]
|
||||
pub fn new(table_id_schema_cache: TableSchemaCacheRef, schema_cache: SchemaCacheRef) -> Self {
|
||||
Self {
|
||||
table_id_schema_cache,
|
||||
@@ -42,20 +39,6 @@ impl SchemaMetadataManager {
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new database meta
|
||||
#[cfg(any(test, feature = "testing"))]
|
||||
pub fn new(
|
||||
kv_backend: crate::kv_backend::KvBackendRef,
|
||||
table_id_schema_cache: TableSchemaCacheRef,
|
||||
schema_cache: SchemaCacheRef,
|
||||
) -> Self {
|
||||
Self {
|
||||
table_id_schema_cache,
|
||||
schema_cache,
|
||||
kv_backend,
|
||||
}
|
||||
}
|
||||
|
||||
/// Gets schema options by table id.
|
||||
pub async fn get_schema_options_by_table_id(
|
||||
&self,
|
||||
@@ -80,6 +63,7 @@ impl SchemaMetadataManager {
|
||||
schema_name: &str,
|
||||
catalog_name: &str,
|
||||
schema_value: Option<crate::key::schema_name::SchemaNameValue>,
|
||||
kv_backend: crate::kv_backend::KvBackendRef,
|
||||
) {
|
||||
use table::metadata::{RawTableInfo, TableType};
|
||||
let value = crate::key::table_info::TableInfoValue::new(RawTableInfo {
|
||||
@@ -91,19 +75,18 @@ impl SchemaMetadataManager {
|
||||
meta: Default::default(),
|
||||
table_type: TableType::Base,
|
||||
});
|
||||
let table_info_manager =
|
||||
crate::key::table_info::TableInfoManager::new(self.kv_backend.clone());
|
||||
let table_info_manager = crate::key::table_info::TableInfoManager::new(kv_backend.clone());
|
||||
let (txn, _) = table_info_manager
|
||||
.build_create_txn(table_id, &value)
|
||||
.unwrap();
|
||||
let resp = self.kv_backend.txn(txn).await.unwrap();
|
||||
let resp = kv_backend.txn(txn).await.unwrap();
|
||||
assert!(resp.succeeded, "Failed to create table metadata");
|
||||
let key = crate::key::schema_name::SchemaNameKey {
|
||||
catalog: catalog_name,
|
||||
schema: schema_name,
|
||||
};
|
||||
|
||||
crate::key::schema_name::SchemaManager::new(self.kv_backend.clone())
|
||||
crate::key::schema_name::SchemaManager::new(kv_backend.clone())
|
||||
.create(key, schema_value, false)
|
||||
.await
|
||||
.expect("Failed to create schema metadata");
|
||||
|
||||
@@ -29,6 +29,7 @@ use crate::error::{self, Error, InvalidMetadataSnafu, ParseOptionSnafu, Result};
|
||||
use crate::key::{MetadataKey, SCHEMA_NAME_KEY_PATTERN, SCHEMA_NAME_KEY_PREFIX};
|
||||
use crate::kv_backend::txn::Txn;
|
||||
use crate::kv_backend::KvBackendRef;
|
||||
use crate::metrics::METRIC_META_SCHEMA_INFO_GET;
|
||||
use crate::range_stream::{PaginationStream, DEFAULT_PAGE_SIZE};
|
||||
use crate::rpc::store::RangeRequest;
|
||||
use crate::rpc::KeyValue;
|
||||
@@ -209,6 +210,8 @@ impl SchemaManager {
|
||||
&self,
|
||||
schema: SchemaNameKey<'_>,
|
||||
) -> Result<Option<DeserializedValueWithBytes<SchemaNameValue>>> {
|
||||
let _timer = METRIC_META_SCHEMA_INFO_GET.start_timer();
|
||||
|
||||
let raw_key = schema.to_bytes();
|
||||
self.kv_backend
|
||||
.get(&raw_key)
|
||||
|
||||
@@ -29,6 +29,7 @@ use crate::key::txn_helper::TxnOpGetResponseSet;
|
||||
use crate::key::{DeserializedValueWithBytes, MetadataKey, MetadataValue, TABLE_INFO_KEY_PREFIX};
|
||||
use crate::kv_backend::txn::Txn;
|
||||
use crate::kv_backend::KvBackendRef;
|
||||
use crate::metrics::METRIC_META_TABLE_INFO_GET;
|
||||
use crate::rpc::store::BatchGetRequest;
|
||||
|
||||
/// The key stores the metadata of the table.
|
||||
@@ -194,6 +195,8 @@ impl TableInfoManager {
|
||||
&self,
|
||||
table_id: TableId,
|
||||
) -> Result<Option<DeserializedValueWithBytes<TableInfoValue>>> {
|
||||
let _timer = METRIC_META_TABLE_INFO_GET.start_timer();
|
||||
|
||||
let key = TableInfoKey::new(table_id);
|
||||
let raw_key = key.to_bytes();
|
||||
self.kv_backend
|
||||
|
||||
@@ -108,4 +108,9 @@ lazy_static! {
|
||||
&["name"]
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
pub static ref METRIC_META_TABLE_INFO_GET: Histogram =
|
||||
register_histogram!("greptime_meta_table_info_get", "get table info from kvbackend").unwrap();
|
||||
pub static ref METRIC_META_SCHEMA_INFO_GET: Histogram =
|
||||
register_histogram!("greptime_meta_schema_info_get", "get schema info from kvbackend").unwrap();
|
||||
}
|
||||
|
||||
@@ -39,3 +39,7 @@ tokio-util.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
tokio-test = "0.4"
|
||||
|
||||
[target.'cfg(tokio_unstable)'.dependencies]
|
||||
tokio-metrics = { version = "0.3" }
|
||||
tokio-metrics-collector = { version = "0.2" }
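The two dependencies above are gated on the `tokio_unstable` cfg, which is not a Cargo feature but a rustc cfg that is typically passed through `RUSTFLAGS`. A minimal sketch of a build that actually pulls them in, assuming an otherwise standard cargo invocation:

```bash
# tokio_unstable exposes the tokio runtime/task metrics that tokio-metrics relies on.
RUSTFLAGS="--cfg tokio_unstable" cargo build
```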
|
||||
|
||||
@@ -224,7 +224,6 @@ impl DatanodeBuilder {
|
||||
cache_registry.get().context(MissingCacheSnafu)?;
|
||||
|
||||
let schema_metadata_manager = Arc::new(SchemaMetadataManager::new(
|
||||
kv_backend.clone(),
|
||||
table_id_schema_cache,
|
||||
schema_cache,
|
||||
));
|
||||
|
||||
@@ -336,7 +336,7 @@ impl FlownodeContext {
|
||||
let (known_table_name, schema) = srv_map.get_table_name_schema(&table_id).await?;
|
||||
table_name = table_name.or(Some(known_table_name));
|
||||
self.schema.insert(global_id, schema);
|
||||
} // if we don't have table id, it means database havn't assign one yet or we don't need it
|
||||
} // if we don't have table id, it means database haven't assign one yet or we don't need it
|
||||
|
||||
// still update the mapping with new global id
|
||||
self.table_repr.insert(table_name, table_id, global_id);
|
||||
|
||||
@@ -62,7 +62,7 @@ impl TableSource {
|
||||
.map(|id| id.table_id())
|
||||
}
|
||||
|
||||
/// If the table havn't been created in database, the tableId returned would be null
|
||||
/// If the table haven't been created in database, the tableId returned would be null
|
||||
pub async fn get_table_id_from_name(&self, name: &TableName) -> Result<Option<TableId>, Error> {
|
||||
let ret = self
|
||||
.table_name_manager
|
||||
|
||||
@@ -492,7 +492,7 @@ impl ScalarUDFImpl for TumbleExpand {
|
||||
if let Some(start_time) = opt{
|
||||
if !matches!(start_time, Utf8 | Date32 | Date64 | Timestamp(_, _)){
|
||||
return Err(DataFusionError::Plan(
|
||||
format!("Expect start_time to either be date, timestampe or string, found {:?}", start_time)
|
||||
format!("Expect start_time to either be date, timestamp or string, found {:?}", start_time)
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -57,7 +57,7 @@ pub const BROADCAST_CAP: usize = 1024;
|
||||
/// The maximum capacity of the send buffer, to prevent the buffer from growing too large
|
||||
pub const SEND_BUF_CAP: usize = BROADCAST_CAP * 2;
|
||||
|
||||
/// Flow worker will try to at least accumulate this many rows before processing them(if one second havn't passed)
|
||||
/// Flow worker will try to at least accumulate this many rows before processing them(if one second haven't passed)
|
||||
pub const BATCH_SIZE: usize = 32 * 16384;
|
||||
|
||||
/// Convert a value that is or can be converted to Datetime to internal timestamp
|
||||
|
||||
@@ -30,8 +30,8 @@ pub struct LogQuery {
|
||||
pub time_filter: TimeFilter,
|
||||
/// Columns with filters to query.
|
||||
pub columns: Vec<ColumnFilters>,
|
||||
/// Maximum number of logs to return. If not provided, it will return all matched logs.
|
||||
pub limit: Option<usize>,
|
||||
/// Controls row skipping and fetch count for logs.
|
||||
pub limit: Limit,
|
||||
/// Adjacent lines to return.
|
||||
pub context: Context,
|
||||
}
|
||||
@@ -42,7 +42,7 @@ impl Default for LogQuery {
|
||||
table: TableName::new("", "", ""),
|
||||
time_filter: Default::default(),
|
||||
columns: vec![],
|
||||
limit: None,
|
||||
limit: Limit::default(),
|
||||
context: Default::default(),
|
||||
}
|
||||
}
|
||||
@@ -266,6 +266,15 @@ pub enum Context {
|
||||
Seconds(usize, usize),
|
||||
}
|
||||
|
||||
/// Represents limit and offset parameters for query pagination.
|
||||
#[derive(Debug, Default, Serialize, Deserialize)]
|
||||
pub struct Limit {
|
||||
/// Optional number of items to skip before starting to return results
|
||||
pub skip: Option<usize>,
|
||||
/// Optional number of items to return after skipping
|
||||
pub fetch: Option<usize>,
|
||||
}
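Since both fields are optional and the struct derives `Default`, callers only set what they need; a hypothetical construction of such a limit (the field names are exactly those defined above):

```rust
// Skip the first 20 matching log lines and return at most 100 after that.
// `Limit::default()` leaves both fields unset: skip nothing, no cap on fetch.
let limit = Limit { skip: Some(20), fetch: Some(100) };
```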
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
@@ -48,6 +48,10 @@ use tonic::transport::server::{Router, TcpIncoming};
|
||||
|
||||
use crate::election::etcd::EtcdElection;
|
||||
#[cfg(feature = "pg_kvbackend")]
|
||||
use crate::election::postgres::PgElection;
|
||||
#[cfg(feature = "pg_kvbackend")]
|
||||
use crate::election::CANDIDATE_LEASE_SECS;
|
||||
#[cfg(feature = "pg_kvbackend")]
|
||||
use crate::error::InvalidArgumentsSnafu;
|
||||
use crate::error::{InitExportMetricsTaskSnafu, TomlFormatSnafu};
|
||||
use crate::metasrv::builder::MetasrvBuilder;
|
||||
@@ -229,7 +233,15 @@ pub async fn metasrv_builder(
|
||||
let kv_backend = PgStore::with_pg_client(pg_client)
|
||||
.await
|
||||
.context(error::KvBackendSnafu)?;
|
||||
(kv_backend, None)
|
||||
let election_client = create_postgres_client(opts).await?;
|
||||
let election = PgElection::with_pg_client(
|
||||
opts.server_addr.clone(),
|
||||
election_client,
|
||||
opts.store_key_prefix.clone(),
|
||||
CANDIDATE_LEASE_SECS,
|
||||
)
|
||||
.await?;
|
||||
(kv_backend, Some(election))
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -19,7 +19,9 @@ pub mod postgres;
|
||||
use std::fmt::{self, Debug};
|
||||
use std::sync::Arc;
|
||||
|
||||
use tokio::sync::broadcast::Receiver;
|
||||
use common_telemetry::{info, warn};
|
||||
use tokio::sync::broadcast::error::RecvError;
|
||||
use tokio::sync::broadcast::{self, Receiver, Sender};
|
||||
|
||||
use crate::error::Result;
|
||||
use crate::metasrv::MetasrvNodeInfo;
|
||||
@@ -75,6 +77,37 @@ impl fmt::Display for LeaderChangeMessage {
|
||||
}
|
||||
}
|
||||
|
||||
fn listen_leader_change(leader_value: String) -> Sender<LeaderChangeMessage> {
|
||||
let (tx, mut rx) = broadcast::channel(100);
|
||||
let _handle = common_runtime::spawn_global(async move {
|
||||
loop {
|
||||
match rx.recv().await {
|
||||
Ok(msg) => match msg {
|
||||
LeaderChangeMessage::Elected(key) => {
|
||||
info!(
|
||||
"[{leader_value}] is elected as leader: {:?}, lease: {}",
|
||||
String::from_utf8_lossy(key.name()),
|
||||
key.lease_id()
|
||||
);
|
||||
}
|
||||
LeaderChangeMessage::StepDown(key) => {
|
||||
warn!(
|
||||
"[{leader_value}] is stepping down: {:?}, lease: {}",
|
||||
String::from_utf8_lossy(key.name()),
|
||||
key.lease_id()
|
||||
);
|
||||
}
|
||||
},
|
||||
Err(RecvError::Lagged(_)) => {
|
||||
warn!("Log printing is too slow or leader changed too fast!");
|
||||
}
|
||||
Err(RecvError::Closed) => break,
|
||||
}
|
||||
}
|
||||
});
|
||||
tx
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
pub trait Election: Send + Sync {
|
||||
type Leader;
|
||||
|
||||
@@ -23,13 +23,12 @@ use etcd_client::{
|
||||
};
|
||||
use snafu::{ensure, OptionExt, ResultExt};
|
||||
use tokio::sync::broadcast;
|
||||
use tokio::sync::broadcast::error::RecvError;
|
||||
use tokio::sync::broadcast::Receiver;
|
||||
use tokio::time::{timeout, MissedTickBehavior};
|
||||
|
||||
use crate::election::{
|
||||
Election, LeaderChangeMessage, LeaderKey, CANDIDATES_ROOT, CANDIDATE_LEASE_SECS, ELECTION_KEY,
|
||||
KEEP_ALIVE_INTERVAL_SECS,
|
||||
listen_leader_change, Election, LeaderChangeMessage, LeaderKey, CANDIDATES_ROOT,
|
||||
CANDIDATE_LEASE_SECS, ELECTION_KEY, KEEP_ALIVE_INTERVAL_SECS,
|
||||
};
|
||||
use crate::error;
|
||||
use crate::error::Result;
|
||||
@@ -88,36 +87,7 @@ impl EtcdElection {
|
||||
E: AsRef<str>,
|
||||
{
|
||||
let leader_value: String = leader_value.as_ref().into();
|
||||
|
||||
let leader_ident = leader_value.clone();
|
||||
let (tx, mut rx) = broadcast::channel(100);
|
||||
let _handle = common_runtime::spawn_global(async move {
|
||||
loop {
|
||||
match rx.recv().await {
|
||||
Ok(msg) => match msg {
|
||||
LeaderChangeMessage::Elected(key) => {
|
||||
info!(
|
||||
"[{leader_ident}] is elected as leader: {:?}, lease: {}",
|
||||
String::from_utf8_lossy(key.name()),
|
||||
key.lease_id()
|
||||
);
|
||||
}
|
||||
LeaderChangeMessage::StepDown(key) => {
|
||||
warn!(
|
||||
"[{leader_ident}] is stepping down: {:?}, lease: {}",
|
||||
String::from_utf8_lossy(key.name()),
|
||||
key.lease_id()
|
||||
);
|
||||
}
|
||||
},
|
||||
Err(RecvError::Lagged(_)) => {
|
||||
warn!("Log printing is too slow or leader changed too fast!");
|
||||
}
|
||||
Err(RecvError::Closed) => break,
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
let tx = listen_leader_change(leader_value.clone());
|
||||
Ok(Arc::new(Self {
|
||||
leader_value,
|
||||
client,
|
||||
|
||||
@@ -16,18 +16,32 @@ use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use common_meta::distributed_time_constants::{META_KEEP_ALIVE_INTERVAL_SECS, META_LEASE_SECS};
|
||||
use common_telemetry::{error, warn};
|
||||
use common_time::Timestamp;
|
||||
use itertools::Itertools;
|
||||
use snafu::{ensure, OptionExt, ResultExt};
|
||||
use tokio::sync::broadcast;
|
||||
use tokio::time::MissedTickBehavior;
|
||||
use tokio_postgres::Client;
|
||||
|
||||
use crate::election::{Election, LeaderChangeMessage, CANDIDATES_ROOT, ELECTION_KEY};
|
||||
use crate::election::{
|
||||
listen_leader_change, Election, LeaderChangeMessage, LeaderKey, CANDIDATES_ROOT, ELECTION_KEY,
|
||||
};
|
||||
use crate::error::{
|
||||
DeserializeFromJsonSnafu, PostgresExecutionSnafu, Result, SerializeToJsonSnafu, UnexpectedSnafu,
|
||||
DeserializeFromJsonSnafu, NoLeaderSnafu, PostgresExecutionSnafu, Result, SerializeToJsonSnafu,
|
||||
UnexpectedSnafu,
|
||||
};
|
||||
use crate::metasrv::{ElectionRef, LeaderValue, MetasrvNodeInfo};
|
||||
|
||||
// TODO(CookiePie): The lock id should be configurable.
|
||||
const CAMPAIGN: &str = "SELECT pg_try_advisory_lock(28319)";
|
||||
const STEP_DOWN: &str = "SELECT pg_advisory_unlock(28319)";
|
||||
const SET_IDLE_SESSION_TIMEOUT: &str = "SET idle_in_transaction_session_timeout = $1";
|
||||
// Currently the session timeout is longer than the leader lease time, so the leader lease may expire while the session is still alive.
|
||||
// Either the leader reconnects and step down or the session expires and the lock is released.
|
||||
const IDLE_SESSION_TIMEOUT: &str = "10s";
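For reference, the two advisory-lock statements above behave as follows when issued directly against PostgreSQL; the lock id 28319 matches the constants, the rest is just a psql sketch:

```sql
-- Session-level advisory lock: returns true for exactly one session at a time,
-- false for everyone else until the holder releases it or its session ends.
SELECT pg_try_advisory_lock(28319);

-- Returns true if this session actually held the lock, false otherwise.
SELECT pg_advisory_unlock(28319);
```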
|
||||
|
||||
// Separator between value and expire time.
|
||||
const LEASE_SEP: &str = r#"||__metadata_lease_sep||"#;
|
||||
|
||||
@@ -81,8 +95,33 @@ fn parse_value_and_expire_time(value: &str) -> Result<(String, Timestamp)> {
|
||||
Ok((value.to_string(), expire_time))
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default)]
|
||||
struct PgLeaderKey {
|
||||
name: Vec<u8>,
|
||||
key: Vec<u8>,
|
||||
rev: i64,
|
||||
lease: i64,
|
||||
}
|
||||
|
||||
impl LeaderKey for PgLeaderKey {
|
||||
fn name(&self) -> &[u8] {
|
||||
&self.name
|
||||
}
|
||||
|
||||
fn key(&self) -> &[u8] {
|
||||
&self.key
|
||||
}
|
||||
|
||||
fn revision(&self) -> i64 {
|
||||
self.rev
|
||||
}
|
||||
|
||||
fn lease_id(&self) -> i64 {
|
||||
self.lease
|
||||
}
|
||||
}
|
||||
|
||||
/// PostgreSql implementation of Election.
|
||||
/// TODO(CookiePie): Currently only support candidate registration. Add election logic.
|
||||
pub struct PgElection {
|
||||
leader_value: String,
|
||||
client: Client,
|
||||
@@ -100,7 +139,13 @@ impl PgElection {
|
||||
store_key_prefix: String,
|
||||
candidate_lease_ttl_secs: u64,
|
||||
) -> Result<ElectionRef> {
|
||||
let (tx, _) = broadcast::channel(100);
|
||||
// Set idle session timeout to IDLE_SESSION_TIMEOUT to avoid dead advisory lock.
|
||||
client
|
||||
.execute(SET_IDLE_SESSION_TIMEOUT, &[&IDLE_SESSION_TIMEOUT])
|
||||
.await
|
||||
.context(PostgresExecutionSnafu)?;
|
||||
|
||||
let tx = listen_leader_change(leader_value.clone());
|
||||
Ok(Arc::new(Self {
|
||||
leader_value,
|
||||
client,
|
||||
@@ -112,7 +157,7 @@ impl PgElection {
|
||||
}))
|
||||
}
|
||||
|
||||
fn _election_key(&self) -> String {
|
||||
fn election_key(&self) -> String {
|
||||
format!("{}{}", self.store_key_prefix, ELECTION_KEY)
|
||||
}
|
||||
|
||||
@@ -146,11 +191,14 @@ impl Election for PgElection {
|
||||
serde_json::to_string(node_info).with_context(|_| SerializeToJsonSnafu {
|
||||
input: format!("{node_info:?}"),
|
||||
})?;
|
||||
let res = self.put_value_with_lease(&key, &node_info).await?;
|
||||
let res = self
|
||||
.put_value_with_lease(&key, &node_info, self.candidate_lease_ttl_secs)
|
||||
.await?;
|
||||
// May registered before, just update the lease.
|
||||
if !res {
|
||||
self.delete_value(&key).await?;
|
||||
self.put_value_with_lease(&key, &node_info).await?;
|
||||
self.put_value_with_lease(&key, &node_info, self.candidate_lease_ttl_secs)
|
||||
.await?;
|
||||
}
|
||||
|
||||
// Check if the current lease has expired and renew the lease.
|
||||
@@ -197,12 +245,65 @@ impl Election for PgElection {
|
||||
Ok(valid_candidates)
|
||||
}
|
||||
|
||||
/// Attempts to acquire leadership by executing a campaign. This function continuously checks
|
||||
/// if the current instance can become the leader by acquiring an advisory lock in the PostgreSQL database.
|
||||
///
|
||||
/// The function operates in a loop, where it:
|
||||
///
|
||||
/// 1. Waits for a predefined interval before attempting to acquire the lock again.
|
||||
/// 2. Executes the `CAMPAIGN` SQL query to try to acquire the advisory lock.
|
||||
/// 3. Checks the result of the query:
|
||||
/// - If the lock is successfully acquired (result is true), it calls the `leader_action` method
|
||||
/// to perform actions as the leader.
|
||||
/// - If the lock is not acquired (result is false), it calls the `follower_action` method
|
||||
/// to perform actions as a follower.
|
||||
async fn campaign(&self) -> Result<()> {
|
||||
todo!()
|
||||
let mut keep_alive_interval =
|
||||
tokio::time::interval(Duration::from_secs(META_KEEP_ALIVE_INTERVAL_SECS));
|
||||
keep_alive_interval.set_missed_tick_behavior(MissedTickBehavior::Delay);
|
||||
|
||||
loop {
|
||||
let res = self
|
||||
.client
|
||||
.query(CAMPAIGN, &[])
|
||||
.await
|
||||
.context(PostgresExecutionSnafu)?;
|
||||
if let Some(row) = res.first() {
|
||||
match row.try_get(0) {
|
||||
Ok(true) => self.leader_action().await?,
|
||||
Ok(false) => self.follower_action().await?,
|
||||
Err(_) => {
|
||||
return UnexpectedSnafu {
|
||||
violated: "Failed to get the result of acquiring advisory lock"
|
||||
.to_string(),
|
||||
}
|
||||
.fail();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
return UnexpectedSnafu {
|
||||
violated: "Failed to get the result of acquiring advisory lock".to_string(),
|
||||
}
|
||||
.fail();
|
||||
}
|
||||
let _ = keep_alive_interval.tick().await;
|
||||
}
|
||||
}
|
||||
|
||||
async fn leader(&self) -> Result<Self::Leader> {
|
||||
todo!()
|
||||
if self.is_leader.load(Ordering::Relaxed) {
|
||||
Ok(self.leader_value.as_bytes().into())
|
||||
} else {
|
||||
let key = self.election_key();
|
||||
if let Some((leader, expire_time, current, _)) =
|
||||
self.get_value_with_lease(&key, false).await?
|
||||
{
|
||||
ensure!(expire_time > current, NoLeaderSnafu);
|
||||
Ok(leader.as_bytes().into())
|
||||
} else {
|
||||
NoLeaderSnafu.fail()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn resign(&self) -> Result<()> {
|
||||
@@ -315,17 +416,17 @@ impl PgElection {
|
||||
}
|
||||
|
||||
/// Returns `true` if the insertion is successful
|
||||
async fn put_value_with_lease(&self, key: &str, value: &str) -> Result<bool> {
|
||||
async fn put_value_with_lease(
|
||||
&self,
|
||||
key: &str,
|
||||
value: &str,
|
||||
lease_ttl_secs: u64,
|
||||
) -> Result<bool> {
|
||||
let res = self
|
||||
.client
|
||||
.query(
|
||||
PUT_IF_NOT_EXISTS_WITH_EXPIRE_TIME,
|
||||
&[
|
||||
&key,
|
||||
&value,
|
||||
&LEASE_SEP,
|
||||
&(self.candidate_lease_ttl_secs as f64),
|
||||
],
|
||||
&[&key, &value, &LEASE_SEP, &(lease_ttl_secs as f64)],
|
||||
)
|
||||
.await
|
||||
.context(PostgresExecutionSnafu)?;
|
||||
@@ -343,6 +444,177 @@ impl PgElection {
|
||||
|
||||
Ok(res.len() == 1)
|
||||
}
|
||||
|
||||
/// Handles the actions of a leader in the election process.
|
||||
///
|
||||
/// This function performs the following checks and actions:
|
||||
///
|
||||
/// - **Case 1**: If the current instance believes it is the leader from the previous term,
|
||||
/// it attempts to renew the lease. It checks if the lease is still valid and either renews it
|
||||
/// or steps down if it has expired.
|
||||
///
|
||||
/// - **Case 1.1**: If the instance is still the leader and the lease is valid, it renews the lease
|
||||
/// by updating the value associated with the election key.
|
||||
/// - **Case 1.2**: If the instance is still the leader but the lease has expired, it logs a warning
|
||||
/// and steps down, initiating a new campaign for leadership.
|
||||
/// - **Case 1.3**: If the instance is not the leader (which is a rare scenario), it logs a warning
|
||||
/// indicating that it still holds the lock and steps down to re-initiate the campaign. This may
|
||||
/// happen if the leader has failed to renew the lease and the session has expired, and recovery
|
||||
/// after a period of time during which other leaders have been elected and stepped down.
|
||||
/// - **Case 1.4**: If no lease information is found, it also steps down and re-initiates the campaign.
|
||||
///
|
||||
/// - **Case 2**: If the current instance is not leader previously, it calls the
|
||||
/// `elected` method as a newly elected leader.
|
||||
async fn leader_action(&self) -> Result<()> {
|
||||
let key = self.election_key();
|
||||
// Case 1
|
||||
if self.is_leader() {
|
||||
match self.get_value_with_lease(&key, true).await? {
|
||||
Some((prev_leader, expire_time, current, prev)) => {
|
||||
match (prev_leader == self.leader_value, expire_time > current) {
|
||||
// Case 1.1
|
||||
(true, true) => {
|
||||
// Safety: prev is Some since we are using `get_value_with_lease` with `true`.
|
||||
let prev = prev.unwrap();
|
||||
self.update_value_with_lease(&key, &prev, &self.leader_value)
|
||||
.await?;
|
||||
}
|
||||
// Case 1.2
|
||||
(true, false) => {
|
||||
warn!("Leader lease expired, now stepping down.");
|
||||
self.step_down().await?;
|
||||
}
|
||||
// Case 1.3
|
||||
(false, _) => {
|
||||
warn!("Leader lease not found, but still hold the lock. Now stepping down.");
|
||||
self.step_down().await?;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Case 1.4
|
||||
None => {
|
||||
warn!("Leader lease not found, but still hold the lock. Now stepping down.");
|
||||
self.step_down().await?;
|
||||
}
|
||||
}
|
||||
// Case 2
|
||||
} else {
|
||||
self.elected().await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Handles the actions of a follower in the election process.
|
||||
///
|
||||
/// This function performs the following checks and actions:
|
||||
///
|
||||
/// - **Case 1**: If the current instance believes it is the leader from the previous term,
|
||||
/// it steps down without deleting the key.
|
||||
/// - **Case 2**: If the current instance is not the leader but the lease has expired, it raises an error
|
||||
/// to re-initiate the campaign. If the leader failed to renew the lease, its session will expire and the lock
|
||||
/// will be released.
|
||||
/// - **Case 3**: If all checks pass, the function returns without performing any actions.
|
||||
async fn follower_action(&self) -> Result<()> {
|
||||
let key = self.election_key();
|
||||
// Case 1
|
||||
if self.is_leader() {
|
||||
self.step_down_without_lock().await?;
|
||||
}
|
||||
let (_, expire_time, current, _) = self
|
||||
.get_value_with_lease(&key, false)
|
||||
.await?
|
||||
.context(NoLeaderSnafu)?;
|
||||
// Case 2
|
||||
ensure!(expire_time > current, NoLeaderSnafu);
|
||||
// Case 3
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Step down the leader. The leader should delete the key and notify the leader watcher.
|
||||
///
|
||||
/// __DO NOT__ check if the deletion is successful, since the key may be deleted by others elected.
|
||||
///
|
||||
/// ## Caution:
|
||||
/// Should only step down while holding the advisory lock.
|
||||
async fn step_down(&self) -> Result<()> {
|
||||
let key = self.election_key();
|
||||
let leader_key = PgLeaderKey {
|
||||
name: self.leader_value.clone().into_bytes(),
|
||||
key: key.clone().into_bytes(),
|
||||
..Default::default()
|
||||
};
|
||||
if self
|
||||
.is_leader
|
||||
.compare_exchange(true, false, Ordering::Relaxed, Ordering::Relaxed)
|
||||
.is_ok()
|
||||
{
|
||||
self.delete_value(&key).await?;
|
||||
self.client
|
||||
.query(STEP_DOWN, &[])
|
||||
.await
|
||||
.context(PostgresExecutionSnafu)?;
|
||||
if let Err(e) = self
|
||||
.leader_watcher
|
||||
.send(LeaderChangeMessage::StepDown(Arc::new(leader_key)))
|
||||
{
|
||||
error!(e; "Failed to send leader change message");
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Still consider itself as the leader locally but failed to acquire the lock. Step down without deleting the key.
|
||||
async fn step_down_without_lock(&self) -> Result<()> {
|
||||
let key = self.election_key().into_bytes();
|
||||
let leader_key = PgLeaderKey {
|
||||
name: self.leader_value.clone().into_bytes(),
|
||||
key: key.clone(),
|
||||
..Default::default()
|
||||
};
|
||||
if self
|
||||
.is_leader
|
||||
.compare_exchange(true, false, Ordering::Relaxed, Ordering::Relaxed)
|
||||
.is_ok()
|
||||
{
|
||||
if let Err(e) = self
|
||||
.leader_watcher
|
||||
.send(LeaderChangeMessage::StepDown(Arc::new(leader_key)))
|
||||
{
|
||||
error!(e; "Failed to send leader change message");
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Elected as leader. The leader should put the key and notify the leader watcher.
|
||||
/// Caution: Should only elected while holding the advisory lock.
|
||||
async fn elected(&self) -> Result<()> {
|
||||
let key = self.election_key();
|
||||
let leader_key = PgLeaderKey {
|
||||
name: self.leader_value.clone().into_bytes(),
|
||||
key: key.clone().into_bytes(),
|
||||
..Default::default()
|
||||
};
|
||||
self.delete_value(&key).await?;
|
||||
self.put_value_with_lease(&key, &self.leader_value, META_LEASE_SECS)
|
||||
.await?;
|
||||
|
||||
if self
|
||||
.is_leader
|
||||
.compare_exchange(false, true, Ordering::Relaxed, Ordering::Relaxed)
|
||||
.is_ok()
|
||||
{
|
||||
self.leader_infancy.store(true, Ordering::Relaxed);
|
||||
|
||||
if let Err(e) = self
|
||||
.leader_watcher
|
||||
.send(LeaderChangeMessage::Elected(Arc::new(leader_key)))
|
||||
{
|
||||
error!(e; "Failed to send leader change message");
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -390,7 +662,7 @@ mod tests {
|
||||
};
|
||||
|
||||
let res = pg_election
|
||||
.put_value_with_lease(&key, &value)
|
||||
.put_value_with_lease(&key, &value, 10)
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(res);
|
||||
@@ -418,7 +690,7 @@ mod tests {
|
||||
let key = format!("test_key_{}", i);
|
||||
let value = format!("test_value_{}", i);
|
||||
pg_election
|
||||
.put_value_with_lease(&key, &value)
|
||||
.put_value_with_lease(&key, &value, 10)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
@@ -478,7 +750,7 @@ mod tests {
|
||||
handles.push(handle);
|
||||
}
|
||||
// Wait for candidates to registrate themselves and renew their leases at least once.
|
||||
tokio::time::sleep(Duration::from_secs(6)).await;
|
||||
tokio::time::sleep(Duration::from_secs(3)).await;
|
||||
|
||||
let client = create_postgres_client().await.unwrap();
|
||||
|
||||
@@ -516,4 +788,402 @@ mod tests {
|
||||
assert!(res);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_elected_and_step_down() {
|
||||
let leader_value = "test_leader".to_string();
|
||||
let candidate_lease_ttl_secs = 5;
|
||||
let client = create_postgres_client().await.unwrap();
|
||||
|
||||
let (tx, mut rx) = broadcast::channel(100);
|
||||
let leader_pg_election = PgElection {
|
||||
leader_value: leader_value.clone(),
|
||||
client,
|
||||
is_leader: AtomicBool::new(false),
|
||||
leader_infancy: AtomicBool::new(true),
|
||||
leader_watcher: tx,
|
||||
store_key_prefix: "test_prefix".to_string(),
|
||||
candidate_lease_ttl_secs,
|
||||
};
|
||||
|
||||
leader_pg_election.elected().await.unwrap();
|
||||
let (leader, expire_time, current, _) = leader_pg_election
|
||||
.get_value_with_lease(&leader_pg_election.election_key(), false)
|
||||
.await
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
assert!(leader == leader_value);
|
||||
assert!(expire_time > current);
|
||||
assert!(leader_pg_election.is_leader());
|
||||
|
||||
match rx.recv().await {
|
||||
Ok(LeaderChangeMessage::Elected(key)) => {
|
||||
assert_eq!(String::from_utf8_lossy(key.name()), leader_value);
|
||||
assert_eq!(
|
||||
String::from_utf8_lossy(key.key()),
|
||||
leader_pg_election.election_key()
|
||||
);
|
||||
assert_eq!(key.lease_id(), i64::default());
|
||||
assert_eq!(key.revision(), i64::default());
|
||||
}
|
||||
_ => panic!("Expected LeaderChangeMessage::Elected"),
|
||||
}
|
||||
|
||||
leader_pg_election.step_down_without_lock().await.unwrap();
|
||||
let (leader, _, _, _) = leader_pg_election
|
||||
.get_value_with_lease(&leader_pg_election.election_key(), false)
|
||||
.await
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
assert!(leader == leader_value);
|
||||
assert!(!leader_pg_election.is_leader());
|
||||
|
||||
match rx.recv().await {
|
||||
Ok(LeaderChangeMessage::StepDown(key)) => {
|
||||
assert_eq!(String::from_utf8_lossy(key.name()), leader_value);
|
||||
assert_eq!(
|
||||
String::from_utf8_lossy(key.key()),
|
||||
leader_pg_election.election_key()
|
||||
);
|
||||
assert_eq!(key.lease_id(), i64::default());
|
||||
assert_eq!(key.revision(), i64::default());
|
||||
}
|
||||
_ => panic!("Expected LeaderChangeMessage::StepDown"),
|
||||
}
|
||||
|
||||
leader_pg_election.elected().await.unwrap();
|
||||
let (leader, expire_time, current, _) = leader_pg_election
|
||||
.get_value_with_lease(&leader_pg_election.election_key(), false)
|
||||
.await
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
assert!(leader == leader_value);
|
||||
assert!(expire_time > current);
|
||||
assert!(leader_pg_election.is_leader());
|
||||
|
||||
match rx.recv().await {
|
||||
Ok(LeaderChangeMessage::Elected(key)) => {
|
||||
assert_eq!(String::from_utf8_lossy(key.name()), leader_value);
|
||||
assert_eq!(
|
||||
String::from_utf8_lossy(key.key()),
|
||||
leader_pg_election.election_key()
|
||||
);
|
||||
assert_eq!(key.lease_id(), i64::default());
|
||||
assert_eq!(key.revision(), i64::default());
|
||||
}
|
||||
_ => panic!("Expected LeaderChangeMessage::Elected"),
|
||||
}
|
||||
|
||||
leader_pg_election.step_down().await.unwrap();
|
||||
let res = leader_pg_election
|
||||
.get_value_with_lease(&leader_pg_election.election_key(), false)
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(res.is_none());
|
||||
assert!(!leader_pg_election.is_leader());
|
||||
|
||||
match rx.recv().await {
|
||||
Ok(LeaderChangeMessage::StepDown(key)) => {
|
||||
assert_eq!(String::from_utf8_lossy(key.name()), leader_value);
|
||||
assert_eq!(
|
||||
String::from_utf8_lossy(key.key()),
|
||||
leader_pg_election.election_key()
|
||||
);
|
||||
assert_eq!(key.lease_id(), i64::default());
|
||||
assert_eq!(key.revision(), i64::default());
|
||||
}
|
||||
_ => panic!("Expected LeaderChangeMessage::StepDown"),
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_leader_action() {
|
||||
let leader_value = "test_leader".to_string();
|
||||
let candidate_lease_ttl_secs = 5;
|
||||
let client = create_postgres_client().await.unwrap();
|
||||
|
||||
let (tx, mut rx) = broadcast::channel(100);
|
||||
let leader_pg_election = PgElection {
|
||||
leader_value: leader_value.clone(),
|
||||
client,
|
||||
is_leader: AtomicBool::new(false),
|
||||
leader_infancy: AtomicBool::new(true),
|
||||
leader_watcher: tx,
|
||||
store_key_prefix: "test_prefix".to_string(),
|
||||
candidate_lease_ttl_secs,
|
||||
};
|
||||
|
||||
// Step 1: No leader exists, campaign and elected.
|
||||
let res = leader_pg_election
|
||||
.client
|
||||
.query(CAMPAIGN, &[])
|
||||
.await
|
||||
.unwrap();
|
||||
let res: bool = res[0].get(0);
|
||||
assert!(res);
|
||||
leader_pg_election.leader_action().await.unwrap();
|
||||
let (leader, expire_time, current, _) = leader_pg_election
|
||||
.get_value_with_lease(&leader_pg_election.election_key(), false)
|
||||
.await
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
assert!(leader == leader_value);
|
||||
assert!(expire_time > current);
|
||||
assert!(leader_pg_election.is_leader());
|
||||
|
||||
match rx.recv().await {
|
||||
Ok(LeaderChangeMessage::Elected(key)) => {
|
||||
assert_eq!(String::from_utf8_lossy(key.name()), leader_value);
|
||||
assert_eq!(
|
||||
String::from_utf8_lossy(key.key()),
|
||||
leader_pg_election.election_key()
|
||||
);
|
||||
assert_eq!(key.lease_id(), i64::default());
|
||||
assert_eq!(key.revision(), i64::default());
|
||||
}
|
||||
_ => panic!("Expected LeaderChangeMessage::Elected"),
|
||||
}
|
||||
|
||||
// Step 2: As a leader, renew the lease.
|
||||
let res = leader_pg_election
|
||||
.client
|
||||
.query(CAMPAIGN, &[])
|
||||
.await
|
||||
.unwrap();
|
||||
let res: bool = res[0].get(0);
|
||||
assert!(res);
|
||||
leader_pg_election.leader_action().await.unwrap();
|
||||
let (leader, new_expire_time, current, _) = leader_pg_election
|
||||
.get_value_with_lease(&leader_pg_election.election_key(), false)
|
||||
.await
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
assert!(leader == leader_value);
|
||||
assert!(new_expire_time > current && new_expire_time > expire_time);
|
||||
assert!(leader_pg_election.is_leader());
|
||||
|
||||
// Step 3: Something wrong, the leader lease expired.
|
||||
tokio::time::sleep(Duration::from_secs(META_LEASE_SECS)).await;
|
||||
|
||||
let res = leader_pg_election
|
||||
.client
|
||||
.query(CAMPAIGN, &[])
|
||||
.await
|
||||
.unwrap();
|
||||
let res: bool = res[0].get(0);
|
||||
assert!(res);
|
||||
leader_pg_election.leader_action().await.unwrap();
|
||||
let res = leader_pg_election
|
||||
.get_value_with_lease(&leader_pg_election.election_key(), false)
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(res.is_none());
|
||||
|
||||
match rx.recv().await {
|
||||
Ok(LeaderChangeMessage::StepDown(key)) => {
|
||||
assert_eq!(String::from_utf8_lossy(key.name()), leader_value);
|
||||
assert_eq!(
|
||||
String::from_utf8_lossy(key.key()),
|
||||
leader_pg_election.election_key()
|
||||
);
|
||||
assert_eq!(key.lease_id(), i64::default());
|
||||
assert_eq!(key.revision(), i64::default());
|
||||
}
|
||||
_ => panic!("Expected LeaderChangeMessage::StepDown"),
|
||||
}
|
||||
|
||||
// Step 4: Re-campaign and elected.
|
||||
let res = leader_pg_election
|
||||
.client
|
||||
.query(CAMPAIGN, &[])
|
||||
.await
|
||||
.unwrap();
|
||||
let res: bool = res[0].get(0);
|
||||
assert!(res);
|
||||
leader_pg_election.leader_action().await.unwrap();
|
||||
let (leader, expire_time, current, _) = leader_pg_election
|
||||
.get_value_with_lease(&leader_pg_election.election_key(), false)
|
||||
.await
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
assert!(leader == leader_value);
|
||||
assert!(expire_time > current);
|
||||
assert!(leader_pg_election.is_leader());
|
||||
|
||||
match rx.recv().await {
|
||||
Ok(LeaderChangeMessage::Elected(key)) => {
|
||||
assert_eq!(String::from_utf8_lossy(key.name()), leader_value);
|
||||
assert_eq!(
|
||||
String::from_utf8_lossy(key.key()),
|
||||
leader_pg_election.election_key()
|
||||
);
|
||||
assert_eq!(key.lease_id(), i64::default());
|
||||
assert_eq!(key.revision(), i64::default());
|
||||
}
|
||||
_ => panic!("Expected LeaderChangeMessage::Elected"),
|
||||
}
|
||||
|
||||
// Step 5: Something wrong, the leader key is deleted by other followers.
|
||||
leader_pg_election
|
||||
.delete_value(&leader_pg_election.election_key())
|
||||
.await
|
||||
.unwrap();
|
||||
leader_pg_election.leader_action().await.unwrap();
|
||||
let res = leader_pg_election
|
||||
.get_value_with_lease(&leader_pg_election.election_key(), false)
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(res.is_none());
|
||||
assert!(!leader_pg_election.is_leader());
|
||||
|
||||
match rx.recv().await {
|
||||
Ok(LeaderChangeMessage::StepDown(key)) => {
|
||||
assert_eq!(String::from_utf8_lossy(key.name()), leader_value);
|
||||
assert_eq!(
|
||||
String::from_utf8_lossy(key.key()),
|
||||
leader_pg_election.election_key()
|
||||
);
|
||||
assert_eq!(key.lease_id(), i64::default());
|
||||
assert_eq!(key.revision(), i64::default());
|
||||
}
|
||||
_ => panic!("Expected LeaderChangeMessage::StepDown"),
|
||||
}
|
||||
|
||||
// Step 6: Re-campaign and elected.
|
||||
let res = leader_pg_election
|
||||
.client
|
||||
.query(CAMPAIGN, &[])
|
||||
.await
|
||||
.unwrap();
|
||||
let res: bool = res[0].get(0);
|
||||
assert!(res);
|
||||
leader_pg_election.leader_action().await.unwrap();
|
||||
let (leader, expire_time, current, _) = leader_pg_election
|
||||
.get_value_with_lease(&leader_pg_election.election_key(), false)
|
||||
.await
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
assert!(leader == leader_value);
|
||||
assert!(expire_time > current);
|
||||
assert!(leader_pg_election.is_leader());
|
||||
|
||||
match rx.recv().await {
|
||||
Ok(LeaderChangeMessage::Elected(key)) => {
|
||||
assert_eq!(String::from_utf8_lossy(key.name()), leader_value);
|
||||
assert_eq!(
|
||||
String::from_utf8_lossy(key.key()),
|
||||
leader_pg_election.election_key()
|
||||
);
|
||||
assert_eq!(key.lease_id(), i64::default());
|
||||
assert_eq!(key.revision(), i64::default());
|
||||
}
|
||||
_ => panic!("Expected LeaderChangeMessage::Elected"),
|
||||
}
|
||||
|
||||
// Step 7: Something wrong, the leader key changed by others.
|
||||
let res = leader_pg_election
|
||||
.client
|
||||
.query(CAMPAIGN, &[])
|
||||
.await
|
||||
.unwrap();
|
||||
let res: bool = res[0].get(0);
|
||||
assert!(res);
|
||||
leader_pg_election
|
||||
.delete_value(&leader_pg_election.election_key())
|
||||
.await
|
||||
.unwrap();
|
||||
leader_pg_election
|
||||
.put_value_with_lease(&leader_pg_election.election_key(), "test", 10)
|
||||
.await
|
||||
.unwrap();
|
||||
leader_pg_election.leader_action().await.unwrap();
|
||||
let res = leader_pg_election
|
||||
.get_value_with_lease(&leader_pg_election.election_key(), false)
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(res.is_none());
|
||||
assert!(!leader_pg_election.is_leader());
|
||||
|
||||
match rx.recv().await {
|
||||
Ok(LeaderChangeMessage::StepDown(key)) => {
|
||||
assert_eq!(String::from_utf8_lossy(key.name()), leader_value);
|
||||
assert_eq!(
|
||||
String::from_utf8_lossy(key.key()),
|
||||
leader_pg_election.election_key()
|
||||
);
|
||||
assert_eq!(key.lease_id(), i64::default());
|
||||
assert_eq!(key.revision(), i64::default());
|
||||
}
|
||||
_ => panic!("Expected LeaderChangeMessage::StepDown"),
|
||||
}
|
||||
}
|
||||
|
||||
    #[tokio::test]
    async fn test_follower_action() {
        let candidate_lease_ttl_secs = 5;

        let follower_client = create_postgres_client().await.unwrap();
        let (tx, mut rx) = broadcast::channel(100);
        let follower_pg_election = PgElection {
            leader_value: "test_follower".to_string(),
            client: follower_client,
            is_leader: AtomicBool::new(false),
            leader_infancy: AtomicBool::new(true),
            leader_watcher: tx,
            store_key_prefix: "test_prefix".to_string(),
            candidate_lease_ttl_secs,
        };

        let leader_client = create_postgres_client().await.unwrap();
        let (tx, _) = broadcast::channel(100);
        let leader_pg_election = PgElection {
            leader_value: "test_leader".to_string(),
            client: leader_client,
            is_leader: AtomicBool::new(false),
            leader_infancy: AtomicBool::new(true),
            leader_watcher: tx,
            store_key_prefix: "test_prefix".to_string(),
            candidate_lease_ttl_secs,
        };

        leader_pg_election
            .client
            .query(CAMPAIGN, &[])
            .await
            .unwrap();
        leader_pg_election.elected().await.unwrap();

        // Step 1: As a follower, the leader exists and the lease is not expired.
        follower_pg_election.follower_action().await.unwrap();

        // Step 2: As a follower, the leader exists but the lease expired.
        tokio::time::sleep(Duration::from_secs(META_LEASE_SECS)).await;
        assert!(follower_pg_election.follower_action().await.is_err());

        // Step 3: As a follower, the leader does not exist.
        leader_pg_election
            .delete_value(&leader_pg_election.election_key())
            .await
            .unwrap();
        assert!(follower_pg_election.follower_action().await.is_err());

        // Step 4: Follower thinks it's the leader but failed to acquire the lock.
        follower_pg_election
            .is_leader
            .store(true, Ordering::Relaxed);
        assert!(follower_pg_election.follower_action().await.is_err());

        match rx.recv().await {
            Ok(LeaderChangeMessage::StepDown(key)) => {
                assert_eq!(String::from_utf8_lossy(key.name()), "test_follower");
                assert_eq!(
                    String::from_utf8_lossy(key.key()),
                    follower_pg_election.election_key()
                );
                assert_eq!(key.lease_id(), i64::default());
                assert_eq!(key.revision(), i64::default());
            }
            _ => panic!("Expected LeaderChangeMessage::StepDown"),
        }
    }
}
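The tests above poke leader_action and follower_action directly. A minimal sketch of how a node might drive those two entry points on a timer is shown below; the Node type, its error handling, and the interval are illustrative stand-ins, not the actual meta-srv election loop.

use std::sync::atomic::{AtomicBool, Ordering};
use std::time::Duration;

/// Minimal stand-in for an election participant; the real PgElection in the
/// tests above drives the same two entry points against PostgreSQL.
struct Node {
    is_leader: AtomicBool,
}

impl Node {
    async fn leader_action(&self) -> Result<(), String> {
        // Real code renews the leader lease and steps down on failure.
        Ok(())
    }

    async fn follower_action(&self) -> Result<(), String> {
        // Real code checks that a live, unexpired leader key exists.
        Err("leader lease expired".to_string())
    }
}

/// One tick of the campaign loop: leaders renew, followers watch.
async fn election_tick(node: &Node) {
    if node.is_leader.load(Ordering::Relaxed) {
        if node.leader_action().await.is_err() {
            node.is_leader.store(false, Ordering::Relaxed);
        }
    } else if node.follower_action().await.is_err() {
        // A failed follower check is the cue to re-campaign on the next tick.
    }
}

#[tokio::main]
async fn main() {
    let node = Node { is_leader: AtomicBool::new(false) };
    let mut ticker = tokio::time::interval(Duration::from_secs(1));
    ticker.tick().await;
    election_tick(&node).await;
}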
|
||||
@@ -19,6 +19,7 @@ use object_store::util::{join_dir, with_instrument_layers};
use object_store::ObjectStore;
use snafu::ResultExt;
use store_api::metadata::RegionMetadataRef;
use store_api::storage::SequenceNumber;

use crate::cache::write_cache::SstUploadRequest;
use crate::cache::CacheManagerRef;
@@ -164,7 +165,9 @@ impl AccessLayer {
                request.metadata,
                indexer,
            );
            writer.write_all(request.source, write_opts).await?
            writer
                .write_all(request.source, request.max_sequence, write_opts)
                .await?
        };

        // Put parquet metadata to cache manager.
@@ -194,6 +197,7 @@ pub(crate) struct SstWriteRequest {
    pub(crate) cache_manager: CacheManagerRef,
    #[allow(dead_code)]
    pub(crate) storage: Option<String>,
    pub(crate) max_sequence: Option<SequenceNumber>,

    /// Configs for index
    pub(crate) index_options: IndexOptions,
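The hunk above threads a max_sequence through SstWriteRequest into write_all so the resulting SST can record the newest row it contains. A minimal sketch of that idea follows; the WriteRequest, SstInfo, and write_all names are simplified stand-ins, not the mito2 structs.

/// Illustrative only: shows why the writer takes the max sequence alongside
/// the row source.
type SequenceNumber = u64;

struct WriteRequest<I> {
    source: I,
    /// Largest committed sequence among the rows being flushed/compacted,
    /// if the caller already knows it (e.g. from memtable stats).
    max_sequence: Option<SequenceNumber>,
}

struct SstInfo {
    num_rows: usize,
    max_sequence: Option<SequenceNumber>,
}

fn write_all<I: Iterator<Item = (SequenceNumber, Vec<u8>)>>(req: WriteRequest<I>) -> SstInfo {
    let mut num_rows = 0;
    let mut seen_max = None;
    for (seq, _row) in req.source {
        num_rows += 1;
        seen_max = seen_max.max(Some(seq));
    }
    SstInfo {
        num_rows,
        // Prefer the caller-provided value; fall back to what was observed.
        max_sequence: req.max_sequence.or(seen_max),
    }
}

fn main() {
    let rows = vec![(5u64, vec![1u8]), (6, vec![2]), (7, vec![3])];
    let info = write_all(WriteRequest { source: rows.into_iter(), max_sequence: Some(7) });
    assert_eq!(info.num_rows, 3);
    assert_eq!(info.max_sequence, Some(7));
}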
|
||||
@@ -55,6 +55,195 @@ const FILE_TYPE: &str = "file";
/// Metrics type key for selector result cache.
const SELECTOR_RESULT_TYPE: &str = "selector_result";

/// Cache strategies that may only enable a subset of caches.
#[derive(Clone)]
pub enum CacheStrategy {
    /// Strategy for normal operations.
    /// Doesn't disable any cache.
    EnableAll(CacheManagerRef),
    /// Strategy for compaction.
    /// Disables some caches during compaction to avoid affecting queries.
    /// Enables the write cache so that the compaction can read files cached
    /// in the write cache and write the compacted files back to the write cache.
    Compaction(CacheManagerRef),
    /// Do not use any cache.
    Disabled,
}

impl CacheStrategy {
    /// Calls [CacheManager::get_parquet_meta_data()].
    pub async fn get_parquet_meta_data(
        &self,
        region_id: RegionId,
        file_id: FileId,
    ) -> Option<Arc<ParquetMetaData>> {
        match self {
            CacheStrategy::EnableAll(cache_manager) => {
                cache_manager
                    .get_parquet_meta_data(region_id, file_id)
                    .await
            }
            CacheStrategy::Compaction(cache_manager) => {
                cache_manager
                    .get_parquet_meta_data(region_id, file_id)
                    .await
            }
            CacheStrategy::Disabled => None,
        }
    }

    /// Calls [CacheManager::get_parquet_meta_data_from_mem_cache()].
    pub fn get_parquet_meta_data_from_mem_cache(
        &self,
        region_id: RegionId,
        file_id: FileId,
    ) -> Option<Arc<ParquetMetaData>> {
        match self {
            CacheStrategy::EnableAll(cache_manager) => {
                cache_manager.get_parquet_meta_data_from_mem_cache(region_id, file_id)
            }
            CacheStrategy::Compaction(cache_manager) => {
                cache_manager.get_parquet_meta_data_from_mem_cache(region_id, file_id)
            }
            CacheStrategy::Disabled => None,
        }
    }

    /// Calls [CacheManager::put_parquet_meta_data()].
    pub fn put_parquet_meta_data(
        &self,
        region_id: RegionId,
        file_id: FileId,
        metadata: Arc<ParquetMetaData>,
    ) {
        match self {
            CacheStrategy::EnableAll(cache_manager) => {
                cache_manager.put_parquet_meta_data(region_id, file_id, metadata);
            }
            CacheStrategy::Compaction(cache_manager) => {
                cache_manager.put_parquet_meta_data(region_id, file_id, metadata);
            }
            CacheStrategy::Disabled => {}
        }
    }

    /// Calls [CacheManager::remove_parquet_meta_data()].
    pub fn remove_parquet_meta_data(&self, region_id: RegionId, file_id: FileId) {
        match self {
            CacheStrategy::EnableAll(cache_manager) => {
                cache_manager.remove_parquet_meta_data(region_id, file_id);
            }
            CacheStrategy::Compaction(cache_manager) => {
                cache_manager.remove_parquet_meta_data(region_id, file_id);
            }
            CacheStrategy::Disabled => {}
        }
    }

    /// Calls [CacheManager::get_repeated_vector()].
    /// It returns None if the strategy is [CacheStrategy::Compaction] or [CacheStrategy::Disabled].
    pub fn get_repeated_vector(
        &self,
        data_type: &ConcreteDataType,
        value: &Value,
    ) -> Option<VectorRef> {
        match self {
            CacheStrategy::EnableAll(cache_manager) => {
                cache_manager.get_repeated_vector(data_type, value)
            }
            CacheStrategy::Compaction(_) | CacheStrategy::Disabled => None,
        }
    }

    /// Calls [CacheManager::put_repeated_vector()].
    /// It does nothing if the strategy isn't [CacheStrategy::EnableAll].
    pub fn put_repeated_vector(&self, value: Value, vector: VectorRef) {
        if let CacheStrategy::EnableAll(cache_manager) = self {
            cache_manager.put_repeated_vector(value, vector);
        }
    }

    /// Calls [CacheManager::get_pages()].
    /// It returns None if the strategy is [CacheStrategy::Compaction] or [CacheStrategy::Disabled].
    pub fn get_pages(&self, page_key: &PageKey) -> Option<Arc<PageValue>> {
        match self {
            CacheStrategy::EnableAll(cache_manager) => cache_manager.get_pages(page_key),
            CacheStrategy::Compaction(_) | CacheStrategy::Disabled => None,
        }
    }

    /// Calls [CacheManager::put_pages()].
    /// It does nothing if the strategy isn't [CacheStrategy::EnableAll].
    pub fn put_pages(&self, page_key: PageKey, pages: Arc<PageValue>) {
        if let CacheStrategy::EnableAll(cache_manager) = self {
            cache_manager.put_pages(page_key, pages);
        }
    }

    /// Calls [CacheManager::get_selector_result()].
    /// It returns None if the strategy is [CacheStrategy::Compaction] or [CacheStrategy::Disabled].
    pub fn get_selector_result(
        &self,
        selector_key: &SelectorResultKey,
    ) -> Option<Arc<SelectorResultValue>> {
        match self {
            CacheStrategy::EnableAll(cache_manager) => {
                cache_manager.get_selector_result(selector_key)
            }
            CacheStrategy::Compaction(_) | CacheStrategy::Disabled => None,
        }
    }

    /// Calls [CacheManager::put_selector_result()].
    /// It does nothing if the strategy isn't [CacheStrategy::EnableAll].
    pub fn put_selector_result(
        &self,
        selector_key: SelectorResultKey,
        result: Arc<SelectorResultValue>,
    ) {
        if let CacheStrategy::EnableAll(cache_manager) = self {
            cache_manager.put_selector_result(selector_key, result);
        }
    }

    /// Calls [CacheManager::write_cache()].
    /// It returns None if the strategy is [CacheStrategy::Disabled].
    pub fn write_cache(&self) -> Option<&WriteCacheRef> {
        match self {
            CacheStrategy::EnableAll(cache_manager) => cache_manager.write_cache(),
            CacheStrategy::Compaction(cache_manager) => cache_manager.write_cache(),
            CacheStrategy::Disabled => None,
        }
    }

    /// Calls [CacheManager::index_cache()].
    /// It returns None if the strategy is [CacheStrategy::Compaction] or [CacheStrategy::Disabled].
    pub fn index_cache(&self) -> Option<&InvertedIndexCacheRef> {
        match self {
            CacheStrategy::EnableAll(cache_manager) => cache_manager.index_cache(),
            CacheStrategy::Compaction(_) | CacheStrategy::Disabled => None,
        }
    }

    /// Calls [CacheManager::bloom_filter_index_cache()].
    /// It returns None if the strategy is [CacheStrategy::Compaction] or [CacheStrategy::Disabled].
    pub fn bloom_filter_index_cache(&self) -> Option<&BloomFilterIndexCacheRef> {
        match self {
            CacheStrategy::EnableAll(cache_manager) => cache_manager.bloom_filter_index_cache(),
            CacheStrategy::Compaction(_) | CacheStrategy::Disabled => None,
        }
    }

    /// Calls [CacheManager::puffin_metadata_cache()].
    /// It returns None if the strategy is [CacheStrategy::Compaction] or [CacheStrategy::Disabled].
    pub fn puffin_metadata_cache(&self) -> Option<&PuffinMetadataCacheRef> {
        match self {
            CacheStrategy::EnableAll(cache_manager) => cache_manager.puffin_metadata_cache(),
            CacheStrategy::Compaction(_) | CacheStrategy::Disabled => None,
        }
    }
}

/// Manages cached data for the engine.
///
/// All caches are disabled by default.
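A compact illustration of the dispatch pattern introduced above: the strategy variant decides which operations reach the underlying cache, so compaction can still use the write cache without polluting the query-facing caches. The PageCache and Strategy types below are simplified stand-ins, not the mito2 API.

use std::collections::HashMap;
use std::sync::Arc;

/// Miniature page cache; the real CacheManager holds several caches.
struct PageCache {
    pages: HashMap<u64, Arc<Vec<u8>>>,
}

enum Strategy {
    EnableAll(Arc<PageCache>),
    Compaction(Arc<PageCache>),
    Disabled,
}

impl Strategy {
    /// Page reads are served only for normal queries; compaction skips them so
    /// it does not evict pages that queries are using.
    fn get_pages(&self, key: u64) -> Option<Arc<Vec<u8>>> {
        match self {
            Strategy::EnableAll(cache) => cache.pages.get(&key).cloned(),
            Strategy::Compaction(_) | Strategy::Disabled => None,
        }
    }
}

fn main() {
    let cache = Arc::new(PageCache {
        pages: HashMap::from([(1, Arc::new(vec![0u8; 4]))]),
    });
    assert!(Strategy::EnableAll(cache.clone()).get_pages(1).is_some());
    assert!(Strategy::Compaction(cache).get_pages(1).is_none());
    assert!(Strategy::Disabled.get_pages(1).is_none());
}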
@@ -61,7 +61,7 @@ fn bloom_filter_index_content_weight((k, _): &((FileId, ColumnId), PageKey), v:
|
||||
pub struct CachedBloomFilterIndexBlobReader<R> {
|
||||
file_id: FileId,
|
||||
column_id: ColumnId,
|
||||
file_size: u64,
|
||||
blob_size: u64,
|
||||
inner: R,
|
||||
cache: BloomFilterIndexCacheRef,
|
||||
}
|
||||
@@ -71,14 +71,14 @@ impl<R> CachedBloomFilterIndexBlobReader<R> {
|
||||
pub fn new(
|
||||
file_id: FileId,
|
||||
column_id: ColumnId,
|
||||
file_size: u64,
|
||||
blob_size: u64,
|
||||
inner: R,
|
||||
cache: BloomFilterIndexCacheRef,
|
||||
) -> Self {
|
||||
Self {
|
||||
file_id,
|
||||
column_id,
|
||||
file_size,
|
||||
blob_size,
|
||||
inner,
|
||||
cache,
|
||||
}
|
||||
@@ -92,7 +92,7 @@ impl<R: BloomFilterReader + Send> BloomFilterReader for CachedBloomFilterIndexBl
|
||||
self.cache
|
||||
.get_or_load(
|
||||
(self.file_id, self.column_id),
|
||||
self.file_size,
|
||||
self.blob_size,
|
||||
offset,
|
||||
size,
|
||||
move |ranges| async move { inner.read_vec(&ranges).await },
|
||||
|
||||
src/mito2/src/cache/index/inverted_index.rs (8 changes, vendored)
@@ -58,17 +58,17 @@ fn inverted_index_content_weight((k, _): &(FileId, PageKey), v: &Bytes) -> u32 {
|
||||
/// Inverted index blob reader with cache.
|
||||
pub struct CachedInvertedIndexBlobReader<R> {
|
||||
file_id: FileId,
|
||||
file_size: u64,
|
||||
blob_size: u64,
|
||||
inner: R,
|
||||
cache: InvertedIndexCacheRef,
|
||||
}
|
||||
|
||||
impl<R> CachedInvertedIndexBlobReader<R> {
|
||||
/// Creates a new inverted index blob reader with cache.
|
||||
pub fn new(file_id: FileId, file_size: u64, inner: R, cache: InvertedIndexCacheRef) -> Self {
|
||||
pub fn new(file_id: FileId, blob_size: u64, inner: R, cache: InvertedIndexCacheRef) -> Self {
|
||||
Self {
|
||||
file_id,
|
||||
file_size,
|
||||
blob_size,
|
||||
inner,
|
||||
cache,
|
||||
}
|
||||
@@ -82,7 +82,7 @@ impl<R: InvertedIndexReader> InvertedIndexReader for CachedInvertedIndexBlobRead
|
||||
self.cache
|
||||
.get_or_load(
|
||||
self.file_id,
|
||||
self.file_size,
|
||||
self.blob_size,
|
||||
offset,
|
||||
size,
|
||||
move |ranges| async move { inner.read_vec(&ranges).await },
|
||||
|
||||
src/mito2/src/cache/write_cache.rs (10 changes, vendored)
@@ -138,7 +138,9 @@ impl WriteCache {
|
||||
indexer,
|
||||
);
|
||||
|
||||
let sst_info = writer.write_all(write_request.source, write_opts).await?;
|
||||
let sst_info = writer
|
||||
.write_all(write_request.source, write_request.max_sequence, write_opts)
|
||||
.await?;
|
||||
|
||||
timer.stop_and_record();
|
||||
|
||||
@@ -332,7 +334,7 @@ mod tests {
|
||||
use super::*;
|
||||
use crate::access_layer::OperationType;
|
||||
use crate::cache::test_util::new_fs_store;
|
||||
use crate::cache::CacheManager;
|
||||
use crate::cache::{CacheManager, CacheStrategy};
|
||||
use crate::region::options::IndexOptions;
|
||||
use crate::sst::file::FileId;
|
||||
use crate::sst::location::{index_file_path, sst_file_path};
|
||||
@@ -375,6 +377,7 @@ mod tests {
|
||||
metadata,
|
||||
source,
|
||||
storage: None,
|
||||
max_sequence: None,
|
||||
cache_manager: Default::default(),
|
||||
index_options: IndexOptions::default(),
|
||||
inverted_index_config: Default::default(),
|
||||
@@ -468,6 +471,7 @@ mod tests {
|
||||
metadata,
|
||||
source,
|
||||
storage: None,
|
||||
max_sequence: None,
|
||||
cache_manager: cache_manager.clone(),
|
||||
index_options: IndexOptions::default(),
|
||||
inverted_index_config: Default::default(),
|
||||
@@ -495,7 +499,7 @@ mod tests {
|
||||
|
||||
// Read metadata from write cache
|
||||
let builder = ParquetReaderBuilder::new(data_home, handle.clone(), mock_store.clone())
|
||||
.cache(Some(cache_manager.clone()));
|
||||
.cache(CacheStrategy::EnableAll(cache_manager.clone()));
|
||||
let reader = builder.build().await.unwrap();
|
||||
|
||||
// Check parquet metadata
|
||||
|
||||
@@ -43,7 +43,7 @@ use table::predicate::Predicate;
|
||||
use tokio::sync::mpsc::{self, Sender};
|
||||
|
||||
use crate::access_layer::AccessLayerRef;
|
||||
use crate::cache::CacheManagerRef;
|
||||
use crate::cache::{CacheManagerRef, CacheStrategy};
|
||||
use crate::compaction::compactor::{CompactionRegion, CompactionVersion, DefaultCompactor};
|
||||
use crate::compaction::picker::{new_picker, CompactionTask};
|
||||
use crate::compaction::task::CompactionTaskImpl;
|
||||
@@ -573,6 +573,7 @@ pub struct SerializedCompactionOutput {
|
||||
struct CompactionSstReaderBuilder<'a> {
|
||||
metadata: RegionMetadataRef,
|
||||
sst_layer: AccessLayerRef,
|
||||
cache: CacheManagerRef,
|
||||
inputs: &'a [FileHandle],
|
||||
append_mode: bool,
|
||||
filter_deleted: bool,
|
||||
@@ -586,7 +587,8 @@ impl<'a> CompactionSstReaderBuilder<'a> {
|
||||
let mut scan_input = ScanInput::new(self.sst_layer, ProjectionMapper::all(&self.metadata)?)
|
||||
.with_files(self.inputs.to_vec())
|
||||
.with_append_mode(self.append_mode)
|
||||
.with_cache(None)
|
||||
// We use special cache strategy for compaction.
|
||||
.with_cache(CacheStrategy::Compaction(self.cache))
|
||||
.with_filter_deleted(self.filter_deleted)
|
||||
// We ignore file not found error during compaction.
|
||||
.with_ignore_file_not_found(true)
|
||||
@@ -693,7 +695,7 @@ mod tests {
|
||||
let (tx, _rx) = mpsc::channel(4);
|
||||
let mut scheduler = env.mock_compaction_scheduler(tx);
|
||||
let mut builder = VersionControlBuilder::new();
|
||||
let schema_metadata_manager = mock_schema_metadata_manager();
|
||||
let (schema_metadata_manager, kv_backend) = mock_schema_metadata_manager();
|
||||
schema_metadata_manager
|
||||
.register_region_table_info(
|
||||
builder.region_id().table_id(),
|
||||
@@ -701,6 +703,7 @@ mod tests {
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
kv_backend,
|
||||
)
|
||||
.await;
|
||||
// Nothing to compact.
|
||||
@@ -757,7 +760,7 @@ mod tests {
|
||||
let purger = builder.file_purger();
|
||||
let region_id = builder.region_id();
|
||||
|
||||
let schema_metadata_manager = mock_schema_metadata_manager();
|
||||
let (schema_metadata_manager, kv_backend) = mock_schema_metadata_manager();
|
||||
schema_metadata_manager
|
||||
.register_region_table_info(
|
||||
builder.region_id().table_id(),
|
||||
@@ -765,6 +768,7 @@ mod tests {
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
kv_backend,
|
||||
)
|
||||
.await;
|
||||
|
||||
|
||||
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use std::num::NonZero;
use std::sync::Arc;
use std::time::Duration;

@@ -303,10 +304,17 @@ impl Compactor for DefaultCompactor {
            let fulltext_index_config = compaction_region.engine_config.fulltext_index.clone();
            let bloom_filter_index_config =
                compaction_region.engine_config.bloom_filter_index.clone();
            let max_sequence = output
                .inputs
                .iter()
                .map(|f| f.meta_ref().sequence)
                .max()
                .flatten();
            futs.push(async move {
                let reader = CompactionSstReaderBuilder {
                    metadata: region_metadata.clone(),
                    sst_layer: sst_layer.clone(),
                    cache: cache_manager.clone(),
                    inputs: &output.inputs,
                    append_mode,
                    filter_deleted: output.filter_deleted,
@@ -324,6 +332,7 @@ impl Compactor for DefaultCompactor {
                    source: Source::Reader(reader),
                    cache_manager,
                    storage,
                    max_sequence: max_sequence.map(NonZero::get),
                    index_options,
                    inverted_index_config,
                    fulltext_index_config,
@@ -342,6 +351,7 @@ impl Compactor for DefaultCompactor {
                    index_file_size: sst_info.index_metadata.file_size,
                    num_rows: sst_info.num_rows as u64,
                    num_row_groups: sst_info.num_row_groups,
                    sequence: max_sequence,
                });
                Ok(file_meta_opt)
            });
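The max_sequence computation above relies on Option's ordering and Iterator::max. A small self-contained check of that behavior: None sorts below any Some, so an unknown sequence never wins, and flatten collapses the empty-iterator case back to None.

use std::num::NonZeroU64;

fn main() {
    // Input file metas carry Option<NonZeroU64> sequences; files written before
    // this change have None. `.max().flatten()` picks the largest known sequence.
    let inputs: Vec<Option<NonZeroU64>> =
        vec![NonZeroU64::new(42), None, NonZeroU64::new(7)];

    let max_sequence = inputs.iter().copied().max().flatten();
    assert_eq!(max_sequence, NonZeroU64::new(42));

    // An empty input list yields None.
    let empty: Vec<Option<NonZeroU64>> = vec![];
    assert_eq!(empty.iter().copied().max().flatten(), None);

    // The SST write request takes a plain integer, hence the map to NonZero::get.
    let as_u64 = max_sequence.map(NonZeroU64::get);
    assert_eq!(as_u64, Some(42));
}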
@@ -39,6 +39,7 @@ pub fn new_file_handle(
|
||||
index_file_size: 0,
|
||||
num_rows: 0,
|
||||
num_row_groups: 0,
|
||||
sequence: None,
|
||||
},
|
||||
file_purger,
|
||||
)
|
||||
@@ -63,6 +64,7 @@ pub(crate) fn new_file_handles(file_specs: &[(i64, i64, u64)]) -> Vec<FileHandle
|
||||
index_file_size: 0,
|
||||
num_rows: 0,
|
||||
num_row_groups: 0,
|
||||
sequence: None,
|
||||
},
|
||||
file_purger.clone(),
|
||||
)
|
||||
|
||||
@@ -760,6 +760,7 @@ mod tests {
|
||||
index_file_size: 0,
|
||||
num_rows: 0,
|
||||
num_row_groups: 0,
|
||||
sequence: None,
|
||||
},
|
||||
Arc::new(NoopFilePurger),
|
||||
)
|
||||
|
||||
@@ -84,6 +84,7 @@ use store_api::region_request::{AffectedRows, RegionOpenRequest, RegionRequest};
|
||||
use store_api::storage::{RegionId, ScanRequest};
|
||||
use tokio::sync::{oneshot, Semaphore};
|
||||
|
||||
use crate::cache::CacheStrategy;
|
||||
use crate::config::MitoConfig;
|
||||
use crate::error::{
|
||||
InvalidRequestSnafu, JoinSnafu, RecvSnafu, RegionNotFoundSnafu, Result, SerdeJsonSnafu,
|
||||
@@ -428,7 +429,7 @@ impl EngineInner {
|
||||
version,
|
||||
region.access_layer.clone(),
|
||||
request,
|
||||
Some(cache_manager),
|
||||
CacheStrategy::EnableAll(cache_manager),
|
||||
)
|
||||
.with_parallel_scan_channel_size(self.config.parallel_scan_channel_size)
|
||||
.with_ignore_inverted_index(self.config.inverted_index.apply_on_query.disabled())
|
||||
|
||||
@@ -116,6 +116,7 @@ async fn test_alter_region() {
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
env.get_kv_backend(),
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -210,6 +211,7 @@ async fn test_put_after_alter() {
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
env.get_kv_backend(),
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -315,6 +317,7 @@ async fn test_alter_region_retry() {
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
env.get_kv_backend(),
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -374,6 +377,7 @@ async fn test_alter_on_flushing() {
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
env.get_kv_backend(),
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -477,6 +481,7 @@ async fn test_alter_column_fulltext_options() {
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
env.get_kv_backend(),
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -594,6 +599,7 @@ async fn test_alter_region_ttl_options() {
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
env.get_kv_backend(),
|
||||
)
|
||||
.await;
|
||||
engine
|
||||
@@ -644,6 +650,7 @@ async fn test_write_stall_on_altering() {
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
env.get_kv_backend(),
|
||||
)
|
||||
.await;
|
||||
|
||||
|
||||
@@ -104,6 +104,7 @@ async fn test_append_mode_compaction() {
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
env.get_kv_backend(),
|
||||
)
|
||||
.await;
|
||||
|
||||
|
||||
@@ -580,7 +580,7 @@ async fn test_region_usage() {
|
||||
flush_region(&engine, region_id, None).await;
|
||||
|
||||
let region_stat = region.region_statistic();
|
||||
assert_eq!(region_stat.sst_size, 2790);
|
||||
assert!(region_stat.sst_size > 0); // Chief says this assert can ensure the size is counted.
|
||||
assert_eq!(region_stat.num_rows, 10);
|
||||
|
||||
// region total usage
|
||||
|
||||
@@ -119,6 +119,7 @@ async fn test_compaction_region() {
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
env.get_kv_backend(),
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -190,6 +191,7 @@ async fn test_compaction_region_with_overlapping() {
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
env.get_kv_backend(),
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -245,6 +247,7 @@ async fn test_compaction_region_with_overlapping_delete_all() {
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
env.get_kv_backend(),
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -319,6 +322,7 @@ async fn test_readonly_during_compaction() {
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
env.get_kv_backend(),
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -374,3 +378,91 @@ async fn test_readonly_during_compaction() {
    let vec = collect_stream_ts(stream).await;
    assert_eq!((0..20).map(|v| v * 1000).collect::<Vec<_>>(), vec);
}

#[tokio::test]
async fn test_compaction_update_time_window() {
    common_telemetry::init_default_ut_logging();
    let mut env = TestEnv::new();
    let engine = env.create_engine(MitoConfig::default()).await;

    let region_id = RegionId::new(1, 1);

    env.get_schema_metadata_manager()
        .register_region_table_info(
            region_id.table_id(),
            "test_table",
            "test_catalog",
            "test_schema",
            None,
            env.get_kv_backend(),
        )
        .await;

    let request = CreateRequestBuilder::new()
        .insert_option("compaction.type", "twcs")
        .insert_option("compaction.twcs.max_active_window_runs", "2")
        .insert_option("compaction.twcs.max_active_window_files", "2")
        .insert_option("compaction.twcs.max_inactive_window_runs", "2")
        .insert_option("compaction.twcs.max_inactive_window_files", "2")
        .build();

    let column_schemas = request
        .column_metadatas
        .iter()
        .map(column_metadata_to_column_schema)
        .collect::<Vec<_>>();
    engine
        .handle_request(region_id, RegionRequest::Create(request))
        .await
        .unwrap();
    // Flush 3 SSTs for compaction.
    put_and_flush(&engine, region_id, &column_schemas, 0..1200).await; // window 3600
    put_and_flush(&engine, region_id, &column_schemas, 1200..2400).await; // window 3600
    put_and_flush(&engine, region_id, &column_schemas, 2400..3600).await; // window 3600

    let result = engine
        .handle_request(
            region_id,
            RegionRequest::Compact(RegionCompactRequest::default()),
        )
        .await
        .unwrap();
    assert_eq!(result.affected_rows, 0);

    let scanner = engine.scanner(region_id, ScanRequest::default()).unwrap();
    assert_eq!(0, scanner.num_memtables());
    // We keep at most two files.
    assert_eq!(
        2,
        scanner.num_files(),
        "unexpected files: {:?}",
        scanner.file_ids()
    );

    // Flush a new SST and the time window is applied.
    put_and_flush(&engine, region_id, &column_schemas, 0..1200).await; // window 3600

    // Puts window 7200.
    let rows = Rows {
        schema: column_schemas.clone(),
        rows: build_rows_for_key("a", 3600, 4000, 0),
    };
    put_rows(&engine, region_id, rows).await;
    let scanner = engine.scanner(region_id, ScanRequest::default()).unwrap();
    assert_eq!(1, scanner.num_memtables());
    let stream = scanner.scan().await.unwrap();
    let vec = collect_stream_ts(stream).await;
    assert_eq!((0..4000).map(|v| v * 1000).collect::<Vec<_>>(), vec);

    // Puts window 3600.
    let rows = Rows {
        schema: column_schemas.clone(),
        rows: build_rows_for_key("a", 2400, 3600, 0),
    };
    put_rows(&engine, region_id, rows).await;
    let scanner = engine.scanner(region_id, ScanRequest::default()).unwrap();
    assert_eq!(2, scanner.num_memtables());
    let stream = scanner.scan().await.unwrap();
    let vec = collect_stream_ts(stream).await;
    assert_eq!((0..4000).map(|v| v * 1000).collect::<Vec<_>>(), vec);
}
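The "window 3600" / "window 7200" comments in the test above assume each row timestamp maps to a time window labeled by its exclusive upper bound. A hedged sketch of that mapping, reimplemented here for illustration rather than taken from the engine's own helper:

fn window_upper_bound(ts_seconds: i64, window_seconds: i64) -> i64 {
    // Align the timestamp to its time window and label the window by its
    // exclusive upper bound, which matches the comments in the test above.
    (ts_seconds.div_euclid(window_seconds) + 1) * window_seconds
}

fn main() {
    // Rows 0..3600 seconds all land in the 3600s window.
    assert_eq!(window_upper_bound(0, 3600), 3600);
    assert_eq!(window_upper_bound(3599, 3600), 3600);
    // Rows 3600..4000 seconds land in the next window, 7200.
    assert_eq!(window_upper_bound(3600, 3600), 7200);
    assert_eq!(window_upper_bound(3999, 3600), 7200);
    // Negative timestamps still round toward the correct window.
    assert_eq!(window_upper_bound(-1, 3600), 0);
}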
|
||||
@@ -17,6 +17,7 @@ use std::time::Duration;
|
||||
|
||||
use api::v1::Rows;
|
||||
use common_meta::key::SchemaMetadataManager;
|
||||
use common_meta::kv_backend::KvBackendRef;
|
||||
use object_store::util::join_path;
|
||||
use store_api::region_engine::RegionEngine;
|
||||
use store_api::region_request::{RegionDropRequest, RegionRequest};
|
||||
@@ -49,6 +50,7 @@ async fn test_engine_drop_region() {
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
env.get_kv_backend(),
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -102,6 +104,7 @@ async fn test_engine_drop_region_for_custom_store() {
|
||||
async fn setup(
|
||||
engine: &MitoEngine,
|
||||
schema_metadata_manager: &SchemaMetadataManager,
|
||||
kv_backend: &KvBackendRef,
|
||||
region_id: RegionId,
|
||||
storage_name: &str,
|
||||
) {
|
||||
@@ -123,6 +126,7 @@ async fn test_engine_drop_region_for_custom_store() {
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
kv_backend.clone(),
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -145,17 +149,26 @@ async fn test_engine_drop_region_for_custom_store() {
|
||||
.await;
|
||||
let schema_metadata_manager = env.get_schema_metadata_manager();
|
||||
let object_store_manager = env.get_object_store_manager().unwrap();
|
||||
let kv_backend = env.get_kv_backend();
|
||||
|
||||
let global_region_id = RegionId::new(1, 1);
|
||||
setup(
|
||||
&engine,
|
||||
&schema_metadata_manager,
|
||||
&kv_backend,
|
||||
global_region_id,
|
||||
"default",
|
||||
)
|
||||
.await;
|
||||
let custom_region_id = RegionId::new(2, 1);
|
||||
setup(&engine, &schema_metadata_manager, custom_region_id, "Gcs").await;
|
||||
setup(
|
||||
&engine,
|
||||
&schema_metadata_manager,
|
||||
&kv_backend,
|
||||
custom_region_id,
|
||||
"Gcs",
|
||||
)
|
||||
.await;
|
||||
|
||||
let global_region = engine.get_region(global_region_id).unwrap();
|
||||
let global_region_dir = global_region.access_layer.region_dir().to_string();
|
||||
|
||||
@@ -72,6 +72,7 @@ async fn test_edit_region_schedule_compaction() {
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
env.get_kv_backend(),
|
||||
)
|
||||
.await;
|
||||
engine
|
||||
|
||||
@@ -40,6 +40,7 @@ async fn test_scan_without_filtering_deleted() {
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
env.get_kv_backend(),
|
||||
)
|
||||
.await;
|
||||
let request = CreateRequestBuilder::new()
|
||||
|
||||
@@ -52,6 +52,7 @@ async fn test_manual_flush() {
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
env.get_kv_backend(),
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -109,6 +110,7 @@ async fn test_flush_engine() {
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
env.get_kv_backend(),
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -178,6 +180,7 @@ async fn test_write_stall() {
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
env.get_kv_backend(),
|
||||
)
|
||||
.await;
|
||||
let request = CreateRequestBuilder::new().build();
|
||||
@@ -251,6 +254,7 @@ async fn test_flush_empty() {
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
env.get_kv_backend(),
|
||||
)
|
||||
.await;
|
||||
let request = CreateRequestBuilder::new().build();
|
||||
@@ -295,6 +299,7 @@ async fn test_flush_reopen_region(factory: Option<LogStoreFactory>) {
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
env.get_kv_backend(),
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -415,6 +420,7 @@ async fn test_auto_flush_engine() {
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
env.get_kv_backend(),
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -484,6 +490,7 @@ async fn test_flush_workers() {
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
env.get_kv_backend(),
|
||||
)
|
||||
.await;
|
||||
|
||||
|
||||
@@ -104,6 +104,7 @@ async fn test_merge_mode_compaction() {
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
env.get_kv_backend(),
|
||||
)
|
||||
.await;
|
||||
|
||||
|
||||
@@ -252,6 +252,7 @@ async fn test_open_region_skip_wal_replay() {
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
env.get_kv_backend(),
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -441,6 +442,7 @@ async fn test_open_compaction_region() {
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
env.get_kv_backend(),
|
||||
)
|
||||
.await;
|
||||
let request = CreateRequestBuilder::new().build();
|
||||
|
||||
@@ -84,6 +84,7 @@ async fn test_parallel_scan() {
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
env.get_kv_backend(),
|
||||
)
|
||||
.await;
|
||||
|
||||
|
||||
@@ -159,6 +159,7 @@ async fn test_prune_memtable() {
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
env.get_kv_backend(),
|
||||
)
|
||||
.await;
|
||||
|
||||
|
||||
@@ -36,6 +36,7 @@ async fn test_last_row(append_mode: bool) {
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
env.get_kv_backend(),
|
||||
)
|
||||
.await;
|
||||
let request = CreateRequestBuilder::new()
|
||||
|
||||
@@ -159,6 +159,7 @@ async fn test_engine_truncate_after_flush() {
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
env.get_kv_backend(),
|
||||
)
|
||||
.await;
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
//! Flush related utilities and structs.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::num::NonZeroU64;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
@@ -345,6 +346,7 @@ impl RegionFlushTask {
|
||||
continue;
|
||||
}
|
||||
|
||||
let max_sequence = mem.stats().max_sequence();
|
||||
let file_id = FileId::random();
|
||||
let iter = mem.iter(None, None)?;
|
||||
let source = Source::Iter(iter);
|
||||
@@ -357,6 +359,7 @@ impl RegionFlushTask {
|
||||
source,
|
||||
cache_manager: self.cache_manager.clone(),
|
||||
storage: version.options.storage.clone(),
|
||||
max_sequence: Some(max_sequence),
|
||||
index_options: self.index_options.clone(),
|
||||
inverted_index_config: self.engine_config.inverted_index.clone(),
|
||||
fulltext_index_config: self.engine_config.fulltext_index.clone(),
|
||||
@@ -382,6 +385,7 @@ impl RegionFlushTask {
|
||||
index_file_size: sst_info.index_metadata.file_size,
|
||||
num_rows: sst_info.num_rows as u64,
|
||||
num_row_groups: sst_info.num_row_groups,
|
||||
sequence: NonZeroU64::new(max_sequence),
|
||||
};
|
||||
file_metas.push(file_meta);
|
||||
}
|
||||
|
||||
@@ -225,6 +225,7 @@ async fn checkpoint_with_different_compression_types() {
|
||||
index_file_size: 0,
|
||||
num_rows: 0,
|
||||
num_row_groups: 0,
|
||||
sequence: None,
|
||||
};
|
||||
let action = RegionMetaActionList::new(vec![RegionMetaAction::Edit(RegionEdit {
|
||||
files_to_add: vec![file_meta],
|
||||
|
||||
@@ -23,7 +23,7 @@ pub use bulk::part::BulkPart;
use common_time::Timestamp;
use serde::{Deserialize, Serialize};
use store_api::metadata::RegionMetadataRef;
use store_api::storage::ColumnId;
use store_api::storage::{ColumnId, SequenceNumber};
use table::predicate::Predicate;

use crate::config::MitoConfig;
@@ -70,13 +70,15 @@ impl Default for MemtableConfig {
pub struct MemtableStats {
    /// The estimated bytes allocated by this memtable from heap.
    estimated_bytes: usize,
    /// The time range that this memtable contains. It is None if
    /// The inclusive time range that this memtable contains. It is None if
    /// and only if the memtable is empty.
    time_range: Option<(Timestamp, Timestamp)>,
    /// Total rows in memtable
    num_rows: usize,
    /// Total number of ranges in the memtable.
    num_ranges: usize,
    /// The maximum sequence number in the memtable.
    max_sequence: SequenceNumber,
}

impl MemtableStats {
@@ -106,6 +108,11 @@ impl MemtableStats {
    pub fn num_ranges(&self) -> usize {
        self.num_ranges
    }

    /// Returns the maximum sequence number in the memtable.
    pub fn max_sequence(&self) -> SequenceNumber {
        self.max_sequence
    }
}

pub type BoxedBatchIterator = Box<dyn Iterator<Item = Result<Batch>> + Send>;
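max_sequence joins the other read-only accessors on MemtableStats, so a flush task can skip empty memtables and stamp each flushed SST with that memtable's newest sequence (see the flush hunk later in this diff). A simplified stand-in, not the real structs, showing that aggregation:

/// Simplified snapshot of the fields the sketch needs.
struct Stats {
    num_rows: usize,
    max_sequence: u64,
}

fn main() {
    let memtables = [
        Stats { num_rows: 0, max_sequence: 0 },
        Stats { num_rows: 128, max_sequence: 41 },
        Stats { num_rows: 64, max_sequence: 97 },
    ];

    // Skip empty memtables and collect the sequence each SST would record.
    let flushed: Vec<u64> = memtables
        .iter()
        .filter(|stats| stats.num_rows > 0)
        .map(|stats| stats.max_sequence)
        .collect();
    assert_eq!(flushed, vec![41, 97]);
}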
@@ -63,6 +63,25 @@ impl KeyValues {
        // Safety: rows is not None.
        self.mutation.rows.as_ref().unwrap().rows.len()
    }

    /// Returns if this container is empty
    pub fn is_empty(&self) -> bool {
        self.mutation.rows.is_none()
    }

    /// Return the max sequence in this container.
    ///
    /// When the mutation has no rows, the sequence is the same as the mutation sequence.
    pub fn max_sequence(&self) -> SequenceNumber {
        let mut sequence = self.mutation.sequence;
        let num_rows = self.mutation.rows.as_ref().unwrap().rows.len() as u64;
        sequence += num_rows;
        if num_rows > 0 {
            sequence -= 1;
        }

        sequence
    }
}

/// Key value view of a mutation.
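The arithmetic above assigns the rows of a mutation the consecutive sequences starting at the mutation sequence, so the maximum is sequence + num_rows - 1 for a non-empty mutation. A small standalone check of the same calculation:

fn max_sequence(mutation_sequence: u64, num_rows: u64) -> u64 {
    // Rows get the consecutive sequences [mutation_sequence, mutation_sequence + num_rows),
    // so the largest assigned sequence is mutation_sequence + num_rows - 1.
    // With zero rows the mutation sequence itself is returned.
    let mut sequence = mutation_sequence + num_rows;
    if num_rows > 0 {
        sequence -= 1;
    }
    sequence
}

fn main() {
    assert_eq!(max_sequence(10, 4), 13); // rows get sequences 10, 11, 12, 13
    assert_eq!(max_sequence(10, 1), 10);
    assert_eq!(max_sequence(10, 0), 10); // empty mutation keeps its own sequence
}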
@@ -24,7 +24,7 @@ mod shard_builder;
|
||||
mod tree;
|
||||
|
||||
use std::fmt;
|
||||
use std::sync::atomic::{AtomicI64, AtomicUsize, Ordering};
|
||||
use std::sync::atomic::{AtomicI64, AtomicU64, AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_base::readable_size::ReadableSize;
|
||||
@@ -113,6 +113,7 @@ pub struct PartitionTreeMemtable {
|
||||
alloc_tracker: AllocTracker,
|
||||
max_timestamp: AtomicI64,
|
||||
min_timestamp: AtomicI64,
|
||||
max_sequence: AtomicU64,
|
||||
/// Total written rows in memtable. This also includes deleted and duplicated rows.
|
||||
num_rows: AtomicUsize,
|
||||
}
|
||||
@@ -131,6 +132,10 @@ impl Memtable for PartitionTreeMemtable {
|
||||
}
|
||||
|
||||
fn write(&self, kvs: &KeyValues) -> Result<()> {
|
||||
if kvs.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// TODO(yingwen): Validate schema while inserting rows.
|
||||
|
||||
let mut metrics = WriteMetrics::default();
|
||||
@@ -140,6 +145,12 @@ impl Memtable for PartitionTreeMemtable {
|
||||
|
||||
self.update_stats(&metrics);
|
||||
|
||||
// update max_sequence
|
||||
if res.is_ok() {
|
||||
let sequence = kvs.max_sequence();
|
||||
self.max_sequence.fetch_max(sequence, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
self.num_rows.fetch_add(kvs.num_rows(), Ordering::Relaxed);
|
||||
res
|
||||
}
|
||||
@@ -152,6 +163,12 @@ impl Memtable for PartitionTreeMemtable {
|
||||
|
||||
self.update_stats(&metrics);
|
||||
|
||||
// update max_sequence
|
||||
if res.is_ok() {
|
||||
self.max_sequence
|
||||
.fetch_max(key_value.sequence(), Ordering::Relaxed);
|
||||
}
|
||||
|
||||
self.num_rows.fetch_add(1, Ordering::Relaxed);
|
||||
res
|
||||
}
|
||||
@@ -210,6 +227,7 @@ impl Memtable for PartitionTreeMemtable {
|
||||
time_range: None,
|
||||
num_rows: 0,
|
||||
num_ranges: 0,
|
||||
max_sequence: 0,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -229,6 +247,7 @@ impl Memtable for PartitionTreeMemtable {
|
||||
time_range: Some((min_timestamp, max_timestamp)),
|
||||
num_rows: self.num_rows.load(Ordering::Relaxed),
|
||||
num_ranges: 1,
|
||||
max_sequence: self.max_sequence.load(Ordering::Relaxed),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -267,6 +286,7 @@ impl PartitionTreeMemtable {
|
||||
max_timestamp: AtomicI64::new(i64::MIN),
|
||||
min_timestamp: AtomicI64::new(i64::MAX),
|
||||
num_rows: AtomicUsize::new(0),
|
||||
max_sequence: AtomicU64::new(0),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -168,8 +168,11 @@ impl TimePartitions {
        Ok(())
    }

    /// Forks latest partition.
    pub fn fork(&self, metadata: &RegionMetadataRef) -> Self {
    /// Forks latest partition and updates the partition duration if `part_duration` is Some.
    pub fn fork(&self, metadata: &RegionMetadataRef, part_duration: Option<Duration>) -> Self {
        // Fall back to the existing partition duration.
        let part_duration = part_duration.or(self.part_duration);

        let mut inner = self.inner.lock().unwrap();
        let latest_part = inner
            .parts
@@ -178,24 +181,39 @@ impl TimePartitions {
            .cloned();

        let Some(old_part) = latest_part else {
            // If there is no partition, then we create a new partition with the new duration.
            return Self::new(
                metadata.clone(),
                self.builder.clone(),
                inner.next_memtable_id,
                self.part_duration,
                part_duration,
            );
        };

        let old_stats = old_part.memtable.stats();
        // Use the max timestamp to compute the new time range for the memtable.
        // If `part_duration` is None, the new range will be None.
        let new_time_range =
            old_stats
                .time_range()
                .zip(part_duration)
                .and_then(|(range, bucket)| {
                    partition_start_timestamp(range.1, bucket)
                        .and_then(|start| PartTimeRange::from_start_duration(start, bucket))
                });
        // Forks the latest partition, but compute the time range based on the new duration.
        let memtable = old_part.memtable.fork(inner.alloc_memtable_id(), metadata);
        let new_part = TimePartition {
            memtable,
            time_range: old_part.time_range,
            time_range: new_time_range,
        };

        Self {
            inner: Mutex::new(PartitionsInner::with_partition(
                new_part,
                inner.next_memtable_id,
            )),
            part_duration: self.part_duration,
            part_duration,
            metadata: metadata.clone(),
            builder: self.builder.clone(),
        }
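fork() derives the forked partition's range from the old memtable's max timestamp and the (possibly updated) partition duration. The helper below illustrates the alignment assumed behind partition_start_timestamp; it is a sketch, not the crate's implementation.

fn partition_start(ts_millis: i64, bucket_millis: i64) -> i64 {
    // Assumed behavior: align the latest timestamp down to the start of its
    // partition bucket. The forked partition then covers [start, start + bucket).
    ts_millis.div_euclid(bucket_millis) * bucket_millis
}

fn main() {
    let bucket = 5_000; // 5s partition duration, as in the tests below
    // Old memtable's max timestamp is 7000ms, so the forked partition
    // covers [5000, 10000).
    let start = partition_start(7_000, bucket);
    assert_eq!(start, 5_000);
    assert_eq!((start, start + bucket), (5_000, 10_000));

    // A max timestamp of exactly 5000ms starts its own bucket.
    assert_eq!(partition_start(5_000, bucket), 5_000);
    // Negative timestamps align toward negative infinity.
    assert_eq!(partition_start(-1, bucket), -5_000);
}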
@@ -238,6 +256,19 @@ impl TimePartitions {
|
||||
inner.next_memtable_id
|
||||
}
|
||||
|
||||
/// Creates a new empty partition list from this list and a `part_duration`.
|
||||
/// It falls back to the old partition duration if `part_duration` is `None`.
|
||||
pub(crate) fn new_with_part_duration(&self, part_duration: Option<Duration>) -> Self {
|
||||
debug_assert!(self.is_empty());
|
||||
|
||||
Self::new(
|
||||
self.metadata.clone(),
|
||||
self.builder.clone(),
|
||||
self.next_memtable_id(),
|
||||
part_duration.or(self.part_duration),
|
||||
)
|
||||
}
|
||||
|
||||
/// Returns all partitions.
|
||||
fn list_partitions(&self) -> PartitionVec {
|
||||
let inner = self.inner.lock().unwrap();
|
||||
@@ -447,9 +478,9 @@ mod tests {
|
||||
|
||||
assert_eq!(1, partitions.num_partitions());
|
||||
assert!(!partitions.is_empty());
|
||||
assert!(!partitions.is_empty());
|
||||
let mut memtables = Vec::new();
|
||||
partitions.list_memtables(&mut memtables);
|
||||
assert_eq!(0, memtables[0].id());
|
||||
|
||||
let iter = memtables[0].iter(None, None).unwrap();
|
||||
let timestamps = collect_iter_timestamps(iter);
|
||||
@@ -503,16 +534,14 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_write_multi_parts() {
|
||||
let metadata = memtable_util::metadata_for_test();
|
||||
fn new_multi_partitions(metadata: &RegionMetadataRef) -> TimePartitions {
|
||||
let builder = Arc::new(PartitionTreeMemtableBuilder::default());
|
||||
let partitions =
|
||||
TimePartitions::new(metadata.clone(), builder, 0, Some(Duration::from_secs(5)));
|
||||
assert_eq!(0, partitions.num_partitions());
|
||||
|
||||
let kvs = memtable_util::build_key_values(
|
||||
&metadata,
|
||||
metadata,
|
||||
"hello".to_string(),
|
||||
0,
|
||||
&[2000, 0],
|
||||
@@ -524,7 +553,7 @@ mod tests {
|
||||
assert!(!partitions.is_empty());
|
||||
|
||||
let kvs = memtable_util::build_key_values(
|
||||
&metadata,
|
||||
metadata,
|
||||
"hello".to_string(),
|
||||
0,
|
||||
&[3000, 7000, 4000, 5000],
|
||||
@@ -534,9 +563,18 @@ mod tests {
|
||||
partitions.write(&kvs).unwrap();
|
||||
assert_eq!(2, partitions.num_partitions());
|
||||
|
||||
partitions
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_write_multi_parts() {
|
||||
let metadata = memtable_util::metadata_for_test();
|
||||
let partitions = new_multi_partitions(&metadata);
|
||||
|
||||
let parts = partitions.list_partitions();
|
||||
let iter = parts[0].memtable.iter(None, None).unwrap();
|
||||
let timestamps = collect_iter_timestamps(iter);
|
||||
assert_eq!(0, parts[0].memtable.id());
|
||||
assert_eq!(
|
||||
Timestamp::new_millisecond(0),
|
||||
parts[0].time_range.unwrap().min_timestamp
|
||||
@@ -547,6 +585,7 @@ mod tests {
|
||||
);
|
||||
assert_eq!(&[0, 2000, 3000, 4000], ×tamps[..]);
|
||||
let iter = parts[1].memtable.iter(None, None).unwrap();
|
||||
assert_eq!(1, parts[1].memtable.id());
|
||||
let timestamps = collect_iter_timestamps(iter);
|
||||
assert_eq!(&[5000, 7000], ×tamps[..]);
|
||||
assert_eq!(
|
||||
@@ -558,4 +597,85 @@ mod tests {
|
||||
parts[1].time_range.unwrap().max_timestamp
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_new_with_part_duration() {
|
||||
let metadata = memtable_util::metadata_for_test();
|
||||
let builder = Arc::new(PartitionTreeMemtableBuilder::default());
|
||||
let partitions = TimePartitions::new(metadata.clone(), builder.clone(), 0, None);
|
||||
|
||||
let new_parts = partitions.new_with_part_duration(Some(Duration::from_secs(5)));
|
||||
assert_eq!(Duration::from_secs(5), new_parts.part_duration().unwrap());
|
||||
assert_eq!(1, new_parts.next_memtable_id());
|
||||
|
||||
// Won't update the duration if it's None.
|
||||
let new_parts = new_parts.new_with_part_duration(None);
|
||||
assert_eq!(Duration::from_secs(5), new_parts.part_duration().unwrap());
|
||||
// Don't need to create new memtables.
|
||||
assert_eq!(1, new_parts.next_memtable_id());
|
||||
|
||||
let new_parts = new_parts.new_with_part_duration(Some(Duration::from_secs(10)));
|
||||
assert_eq!(Duration::from_secs(10), new_parts.part_duration().unwrap());
|
||||
// Don't need to create new memtables.
|
||||
assert_eq!(1, new_parts.next_memtable_id());
|
||||
|
||||
let builder = Arc::new(PartitionTreeMemtableBuilder::default());
|
||||
let partitions = TimePartitions::new(metadata.clone(), builder.clone(), 0, None);
|
||||
// Need to build a new memtable as duration is still None.
|
||||
let new_parts = partitions.new_with_part_duration(None);
|
||||
assert!(new_parts.part_duration().is_none());
|
||||
assert_eq!(2, new_parts.next_memtable_id());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_fork_empty() {
|
||||
let metadata = memtable_util::metadata_for_test();
|
||||
let builder = Arc::new(PartitionTreeMemtableBuilder::default());
|
||||
let partitions = TimePartitions::new(metadata.clone(), builder, 0, None);
|
||||
partitions.freeze().unwrap();
|
||||
let new_parts = partitions.fork(&metadata, None);
|
||||
assert!(new_parts.part_duration().is_none());
|
||||
assert_eq!(1, new_parts.list_partitions()[0].memtable.id());
|
||||
assert_eq!(2, new_parts.next_memtable_id());
|
||||
|
||||
new_parts.freeze().unwrap();
|
||||
let new_parts = new_parts.fork(&metadata, Some(Duration::from_secs(5)));
|
||||
assert_eq!(Duration::from_secs(5), new_parts.part_duration().unwrap());
|
||||
assert_eq!(2, new_parts.list_partitions()[0].memtable.id());
|
||||
assert_eq!(3, new_parts.next_memtable_id());
|
||||
|
||||
new_parts.freeze().unwrap();
|
||||
let new_parts = new_parts.fork(&metadata, None);
|
||||
// Won't update the duration.
|
||||
assert_eq!(Duration::from_secs(5), new_parts.part_duration().unwrap());
|
||||
assert_eq!(3, new_parts.list_partitions()[0].memtable.id());
|
||||
assert_eq!(4, new_parts.next_memtable_id());
|
||||
|
||||
new_parts.freeze().unwrap();
|
||||
let new_parts = new_parts.fork(&metadata, Some(Duration::from_secs(10)));
|
||||
assert_eq!(Duration::from_secs(10), new_parts.part_duration().unwrap());
|
||||
assert_eq!(4, new_parts.list_partitions()[0].memtable.id());
|
||||
assert_eq!(5, new_parts.next_memtable_id());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_fork_non_empty_none() {
|
||||
let metadata = memtable_util::metadata_for_test();
|
||||
let partitions = new_multi_partitions(&metadata);
|
||||
partitions.freeze().unwrap();
|
||||
|
||||
// Won't update the duration.
|
||||
let new_parts = partitions.fork(&metadata, None);
|
||||
assert!(new_parts.is_empty());
|
||||
assert_eq!(Duration::from_secs(5), new_parts.part_duration().unwrap());
|
||||
assert_eq!(2, new_parts.list_partitions()[0].memtable.id());
|
||||
assert_eq!(3, new_parts.next_memtable_id());
|
||||
|
||||
// Although we don't fork a memtable multiple times, we still add a test for it.
|
||||
let new_parts = partitions.fork(&metadata, Some(Duration::from_secs(10)));
|
||||
assert!(new_parts.is_empty());
|
||||
assert_eq!(Duration::from_secs(10), new_parts.part_duration().unwrap());
|
||||
assert_eq!(3, new_parts.list_partitions()[0].memtable.id());
|
||||
assert_eq!(4, new_parts.next_memtable_id());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
use std::collections::btree_map::Entry;
|
||||
use std::collections::{BTreeMap, Bound, HashSet};
|
||||
use std::fmt::{Debug, Formatter};
|
||||
use std::sync::atomic::{AtomicI64, AtomicUsize, Ordering};
|
||||
use std::sync::atomic::{AtomicI64, AtomicU64, AtomicUsize, Ordering};
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
@@ -100,6 +100,7 @@ pub struct TimeSeriesMemtable {
|
||||
alloc_tracker: AllocTracker,
|
||||
max_timestamp: AtomicI64,
|
||||
min_timestamp: AtomicI64,
|
||||
max_sequence: AtomicU64,
|
||||
dedup: bool,
|
||||
merge_mode: MergeMode,
|
||||
/// Total written rows in memtable. This also includes deleted and duplicated rows.
|
||||
@@ -134,6 +135,7 @@ impl TimeSeriesMemtable {
|
||||
alloc_tracker: AllocTracker::new(write_buffer_manager),
|
||||
max_timestamp: AtomicI64::new(i64::MIN),
|
||||
min_timestamp: AtomicI64::new(i64::MAX),
|
||||
max_sequence: AtomicU64::new(0),
|
||||
dedup,
|
||||
merge_mode,
|
||||
num_rows: Default::default(),
|
||||
@@ -186,6 +188,10 @@ impl Memtable for TimeSeriesMemtable {
|
||||
}
|
||||
|
||||
fn write(&self, kvs: &KeyValues) -> Result<()> {
|
||||
if kvs.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut local_stats = WriteMetrics::default();
|
||||
|
||||
for kv in kvs.iter() {
|
||||
@@ -199,6 +205,10 @@ impl Memtable for TimeSeriesMemtable {
|
||||
// so that we can ensure writing to memtable will succeed.
|
||||
self.update_stats(local_stats);
|
||||
|
||||
// update max_sequence
|
||||
let sequence = kvs.max_sequence();
|
||||
self.max_sequence.fetch_max(sequence, Ordering::Relaxed);
|
||||
|
||||
self.num_rows.fetch_add(kvs.num_rows(), Ordering::Relaxed);
|
||||
Ok(())
|
||||
}
|
||||
@@ -209,6 +219,13 @@ impl Memtable for TimeSeriesMemtable {
|
||||
metrics.value_bytes += std::mem::size_of::<Timestamp>() + std::mem::size_of::<OpType>();
|
||||
|
||||
self.update_stats(metrics);
|
||||
|
||||
// update max_sequence
|
||||
if res.is_ok() {
|
||||
self.max_sequence
|
||||
.fetch_max(key_value.sequence(), Ordering::Relaxed);
|
||||
}
|
||||
|
||||
self.num_rows.fetch_add(1, Ordering::Relaxed);
|
||||
res
|
||||
}
|
||||
@@ -294,6 +311,7 @@ impl Memtable for TimeSeriesMemtable {
|
||||
time_range: None,
|
||||
num_rows: 0,
|
||||
num_ranges: 0,
|
||||
max_sequence: 0,
|
||||
};
|
||||
}
|
||||
let ts_type = self
|
||||
@@ -311,6 +329,7 @@ impl Memtable for TimeSeriesMemtable {
|
||||
time_range: Some((min_timestamp, max_timestamp)),
|
||||
num_rows: self.num_rows.load(Ordering::Relaxed),
|
||||
num_ranges: 1,
|
||||
max_sequence: self.max_sequence.load(Ordering::Relaxed),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
//! Memtable version.
|
||||
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use smallvec::SmallVec;
|
||||
use store_api::metadata::RegionMetadataRef;
|
||||
@@ -65,27 +66,53 @@ impl MemtableVersion {
|
||||
/// Returns a new [MemtableVersion] which switches the old mutable memtable to immutable
|
||||
/// memtable.
|
||||
///
|
||||
/// It will switch to use the `time_window` provided.
|
||||
///
|
||||
/// Returns `None` if the mutable memtable is empty and the time window is unchanged.
|
||||
pub(crate) fn freeze_mutable(
|
||||
&self,
|
||||
metadata: &RegionMetadataRef,
|
||||
time_window: Option<Duration>,
|
||||
) -> Result<Option<MemtableVersion>> {
|
||||
if self.mutable.is_empty() {
|
||||
// No need to freeze the mutable memtable.
|
||||
return Ok(None);
|
||||
// No need to freeze the mutable memtable, but we need to check the time window.
|
||||
if self.mutable.part_duration() == time_window {
|
||||
// If the time window is the same, we don't need to update it.
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// Update the time window.
|
||||
let mutable = self.mutable.new_with_part_duration(time_window);
|
||||
common_telemetry::debug!(
|
||||
"Freeze empty memtable, update partition duration from {:?} to {:?}",
|
||||
self.mutable.part_duration(),
|
||||
time_window
|
||||
);
|
||||
return Ok(Some(MemtableVersion {
|
||||
mutable: Arc::new(mutable),
|
||||
immutables: self.immutables.clone(),
|
||||
}));
|
||||
}
|
||||
|
||||
// Marks the mutable memtable as immutable so it can free the memory usage from our
|
||||
// soft limit.
|
||||
self.mutable.freeze()?;
|
||||
// Fork the memtable.
|
||||
let mutable = Arc::new(self.mutable.fork(metadata));
|
||||
if self.mutable.part_duration() != time_window {
|
||||
common_telemetry::debug!(
|
||||
"Fork memtable, update partition duration from {:?}, to {:?}",
|
||||
self.mutable.part_duration(),
|
||||
time_window
|
||||
);
|
||||
}
|
||||
let mutable = Arc::new(self.mutable.fork(metadata, time_window));
|
||||
|
||||
// Pushes the mutable memtable to immutable list.
|
||||
let mut immutables =
|
||||
SmallVec::with_capacity(self.immutables.len() + self.mutable.num_partitions());
|
||||
self.mutable.list_memtables_to_small_vec(&mut immutables);
|
||||
immutables.extend(self.immutables.iter().cloned());
|
||||
// Pushes the mutable memtable to immutable list.
|
||||
self.mutable.list_memtables_to_small_vec(&mut immutables);
|
||||
|
||||
Ok(Some(MemtableVersion {
|
||||
mutable,
|
||||
immutables,
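
Summarizing the branches above: an empty mutable memtable is left alone when the requested time window already matches, rebuilt with the new window when it differs, and a non-empty one is frozen and forked with the new window. A simplified sketch of that decision (hypothetical names; the real method returns `Result<Option<MemtableVersion>>`):

use std::time::Duration;

// Hypothetical summary type; only the three outcomes matter.
#[derive(Debug, PartialEq)]
enum FreezeOutcome {
    // Mutable is empty and already uses the requested window: nothing to do.
    Unchanged,
    // Mutable is empty but the window changed: rebuild it with the new window.
    RebuildEmptyMutable(Option<Duration>),
    // Mutable has data: freeze it, fork with the new window, move parts to immutables.
    FreezeAndFork(Option<Duration>),
}

fn decide(
    mutable_is_empty: bool,
    current_window: Option<Duration>,
    requested_window: Option<Duration>,
) -> FreezeOutcome {
    if mutable_is_empty {
        if current_window == requested_window {
            FreezeOutcome::Unchanged
        } else {
            FreezeOutcome::RebuildEmptyMutable(requested_window)
        }
    } else {
        FreezeOutcome::FreezeAndFork(requested_window)
    }
}

fn main() {
    let window = Some(Duration::from_secs(10));
    assert_eq!(decide(true, None, window), FreezeOutcome::RebuildEmptyMutable(window));
    assert_eq!(decide(true, window, window), FreezeOutcome::Unchanged);
    assert_eq!(decide(false, None, window), FreezeOutcome::FreezeAndFork(window));
}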
@@ -21,7 +21,7 @@ use datatypes::vectors::UInt32Vector;
|
||||
use store_api::storage::TimeSeriesRowSelector;
|
||||
|
||||
use crate::cache::{
|
||||
selector_result_cache_hit, selector_result_cache_miss, CacheManagerRef, SelectorResultKey,
|
||||
selector_result_cache_hit, selector_result_cache_miss, CacheStrategy, SelectorResultKey,
|
||||
SelectorResultValue,
|
||||
};
|
||||
use crate::error::Result;
|
||||
@@ -86,7 +86,7 @@ impl RowGroupLastRowCachedReader {
|
||||
pub(crate) fn new(
|
||||
file_id: FileId,
|
||||
row_group_idx: usize,
|
||||
cache_manager: Option<CacheManagerRef>,
|
||||
cache_strategy: CacheStrategy,
|
||||
row_group_reader: RowGroupReader,
|
||||
) -> Self {
|
||||
let key = SelectorResultKey {
|
||||
@@ -95,20 +95,17 @@ impl RowGroupLastRowCachedReader {
|
||||
selector: TimeSeriesRowSelector::LastRow,
|
||||
};
|
||||
|
||||
let Some(cache_manager) = cache_manager else {
|
||||
return Self::new_miss(key, row_group_reader, None);
|
||||
};
|
||||
if let Some(value) = cache_manager.get_selector_result(&key) {
|
||||
if let Some(value) = cache_strategy.get_selector_result(&key) {
|
||||
let schema_matches =
|
||||
value.projection == row_group_reader.read_format().projection_indices();
|
||||
if schema_matches {
|
||||
// Schema matches, use cache batches.
|
||||
Self::new_hit(value)
|
||||
} else {
|
||||
Self::new_miss(key, row_group_reader, Some(cache_manager))
|
||||
Self::new_miss(key, row_group_reader, cache_strategy)
|
||||
}
|
||||
} else {
|
||||
Self::new_miss(key, row_group_reader, Some(cache_manager))
|
||||
Self::new_miss(key, row_group_reader, cache_strategy)
|
||||
}
|
||||
}
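
The hunks in this file (and in the readers below) replace `Option<CacheManagerRef>` with a `CacheStrategy` value, so call sites no longer branch on `Option`. The enum itself is not shown in this diff; a minimal sketch of the shape implied by the variants used here (`EnableAll`, `Disabled`), with stand-in cache types, could look like:

use std::sync::Arc;

// Stand-ins for the real cache types; only the dispatch shape matters here.
struct CacheManager;
struct PageKey;
struct PageValue;

impl CacheManager {
    fn get_pages(&self, _key: &PageKey) -> Option<Arc<PageValue>> {
        None
    }
    fn put_pages(&self, _key: PageKey, _pages: Arc<PageValue>) {}
}

// Readers hold a CacheStrategy by value; `Disabled` makes getters return `None`
// and putters no-ops, which removes the `if let Some(cache)` branches.
#[derive(Clone)]
enum CacheStrategy {
    EnableAll(Arc<CacheManager>),
    Disabled,
}

impl CacheStrategy {
    fn get_pages(&self, key: &PageKey) -> Option<Arc<PageValue>> {
        match self {
            CacheStrategy::EnableAll(manager) => manager.get_pages(key),
            CacheStrategy::Disabled => None,
        }
    }

    fn put_pages(&self, key: PageKey, pages: Arc<PageValue>) {
        if let CacheStrategy::EnableAll(manager) = self {
            manager.put_pages(key, pages);
        }
    }
}

fn main() {
    let strategy = CacheStrategy::EnableAll(Arc::new(CacheManager));
    strategy.put_pages(PageKey, Arc::new(PageValue));
    assert!(strategy.get_pages(&PageKey).is_none());
    assert!(CacheStrategy::Disabled.get_pages(&PageKey).is_none());
}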
@@ -130,13 +127,13 @@ impl RowGroupLastRowCachedReader {
|
||||
fn new_miss(
|
||||
key: SelectorResultKey,
|
||||
row_group_reader: RowGroupReader,
|
||||
cache_manager: Option<CacheManagerRef>,
|
||||
cache_strategy: CacheStrategy,
|
||||
) -> Self {
|
||||
selector_result_cache_miss();
|
||||
Self::Miss(RowGroupLastRowReader::new(
|
||||
key,
|
||||
row_group_reader,
|
||||
cache_manager,
|
||||
cache_strategy,
|
||||
))
|
||||
}
|
||||
}
|
||||
@@ -175,23 +172,19 @@ pub(crate) struct RowGroupLastRowReader {
|
||||
reader: RowGroupReader,
|
||||
selector: LastRowSelector,
|
||||
yielded_batches: Vec<Batch>,
|
||||
cache_manager: Option<CacheManagerRef>,
|
||||
cache_strategy: CacheStrategy,
|
||||
/// Index buffer to take a new batch from the last row.
|
||||
take_index: UInt32Vector,
|
||||
}
|
||||
|
||||
impl RowGroupLastRowReader {
|
||||
fn new(
|
||||
key: SelectorResultKey,
|
||||
reader: RowGroupReader,
|
||||
cache_manager: Option<CacheManagerRef>,
|
||||
) -> Self {
|
||||
fn new(key: SelectorResultKey, reader: RowGroupReader, cache_strategy: CacheStrategy) -> Self {
|
||||
Self {
|
||||
key,
|
||||
reader,
|
||||
selector: LastRowSelector::default(),
|
||||
yielded_batches: vec![],
|
||||
cache_manager,
|
||||
cache_strategy,
|
||||
take_index: UInt32Vector::from_vec(vec![0]),
|
||||
}
|
||||
}
|
||||
@@ -221,17 +214,15 @@ impl RowGroupLastRowReader {
|
||||
|
||||
/// Updates row group's last row cache if cache manager is present.
|
||||
fn maybe_update_cache(&mut self) {
|
||||
if let Some(cache) = &self.cache_manager {
|
||||
if self.yielded_batches.is_empty() {
|
||||
// we always expect that row groups yield batches.
|
||||
return;
|
||||
}
|
||||
let value = Arc::new(SelectorResultValue {
|
||||
result: std::mem::take(&mut self.yielded_batches),
|
||||
projection: self.reader.read_format().projection_indices().to_vec(),
|
||||
});
|
||||
cache.put_selector_result(self.key, value)
|
||||
if self.yielded_batches.is_empty() {
|
||||
// we always expect that row groups yield batches.
|
||||
return;
|
||||
}
|
||||
let value = Arc::new(SelectorResultValue {
|
||||
result: std::mem::take(&mut self.yielded_batches),
|
||||
projection: self.reader.read_format().projection_indices().to_vec(),
|
||||
});
|
||||
self.cache_strategy.put_selector_result(self.key, value);
|
||||
}
|
||||
|
||||
fn metrics(&self) -> &ReaderMetrics {
|
||||
|
||||
@@ -30,7 +30,7 @@ use snafu::{OptionExt, ResultExt};
|
||||
use store_api::metadata::RegionMetadataRef;
|
||||
use store_api::storage::ColumnId;
|
||||
|
||||
use crate::cache::CacheManager;
|
||||
use crate::cache::CacheStrategy;
|
||||
use crate::error::{InvalidRequestSnafu, Result};
|
||||
use crate::read::Batch;
|
||||
use crate::row_converter::{McmpRowCodec, RowCodec, SortField};
|
||||
@@ -171,7 +171,7 @@ impl ProjectionMapper {
|
||||
pub(crate) fn convert(
|
||||
&self,
|
||||
batch: &Batch,
|
||||
cache_manager: Option<&CacheManager>,
|
||||
cache_strategy: &CacheStrategy,
|
||||
) -> common_recordbatch::error::Result<RecordBatch> {
|
||||
debug_assert_eq!(self.batch_fields.len(), batch.fields().len());
|
||||
debug_assert!(self
|
||||
@@ -204,15 +204,12 @@ impl ProjectionMapper {
|
||||
match index {
|
||||
BatchIndex::Tag(idx) => {
|
||||
let value = &pk_values[*idx];
|
||||
let vector = match cache_manager {
|
||||
Some(cache) => repeated_vector_with_cache(
|
||||
&column_schema.data_type,
|
||||
value,
|
||||
num_rows,
|
||||
cache,
|
||||
)?,
|
||||
None => new_repeated_vector(&column_schema.data_type, value, num_rows)?,
|
||||
};
|
||||
let vector = repeated_vector_with_cache(
|
||||
&column_schema.data_type,
|
||||
value,
|
||||
num_rows,
|
||||
cache_strategy,
|
||||
)?;
|
||||
columns.push(vector);
|
||||
}
|
||||
BatchIndex::Timestamp => {
|
||||
@@ -244,9 +241,9 @@ fn repeated_vector_with_cache(
|
||||
data_type: &ConcreteDataType,
|
||||
value: &Value,
|
||||
num_rows: usize,
|
||||
cache_manager: &CacheManager,
|
||||
cache_strategy: &CacheStrategy,
|
||||
) -> common_recordbatch::error::Result<VectorRef> {
|
||||
if let Some(vector) = cache_manager.get_repeated_vector(data_type, value) {
|
||||
if let Some(vector) = cache_strategy.get_repeated_vector(data_type, value) {
|
||||
// Tries to get the vector from cache manager. If the vector doesn't
|
||||
// have enough length, creates a new one.
|
||||
match vector.len().cmp(&num_rows) {
|
||||
@@ -260,7 +257,7 @@ fn repeated_vector_with_cache(
|
||||
let vector = new_repeated_vector(data_type, value, num_rows)?;
|
||||
// Updates cache.
|
||||
if vector.len() <= MAX_VECTOR_LENGTH_TO_CACHE {
|
||||
cache_manager.put_repeated_vector(value.clone(), vector.clone());
|
||||
cache_strategy.put_repeated_vector(value.clone(), vector.clone());
|
||||
}
|
||||
|
||||
Ok(vector)
|
||||
@@ -284,12 +281,15 @@ fn new_repeated_vector(
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use api::v1::OpType;
|
||||
use datatypes::arrow::array::{Int64Array, TimestampMillisecondArray, UInt64Array, UInt8Array};
|
||||
use datatypes::arrow::util::pretty;
|
||||
use datatypes::value::ValueRef;
|
||||
|
||||
use super::*;
|
||||
use crate::cache::CacheManager;
|
||||
use crate::read::BatchBuilder;
|
||||
use crate::test_util::meta_util::TestRegionMetadataBuilder;
|
||||
|
||||
@@ -359,8 +359,9 @@ mod tests {
|
||||
|
||||
// With vector cache.
|
||||
let cache = CacheManager::builder().vector_cache_size(1024).build();
|
||||
let cache = CacheStrategy::EnableAll(Arc::new(cache));
|
||||
let batch = new_batch(0, &[1, 2], &[(3, 3), (4, 4)], 3);
|
||||
let record_batch = mapper.convert(&batch, Some(&cache)).unwrap();
|
||||
let record_batch = mapper.convert(&batch, &cache).unwrap();
|
||||
let expect = "\
|
||||
+---------------------+----+----+----+----+
|
||||
| ts | k0 | k1 | v0 | v1 |
|
||||
@@ -380,7 +381,7 @@ mod tests {
|
||||
assert!(cache
|
||||
.get_repeated_vector(&ConcreteDataType::int64_datatype(), &Value::Int64(3))
|
||||
.is_none());
|
||||
let record_batch = mapper.convert(&batch, Some(&cache)).unwrap();
|
||||
let record_batch = mapper.convert(&batch, &cache).unwrap();
|
||||
assert_eq!(expect, print_record_batch(record_batch));
|
||||
}
|
||||
|
||||
@@ -401,7 +402,9 @@ mod tests {
|
||||
);
|
||||
|
||||
let batch = new_batch(0, &[1, 2], &[(4, 4)], 3);
|
||||
let record_batch = mapper.convert(&batch, None).unwrap();
|
||||
let cache = CacheManager::builder().vector_cache_size(1024).build();
|
||||
let cache = CacheStrategy::EnableAll(Arc::new(cache));
|
||||
let record_batch = mapper.convert(&batch, &cache).unwrap();
|
||||
let expect = "\
|
||||
+----+----+
|
||||
| v1 | k0 |
|
||||
|
||||
@@ -22,7 +22,7 @@ use parquet::arrow::arrow_reader::RowSelection;
|
||||
use smallvec::{smallvec, SmallVec};
|
||||
use store_api::region_engine::PartitionRange;
|
||||
|
||||
use crate::cache::CacheManager;
|
||||
use crate::cache::CacheStrategy;
|
||||
use crate::error::Result;
|
||||
use crate::memtable::{MemtableRange, MemtableRanges, MemtableStats};
|
||||
use crate::read::scan_region::ScanInput;
|
||||
@@ -112,7 +112,7 @@ impl RangeMeta {
|
||||
Self::push_unordered_file_ranges(
|
||||
input.memtables.len(),
|
||||
&input.files,
|
||||
input.cache_manager.as_deref(),
|
||||
&input.cache_strategy,
|
||||
&mut ranges,
|
||||
);
|
||||
|
||||
@@ -203,16 +203,15 @@ impl RangeMeta {
|
||||
fn push_unordered_file_ranges(
|
||||
num_memtables: usize,
|
||||
files: &[FileHandle],
|
||||
cache: Option<&CacheManager>,
|
||||
cache: &CacheStrategy,
|
||||
ranges: &mut Vec<RangeMeta>,
|
||||
) {
|
||||
// For append mode, we can parallelize reading row groups.
|
||||
for (i, file) in files.iter().enumerate() {
|
||||
let file_index = num_memtables + i;
|
||||
// Get parquet meta from the cache.
|
||||
let parquet_meta = cache.and_then(|c| {
|
||||
c.get_parquet_meta_data_from_mem_cache(file.region_id(), file.file_id())
|
||||
});
|
||||
let parquet_meta =
|
||||
cache.get_parquet_meta_data_from_mem_cache(file.region_id(), file.file_id());
|
||||
if let Some(parquet_meta) = parquet_meta {
|
||||
// Scans each row group.
|
||||
for row_group_index in 0..file.meta_ref().num_row_groups {
|
||||
|
||||
@@ -33,7 +33,7 @@ use tokio_stream::wrappers::ReceiverStream;
|
||||
|
||||
use crate::access_layer::AccessLayerRef;
|
||||
use crate::cache::file_cache::FileCacheRef;
|
||||
use crate::cache::CacheManagerRef;
|
||||
use crate::cache::CacheStrategy;
|
||||
use crate::config::DEFAULT_SCAN_CHANNEL_SIZE;
|
||||
use crate::error::Result;
|
||||
use crate::memtable::MemtableRange;
|
||||
@@ -171,7 +171,7 @@ pub(crate) struct ScanRegion {
|
||||
/// Scan request.
|
||||
request: ScanRequest,
|
||||
/// Cache.
|
||||
cache_manager: Option<CacheManagerRef>,
|
||||
cache_strategy: CacheStrategy,
|
||||
/// Capacity of the channel to send data from parallel scan tasks to the main task.
|
||||
parallel_scan_channel_size: usize,
|
||||
/// Whether to ignore inverted index.
|
||||
@@ -190,13 +190,13 @@ impl ScanRegion {
|
||||
version: VersionRef,
|
||||
access_layer: AccessLayerRef,
|
||||
request: ScanRequest,
|
||||
cache_manager: Option<CacheManagerRef>,
|
||||
cache_strategy: CacheStrategy,
|
||||
) -> ScanRegion {
|
||||
ScanRegion {
|
||||
version,
|
||||
access_layer,
|
||||
request,
|
||||
cache_manager,
|
||||
cache_strategy,
|
||||
parallel_scan_channel_size: DEFAULT_SCAN_CHANNEL_SIZE,
|
||||
ignore_inverted_index: false,
|
||||
ignore_fulltext_index: false,
|
||||
@@ -357,7 +357,7 @@ impl ScanRegion {
|
||||
.with_predicate(Some(predicate))
|
||||
.with_memtables(memtables)
|
||||
.with_files(files)
|
||||
.with_cache(self.cache_manager)
|
||||
.with_cache(self.cache_strategy)
|
||||
.with_inverted_index_applier(inverted_index_applier)
|
||||
.with_bloom_filter_index_applier(bloom_filter_applier)
|
||||
.with_fulltext_index_applier(fulltext_index_applier)
|
||||
@@ -421,23 +421,14 @@ impl ScanRegion {
|
||||
}
|
||||
|
||||
let file_cache = || -> Option<FileCacheRef> {
|
||||
let cache_manager = self.cache_manager.as_ref()?;
|
||||
let write_cache = cache_manager.write_cache()?;
|
||||
let write_cache = self.cache_strategy.write_cache()?;
|
||||
let file_cache = write_cache.file_cache();
|
||||
Some(file_cache)
|
||||
}();
|
||||
|
||||
let index_cache = self
|
||||
.cache_manager
|
||||
.as_ref()
|
||||
.and_then(|c| c.index_cache())
|
||||
.cloned();
|
||||
let index_cache = self.cache_strategy.index_cache().cloned();
|
||||
|
||||
let puffin_metadata_cache = self
|
||||
.cache_manager
|
||||
.as_ref()
|
||||
.and_then(|c| c.puffin_metadata_cache())
|
||||
.cloned();
|
||||
let puffin_metadata_cache = self.cache_strategy.puffin_metadata_cache().cloned();
|
||||
|
||||
InvertedIndexApplierBuilder::new(
|
||||
self.access_layer.region_dir().to_string(),
|
||||
@@ -470,23 +461,14 @@ impl ScanRegion {
|
||||
}
|
||||
|
||||
let file_cache = || -> Option<FileCacheRef> {
|
||||
let cache_manager = self.cache_manager.as_ref()?;
|
||||
let write_cache = cache_manager.write_cache()?;
|
||||
let write_cache = self.cache_strategy.write_cache()?;
|
||||
let file_cache = write_cache.file_cache();
|
||||
Some(file_cache)
|
||||
}();
|
||||
|
||||
let index_cache = self
|
||||
.cache_manager
|
||||
.as_ref()
|
||||
.and_then(|c| c.bloom_filter_index_cache())
|
||||
.cloned();
|
||||
let index_cache = self.cache_strategy.bloom_filter_index_cache().cloned();
|
||||
|
||||
let puffin_metadata_cache = self
|
||||
.cache_manager
|
||||
.as_ref()
|
||||
.and_then(|c| c.puffin_metadata_cache())
|
||||
.cloned();
|
||||
let puffin_metadata_cache = self.cache_strategy.puffin_metadata_cache().cloned();
|
||||
|
||||
BloomFilterIndexApplierBuilder::new(
|
||||
self.access_layer.region_dir().to_string(),
|
||||
@@ -550,7 +532,7 @@ pub(crate) struct ScanInput {
|
||||
/// Handles to SST files to scan.
|
||||
pub(crate) files: Vec<FileHandle>,
|
||||
/// Cache.
|
||||
pub(crate) cache_manager: Option<CacheManagerRef>,
|
||||
pub(crate) cache_strategy: CacheStrategy,
|
||||
/// Ignores file not found error.
|
||||
ignore_file_not_found: bool,
|
||||
/// Capacity of the channel to send data from parallel scan tasks to the main task.
|
||||
@@ -582,7 +564,7 @@ impl ScanInput {
|
||||
predicate: None,
|
||||
memtables: Vec::new(),
|
||||
files: Vec::new(),
|
||||
cache_manager: None,
|
||||
cache_strategy: CacheStrategy::Disabled,
|
||||
ignore_file_not_found: false,
|
||||
parallel_scan_channel_size: DEFAULT_SCAN_CHANNEL_SIZE,
|
||||
inverted_index_applier: None,
|
||||
@@ -626,8 +608,8 @@ impl ScanInput {
|
||||
|
||||
/// Sets cache for this query.
|
||||
#[must_use]
|
||||
pub(crate) fn with_cache(mut self, cache: Option<CacheManagerRef>) -> Self {
|
||||
self.cache_manager = cache;
|
||||
pub(crate) fn with_cache(mut self, cache: CacheStrategy) -> Self {
|
||||
self.cache_strategy = cache;
|
||||
self
|
||||
}
|
||||
|
||||
@@ -760,7 +742,7 @@ impl ScanInput {
|
||||
.read_sst(file.clone())
|
||||
.predicate(self.predicate.clone())
|
||||
.projection(Some(self.mapper.column_ids().to_vec()))
|
||||
.cache(self.cache_manager.clone())
|
||||
.cache(self.cache_strategy.clone())
|
||||
.inverted_index_applier(self.inverted_index_applier.clone())
|
||||
.bloom_filter_index_applier(self.bloom_filter_index_applier.clone())
|
||||
.fulltext_index_applier(self.fulltext_index_applier.clone())
|
||||
|
||||
@@ -257,7 +257,7 @@ impl SeqScan {
|
||||
.await
|
||||
.map_err(BoxedError::new)
|
||||
.context(ExternalSnafu)?;
|
||||
let cache = stream_ctx.input.cache_manager.as_deref();
|
||||
let cache = &stream_ctx.input.cache_strategy;
|
||||
let mut metrics = ScannerMetrics::default();
|
||||
let mut fetch_start = Instant::now();
|
||||
#[cfg(debug_assertions)]
|
||||
|
||||
@@ -148,7 +148,7 @@ impl UnorderedScan {
|
||||
let stream = try_stream! {
|
||||
part_metrics.on_first_poll();
|
||||
|
||||
let cache = stream_ctx.input.cache_manager.as_deref();
|
||||
let cache = &stream_ctx.input.cache_strategy;
|
||||
let range_builder_list = Arc::new(RangeBuilderList::new(
|
||||
stream_ctx.input.num_memtables(),
|
||||
stream_ctx.input.num_files(),
|
||||
|
||||
@@ -80,8 +80,12 @@ impl VersionControl {
|
||||
/// Freezes the mutable memtable if it is not empty.
|
||||
pub(crate) fn freeze_mutable(&self) -> Result<()> {
|
||||
let version = self.current().version;
|
||||
let time_window = version.compaction_time_window;
|
||||
|
||||
let Some(new_memtables) = version.memtables.freeze_mutable(&version.metadata)? else {
|
||||
let Some(new_memtables) = version
|
||||
.memtables
|
||||
.freeze_mutable(&version.metadata, time_window)?
|
||||
else {
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
//! Structures to describe metadata of files.
|
||||
|
||||
use std::fmt;
|
||||
use std::num::NonZeroU64;
|
||||
use std::str::FromStr;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::Arc;
|
||||
@@ -134,6 +135,11 @@ pub struct FileMeta {
|
||||
/// the default value `0` doesn't mean the file doesn't contain any rows,
/// but instead means the number of rows is unknown.
|
||||
pub num_row_groups: u64,
|
||||
/// Sequence in this file.
|
||||
///
|
||||
/// This sequence is the only sequence in this file. And it's retrieved from the max
|
||||
/// sequence of the rows when generating this file.
|
||||
pub sequence: Option<NonZeroU64>,
|
||||
}
|
||||
|
||||
/// Type of index.
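
A small sketch of how a reader is expected to interpret the two fields documented above (minimal stand-in struct, not the real `FileMeta`):

use std::num::NonZeroU64;

// Minimal stand-in carrying only the two fields discussed above.
struct FileMeta {
    num_row_groups: u64,
    sequence: Option<NonZeroU64>,
}

fn describe(meta: &FileMeta) -> String {
    let row_groups = if meta.num_row_groups == 0 {
        // `0` is a sentinel for "unknown", not for "empty file".
        "unknown number of row groups".to_string()
    } else {
        format!("{} row groups", meta.num_row_groups)
    };
    let sequence = match meta.sequence {
        Some(seq) => format!("all rows carry sequence {seq}"),
        None => "rows keep their original sequences".to_string(),
    };
    format!("{row_groups}; {sequence}")
}

fn main() {
    let meta = FileMeta {
        num_row_groups: 0,
        sequence: NonZeroU64::new(4096),
    };
    println!("{}", describe(&meta));
}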
@@ -343,6 +349,7 @@ mod tests {
|
||||
index_file_size: 0,
|
||||
num_rows: 0,
|
||||
num_row_groups: 0,
|
||||
sequence: None,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -119,6 +119,8 @@ impl FilePurger for LocalFilePurger {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::num::NonZeroU64;
|
||||
|
||||
use common_test_util::temp_dir::create_temp_dir;
|
||||
use object_store::services::Fs;
|
||||
use object_store::ObjectStore;
|
||||
@@ -176,6 +178,7 @@ mod tests {
|
||||
index_file_size: 0,
|
||||
num_rows: 0,
|
||||
num_row_groups: 0,
|
||||
sequence: None,
|
||||
},
|
||||
file_purger,
|
||||
);
|
||||
@@ -238,6 +241,7 @@ mod tests {
|
||||
index_file_size: 4096,
|
||||
num_rows: 1024,
|
||||
num_row_groups: 1,
|
||||
sequence: NonZeroU64::new(4096),
|
||||
},
|
||||
file_purger,
|
||||
);
|
||||
|
||||
@@ -33,18 +33,18 @@ use snafu::{OptionExt, ResultExt};
|
||||
use store_api::metadata::RegionMetadata;
|
||||
use store_api::storage::{ColumnId, RegionId};
|
||||
|
||||
use super::INDEX_BLOB_TYPE;
|
||||
use crate::cache::file_cache::{FileCacheRef, FileType, IndexKey};
|
||||
use crate::cache::index::bloom_filter_index::{
|
||||
BloomFilterIndexCacheRef, CachedBloomFilterIndexBlobReader,
|
||||
};
|
||||
use crate::error::{
|
||||
ApplyBloomFilterIndexSnafu, ColumnNotFoundSnafu, ConvertValueSnafu, MetadataSnafu,
|
||||
ApplyBloomFilterIndexSnafu, ColumnNotFoundSnafu, ConvertValueSnafu, Error, MetadataSnafu,
|
||||
PuffinBuildReaderSnafu, PuffinReadBlobSnafu, Result,
|
||||
};
|
||||
use crate::metrics::INDEX_APPLY_ELAPSED;
|
||||
use crate::row_converter::SortField;
|
||||
use crate::sst::file::FileId;
|
||||
use crate::sst::index::bloom_filter::INDEX_BLOB_TYPE;
|
||||
use crate::sst::index::codec::IndexValueCodec;
|
||||
use crate::sst::index::puffin_manager::{BlobReader, PuffinManagerFactory};
|
||||
use crate::sst::index::TYPE_BLOOM_FILTER_INDEX;
|
||||
@@ -118,28 +118,21 @@ impl BloomFilterIndexApplier {
|
||||
.start_timer();
|
||||
|
||||
for (column_id, predicates) in &self.filters {
|
||||
let mut blob = match self.cached_blob_reader(file_id, *column_id).await {
|
||||
Ok(Some(puffin_reader)) => puffin_reader,
|
||||
other => {
|
||||
if let Err(err) = other {
|
||||
warn!(err; "An unexpected error occurred while reading the cached index file. Fallback to remote index file.")
|
||||
}
|
||||
self.remote_blob_reader(file_id, *column_id, file_size_hint)
|
||||
.await?
|
||||
}
|
||||
let mut blob = match self
|
||||
.blob_reader(file_id, *column_id, file_size_hint)
|
||||
.await?
|
||||
{
|
||||
Some(blob) => blob,
|
||||
None => continue,
|
||||
};
|
||||
|
||||
// Create appropriate reader based on whether we have caching enabled
|
||||
if let Some(bloom_filter_cache) = &self.bloom_filter_index_cache {
|
||||
let file_size = if let Some(file_size) = file_size_hint {
|
||||
file_size
|
||||
} else {
|
||||
blob.metadata().await.context(MetadataSnafu)?.content_length
|
||||
};
|
||||
let blob_size = blob.metadata().await.context(MetadataSnafu)?.content_length;
|
||||
let reader = CachedBloomFilterIndexBlobReader::new(
|
||||
file_id,
|
||||
*column_id,
|
||||
file_size,
|
||||
blob_size,
|
||||
BloomFilterReaderImpl::new(blob),
|
||||
bloom_filter_cache.clone(),
|
||||
);
|
||||
@@ -157,6 +150,43 @@ impl BloomFilterIndexApplier {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Creates a blob reader from the cached or remote index file.
|
||||
///
|
||||
/// Returns `None` if the column does not have an index.
|
||||
async fn blob_reader(
|
||||
&self,
|
||||
file_id: FileId,
|
||||
column_id: ColumnId,
|
||||
file_size_hint: Option<u64>,
|
||||
) -> Result<Option<BlobReader>> {
|
||||
let reader = match self.cached_blob_reader(file_id, column_id).await {
|
||||
Ok(Some(puffin_reader)) => puffin_reader,
|
||||
other => {
|
||||
if let Err(err) = other {
|
||||
// Blob not found means no index for this column
|
||||
if is_blob_not_found(&err) {
|
||||
return Ok(None);
|
||||
}
|
||||
warn!(err; "An unexpected error occurred while reading the cached index file. Fallback to remote index file.")
|
||||
}
|
||||
let res = self
|
||||
.remote_blob_reader(file_id, column_id, file_size_hint)
|
||||
.await;
|
||||
if let Err(err) = res {
|
||||
// Blob not found means no index for this column
|
||||
if is_blob_not_found(&err) {
|
||||
return Ok(None);
|
||||
}
|
||||
return Err(err);
|
||||
}
|
||||
|
||||
res?
|
||||
}
|
||||
};
|
||||
|
||||
Ok(Some(reader))
|
||||
}
|
||||
|
||||
/// Creates a blob reader from the cached index file
|
||||
async fn cached_blob_reader(
|
||||
&self,
|
||||
@@ -242,6 +272,16 @@ impl BloomFilterIndexApplier {
|
||||
}
|
||||
}
|
||||
|
||||
fn is_blob_not_found(err: &Error) -> bool {
|
||||
matches!(
|
||||
err,
|
||||
Error::PuffinBuildReader {
|
||||
source: puffin::error::Error::BlobNotFound { .. },
|
||||
..
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
pub struct BloomFilterIndexApplierBuilder<'a> {
|
||||
region_dir: String,
|
||||
object_store: ObjectStore,
|
||||
|
||||
@@ -137,14 +137,10 @@ impl InvertedIndexApplier {
|
||||
};
|
||||
|
||||
if let Some(index_cache) = &self.inverted_index_cache {
|
||||
let file_size = if let Some(file_size) = file_size_hint {
|
||||
file_size
|
||||
} else {
|
||||
blob.metadata().await.context(MetadataSnafu)?.content_length
|
||||
};
|
||||
let blob_size = blob.metadata().await.context(MetadataSnafu)?.content_length;
|
||||
let mut index_reader = CachedInvertedIndexBlobReader::new(
|
||||
file_id,
|
||||
file_size,
|
||||
blob_size,
|
||||
InvertedIndexBlobReader::new(blob),
|
||||
index_cache.clone(),
|
||||
);
|
||||
|
||||
@@ -95,7 +95,7 @@ mod tests {
|
||||
use tokio_util::compat::FuturesAsyncWriteCompatExt;
|
||||
|
||||
use super::*;
|
||||
use crate::cache::{CacheManager, PageKey};
|
||||
use crate::cache::{CacheManager, CacheStrategy, PageKey};
|
||||
use crate::sst::index::Indexer;
|
||||
use crate::sst::parquet::format::WriteFormat;
|
||||
use crate::sst::parquet::reader::ParquetReaderBuilder;
|
||||
@@ -134,7 +134,7 @@ mod tests {
|
||||
);
|
||||
|
||||
let info = writer
|
||||
.write_all(source, &write_opts)
|
||||
.write_all(source, None, &write_opts)
|
||||
.await
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
@@ -189,13 +189,13 @@ mod tests {
|
||||
);
|
||||
|
||||
writer
|
||||
.write_all(source, &write_opts)
|
||||
.write_all(source, None, &write_opts)
|
||||
.await
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
|
||||
// Enable page cache.
|
||||
let cache = Some(Arc::new(
|
||||
let cache = CacheStrategy::EnableAll(Arc::new(
|
||||
CacheManager::builder()
|
||||
.page_cache_size(64 * 1024 * 1024)
|
||||
.build(),
|
||||
@@ -219,15 +219,15 @@ mod tests {
|
||||
|
||||
// Doesn't have compressed page cached.
|
||||
let page_key = PageKey::new_compressed(metadata.region_id, handle.file_id(), 0, 0);
|
||||
assert!(cache.as_ref().unwrap().get_pages(&page_key).is_none());
|
||||
assert!(cache.get_pages(&page_key).is_none());
|
||||
|
||||
// Cache 4 row groups.
|
||||
for i in 0..4 {
|
||||
let page_key = PageKey::new_uncompressed(metadata.region_id, handle.file_id(), i, 0);
|
||||
assert!(cache.as_ref().unwrap().get_pages(&page_key).is_some());
|
||||
assert!(cache.get_pages(&page_key).is_some());
|
||||
}
|
||||
let page_key = PageKey::new_uncompressed(metadata.region_id, handle.file_id(), 5, 0);
|
||||
assert!(cache.as_ref().unwrap().get_pages(&page_key).is_none());
|
||||
assert!(cache.get_pages(&page_key).is_none());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -258,7 +258,7 @@ mod tests {
|
||||
);
|
||||
|
||||
let sst_info = writer
|
||||
.write_all(source, &write_opts)
|
||||
.write_all(source, None, &write_opts)
|
||||
.await
|
||||
.unwrap()
|
||||
.expect("write_all should return sst info");
|
||||
@@ -297,7 +297,7 @@ mod tests {
|
||||
Indexer::default(),
|
||||
);
|
||||
writer
|
||||
.write_all(source, &write_opts)
|
||||
.write_all(source, None, &write_opts)
|
||||
.await
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
@@ -350,7 +350,7 @@ mod tests {
|
||||
Indexer::default(),
|
||||
);
|
||||
writer
|
||||
.write_all(source, &write_opts)
|
||||
.write_all(source, None, &write_opts)
|
||||
.await
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
@@ -386,7 +386,7 @@ mod tests {
|
||||
);
|
||||
|
||||
writer
|
||||
.write_all(source, &write_opts)
|
||||
.write_all(source, None, &write_opts)
|
||||
.await
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
|
||||
@@ -114,7 +114,7 @@ impl FileRange {
|
||||
let reader = RowGroupLastRowCachedReader::new(
|
||||
self.file_handle().file_id(),
|
||||
self.row_group_idx,
|
||||
self.context.reader_builder.cache_manager().clone(),
|
||||
self.context.reader_builder.cache_strategy().clone(),
|
||||
RowGroupReader::new(self.context.clone(), parquet_reader),
|
||||
);
|
||||
PruneReader::new_with_last_row_reader(self.context.clone(), reader)
|
||||
|
||||
@@ -42,7 +42,7 @@ use parquet::file::metadata::{ParquetMetaData, RowGroupMetaData};
|
||||
use parquet::file::statistics::Statistics;
|
||||
use snafu::{ensure, OptionExt, ResultExt};
|
||||
use store_api::metadata::{ColumnMetadata, RegionMetadataRef};
|
||||
use store_api::storage::ColumnId;
|
||||
use store_api::storage::{ColumnId, SequenceNumber};
|
||||
|
||||
use crate::error::{
|
||||
ConvertVectorSnafu, InvalidBatchSnafu, InvalidRecordBatchSnafu, NewRecordBatchSnafu, Result,
|
||||
@@ -65,6 +65,7 @@ pub(crate) struct WriteFormat {
|
||||
metadata: RegionMetadataRef,
|
||||
/// SST file schema.
|
||||
arrow_schema: SchemaRef,
|
||||
override_sequence: Option<SequenceNumber>,
|
||||
}
|
||||
|
||||
impl WriteFormat {
|
||||
@@ -74,9 +75,19 @@ impl WriteFormat {
|
||||
WriteFormat {
|
||||
metadata,
|
||||
arrow_schema,
|
||||
override_sequence: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Set override sequence.
|
||||
pub(crate) fn with_override_sequence(
|
||||
mut self,
|
||||
override_sequence: Option<SequenceNumber>,
|
||||
) -> Self {
|
||||
self.override_sequence = override_sequence;
|
||||
self
|
||||
}
|
||||
|
||||
/// Gets the arrow schema to store in parquet.
|
||||
pub(crate) fn arrow_schema(&self) -> &SchemaRef {
|
||||
&self.arrow_schema
|
||||
@@ -107,7 +118,14 @@ impl WriteFormat {
|
||||
columns.push(batch.timestamps().to_arrow_array());
|
||||
// Add internal columns: primary key, sequences, op types.
|
||||
columns.push(new_primary_key_array(batch.primary_key(), batch.num_rows()));
|
||||
columns.push(batch.sequences().to_arrow_array());
|
||||
|
||||
if let Some(override_sequence) = self.override_sequence {
|
||||
let sequence_array =
|
||||
Arc::new(UInt64Array::from(vec![override_sequence; batch.num_rows()]));
|
||||
columns.push(sequence_array);
|
||||
} else {
|
||||
columns.push(batch.sequences().to_arrow_array());
|
||||
}
|
||||
columns.push(batch.op_types().to_arrow_array());
|
||||
|
||||
RecordBatch::try_new(self.arrow_schema.clone(), columns).context(NewRecordBatchSnafu)
|
||||
@@ -756,6 +774,27 @@ mod tests {
|
||||
assert_eq!(expect_record, actual);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_convert_batch_with_override_sequence() {
|
||||
let metadata = build_test_region_metadata();
|
||||
let write_format = WriteFormat::new(metadata).with_override_sequence(Some(415411));
|
||||
|
||||
let num_rows = 4;
|
||||
let batch = new_batch(b"test", 1, 2, num_rows);
|
||||
let columns: Vec<ArrayRef> = vec![
|
||||
Arc::new(Int64Array::from(vec![2; num_rows])), // field1
|
||||
Arc::new(Int64Array::from(vec![3; num_rows])), // field0
|
||||
Arc::new(TimestampMillisecondArray::from(vec![1, 2, 3, 4])), // ts
|
||||
build_test_pk_array(&[(b"test".to_vec(), num_rows)]), // primary key
|
||||
Arc::new(UInt64Array::from(vec![415411; num_rows])), // sequence
|
||||
Arc::new(UInt8Array::from(vec![TEST_OP_TYPE; num_rows])), // op type
|
||||
];
|
||||
let expect_record = RecordBatch::try_new(build_test_arrow_schema(), columns).unwrap();
|
||||
|
||||
let actual = write_format.convert_batch(&batch).unwrap();
|
||||
assert_eq!(expect_record, actual);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_projection_indices() {
|
||||
let metadata = build_test_region_metadata();
|
||||
|
||||
@@ -38,7 +38,7 @@ use store_api::metadata::{RegionMetadata, RegionMetadataRef};
|
||||
use store_api::storage::ColumnId;
|
||||
use table::predicate::Predicate;
|
||||
|
||||
use crate::cache::CacheManagerRef;
|
||||
use crate::cache::CacheStrategy;
|
||||
use crate::error::{
|
||||
ArrowReaderSnafu, InvalidMetadataSnafu, InvalidParquetSnafu, ReadDataPartSnafu,
|
||||
ReadParquetSnafu, Result,
|
||||
@@ -77,8 +77,8 @@ pub struct ParquetReaderBuilder {
|
||||
/// `None` reads all columns. Due to schema change, the projection
|
||||
/// can contain columns not in the parquet file.
|
||||
projection: Option<Vec<ColumnId>>,
|
||||
/// Manager that caches SST data.
|
||||
cache_manager: Option<CacheManagerRef>,
|
||||
/// Strategy to cache SST data.
|
||||
cache_strategy: CacheStrategy,
|
||||
/// Index appliers.
|
||||
inverted_index_applier: Option<InvertedIndexApplierRef>,
|
||||
bloom_filter_index_applier: Option<BloomFilterIndexApplierRef>,
|
||||
@@ -102,7 +102,7 @@ impl ParquetReaderBuilder {
|
||||
object_store,
|
||||
predicate: None,
|
||||
projection: None,
|
||||
cache_manager: None,
|
||||
cache_strategy: CacheStrategy::Disabled,
|
||||
inverted_index_applier: None,
|
||||
bloom_filter_index_applier: None,
|
||||
fulltext_index_applier: None,
|
||||
@@ -128,8 +128,8 @@ impl ParquetReaderBuilder {
|
||||
|
||||
/// Attaches the cache to the builder.
|
||||
#[must_use]
|
||||
pub fn cache(mut self, cache: Option<CacheManagerRef>) -> ParquetReaderBuilder {
|
||||
self.cache_manager = cache;
|
||||
pub fn cache(mut self, cache: CacheStrategy) -> ParquetReaderBuilder {
|
||||
self.cache_strategy = cache;
|
||||
self
|
||||
}
|
||||
|
||||
@@ -234,7 +234,7 @@ impl ParquetReaderBuilder {
|
||||
object_store: self.object_store.clone(),
|
||||
projection: projection_mask,
|
||||
field_levels,
|
||||
cache_manager: self.cache_manager.clone(),
|
||||
cache_strategy: self.cache_strategy.clone(),
|
||||
};
|
||||
|
||||
let filters = if let Some(predicate) = &self.predicate {
|
||||
@@ -308,10 +308,12 @@ impl ParquetReaderBuilder {
|
||||
let region_id = self.file_handle.region_id();
|
||||
let file_id = self.file_handle.file_id();
|
||||
// Tries to get from global cache.
|
||||
if let Some(manager) = &self.cache_manager {
|
||||
if let Some(metadata) = manager.get_parquet_meta_data(region_id, file_id).await {
|
||||
return Ok(metadata);
|
||||
}
|
||||
if let Some(metadata) = self
|
||||
.cache_strategy
|
||||
.get_parquet_meta_data(region_id, file_id)
|
||||
.await
|
||||
{
|
||||
return Ok(metadata);
|
||||
}
|
||||
|
||||
// Cache miss, load metadata directly.
|
||||
@@ -319,13 +321,11 @@ impl ParquetReaderBuilder {
|
||||
let metadata = metadata_loader.load().await?;
|
||||
let metadata = Arc::new(metadata);
|
||||
// Cache the metadata.
|
||||
if let Some(cache) = &self.cache_manager {
|
||||
cache.put_parquet_meta_data(
|
||||
self.file_handle.region_id(),
|
||||
self.file_handle.file_id(),
|
||||
metadata.clone(),
|
||||
);
|
||||
}
|
||||
self.cache_strategy.put_parquet_meta_data(
|
||||
self.file_handle.region_id(),
|
||||
self.file_handle.file_id(),
|
||||
metadata.clone(),
|
||||
);
|
||||
|
||||
Ok(metadata)
|
||||
}
|
||||
@@ -857,7 +857,7 @@ pub(crate) struct RowGroupReaderBuilder {
|
||||
/// Field levels to read.
|
||||
field_levels: FieldLevels,
|
||||
/// Cache.
|
||||
cache_manager: Option<CacheManagerRef>,
|
||||
cache_strategy: CacheStrategy,
|
||||
}
|
||||
|
||||
impl RowGroupReaderBuilder {
|
||||
@@ -875,8 +875,8 @@ impl RowGroupReaderBuilder {
|
||||
&self.parquet_meta
|
||||
}
|
||||
|
||||
pub(crate) fn cache_manager(&self) -> &Option<CacheManagerRef> {
|
||||
&self.cache_manager
|
||||
pub(crate) fn cache_strategy(&self) -> &CacheStrategy {
|
||||
&self.cache_strategy
|
||||
}
|
||||
|
||||
/// Builds a [ParquetRecordBatchReader] to read the row group at `row_group_idx`.
|
||||
@@ -890,7 +890,7 @@ impl RowGroupReaderBuilder {
|
||||
self.file_handle.file_id(),
|
||||
&self.parquet_meta,
|
||||
row_group_idx,
|
||||
self.cache_manager.clone(),
|
||||
self.cache_strategy.clone(),
|
||||
&self.file_path,
|
||||
self.object_store.clone(),
|
||||
);
|
||||
|
||||
@@ -32,7 +32,7 @@ use store_api::storage::RegionId;
|
||||
use tokio::task::yield_now;
|
||||
|
||||
use crate::cache::file_cache::{FileType, IndexKey};
|
||||
use crate::cache::{CacheManagerRef, PageKey, PageValue};
|
||||
use crate::cache::{CacheStrategy, PageKey, PageValue};
|
||||
use crate::metrics::{READ_STAGE_ELAPSED, READ_STAGE_FETCH_PAGES};
|
||||
use crate::sst::file::FileId;
|
||||
use crate::sst::parquet::helper::fetch_byte_ranges;
|
||||
@@ -223,7 +223,7 @@ pub struct InMemoryRowGroup<'a> {
|
||||
region_id: RegionId,
|
||||
file_id: FileId,
|
||||
row_group_idx: usize,
|
||||
cache_manager: Option<CacheManagerRef>,
|
||||
cache_strategy: CacheStrategy,
|
||||
file_path: &'a str,
|
||||
/// Object store.
|
||||
object_store: ObjectStore,
|
||||
@@ -240,7 +240,7 @@ impl<'a> InMemoryRowGroup<'a> {
|
||||
file_id: FileId,
|
||||
parquet_meta: &'a ParquetMetaData,
|
||||
row_group_idx: usize,
|
||||
cache_manager: Option<CacheManagerRef>,
|
||||
cache_strategy: CacheStrategy,
|
||||
file_path: &'a str,
|
||||
object_store: ObjectStore,
|
||||
) -> Self {
|
||||
@@ -249,7 +249,7 @@ impl<'a> InMemoryRowGroup<'a> {
|
||||
region_id,
|
||||
file_id,
|
||||
row_group_idx,
|
||||
cache_manager,
|
||||
cache_strategy,
|
||||
file_path,
|
||||
object_store,
|
||||
}
|
||||
@@ -293,21 +293,19 @@ impl<'a> InMemoryRowGroup<'a> {
|
||||
let assigned_columns = self.base.assign_dense_chunk(projection, chunk_data);
|
||||
|
||||
// Put fetched data to cache if necessary.
|
||||
if let Some(cache) = &self.cache_manager {
|
||||
for (col_idx, data) in assigned_columns {
|
||||
let column = self.base.metadata.column(col_idx);
|
||||
if !cache_uncompressed_pages(column) {
|
||||
// For columns that have multiple uncompressed pages, we only cache the compressed page
|
||||
// to save memory.
|
||||
let page_key = PageKey::new_compressed(
|
||||
self.region_id,
|
||||
self.file_id,
|
||||
self.row_group_idx,
|
||||
col_idx,
|
||||
);
|
||||
cache
|
||||
.put_pages(page_key, Arc::new(PageValue::new_compressed(data.clone())));
|
||||
}
|
||||
for (col_idx, data) in assigned_columns {
|
||||
let column = self.base.metadata.column(col_idx);
|
||||
if !cache_uncompressed_pages(column) {
|
||||
// For columns that have multiple uncompressed pages, we only cache the compressed page
|
||||
// to save memory.
|
||||
let page_key = PageKey::new_compressed(
|
||||
self.region_id,
|
||||
self.file_id,
|
||||
self.row_group_idx,
|
||||
col_idx,
|
||||
);
|
||||
self.cache_strategy
|
||||
.put_pages(page_key, Arc::new(PageValue::new_compressed(data.clone())));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -325,9 +323,6 @@ impl<'a> InMemoryRowGroup<'a> {
|
||||
.enumerate()
|
||||
.filter(|(idx, chunk)| chunk.is_none() && projection.leaf_included(*idx))
|
||||
.for_each(|(idx, chunk)| {
|
||||
let Some(cache) = &self.cache_manager else {
|
||||
return;
|
||||
};
|
||||
let column = self.base.metadata.column(idx);
|
||||
if cache_uncompressed_pages(column) {
|
||||
// Fetches uncompressed pages for the row group.
|
||||
@@ -337,7 +332,8 @@ impl<'a> InMemoryRowGroup<'a> {
|
||||
self.row_group_idx,
|
||||
idx,
|
||||
);
|
||||
self.base.column_uncompressed_pages[idx] = cache.get_pages(&page_key);
|
||||
self.base.column_uncompressed_pages[idx] =
|
||||
self.cache_strategy.get_pages(&page_key);
|
||||
} else {
|
||||
// Fetches the compressed page from the cache.
|
||||
let page_key = PageKey::new_compressed(
|
||||
@@ -347,7 +343,7 @@ impl<'a> InMemoryRowGroup<'a> {
|
||||
idx,
|
||||
);
|
||||
|
||||
*chunk = cache.get_pages(&page_key).map(|page_value| {
|
||||
*chunk = self.cache_strategy.get_pages(&page_key).map(|page_value| {
|
||||
Arc::new(ColumnChunkData::Dense {
|
||||
offset: column.byte_range().0 as usize,
|
||||
data: page_value.compressed.clone(),
|
||||
@@ -383,7 +379,7 @@ impl<'a> InMemoryRowGroup<'a> {
|
||||
key: IndexKey,
|
||||
ranges: &[Range<u64>],
|
||||
) -> Option<Vec<Bytes>> {
|
||||
if let Some(cache) = self.cache_manager.as_ref()?.write_cache() {
|
||||
if let Some(cache) = self.cache_strategy.write_cache() {
|
||||
return cache.file_cache().read_ranges(key, ranges).await;
|
||||
}
|
||||
None
|
||||
@@ -399,10 +395,6 @@ impl<'a> InMemoryRowGroup<'a> {
|
||||
|
||||
let page_reader = self.base.column_reader(i)?;
|
||||
|
||||
let Some(cache) = &self.cache_manager else {
|
||||
return Ok(Box::new(page_reader));
|
||||
};
|
||||
|
||||
let column = self.base.metadata.column(i);
|
||||
if cache_uncompressed_pages(column) {
|
||||
// This column uses the row group level page cache.
|
||||
@@ -411,7 +403,7 @@ impl<'a> InMemoryRowGroup<'a> {
|
||||
let page_value = Arc::new(PageValue::new_row_group(pages));
|
||||
let page_key =
|
||||
PageKey::new_uncompressed(self.region_id, self.file_id, self.row_group_idx, i);
|
||||
cache.put_pages(page_key, page_value.clone());
|
||||
self.cache_strategy.put_pages(page_key, page_value.clone());
|
||||
|
||||
return Ok(Box::new(RowGroupCachedReader::new(&page_value.row_group)));
|
||||
}
|
||||
|
||||
@@ -31,6 +31,7 @@ use parquet::schema::types::ColumnPath;
|
||||
use snafu::ResultExt;
|
||||
use store_api::metadata::RegionMetadataRef;
|
||||
use store_api::storage::consts::SEQUENCE_COLUMN_NAME;
|
||||
use store_api::storage::SequenceNumber;
|
||||
use tokio::io::AsyncWrite;
|
||||
use tokio_util::compat::{Compat, FuturesAsyncWriteCompatExt};
|
||||
|
||||
@@ -112,9 +113,11 @@ where
|
||||
pub async fn write_all(
|
||||
&mut self,
|
||||
mut source: Source,
|
||||
override_sequence: Option<SequenceNumber>, // override the `sequence` field from `Source`
|
||||
opts: &WriteOptions,
|
||||
) -> Result<Option<SstInfo>> {
|
||||
let write_format = WriteFormat::new(self.metadata.clone());
|
||||
let write_format =
|
||||
WriteFormat::new(self.metadata.clone()).with_override_sequence(override_sequence);
|
||||
let mut stats = SourceStats::default();
|
||||
|
||||
while let Some(res) = self
|
||||
|
||||
@@ -201,6 +201,7 @@ pub struct TestEnv {
|
||||
log_store_factory: LogStoreFactory,
|
||||
object_store_manager: Option<ObjectStoreManagerRef>,
|
||||
schema_metadata_manager: SchemaMetadataManagerRef,
|
||||
kv_backend: KvBackendRef,
|
||||
}
|
||||
|
||||
impl Default for TestEnv {
|
||||
@@ -212,37 +213,40 @@ impl Default for TestEnv {
|
||||
impl TestEnv {
|
||||
/// Returns a new env with empty prefix for test.
|
||||
pub fn new() -> TestEnv {
|
||||
let schema_metadata_manager = mock_schema_metadata_manager();
|
||||
let (schema_metadata_manager, kv_backend) = mock_schema_metadata_manager();
|
||||
TestEnv {
|
||||
data_home: create_temp_dir(""),
|
||||
log_store: None,
|
||||
log_store_factory: LogStoreFactory::RaftEngine(RaftEngineLogStoreFactory),
|
||||
object_store_manager: None,
|
||||
schema_metadata_manager,
|
||||
kv_backend,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a new env with specific `prefix` for test.
|
||||
pub fn with_prefix(prefix: &str) -> TestEnv {
|
||||
let schema_metadata_manager = mock_schema_metadata_manager();
|
||||
let (schema_metadata_manager, kv_backend) = mock_schema_metadata_manager();
|
||||
TestEnv {
|
||||
data_home: create_temp_dir(prefix),
|
||||
log_store: None,
|
||||
log_store_factory: LogStoreFactory::RaftEngine(RaftEngineLogStoreFactory),
|
||||
object_store_manager: None,
|
||||
schema_metadata_manager,
|
||||
kv_backend,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a new env with specific `data_home` for test.
|
||||
pub fn with_data_home(data_home: TempDir) -> TestEnv {
|
||||
let schema_metadata_manager = mock_schema_metadata_manager();
|
||||
let (schema_metadata_manager, kv_backend) = mock_schema_metadata_manager();
|
||||
TestEnv {
|
||||
data_home,
|
||||
log_store: None,
|
||||
log_store_factory: LogStoreFactory::RaftEngine(RaftEngineLogStoreFactory),
|
||||
object_store_manager: None,
|
||||
schema_metadata_manager,
|
||||
kv_backend,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -653,6 +657,10 @@ impl TestEnv {
|
||||
pub fn get_schema_metadata_manager(&self) -> SchemaMetadataManagerRef {
|
||||
self.schema_metadata_manager.clone()
|
||||
}
|
||||
|
||||
pub fn get_kv_backend(&self) -> KvBackendRef {
|
||||
self.kv_backend.clone()
|
||||
}
|
||||
}
|
||||
|
||||
/// Builder to mock a [RegionCreateRequest].
|
||||
@@ -1143,7 +1151,7 @@ pub async fn reopen_region(
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn mock_schema_metadata_manager() -> Arc<SchemaMetadataManager> {
|
||||
pub(crate) fn mock_schema_metadata_manager() -> (Arc<SchemaMetadataManager>, KvBackendRef) {
|
||||
let kv_backend = Arc::new(MemoryKvBackend::new());
|
||||
let table_schema_cache = Arc::new(new_table_schema_cache(
|
||||
"table_schema_name_cache".to_string(),
|
||||
@@ -1155,9 +1163,8 @@ pub(crate) fn mock_schema_metadata_manager() -> Arc<SchemaMetadataManager> {
|
||||
CacheBuilder::default().build(),
|
||||
kv_backend.clone(),
|
||||
));
|
||||
Arc::new(SchemaMetadataManager::new(
|
||||
(
|
||||
Arc::new(SchemaMetadataManager::new(table_schema_cache, schema_cache)),
|
||||
kv_backend as KvBackendRef,
|
||||
table_schema_cache,
|
||||
schema_cache,
|
||||
))
|
||||
)
|
||||
}
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
|
||||
//! Utilities for testing SSTs.
|
||||
|
||||
use std::num::NonZeroU64;
|
||||
use std::sync::Arc;
|
||||
|
||||
use api::v1::{OpType, SemanticType};
|
||||
@@ -116,6 +117,7 @@ pub fn sst_file_handle(start_ms: i64, end_ms: i64) -> FileHandle {
|
||||
index_file_size: 0,
|
||||
num_rows: 0,
|
||||
num_row_groups: 0,
|
||||
sequence: None,
|
||||
},
|
||||
file_purger,
|
||||
)
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
//! Utilities to mock version.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::num::NonZeroU64;
|
||||
use std::sync::Arc;
|
||||
|
||||
use api::v1::value::ValueData;
|
||||
@@ -103,6 +104,7 @@ impl VersionControlBuilder {
|
||||
index_file_size: 0,
|
||||
num_rows: 0,
|
||||
num_row_groups: 0,
|
||||
sequence: None,
|
||||
},
|
||||
);
|
||||
self
|
||||
@@ -194,6 +196,7 @@ pub(crate) fn apply_edit(
|
||||
index_file_size: 0,
|
||||
num_rows: 0,
|
||||
num_row_groups: 0,
|
||||
sequence: None,
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
@@ -93,7 +93,10 @@ impl LogQueryPlanner {
|
||||
|
||||
// Apply limit
|
||||
plan_builder = plan_builder
|
||||
.limit(0, query.limit.or(Some(DEFAULT_LIMIT)))
|
||||
.limit(
|
||||
query.limit.skip.unwrap_or(0),
|
||||
Some(query.limit.fetch.unwrap_or(DEFAULT_LIMIT)),
|
||||
)
|
||||
.context(DataFusionPlanningSnafu)?;
|
||||
|
||||
// Build the final plan
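
The planner now resolves `skip` and `fetch` independently; judging by the expected plans in the tests below, the default fetch (`DEFAULT_LIMIT`) is 1000. A small sketch of the resolution rule, with a stand-in `Limit` struct:

// Stand-in for log_query::Limit; the real struct lives in the log-query crate.
struct Limit {
    skip: Option<usize>,
    fetch: Option<usize>,
}

const DEFAULT_LIMIT: usize = 1000;

// Mirrors `plan_builder.limit(skip, Some(fetch))` in the hunk above.
fn resolve(limit: &Limit) -> (usize, usize) {
    (limit.skip.unwrap_or(0), limit.fetch.unwrap_or(DEFAULT_LIMIT))
}

fn main() {
    assert_eq!(resolve(&Limit { skip: Some(10), fetch: None }), (10, 1000));
    assert_eq!(resolve(&Limit { skip: None, fetch: Some(100) }), (0, 100));
    assert_eq!(resolve(&Limit { skip: None, fetch: None }), (0, 1000));
}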
@@ -179,7 +182,7 @@ mod tests {
|
||||
use common_query::test_util::DummyDecoder;
|
||||
use datatypes::prelude::ConcreteDataType;
|
||||
use datatypes::schema::{ColumnSchema, SchemaRef};
|
||||
use log_query::{ContentFilter, Context};
|
||||
use log_query::{ContentFilter, Context, Limit};
|
||||
use session::context::QueryContext;
|
||||
use table::metadata::{TableInfoBuilder, TableMetaBuilder};
|
||||
use table::table_name::TableName;
|
||||
@@ -268,7 +271,10 @@ mod tests {
|
||||
column_name: "message".to_string(),
|
||||
filters: vec![ContentFilter::Contains("error".to_string())],
|
||||
}],
|
||||
limit: Some(100),
|
||||
limit: Limit {
|
||||
skip: None,
|
||||
fetch: Some(100),
|
||||
},
|
||||
context: Context::None,
|
||||
};
|
||||
|
||||
@@ -361,6 +367,72 @@ mod tests {
|
||||
assert_eq!(format!("{:?}", expr), format!("{:?}", expected_expr));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_query_to_plan_with_only_skip() {
|
||||
let table_provider =
|
||||
build_test_table_provider(&[("public".to_string(), "test_table".to_string())]).await;
|
||||
let mut planner = LogQueryPlanner::new(table_provider);
|
||||
|
||||
let log_query = LogQuery {
|
||||
table: TableName::new(DEFAULT_CATALOG_NAME, "public", "test_table"),
|
||||
time_filter: TimeFilter {
|
||||
start: Some("2021-01-01T00:00:00Z".to_string()),
|
||||
end: Some("2021-01-02T00:00:00Z".to_string()),
|
||||
span: None,
|
||||
},
|
||||
columns: vec![ColumnFilters {
|
||||
column_name: "message".to_string(),
|
||||
filters: vec![ContentFilter::Contains("error".to_string())],
|
||||
}],
|
||||
limit: Limit {
|
||||
skip: Some(10),
|
||||
fetch: None,
|
||||
},
|
||||
context: Context::None,
|
||||
};
|
||||
|
||||
let plan = planner.query_to_plan(log_query).await.unwrap();
|
||||
let expected = "Limit: skip=10, fetch=1000 [message:Utf8]\
|
||||
\n Projection: greptime.public.test_table.message [message:Utf8]\
|
||||
\n Filter: greptime.public.test_table.timestamp >= Utf8(\"2021-01-01T00:00:00Z\") AND greptime.public.test_table.timestamp <= Utf8(\"2021-01-02T00:00:00Z\") AND greptime.public.test_table.message LIKE Utf8(\"%error%\") [message:Utf8, timestamp:Timestamp(Millisecond, None), host:Utf8;N]\
|
||||
\n TableScan: greptime.public.test_table [message:Utf8, timestamp:Timestamp(Millisecond, None), host:Utf8;N]";
|
||||
|
||||
assert_eq!(plan.display_indent_schema().to_string(), expected);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_query_to_plan_without_limit() {
|
||||
let table_provider =
|
||||
build_test_table_provider(&[("public".to_string(), "test_table".to_string())]).await;
|
||||
let mut planner = LogQueryPlanner::new(table_provider);
|
||||
|
||||
let log_query = LogQuery {
|
||||
table: TableName::new(DEFAULT_CATALOG_NAME, "public", "test_table"),
|
||||
time_filter: TimeFilter {
|
||||
start: Some("2021-01-01T00:00:00Z".to_string()),
|
||||
end: Some("2021-01-02T00:00:00Z".to_string()),
|
||||
span: None,
|
||||
},
|
||||
columns: vec![ColumnFilters {
|
||||
column_name: "message".to_string(),
|
||||
filters: vec![ContentFilter::Contains("error".to_string())],
|
||||
}],
|
||||
limit: Limit {
|
||||
skip: None,
|
||||
fetch: None,
|
||||
},
|
||||
context: Context::None,
|
||||
};
|
||||
|
||||
let plan = planner.query_to_plan(log_query).await.unwrap();
|
||||
let expected = "Limit: skip=0, fetch=1000 [message:Utf8]\
|
||||
\n Projection: greptime.public.test_table.message [message:Utf8]\
|
||||
\n Filter: greptime.public.test_table.timestamp >= Utf8(\"2021-01-01T00:00:00Z\") AND greptime.public.test_table.timestamp <= Utf8(\"2021-01-02T00:00:00Z\") AND greptime.public.test_table.message LIKE Utf8(\"%error%\") [message:Utf8, timestamp:Timestamp(Millisecond, None), host:Utf8;N]\
|
||||
\n TableScan: greptime.public.test_table [message:Utf8, timestamp:Timestamp(Millisecond, None), host:Utf8;N]";
|
||||
|
||||
assert_eq!(plan.display_indent_schema().to_string(), expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_escape_pattern() {
|
||||
assert_eq!(escape_like_pattern("test"), "test");
|
||||
|
||||
@@ -15,6 +15,9 @@
|
||||
use http::HeaderMap;
|
||||
use tonic::metadata::MetadataMap;
|
||||
|
||||
// For the given format: `x-greptime-hints: auto_create_table=true, ttl=7d`
|
||||
pub const HINTS_KEY: &str = "x-greptime-hints";
|
||||
|
||||
pub const HINT_KEYS: [&str; 5] = [
|
||||
"x-greptime-hint-auto_create_table",
|
||||
"x-greptime-hint-ttl",
|
||||
@@ -25,6 +28,16 @@ pub const HINT_KEYS: [&str; 5] = [
|
||||
|
||||
pub(crate) fn extract_hints<T: ToHeaderMap>(headers: &T) -> Vec<(String, String)> {
|
||||
let mut hints = Vec::new();
|
||||
if let Some(value_str) = headers.get(HINTS_KEY) {
|
||||
value_str.split(',').for_each(|hint| {
|
||||
let mut parts = hint.splitn(2, '=');
|
||||
if let (Some(key), Some(value)) = (parts.next(), parts.next()) {
|
||||
hints.push((key.trim().to_string(), value.trim().to_string()));
|
||||
}
|
||||
});
|
||||
// If hints are provided in the `x-greptime-hints` header, ignore the rest of the headers
|
||||
return hints;
|
||||
}
|
||||
for key in HINT_KEYS.iter() {
|
||||
if let Some(value) = headers.get(key) {
|
||||
let new_key = key.replace("x-greptime-hint-", "");
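
A standalone sketch of the all-in-one parsing path added above, using plain strings instead of HTTP header types and the same trimming rules:

// Parses a combined hints header value such as "auto_create_table=true, ttl=7d".
fn parse_hints(value: &str) -> Vec<(String, String)> {
    value
        .split(',')
        .filter_map(|hint| {
            let mut parts = hint.splitn(2, '=');
            match (parts.next(), parts.next()) {
                (Some(key), Some(value)) => {
                    Some((key.trim().to_string(), value.trim().to_string()))
                }
                _ => None,
            }
        })
        .collect()
}

fn main() {
    let hints = parse_hints(" auto_create_table=true, ttl =7d ");
    assert_eq!(hints[0], ("auto_create_table".to_string(), "true".to_string()));
    assert_eq!(hints[1], ("ttl".to_string(), "7d".to_string()));
}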
@@ -112,6 +125,30 @@ mod tests {
|
||||
assert_eq!(hints[1], ("ttl".to_string(), "3600d".to_string()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_hints_all_in_one() {
|
||||
let mut headers = HeaderMap::new();
|
||||
headers.insert(
|
||||
"x-greptime-hints",
|
||||
HeaderValue::from_static(" auto_create_table=true, ttl =3600d, append_mode=true , merge_mode=false , physical_table= table1"),
|
||||
);
|
||||
|
||||
let hints = extract_hints(&headers);
|
||||
|
||||
assert_eq!(hints.len(), 5);
|
||||
assert_eq!(
|
||||
hints[0],
|
||||
("auto_create_table".to_string(), "true".to_string())
|
||||
);
|
||||
assert_eq!(hints[1], ("ttl".to_string(), "3600d".to_string()));
|
||||
assert_eq!(hints[2], ("append_mode".to_string(), "true".to_string()));
|
||||
assert_eq!(hints[3], ("merge_mode".to_string(), "false".to_string()));
|
||||
assert_eq!(
|
||||
hints[4],
|
||||
("physical_table".to_string(), "table1".to_string())
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_hints_with_metadata_map() {
|
||||
let mut metadata = MetadataMap::new();
|
||||
|
||||
@@ -17,7 +17,9 @@ use std::time::Duration;
|
||||
|
||||
use chrono::NaiveDate;
|
||||
use common_query::prelude::ScalarValue;
|
||||
use common_time::Timestamp;
|
||||
use datatypes::prelude::ConcreteDataType;
|
||||
use datatypes::types::TimestampType;
|
||||
use datatypes::value::{self, Value};
|
||||
use itertools::Itertools;
|
||||
use opensrv_mysql::{to_naive_datetime, ParamValue, ValueInner};
|
||||
@@ -161,7 +163,7 @@ pub fn convert_value(param: &ParamValue, t: &ConcreteDataType) -> Result<ScalarV
|
||||
String::from_utf8_lossy(b).to_string(),
|
||||
))),
|
||||
ConcreteDataType::Binary(_) => Ok(ScalarValue::Binary(Some(b.to_vec()))),
|
||||
|
||||
ConcreteDataType::Timestamp(ts_type) => covert_bytes_to_timestamp(b, ts_type),
|
||||
_ => error::PreparedStmtTypeMismatchSnafu {
|
||||
expected: t,
|
||||
actual: param.coltype,
|
||||
@@ -235,8 +237,41 @@ pub fn convert_expr_to_scalar_value(param: &Expr, t: &ConcreteDataType) -> Resul
|
||||
}
|
||||
}
|
||||
|
||||
fn covert_bytes_to_timestamp(bytes: &[u8], ts_type: &TimestampType) -> Result<ScalarValue> {
|
||||
let ts = Timestamp::from_str_utc(&String::from_utf8_lossy(bytes))
|
||||
.map_err(|e| {
|
||||
error::MysqlValueConversionSnafu {
|
||||
err_msg: e.to_string(),
|
||||
}
|
||||
.build()
|
||||
})?
|
||||
.convert_to(ts_type.unit())
|
||||
.ok_or_else(|| {
|
||||
error::MysqlValueConversionSnafu {
|
||||
err_msg: "Overflow when converting timestamp to target unit".to_string(),
|
||||
}
|
||||
.build()
|
||||
})?;
|
||||
match ts_type {
|
||||
TimestampType::Nanosecond(_) => {
|
||||
Ok(ScalarValue::TimestampNanosecond(Some(ts.value()), None))
|
||||
}
|
||||
TimestampType::Microsecond(_) => {
|
||||
Ok(ScalarValue::TimestampMicrosecond(Some(ts.value()), None))
|
||||
}
|
||||
TimestampType::Millisecond(_) => {
|
||||
Ok(ScalarValue::TimestampMillisecond(Some(ts.value()), None))
|
||||
}
|
||||
TimestampType::Second(_) => Ok(ScalarValue::TimestampSecond(Some(ts.value()), None)),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use datatypes::types::{
|
||||
TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType,
|
||||
TimestampSecondType,
|
||||
};
|
||||
use sql::dialect::MySqlDialect;
|
||||
use sql::parser::{ParseOptions, ParserContext};
|
||||
|
||||
@@ -340,4 +375,87 @@ mod tests {
|
||||
let v = convert_expr_to_scalar_value(&expr, &t).unwrap();
|
||||
assert_eq!(ScalarValue::Time64Microsecond(None), v);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_convert_bytes_to_timestamp() {
|
||||
let test_cases = vec![
|
||||
// input unix timestamp in seconds -> nanosecond.
|
||||
(
|
||||
"2024-12-26 12:00:00",
|
||||
TimestampType::Nanosecond(TimestampNanosecondType),
|
||||
ScalarValue::TimestampNanosecond(Some(1735214400000000000), None),
|
||||
),
|
||||
// input unix timestamp in seconds -> microsecond.
|
||||
(
|
||||
"2024-12-26 12:00:00",
|
||||
TimestampType::Microsecond(TimestampMicrosecondType),
|
||||
ScalarValue::TimestampMicrosecond(Some(1735214400000000), None),
|
||||
),
|
||||
// input unix timestamp in seconds -> millisecond.
|
||||
(
|
||||
"2024-12-26 12:00:00",
|
||||
TimestampType::Millisecond(TimestampMillisecondType),
|
||||
ScalarValue::TimestampMillisecond(Some(1735214400000), None),
|
||||
),
|
||||
// input unix timestamp in seconds -> second.
|
||||
(
|
||||
"2024-12-26 12:00:00",
|
||||
TimestampType::Second(TimestampSecondType),
|
||||
ScalarValue::TimestampSecond(Some(1735214400), None),
|
||||
),
|
||||
// input unix timestamp in milliseconds -> nanosecond.
|
||||
(
|
||||
"2024-12-26 12:00:00.123",
|
||||
TimestampType::Nanosecond(TimestampNanosecondType),
|
||||
ScalarValue::TimestampNanosecond(Some(1735214400123000000), None),
|
||||
),
|
||||
// input unix timestamp in milliseconds -> microsecond.
|
||||
(
|
||||
"2024-12-26 12:00:00.123",
|
||||
TimestampType::Microsecond(TimestampMicrosecondType),
|
||||
ScalarValue::TimestampMicrosecond(Some(1735214400123000), None),
|
||||
),
|
||||
// input unix timestamp in milliseconds -> millisecond.
|
||||
(
|
||||
"2024-12-26 12:00:00.123",
|
||||
TimestampType::Millisecond(TimestampMillisecondType),
|
||||
ScalarValue::TimestampMillisecond(Some(1735214400123), None),
|
||||
),
|
||||
// input unix timestamp in milliseconds -> second.
|
||||
(
|
||||
"2024-12-26 12:00:00.123",
|
||||
TimestampType::Second(TimestampSecondType),
|
||||
ScalarValue::TimestampSecond(Some(1735214400), None),
|
||||
),
|
||||
// input unix timestamp in microseconds -> nanosecond.
|
||||
(
|
||||
"2024-12-26 12:00:00.123456",
|
||||
TimestampType::Nanosecond(TimestampNanosecondType),
|
||||
ScalarValue::TimestampNanosecond(Some(1735214400123456000), None),
|
||||
),
|
||||
// input unix timestamp in microseconds -> microsecond.
|
||||
(
|
||||
"2024-12-26 12:00:00.123456",
|
||||
TimestampType::Microsecond(TimestampMicrosecondType),
|
||||
ScalarValue::TimestampMicrosecond(Some(1735214400123456), None),
|
||||
),
|
||||
// input unix timestamp in microseconds -> millisecond.
|
||||
(
|
||||
"2024-12-26 12:00:00.123456",
|
||||
TimestampType::Millisecond(TimestampMillisecondType),
|
||||
ScalarValue::TimestampMillisecond(Some(1735214400123), None),
|
||||
),
|
||||
// input unix timestamp in milliseconds -> second.
|
||||
(
|
||||
"2024-12-26 12:00:00.123456",
|
||||
TimestampType::Second(TimestampSecondType),
|
||||
ScalarValue::TimestampSecond(Some(1735214400), None),
|
||||
),
|
||||
];
|
||||
|
||||
for (input, ts_type, expected) in test_cases {
|
||||
let result = covert_bytes_to_timestamp(input.as_bytes(), &ts_type).unwrap();
|
||||
assert_eq!(result, expected);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -126,3 +126,35 @@ SELECT vec_elem_sum(parse_vec('[-1.0, -2.0, -3.0]'));
|
||||
| -6.0 |
|
||||
+-----------------------------------------------------+
|
||||
|
||||
SELECT vec_to_string(vec_div('[1.0, 2.0]', '[3.0, 4.0]'));
|
||||
|
||||
+---------------------------------------------------------------+
|
||||
| vec_to_string(vec_div(Utf8("[1.0, 2.0]"),Utf8("[3.0, 4.0]"))) |
|
||||
+---------------------------------------------------------------+
|
||||
| [0.33333334,0.5] |
|
||||
+---------------------------------------------------------------+
|
||||
|
||||
SELECT vec_to_string(vec_div(parse_vec('[1.0, 2.0]'), '[3.0, 4.0]'));
|
||||
|
||||
+--------------------------------------------------------------------------+
|
||||
| vec_to_string(vec_div(parse_vec(Utf8("[1.0, 2.0]")),Utf8("[3.0, 4.0]"))) |
|
||||
+--------------------------------------------------------------------------+
|
||||
| [0.33333334,0.5] |
|
||||
+--------------------------------------------------------------------------+
|
||||
|
||||
SELECT vec_to_string(vec_div('[1.0, 2.0]', parse_vec('[3.0, 4.0]')));
|
||||
|
||||
+--------------------------------------------------------------------------+
|
||||
| vec_to_string(vec_div(Utf8("[1.0, 2.0]"),parse_vec(Utf8("[3.0, 4.0]")))) |
|
||||
+--------------------------------------------------------------------------+
|
||||
| [0.33333334,0.5] |
|
||||
+--------------------------------------------------------------------------+
|
||||
|
||||
SELECT vec_to_string(vec_div('[1.0, -2.0]', parse_vec('[0.0, 0.0]')));
|
||||
|
||||
+---------------------------------------------------------------------------+
|
||||
| vec_to_string(vec_div(Utf8("[1.0, -2.0]"),parse_vec(Utf8("[0.0, 0.0]")))) |
|
||||
+---------------------------------------------------------------------------+
|
||||
| [inf,-inf] |
|
||||
+---------------------------------------------------------------------------+
|
||||
|
||||
|
||||
@@ -29,3 +29,11 @@ SELECT vec_elem_sum('[-1.0, -2.0, -3.0]');
|
||||
SELECT vec_elem_sum(parse_vec('[1.0, 2.0, 3.0]'));
|
||||
|
||||
SELECT vec_elem_sum(parse_vec('[-1.0, -2.0, -3.0]'));
|
||||
|
||||
SELECT vec_to_string(vec_div('[1.0, 2.0]', '[3.0, 4.0]'));
|
||||
|
||||
SELECT vec_to_string(vec_div(parse_vec('[1.0, 2.0]'), '[3.0, 4.0]'));
|
||||
|
||||
SELECT vec_to_string(vec_div('[1.0, 2.0]', parse_vec('[3.0, 4.0]')));
|
||||
|
||||
SELECT vec_to_string(vec_div('[1.0, -2.0]', parse_vec('[0.0, 0.0]')));