mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-24 00:20:37 +00:00
Compare commits
40 Commits
problame/t
...
vlad/log-o
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
fd5aa06e28 | ||
|
|
1cdebae470 | ||
|
|
dc7004ec10 | ||
|
|
411c3aa0d6 | ||
|
|
65b69392ea | ||
|
|
8d70f88b37 | ||
|
|
bcfe013094 | ||
|
|
d0a02f3649 | ||
|
|
8af9412eb2 | ||
|
|
96e35e11a6 | ||
|
|
745061ddf8 | ||
|
|
0c828c57e2 | ||
|
|
8e2e9f0fed | ||
|
|
b77b9bdc9f | ||
|
|
81f9aba005 | ||
|
|
88ff8a7803 | ||
|
|
0c075fab3a | ||
|
|
80e1630042 | ||
|
|
57499640c5 | ||
|
|
793ad50b7d | ||
|
|
7a1331eee5 | ||
|
|
4ef74215e1 | ||
|
|
d4cbc8cfeb | ||
|
|
47c35f67c3 | ||
|
|
45b558f480 | ||
|
|
a73402e646 | ||
|
|
07b974480c | ||
|
|
62f5d484d9 | ||
|
|
4df3987054 | ||
|
|
0624565617 | ||
|
|
7d5f6b6a52 | ||
|
|
f7c61e856f | ||
|
|
57c21aff9f | ||
|
|
248558dee8 | ||
|
|
3bad52543f | ||
|
|
3d64a7ddcd | ||
|
|
25f1e5cfeb | ||
|
|
8dd555d396 | ||
|
|
01b6843e12 | ||
|
|
93987b5a4a |
6
.github/workflows/benchmarking.yml
vendored
6
.github/workflows/benchmarking.yml
vendored
@@ -671,6 +671,10 @@ jobs:
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
|
||||
# Increase timeout to 12h, default timeout is 6h
|
||||
# we have regression in clickbench causing it to run 2-3x longer
|
||||
timeout-minutes: 720
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
@@ -716,7 +720,7 @@ jobs:
|
||||
test_selection: performance/test_perf_olap.py
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_clickbench
|
||||
extra_params: -m remote_cluster --timeout 43200 -k test_clickbench
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
|
||||
6
.github/workflows/build_and_test.yml
vendored
6
.github/workflows/build_and_test.yml
vendored
@@ -839,6 +839,7 @@ jobs:
|
||||
- name: Build vm image
|
||||
run: |
|
||||
./vm-builder \
|
||||
-size=2G \
|
||||
-spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \
|
||||
-src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
|
||||
-dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}
|
||||
@@ -1116,7 +1117,10 @@ jobs:
|
||||
|
||||
gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \
|
||||
-f deployPgSniRouter=true \
|
||||
-f deployProxy=true \
|
||||
-f deployProxyLink=true \
|
||||
-f deployPrivatelinkProxy=true \
|
||||
-f deployProxyScram=true \
|
||||
-f deployProxyAuthBroker=true \
|
||||
-f branch=main \
|
||||
-f dockerTag=${{needs.tag.outputs.build-tag}}
|
||||
else
|
||||
|
||||
21
Cargo.lock
generated
21
Cargo.lock
generated
@@ -3749,6 +3749,7 @@ dependencies = [
|
||||
"tracing",
|
||||
"url",
|
||||
"utils",
|
||||
"wal_decoder",
|
||||
"walkdir",
|
||||
"workspace_hack",
|
||||
]
|
||||
@@ -4186,6 +4187,7 @@ dependencies = [
|
||||
"regex",
|
||||
"serde",
|
||||
"thiserror",
|
||||
"tracing",
|
||||
"utils",
|
||||
]
|
||||
|
||||
@@ -6272,7 +6274,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tokio-epoll-uring"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#cb2dcea2058034bc209e7917b01c5097712a3168"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#33e00106a268644d02ba0461bbd64476073b0ee1"
|
||||
dependencies = [
|
||||
"futures",
|
||||
"nix 0.26.4",
|
||||
@@ -6788,7 +6790,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "uring-common"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#cb2dcea2058034bc209e7917b01c5097712a3168"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#33e00106a268644d02ba0461bbd64476073b0ee1"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"io-uring",
|
||||
@@ -6954,6 +6956,21 @@ dependencies = [
|
||||
"utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wal_decoder"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bytes",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
"postgres_ffi",
|
||||
"serde",
|
||||
"tracing",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "walkdir"
|
||||
version = "2.3.3"
|
||||
|
||||
@@ -33,6 +33,7 @@ members = [
|
||||
"libs/postgres_ffi/wal_craft",
|
||||
"libs/vm_monitor",
|
||||
"libs/walproposer",
|
||||
"libs/wal_decoder",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
@@ -238,6 +239,7 @@ tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
|
||||
utils = { version = "0.1", path = "./libs/utils/" }
|
||||
vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" }
|
||||
walproposer = { version = "0.1", path = "./libs/walproposer/" }
|
||||
wal_decoder = { version = "0.1", path = "./libs/wal_decoder" }
|
||||
|
||||
## Common library dependency
|
||||
workspace_hack = { version = "0.1", path = "./workspace_hack/" }
|
||||
|
||||
@@ -57,6 +57,18 @@ RUN set -e \
|
||||
zstd \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
|
||||
# sql_exporter
|
||||
|
||||
# Keep the version the same as in compute/compute-node.Dockerfile and
|
||||
# test_runner/regress/test_compute_metrics.py.
|
||||
ENV SQL_EXPORTER_VERSION=0.13.1
|
||||
RUN curl -fsSL \
|
||||
"https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \
|
||||
--output sql_exporter.tar.gz \
|
||||
&& mkdir /tmp/sql_exporter \
|
||||
&& tar xzvf sql_exporter.tar.gz -C /tmp/sql_exporter --strip-components=1 \
|
||||
&& mv /tmp/sql_exporter/sql_exporter /usr/local/bin/sql_exporter
|
||||
|
||||
# protobuf-compiler (protoc)
|
||||
ENV PROTOC_VERSION=25.1
|
||||
RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
|
||||
|
||||
@@ -22,6 +22,7 @@ sql_exporter.yml: $(jsonnet_files)
|
||||
--output-file etc/$@ \
|
||||
--tla-str collector_name=neon_collector \
|
||||
--tla-str collector_file=neon_collector.yml \
|
||||
--tla-str 'connection_string=postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter' \
|
||||
etc/sql_exporter.jsonnet
|
||||
|
||||
sql_exporter_autoscaling.yml: $(jsonnet_files)
|
||||
@@ -29,7 +30,7 @@ sql_exporter_autoscaling.yml: $(jsonnet_files)
|
||||
--output-file etc/$@ \
|
||||
--tla-str collector_name=neon_collector_autoscaling \
|
||||
--tla-str collector_file=neon_collector_autoscaling.yml \
|
||||
--tla-str application_name=sql_exporter_autoscaling \
|
||||
--tla-str 'connection_string=postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling' \
|
||||
etc/sql_exporter.jsonnet
|
||||
|
||||
.PHONY: clean
|
||||
|
||||
@@ -666,7 +666,7 @@ RUN apt-get update && \
|
||||
#
|
||||
# Use new version only for v17
|
||||
# because Release_2024_09_1 has some backward incompatible changes
|
||||
# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1
|
||||
# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1
|
||||
ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v17") \
|
||||
@@ -860,18 +860,98 @@ ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
|
||||
USER nonroot
|
||||
WORKDIR /home/nonroot
|
||||
|
||||
RUN case "${PG_VERSION}" in "v17") \
|
||||
echo "v17 is not supported yet by pgrx. Quit" && exit 0;; \
|
||||
esac && \
|
||||
curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
|
||||
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
|
||||
chmod +x rustup-init && \
|
||||
./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
|
||||
rm rustup-init && \
|
||||
case "${PG_VERSION}" in \
|
||||
'v17') \
|
||||
echo 'v17 is not supported yet by pgrx. Quit' && exit 0;; \
|
||||
esac && \
|
||||
cargo install --locked --version 0.11.3 cargo-pgrx && \
|
||||
/bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
|
||||
|
||||
USER root
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "rust extensions pgrx12"
|
||||
#
|
||||
# pgrx started to support Postgres 17 since version 12,
|
||||
# but some older extension aren't compatible with it.
|
||||
# This layer should be used as a base for new pgrx extensions,
|
||||
# and eventually get merged with `rust-extensions-build`
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS rust-extensions-build-pgrx12
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install --no-install-recommends -y curl libclang-dev && \
|
||||
useradd -ms /bin/bash nonroot -b /home
|
||||
|
||||
ENV HOME=/home/nonroot
|
||||
ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
|
||||
USER nonroot
|
||||
WORKDIR /home/nonroot
|
||||
|
||||
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
|
||||
chmod +x rustup-init && \
|
||||
./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
|
||||
rm rustup-init && \
|
||||
cargo install --locked --version 0.12.6 cargo-pgrx && \
|
||||
/bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
|
||||
|
||||
USER root
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layers "pg-onnx-build" and "pgrag-pg-build"
|
||||
# Compile "pgrag" extensions
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM rust-extensions-build-pgrx12 AS pg-onnx-build
|
||||
|
||||
# cmake 3.26 or higher is required, so installing it using pip (bullseye-backports has cmake 3.25).
|
||||
# Install it using virtual environment, because Python 3.11 (the default version on Debian 12 (Bookworm)) complains otherwise
|
||||
RUN apt-get update && apt-get install -y python3 python3-pip python3-venv && \
|
||||
python3 -m venv venv && \
|
||||
. venv/bin/activate && \
|
||||
python3 -m pip install cmake==3.30.5 && \
|
||||
wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.gz -O onnxruntime.tar.gz && \
|
||||
mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \
|
||||
./build.sh --config Release --parallel --skip_submodule_sync --skip_tests --allow_running_as_root
|
||||
|
||||
|
||||
FROM pg-onnx-build AS pgrag-pg-build
|
||||
|
||||
RUN apt-get install -y protobuf-compiler && \
|
||||
wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.0.0.tar.gz -O pgrag.tar.gz && \
|
||||
echo "2cbe394c1e74fc8bcad9b52d5fbbfb783aef834ca3ce44626cfd770573700bb4 pgrag.tar.gz" | sha256sum --check && \
|
||||
mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . && \
|
||||
\
|
||||
cd exts/rag && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
cargo pgrx install --release && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control && \
|
||||
\
|
||||
cd ../rag_bge_small_en_v15 && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \
|
||||
REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \
|
||||
cargo pgrx install --release --features remote_onnx && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control && \
|
||||
\
|
||||
cd ../rag_jina_reranker_v1_tiny_en && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \
|
||||
REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \
|
||||
cargo pgrx install --release --features remote_onnx && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_jina_reranker_v1_tiny_en.control
|
||||
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-jsonschema-pg-build"
|
||||
@@ -1041,6 +1121,34 @@ RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg_mooncake"
|
||||
# compile pg_mooncake extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM rust-extensions-build AS pg-mooncake-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# The topmost commit in the `neon` branch at the time of writing this
|
||||
# https://github.com/Mooncake-Labs/pg_mooncake/commits/neon/
|
||||
# https://github.com/Mooncake-Labs/pg_mooncake/commit/568b5a82b5fc16136bdf4ca5aac3e0cc261ab48d
|
||||
ENV PG_MOONCAKE_VERSION=568b5a82b5fc16136bdf4ca5aac3e0cc261ab48d
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
|
||||
RUN case "${PG_VERSION}" in \
|
||||
'v14') \
|
||||
echo "pg_mooncake is not supported on Postgres ${PG_VERSION}" && exit 0;; \
|
||||
esac && \
|
||||
git clone --depth 1 --branch neon https://github.com/Mooncake-Labs/pg_mooncake.git pg_mooncake-src && \
|
||||
cd pg_mooncake-src && \
|
||||
git checkout "${PG_MOONCAKE_VERSION}" && \
|
||||
git submodule update --init --depth 1 --recursive && \
|
||||
make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "neon-pg-ext-build"
|
||||
@@ -1059,6 +1167,7 @@ COPY --from=h3-pg-build /h3/usr /
|
||||
COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=vector-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pgjwt-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pgrag-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-jsonschema-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-graphql-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-tiktoken-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
@@ -1084,6 +1193,7 @@ COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
|
||||
COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-mooncake-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY pgxn/ pgxn/
|
||||
|
||||
RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
@@ -1191,7 +1301,10 @@ RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin l
|
||||
#########################################################################################
|
||||
|
||||
FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter
|
||||
FROM burningalchemist/sql_exporter:0.13 AS sql-exporter
|
||||
|
||||
# Keep the version the same as in build-tools.Dockerfile and
|
||||
# test_runner/regress/test_compute_metrics.py.
|
||||
FROM burningalchemist/sql_exporter:0.13.1 AS sql-exporter
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -1247,6 +1360,7 @@ COPY --from=unit-pg-build /postgresql-unit.tar.gz /ext-src/
|
||||
COPY --from=vector-pg-build /pgvector.tar.gz /ext-src/
|
||||
COPY --from=vector-pg-build /pgvector.patch /ext-src/
|
||||
COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src
|
||||
#COPY --from=pgrag-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
#COPY --from=pg-jsonschema-pg-build /home/nonroot/pg_jsonschema.tar.gz /ext-src
|
||||
#COPY --from=pg-graphql-pg-build /home/nonroot/pg_graphql.tar.gz /ext-src
|
||||
#COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
metrics: [
|
||||
import 'sql_exporter/checkpoints_req.libsonnet',
|
||||
import 'sql_exporter/checkpoints_timed.libsonnet',
|
||||
import 'sql_exporter/compute_backpressure_throttling_ms.libsonnet',
|
||||
import 'sql_exporter/compute_current_lsn.libsonnet',
|
||||
import 'sql_exporter/compute_logical_snapshot_files.libsonnet',
|
||||
import 'sql_exporter/compute_receive_lsn.libsonnet',
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
function(collector_name, collector_file, application_name='sql_exporter') {
|
||||
function(collector_name, collector_file, connection_string) {
|
||||
// Configuration for sql_exporter for autoscaling-agent
|
||||
// Global defaults.
|
||||
global: {
|
||||
@@ -23,7 +23,7 @@ function(collector_name, collector_file, application_name='sql_exporter') {
|
||||
target: {
|
||||
// Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
|
||||
// the schema gets dropped or replaced to match the driver expected DSN format.
|
||||
data_source_name: std.format('postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=%s', [application_name]),
|
||||
data_source_name: connection_string,
|
||||
|
||||
// Collectors (referenced by name) to execute on the target.
|
||||
// Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
|
||||
|
||||
@@ -0,0 +1,10 @@
|
||||
{
|
||||
metric_name: 'compute_backpressure_throttling_ms',
|
||||
type: 'gauge',
|
||||
help: 'Time compute has spent throttled',
|
||||
key_labels: null,
|
||||
values: [
|
||||
'throttled',
|
||||
],
|
||||
query: importstr 'sql_exporter/compute_backpressure_throttling_ms.sql',
|
||||
}
|
||||
@@ -0,0 +1 @@
|
||||
SELECT neon.backpressure_throttling_time() AS throttled;
|
||||
@@ -18,7 +18,7 @@ commands:
|
||||
- name: pgbouncer
|
||||
user: postgres
|
||||
sysvInitAction: respawn
|
||||
shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini'
|
||||
shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini 2>&1 > /dev/virtio-ports/tech.neon.log.0'
|
||||
- name: local_proxy
|
||||
user: postgres
|
||||
sysvInitAction: respawn
|
||||
|
||||
@@ -18,7 +18,7 @@ commands:
|
||||
- name: pgbouncer
|
||||
user: postgres
|
||||
sysvInitAction: respawn
|
||||
shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini'
|
||||
shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini 2>&1 > /dev/virtio-ports/tech.neon.log.0'
|
||||
- name: local_proxy
|
||||
user: postgres
|
||||
sysvInitAction: respawn
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
use std::collections::HashMap;
|
||||
use std::env;
|
||||
use std::fs;
|
||||
use std::io::BufRead;
|
||||
use std::os::unix::fs::{symlink, PermissionsExt};
|
||||
use std::path::Path;
|
||||
use std::process::{Command, Stdio};
|
||||
@@ -365,8 +364,7 @@ impl ComputeNode {
|
||||
let pageserver_connect_micros = start_time.elapsed().as_micros() as u64;
|
||||
|
||||
let basebackup_cmd = match lsn {
|
||||
// HACK We don't use compression on first start (Lsn(0)) because there's no API for it
|
||||
Lsn(0) => format!("basebackup {} {}", spec.tenant_id, spec.timeline_id),
|
||||
Lsn(0) => format!("basebackup {} {} --gzip", spec.tenant_id, spec.timeline_id),
|
||||
_ => format!(
|
||||
"basebackup {} {} {} --gzip",
|
||||
spec.tenant_id, spec.timeline_id, lsn
|
||||
@@ -375,38 +373,16 @@ impl ComputeNode {
|
||||
|
||||
let copyreader = client.copy_out(basebackup_cmd.as_str())?;
|
||||
let mut measured_reader = MeasuredReader::new(copyreader);
|
||||
|
||||
// Check the magic number to see if it's a gzip or not. Even though
|
||||
// we might explicitly ask for gzip, an old pageserver with no implementation
|
||||
// of gzip compression might send us uncompressed data. After some time
|
||||
// passes we can assume all pageservers know how to compress and we can
|
||||
// delete this check.
|
||||
//
|
||||
// If the data is not gzip, it will be tar. It will not be mistakenly
|
||||
// recognized as gzip because tar starts with an ascii encoding of a filename,
|
||||
// and 0x1f and 0x8b are unlikely first characters for any filename. Moreover,
|
||||
// we send the "global" directory first from the pageserver, so it definitely
|
||||
// won't be recognized as gzip.
|
||||
let mut bufreader = std::io::BufReader::new(&mut measured_reader);
|
||||
let gzip = {
|
||||
let peek = bufreader.fill_buf().unwrap();
|
||||
peek[0] == 0x1f && peek[1] == 0x8b
|
||||
};
|
||||
|
||||
// Read the archive directly from the `CopyOutReader`
|
||||
//
|
||||
// Set `ignore_zeros` so that unpack() reads all the Copy data and
|
||||
// doesn't stop at the end-of-archive marker. Otherwise, if the server
|
||||
// sends an Error after finishing the tarball, we will not notice it.
|
||||
if gzip {
|
||||
let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut bufreader));
|
||||
ar.set_ignore_zeros(true);
|
||||
ar.unpack(&self.pgdata)?;
|
||||
} else {
|
||||
let mut ar = tar::Archive::new(&mut bufreader);
|
||||
ar.set_ignore_zeros(true);
|
||||
ar.unpack(&self.pgdata)?;
|
||||
};
|
||||
let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut bufreader));
|
||||
ar.set_ignore_zeros(true);
|
||||
ar.unpack(&self.pgdata)?;
|
||||
|
||||
// Report metrics
|
||||
let mut state = self.state.lock().unwrap();
|
||||
|
||||
@@ -17,7 +17,7 @@ use std::time::Duration;
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::models::{self, AuxFilePolicy, TenantInfo, TimelineInfo};
|
||||
use pageserver_api::models::{self, TenantInfo, TimelineInfo};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api;
|
||||
use postgres_backend::AuthType;
|
||||
@@ -399,11 +399,6 @@ impl PageServerNode {
|
||||
.map(serde_json::from_str)
|
||||
.transpose()
|
||||
.context("parse `timeline_get_throttle` from json")?,
|
||||
switch_aux_file_policy: settings
|
||||
.remove("switch_aux_file_policy")
|
||||
.map(|x| x.parse::<AuxFilePolicy>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'switch_aux_file_policy'")?,
|
||||
lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()),
|
||||
lsn_lease_length_for_ts: settings
|
||||
.remove("lsn_lease_length_for_ts")
|
||||
@@ -499,11 +494,6 @@ impl PageServerNode {
|
||||
.map(serde_json::from_str)
|
||||
.transpose()
|
||||
.context("parse `timeline_get_throttle` from json")?,
|
||||
switch_aux_file_policy: settings
|
||||
.remove("switch_aux_file_policy")
|
||||
.map(|x| x.parse::<AuxFilePolicy>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'switch_aux_file_policy'")?,
|
||||
lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()),
|
||||
lsn_lease_length_for_ts: settings
|
||||
.remove("lsn_lease_length_for_ts")
|
||||
|
||||
@@ -1,187 +0,0 @@
|
||||
# Timelime import from `PGDATA`
|
||||
|
||||
Created at: 2024-10-28
|
||||
Author: Christian Schwarz (@problame)
|
||||
|
||||
## Summary
|
||||
|
||||
This document is a retroactive RFC for a feature to creating new root timelines
|
||||
from a vanilla Postgres PGDATA directory dump.
|
||||
|
||||
## Motivation & Context
|
||||
|
||||
perf hedge
|
||||
|
||||
companion RFCs:
|
||||
* cloud.git RFC that fits everyting together
|
||||
* READ THIS FIRST
|
||||
* importer that produces the PGDATA
|
||||
|
||||
## Non-Goals
|
||||
|
||||
## Components
|
||||
|
||||
this doc constrained to storage pieces; companion RFC in cloud.git fits it into a full system
|
||||
|
||||
## Prior Art
|
||||
|
||||
hackathon code
|
||||
|
||||
## Proposed Implementation
|
||||
|
||||
### High-Level
|
||||
|
||||
refactor timeline creation (modes) & idempotency checking
|
||||
|
||||
add new timeline creation mode for import from pgdata
|
||||
idempotency checking based on caller-provided idempotency key
|
||||
creating task does the following
|
||||
acquire creation guard
|
||||
create anonymous empty timeline
|
||||
upload index part to S3, with field indicating import is in progress
|
||||
spawn tokio task that performs the import (bound to tenant cancellation & guard)
|
||||
the import task produces layers, adds them to the layer map, uploads them using existing infrastructure
|
||||
after import task is done
|
||||
transition index part into `done` state
|
||||
shut down the timeline object
|
||||
load the timeline from remote storage, using the code that we use during tenant attach, and add it to Tenant::timelines
|
||||
|
||||
if pageserver restarts or the tenant is migrated during import, tenant attach resumes the import job, based on the
|
||||
Index part field.
|
||||
|
||||
SEQUENCE DIAGRAM
|
||||
|
||||
|
||||
### No Storage Controller Awareness
|
||||
|
||||
The storage controller is unaware of the import job, since it simply proxies the timeline creation request to each shard.
|
||||
|
||||
### PGDATA => Image Layer Conversion
|
||||
|
||||
The import task converts a PGDATA directory dump located in object storage into image layers.
|
||||
|
||||
shard awareness, including the new test for 0 disposable keys
|
||||
|
||||
range requests
|
||||
|
||||
### The Mechanics Of Executing The Conversion Code
|
||||
|
||||
Tokio Task Lifecycles / New Concept Of Long-Running Timeline Creation
|
||||
|
||||
This is the first time we make a timeline creation run long, and need to make it survive PS restarts and tenant migrations.
|
||||
|
||||
### Resumability & Forward Progress
|
||||
|
||||
Index Part field, document here.
|
||||
Evoultion toward checkpointing in the index part is possible.
|
||||
|
||||
For v1, no checkpointing is implemented because the MTBF of pageserver nodes is much higher than
|
||||
the highest expected import duration; and tenant migrations are rare enough.
|
||||
|
||||
### Resource Usage
|
||||
|
||||
Parallelized over multiple tokio tasks
|
||||
|
||||
### Interaction with Migrations / Generations
|
||||
|
||||
After migrating the tenant / reloading it with a newer generation, what happens?
|
||||
|
||||
### Security
|
||||
|
||||
The current implementation assumes that the PGDATA dump contents are trusted input,
|
||||
and further that it does not change while the import is ongoing.
|
||||
|
||||
There is no hard technical need for trusting the PGDATA dump contents, since
|
||||
the bulk of layer conversion is copying opaque 8k data blobs.
|
||||
|
||||
However, the conversion code uses serde-generated deserialization routines in some
|
||||
places which haven't been audited for resource exhaustion attacks (e.g. OOM due to large allcations).
|
||||
|
||||
Further, the conversion code is littered with `.unwrap()` and `assert!()` statements, which
|
||||
is a remnant from the fact that it was hackathon code. This risks panicking the pageserver
|
||||
that executes the import, and our panic blast radius is currently not reliably constrained
|
||||
to the tenant in which it originates.
|
||||
|
||||
### Progress Notification & Integration Into The Larger System
|
||||
|
||||
Timeline create API: no changes about durability semantics of create operation; cplane create
|
||||
operation is done as soon as API returns 201 (earlier design 202 code, remove that from cplane draft)
|
||||
|
||||
Readiness for import: due to limitations in the control plane regarding long-running jobs, cplane
|
||||
schedules the timeline creation before the PGDATA dunp has actually finished uploading.
|
||||
As a workaround, the import task polls for a special status key in the PGDATA to appear, which
|
||||
declares the PGDATA dump as finished uploading and not changing from thereon.
|
||||
TODO: review linearizability of S3, i.e., can we rely on ListObjectsV2 to return the
|
||||
full listing immediately after we observe the special status key? Or does the status key
|
||||
need to contain the list of prefixes? That would be better anyway, could be signed, cf security).
|
||||
|
||||
New concept: completion notification; didn't need this before, now we do;
|
||||
cplane must not use timeline before receiving completion notification.
|
||||
mechanism:
|
||||
* source of truth of completion is a special status object that import job uploads into the PGDATA location in S3
|
||||
* how to make cplane aware of update: invocation of a notification upcall API
|
||||
* import job retries delivery of the notification upcall until success response
|
||||
* in response to upcall, cplane collects statuses of alls hards using ListObjectV2 prefix (it understands ShardId)
|
||||
|
||||
SEQUENCE DIAGRAM FROM GIST
|
||||
|
||||
### Shard-Split during Import is UB
|
||||
|
||||
The design does not handle shard-splits during import, so it's declared UB right now.
|
||||
|
||||
### Storage-Controller, Sharding
|
||||
|
||||
As before this RFC, the storage controller forwards timeline create requests to each
|
||||
shard and is unaware that the timeline is importing, because so far storcon did not need
|
||||
to be aware of timelines.
|
||||
|
||||
However, to make sharding transparent to cplane, it would make sense for storcon to
|
||||
act as an intermediate for the import progress progress upcalls, i.e., batch up all
|
||||
shards' completion upcalls and when that's done, deliver a tenant-scoped (instead of shard-scoped)
|
||||
upcall to cplane.
|
||||
That way, cplane need not know about tenants and storcon can internalize shard-split.
|
||||
|
||||
### Alternative Designs
|
||||
|
||||
#### External Task Performs Conversion
|
||||
|
||||
External task performs conversion and uploads layers & index part into S3.
|
||||
|
||||
=> copy gist arguments against that here.
|
||||
|
||||
|
||||
## Future Work
|
||||
|
||||
### Work Required For Productization
|
||||
|
||||
TODOs at the top of import code, esp resource exhaustion attack vectors
|
||||
|
||||
Review linearizability of listobjectsv2 after observing pgdata upload done key.
|
||||
|
||||
Resource usage: play nice with other tenants
|
||||
|
||||
Regression test ensuring resumability (failpoints)
|
||||
|
||||
Storcon needs to be aware of import and
|
||||
- inhibit shard split while ongoing (can relax this later)
|
||||
- and redesign progress notification system to
|
||||
- avoid need to mutate pgdata, so it can be readonly
|
||||
|
||||
DESIGN WORK: does storage controller scheduler should be aware of ongoing migration?
|
||||
=> it should control resource usage, maybe different policy for migrations, etc
|
||||
|
||||
Safety: fail shard split while import is in progress. Pageserver should enforce this.
|
||||
Storcon needs to know and respsect this.
|
||||
|
||||
Azure support
|
||||
|
||||
### v2
|
||||
|
||||
Teach pageserver to natively read PGDATA relation files. Then it could be as simple
|
||||
as reading a few control files, and otherwise using object copy operations for
|
||||
moving the relation files.
|
||||
Problems:
|
||||
- how to deal with sharding? If we don't slice up the files, then each shard
|
||||
download a full copy of the database. Tricky. Maybe can handle this with 1GiB stripe size,
|
||||
but, pretty obscure.
|
||||
|
||||
@@ -250,12 +250,6 @@ pub struct TenantConfigToml {
|
||||
// Expresed in multiples of checkpoint distance.
|
||||
pub image_layer_creation_check_threshold: u8,
|
||||
|
||||
/// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into
|
||||
/// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions.
|
||||
/// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux
|
||||
/// file is written.
|
||||
pub switch_aux_file_policy: crate::models::AuxFilePolicy,
|
||||
|
||||
/// The length for an explicit LSN lease request.
|
||||
/// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
|
||||
#[serde(with = "humantime_serde")]
|
||||
@@ -475,7 +469,6 @@ impl Default for TenantConfigToml {
|
||||
lazy_slru_download: false,
|
||||
timeline_get_throttle: crate::models::ThrottleConfig::disabled(),
|
||||
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
|
||||
switch_aux_file_policy: crate::models::AuxFilePolicy::default_tenant_config(),
|
||||
lsn_lease_length: LsnLease::DEFAULT_LENGTH,
|
||||
lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
|
||||
}
|
||||
|
||||
@@ -5,9 +5,11 @@ pub mod controller_api;
|
||||
pub mod key;
|
||||
pub mod keyspace;
|
||||
pub mod models;
|
||||
pub mod record;
|
||||
pub mod reltag;
|
||||
pub mod shard;
|
||||
/// Public API types
|
||||
pub mod upcall_api;
|
||||
pub mod value;
|
||||
|
||||
pub mod config;
|
||||
|
||||
@@ -10,7 +10,6 @@ use std::{
|
||||
io::{BufRead, Read},
|
||||
num::{NonZeroU32, NonZeroU64, NonZeroUsize},
|
||||
str::FromStr,
|
||||
sync::atomic::AtomicUsize,
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
|
||||
@@ -309,7 +308,6 @@ pub struct TenantConfig {
|
||||
pub lazy_slru_download: Option<bool>,
|
||||
pub timeline_get_throttle: Option<ThrottleConfig>,
|
||||
pub image_layer_creation_check_threshold: Option<u8>,
|
||||
pub switch_aux_file_policy: Option<AuxFilePolicy>,
|
||||
pub lsn_lease_length: Option<String>,
|
||||
pub lsn_lease_length_for_ts: Option<String>,
|
||||
}
|
||||
@@ -350,68 +348,6 @@ pub enum AuxFilePolicy {
|
||||
CrossValidation,
|
||||
}
|
||||
|
||||
impl AuxFilePolicy {
|
||||
pub fn is_valid_migration_path(from: Option<Self>, to: Self) -> bool {
|
||||
matches!(
|
||||
(from, to),
|
||||
(None, _) | (Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2)
|
||||
)
|
||||
}
|
||||
|
||||
/// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used.
|
||||
pub fn default_tenant_config() -> Self {
|
||||
Self::V2
|
||||
}
|
||||
}
|
||||
|
||||
/// The aux file policy memory flag. Users can store `Option<AuxFilePolicy>` into this atomic flag. 0 == unspecified.
|
||||
pub struct AtomicAuxFilePolicy(AtomicUsize);
|
||||
|
||||
impl AtomicAuxFilePolicy {
|
||||
pub fn new(policy: Option<AuxFilePolicy>) -> Self {
|
||||
Self(AtomicUsize::new(
|
||||
policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
|
||||
))
|
||||
}
|
||||
|
||||
pub fn load(&self) -> Option<AuxFilePolicy> {
|
||||
match self.0.load(std::sync::atomic::Ordering::Acquire) {
|
||||
0 => None,
|
||||
other => Some(AuxFilePolicy::from_usize(other)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn store(&self, policy: Option<AuxFilePolicy>) {
|
||||
self.0.store(
|
||||
policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
|
||||
std::sync::atomic::Ordering::Release,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
impl AuxFilePolicy {
|
||||
pub fn to_usize(self) -> usize {
|
||||
match self {
|
||||
Self::V1 => 1,
|
||||
Self::CrossValidation => 2,
|
||||
Self::V2 => 3,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn try_from_usize(this: usize) -> Option<Self> {
|
||||
match this {
|
||||
1 => Some(Self::V1),
|
||||
2 => Some(Self::CrossValidation),
|
||||
3 => Some(Self::V2),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_usize(this: usize) -> Self {
|
||||
Self::try_from_usize(this).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[serde(tag = "kind")]
|
||||
pub enum EvictionPolicy {
|
||||
@@ -1633,71 +1569,6 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_aux_file_migration_path() {
|
||||
assert!(AuxFilePolicy::is_valid_migration_path(
|
||||
None,
|
||||
AuxFilePolicy::V1
|
||||
));
|
||||
assert!(AuxFilePolicy::is_valid_migration_path(
|
||||
None,
|
||||
AuxFilePolicy::V2
|
||||
));
|
||||
assert!(AuxFilePolicy::is_valid_migration_path(
|
||||
None,
|
||||
AuxFilePolicy::CrossValidation
|
||||
));
|
||||
// Self-migration is not a valid migration path, and the caller should handle it by itself.
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V1),
|
||||
AuxFilePolicy::V1
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V2),
|
||||
AuxFilePolicy::V2
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::CrossValidation),
|
||||
AuxFilePolicy::CrossValidation
|
||||
));
|
||||
// Migrations not allowed
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::CrossValidation),
|
||||
AuxFilePolicy::V1
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V1),
|
||||
AuxFilePolicy::V2
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V2),
|
||||
AuxFilePolicy::V1
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V2),
|
||||
AuxFilePolicy::CrossValidation
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V1),
|
||||
AuxFilePolicy::CrossValidation
|
||||
));
|
||||
// Migrations allowed
|
||||
assert!(AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::CrossValidation),
|
||||
AuxFilePolicy::V2
|
||||
));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_aux_parse() {
|
||||
assert_eq!(AuxFilePolicy::from_str("V2").unwrap(), AuxFilePolicy::V2);
|
||||
assert_eq!(AuxFilePolicy::from_str("v2").unwrap(), AuxFilePolicy::V2);
|
||||
assert_eq!(
|
||||
AuxFilePolicy::from_str("cross-validation").unwrap(),
|
||||
AuxFilePolicy::CrossValidation
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_image_compression_algorithm_parsing() {
|
||||
use ImageCompressionAlgorithm::*;
|
||||
|
||||
113
libs/pageserver_api/src/record.rs
Normal file
113
libs/pageserver_api/src/record.rs
Normal file
@@ -0,0 +1,113 @@
|
||||
//! This module defines the WAL record format used within the pageserver.
|
||||
|
||||
use bytes::Bytes;
|
||||
use postgres_ffi::walrecord::{describe_postgres_wal_record, MultiXactMember};
|
||||
use postgres_ffi::{MultiXactId, MultiXactOffset, TimestampTz, TransactionId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::bin_ser::DeserializeError;
|
||||
|
||||
/// Each update to a page is represented by a NeonWalRecord. It can be a wrapper
|
||||
/// around a PostgreSQL WAL record, or a custom neon-specific "record".
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub enum NeonWalRecord {
|
||||
/// Native PostgreSQL WAL record
|
||||
Postgres { will_init: bool, rec: Bytes },
|
||||
|
||||
/// Clear bits in heap visibility map. ('flags' is bitmap of bits to clear)
|
||||
ClearVisibilityMapFlags {
|
||||
new_heap_blkno: Option<u32>,
|
||||
old_heap_blkno: Option<u32>,
|
||||
flags: u8,
|
||||
},
|
||||
/// Mark transaction IDs as committed on a CLOG page
|
||||
ClogSetCommitted {
|
||||
xids: Vec<TransactionId>,
|
||||
timestamp: TimestampTz,
|
||||
},
|
||||
/// Mark transaction IDs as aborted on a CLOG page
|
||||
ClogSetAborted { xids: Vec<TransactionId> },
|
||||
/// Extend multixact offsets SLRU
|
||||
MultixactOffsetCreate {
|
||||
mid: MultiXactId,
|
||||
moff: MultiXactOffset,
|
||||
},
|
||||
/// Extend multixact members SLRU.
|
||||
MultixactMembersCreate {
|
||||
moff: MultiXactOffset,
|
||||
members: Vec<MultiXactMember>,
|
||||
},
|
||||
/// Update the map of AUX files, either writing or dropping an entry
|
||||
AuxFile {
|
||||
file_path: String,
|
||||
content: Option<Bytes>,
|
||||
},
|
||||
|
||||
/// A testing record for unit testing purposes. It supports append data to an existing image, or clear it.
|
||||
#[cfg(feature = "testing")]
|
||||
Test {
|
||||
/// Append a string to the image.
|
||||
append: String,
|
||||
/// Clear the image before appending.
|
||||
clear: bool,
|
||||
/// Treat this record as an init record. `clear` should be set to true if this field is set
|
||||
/// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and
|
||||
/// its references in `timeline.rs`.
|
||||
will_init: bool,
|
||||
},
|
||||
}
|
||||
|
||||
impl NeonWalRecord {
|
||||
/// Does replaying this WAL record initialize the page from scratch, or does
|
||||
/// it need to be applied over the previous image of the page?
|
||||
pub fn will_init(&self) -> bool {
|
||||
// If you change this function, you'll also need to change ValueBytes::will_init
|
||||
match self {
|
||||
NeonWalRecord::Postgres { will_init, rec: _ } => *will_init,
|
||||
#[cfg(feature = "testing")]
|
||||
NeonWalRecord::Test { will_init, .. } => *will_init,
|
||||
// None of the special neon record types currently initialize the page
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
pub fn wal_append(s: impl AsRef<str>) -> Self {
|
||||
Self::Test {
|
||||
append: s.as_ref().to_string(),
|
||||
clear: false,
|
||||
will_init: false,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
pub fn wal_clear() -> Self {
|
||||
Self::Test {
|
||||
append: "".to_string(),
|
||||
clear: true,
|
||||
will_init: false,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
pub fn wal_init() -> Self {
|
||||
Self::Test {
|
||||
append: "".to_string(),
|
||||
clear: true,
|
||||
will_init: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Build a human-readable string to describe a WAL record
|
||||
///
|
||||
/// For debugging purposes
|
||||
pub fn describe_wal_record(rec: &NeonWalRecord) -> Result<String, DeserializeError> {
|
||||
match rec {
|
||||
NeonWalRecord::Postgres { will_init, rec } => Ok(format!(
|
||||
"will_init: {}, {}",
|
||||
will_init,
|
||||
describe_postgres_wal_record(rec)?
|
||||
)),
|
||||
_ => Ok(format!("{:?}", rec)),
|
||||
}
|
||||
}
|
||||
@@ -1,13 +1,16 @@
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::Result;
|
||||
//! This module defines the value type used by the storage engine.
|
||||
//!
|
||||
//! A [`Value`] represents either a completely new value for one Key ([`Value::Image`]),
|
||||
//! or a "delta" of how to get from previous version of the value to the new one
|
||||
//! ([`Value::WalRecord`]])
|
||||
//!
|
||||
//! Note that the [`Value`] type is used for the permananent storage format, so any
|
||||
//! changes to it must be backwards compatible.
|
||||
|
||||
use crate::record::NeonWalRecord;
|
||||
use bytes::Bytes;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::ops::AddAssign;
|
||||
use std::time::Duration;
|
||||
|
||||
pub use pageserver_api::key::{Key, KEY_SIZE};
|
||||
|
||||
/// A 'value' stored for a one Key.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub enum Value {
|
||||
/// An Image value contains a full copy of the value
|
||||
@@ -20,10 +23,12 @@ pub enum Value {
|
||||
}
|
||||
|
||||
impl Value {
|
||||
#[inline(always)]
|
||||
pub fn is_image(&self) -> bool {
|
||||
matches!(self, Value::Image(_))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn will_init(&self) -> bool {
|
||||
match self {
|
||||
Value::Image(_) => true,
|
||||
@@ -33,17 +38,18 @@ impl Value {
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub(crate) enum InvalidInput {
|
||||
pub enum InvalidInput {
|
||||
TooShortValue,
|
||||
TooShortPostgresRecord,
|
||||
}
|
||||
|
||||
/// We could have a ValueRef where everything is `serde(borrow)`. Before implementing that, lets
|
||||
/// use this type for querying if a slice looks some particular way.
|
||||
pub(crate) struct ValueBytes;
|
||||
pub struct ValueBytes;
|
||||
|
||||
impl ValueBytes {
|
||||
pub(crate) fn will_init(raw: &[u8]) -> Result<bool, InvalidInput> {
|
||||
#[inline(always)]
|
||||
pub fn will_init(raw: &[u8]) -> Result<bool, InvalidInput> {
|
||||
if raw.len() < 12 {
|
||||
return Err(InvalidInput::TooShortValue);
|
||||
}
|
||||
@@ -79,6 +85,7 @@ impl ValueBytes {
|
||||
mod test {
|
||||
use super::*;
|
||||
|
||||
use bytes::Bytes;
|
||||
use utils::bin_ser::BeSer;
|
||||
|
||||
macro_rules! roundtrip {
|
||||
@@ -229,56 +236,3 @@ mod test {
|
||||
assert!(!ValueBytes::will_init(&expected).unwrap());
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Result of performing GC
|
||||
///
|
||||
#[derive(Default, Serialize, Debug)]
|
||||
pub struct GcResult {
|
||||
pub layers_total: u64,
|
||||
pub layers_needed_by_cutoff: u64,
|
||||
pub layers_needed_by_pitr: u64,
|
||||
pub layers_needed_by_branches: u64,
|
||||
pub layers_needed_by_leases: u64,
|
||||
pub layers_not_updated: u64,
|
||||
pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
|
||||
|
||||
#[serde(serialize_with = "serialize_duration_as_millis")]
|
||||
pub elapsed: Duration,
|
||||
|
||||
/// The layers which were garbage collected.
|
||||
///
|
||||
/// Used in `/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc` to wait for the layers to be
|
||||
/// dropped in tests.
|
||||
#[cfg(feature = "testing")]
|
||||
#[serde(skip)]
|
||||
pub(crate) doomed_layers: Vec<crate::tenant::storage_layer::Layer>,
|
||||
}
|
||||
|
||||
// helper function for `GcResult`, serializing a `Duration` as an integer number of milliseconds
|
||||
fn serialize_duration_as_millis<S>(d: &Duration, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
d.as_millis().serialize(serializer)
|
||||
}
|
||||
|
||||
impl AddAssign for GcResult {
|
||||
fn add_assign(&mut self, other: Self) {
|
||||
self.layers_total += other.layers_total;
|
||||
self.layers_needed_by_pitr += other.layers_needed_by_pitr;
|
||||
self.layers_needed_by_cutoff += other.layers_needed_by_cutoff;
|
||||
self.layers_needed_by_branches += other.layers_needed_by_branches;
|
||||
self.layers_needed_by_leases += other.layers_needed_by_leases;
|
||||
self.layers_not_updated += other.layers_not_updated;
|
||||
self.layers_removed += other.layers_removed;
|
||||
|
||||
self.elapsed += other.elapsed;
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
{
|
||||
let mut other = other;
|
||||
self.doomed_layers.append(&mut other.doomed_layers);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -15,6 +15,7 @@ memoffset.workspace = true
|
||||
thiserror.workspace = true
|
||||
serde.workspace = true
|
||||
utils.workspace = true
|
||||
tracing.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
env_logger.workspace = true
|
||||
|
||||
@@ -36,6 +36,7 @@ macro_rules! postgres_ffi {
|
||||
pub mod controlfile_utils;
|
||||
pub mod nonrelfile_utils;
|
||||
pub mod wal_craft_test_export;
|
||||
pub mod wal_generator;
|
||||
pub mod waldecoder_handler;
|
||||
pub mod xlog_utils;
|
||||
|
||||
@@ -217,6 +218,7 @@ macro_rules! enum_pgversion {
|
||||
|
||||
pub mod pg_constants;
|
||||
pub mod relfile_utils;
|
||||
pub mod walrecord;
|
||||
|
||||
// Export some widely used datatypes that are unlikely to change across Postgres versions
|
||||
pub use v14::bindings::RepOriginId;
|
||||
|
||||
@@ -170,6 +170,7 @@ pub const RM_RELMAP_ID: u8 = 7;
|
||||
pub const RM_STANDBY_ID: u8 = 8;
|
||||
pub const RM_HEAP2_ID: u8 = 9;
|
||||
pub const RM_HEAP_ID: u8 = 10;
|
||||
pub const RM_BTREE_ID: u8 = 11;
|
||||
pub const RM_REPLORIGIN_ID: u8 = 19;
|
||||
pub const RM_LOGICALMSG_ID: u8 = 21;
|
||||
|
||||
|
||||
203
libs/postgres_ffi/src/wal_generator.rs
Normal file
203
libs/postgres_ffi/src/wal_generator.rs
Normal file
@@ -0,0 +1,203 @@
|
||||
use std::ffi::CStr;
|
||||
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use crc32c::crc32c_append;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use super::bindings::{XLogLongPageHeaderData, XLogPageHeaderData, XLOG_PAGE_MAGIC};
|
||||
use super::xlog_utils::{
|
||||
XlLogicalMessage, XLOG_RECORD_CRC_OFFS, XLOG_SIZE_OF_XLOG_RECORD, XLP_BKP_REMOVABLE,
|
||||
XLP_FIRST_IS_CONTRECORD,
|
||||
};
|
||||
use super::XLogRecord;
|
||||
use crate::pg_constants::{
|
||||
RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE, XLP_LONG_HEADER, XLR_BLOCK_ID_DATA_LONG,
|
||||
XLR_BLOCK_ID_DATA_SHORT,
|
||||
};
|
||||
use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
|
||||
|
||||
/// Generates binary WAL records for use in tests and benchmarks. Currently only generates logical
|
||||
/// messages (effectively noops) with a fixed payload. It is used as an iterator which yields
|
||||
/// encoded bytes for a single WAL record, including internal page headers if it spans pages.
|
||||
/// Concatenating the bytes will yield a complete, well-formed WAL, which can be chunked at segment
|
||||
/// boundaries if desired. Not optimized for performance.
|
||||
///
|
||||
/// The WAL format is version-dependant (see e.g. `XLOG_PAGE_MAGIC`), so make sure to import this
|
||||
/// for the appropriate Postgres version (e.g. `postgres_ffi::v17::wal_generator::WalGenerator`).
|
||||
///
|
||||
/// A WAL is split into 16 MB segments. Each segment is split into 8 KB pages, with headers.
|
||||
/// Records are arbitrary length, 8-byte aligned, and may span pages. The layout is e.g.:
|
||||
///
|
||||
/// | Segment 1 | Segment 2 | Segment 3 |
|
||||
/// | Page 1 | Page 2 | Page 3 | Page 4 | Page 5 | Page 6 | Page 7 | Page 8 | Page 9 |
|
||||
/// | R1 | R2 |R3| R4 | R5 | R6 | R7 | R8 |
|
||||
///
|
||||
/// TODO: support generating actual tables and rows.
|
||||
#[derive(Default)]
|
||||
pub struct WalGenerator {
|
||||
/// Current LSN to append the next record at.
|
||||
///
|
||||
/// Callers can modify this (and prev_lsn) to restart generation at a different LSN, but should
|
||||
/// ensure that the LSN is on a valid record boundary (i.e. we can't start appending in the
|
||||
/// middle on an existing record or header, or beyond the end of the existing WAL).
|
||||
pub lsn: Lsn,
|
||||
/// The starting LSN of the previous record. Used in WAL record headers. The Safekeeper doesn't
|
||||
/// care about this, unlike Postgres, but we include it for completeness.
|
||||
pub prev_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl WalGenerator {
|
||||
// For now, hardcode the message payload.
|
||||
// TODO: support specifying the payload size.
|
||||
const PREFIX: &CStr = c"prefix";
|
||||
const MESSAGE: &[u8] = b"message";
|
||||
|
||||
// Hardcode the sys, timeline, and DB IDs. We can make them configurable if we care about them.
|
||||
const SYS_ID: u64 = 0;
|
||||
const TIMELINE_ID: u32 = 1;
|
||||
const DB_ID: u32 = 0;
|
||||
|
||||
/// Creates a new WAL generator, which emits logical message records (noops).
|
||||
pub fn new() -> Self {
|
||||
Self::default()
|
||||
}
|
||||
|
||||
/// Encodes a logical message (basically a noop), with the given prefix and message.
|
||||
pub(crate) fn encode_logical_message(prefix: &CStr, message: &[u8]) -> Bytes {
|
||||
let prefix = prefix.to_bytes_with_nul();
|
||||
let header = XlLogicalMessage {
|
||||
db_id: Self::DB_ID,
|
||||
transactional: 0,
|
||||
prefix_size: prefix.len() as u64,
|
||||
message_size: message.len() as u64,
|
||||
};
|
||||
[&header.encode(), prefix, message].concat().into()
|
||||
}
|
||||
|
||||
/// Encode a WAL record with the given payload data (e.g. a logical message).
|
||||
pub(crate) fn encode_record(data: Bytes, rmid: u8, info: u8, prev_lsn: Lsn) -> Bytes {
|
||||
// Prefix data with block ID and length.
|
||||
let data_header = Bytes::from(match data.len() {
|
||||
0 => vec![],
|
||||
1..=255 => vec![XLR_BLOCK_ID_DATA_SHORT, data.len() as u8],
|
||||
256.. => {
|
||||
let len_bytes = (data.len() as u32).to_le_bytes();
|
||||
[&[XLR_BLOCK_ID_DATA_LONG], len_bytes.as_slice()].concat()
|
||||
}
|
||||
});
|
||||
|
||||
// Construct the WAL record header.
|
||||
let mut header = XLogRecord {
|
||||
xl_tot_len: (XLOG_SIZE_OF_XLOG_RECORD + data_header.len() + data.len()) as u32,
|
||||
xl_xid: 0,
|
||||
xl_prev: prev_lsn.into(),
|
||||
xl_info: info,
|
||||
xl_rmid: rmid,
|
||||
__bindgen_padding_0: [0; 2],
|
||||
xl_crc: 0, // see below
|
||||
};
|
||||
|
||||
// Compute the CRC checksum for the data, and the header up to the CRC field.
|
||||
let mut crc = 0;
|
||||
crc = crc32c_append(crc, &data_header);
|
||||
crc = crc32c_append(crc, &data);
|
||||
crc = crc32c_append(crc, &header.encode().unwrap()[0..XLOG_RECORD_CRC_OFFS]);
|
||||
header.xl_crc = crc;
|
||||
|
||||
// Encode the final header and record.
|
||||
let header = header.encode().unwrap();
|
||||
|
||||
[header, data_header, data].concat().into()
|
||||
}
|
||||
|
||||
/// Injects page headers on 8KB page boundaries. Takes the current LSN position where the record
|
||||
/// is to be appended.
|
||||
fn encode_pages(record: Bytes, mut lsn: Lsn) -> Bytes {
|
||||
// Fast path: record fits in current page, and the page already has a header.
|
||||
if lsn.remaining_in_block() as usize >= record.len() && lsn.block_offset() > 0 {
|
||||
return record;
|
||||
}
|
||||
|
||||
let mut pages = BytesMut::new();
|
||||
let mut remaining = record.clone(); // Bytes::clone() is cheap
|
||||
while !remaining.is_empty() {
|
||||
// At new page boundary, inject page header.
|
||||
if lsn.block_offset() == 0 {
|
||||
let mut page_header = XLogPageHeaderData {
|
||||
xlp_magic: XLOG_PAGE_MAGIC as u16,
|
||||
xlp_info: XLP_BKP_REMOVABLE,
|
||||
xlp_tli: Self::TIMELINE_ID,
|
||||
xlp_pageaddr: lsn.0,
|
||||
xlp_rem_len: 0,
|
||||
__bindgen_padding_0: [0; 4],
|
||||
};
|
||||
// If the record was split across page boundaries, mark as continuation.
|
||||
if remaining.len() < record.len() {
|
||||
page_header.xlp_rem_len = remaining.len() as u32;
|
||||
page_header.xlp_info |= XLP_FIRST_IS_CONTRECORD;
|
||||
}
|
||||
// At start of segment, use a long page header.
|
||||
let page_header = if lsn.segment_offset(WAL_SEGMENT_SIZE) == 0 {
|
||||
page_header.xlp_info |= XLP_LONG_HEADER;
|
||||
XLogLongPageHeaderData {
|
||||
std: page_header,
|
||||
xlp_sysid: Self::SYS_ID,
|
||||
xlp_seg_size: WAL_SEGMENT_SIZE as u32,
|
||||
xlp_xlog_blcksz: XLOG_BLCKSZ as u32,
|
||||
}
|
||||
.encode()
|
||||
.unwrap()
|
||||
} else {
|
||||
page_header.encode().unwrap()
|
||||
};
|
||||
pages.extend_from_slice(&page_header);
|
||||
lsn += page_header.len() as u64;
|
||||
}
|
||||
|
||||
// Append the record up to the next page boundary, if any.
|
||||
let page_free = lsn.remaining_in_block() as usize;
|
||||
let chunk = remaining.split_to(std::cmp::min(page_free, remaining.len()));
|
||||
pages.extend_from_slice(&chunk);
|
||||
lsn += chunk.len() as u64;
|
||||
}
|
||||
pages.freeze()
|
||||
}
|
||||
|
||||
/// Records must be 8-byte aligned. Take an encoded record (including any injected page
|
||||
/// boundaries), starting at the given LSN, and add any necessary padding at the end.
|
||||
fn pad_record(record: Bytes, mut lsn: Lsn) -> Bytes {
|
||||
lsn += record.len() as u64;
|
||||
let padding = lsn.calc_padding(8u64) as usize;
|
||||
if padding == 0 {
|
||||
return record;
|
||||
}
|
||||
[record, Bytes::from(vec![0; padding])].concat().into()
|
||||
}
|
||||
|
||||
/// Generates a record with an arbitrary payload at the current LSN, then increments the LSN.
|
||||
pub fn generate_record(&mut self, data: Bytes, rmid: u8, info: u8) -> Bytes {
|
||||
let record = Self::encode_record(data, rmid, info, self.prev_lsn);
|
||||
let record = Self::encode_pages(record, self.lsn);
|
||||
let record = Self::pad_record(record, self.lsn);
|
||||
self.prev_lsn = self.lsn;
|
||||
self.lsn += record.len() as u64;
|
||||
record
|
||||
}
|
||||
|
||||
/// Generates a logical message at the current LSN. Can be used to construct arbitrary messages.
|
||||
pub fn generate_logical_message(&mut self, prefix: &CStr, message: &[u8]) -> Bytes {
|
||||
let data = Self::encode_logical_message(prefix, message);
|
||||
self.generate_record(data, RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE)
|
||||
}
|
||||
}
|
||||
|
||||
/// Generate WAL records as an iterator.
|
||||
impl Iterator for WalGenerator {
|
||||
type Item = (Lsn, Bytes);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
let lsn = self.lsn;
|
||||
let record = self.generate_logical_message(Self::PREFIX, Self::MESSAGE);
|
||||
Some((lsn, record))
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -7,15 +7,14 @@
|
||||
// have been named the same as the corresponding PostgreSQL functions instead.
|
||||
//
|
||||
|
||||
use crc32c::crc32c_append;
|
||||
|
||||
use super::super::waldecoder::WalStreamDecoder;
|
||||
use super::bindings::{
|
||||
CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, FullTransactionId, TimeLineID, TimestampTz,
|
||||
XLogLongPageHeaderData, XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC,
|
||||
};
|
||||
use super::wal_generator::WalGenerator;
|
||||
use super::PG_MAJORVERSION;
|
||||
use crate::pg_constants;
|
||||
use crate::pg_constants::{self, RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE};
|
||||
use crate::PG_TLI;
|
||||
use crate::{uint32, uint64, Oid};
|
||||
use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
|
||||
@@ -26,7 +25,7 @@ use bytes::{Buf, Bytes};
|
||||
use log::*;
|
||||
|
||||
use serde::Serialize;
|
||||
use std::ffi::OsStr;
|
||||
use std::ffi::{CString, OsStr};
|
||||
use std::fs::File;
|
||||
use std::io::prelude::*;
|
||||
use std::io::ErrorKind;
|
||||
@@ -39,6 +38,7 @@ use utils::bin_ser::SerializeError;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
pub const XLOG_FNAME_LEN: usize = 24;
|
||||
pub const XLP_BKP_REMOVABLE: u16 = 0x0004;
|
||||
pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
|
||||
pub const XLP_REM_LEN_OFFS: usize = 2 + 2 + 4 + 8;
|
||||
pub const XLOG_RECORD_CRC_OFFS: usize = 4 + 4 + 8 + 1 + 1 + 2;
|
||||
@@ -489,64 +489,16 @@ impl XlLogicalMessage {
|
||||
/// Create new WAL record for non-transactional logical message.
|
||||
/// Used for creating artificial WAL for tests, as LogicalMessage
|
||||
/// record is basically no-op.
|
||||
///
|
||||
/// NOTE: This leaves the xl_prev field zero. The safekeeper and
|
||||
/// pageserver tolerate that, but PostgreSQL does not.
|
||||
pub fn encode_logical_message(prefix: &str, message: &str) -> Vec<u8> {
|
||||
let mut prefix_bytes: Vec<u8> = Vec::with_capacity(prefix.len() + 1);
|
||||
prefix_bytes.write_all(prefix.as_bytes()).unwrap();
|
||||
prefix_bytes.push(0);
|
||||
|
||||
let message_bytes = message.as_bytes();
|
||||
|
||||
let logical_message = XlLogicalMessage {
|
||||
db_id: 0,
|
||||
transactional: 0,
|
||||
prefix_size: prefix_bytes.len() as u64,
|
||||
message_size: message_bytes.len() as u64,
|
||||
};
|
||||
|
||||
let mainrdata = logical_message.encode();
|
||||
let mainrdata_len: usize = mainrdata.len() + prefix_bytes.len() + message_bytes.len();
|
||||
// only short mainrdata is supported for now
|
||||
assert!(mainrdata_len <= 255);
|
||||
let mainrdata_len = mainrdata_len as u8;
|
||||
|
||||
let mut data: Vec<u8> = vec![pg_constants::XLR_BLOCK_ID_DATA_SHORT, mainrdata_len];
|
||||
data.extend_from_slice(&mainrdata);
|
||||
data.extend_from_slice(&prefix_bytes);
|
||||
data.extend_from_slice(message_bytes);
|
||||
|
||||
let total_len = XLOG_SIZE_OF_XLOG_RECORD + data.len();
|
||||
|
||||
let mut header = XLogRecord {
|
||||
xl_tot_len: total_len as u32,
|
||||
xl_xid: 0,
|
||||
xl_prev: 0,
|
||||
xl_info: 0,
|
||||
xl_rmid: 21,
|
||||
__bindgen_padding_0: [0u8; 2usize],
|
||||
xl_crc: 0, // crc will be calculated later
|
||||
};
|
||||
|
||||
let header_bytes = header.encode().expect("failed to encode header");
|
||||
let crc = crc32c_append(0, &data);
|
||||
let crc = crc32c_append(crc, &header_bytes[0..XLOG_RECORD_CRC_OFFS]);
|
||||
header.xl_crc = crc;
|
||||
|
||||
let mut wal: Vec<u8> = Vec::new();
|
||||
wal.extend_from_slice(&header.encode().expect("failed to encode header"));
|
||||
wal.extend_from_slice(&data);
|
||||
|
||||
// WAL start position must be aligned at 8 bytes,
|
||||
// this will add padding for the next WAL record.
|
||||
const PADDING: usize = 8;
|
||||
let padding_rem = wal.len() % PADDING;
|
||||
if padding_rem != 0 {
|
||||
wal.resize(wal.len() + PADDING - padding_rem, 0);
|
||||
}
|
||||
|
||||
wal
|
||||
pub fn encode_logical_message(prefix: &str, message: &str) -> Bytes {
|
||||
// This function can take untrusted input, so discard any NUL bytes in the prefix string.
|
||||
let prefix = CString::new(prefix.replace('\0', "")).expect("no NULs");
|
||||
let message = message.as_bytes();
|
||||
WalGenerator::encode_record(
|
||||
WalGenerator::encode_logical_message(&prefix, message),
|
||||
RM_LOGICALMSG_ID,
|
||||
XLOG_LOGICAL_MESSAGE,
|
||||
Lsn(0),
|
||||
)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -12,7 +12,7 @@ use crate::seqwait::MonotonicCounter;
|
||||
pub const XLOG_BLCKSZ: u32 = 8192;
|
||||
|
||||
/// A Postgres LSN (Log Sequence Number), also known as an XLogRecPtr
|
||||
#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash)]
|
||||
#[derive(Clone, Copy, Default, Eq, Ord, PartialEq, PartialOrd, Hash)]
|
||||
pub struct Lsn(pub u64);
|
||||
|
||||
impl Serialize for Lsn {
|
||||
|
||||
19
libs/wal_decoder/Cargo.toml
Normal file
19
libs/wal_decoder/Cargo.toml
Normal file
@@ -0,0 +1,19 @@
|
||||
[package]
|
||||
name = "wal_decoder"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[features]
|
||||
testing = []
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
bytes.workspace = true
|
||||
once_cell.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
serde.workspace = true
|
||||
tracing.workspace = true
|
||||
utils.workspace = true
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
990
libs/wal_decoder/src/decoder.rs
Normal file
990
libs/wal_decoder/src/decoder.rs
Normal file
@@ -0,0 +1,990 @@
|
||||
//! This module contains logic for decoding and interpreting
|
||||
//! raw bytes which represent a raw Postgres WAL record.
|
||||
|
||||
use crate::models::*;
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use pageserver_api::key::rel_block_to_key;
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use pageserver_api::value::Value;
|
||||
use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
|
||||
use postgres_ffi::walrecord::*;
|
||||
use postgres_ffi::{page_is_new, page_set_lsn, pg_constants, BLCKSZ};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
impl InterpretedWalRecord {
|
||||
/// Decode and interpreted raw bytes which represent one Postgres WAL record.
|
||||
/// Data blocks which do not match the provided shard identity are filtered out.
|
||||
/// Shard 0 is a special case since it tracks all relation sizes. We only give it
|
||||
/// the keys that are being written as that is enough for updating relation sizes.
|
||||
pub fn from_bytes_filtered(
|
||||
buf: Bytes,
|
||||
shard: &ShardIdentity,
|
||||
lsn: Lsn,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<InterpretedWalRecord> {
|
||||
let mut decoded = DecodedWALRecord::default();
|
||||
decode_wal_record(buf, &mut decoded, pg_version)?;
|
||||
|
||||
let flush_uncommitted = if decoded.is_dbase_create_copy(pg_version) {
|
||||
FlushUncommittedRecords::Yes
|
||||
} else {
|
||||
FlushUncommittedRecords::No
|
||||
};
|
||||
|
||||
let metadata_record = MetadataRecord::from_decoded(&decoded, lsn, pg_version)?;
|
||||
|
||||
let mut blocks = Vec::default();
|
||||
for blk in decoded.blocks.iter() {
|
||||
let rel = RelTag {
|
||||
spcnode: blk.rnode_spcnode,
|
||||
dbnode: blk.rnode_dbnode,
|
||||
relnode: blk.rnode_relnode,
|
||||
forknum: blk.forknum,
|
||||
};
|
||||
|
||||
let key = rel_block_to_key(rel, blk.blkno);
|
||||
|
||||
if !key.is_valid_key_on_write_path() {
|
||||
anyhow::bail!("Unsupported key decoded at LSN {}: {}", lsn, key);
|
||||
}
|
||||
|
||||
let key_is_local = shard.is_key_local(&key);
|
||||
|
||||
tracing::debug!(
|
||||
lsn=%lsn,
|
||||
key=%key,
|
||||
"ingest: shard decision {}",
|
||||
if !key_is_local { "drop" } else { "keep" },
|
||||
);
|
||||
|
||||
if !key_is_local {
|
||||
if shard.is_shard_zero() {
|
||||
// Shard 0 tracks relation sizes. Although we will not store this block, we will observe
|
||||
// its blkno in case it implicitly extends a relation.
|
||||
blocks.push((key.to_compact(), None));
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Instead of storing full-page-image WAL record,
|
||||
// it is better to store extracted image: we can skip wal-redo
|
||||
// in this case. Also some FPI records may contain multiple (up to 32) pages,
|
||||
// so them have to be copied multiple times.
|
||||
//
|
||||
let value = if blk.apply_image
|
||||
&& blk.has_image
|
||||
&& decoded.xl_rmid == pg_constants::RM_XLOG_ID
|
||||
&& (decoded.xl_info == pg_constants::XLOG_FPI
|
||||
|| decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
|
||||
// compression of WAL is not yet supported: fall back to storing the original WAL record
|
||||
&& !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version)
|
||||
// do not materialize null pages because them most likely be soon replaced with real data
|
||||
&& blk.bimg_len != 0
|
||||
{
|
||||
// Extract page image from FPI record
|
||||
let img_len = blk.bimg_len as usize;
|
||||
let img_offs = blk.bimg_offset as usize;
|
||||
let mut image = BytesMut::with_capacity(BLCKSZ as usize);
|
||||
// TODO(vlad): skip the copy
|
||||
image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]);
|
||||
|
||||
if blk.hole_length != 0 {
|
||||
let tail = image.split_off(blk.hole_offset as usize);
|
||||
image.resize(image.len() + blk.hole_length as usize, 0u8);
|
||||
image.unsplit(tail);
|
||||
}
|
||||
//
|
||||
// Match the logic of XLogReadBufferForRedoExtended:
|
||||
// The page may be uninitialized. If so, we can't set the LSN because
|
||||
// that would corrupt the page.
|
||||
//
|
||||
if !page_is_new(&image) {
|
||||
page_set_lsn(&mut image, lsn)
|
||||
}
|
||||
assert_eq!(image.len(), BLCKSZ as usize);
|
||||
|
||||
Value::Image(image.freeze())
|
||||
} else {
|
||||
Value::WalRecord(NeonWalRecord::Postgres {
|
||||
will_init: blk.will_init || blk.apply_image,
|
||||
rec: decoded.record.clone(),
|
||||
})
|
||||
};
|
||||
|
||||
blocks.push((key.to_compact(), Some(value)));
|
||||
}
|
||||
|
||||
Ok(InterpretedWalRecord {
|
||||
metadata_record,
|
||||
blocks,
|
||||
lsn,
|
||||
flush_uncommitted,
|
||||
xid: decoded.xl_xid,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl MetadataRecord {
|
||||
fn from_decoded(
|
||||
decoded: &DecodedWALRecord,
|
||||
lsn: Lsn,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Option<MetadataRecord>> {
|
||||
// Note: this doesn't actually copy the bytes since
|
||||
// the [`Bytes`] type implements it via a level of indirection.
|
||||
let mut buf = decoded.record.clone();
|
||||
buf.advance(decoded.main_data_offset);
|
||||
|
||||
match decoded.xl_rmid {
|
||||
pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => {
|
||||
Self::decode_heapam_record(&mut buf, decoded, pg_version)
|
||||
}
|
||||
pg_constants::RM_NEON_ID => Self::decode_neonmgr_record(&mut buf, decoded, pg_version),
|
||||
// Handle other special record types
|
||||
pg_constants::RM_SMGR_ID => Self::decode_smgr_record(&mut buf, decoded),
|
||||
pg_constants::RM_DBASE_ID => Self::decode_dbase_record(&mut buf, decoded, pg_version),
|
||||
pg_constants::RM_TBLSPC_ID => {
|
||||
tracing::trace!("XLOG_TBLSPC_CREATE/DROP is not handled yet");
|
||||
Ok(None)
|
||||
}
|
||||
pg_constants::RM_CLOG_ID => Self::decode_clog_record(&mut buf, decoded, pg_version),
|
||||
pg_constants::RM_XACT_ID => Self::decode_xact_record(&mut buf, decoded, lsn),
|
||||
pg_constants::RM_MULTIXACT_ID => {
|
||||
Self::decode_multixact_record(&mut buf, decoded, pg_version)
|
||||
}
|
||||
pg_constants::RM_RELMAP_ID => Self::decode_relmap_record(&mut buf, decoded),
|
||||
// This is an odd duck. It needs to go to all shards.
|
||||
// Since it uses the checkpoint image (that's initialized from CHECKPOINT_KEY
|
||||
// in WalIngest::new), we have to send the whole DecodedWalRecord::record to
|
||||
// the pageserver and decode it there.
|
||||
//
|
||||
// Alternatively, one can make the checkpoint part of the subscription protocol
|
||||
// to the pageserver. This should work fine, but can be done at a later point.
|
||||
pg_constants::RM_XLOG_ID => Self::decode_xlog_record(&mut buf, decoded, lsn),
|
||||
pg_constants::RM_LOGICALMSG_ID => {
|
||||
Self::decode_logical_message_record(&mut buf, decoded)
|
||||
}
|
||||
pg_constants::RM_STANDBY_ID => Self::decode_standby_record(&mut buf, decoded),
|
||||
pg_constants::RM_REPLORIGIN_ID => Self::decode_replorigin_record(&mut buf, decoded),
|
||||
pg_constants::RM_BTREE_ID => {
|
||||
// No special handling required for these record types.
|
||||
// We just ingest the blocks that come with it.
|
||||
Ok(None)
|
||||
}
|
||||
unexpected => {
|
||||
// TODO: consider failing here instead of blindly doing something without
|
||||
// understanding the protocol
|
||||
use once_cell::sync::Lazy;
|
||||
use std::sync::Mutex;
|
||||
use std::time::Duration;
|
||||
use utils::rate_limit::RateLimit;
|
||||
|
||||
static LOGGED: Lazy<Mutex<RateLimit>> =
|
||||
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
|
||||
let mut rate_limit = LOGGED.try_lock().unwrap();
|
||||
rate_limit.call(|| {
|
||||
tracing::warn!(
|
||||
"Unexpected resource manager id in PG WAL record at LSN {}: {}",
|
||||
lsn,
|
||||
unexpected
|
||||
);
|
||||
});
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn decode_heapam_record(
|
||||
buf: &mut Bytes,
|
||||
decoded: &DecodedWALRecord,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Option<MetadataRecord>> {
|
||||
// Handle VM bit updates that are implicitly part of heap records.
|
||||
|
||||
// First, look at the record to determine which VM bits need
|
||||
// to be cleared. If either of these variables is set, we
|
||||
// need to clear the corresponding bits in the visibility map.
|
||||
let mut new_heap_blkno: Option<u32> = None;
|
||||
let mut old_heap_blkno: Option<u32> = None;
|
||||
let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS;
|
||||
|
||||
match pg_version {
|
||||
14 => {
|
||||
if decoded.xl_rmid == pg_constants::RM_HEAP_ID {
|
||||
let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
|
||||
|
||||
if info == pg_constants::XLOG_HEAP_INSERT {
|
||||
let xlrec = v14::XlHeapInsert::decode(buf);
|
||||
assert_eq!(0, buf.remaining());
|
||||
if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP_DELETE {
|
||||
let xlrec = v14::XlHeapDelete::decode(buf);
|
||||
if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP_UPDATE
|
||||
|| info == pg_constants::XLOG_HEAP_HOT_UPDATE
|
||||
{
|
||||
let xlrec = v14::XlHeapUpdate::decode(buf);
|
||||
// the size of tuple data is inferred from the size of the record.
|
||||
// we can't validate the remaining number of bytes without parsing
|
||||
// the tuple data.
|
||||
if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 {
|
||||
old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno);
|
||||
}
|
||||
if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
|
||||
// PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a
|
||||
// non-HOT update where the new tuple goes to different page than
|
||||
// the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is
|
||||
// set.
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP_LOCK {
|
||||
let xlrec = v14::XlHeapLock::decode(buf);
|
||||
if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
|
||||
old_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
|
||||
}
|
||||
}
|
||||
} else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID {
|
||||
let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
|
||||
if info == pg_constants::XLOG_HEAP2_MULTI_INSERT {
|
||||
let xlrec = v14::XlHeapMultiInsert::decode(buf);
|
||||
|
||||
let offset_array_len =
|
||||
if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 {
|
||||
// the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
|
||||
0
|
||||
} else {
|
||||
size_of::<u16>() * xlrec.ntuples as usize
|
||||
};
|
||||
assert_eq!(offset_array_len, buf.remaining());
|
||||
|
||||
if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED {
|
||||
let xlrec = v14::XlHeapLockUpdated::decode(buf);
|
||||
if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
|
||||
old_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
|
||||
}
|
||||
}
|
||||
15 => {
|
||||
if decoded.xl_rmid == pg_constants::RM_HEAP_ID {
|
||||
let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
|
||||
|
||||
if info == pg_constants::XLOG_HEAP_INSERT {
|
||||
let xlrec = v15::XlHeapInsert::decode(buf);
|
||||
assert_eq!(0, buf.remaining());
|
||||
if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP_DELETE {
|
||||
let xlrec = v15::XlHeapDelete::decode(buf);
|
||||
if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP_UPDATE
|
||||
|| info == pg_constants::XLOG_HEAP_HOT_UPDATE
|
||||
{
|
||||
let xlrec = v15::XlHeapUpdate::decode(buf);
|
||||
// the size of tuple data is inferred from the size of the record.
|
||||
// we can't validate the remaining number of bytes without parsing
|
||||
// the tuple data.
|
||||
if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 {
|
||||
old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno);
|
||||
}
|
||||
if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
|
||||
// PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a
|
||||
// non-HOT update where the new tuple goes to different page than
|
||||
// the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is
|
||||
// set.
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP_LOCK {
|
||||
let xlrec = v15::XlHeapLock::decode(buf);
|
||||
if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
|
||||
old_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
|
||||
}
|
||||
}
|
||||
} else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID {
|
||||
let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
|
||||
if info == pg_constants::XLOG_HEAP2_MULTI_INSERT {
|
||||
let xlrec = v15::XlHeapMultiInsert::decode(buf);
|
||||
|
||||
let offset_array_len =
|
||||
if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 {
|
||||
// the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
|
||||
0
|
||||
} else {
|
||||
size_of::<u16>() * xlrec.ntuples as usize
|
||||
};
|
||||
assert_eq!(offset_array_len, buf.remaining());
|
||||
|
||||
if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED {
|
||||
let xlrec = v15::XlHeapLockUpdated::decode(buf);
|
||||
if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
|
||||
old_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
|
||||
}
|
||||
}
|
||||
16 => {
|
||||
if decoded.xl_rmid == pg_constants::RM_HEAP_ID {
|
||||
let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
|
||||
|
||||
if info == pg_constants::XLOG_HEAP_INSERT {
|
||||
let xlrec = v16::XlHeapInsert::decode(buf);
|
||||
assert_eq!(0, buf.remaining());
|
||||
if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP_DELETE {
|
||||
let xlrec = v16::XlHeapDelete::decode(buf);
|
||||
if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP_UPDATE
|
||||
|| info == pg_constants::XLOG_HEAP_HOT_UPDATE
|
||||
{
|
||||
let xlrec = v16::XlHeapUpdate::decode(buf);
|
||||
// the size of tuple data is inferred from the size of the record.
|
||||
// we can't validate the remaining number of bytes without parsing
|
||||
// the tuple data.
|
||||
if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 {
|
||||
old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno);
|
||||
}
|
||||
if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
|
||||
// PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a
|
||||
// non-HOT update where the new tuple goes to different page than
|
||||
// the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is
|
||||
// set.
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP_LOCK {
|
||||
let xlrec = v16::XlHeapLock::decode(buf);
|
||||
if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
|
||||
old_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
|
||||
}
|
||||
}
|
||||
} else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID {
|
||||
let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
|
||||
if info == pg_constants::XLOG_HEAP2_MULTI_INSERT {
|
||||
let xlrec = v16::XlHeapMultiInsert::decode(buf);
|
||||
|
||||
let offset_array_len =
|
||||
if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 {
|
||||
// the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
|
||||
0
|
||||
} else {
|
||||
size_of::<u16>() * xlrec.ntuples as usize
|
||||
};
|
||||
assert_eq!(offset_array_len, buf.remaining());
|
||||
|
||||
if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED {
|
||||
let xlrec = v16::XlHeapLockUpdated::decode(buf);
|
||||
if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
|
||||
old_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
|
||||
}
|
||||
}
|
||||
17 => {
|
||||
if decoded.xl_rmid == pg_constants::RM_HEAP_ID {
|
||||
let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
|
||||
|
||||
if info == pg_constants::XLOG_HEAP_INSERT {
|
||||
let xlrec = v17::XlHeapInsert::decode(buf);
|
||||
assert_eq!(0, buf.remaining());
|
||||
if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP_DELETE {
|
||||
let xlrec = v17::XlHeapDelete::decode(buf);
|
||||
if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP_UPDATE
|
||||
|| info == pg_constants::XLOG_HEAP_HOT_UPDATE
|
||||
{
|
||||
let xlrec = v17::XlHeapUpdate::decode(buf);
|
||||
// the size of tuple data is inferred from the size of the record.
|
||||
// we can't validate the remaining number of bytes without parsing
|
||||
// the tuple data.
|
||||
if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 {
|
||||
old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno);
|
||||
}
|
||||
if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
|
||||
// PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a
|
||||
// non-HOT update where the new tuple goes to different page than
|
||||
// the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is
|
||||
// set.
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP_LOCK {
|
||||
let xlrec = v17::XlHeapLock::decode(buf);
|
||||
if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
|
||||
old_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
|
||||
}
|
||||
}
|
||||
} else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID {
|
||||
let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
|
||||
if info == pg_constants::XLOG_HEAP2_MULTI_INSERT {
|
||||
let xlrec = v17::XlHeapMultiInsert::decode(buf);
|
||||
|
||||
let offset_array_len =
|
||||
if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 {
|
||||
// the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
|
||||
0
|
||||
} else {
|
||||
size_of::<u16>() * xlrec.ntuples as usize
|
||||
};
|
||||
assert_eq!(offset_array_len, buf.remaining());
|
||||
|
||||
if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED {
|
||||
let xlrec = v17::XlHeapLockUpdated::decode(buf);
|
||||
if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
|
||||
old_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
if new_heap_blkno.is_some() || old_heap_blkno.is_some() {
|
||||
let vm_rel = RelTag {
|
||||
forknum: VISIBILITYMAP_FORKNUM,
|
||||
spcnode: decoded.blocks[0].rnode_spcnode,
|
||||
dbnode: decoded.blocks[0].rnode_dbnode,
|
||||
relnode: decoded.blocks[0].rnode_relnode,
|
||||
};
|
||||
|
||||
Ok(Some(MetadataRecord::Heapam(HeapamRecord::ClearVmBits(
|
||||
ClearVmBits {
|
||||
new_heap_blkno,
|
||||
old_heap_blkno,
|
||||
vm_rel,
|
||||
flags,
|
||||
},
|
||||
))))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
fn decode_neonmgr_record(
|
||||
buf: &mut Bytes,
|
||||
decoded: &DecodedWALRecord,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Option<MetadataRecord>> {
|
||||
// Handle VM bit updates that are implicitly part of heap records.
|
||||
|
||||
// First, look at the record to determine which VM bits need
|
||||
// to be cleared. If either of these variables is set, we
|
||||
// need to clear the corresponding bits in the visibility map.
|
||||
let mut new_heap_blkno: Option<u32> = None;
|
||||
let mut old_heap_blkno: Option<u32> = None;
|
||||
let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS;
|
||||
|
||||
assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID);
|
||||
|
||||
match pg_version {
|
||||
16 | 17 => {
|
||||
let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
|
||||
|
||||
match info {
|
||||
pg_constants::XLOG_NEON_HEAP_INSERT => {
|
||||
let xlrec = v17::rm_neon::XlNeonHeapInsert::decode(buf);
|
||||
assert_eq!(0, buf.remaining());
|
||||
if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
}
|
||||
pg_constants::XLOG_NEON_HEAP_DELETE => {
|
||||
let xlrec = v17::rm_neon::XlNeonHeapDelete::decode(buf);
|
||||
if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
}
|
||||
pg_constants::XLOG_NEON_HEAP_UPDATE
|
||||
| pg_constants::XLOG_NEON_HEAP_HOT_UPDATE => {
|
||||
let xlrec = v17::rm_neon::XlNeonHeapUpdate::decode(buf);
|
||||
// the size of tuple data is inferred from the size of the record.
|
||||
// we can't validate the remaining number of bytes without parsing
|
||||
// the tuple data.
|
||||
if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 {
|
||||
old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno);
|
||||
}
|
||||
if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
|
||||
// PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a
|
||||
// non-HOT update where the new tuple goes to different page than
|
||||
// the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is
|
||||
// set.
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
}
|
||||
pg_constants::XLOG_NEON_HEAP_MULTI_INSERT => {
|
||||
let xlrec = v17::rm_neon::XlNeonHeapMultiInsert::decode(buf);
|
||||
|
||||
let offset_array_len =
|
||||
if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 {
|
||||
// the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
|
||||
0
|
||||
} else {
|
||||
size_of::<u16>() * xlrec.ntuples as usize
|
||||
};
|
||||
assert_eq!(offset_array_len, buf.remaining());
|
||||
|
||||
if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
}
|
||||
pg_constants::XLOG_NEON_HEAP_LOCK => {
|
||||
let xlrec = v17::rm_neon::XlNeonHeapLock::decode(buf);
|
||||
if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
|
||||
old_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
|
||||
}
|
||||
}
|
||||
info => anyhow::bail!("Unknown WAL record type for Neon RMGR: {}", info),
|
||||
}
|
||||
}
|
||||
_ => anyhow::bail!(
|
||||
"Neon RMGR has no known compatibility with PostgreSQL version {}",
|
||||
pg_version
|
||||
),
|
||||
}
|
||||
|
||||
if new_heap_blkno.is_some() || old_heap_blkno.is_some() {
|
||||
let vm_rel = RelTag {
|
||||
forknum: VISIBILITYMAP_FORKNUM,
|
||||
spcnode: decoded.blocks[0].rnode_spcnode,
|
||||
dbnode: decoded.blocks[0].rnode_dbnode,
|
||||
relnode: decoded.blocks[0].rnode_relnode,
|
||||
};
|
||||
|
||||
Ok(Some(MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(
|
||||
ClearVmBits {
|
||||
new_heap_blkno,
|
||||
old_heap_blkno,
|
||||
vm_rel,
|
||||
flags,
|
||||
},
|
||||
))))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
fn decode_smgr_record(
|
||||
buf: &mut Bytes,
|
||||
decoded: &DecodedWALRecord,
|
||||
) -> anyhow::Result<Option<MetadataRecord>> {
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_SMGR_CREATE {
|
||||
let create = XlSmgrCreate::decode(buf);
|
||||
let rel = RelTag {
|
||||
spcnode: create.rnode.spcnode,
|
||||
dbnode: create.rnode.dbnode,
|
||||
relnode: create.rnode.relnode,
|
||||
forknum: create.forknum,
|
||||
};
|
||||
|
||||
return Ok(Some(MetadataRecord::Smgr(SmgrRecord::Create(SmgrCreate {
|
||||
rel,
|
||||
}))));
|
||||
} else if info == pg_constants::XLOG_SMGR_TRUNCATE {
|
||||
let truncate = XlSmgrTruncate::decode(buf);
|
||||
return Ok(Some(MetadataRecord::Smgr(SmgrRecord::Truncate(truncate))));
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
fn decode_dbase_record(
|
||||
buf: &mut Bytes,
|
||||
decoded: &DecodedWALRecord,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Option<MetadataRecord>> {
|
||||
// TODO: Refactor this to avoid the duplication between postgres versions.
|
||||
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
tracing::debug!(%info, %pg_version, "handle RM_DBASE_ID");
|
||||
|
||||
if pg_version == 14 {
|
||||
if info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE {
|
||||
let createdb = XlCreateDatabase::decode(buf);
|
||||
tracing::debug!("XLOG_DBASE_CREATE v14");
|
||||
|
||||
let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate {
|
||||
db_id: createdb.db_id,
|
||||
tablespace_id: createdb.tablespace_id,
|
||||
src_db_id: createdb.src_db_id,
|
||||
src_tablespace_id: createdb.src_tablespace_id,
|
||||
}));
|
||||
|
||||
return Ok(Some(record));
|
||||
} else if info == postgres_ffi::v14::bindings::XLOG_DBASE_DROP {
|
||||
let dropdb = XlDropDatabase::decode(buf);
|
||||
|
||||
let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop {
|
||||
db_id: dropdb.db_id,
|
||||
tablespace_ids: dropdb.tablespace_ids,
|
||||
}));
|
||||
|
||||
return Ok(Some(record));
|
||||
}
|
||||
} else if pg_version == 15 {
|
||||
if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG {
|
||||
tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
|
||||
} else if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY {
|
||||
// The XLOG record was renamed between v14 and v15,
|
||||
// but the record format is the same.
|
||||
// So we can reuse XlCreateDatabase here.
|
||||
tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY");
|
||||
|
||||
let createdb = XlCreateDatabase::decode(buf);
|
||||
let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate {
|
||||
db_id: createdb.db_id,
|
||||
tablespace_id: createdb.tablespace_id,
|
||||
src_db_id: createdb.src_db_id,
|
||||
src_tablespace_id: createdb.src_tablespace_id,
|
||||
}));
|
||||
|
||||
return Ok(Some(record));
|
||||
} else if info == postgres_ffi::v15::bindings::XLOG_DBASE_DROP {
|
||||
let dropdb = XlDropDatabase::decode(buf);
|
||||
let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop {
|
||||
db_id: dropdb.db_id,
|
||||
tablespace_ids: dropdb.tablespace_ids,
|
||||
}));
|
||||
|
||||
return Ok(Some(record));
|
||||
}
|
||||
} else if pg_version == 16 {
|
||||
if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG {
|
||||
tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
|
||||
} else if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY {
|
||||
// The XLOG record was renamed between v14 and v15,
|
||||
// but the record format is the same.
|
||||
// So we can reuse XlCreateDatabase here.
|
||||
tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY");
|
||||
|
||||
let createdb = XlCreateDatabase::decode(buf);
|
||||
let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate {
|
||||
db_id: createdb.db_id,
|
||||
tablespace_id: createdb.tablespace_id,
|
||||
src_db_id: createdb.src_db_id,
|
||||
src_tablespace_id: createdb.src_tablespace_id,
|
||||
}));
|
||||
|
||||
return Ok(Some(record));
|
||||
} else if info == postgres_ffi::v16::bindings::XLOG_DBASE_DROP {
|
||||
let dropdb = XlDropDatabase::decode(buf);
|
||||
let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop {
|
||||
db_id: dropdb.db_id,
|
||||
tablespace_ids: dropdb.tablespace_ids,
|
||||
}));
|
||||
|
||||
return Ok(Some(record));
|
||||
}
|
||||
} else if pg_version == 17 {
|
||||
if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_WAL_LOG {
|
||||
tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
|
||||
} else if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY {
|
||||
// The XLOG record was renamed between v14 and v15,
|
||||
// but the record format is the same.
|
||||
// So we can reuse XlCreateDatabase here.
|
||||
tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY");
|
||||
|
||||
let createdb = XlCreateDatabase::decode(buf);
|
||||
let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate {
|
||||
db_id: createdb.db_id,
|
||||
tablespace_id: createdb.tablespace_id,
|
||||
src_db_id: createdb.src_db_id,
|
||||
src_tablespace_id: createdb.src_tablespace_id,
|
||||
}));
|
||||
|
||||
return Ok(Some(record));
|
||||
} else if info == postgres_ffi::v17::bindings::XLOG_DBASE_DROP {
|
||||
let dropdb = XlDropDatabase::decode(buf);
|
||||
let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop {
|
||||
db_id: dropdb.db_id,
|
||||
tablespace_ids: dropdb.tablespace_ids,
|
||||
}));
|
||||
|
||||
return Ok(Some(record));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
fn decode_clog_record(
|
||||
buf: &mut Bytes,
|
||||
decoded: &DecodedWALRecord,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Option<MetadataRecord>> {
|
||||
let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK;
|
||||
|
||||
if info == pg_constants::CLOG_ZEROPAGE {
|
||||
let pageno = if pg_version < 17 {
|
||||
buf.get_u32_le()
|
||||
} else {
|
||||
buf.get_u64_le() as u32
|
||||
};
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
|
||||
Ok(Some(MetadataRecord::Clog(ClogRecord::ZeroPage(
|
||||
ClogZeroPage { segno, rpageno },
|
||||
))))
|
||||
} else {
|
||||
assert!(info == pg_constants::CLOG_TRUNCATE);
|
||||
let xlrec = XlClogTruncate::decode(buf, pg_version);
|
||||
|
||||
Ok(Some(MetadataRecord::Clog(ClogRecord::Truncate(
|
||||
ClogTruncate {
|
||||
pageno: xlrec.pageno,
|
||||
oldest_xid: xlrec.oldest_xid,
|
||||
oldest_xid_db: xlrec.oldest_xid_db,
|
||||
},
|
||||
))))
|
||||
}
|
||||
}
|
||||
|
||||
fn decode_xact_record(
|
||||
buf: &mut Bytes,
|
||||
decoded: &DecodedWALRecord,
|
||||
lsn: Lsn,
|
||||
) -> anyhow::Result<Option<MetadataRecord>> {
|
||||
let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK;
|
||||
let origin_id = decoded.origin_id;
|
||||
let xl_xid = decoded.xl_xid;
|
||||
|
||||
if info == pg_constants::XLOG_XACT_COMMIT {
|
||||
let parsed = XlXactParsedRecord::decode(buf, decoded.xl_xid, decoded.xl_info);
|
||||
return Ok(Some(MetadataRecord::Xact(XactRecord::Commit(XactCommon {
|
||||
parsed,
|
||||
origin_id,
|
||||
xl_xid,
|
||||
lsn,
|
||||
}))));
|
||||
} else if info == pg_constants::XLOG_XACT_ABORT {
|
||||
let parsed = XlXactParsedRecord::decode(buf, decoded.xl_xid, decoded.xl_info);
|
||||
return Ok(Some(MetadataRecord::Xact(XactRecord::Abort(XactCommon {
|
||||
parsed,
|
||||
origin_id,
|
||||
xl_xid,
|
||||
lsn,
|
||||
}))));
|
||||
} else if info == pg_constants::XLOG_XACT_COMMIT_PREPARED {
|
||||
let parsed = XlXactParsedRecord::decode(buf, decoded.xl_xid, decoded.xl_info);
|
||||
return Ok(Some(MetadataRecord::Xact(XactRecord::CommitPrepared(
|
||||
XactCommon {
|
||||
parsed,
|
||||
origin_id,
|
||||
xl_xid,
|
||||
lsn,
|
||||
},
|
||||
))));
|
||||
} else if info == pg_constants::XLOG_XACT_ABORT_PREPARED {
|
||||
let parsed = XlXactParsedRecord::decode(buf, decoded.xl_xid, decoded.xl_info);
|
||||
return Ok(Some(MetadataRecord::Xact(XactRecord::AbortPrepared(
|
||||
XactCommon {
|
||||
parsed,
|
||||
origin_id,
|
||||
xl_xid,
|
||||
lsn,
|
||||
},
|
||||
))));
|
||||
} else if info == pg_constants::XLOG_XACT_PREPARE {
|
||||
return Ok(Some(MetadataRecord::Xact(XactRecord::Prepare(
|
||||
XactPrepare {
|
||||
xl_xid: decoded.xl_xid,
|
||||
data: Bytes::copy_from_slice(&buf[..]),
|
||||
},
|
||||
))));
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
fn decode_multixact_record(
|
||||
buf: &mut Bytes,
|
||||
decoded: &DecodedWALRecord,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Option<MetadataRecord>> {
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
|
||||
if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE
|
||||
|| info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE
|
||||
{
|
||||
let pageno = if pg_version < 17 {
|
||||
buf.get_u32_le()
|
||||
} else {
|
||||
buf.get_u64_le() as u32
|
||||
};
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
|
||||
let slru_kind = match info {
|
||||
pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE => SlruKind::MultiXactOffsets,
|
||||
pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE => SlruKind::MultiXactMembers,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
|
||||
return Ok(Some(MetadataRecord::MultiXact(MultiXactRecord::ZeroPage(
|
||||
MultiXactZeroPage {
|
||||
slru_kind,
|
||||
segno,
|
||||
rpageno,
|
||||
},
|
||||
))));
|
||||
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
|
||||
let xlrec = XlMultiXactCreate::decode(buf);
|
||||
return Ok(Some(MetadataRecord::MultiXact(MultiXactRecord::Create(
|
||||
xlrec,
|
||||
))));
|
||||
} else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
|
||||
let xlrec = XlMultiXactTruncate::decode(buf);
|
||||
return Ok(Some(MetadataRecord::MultiXact(MultiXactRecord::Truncate(
|
||||
xlrec,
|
||||
))));
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
fn decode_relmap_record(
|
||||
buf: &mut Bytes,
|
||||
decoded: &DecodedWALRecord,
|
||||
) -> anyhow::Result<Option<MetadataRecord>> {
|
||||
let update = XlRelmapUpdate::decode(buf);
|
||||
|
||||
let mut buf = decoded.record.clone();
|
||||
buf.advance(decoded.main_data_offset);
|
||||
// skip xl_relmap_update
|
||||
buf.advance(12);
|
||||
|
||||
Ok(Some(MetadataRecord::Relmap(RelmapRecord::Update(
|
||||
RelmapUpdate {
|
||||
update,
|
||||
buf: Bytes::copy_from_slice(&buf[..]),
|
||||
},
|
||||
))))
|
||||
}
|
||||
|
||||
fn decode_xlog_record(
|
||||
buf: &mut Bytes,
|
||||
decoded: &DecodedWALRecord,
|
||||
lsn: Lsn,
|
||||
) -> anyhow::Result<Option<MetadataRecord>> {
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
Ok(Some(MetadataRecord::Xlog(XlogRecord::Raw(RawXlogRecord {
|
||||
info,
|
||||
lsn,
|
||||
buf: buf.clone(),
|
||||
}))))
|
||||
}
|
||||
|
||||
fn decode_logical_message_record(
|
||||
buf: &mut Bytes,
|
||||
decoded: &DecodedWALRecord,
|
||||
) -> anyhow::Result<Option<MetadataRecord>> {
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_LOGICAL_MESSAGE {
|
||||
let xlrec = XlLogicalMessage::decode(buf);
|
||||
let prefix = std::str::from_utf8(&buf[0..xlrec.prefix_size - 1])?;
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
if prefix == "neon-test" {
|
||||
return Ok(Some(MetadataRecord::LogicalMessage(
|
||||
LogicalMessageRecord::Failpoint,
|
||||
)));
|
||||
}
|
||||
|
||||
if let Some(path) = prefix.strip_prefix("neon-file:") {
|
||||
let buf_size = xlrec.prefix_size + xlrec.message_size;
|
||||
let buf = Bytes::copy_from_slice(&buf[xlrec.prefix_size..buf_size]);
|
||||
return Ok(Some(MetadataRecord::LogicalMessage(
|
||||
LogicalMessageRecord::Put(PutLogicalMessage {
|
||||
path: path.to_string(),
|
||||
buf,
|
||||
}),
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
fn decode_standby_record(
|
||||
buf: &mut Bytes,
|
||||
decoded: &DecodedWALRecord,
|
||||
) -> anyhow::Result<Option<MetadataRecord>> {
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_RUNNING_XACTS {
|
||||
let xlrec = XlRunningXacts::decode(buf);
|
||||
return Ok(Some(MetadataRecord::Standby(StandbyRecord::RunningXacts(
|
||||
StandbyRunningXacts {
|
||||
oldest_running_xid: xlrec.oldest_running_xid,
|
||||
},
|
||||
))));
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
fn decode_replorigin_record(
|
||||
buf: &mut Bytes,
|
||||
decoded: &DecodedWALRecord,
|
||||
) -> anyhow::Result<Option<MetadataRecord>> {
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_REPLORIGIN_SET {
|
||||
let xlrec = XlReploriginSet::decode(buf);
|
||||
return Ok(Some(MetadataRecord::Replorigin(ReploriginRecord::Set(
|
||||
xlrec,
|
||||
))));
|
||||
} else if info == pg_constants::XLOG_REPLORIGIN_DROP {
|
||||
let xlrec = XlReploriginDrop::decode(buf);
|
||||
return Ok(Some(MetadataRecord::Replorigin(ReploriginRecord::Drop(
|
||||
xlrec,
|
||||
))));
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
2
libs/wal_decoder/src/lib.rs
Normal file
2
libs/wal_decoder/src/lib.rs
Normal file
@@ -0,0 +1,2 @@
|
||||
pub mod decoder;
|
||||
pub mod models;
|
||||
211
libs/wal_decoder/src/models.rs
Normal file
211
libs/wal_decoder/src/models.rs
Normal file
@@ -0,0 +1,211 @@
|
||||
//! This module houses types which represent decoded PG WAL records
|
||||
//! ready for the pageserver to interpret. They are derived from the original
|
||||
//! WAL records, so that each struct corresponds closely to one WAL record of
|
||||
//! a specific kind. They contain the same information as the original WAL records,
|
||||
//! just decoded into structs and fields for easier access.
|
||||
//!
|
||||
//! The ingestion code uses these structs to help with parsing the WAL records,
|
||||
//! and it splits them into a stream of modifications to the key-value pairs that
|
||||
//! are ultimately stored in delta layers. See also the split-out counterparts in
|
||||
//! [`postgres_ffi::walrecord`].
|
||||
//!
|
||||
//! The pipeline which processes WAL records is not super obvious, so let's follow
|
||||
//! the flow of an example XACT_COMMIT Postgres record:
|
||||
//!
|
||||
//! (Postgres XACT_COMMIT record)
|
||||
//! |
|
||||
//! |--> pageserver::walingest::WalIngest::decode_xact_record
|
||||
//! |
|
||||
//! |--> ([`XactRecord::Commit`])
|
||||
//! |
|
||||
//! |--> pageserver::walingest::WalIngest::ingest_xact_record
|
||||
//! |
|
||||
//! |--> (NeonWalRecord::ClogSetCommitted)
|
||||
//! |
|
||||
//! |--> write to KV store within the pageserver
|
||||
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::key::CompactKey;
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
use pageserver_api::value::Value;
|
||||
use postgres_ffi::walrecord::{
|
||||
XlMultiXactCreate, XlMultiXactTruncate, XlRelmapUpdate, XlReploriginDrop, XlReploriginSet,
|
||||
XlSmgrTruncate, XlXactParsedRecord,
|
||||
};
|
||||
use postgres_ffi::{Oid, TransactionId};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
pub enum FlushUncommittedRecords {
|
||||
Yes,
|
||||
No,
|
||||
}
|
||||
|
||||
/// An interpreted Postgres WAL record, ready to be handled by the pageserver
|
||||
pub struct InterpretedWalRecord {
|
||||
/// Optional metadata record - may cause writes to metadata keys
|
||||
/// in the storage engine
|
||||
pub metadata_record: Option<MetadataRecord>,
|
||||
/// Images or deltas for blocks modified in the original WAL record.
|
||||
/// The [`Value`] is optional to avoid sending superfluous data to
|
||||
/// shard 0 for relation size tracking.
|
||||
pub blocks: Vec<(CompactKey, Option<Value>)>,
|
||||
/// Byte offset within WAL for the end of the original PG WAL record
|
||||
pub lsn: Lsn,
|
||||
/// Whether to flush all uncommitted modifications to the storage engine
|
||||
/// before ingesting this record. This is currently only used for legacy PG
|
||||
/// database creations which read pages from a template database. Such WAL
|
||||
/// records require reading data blocks while ingesting, hence the need to flush.
|
||||
pub flush_uncommitted: FlushUncommittedRecords,
|
||||
/// Transaction id of the original PG WAL record
|
||||
pub xid: TransactionId,
|
||||
}
|
||||
|
||||
/// The interpreted part of the Postgres WAL record which requires metadata
|
||||
/// writes to the underlying storage engine.
|
||||
pub enum MetadataRecord {
|
||||
Heapam(HeapamRecord),
|
||||
Neonrmgr(NeonrmgrRecord),
|
||||
Smgr(SmgrRecord),
|
||||
Dbase(DbaseRecord),
|
||||
Clog(ClogRecord),
|
||||
Xact(XactRecord),
|
||||
MultiXact(MultiXactRecord),
|
||||
Relmap(RelmapRecord),
|
||||
Xlog(XlogRecord),
|
||||
LogicalMessage(LogicalMessageRecord),
|
||||
Standby(StandbyRecord),
|
||||
Replorigin(ReploriginRecord),
|
||||
}
|
||||
|
||||
pub enum HeapamRecord {
|
||||
ClearVmBits(ClearVmBits),
|
||||
}
|
||||
|
||||
pub struct ClearVmBits {
|
||||
pub new_heap_blkno: Option<u32>,
|
||||
pub old_heap_blkno: Option<u32>,
|
||||
pub vm_rel: RelTag,
|
||||
pub flags: u8,
|
||||
}
|
||||
|
||||
pub enum NeonrmgrRecord {
|
||||
ClearVmBits(ClearVmBits),
|
||||
}
|
||||
|
||||
pub enum SmgrRecord {
|
||||
Create(SmgrCreate),
|
||||
Truncate(XlSmgrTruncate),
|
||||
}
|
||||
|
||||
pub struct SmgrCreate {
|
||||
pub rel: RelTag,
|
||||
}
|
||||
|
||||
pub enum DbaseRecord {
|
||||
Create(DbaseCreate),
|
||||
Drop(DbaseDrop),
|
||||
}
|
||||
|
||||
pub struct DbaseCreate {
|
||||
pub db_id: Oid,
|
||||
pub tablespace_id: Oid,
|
||||
pub src_db_id: Oid,
|
||||
pub src_tablespace_id: Oid,
|
||||
}
|
||||
|
||||
pub struct DbaseDrop {
|
||||
pub db_id: Oid,
|
||||
pub tablespace_ids: Vec<Oid>,
|
||||
}
|
||||
|
||||
pub enum ClogRecord {
|
||||
ZeroPage(ClogZeroPage),
|
||||
Truncate(ClogTruncate),
|
||||
}
|
||||
|
||||
pub struct ClogZeroPage {
|
||||
pub segno: u32,
|
||||
pub rpageno: u32,
|
||||
}
|
||||
|
||||
pub struct ClogTruncate {
|
||||
pub pageno: u32,
|
||||
pub oldest_xid: TransactionId,
|
||||
pub oldest_xid_db: Oid,
|
||||
}
|
||||
|
||||
pub enum XactRecord {
|
||||
Commit(XactCommon),
|
||||
Abort(XactCommon),
|
||||
CommitPrepared(XactCommon),
|
||||
AbortPrepared(XactCommon),
|
||||
Prepare(XactPrepare),
|
||||
}
|
||||
|
||||
pub struct XactCommon {
|
||||
pub parsed: XlXactParsedRecord,
|
||||
pub origin_id: u16,
|
||||
// Fields below are only used for logging
|
||||
pub xl_xid: TransactionId,
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
pub struct XactPrepare {
|
||||
pub xl_xid: TransactionId,
|
||||
pub data: Bytes,
|
||||
}
|
||||
|
||||
pub enum MultiXactRecord {
|
||||
ZeroPage(MultiXactZeroPage),
|
||||
Create(XlMultiXactCreate),
|
||||
Truncate(XlMultiXactTruncate),
|
||||
}
|
||||
|
||||
pub struct MultiXactZeroPage {
|
||||
pub slru_kind: SlruKind,
|
||||
pub segno: u32,
|
||||
pub rpageno: u32,
|
||||
}
|
||||
|
||||
pub enum RelmapRecord {
|
||||
Update(RelmapUpdate),
|
||||
}
|
||||
|
||||
pub struct RelmapUpdate {
|
||||
pub update: XlRelmapUpdate,
|
||||
pub buf: Bytes,
|
||||
}
|
||||
|
||||
pub enum XlogRecord {
|
||||
Raw(RawXlogRecord),
|
||||
}
|
||||
|
||||
pub struct RawXlogRecord {
|
||||
pub info: u8,
|
||||
pub lsn: Lsn,
|
||||
pub buf: Bytes,
|
||||
}
|
||||
|
||||
pub enum LogicalMessageRecord {
|
||||
Put(PutLogicalMessage),
|
||||
#[cfg(feature = "testing")]
|
||||
Failpoint,
|
||||
}
|
||||
|
||||
pub struct PutLogicalMessage {
|
||||
pub path: String,
|
||||
pub buf: Bytes,
|
||||
}
|
||||
|
||||
pub enum StandbyRecord {
|
||||
RunningXacts(StandbyRunningXacts),
|
||||
}
|
||||
|
||||
pub struct StandbyRunningXacts {
|
||||
pub oldest_running_xid: TransactionId,
|
||||
}
|
||||
|
||||
pub enum ReploriginRecord {
|
||||
Set(XlReploriginSet),
|
||||
Drop(XlReploriginDrop),
|
||||
}
|
||||
@@ -8,7 +8,7 @@ license.workspace = true
|
||||
default = []
|
||||
# Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
|
||||
# which adds some runtime cost to run tests on outage conditions
|
||||
testing = ["fail/failpoints", "pageserver_api/testing" ]
|
||||
testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing"]
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
@@ -83,6 +83,7 @@ enum-map.workspace = true
|
||||
enumset = { workspace = true, features = ["serde"]}
|
||||
strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
wal_decoder.workspace = true
|
||||
|
||||
[target.'cfg(target_os = "linux")'.dependencies]
|
||||
procfs.workspace = true
|
||||
|
||||
@@ -8,13 +8,12 @@ use pageserver::{
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
l0_flush::{L0FlushConfig, L0FlushGlobalState},
|
||||
page_cache,
|
||||
repository::Value,
|
||||
task_mgr::TaskKind,
|
||||
tenant::storage_layer::inmemory_layer::SerializedBatch,
|
||||
tenant::storage_layer::InMemoryLayer,
|
||||
virtual_file,
|
||||
};
|
||||
use pageserver_api::{key::Key, shard::TenantShardId};
|
||||
use pageserver_api::{key::Key, shard::TenantShardId, value::Value};
|
||||
use utils::{
|
||||
bin_ser::BeSer,
|
||||
id::{TenantId, TimelineId},
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
use criterion::measurement::WallTime;
|
||||
use pageserver::keyspace::{KeyPartitioning, KeySpace};
|
||||
use pageserver::repository::Key;
|
||||
use pageserver::tenant::layer_map::LayerMap;
|
||||
use pageserver::tenant::storage_layer::LayerName;
|
||||
use pageserver::tenant::storage_layer::PersistentLayerDesc;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
|
||||
use std::cmp::{max, min};
|
||||
|
||||
@@ -60,7 +60,8 @@ use anyhow::Context;
|
||||
use bytes::{Buf, Bytes};
|
||||
use criterion::{BenchmarkId, Criterion};
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
|
||||
use pageserver::{config::PageServerConf, walredo::PostgresRedoManager};
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::{key::Key, shard::TenantShardId};
|
||||
use std::{
|
||||
future::Future,
|
||||
|
||||
@@ -51,7 +51,7 @@
|
||||
//!
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use pageserver::repository::Key;
|
||||
use pageserver_api::key::Key;
|
||||
use std::cmp::Ordering;
|
||||
use std::io::{self, BufRead};
|
||||
use std::path::PathBuf;
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
//!
|
||||
//! Currently it only analyzes holes, which are regions within the layer range that the layer contains no updates for. In the future it might do more analysis (maybe key quantiles?) but it should never return sensitive data.
|
||||
|
||||
use anyhow::Result;
|
||||
use anyhow::{anyhow, Result};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use pageserver::context::{DownloadBehavior, RequestContext};
|
||||
use pageserver::task_mgr::TaskKind;
|
||||
@@ -11,15 +11,16 @@ use pageserver::virtual_file::api::IoMode;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::ops::Range;
|
||||
use std::str::FromStr;
|
||||
use std::{fs, str};
|
||||
|
||||
use pageserver::page_cache::{self, PAGE_SZ};
|
||||
use pageserver::repository::{Key, KEY_SIZE};
|
||||
use pageserver::tenant::block_io::FileBlockReader;
|
||||
use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
|
||||
use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE};
|
||||
use pageserver::tenant::storage_layer::range_overlaps;
|
||||
use pageserver::tenant::storage_layer::{range_overlaps, LayerName};
|
||||
use pageserver::virtual_file::{self, VirtualFile};
|
||||
use pageserver_api::key::{Key, KEY_SIZE};
|
||||
|
||||
use utils::{bin_ser::BeSer, lsn::Lsn};
|
||||
|
||||
@@ -74,35 +75,15 @@ impl LayerFile {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
|
||||
let split: Vec<&str> = name.split("__").collect();
|
||||
if split.len() != 2 {
|
||||
return None;
|
||||
}
|
||||
let keys: Vec<&str> = split[0].split('-').collect();
|
||||
let lsn_and_opt_generation: Vec<&str> = split[1].split('v').collect();
|
||||
let lsns: Vec<&str> = lsn_and_opt_generation[0].split('-').collect();
|
||||
let the_lsns: [&str; 2];
|
||||
pub(crate) fn parse_filename(name: &str) -> anyhow::Result<LayerFile> {
|
||||
let layer_name =
|
||||
LayerName::from_str(name).map_err(|e| anyhow!("failed to parse layer name: {e}"))?;
|
||||
|
||||
/*
|
||||
* Generations add a -vX-XXXXXX postfix, which causes issues when we try to
|
||||
* parse 'vX' as an LSN.
|
||||
*/
|
||||
let is_delta = if lsns.len() == 1 || lsns[1].is_empty() {
|
||||
the_lsns = [lsns[0], lsns[0]];
|
||||
false
|
||||
} else {
|
||||
the_lsns = [lsns[0], lsns[1]];
|
||||
true
|
||||
};
|
||||
|
||||
let key_range = Key::from_hex(keys[0]).unwrap()..Key::from_hex(keys[1]).unwrap();
|
||||
let lsn_range = Lsn::from_hex(the_lsns[0]).unwrap()..Lsn::from_hex(the_lsns[1]).unwrap();
|
||||
let holes = Vec::new();
|
||||
Some(LayerFile {
|
||||
key_range,
|
||||
lsn_range,
|
||||
is_delta,
|
||||
Ok(LayerFile {
|
||||
key_range: layer_name.key_range().clone(),
|
||||
lsn_range: layer_name.lsn_as_range(),
|
||||
is_delta: layer_name.is_delta(),
|
||||
holes,
|
||||
})
|
||||
}
|
||||
@@ -179,7 +160,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
|
||||
|
||||
for layer in fs::read_dir(timeline.path())? {
|
||||
let layer = layer?;
|
||||
if let Some(mut layer_file) =
|
||||
if let Ok(mut layer_file) =
|
||||
parse_filename(&layer.file_name().into_string().unwrap())
|
||||
{
|
||||
if layer_file.is_delta {
|
||||
|
||||
@@ -5,24 +5,12 @@ use camino::{Utf8Path, Utf8PathBuf};
|
||||
use clap::Subcommand;
|
||||
use pageserver::context::{DownloadBehavior, RequestContext};
|
||||
use pageserver::task_mgr::TaskKind;
|
||||
use pageserver::tenant::block_io::BlockCursor;
|
||||
use pageserver::tenant::disk_btree::DiskBtreeReader;
|
||||
use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary};
|
||||
use pageserver::tenant::storage_layer::{delta_layer, image_layer};
|
||||
use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer};
|
||||
use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
||||
use pageserver::virtual_file::api::IoMode;
|
||||
use pageserver::{page_cache, virtual_file};
|
||||
use pageserver::{
|
||||
repository::{Key, KEY_SIZE},
|
||||
tenant::{
|
||||
block_io::FileBlockReader, disk_btree::VisitDirection,
|
||||
storage_layer::delta_layer::DELTA_KEY_SIZE,
|
||||
},
|
||||
virtual_file::VirtualFile,
|
||||
};
|
||||
use std::fs;
|
||||
use utils::bin_ser::BeSer;
|
||||
use std::fs::{self, File};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use crate::layer_map_analyzer::parse_filename;
|
||||
@@ -59,44 +47,30 @@ pub(crate) enum LayerCmd {
|
||||
}
|
||||
|
||||
async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
|
||||
let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
|
||||
virtual_file::init(
|
||||
10,
|
||||
virtual_file::api::IoEngineKind::StdFs,
|
||||
IoMode::preferred(),
|
||||
);
|
||||
page_cache::init(100);
|
||||
let file = VirtualFile::open(path, ctx).await?;
|
||||
let file_id = page_cache::next_file_id();
|
||||
let block_reader = FileBlockReader::new(&file, file_id);
|
||||
let summary_blk = block_reader.read_blk(0, ctx).await?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
actual_summary.index_start_blk,
|
||||
actual_summary.index_root_blk,
|
||||
&block_reader,
|
||||
let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
|
||||
let file = File::open(path)?;
|
||||
let delta_layer = DeltaLayer::new_for_path(path, file)?;
|
||||
delta_layer.dump(true, ctx).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn read_image_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
|
||||
virtual_file::init(
|
||||
10,
|
||||
virtual_file::api::IoEngineKind::StdFs,
|
||||
IoMode::preferred(),
|
||||
);
|
||||
// TODO(chi): dedup w/ `delta_layer.rs` by exposing the API.
|
||||
let mut all = vec![];
|
||||
tree_reader
|
||||
.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
|key, value_offset| {
|
||||
let curr = Key::from_slice(&key[..KEY_SIZE]);
|
||||
all.push((curr, BlobRef(value_offset)));
|
||||
true
|
||||
},
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
let cursor = BlockCursor::new_fileblockreader(&block_reader);
|
||||
for (k, v) in all {
|
||||
let value = cursor.read_blob(v.pos(), ctx).await?;
|
||||
println!("key:{} value_len:{}", k, value.len());
|
||||
assert!(k.is_i128_representable(), "invalid key: ");
|
||||
}
|
||||
// TODO(chi): special handling for last key?
|
||||
page_cache::init(100);
|
||||
let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
|
||||
let file = File::open(path)?;
|
||||
let image_layer = ImageLayer::new_for_path(path, file)?;
|
||||
image_layer.dump(true, ctx).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -133,8 +107,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
let mut idx = 0;
|
||||
for layer in fs::read_dir(timeline_path)? {
|
||||
let layer = layer?;
|
||||
if let Some(layer_file) = parse_filename(&layer.file_name().into_string().unwrap())
|
||||
{
|
||||
if let Ok(layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) {
|
||||
println!(
|
||||
"[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}",
|
||||
idx,
|
||||
@@ -163,8 +136,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
let mut idx = 0;
|
||||
for layer in fs::read_dir(timeline_path)? {
|
||||
let layer = layer?;
|
||||
if let Some(layer_file) = parse_filename(&layer.file_name().into_string().unwrap())
|
||||
{
|
||||
if let Ok(layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) {
|
||||
if *id == idx {
|
||||
// TODO(chi): dedup code
|
||||
println!(
|
||||
@@ -180,7 +152,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
if layer_file.is_delta {
|
||||
read_delta_file(layer.path(), &ctx).await?;
|
||||
} else {
|
||||
anyhow::bail!("not supported yet :(");
|
||||
read_image_file(layer.path(), &ctx).await?;
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use pageserver_api::models::{AuxFilePolicy, TenantConfig, TenantConfigRequest};
|
||||
use pageserver_api::models::{TenantConfig, TenantConfigRequest};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
@@ -66,10 +66,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
|
||||
mgmt_api_client
|
||||
.tenant_config(&TenantConfigRequest {
|
||||
tenant_id: timeline.tenant_id,
|
||||
config: TenantConfig {
|
||||
switch_aux_file_policy: Some(AuxFilePolicy::V2),
|
||||
..Default::default()
|
||||
},
|
||||
config: TenantConfig::default(),
|
||||
})
|
||||
.await?;
|
||||
|
||||
|
||||
@@ -59,6 +59,7 @@ pub async fn send_basebackup_tarball<'a, W>(
|
||||
req_lsn: Option<Lsn>,
|
||||
prev_lsn: Option<Lsn>,
|
||||
full_backup: bool,
|
||||
replica: bool,
|
||||
ctx: &'a RequestContext,
|
||||
) -> Result<(), BasebackupError>
|
||||
where
|
||||
@@ -110,8 +111,8 @@ where
|
||||
};
|
||||
|
||||
info!(
|
||||
"taking basebackup lsn={}, prev_lsn={} (full_backup={})",
|
||||
backup_lsn, prev_lsn, full_backup
|
||||
"taking basebackup lsn={}, prev_lsn={} (full_backup={}, replica={})",
|
||||
backup_lsn, prev_lsn, full_backup, replica
|
||||
);
|
||||
|
||||
let basebackup = Basebackup {
|
||||
@@ -120,6 +121,7 @@ where
|
||||
lsn: backup_lsn,
|
||||
prev_record_lsn: prev_lsn,
|
||||
full_backup,
|
||||
replica,
|
||||
ctx,
|
||||
};
|
||||
basebackup
|
||||
@@ -140,6 +142,7 @@ where
|
||||
lsn: Lsn,
|
||||
prev_record_lsn: Lsn,
|
||||
full_backup: bool,
|
||||
replica: bool,
|
||||
ctx: &'a RequestContext,
|
||||
}
|
||||
|
||||
@@ -372,6 +375,10 @@ where
|
||||
|
||||
for (path, content) in aux_files {
|
||||
if path.starts_with("pg_replslot") {
|
||||
// Do not create LR slots at standby because they are not used but prevent WAL truncation
|
||||
if self.replica {
|
||||
continue;
|
||||
}
|
||||
let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
|
||||
let restart_lsn = Lsn(u64::from_le_bytes(
|
||||
content[offs..offs + 8].try_into().unwrap(),
|
||||
|
||||
@@ -14,6 +14,7 @@ use itertools::Itertools as _;
|
||||
use pageserver_api::models::TenantState;
|
||||
use remote_storage::{GenericRemoteStorage, RemoteStorageConfig};
|
||||
use reqwest::Url;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, SystemTime};
|
||||
@@ -35,12 +36,62 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
|
||||
/// upload attempts.
|
||||
type RawMetric = (MetricsKey, (EventType, u64));
|
||||
|
||||
/// The new serializable metrics format
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct NewMetricsRoot {
|
||||
version: usize,
|
||||
metrics: Vec<NewRawMetric>,
|
||||
}
|
||||
|
||||
impl NewMetricsRoot {
|
||||
pub fn is_v2_metrics(json_value: &serde_json::Value) -> bool {
|
||||
if let Some(ver) = json_value.get("version") {
|
||||
if let Some(2) = ver.as_u64() {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// The new serializable metrics format
|
||||
#[derive(Serialize)]
|
||||
struct NewMetricsRefRoot<'a> {
|
||||
version: usize,
|
||||
metrics: &'a [NewRawMetric],
|
||||
}
|
||||
|
||||
impl<'a> NewMetricsRefRoot<'a> {
|
||||
fn new(metrics: &'a [NewRawMetric]) -> Self {
|
||||
Self {
|
||||
version: 2,
|
||||
metrics,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The new serializable metrics format
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
|
||||
struct NewRawMetric {
|
||||
key: MetricsKey,
|
||||
kind: EventType,
|
||||
value: u64,
|
||||
// TODO: add generation field and check against generations
|
||||
}
|
||||
|
||||
impl NewRawMetric {
|
||||
#[cfg(test)]
|
||||
fn to_kv_pair(&self) -> (MetricsKey, NewRawMetric) {
|
||||
(self.key, self.clone())
|
||||
}
|
||||
}
|
||||
|
||||
/// Caches the [`RawMetric`]s
|
||||
///
|
||||
/// In practice, during startup, last sent values are stored here to be used in calculating new
|
||||
/// ones. After successful uploading, the cached values are updated to cache. This used to be used
|
||||
/// for deduplication, but that is no longer needed.
|
||||
type Cache = HashMap<MetricsKey, (EventType, u64)>;
|
||||
type Cache = HashMap<MetricsKey, NewRawMetric>;
|
||||
|
||||
pub async fn run(
|
||||
conf: &'static PageServerConf,
|
||||
@@ -231,11 +282,14 @@ async fn restore_and_reschedule(
|
||||
// collect_all_metrics
|
||||
let earlier_metric_at = found_some
|
||||
.iter()
|
||||
.map(|(_, (et, _))| et.recorded_at())
|
||||
.map(|item| item.kind.recorded_at())
|
||||
.copied()
|
||||
.next();
|
||||
|
||||
let cached = found_some.into_iter().collect::<Cache>();
|
||||
let cached = found_some
|
||||
.into_iter()
|
||||
.map(|item| (item.key, item))
|
||||
.collect::<Cache>();
|
||||
|
||||
(cached, earlier_metric_at)
|
||||
}
|
||||
|
||||
@@ -2,11 +2,33 @@ use anyhow::Context;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use std::sync::Arc;
|
||||
|
||||
use super::RawMetric;
|
||||
use crate::consumption_metrics::NewMetricsRefRoot;
|
||||
|
||||
use super::{NewMetricsRoot, NewRawMetric, RawMetric};
|
||||
|
||||
pub(super) fn read_metrics_from_serde_value(
|
||||
json_value: serde_json::Value,
|
||||
) -> anyhow::Result<Vec<NewRawMetric>> {
|
||||
if NewMetricsRoot::is_v2_metrics(&json_value) {
|
||||
let root = serde_json::from_value::<NewMetricsRoot>(json_value)?;
|
||||
Ok(root.metrics)
|
||||
} else {
|
||||
let all_metrics = serde_json::from_value::<Vec<RawMetric>>(json_value)?;
|
||||
let all_metrics = all_metrics
|
||||
.into_iter()
|
||||
.map(|(key, (event_type, value))| NewRawMetric {
|
||||
key,
|
||||
kind: event_type,
|
||||
value,
|
||||
})
|
||||
.collect();
|
||||
Ok(all_metrics)
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) async fn read_metrics_from_disk(
|
||||
path: Arc<Utf8PathBuf>,
|
||||
) -> anyhow::Result<Vec<RawMetric>> {
|
||||
) -> anyhow::Result<Vec<NewRawMetric>> {
|
||||
// do not add context to each error, callsite will log with full path
|
||||
let span = tracing::Span::current();
|
||||
tokio::task::spawn_blocking(move || {
|
||||
@@ -20,7 +42,8 @@ pub(super) async fn read_metrics_from_disk(
|
||||
|
||||
let mut file = std::fs::File::open(&*path)?;
|
||||
let reader = std::io::BufReader::new(&mut file);
|
||||
anyhow::Ok(serde_json::from_reader::<_, Vec<RawMetric>>(reader)?)
|
||||
let json_value = serde_json::from_reader::<_, serde_json::Value>(reader)?;
|
||||
read_metrics_from_serde_value(json_value)
|
||||
})
|
||||
.await
|
||||
.context("read metrics join error")
|
||||
@@ -63,7 +86,7 @@ fn scan_and_delete_with_same_prefix(path: &Utf8Path) -> std::io::Result<()> {
|
||||
}
|
||||
|
||||
pub(super) async fn flush_metrics_to_disk(
|
||||
current_metrics: &Arc<Vec<RawMetric>>,
|
||||
current_metrics: &Arc<Vec<NewRawMetric>>,
|
||||
path: &Arc<Utf8PathBuf>,
|
||||
) -> anyhow::Result<()> {
|
||||
use std::io::Write;
|
||||
@@ -93,8 +116,11 @@ pub(super) async fn flush_metrics_to_disk(
|
||||
// write out all of the raw metrics, to be read out later on restart as cached values
|
||||
{
|
||||
let mut writer = std::io::BufWriter::new(&mut tempfile);
|
||||
serde_json::to_writer(&mut writer, &*current_metrics)
|
||||
.context("serialize metrics")?;
|
||||
serde_json::to_writer(
|
||||
&mut writer,
|
||||
&NewMetricsRefRoot::new(current_metrics.as_ref()),
|
||||
)
|
||||
.context("serialize metrics")?;
|
||||
writer
|
||||
.into_inner()
|
||||
.map_err(|_| anyhow::anyhow!("flushing metrics failed"))?;
|
||||
|
||||
@@ -9,7 +9,7 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use super::{Cache, RawMetric};
|
||||
use super::{Cache, NewRawMetric};
|
||||
|
||||
/// Name of the metric, used by `MetricsKey` factory methods and `deserialize_cached_events`
|
||||
/// instead of static str.
|
||||
@@ -64,11 +64,21 @@ impl MetricsKey {
|
||||
struct AbsoluteValueFactory(MetricsKey);
|
||||
|
||||
impl AbsoluteValueFactory {
|
||||
const fn at(self, time: DateTime<Utc>, val: u64) -> RawMetric {
|
||||
#[cfg(test)]
|
||||
const fn at_old_format(self, time: DateTime<Utc>, val: u64) -> super::RawMetric {
|
||||
let key = self.0;
|
||||
(key, (EventType::Absolute { time }, val))
|
||||
}
|
||||
|
||||
const fn at(self, time: DateTime<Utc>, val: u64) -> NewRawMetric {
|
||||
let key = self.0;
|
||||
NewRawMetric {
|
||||
key,
|
||||
kind: EventType::Absolute { time },
|
||||
value: val,
|
||||
}
|
||||
}
|
||||
|
||||
fn key(&self) -> &MetricsKey {
|
||||
&self.0
|
||||
}
|
||||
@@ -84,7 +94,28 @@ impl IncrementalValueFactory {
|
||||
prev_end: DateTime<Utc>,
|
||||
up_to: DateTime<Utc>,
|
||||
val: u64,
|
||||
) -> RawMetric {
|
||||
) -> NewRawMetric {
|
||||
let key = self.0;
|
||||
// cannot assert prev_end < up_to because these are realtime clock based
|
||||
let when = EventType::Incremental {
|
||||
start_time: prev_end,
|
||||
stop_time: up_to,
|
||||
};
|
||||
NewRawMetric {
|
||||
key,
|
||||
kind: when,
|
||||
value: val,
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::wrong_self_convention)]
|
||||
#[cfg(test)]
|
||||
const fn from_until_old_format(
|
||||
self,
|
||||
prev_end: DateTime<Utc>,
|
||||
up_to: DateTime<Utc>,
|
||||
val: u64,
|
||||
) -> super::RawMetric {
|
||||
let key = self.0;
|
||||
// cannot assert prev_end < up_to because these are realtime clock based
|
||||
let when = EventType::Incremental {
|
||||
@@ -185,7 +216,7 @@ pub(super) async fn collect_all_metrics(
|
||||
tenant_manager: &Arc<TenantManager>,
|
||||
cached_metrics: &Cache,
|
||||
ctx: &RequestContext,
|
||||
) -> Vec<RawMetric> {
|
||||
) -> Vec<NewRawMetric> {
|
||||
use pageserver_api::models::TenantState;
|
||||
|
||||
let started_at = std::time::Instant::now();
|
||||
@@ -220,11 +251,11 @@ pub(super) async fn collect_all_metrics(
|
||||
res
|
||||
}
|
||||
|
||||
async fn collect<S>(tenants: S, cache: &Cache, ctx: &RequestContext) -> Vec<RawMetric>
|
||||
async fn collect<S>(tenants: S, cache: &Cache, ctx: &RequestContext) -> Vec<NewRawMetric>
|
||||
where
|
||||
S: futures::stream::Stream<Item = (TenantId, Arc<crate::tenant::Tenant>)>,
|
||||
{
|
||||
let mut current_metrics: Vec<RawMetric> = Vec::new();
|
||||
let mut current_metrics: Vec<NewRawMetric> = Vec::new();
|
||||
|
||||
let mut tenants = std::pin::pin!(tenants);
|
||||
|
||||
@@ -291,7 +322,7 @@ impl TenantSnapshot {
|
||||
tenant_id: TenantId,
|
||||
now: DateTime<Utc>,
|
||||
cached: &Cache,
|
||||
metrics: &mut Vec<RawMetric>,
|
||||
metrics: &mut Vec<NewRawMetric>,
|
||||
) {
|
||||
let remote_size = MetricsKey::remote_storage_size(tenant_id).at(now, self.remote_size);
|
||||
|
||||
@@ -302,9 +333,9 @@ impl TenantSnapshot {
|
||||
let mut synthetic_size = self.synthetic_size;
|
||||
|
||||
if synthetic_size == 0 {
|
||||
if let Some((_, value)) = cached.get(factory.key()) {
|
||||
// use the latest value from previous session
|
||||
synthetic_size = *value;
|
||||
if let Some(item) = cached.get(factory.key()) {
|
||||
// use the latest value from previous session, TODO: check generation number
|
||||
synthetic_size = item.value;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -381,37 +412,36 @@ impl TimelineSnapshot {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
now: DateTime<Utc>,
|
||||
metrics: &mut Vec<RawMetric>,
|
||||
metrics: &mut Vec<NewRawMetric>,
|
||||
cache: &Cache,
|
||||
) {
|
||||
let timeline_written_size = u64::from(self.last_record_lsn);
|
||||
|
||||
let written_size_delta_key = MetricsKey::written_size_delta(tenant_id, timeline_id);
|
||||
|
||||
let last_stop_time = cache
|
||||
.get(written_size_delta_key.key())
|
||||
.map(|(until, _val)| {
|
||||
until
|
||||
.incremental_timerange()
|
||||
.expect("never create EventType::Absolute for written_size_delta")
|
||||
.end
|
||||
});
|
||||
let last_stop_time = cache.get(written_size_delta_key.key()).map(|item| {
|
||||
item.kind
|
||||
.incremental_timerange()
|
||||
.expect("never create EventType::Absolute for written_size_delta")
|
||||
.end
|
||||
});
|
||||
|
||||
let (key, written_size_now) =
|
||||
let written_size_now =
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(now, timeline_written_size);
|
||||
|
||||
// by default, use the last sent written_size as the basis for
|
||||
// calculating the delta. if we don't yet have one, use the load time value.
|
||||
let prev = cache
|
||||
.get(&key)
|
||||
.map(|(prev_at, prev)| {
|
||||
let prev: (DateTime<Utc>, u64) = cache
|
||||
.get(&written_size_now.key)
|
||||
.map(|item| {
|
||||
// use the prev time from our last incremental update, or default to latest
|
||||
// absolute update on the first round.
|
||||
let prev_at = prev_at
|
||||
let prev_at = item
|
||||
.kind
|
||||
.absolute_time()
|
||||
.expect("never create EventType::Incremental for written_size");
|
||||
let prev_at = last_stop_time.unwrap_or(prev_at);
|
||||
(*prev_at, *prev)
|
||||
(*prev_at, item.value)
|
||||
})
|
||||
.unwrap_or_else(|| {
|
||||
// if we don't have a previous point of comparison, compare to the load time
|
||||
@@ -422,24 +452,28 @@ impl TimelineSnapshot {
|
||||
|
||||
let up_to = now;
|
||||
|
||||
if let Some(delta) = written_size_now.1.checked_sub(prev.1) {
|
||||
if let Some(delta) = written_size_now.value.checked_sub(prev.1) {
|
||||
let key_value = written_size_delta_key.from_until(prev.0, up_to, delta);
|
||||
// written_size_delta
|
||||
metrics.push(key_value);
|
||||
// written_size
|
||||
metrics.push((key, written_size_now));
|
||||
metrics.push(written_size_now);
|
||||
} else {
|
||||
// the cached value was ahead of us, report zero until we've caught up
|
||||
metrics.push(written_size_delta_key.from_until(prev.0, up_to, 0));
|
||||
// the cached value was ahead of us, report the same until we've caught up
|
||||
metrics.push((key, (written_size_now.0, prev.1)));
|
||||
metrics.push(NewRawMetric {
|
||||
key: written_size_now.key,
|
||||
kind: written_size_now.kind,
|
||||
value: prev.1,
|
||||
});
|
||||
}
|
||||
|
||||
{
|
||||
let factory = MetricsKey::timeline_logical_size(tenant_id, timeline_id);
|
||||
let current_or_previous = self
|
||||
.current_exact_logical_size
|
||||
.or_else(|| cache.get(factory.key()).map(|(_, val)| *val));
|
||||
.or_else(|| cache.get(factory.key()).map(|item| item.value));
|
||||
|
||||
if let Some(size) = current_or_previous {
|
||||
metrics.push(factory.at(now, size));
|
||||
@@ -452,4 +486,4 @@ impl TimelineSnapshot {
|
||||
mod tests;
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) use tests::metric_examples;
|
||||
pub(crate) use tests::{metric_examples, metric_examples_old};
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
use crate::consumption_metrics::RawMetric;
|
||||
|
||||
use super::*;
|
||||
use std::collections::HashMap;
|
||||
|
||||
@@ -50,9 +52,9 @@ fn startup_collected_timeline_metrics_second_round() {
|
||||
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
let cache = HashMap::from([
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0)
|
||||
]);
|
||||
let cache = HashMap::from([MetricsKey::written_size(tenant_id, timeline_id)
|
||||
.at(before, disk_consistent_lsn.0)
|
||||
.to_kv_pair()]);
|
||||
|
||||
let snap = TimelineSnapshot {
|
||||
loaded_at: (disk_consistent_lsn, init),
|
||||
@@ -89,9 +91,13 @@ fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
|
||||
let mut metrics = Vec::new();
|
||||
let cache = HashMap::from([
|
||||
// at t=before was the last time the last_record_lsn changed
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0),
|
||||
MetricsKey::written_size(tenant_id, timeline_id)
|
||||
.at(before, disk_consistent_lsn.0)
|
||||
.to_kv_pair(),
|
||||
// end time of this event is used for the next ones
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, just_before, 0),
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id)
|
||||
.from_until(before, just_before, 0)
|
||||
.to_kv_pair(),
|
||||
]);
|
||||
|
||||
let snap = TimelineSnapshot {
|
||||
@@ -138,13 +144,17 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
|
||||
};
|
||||
|
||||
let mut cache = HashMap::from([
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(before_restart, 100),
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
|
||||
way_before,
|
||||
before_restart,
|
||||
// not taken into account, but the timestamps are important
|
||||
999_999_999,
|
||||
),
|
||||
MetricsKey::written_size(tenant_id, timeline_id)
|
||||
.at(before_restart, 100)
|
||||
.to_kv_pair(),
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id)
|
||||
.from_until(
|
||||
way_before,
|
||||
before_restart,
|
||||
// not taken into account, but the timestamps are important
|
||||
999_999_999,
|
||||
)
|
||||
.to_kv_pair(),
|
||||
]);
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
@@ -163,7 +173,7 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
|
||||
);
|
||||
|
||||
// now if we cache these metrics, and re-run while "still in recovery"
|
||||
cache.extend(metrics.drain(..));
|
||||
cache.extend(metrics.drain(..).map(|x| x.to_kv_pair()));
|
||||
|
||||
// "still in recovery", because our snapshot did not change
|
||||
snap.to_metrics(tenant_id, timeline_id, later, &mut metrics, &cache);
|
||||
@@ -194,14 +204,14 @@ fn post_restart_current_exact_logical_size_uses_cached() {
|
||||
current_exact_logical_size: None,
|
||||
};
|
||||
|
||||
let cache = HashMap::from([
|
||||
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(before_restart, 100)
|
||||
]);
|
||||
let cache = HashMap::from([MetricsKey::timeline_logical_size(tenant_id, timeline_id)
|
||||
.at(before_restart, 100)
|
||||
.to_kv_pair()]);
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
|
||||
|
||||
metrics.retain(|(key, _)| key.metric == Name::LogicalSize);
|
||||
metrics.retain(|item| item.key.metric == Name::LogicalSize);
|
||||
|
||||
assert_eq!(
|
||||
metrics,
|
||||
@@ -224,7 +234,9 @@ fn post_restart_synthetic_size_uses_cached_if_available() {
|
||||
let before_restart = DateTime::<Utc>::from(now - std::time::Duration::from_secs(5 * 60));
|
||||
let now = DateTime::<Utc>::from(now);
|
||||
|
||||
let cached = HashMap::from([MetricsKey::synthetic_size(tenant_id).at(before_restart, 1000)]);
|
||||
let cached = HashMap::from([MetricsKey::synthetic_size(tenant_id)
|
||||
.at(before_restart, 1000)
|
||||
.to_kv_pair()]);
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
ts.to_metrics(tenant_id, now, &cached, &mut metrics);
|
||||
@@ -278,12 +290,29 @@ fn time_backwards<const N: usize>() -> [std::time::SystemTime; N] {
|
||||
times
|
||||
}
|
||||
|
||||
pub(crate) const fn metric_examples(
|
||||
pub(crate) const fn metric_examples_old(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
now: DateTime<Utc>,
|
||||
before: DateTime<Utc>,
|
||||
) -> [RawMetric; 6] {
|
||||
[
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at_old_format(now, 0),
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id)
|
||||
.from_until_old_format(before, now, 0),
|
||||
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at_old_format(now, 0),
|
||||
MetricsKey::remote_storage_size(tenant_id).at_old_format(now, 0),
|
||||
MetricsKey::resident_size(tenant_id).at_old_format(now, 0),
|
||||
MetricsKey::synthetic_size(tenant_id).at_old_format(now, 1),
|
||||
]
|
||||
}
|
||||
|
||||
pub(crate) const fn metric_examples(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
now: DateTime<Utc>,
|
||||
before: DateTime<Utc>,
|
||||
) -> [NewRawMetric; 6] {
|
||||
[
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(now, 0),
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
|
||||
|
||||
@@ -7,7 +7,7 @@ use tokio::io::AsyncWriteExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::Instrument;
|
||||
|
||||
use super::{metrics::Name, Cache, MetricsKey, RawMetric};
|
||||
use super::{metrics::Name, Cache, MetricsKey, NewRawMetric, RawMetric};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
/// How the metrics from pageserver are identified.
|
||||
@@ -24,7 +24,7 @@ pub(super) async fn upload_metrics_http(
|
||||
client: &reqwest::Client,
|
||||
metric_collection_endpoint: &reqwest::Url,
|
||||
cancel: &CancellationToken,
|
||||
metrics: &[RawMetric],
|
||||
metrics: &[NewRawMetric],
|
||||
cached_metrics: &mut Cache,
|
||||
idempotency_keys: &[IdempotencyKey<'_>],
|
||||
) -> anyhow::Result<()> {
|
||||
@@ -53,8 +53,8 @@ pub(super) async fn upload_metrics_http(
|
||||
|
||||
match res {
|
||||
Ok(()) => {
|
||||
for (curr_key, curr_val) in chunk {
|
||||
cached_metrics.insert(*curr_key, *curr_val);
|
||||
for item in chunk {
|
||||
cached_metrics.insert(item.key, item.clone());
|
||||
}
|
||||
uploaded += chunk.len();
|
||||
}
|
||||
@@ -86,7 +86,7 @@ pub(super) async fn upload_metrics_bucket(
|
||||
client: &GenericRemoteStorage,
|
||||
cancel: &CancellationToken,
|
||||
node_id: &str,
|
||||
metrics: &[RawMetric],
|
||||
metrics: &[NewRawMetric],
|
||||
idempotency_keys: &[IdempotencyKey<'_>],
|
||||
) -> anyhow::Result<()> {
|
||||
if metrics.is_empty() {
|
||||
@@ -140,16 +140,16 @@ pub(super) async fn upload_metrics_bucket(
|
||||
/// across different metrics sinks), and must have the same length as input.
|
||||
fn serialize_in_chunks<'a>(
|
||||
chunk_size: usize,
|
||||
input: &'a [RawMetric],
|
||||
input: &'a [NewRawMetric],
|
||||
idempotency_keys: &'a [IdempotencyKey<'a>],
|
||||
) -> impl ExactSizeIterator<Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>> + 'a
|
||||
) -> impl ExactSizeIterator<Item = Result<(&'a [NewRawMetric], bytes::Bytes), serde_json::Error>> + 'a
|
||||
{
|
||||
use bytes::BufMut;
|
||||
|
||||
assert_eq!(input.len(), idempotency_keys.len());
|
||||
|
||||
struct Iter<'a> {
|
||||
inner: std::slice::Chunks<'a, RawMetric>,
|
||||
inner: std::slice::Chunks<'a, NewRawMetric>,
|
||||
idempotency_keys: std::slice::Iter<'a, IdempotencyKey<'a>>,
|
||||
chunk_size: usize,
|
||||
|
||||
@@ -160,7 +160,7 @@ fn serialize_in_chunks<'a>(
|
||||
}
|
||||
|
||||
impl<'a> Iterator for Iter<'a> {
|
||||
type Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>;
|
||||
type Item = Result<(&'a [NewRawMetric], bytes::Bytes), serde_json::Error>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
let chunk = self.inner.next()?;
|
||||
@@ -269,6 +269,58 @@ impl RawMetricExt for RawMetric {
|
||||
}
|
||||
}
|
||||
|
||||
impl RawMetricExt for NewRawMetric {
|
||||
fn as_event(&self, key: &IdempotencyKey<'_>) -> Event<Ids, Name> {
|
||||
let MetricsKey {
|
||||
metric,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
} = self.key;
|
||||
|
||||
let kind = self.kind;
|
||||
let value = self.value;
|
||||
|
||||
Event {
|
||||
kind,
|
||||
metric,
|
||||
idempotency_key: key.to_string(),
|
||||
value,
|
||||
extra: Ids {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
fn update_in_place(&self, event: &mut Event<Ids, Name>, key: &IdempotencyKey<'_>) {
|
||||
use std::fmt::Write;
|
||||
|
||||
let MetricsKey {
|
||||
metric,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
} = self.key;
|
||||
|
||||
let kind = self.kind;
|
||||
let value = self.value;
|
||||
|
||||
*event = Event {
|
||||
kind,
|
||||
metric,
|
||||
idempotency_key: {
|
||||
event.idempotency_key.clear();
|
||||
write!(event.idempotency_key, "{key}").unwrap();
|
||||
std::mem::take(&mut event.idempotency_key)
|
||||
},
|
||||
value,
|
||||
extra: Ids {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
},
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) trait KeyGen<'a> {
|
||||
fn generate(&self) -> IdempotencyKey<'a>;
|
||||
}
|
||||
@@ -381,6 +433,10 @@ async fn upload(
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::consumption_metrics::{
|
||||
disk_cache::read_metrics_from_serde_value, NewMetricsRefRoot,
|
||||
};
|
||||
|
||||
use super::*;
|
||||
use chrono::{DateTime, Utc};
|
||||
use once_cell::sync::Lazy;
|
||||
@@ -473,23 +529,49 @@ mod tests {
|
||||
let idempotency_key = consumption_metrics::IdempotencyKey::for_tests(*SAMPLES_NOW, "1", 0);
|
||||
let examples = examples.into_iter().zip(metric_samples());
|
||||
|
||||
for ((line, expected), (key, (kind, value))) in examples {
|
||||
for ((line, expected), item) in examples {
|
||||
let e = consumption_metrics::Event {
|
||||
kind,
|
||||
metric: key.metric,
|
||||
kind: item.kind,
|
||||
metric: item.key.metric,
|
||||
idempotency_key: idempotency_key.to_string(),
|
||||
value,
|
||||
value: item.value,
|
||||
extra: Ids {
|
||||
tenant_id: key.tenant_id,
|
||||
timeline_id: key.timeline_id,
|
||||
tenant_id: item.key.tenant_id,
|
||||
timeline_id: item.key.timeline_id,
|
||||
},
|
||||
};
|
||||
let actual = serde_json::to_string(&e).unwrap();
|
||||
assert_eq!(expected, actual, "example for {kind:?} from line {line}");
|
||||
assert_eq!(
|
||||
expected, actual,
|
||||
"example for {:?} from line {line}",
|
||||
item.kind
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
fn metric_samples() -> [RawMetric; 6] {
|
||||
#[test]
|
||||
fn disk_format_upgrade() {
|
||||
let old_samples_json = serde_json::to_value(metric_samples_old()).unwrap();
|
||||
let new_samples =
|
||||
serde_json::to_value(NewMetricsRefRoot::new(metric_samples().as_ref())).unwrap();
|
||||
let upgraded_samples = read_metrics_from_serde_value(old_samples_json).unwrap();
|
||||
let new_samples = read_metrics_from_serde_value(new_samples).unwrap();
|
||||
assert_eq!(upgraded_samples, new_samples);
|
||||
}
|
||||
|
||||
fn metric_samples_old() -> [RawMetric; 6] {
|
||||
let tenant_id = TenantId::from_array([0; 16]);
|
||||
let timeline_id = TimelineId::from_array([0xff; 16]);
|
||||
|
||||
let before = DateTime::parse_from_rfc3339("2023-09-14T00:00:00.123456789Z")
|
||||
.unwrap()
|
||||
.into();
|
||||
let [now, before] = [*SAMPLES_NOW, before];
|
||||
|
||||
super::super::metrics::metric_examples_old(tenant_id, timeline_id, now, before)
|
||||
}
|
||||
|
||||
fn metric_samples() -> [NewRawMetric; 6] {
|
||||
let tenant_id = TenantId::from_array([0; 16]);
|
||||
let timeline_id = TimelineId::from_array([0xff; 16]);
|
||||
|
||||
|
||||
@@ -696,7 +696,7 @@ impl DeletionQueue {
|
||||
mod test {
|
||||
use camino::Utf8Path;
|
||||
use hex_literal::hex;
|
||||
use pageserver_api::{shard::ShardIndex, upcall_api::ReAttachResponseTenant};
|
||||
use pageserver_api::{key::Key, shard::ShardIndex, upcall_api::ReAttachResponseTenant};
|
||||
use std::{io::ErrorKind, time::Duration};
|
||||
use tracing::info;
|
||||
|
||||
@@ -705,7 +705,6 @@ mod test {
|
||||
|
||||
use crate::{
|
||||
controller_upcall_client::RetryForeverError,
|
||||
repository::Key,
|
||||
tenant::{harness::TenantHarness, storage_layer::DeltaLayerName},
|
||||
};
|
||||
|
||||
|
||||
@@ -2232,13 +2232,13 @@ async fn getpage_at_lsn_handler(
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let state = get_state(&request);
|
||||
|
||||
struct Key(crate::repository::Key);
|
||||
struct Key(pageserver_api::key::Key);
|
||||
|
||||
impl std::str::FromStr for Key {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
|
||||
crate::repository::Key::from_hex(s).map(Key)
|
||||
pageserver_api::key::Key::from_hex(s).map(Key)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -12,6 +12,7 @@ use pageserver_api::key::rel_block_to_key;
|
||||
use tokio::io::{AsyncRead, AsyncReadExt};
|
||||
use tokio_tar::Archive;
|
||||
use tracing::*;
|
||||
use wal_decoder::models::InterpretedWalRecord;
|
||||
use walkdir::WalkDir;
|
||||
|
||||
use crate::context::RequestContext;
|
||||
@@ -19,8 +20,6 @@ use crate::metrics::WAL_INGEST;
|
||||
use crate::pgdatadir_mapping::*;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::walingest::WalIngest;
|
||||
use crate::walrecord::decode_wal_record;
|
||||
use crate::walrecord::DecodedWALRecord;
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::relfile_utils::*;
|
||||
@@ -313,11 +312,15 @@ async fn import_wal(
|
||||
let mut modification = tline.begin_modification(last_lsn);
|
||||
while last_lsn <= endpoint {
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
let mut decoded = DecodedWALRecord::default();
|
||||
decode_wal_record(recdata, &mut decoded, tline.pg_version)?;
|
||||
let interpreted = InterpretedWalRecord::from_bytes_filtered(
|
||||
recdata,
|
||||
tline.get_shard_identity(),
|
||||
lsn,
|
||||
tline.pg_version,
|
||||
)?;
|
||||
|
||||
walingest
|
||||
.ingest_record(decoded, lsn, &mut modification, ctx)
|
||||
.ingest_record(interpreted, &mut modification, ctx)
|
||||
.await?;
|
||||
WAL_INGEST.records_committed.inc();
|
||||
|
||||
@@ -454,10 +457,15 @@ pub async fn import_wal_from_tar(
|
||||
let mut modification = tline.begin_modification(last_lsn);
|
||||
while last_lsn <= end_lsn {
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
let mut decoded = DecodedWALRecord::default();
|
||||
decode_wal_record(recdata, &mut decoded, tline.pg_version)?;
|
||||
let interpreted = InterpretedWalRecord::from_bytes_filtered(
|
||||
recdata,
|
||||
tline.get_shard_identity(),
|
||||
lsn,
|
||||
tline.pg_version,
|
||||
)?;
|
||||
|
||||
walingest
|
||||
.ingest_record(decoded, lsn, &mut modification, ctx)
|
||||
.ingest_record(interpreted, &mut modification, ctx)
|
||||
.await?;
|
||||
modification.commit(ctx).await?;
|
||||
last_lsn = lsn;
|
||||
|
||||
@@ -24,7 +24,6 @@ pub mod metrics;
|
||||
pub mod page_cache;
|
||||
pub mod page_service;
|
||||
pub mod pgdatadir_mapping;
|
||||
pub mod repository;
|
||||
pub mod span;
|
||||
pub(crate) mod statvfs;
|
||||
pub mod task_mgr;
|
||||
@@ -32,7 +31,6 @@ pub mod tenant;
|
||||
pub mod utilization;
|
||||
pub mod virtual_file;
|
||||
pub mod walingest;
|
||||
pub mod walrecord;
|
||||
pub mod walredo;
|
||||
|
||||
use camino::Utf8Path;
|
||||
|
||||
@@ -1080,6 +1080,7 @@ impl PageServerHandler {
|
||||
prev_lsn: Option<Lsn>,
|
||||
full_backup: bool,
|
||||
gzip: bool,
|
||||
replica: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
@@ -1132,6 +1133,7 @@ impl PageServerHandler {
|
||||
lsn,
|
||||
prev_lsn,
|
||||
full_backup,
|
||||
replica,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
@@ -1154,6 +1156,7 @@ impl PageServerHandler {
|
||||
lsn,
|
||||
prev_lsn,
|
||||
full_backup,
|
||||
replica,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
@@ -1170,6 +1173,7 @@ impl PageServerHandler {
|
||||
lsn,
|
||||
prev_lsn,
|
||||
full_backup,
|
||||
replica,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
@@ -1326,24 +1330,27 @@ where
|
||||
.for_command(ComputeCommandKind::Basebackup)
|
||||
.inc();
|
||||
|
||||
let (lsn, gzip) = match (params.get(2), params.get(3)) {
|
||||
(None, _) => (None, false),
|
||||
(Some(&"--gzip"), _) => (None, true),
|
||||
(Some(lsn_str), gzip_str_opt) => {
|
||||
let lsn = Lsn::from_str(lsn_str)
|
||||
.with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?;
|
||||
let gzip = match gzip_str_opt {
|
||||
Some(&"--gzip") => true,
|
||||
None => false,
|
||||
Some(third_param) => {
|
||||
let mut lsn = None;
|
||||
let mut replica = false;
|
||||
let mut gzip = false;
|
||||
for param in ¶ms[2..] {
|
||||
if param.starts_with("--") {
|
||||
match *param {
|
||||
"--gzip" => gzip = true,
|
||||
"--replica" => replica = true,
|
||||
_ => {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"Parameter in position 3 unknown {third_param}",
|
||||
"Unknown parameter {param}",
|
||||
)))
|
||||
}
|
||||
};
|
||||
(Some(lsn), gzip)
|
||||
}
|
||||
} else {
|
||||
lsn = Some(
|
||||
Lsn::from_str(param)
|
||||
.with_context(|| format!("Failed to parse Lsn from {param}"))?,
|
||||
);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
|
||||
let res = async {
|
||||
@@ -1355,6 +1362,7 @@ where
|
||||
None,
|
||||
false,
|
||||
gzip,
|
||||
replica,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -1415,6 +1423,7 @@ where
|
||||
prev_lsn,
|
||||
true,
|
||||
false,
|
||||
false,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -7,14 +7,14 @@
|
||||
//! Clarify that)
|
||||
//!
|
||||
use super::tenant::{PageReconstructError, Timeline};
|
||||
use crate::aux_file;
|
||||
use crate::context::RequestContext;
|
||||
use crate::keyspace::{KeySpace, KeySpaceAccum};
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use crate::{aux_file, repository::*};
|
||||
use anyhow::{ensure, Context};
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use enum_map::Enum;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::key::{
|
||||
dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
|
||||
relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
|
||||
@@ -22,7 +22,9 @@ use pageserver_api::key::{
|
||||
CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
|
||||
};
|
||||
use pageserver_api::keyspace::SparseKeySpace;
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
|
||||
use pageserver_api::value::Value;
|
||||
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use postgres_ffi::{Oid, RepOriginId, TimestampTz, TransactionId};
|
||||
|
||||
@@ -92,11 +92,11 @@ use crate::metrics::{
|
||||
remove_tenant_metrics, BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN,
|
||||
TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC,
|
||||
};
|
||||
use crate::repository::GcResult;
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::config::LocationMode;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::gc_result::GcResult;
|
||||
pub use crate::tenant::remote_timeline_client::index::IndexPart;
|
||||
use crate::tenant::remote_timeline_client::remote_initdb_archive_path;
|
||||
use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
|
||||
@@ -160,6 +160,7 @@ pub(crate) mod timeline;
|
||||
pub mod size;
|
||||
|
||||
mod gc_block;
|
||||
mod gc_result;
|
||||
pub(crate) mod throttle;
|
||||
|
||||
pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
@@ -301,6 +302,13 @@ pub struct Tenant {
|
||||
/// **Lock order**: if acquiring all (or a subset), acquire them in order `timelines`, `timelines_offloaded`, `timelines_creating`
|
||||
timelines_offloaded: Mutex<HashMap<TimelineId, Arc<OffloadedTimeline>>>,
|
||||
|
||||
/// Serialize writes of the tenant manifest to remote storage. If there are concurrent operations
|
||||
/// affecting the manifest, such as timeline deletion and timeline offload, they must wait for
|
||||
/// each other (this could be optimized to coalesce writes if necessary).
|
||||
///
|
||||
/// The contents of the Mutex are the last manifest we successfully uploaded
|
||||
tenant_manifest_upload: tokio::sync::Mutex<Option<TenantManifest>>,
|
||||
|
||||
// This mutex prevents creation of new timelines during GC.
|
||||
// Adding yet another mutex (in addition to `timelines`) is needed because holding
|
||||
// `timelines` mutex during all GC iteration
|
||||
@@ -467,10 +475,10 @@ impl WalRedoManager {
|
||||
/// This method is cancellation-safe.
|
||||
pub async fn request_redo(
|
||||
&self,
|
||||
key: crate::repository::Key,
|
||||
key: pageserver_api::key::Key,
|
||||
lsn: Lsn,
|
||||
base_img: Option<(Lsn, bytes::Bytes)>,
|
||||
records: Vec<(Lsn, crate::walrecord::NeonWalRecord)>,
|
||||
records: Vec<(Lsn, pageserver_api::record::NeonWalRecord)>,
|
||||
pg_version: u32,
|
||||
) -> Result<bytes::Bytes, walredo::Error> {
|
||||
match self {
|
||||
@@ -513,13 +521,6 @@ pub struct OffloadedTimeline {
|
||||
/// Present for future flattening deliberations.
|
||||
pub archived_at: NaiveDateTime,
|
||||
|
||||
/// Lazily constructed remote client for the timeline
|
||||
///
|
||||
/// If we offload a timeline, we keep around the remote client
|
||||
/// for the duration of the process. If we find it through the
|
||||
/// manifest, we don't construct it up until it's needed (deletion).
|
||||
pub remote_client: Option<Arc<RemoteTimelineClient>>,
|
||||
|
||||
/// Prevent two tasks from deleting the timeline at the same time. If held, the
|
||||
/// timeline is being deleted. If 'true', the timeline has already been deleted.
|
||||
pub delete_progress: TimelineDeleteProgress,
|
||||
@@ -546,7 +547,6 @@ impl OffloadedTimeline {
|
||||
ancestor_retain_lsn,
|
||||
archived_at,
|
||||
|
||||
remote_client: Some(timeline.remote_client.clone()),
|
||||
delete_progress: timeline.delete_progress.clone(),
|
||||
})
|
||||
}
|
||||
@@ -563,7 +563,6 @@ impl OffloadedTimeline {
|
||||
ancestor_timeline_id,
|
||||
ancestor_retain_lsn,
|
||||
archived_at,
|
||||
remote_client: None,
|
||||
delete_progress: TimelineDeleteProgress::default(),
|
||||
}
|
||||
}
|
||||
@@ -625,19 +624,10 @@ impl TimelineOrOffloaded {
|
||||
TimelineOrOffloaded::Offloaded(offloaded) => &offloaded.delete_progress,
|
||||
}
|
||||
}
|
||||
fn remote_client_maybe_construct(&self, tenant: &Tenant) -> Arc<RemoteTimelineClient> {
|
||||
fn maybe_remote_client(&self) -> Option<Arc<RemoteTimelineClient>> {
|
||||
match self {
|
||||
TimelineOrOffloaded::Timeline(timeline) => timeline.remote_client.clone(),
|
||||
TimelineOrOffloaded::Offloaded(offloaded) => match offloaded.remote_client.clone() {
|
||||
Some(remote_client) => remote_client,
|
||||
None => {
|
||||
let remote_client = tenant.build_timeline_client(
|
||||
offloaded.timeline_id,
|
||||
tenant.remote_storage.clone(),
|
||||
);
|
||||
Arc::new(remote_client)
|
||||
}
|
||||
},
|
||||
TimelineOrOffloaded::Timeline(timeline) => Some(timeline.remote_client.clone()),
|
||||
TimelineOrOffloaded::Offloaded(_offloaded) => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -749,6 +739,24 @@ pub enum TimelineArchivalError {
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum TenantManifestError {
|
||||
#[error("Remote storage error: {0}")]
|
||||
RemoteStorage(anyhow::Error),
|
||||
|
||||
#[error("Cancelled")]
|
||||
Cancelled,
|
||||
}
|
||||
|
||||
impl From<TenantManifestError> for TimelineArchivalError {
|
||||
fn from(e: TenantManifestError) -> Self {
|
||||
match e {
|
||||
TenantManifestError::RemoteStorage(e) => Self::Other(e),
|
||||
TenantManifestError::Cancelled => Self::Cancelled,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Debug for TimelineArchivalError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
@@ -1335,14 +1343,15 @@ impl Tenant {
|
||||
)
|
||||
.await?;
|
||||
let (offloaded_add, tenant_manifest) =
|
||||
match remote_timeline_client::do_download_tenant_manifest(
|
||||
match remote_timeline_client::download_tenant_manifest(
|
||||
remote_storage,
|
||||
&self.tenant_shard_id,
|
||||
self.generation,
|
||||
&cancel,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok((tenant_manifest, _generation)) => (
|
||||
Ok((tenant_manifest, _generation, _manifest_mtime)) => (
|
||||
format!("{} offloaded", tenant_manifest.offloaded_timelines.len()),
|
||||
tenant_manifest,
|
||||
),
|
||||
@@ -1534,18 +1543,7 @@ impl Tenant {
|
||||
offloaded_timelines_accessor.extend(offloaded_timelines_list.into_iter());
|
||||
}
|
||||
if !offloaded_timeline_ids.is_empty() {
|
||||
let manifest = self.tenant_manifest();
|
||||
// TODO: generation support
|
||||
let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
|
||||
upload_tenant_manifest(
|
||||
&self.remote_storage,
|
||||
&self.tenant_shard_id,
|
||||
generation,
|
||||
&manifest,
|
||||
&self.cancel,
|
||||
)
|
||||
.await
|
||||
.map_err(TimelineArchivalError::Other)?;
|
||||
self.store_tenant_manifest().await?;
|
||||
}
|
||||
|
||||
// The local filesystem contents are a cache of what's in the remote IndexPart;
|
||||
@@ -1830,6 +1828,18 @@ impl Tenant {
|
||||
ctx: RequestContext,
|
||||
) -> Result<Arc<Timeline>, TimelineArchivalError> {
|
||||
info!("unoffloading timeline");
|
||||
|
||||
// We activate the timeline below manually, so this must be called on an active timeline.
|
||||
// We expect callers of this function to ensure this.
|
||||
match self.current_state() {
|
||||
TenantState::Activating { .. }
|
||||
| TenantState::Attaching
|
||||
| TenantState::Broken { .. } => {
|
||||
panic!("Timeline expected to be active")
|
||||
}
|
||||
TenantState::Stopping { .. } => return Err(TimelineArchivalError::Cancelled),
|
||||
TenantState::Active => {}
|
||||
}
|
||||
let cancel = self.cancel.clone();
|
||||
|
||||
// Protect against concurrent attempts to use this TimelineId
|
||||
@@ -1914,18 +1924,7 @@ impl Tenant {
|
||||
};
|
||||
|
||||
// Upload new list of offloaded timelines to S3
|
||||
let manifest = self.tenant_manifest();
|
||||
// TODO: generation support
|
||||
let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
|
||||
upload_tenant_manifest(
|
||||
&self.remote_storage,
|
||||
&self.tenant_shard_id,
|
||||
generation,
|
||||
&manifest,
|
||||
&cancel,
|
||||
)
|
||||
.await
|
||||
.map_err(TimelineArchivalError::Other)?;
|
||||
self.store_tenant_manifest().await?;
|
||||
|
||||
// Activate the timeline (if it makes sense)
|
||||
if !(timeline.is_broken() || timeline.is_stopping()) {
|
||||
@@ -3122,9 +3121,7 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
|
||||
let tenant_manifest = self.tenant_manifest();
|
||||
// TODO: generation support
|
||||
let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
|
||||
let tenant_manifest = self.build_tenant_manifest();
|
||||
for child_shard in child_shards {
|
||||
tracing::info!(
|
||||
"Uploading tenant manifest for child {}",
|
||||
@@ -3133,7 +3130,7 @@ impl Tenant {
|
||||
upload_tenant_manifest(
|
||||
&self.remote_storage,
|
||||
child_shard,
|
||||
generation,
|
||||
self.generation,
|
||||
&tenant_manifest,
|
||||
&self.cancel,
|
||||
)
|
||||
@@ -3317,7 +3314,8 @@ impl Tenant {
|
||||
.unwrap_or(self.conf.default_tenant_conf.lsn_lease_length)
|
||||
}
|
||||
|
||||
pub(crate) fn tenant_manifest(&self) -> TenantManifest {
|
||||
/// Generate an up-to-date TenantManifest based on the state of this Tenant.
|
||||
fn build_tenant_manifest(&self) -> TenantManifest {
|
||||
let timelines_offloaded = self.timelines_offloaded.lock().unwrap();
|
||||
|
||||
let mut timeline_manifests = timelines_offloaded
|
||||
@@ -3525,6 +3523,7 @@ impl Tenant {
|
||||
timelines: Mutex::new(HashMap::new()),
|
||||
timelines_creating: Mutex::new(HashSet::new()),
|
||||
timelines_offloaded: Mutex::new(HashMap::new()),
|
||||
tenant_manifest_upload: Default::default(),
|
||||
gc_cs: tokio::sync::Mutex::new(()),
|
||||
walredo_mgr,
|
||||
remote_storage,
|
||||
@@ -4704,6 +4703,49 @@ impl Tenant {
|
||||
.max()
|
||||
.unwrap_or(0)
|
||||
}
|
||||
|
||||
/// Serialize and write the latest TenantManifest to remote storage.
|
||||
pub(crate) async fn store_tenant_manifest(&self) -> Result<(), TenantManifestError> {
|
||||
// Only one manifest write may be done at at time, and the contents of the manifest
|
||||
// must be loaded while holding this lock. This makes it safe to call this function
|
||||
// from anywhere without worrying about colliding updates.
|
||||
let mut guard = tokio::select! {
|
||||
g = self.tenant_manifest_upload.lock() => {
|
||||
g
|
||||
},
|
||||
_ = self.cancel.cancelled() => {
|
||||
return Err(TenantManifestError::Cancelled);
|
||||
}
|
||||
};
|
||||
|
||||
let manifest = self.build_tenant_manifest();
|
||||
if Some(&manifest) == (*guard).as_ref() {
|
||||
// Optimisation: skip uploads that don't change anything.
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
upload_tenant_manifest(
|
||||
&self.remote_storage,
|
||||
&self.tenant_shard_id,
|
||||
self.generation,
|
||||
&manifest,
|
||||
&self.cancel,
|
||||
)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
if self.cancel.is_cancelled() {
|
||||
TenantManifestError::Cancelled
|
||||
} else {
|
||||
TenantManifestError::RemoteStorage(e)
|
||||
}
|
||||
})?;
|
||||
|
||||
// Store the successfully uploaded manifest, so that future callers can avoid
|
||||
// re-uploading the same thing.
|
||||
*guard = Some(manifest);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Create the cluster temporarily in 'initdbpath' directory inside the repository
|
||||
@@ -4806,7 +4848,8 @@ pub(crate) mod harness {
|
||||
use crate::deletion_queue::mock::MockDeletionQueue;
|
||||
use crate::l0_flush::L0FlushConfig;
|
||||
use crate::walredo::apply_neon;
|
||||
use crate::{repository::Key, walrecord::NeonWalRecord};
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
|
||||
use super::*;
|
||||
use hex_literal::hex;
|
||||
@@ -4853,7 +4896,6 @@ pub(crate) mod harness {
|
||||
image_layer_creation_check_threshold: Some(
|
||||
tenant_conf.image_layer_creation_check_threshold,
|
||||
),
|
||||
switch_aux_file_policy: Some(tenant_conf.switch_aux_file_policy),
|
||||
lsn_lease_length: Some(tenant_conf.lsn_lease_length),
|
||||
lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts),
|
||||
}
|
||||
@@ -5076,25 +5118,31 @@ mod tests {
|
||||
|
||||
use super::*;
|
||||
use crate::keyspace::KeySpaceAccum;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::tenant::harness::*;
|
||||
use crate::tenant::timeline::CompactFlags;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use crate::DEFAULT_PG_VERSION;
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use hex_literal::hex;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE};
|
||||
use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
|
||||
use pageserver_api::value::Value;
|
||||
use pageserver_compaction::helpers::overlaps_with;
|
||||
use rand::{thread_rng, Rng};
|
||||
use storage_layer::PersistentLayerKey;
|
||||
use tests::storage_layer::ValuesReconstructState;
|
||||
use tests::timeline::{GetVectoredError, ShutdownMode};
|
||||
use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
|
||||
use timeline::{DeltaLayerTestDesc, GcInfo};
|
||||
use timeline::DeltaLayerTestDesc;
|
||||
use utils::id::TenantId;
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
#[cfg(feature = "testing")]
|
||||
use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
|
||||
#[cfg(feature = "testing")]
|
||||
use timeline::GcInfo;
|
||||
|
||||
static TEST_KEY: Lazy<Key> =
|
||||
Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
|
||||
|
||||
@@ -7603,23 +7651,7 @@ mod tests {
|
||||
}
|
||||
|
||||
// Check if old layers are removed / new layers have the expected LSN
|
||||
let mut all_layers = tline.inspect_historic_layers().await.unwrap();
|
||||
all_layers.sort_by(|k1, k2| {
|
||||
(
|
||||
k1.is_delta,
|
||||
k1.key_range.start,
|
||||
k1.key_range.end,
|
||||
k1.lsn_range.start,
|
||||
k1.lsn_range.end,
|
||||
)
|
||||
.cmp(&(
|
||||
k2.is_delta,
|
||||
k2.key_range.start,
|
||||
k2.key_range.end,
|
||||
k2.lsn_range.start,
|
||||
k2.lsn_range.end,
|
||||
))
|
||||
});
|
||||
let all_layers = inspect_and_sort(&tline, None).await;
|
||||
assert_eq!(
|
||||
all_layers,
|
||||
vec![
|
||||
@@ -7659,6 +7691,7 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_neon_test_record() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_neon_test_record").await?;
|
||||
@@ -7850,6 +7883,7 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction_deltas() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas").await?;
|
||||
@@ -8046,6 +8080,7 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_generate_key_retention() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_generate_key_retention").await?;
|
||||
@@ -8393,6 +8428,7 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction_with_retain_lsns() -> anyhow::Result<()> {
|
||||
let harness =
|
||||
@@ -8633,6 +8669,7 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction_with_retain_lsns_single_key() -> anyhow::Result<()>
|
||||
{
|
||||
@@ -8841,6 +8878,7 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?;
|
||||
@@ -9042,6 +9080,7 @@ mod tests {
|
||||
//
|
||||
// When querying the key range [A, B) we need to read at different LSN ranges
|
||||
// for [A, C) and [C, B). This test checks that the described edge case is handled correctly.
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_vectored_read_with_nested_image_layer() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_vectored_read_with_nested_image_layer").await?;
|
||||
@@ -9156,4 +9195,249 @@ mod tests {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn inspect_and_sort(
|
||||
tline: &Arc<Timeline>,
|
||||
filter: Option<std::ops::Range<Key>>,
|
||||
) -> Vec<PersistentLayerKey> {
|
||||
let mut all_layers = tline.inspect_historic_layers().await.unwrap();
|
||||
if let Some(filter) = filter {
|
||||
all_layers.retain(|layer| overlaps_with(&layer.key_range, &filter));
|
||||
}
|
||||
all_layers.sort_by(|k1, k2| {
|
||||
(
|
||||
k1.is_delta,
|
||||
k1.key_range.start,
|
||||
k1.key_range.end,
|
||||
k1.lsn_range.start,
|
||||
k1.lsn_range.end,
|
||||
)
|
||||
.cmp(&(
|
||||
k2.is_delta,
|
||||
k2.key_range.start,
|
||||
k2.key_range.end,
|
||||
k2.lsn_range.start,
|
||||
k2.lsn_range.end,
|
||||
))
|
||||
});
|
||||
all_layers
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_simple_partial_bottom_most_compaction() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_simple_partial_bottom_most_compaction").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
// using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
|
||||
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||
key.field6 = id;
|
||||
key
|
||||
}
|
||||
|
||||
// img layer at 0x10
|
||||
let img_layer = (0..10)
|
||||
.map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
|
||||
.collect_vec();
|
||||
|
||||
let delta1 = vec![
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x20),
|
||||
Value::Image(Bytes::from("value 1@0x20")),
|
||||
),
|
||||
(
|
||||
get_key(2),
|
||||
Lsn(0x30),
|
||||
Value::Image(Bytes::from("value 2@0x30")),
|
||||
),
|
||||
(
|
||||
get_key(3),
|
||||
Lsn(0x40),
|
||||
Value::Image(Bytes::from("value 3@0x40")),
|
||||
),
|
||||
];
|
||||
let delta2 = vec![
|
||||
(
|
||||
get_key(5),
|
||||
Lsn(0x20),
|
||||
Value::Image(Bytes::from("value 5@0x20")),
|
||||
),
|
||||
(
|
||||
get_key(6),
|
||||
Lsn(0x20),
|
||||
Value::Image(Bytes::from("value 6@0x20")),
|
||||
),
|
||||
];
|
||||
let delta3 = vec![
|
||||
(
|
||||
get_key(8),
|
||||
Lsn(0x48),
|
||||
Value::Image(Bytes::from("value 8@0x48")),
|
||||
),
|
||||
(
|
||||
get_key(9),
|
||||
Lsn(0x48),
|
||||
Value::Image(Bytes::from("value 9@0x48")),
|
||||
),
|
||||
];
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TIMELINE_ID,
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
vec![
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
|
||||
], // delta layers
|
||||
vec![(Lsn(0x10), img_layer)], // image layers
|
||||
Lsn(0x50),
|
||||
)
|
||||
.await?;
|
||||
|
||||
{
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
*guard = GcInfo {
|
||||
retain_lsns: vec![(Lsn(0x20), tline.timeline_id, MaybeOffloaded::No)],
|
||||
cutoffs: GcCutoffs {
|
||||
time: Lsn(0x30),
|
||||
space: Lsn(0x30),
|
||||
},
|
||||
leases: Default::default(),
|
||||
within_ancestor_pitr: false,
|
||||
};
|
||||
}
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
// Do a partial compaction on key range 0..4, we should generate a image layer; no other layers
|
||||
// can be removed because they might be used for other key ranges.
|
||||
tline
|
||||
.partial_compact_with_gc(Some(get_key(0)..get_key(4)), &cancel, EnumSet::new(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
|
||||
assert_eq!(
|
||||
all_layers,
|
||||
vec![
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(0)..get_key(4),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x21),
|
||||
is_delta: false
|
||||
},
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(0)..get_key(10),
|
||||
lsn_range: Lsn(0x10)..Lsn(0x11),
|
||||
is_delta: false
|
||||
},
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(1)..get_key(4),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x48),
|
||||
is_delta: true
|
||||
},
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(5)..get_key(7),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x48),
|
||||
is_delta: true
|
||||
},
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(8)..get_key(10),
|
||||
lsn_range: Lsn(0x48)..Lsn(0x50),
|
||||
is_delta: true
|
||||
}
|
||||
]
|
||||
);
|
||||
|
||||
// Do a partial compaction on key range 4..10
|
||||
tline
|
||||
.partial_compact_with_gc(Some(get_key(4)..get_key(10)), &cancel, EnumSet::new(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
|
||||
assert_eq!(
|
||||
all_layers,
|
||||
vec![
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(0)..get_key(4),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x21),
|
||||
is_delta: false
|
||||
},
|
||||
PersistentLayerKey {
|
||||
// if (in the future) GC kicks in, this layer will be removed
|
||||
key_range: get_key(0)..get_key(10),
|
||||
lsn_range: Lsn(0x10)..Lsn(0x11),
|
||||
is_delta: false
|
||||
},
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(4)..get_key(10),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x21),
|
||||
is_delta: false
|
||||
},
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(1)..get_key(4),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x48),
|
||||
is_delta: true
|
||||
},
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(5)..get_key(7),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x48),
|
||||
is_delta: true
|
||||
},
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(8)..get_key(10),
|
||||
lsn_range: Lsn(0x48)..Lsn(0x50),
|
||||
is_delta: true
|
||||
}
|
||||
]
|
||||
);
|
||||
|
||||
// Do a partial compaction on key range 0..10, all image layers below LSN 20 can be replaced with new ones.
|
||||
tline
|
||||
.partial_compact_with_gc(Some(get_key(0)..get_key(10)), &cancel, EnumSet::new(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
|
||||
assert_eq!(
|
||||
all_layers,
|
||||
vec![
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(0)..get_key(4),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x21),
|
||||
is_delta: false
|
||||
},
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(0)..get_key(10),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x21),
|
||||
is_delta: false
|
||||
},
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(4)..get_key(10),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x21),
|
||||
is_delta: false
|
||||
},
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(1)..get_key(4),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x48),
|
||||
is_delta: true
|
||||
},
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(5)..get_key(7),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x48),
|
||||
is_delta: true
|
||||
},
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(8)..get_key(10),
|
||||
lsn_range: Lsn(0x48)..Lsn(0x50),
|
||||
is_delta: true
|
||||
}
|
||||
]
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,7 +9,6 @@
|
||||
//! may lead to a data loss.
|
||||
//!
|
||||
pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use pageserver_api::models::CompactionAlgorithmSettings;
|
||||
use pageserver_api::models::EvictionPolicy;
|
||||
use pageserver_api::models::{self, ThrottleConfig};
|
||||
@@ -341,10 +340,6 @@ pub struct TenantConfOpt {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub image_layer_creation_check_threshold: Option<u8>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
pub switch_aux_file_policy: Option<AuxFilePolicy>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(with = "humantime_serde")]
|
||||
#[serde(default)]
|
||||
@@ -410,9 +405,6 @@ impl TenantConfOpt {
|
||||
image_layer_creation_check_threshold: self
|
||||
.image_layer_creation_check_threshold
|
||||
.unwrap_or(global_conf.image_layer_creation_check_threshold),
|
||||
switch_aux_file_policy: self
|
||||
.switch_aux_file_policy
|
||||
.unwrap_or(global_conf.switch_aux_file_policy),
|
||||
lsn_lease_length: self
|
||||
.lsn_lease_length
|
||||
.unwrap_or(global_conf.lsn_lease_length),
|
||||
@@ -470,7 +462,6 @@ impl From<TenantConfOpt> for models::TenantConfig {
|
||||
lazy_slru_download: value.lazy_slru_download,
|
||||
timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
|
||||
image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
|
||||
switch_aux_file_policy: value.switch_aux_file_policy,
|
||||
lsn_lease_length: value.lsn_lease_length.map(humantime),
|
||||
lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime),
|
||||
}
|
||||
|
||||
57
pageserver/src/tenant/gc_result.rs
Normal file
57
pageserver/src/tenant/gc_result.rs
Normal file
@@ -0,0 +1,57 @@
|
||||
use anyhow::Result;
|
||||
use serde::Serialize;
|
||||
use std::ops::AddAssign;
|
||||
use std::time::Duration;
|
||||
|
||||
///
|
||||
/// Result of performing GC
|
||||
///
|
||||
#[derive(Default, Serialize, Debug)]
|
||||
pub struct GcResult {
|
||||
pub layers_total: u64,
|
||||
pub layers_needed_by_cutoff: u64,
|
||||
pub layers_needed_by_pitr: u64,
|
||||
pub layers_needed_by_branches: u64,
|
||||
pub layers_needed_by_leases: u64,
|
||||
pub layers_not_updated: u64,
|
||||
pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
|
||||
|
||||
#[serde(serialize_with = "serialize_duration_as_millis")]
|
||||
pub elapsed: Duration,
|
||||
|
||||
/// The layers which were garbage collected.
|
||||
///
|
||||
/// Used in `/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc` to wait for the layers to be
|
||||
/// dropped in tests.
|
||||
#[cfg(feature = "testing")]
|
||||
#[serde(skip)]
|
||||
pub(crate) doomed_layers: Vec<crate::tenant::storage_layer::Layer>,
|
||||
}
|
||||
|
||||
// helper function for `GcResult`, serializing a `Duration` as an integer number of milliseconds
|
||||
fn serialize_duration_as_millis<S>(d: &Duration, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
d.as_millis().serialize(serializer)
|
||||
}
|
||||
|
||||
impl AddAssign for GcResult {
|
||||
fn add_assign(&mut self, other: Self) {
|
||||
self.layers_total += other.layers_total;
|
||||
self.layers_needed_by_pitr += other.layers_needed_by_pitr;
|
||||
self.layers_needed_by_cutoff += other.layers_needed_by_cutoff;
|
||||
self.layers_needed_by_branches += other.layers_needed_by_branches;
|
||||
self.layers_needed_by_leases += other.layers_needed_by_leases;
|
||||
self.layers_not_updated += other.layers_not_updated;
|
||||
self.layers_removed += other.layers_removed;
|
||||
|
||||
self.elapsed += other.elapsed;
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
{
|
||||
let mut other = other;
|
||||
self.doomed_layers.append(&mut other.doomed_layers);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -48,9 +48,9 @@ mod layer_coverage;
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::keyspace::KeyPartitioning;
|
||||
use crate::repository::Key;
|
||||
use crate::tenant::storage_layer::InMemoryLayer;
|
||||
use anyhow::Result;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::{KeySpace, KeySpaceAccum};
|
||||
use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze};
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
|
||||
@@ -2811,7 +2811,7 @@ where
|
||||
}
|
||||
|
||||
use {
|
||||
crate::repository::GcResult, pageserver_api::models::TimelineGcRequest,
|
||||
crate::tenant::gc_result::GcResult, pageserver_api::models::TimelineGcRequest,
|
||||
utils::http::error::ApiError,
|
||||
};
|
||||
|
||||
|
||||
@@ -190,6 +190,7 @@ use chrono::{NaiveDateTime, Utc};
|
||||
pub(crate) use download::download_initdb_tar_zst;
|
||||
use pageserver_api::models::TimelineArchivalState;
|
||||
use pageserver_api::shard::{ShardIndex, TenantShardId};
|
||||
use regex::Regex;
|
||||
use scopeguard::ScopeGuard;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::backoff::{
|
||||
@@ -199,7 +200,7 @@ use utils::pausable_failpoint;
|
||||
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::sync::atomic::{AtomicU32, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::sync::{Arc, Mutex, OnceLock};
|
||||
use std::time::Duration;
|
||||
|
||||
use remote_storage::{
|
||||
@@ -245,11 +246,11 @@ use super::upload_queue::{NotInitialized, SetDeletedFlagProgress};
|
||||
use super::Generation;
|
||||
|
||||
pub(crate) use download::{
|
||||
do_download_tenant_manifest, download_index_part, is_temp_download_file,
|
||||
download_index_part, download_tenant_manifest, is_temp_download_file,
|
||||
list_remote_tenant_shards, list_remote_timelines,
|
||||
};
|
||||
pub(crate) use index::LayerFileMetadata;
|
||||
pub(crate) use upload::{upload_initdb_dir, upload_tenant_manifest};
|
||||
pub(crate) use upload::upload_initdb_dir;
|
||||
|
||||
// Occasional network issues and such can cause remote operations to fail, and
|
||||
// that's expected. If a download fails, we log it at info-level, and retry.
|
||||
@@ -274,12 +275,6 @@ pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
|
||||
/// which we warn and skip.
|
||||
const DELETION_QUEUE_FLUSH_TIMEOUT: Duration = Duration::from_secs(10);
|
||||
|
||||
/// Hardcode a generation for the tenant manifest for now so that we don't
|
||||
/// need to deal with generation-less manifests in the future.
|
||||
///
|
||||
/// TODO: add proper generation support to all the places that use this.
|
||||
pub(crate) const TENANT_MANIFEST_GENERATION: Generation = Generation::new(1);
|
||||
|
||||
pub enum MaybeDeletedIndexPart {
|
||||
IndexPart(IndexPart),
|
||||
Deleted(IndexPart),
|
||||
@@ -2239,6 +2234,12 @@ pub fn remote_tenant_manifest_path(
|
||||
RemotePath::from_string(&path).expect("Failed to construct path")
|
||||
}
|
||||
|
||||
/// Prefix to all generations' manifest objects in a tenant shard
|
||||
pub fn remote_tenant_manifest_prefix(tenant_shard_id: &TenantShardId) -> RemotePath {
|
||||
let path = format!("tenants/{tenant_shard_id}/tenant-manifest",);
|
||||
RemotePath::from_string(&path).expect("Failed to construct path")
|
||||
}
|
||||
|
||||
pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath {
|
||||
let path = format!("tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}");
|
||||
RemotePath::from_string(&path).expect("Failed to construct path")
|
||||
@@ -2333,6 +2334,15 @@ pub fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Given the key of a tenant manifest, parse out the generation number
|
||||
pub(crate) fn parse_remote_tenant_manifest_path(path: RemotePath) -> Option<Generation> {
|
||||
static RE: OnceLock<Regex> = OnceLock::new();
|
||||
let re = RE.get_or_init(|| Regex::new(r".+tenant-manifest-([0-9a-f]{8}).json").unwrap());
|
||||
re.captures(path.get_path().as_str())
|
||||
.and_then(|c| c.get(1))
|
||||
.and_then(|m| Generation::parse_suffix(m.as_str()))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
@@ -20,7 +20,9 @@ use utils::backoff;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::span::{
|
||||
debug_assert_current_span_has_tenant_and_timeline_id, debug_assert_current_span_has_tenant_id,
|
||||
};
|
||||
use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
|
||||
use crate::tenant::storage_layer::LayerName;
|
||||
use crate::tenant::Generation;
|
||||
@@ -36,9 +38,10 @@ use utils::pausable_failpoint;
|
||||
use super::index::{IndexPart, LayerFileMetadata};
|
||||
use super::manifest::TenantManifest;
|
||||
use super::{
|
||||
parse_remote_index_path, remote_index_path, remote_initdb_archive_path,
|
||||
remote_initdb_preserved_archive_path, remote_tenant_manifest_path, remote_tenant_path,
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
|
||||
parse_remote_index_path, parse_remote_tenant_manifest_path, remote_index_path,
|
||||
remote_initdb_archive_path, remote_initdb_preserved_archive_path, remote_tenant_manifest_path,
|
||||
remote_tenant_manifest_prefix, remote_tenant_path, FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
|
||||
};
|
||||
|
||||
///
|
||||
@@ -365,32 +368,34 @@ async fn do_download_remote_path_retry_forever(
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn do_download_tenant_manifest(
|
||||
async fn do_download_tenant_manifest(
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
_timeline_id: Option<&TimelineId>,
|
||||
generation: Generation,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(TenantManifest, Generation), DownloadError> {
|
||||
// TODO: generation support
|
||||
let generation = super::TENANT_MANIFEST_GENERATION;
|
||||
) -> Result<(TenantManifest, Generation, SystemTime), DownloadError> {
|
||||
let remote_path = remote_tenant_manifest_path(tenant_shard_id, generation);
|
||||
|
||||
let (manifest_bytes, _manifest_bytes_mtime) =
|
||||
let (manifest_bytes, manifest_bytes_mtime) =
|
||||
do_download_remote_path_retry_forever(storage, &remote_path, cancel).await?;
|
||||
|
||||
let tenant_manifest = TenantManifest::from_json_bytes(&manifest_bytes)
|
||||
.with_context(|| format!("deserialize tenant manifest file at {remote_path:?}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
Ok((tenant_manifest, generation))
|
||||
Ok((tenant_manifest, generation, manifest_bytes_mtime))
|
||||
}
|
||||
|
||||
async fn do_download_index_part(
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
timeline_id: Option<&TimelineId>,
|
||||
index_generation: Generation,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(IndexPart, Generation, SystemTime), DownloadError> {
|
||||
let timeline_id =
|
||||
timeline_id.expect("A timeline ID is always provided when downloading an index");
|
||||
let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);
|
||||
|
||||
let (index_part_bytes, index_part_mtime) =
|
||||
@@ -403,59 +408,79 @@ async fn do_download_index_part(
|
||||
Ok((index_part, index_generation, index_part_mtime))
|
||||
}
|
||||
|
||||
/// index_part.json objects are suffixed with a generation number, so we cannot
|
||||
/// directly GET the latest index part without doing some probing.
|
||||
/// Metadata objects are "generationed", meaning that they include a generation suffix. This
|
||||
/// function downloads the object with the highest generation <= `my_generation`.
|
||||
///
|
||||
/// In this function we probe for the most recent index in a generation <= our current generation.
|
||||
/// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
|
||||
/// Data objects (layer files) also include a generation in their path, but there is no equivalent
|
||||
/// search process, because their reference from an index includes the generation.
|
||||
///
|
||||
/// An expensive object listing operation is only done if necessary: the typical fast path is to issue two
|
||||
/// GET operations, one to our own generation (stale attachment case), and one to the immediately preceding
|
||||
/// generation (normal case when migrating/restarting). Only if both of these return 404 do we fall back
|
||||
/// to listing objects.
|
||||
///
|
||||
/// * `my_generation`: the value of `[crate::tenant::Tenant::generation]`
|
||||
/// * `what`: for logging, what object are we downloading
|
||||
/// * `prefix`: when listing objects, use this prefix (i.e. the part of the object path before the generation)
|
||||
/// * `do_download`: a GET of the object in a particular generation, which should **retry indefinitely** unless
|
||||
/// `cancel`` has fired. This function does not do its own retries of GET operations, and relies
|
||||
/// on the function passed in to do so.
|
||||
/// * `parse_path`: parse a fully qualified remote storage path to get the generation of the object.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[tracing::instrument(skip_all, fields(generation=?my_generation))]
|
||||
pub(crate) async fn download_index_part(
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
pub(crate) async fn download_generation_object<'a, T, DF, DFF, PF>(
|
||||
storage: &'a GenericRemoteStorage,
|
||||
tenant_shard_id: &'a TenantShardId,
|
||||
timeline_id: Option<&'a TimelineId>,
|
||||
my_generation: Generation,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(IndexPart, Generation, SystemTime), DownloadError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
what: &str,
|
||||
prefix: RemotePath,
|
||||
do_download: DF,
|
||||
parse_path: PF,
|
||||
cancel: &'a CancellationToken,
|
||||
) -> Result<(T, Generation, SystemTime), DownloadError>
|
||||
where
|
||||
DF: Fn(
|
||||
&'a GenericRemoteStorage,
|
||||
&'a TenantShardId,
|
||||
Option<&'a TimelineId>,
|
||||
Generation,
|
||||
&'a CancellationToken,
|
||||
) -> DFF,
|
||||
DFF: Future<Output = Result<(T, Generation, SystemTime), DownloadError>>,
|
||||
PF: Fn(RemotePath) -> Option<Generation>,
|
||||
T: 'static,
|
||||
{
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
|
||||
if my_generation.is_none() {
|
||||
// Operating without generations: just fetch the generation-less path
|
||||
return do_download_index_part(
|
||||
storage,
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
my_generation,
|
||||
cancel,
|
||||
)
|
||||
.await;
|
||||
return do_download(storage, tenant_shard_id, timeline_id, my_generation, cancel).await;
|
||||
}
|
||||
|
||||
// Stale case: If we were intentionally attached in a stale generation, there may already be a remote
|
||||
// index in our generation.
|
||||
// Stale case: If we were intentionally attached in a stale generation, the remote object may already
|
||||
// exist in our generation.
|
||||
//
|
||||
// This is an optimization to avoid doing the listing for the general case below.
|
||||
let res =
|
||||
do_download_index_part(storage, tenant_shard_id, timeline_id, my_generation, cancel).await;
|
||||
let res = do_download(storage, tenant_shard_id, timeline_id, my_generation, cancel).await;
|
||||
match res {
|
||||
Ok(index_part) => {
|
||||
tracing::debug!(
|
||||
"Found index_part from current generation (this is a stale attachment)"
|
||||
);
|
||||
return Ok(index_part);
|
||||
Ok(decoded) => {
|
||||
tracing::debug!("Found {what} from current generation (this is a stale attachment)");
|
||||
return Ok(decoded);
|
||||
}
|
||||
Err(DownloadError::NotFound) => {}
|
||||
Err(e) => return Err(e),
|
||||
};
|
||||
|
||||
// Typical case: the previous generation of this tenant was running healthily, and had uploaded
|
||||
// and index part. We may safely start from this index without doing a listing, because:
|
||||
// Typical case: the previous generation of this tenant was running healthily, and had uploaded the object
|
||||
// we are seeking in that generation. We may safely start from this index without doing a listing, because:
|
||||
// - We checked for current generation case above
|
||||
// - generations > my_generation are to be ignored
|
||||
// - any other indices that exist would have an older generation than `previous_gen`, and
|
||||
// we want to find the most recent index from a previous generation.
|
||||
// - any other objects that exist would have an older generation than `previous_gen`, and
|
||||
// we want to find the most recent object from a previous generation.
|
||||
//
|
||||
// This is an optimization to avoid doing the listing for the general case below.
|
||||
let res = do_download_index_part(
|
||||
let res = do_download(
|
||||
storage,
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
@@ -464,14 +489,12 @@ pub(crate) async fn download_index_part(
|
||||
)
|
||||
.await;
|
||||
match res {
|
||||
Ok(index_part) => {
|
||||
tracing::debug!("Found index_part from previous generation");
|
||||
return Ok(index_part);
|
||||
Ok(decoded) => {
|
||||
tracing::debug!("Found {what} from previous generation");
|
||||
return Ok(decoded);
|
||||
}
|
||||
Err(DownloadError::NotFound) => {
|
||||
tracing::debug!(
|
||||
"No index_part found from previous generation, falling back to listing"
|
||||
);
|
||||
tracing::debug!("No {what} found from previous generation, falling back to listing");
|
||||
}
|
||||
Err(e) => {
|
||||
return Err(e);
|
||||
@@ -481,12 +504,10 @@ pub(crate) async fn download_index_part(
|
||||
// General case/fallback: if there is no index at my_generation or prev_generation, then list all index_part.json
|
||||
// objects, and select the highest one with a generation <= my_generation. Constructing the prefix is equivalent
|
||||
// to constructing a full index path with no generation, because the generation is a suffix.
|
||||
let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
|
||||
|
||||
let indices = download_retry(
|
||||
let paths = download_retry(
|
||||
|| async {
|
||||
storage
|
||||
.list(Some(&index_prefix), ListingMode::NoDelimiter, None, cancel)
|
||||
.list(Some(&prefix), ListingMode::NoDelimiter, None, cancel)
|
||||
.await
|
||||
},
|
||||
"list index_part files",
|
||||
@@ -497,22 +518,22 @@ pub(crate) async fn download_index_part(
|
||||
|
||||
// General case logic for which index to use: the latest index whose generation
|
||||
// is <= our own. See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
|
||||
let max_previous_generation = indices
|
||||
let max_previous_generation = paths
|
||||
.into_iter()
|
||||
.filter_map(|o| parse_remote_index_path(o.key))
|
||||
.filter_map(|o| parse_path(o.key))
|
||||
.filter(|g| g <= &my_generation)
|
||||
.max();
|
||||
|
||||
match max_previous_generation {
|
||||
Some(g) => {
|
||||
tracing::debug!("Found index_part in generation {g:?}");
|
||||
do_download_index_part(storage, tenant_shard_id, timeline_id, g, cancel).await
|
||||
tracing::debug!("Found {what} in generation {g:?}");
|
||||
do_download(storage, tenant_shard_id, timeline_id, g, cancel).await
|
||||
}
|
||||
None => {
|
||||
// Migration from legacy pre-generation state: we have a generation but no prior
|
||||
// attached pageservers did. Try to load from a no-generation path.
|
||||
tracing::debug!("No index_part.json* found");
|
||||
do_download_index_part(
|
||||
tracing::debug!("No {what}* found");
|
||||
do_download(
|
||||
storage,
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
@@ -524,6 +545,57 @@ pub(crate) async fn download_index_part(
|
||||
}
|
||||
}
|
||||
|
||||
/// index_part.json objects are suffixed with a generation number, so we cannot
|
||||
/// directly GET the latest index part without doing some probing.
|
||||
///
|
||||
/// In this function we probe for the most recent index in a generation <= our current generation.
|
||||
/// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
|
||||
pub(crate) async fn download_index_part(
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
my_generation: Generation,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(IndexPart, Generation, SystemTime), DownloadError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
|
||||
download_generation_object(
|
||||
storage,
|
||||
tenant_shard_id,
|
||||
Some(timeline_id),
|
||||
my_generation,
|
||||
"index_part",
|
||||
index_prefix,
|
||||
do_download_index_part,
|
||||
parse_remote_index_path,
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) async fn download_tenant_manifest(
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
my_generation: Generation,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(TenantManifest, Generation, SystemTime), DownloadError> {
|
||||
let manifest_prefix = remote_tenant_manifest_prefix(tenant_shard_id);
|
||||
|
||||
download_generation_object(
|
||||
storage,
|
||||
tenant_shard_id,
|
||||
None,
|
||||
my_generation,
|
||||
"tenant-manifest",
|
||||
manifest_prefix,
|
||||
do_download_tenant_manifest,
|
||||
parse_remote_tenant_manifest_path,
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) async fn download_initdb_tar_zst(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &GenericRemoteStorage,
|
||||
|
||||
@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
|
||||
use utils::{id::TimelineId, lsn::Lsn};
|
||||
|
||||
/// Tenant-shard scoped manifest
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub struct TenantManifest {
|
||||
/// Debugging aid describing the version of this manifest.
|
||||
/// Can also be used for distinguishing breaking changes later on.
|
||||
@@ -23,7 +23,7 @@ pub struct TenantManifest {
|
||||
/// Very similar to [`pageserver_api::models::OffloadedTimelineInfo`],
|
||||
/// but the two datastructures serve different needs, this is for a persistent disk format
|
||||
/// that must be backwards compatible, while the other is only for informative purposes.
|
||||
#[derive(Clone, Serialize, Deserialize, Copy)]
|
||||
#[derive(Clone, Serialize, Deserialize, Copy, PartialEq, Eq)]
|
||||
pub struct OffloadedTimelineManifest {
|
||||
pub timeline_id: TimelineId,
|
||||
/// Whether the timeline has a parent it has been branched off from or not
|
||||
|
||||
@@ -11,11 +11,11 @@ mod layer_name;
|
||||
pub mod merge_iterator;
|
||||
|
||||
use crate::context::{AccessStatsBehavior, RequestContext};
|
||||
use crate::repository::Value;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::value::Value;
|
||||
use std::cmp::{Ordering, Reverse};
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::{BinaryHeap, HashMap};
|
||||
|
||||
@@ -5,7 +5,8 @@ use pageserver_api::key::{Key, KEY_SIZE};
|
||||
use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId};
|
||||
|
||||
use crate::tenant::storage_layer::Layer;
|
||||
use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline};
|
||||
use crate::{config::PageServerConf, context::RequestContext, tenant::Timeline};
|
||||
use pageserver_api::value::Value;
|
||||
|
||||
use super::layer::S3_UPLOAD_LIMIT;
|
||||
use super::{
|
||||
|
||||
@@ -30,7 +30,6 @@
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
|
||||
use crate::page_cache::{self, FileId, PAGE_SZ};
|
||||
use crate::repository::{Key, Value, KEY_SIZE};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{
|
||||
@@ -46,7 +45,7 @@ use crate::tenant::PageReconstructError;
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
|
||||
use crate::virtual_file::IoBufferMut;
|
||||
use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
|
||||
use crate::{walrecord, TEMP_FILE_SUFFIX};
|
||||
use crate::TEMP_FILE_SUFFIX;
|
||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
@@ -54,9 +53,11 @@ use futures::StreamExt;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::config::MaxVectoredReadBytes;
|
||||
use pageserver_api::key::DBDIR_KEY;
|
||||
use pageserver_api::key::{Key, KEY_SIZE};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::ImageCompressionAlgorithm;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_api::value::Value;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::VecDeque;
|
||||
@@ -269,7 +270,7 @@ impl AsLayerDesc for DeltaLayer {
|
||||
}
|
||||
|
||||
impl DeltaLayer {
|
||||
pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
self.desc.dump();
|
||||
|
||||
if !verbose {
|
||||
@@ -1293,7 +1294,7 @@ impl DeltaLayerInner {
|
||||
// is it an image or will_init walrecord?
|
||||
// FIXME: this could be handled by threading the BlobRef to the
|
||||
// VectoredReadBuilder
|
||||
let will_init = crate::repository::ValueBytes::will_init(&data)
|
||||
let will_init = pageserver_api::value::ValueBytes::will_init(&data)
|
||||
.inspect_err(|_e| {
|
||||
#[cfg(feature = "testing")]
|
||||
tracing::error!(data=?utils::Hex(&data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value");
|
||||
@@ -1356,7 +1357,7 @@ impl DeltaLayerInner {
|
||||
format!(" img {} bytes", img.len())
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let wal_desc = walrecord::describe_wal_record(&rec)?;
|
||||
let wal_desc = pageserver_api::record::describe_wal_record(&rec)?;
|
||||
format!(
|
||||
" rec {} bytes will_init: {} {}",
|
||||
buf.len(),
|
||||
@@ -1437,7 +1438,7 @@ impl DeltaLayerInner {
|
||||
offset
|
||||
}
|
||||
|
||||
pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> {
|
||||
pub fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> {
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader =
|
||||
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader);
|
||||
@@ -1610,7 +1611,6 @@ pub(crate) mod test {
|
||||
use rand::RngCore;
|
||||
|
||||
use super::*;
|
||||
use crate::repository::Value;
|
||||
use crate::tenant::harness::TIMELINE_ID;
|
||||
use crate::tenant::storage_layer::{Layer, ResidentLayer};
|
||||
use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
|
||||
@@ -1622,6 +1622,7 @@ pub(crate) mod test {
|
||||
DEFAULT_PG_VERSION,
|
||||
};
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::value::Value;
|
||||
|
||||
/// Construct an index for a fictional delta layer and and then
|
||||
/// traverse in order to plan vectored reads for a query. Finally,
|
||||
@@ -1974,8 +1975,8 @@ pub(crate) mod test {
|
||||
|
||||
#[tokio::test]
|
||||
async fn copy_delta_prefix_smoke() {
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
|
||||
let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke")
|
||||
.await
|
||||
@@ -2198,6 +2199,7 @@ pub(crate) mod test {
|
||||
(k1, l1).cmp(&(k2, l2))
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
pub(crate) fn sort_delta_value(
|
||||
(k1, l1, v1): &(Key, Lsn, Value),
|
||||
(k2, l2, v2): &(Key, Lsn, Value),
|
||||
|
||||
@@ -7,7 +7,7 @@ use pageserver_api::{
|
||||
};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::repository::Value;
|
||||
use pageserver_api::value::Value;
|
||||
|
||||
use super::merge_iterator::MergeIterator;
|
||||
|
||||
@@ -121,8 +121,8 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn filter_keyspace_iterator() {
|
||||
use crate::repository::Value;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::value::Value;
|
||||
|
||||
let harness = TenantHarness::create("filter_iterator_filter_keyspace_iterator")
|
||||
.await
|
||||
|
||||
@@ -28,7 +28,6 @@
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
|
||||
use crate::page_cache::{self, FileId, PAGE_SZ};
|
||||
use crate::repository::{Key, Value, KEY_SIZE};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::{BlockBuf, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{
|
||||
@@ -51,8 +50,10 @@ use hex;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::config::MaxVectoredReadBytes;
|
||||
use pageserver_api::key::DBDIR_KEY;
|
||||
use pageserver_api::key::{Key, KEY_SIZE};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::shard::{ShardIdentity, TenantShardId};
|
||||
use pageserver_api::value::Value;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::VecDeque;
|
||||
@@ -230,7 +231,7 @@ impl AsLayerDesc for ImageLayer {
|
||||
}
|
||||
|
||||
impl ImageLayer {
|
||||
pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
self.desc.dump();
|
||||
|
||||
if !verbose {
|
||||
@@ -1125,6 +1126,7 @@ mod test {
|
||||
use pageserver_api::{
|
||||
key::Key,
|
||||
shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize},
|
||||
value::Value,
|
||||
};
|
||||
use utils::{
|
||||
generation::Generation,
|
||||
@@ -1134,7 +1136,6 @@ mod test {
|
||||
|
||||
use crate::{
|
||||
context::RequestContext,
|
||||
repository::Value,
|
||||
tenant::{
|
||||
config::TenantConf,
|
||||
harness::{TenantHarness, TIMELINE_ID},
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
use crate::assert_u64_eq_usize::{u64_to_usize, U64IsUsize, UsizeIsU64};
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::tenant::ephemeral_file::EphemeralFile;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::PageReconstructError;
|
||||
@@ -16,9 +15,11 @@ use crate::{l0_flush, page_cache};
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::key::CompactKey;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::InMemoryLayerInfo;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_api::value::Value;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::sync::{Arc, OnceLock};
|
||||
use std::time::Instant;
|
||||
|
||||
@@ -760,8 +760,8 @@ async fn evict_and_wait_does_not_wait_for_download() {
|
||||
/// Also checks that the same does not happen on a non-evicted layer (regression test).
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn eviction_cancellation_on_drop() {
|
||||
use crate::repository::Value;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::value::Value;
|
||||
|
||||
// this is the runtime on which Layer spawns the blocking tasks on
|
||||
let handle = tokio::runtime::Handle::current();
|
||||
@@ -782,7 +782,7 @@ async fn eviction_cancellation_on_drop() {
|
||||
let mut writer = timeline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
crate::repository::Key::from_i128(5),
|
||||
pageserver_api::key::Key::from_i128(5),
|
||||
Lsn(0x20),
|
||||
&Value::Image(Bytes::from_static(b"this does not matter either")),
|
||||
&ctx,
|
||||
|
||||
@@ -3,7 +3,7 @@ use pageserver_api::shard::TenantShardId;
|
||||
use std::ops::Range;
|
||||
use utils::{id::TimelineId, lsn::Lsn};
|
||||
|
||||
use crate::repository::Key;
|
||||
use pageserver_api::key::Key;
|
||||
|
||||
use super::{DeltaLayerName, ImageLayerName, LayerName};
|
||||
|
||||
|
||||
@@ -1,14 +1,12 @@
|
||||
//!
|
||||
//! Helper functions for dealing with filenames of the image and delta layer files.
|
||||
//!
|
||||
use crate::repository::Key;
|
||||
use std::borrow::Cow;
|
||||
use pageserver_api::key::Key;
|
||||
use std::cmp::Ordering;
|
||||
use std::fmt;
|
||||
use std::ops::Range;
|
||||
use std::str::FromStr;
|
||||
|
||||
use regex::Regex;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use super::PersistentLayerDesc;
|
||||
@@ -60,32 +58,31 @@ impl Ord for DeltaLayerName {
|
||||
/// Represents the region of the LSN-Key space covered by a DeltaLayer
|
||||
///
|
||||
/// ```text
|
||||
/// <key start>-<key end>__<LSN start>-<LSN end>
|
||||
/// <key start>-<key end>__<LSN start>-<LSN end>-<generation>
|
||||
/// ```
|
||||
impl DeltaLayerName {
|
||||
/// Parse the part of a delta layer's file name that represents the LayerName. Returns None
|
||||
/// if the filename does not match the expected pattern.
|
||||
pub fn parse_str(fname: &str) -> Option<Self> {
|
||||
let mut parts = fname.split("__");
|
||||
let mut key_parts = parts.next()?.split('-');
|
||||
let mut lsn_parts = parts.next()?.split('-');
|
||||
|
||||
let key_start_str = key_parts.next()?;
|
||||
let key_end_str = key_parts.next()?;
|
||||
let lsn_start_str = lsn_parts.next()?;
|
||||
let lsn_end_str = lsn_parts.next()?;
|
||||
|
||||
if parts.next().is_some() || key_parts.next().is_some() || key_parts.next().is_some() {
|
||||
return None;
|
||||
}
|
||||
|
||||
if key_start_str.len() != 36
|
||||
|| key_end_str.len() != 36
|
||||
|| lsn_start_str.len() != 16
|
||||
|| lsn_end_str.len() != 16
|
||||
let (key_parts, lsn_generation_parts) = fname.split_once("__")?;
|
||||
let (key_start_str, key_end_str) = key_parts.split_once('-')?;
|
||||
let (lsn_start_str, lsn_end_generation_parts) = lsn_generation_parts.split_once('-')?;
|
||||
let lsn_end_str = if let Some((lsn_end_str, maybe_generation)) =
|
||||
lsn_end_generation_parts.split_once('-')
|
||||
{
|
||||
return None;
|
||||
}
|
||||
if maybe_generation.starts_with("v") {
|
||||
// vY-XXXXXXXX
|
||||
lsn_end_str
|
||||
} else if maybe_generation.len() == 8 {
|
||||
// XXXXXXXX
|
||||
lsn_end_str
|
||||
} else {
|
||||
// no idea what this is
|
||||
return None;
|
||||
}
|
||||
} else {
|
||||
lsn_end_generation_parts
|
||||
};
|
||||
|
||||
let key_start = Key::from_hex(key_start_str).ok()?;
|
||||
let key_end = Key::from_hex(key_end_str).ok()?;
|
||||
@@ -173,25 +170,29 @@ impl ImageLayerName {
|
||||
/// Represents the part of the Key-LSN space covered by an ImageLayer
|
||||
///
|
||||
/// ```text
|
||||
/// <key start>-<key end>__<LSN>
|
||||
/// <key start>-<key end>__<LSN>-<generation>
|
||||
/// ```
|
||||
impl ImageLayerName {
|
||||
/// Parse a string as then LayerName part of an image layer file name. Returns None if the
|
||||
/// filename does not match the expected pattern.
|
||||
pub fn parse_str(fname: &str) -> Option<Self> {
|
||||
let mut parts = fname.split("__");
|
||||
let mut key_parts = parts.next()?.split('-');
|
||||
|
||||
let key_start_str = key_parts.next()?;
|
||||
let key_end_str = key_parts.next()?;
|
||||
let lsn_str = parts.next()?;
|
||||
if parts.next().is_some() || key_parts.next().is_some() {
|
||||
return None;
|
||||
}
|
||||
|
||||
if key_start_str.len() != 36 || key_end_str.len() != 36 || lsn_str.len() != 16 {
|
||||
return None;
|
||||
}
|
||||
let (key_parts, lsn_generation_parts) = fname.split_once("__")?;
|
||||
let (key_start_str, key_end_str) = key_parts.split_once('-')?;
|
||||
let lsn_str =
|
||||
if let Some((lsn_str, maybe_generation)) = lsn_generation_parts.split_once('-') {
|
||||
if maybe_generation.starts_with("v") {
|
||||
// vY-XXXXXXXX
|
||||
lsn_str
|
||||
} else if maybe_generation.len() == 8 {
|
||||
// XXXXXXXX
|
||||
lsn_str
|
||||
} else {
|
||||
// likely a delta layer
|
||||
return None;
|
||||
}
|
||||
} else {
|
||||
lsn_generation_parts
|
||||
};
|
||||
|
||||
let key_start = Key::from_hex(key_start_str).ok()?;
|
||||
let key_end = Key::from_hex(key_end_str).ok()?;
|
||||
@@ -258,6 +259,14 @@ impl LayerName {
|
||||
}
|
||||
}
|
||||
|
||||
/// Gets the LSN range encoded in the layer name.
|
||||
pub fn lsn_as_range(&self) -> Range<Lsn> {
|
||||
match &self {
|
||||
LayerName::Image(layer) => layer.lsn_as_range(),
|
||||
LayerName::Delta(layer) => layer.lsn_range.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_delta(&self) -> bool {
|
||||
matches!(self, LayerName::Delta(_))
|
||||
}
|
||||
@@ -290,18 +299,8 @@ impl FromStr for LayerName {
|
||||
/// Self. When loading a physical layer filename, we drop any extra information
|
||||
/// not needed to build Self.
|
||||
fn from_str(value: &str) -> Result<Self, Self::Err> {
|
||||
let gen_suffix_regex = Regex::new("^(?<base>.+)(?<gen>-v1-[0-9a-f]{8})$").unwrap();
|
||||
let file_name: Cow<str> = match gen_suffix_regex.captures(value) {
|
||||
Some(captures) => captures
|
||||
.name("base")
|
||||
.expect("Non-optional group")
|
||||
.as_str()
|
||||
.into(),
|
||||
None => value.into(),
|
||||
};
|
||||
|
||||
let delta = DeltaLayerName::parse_str(&file_name);
|
||||
let image = ImageLayerName::parse_str(&file_name);
|
||||
let delta = DeltaLayerName::parse_str(value);
|
||||
let image = ImageLayerName::parse_str(value);
|
||||
let ok = match (delta, image) {
|
||||
(None, None) => {
|
||||
return Err(format!(
|
||||
@@ -367,11 +366,14 @@ mod test {
|
||||
lsn: Lsn::from_hex("00000000014FED58").unwrap(),
|
||||
});
|
||||
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").unwrap();
|
||||
assert_eq!(parsed, expected,);
|
||||
assert_eq!(parsed, expected);
|
||||
|
||||
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-00000001").unwrap();
|
||||
assert_eq!(parsed, expected);
|
||||
|
||||
// Omitting generation suffix is valid
|
||||
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").unwrap();
|
||||
assert_eq!(parsed, expected,);
|
||||
assert_eq!(parsed, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -385,6 +387,9 @@ mod test {
|
||||
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").unwrap();
|
||||
assert_eq!(parsed, expected);
|
||||
|
||||
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-00000001").unwrap();
|
||||
assert_eq!(parsed, expected);
|
||||
|
||||
// Omitting generation suffix is valid
|
||||
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").unwrap();
|
||||
assert_eq!(parsed, expected);
|
||||
|
||||
@@ -7,7 +7,8 @@ use anyhow::bail;
|
||||
use pageserver_api::key::Key;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{context::RequestContext, repository::Value};
|
||||
use crate::context::RequestContext;
|
||||
use pageserver_api::value::Value;
|
||||
|
||||
use super::{
|
||||
delta_layer::{DeltaLayerInner, DeltaLayerIterator},
|
||||
@@ -291,12 +292,16 @@ mod tests {
|
||||
use crate::{
|
||||
tenant::{
|
||||
harness::{TenantHarness, TIMELINE_ID},
|
||||
storage_layer::delta_layer::test::{produce_delta_layer, sort_delta, sort_delta_value},
|
||||
storage_layer::delta_layer::test::{produce_delta_layer, sort_delta},
|
||||
},
|
||||
walrecord::NeonWalRecord,
|
||||
DEFAULT_PG_VERSION,
|
||||
};
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
use crate::tenant::storage_layer::delta_layer::test::sort_delta_value;
|
||||
#[cfg(feature = "testing")]
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
|
||||
async fn assert_merge_iter_equal(
|
||||
merge_iter: &mut MergeIterator<'_>,
|
||||
expect: &[(Key, Lsn, Value)],
|
||||
@@ -319,8 +324,8 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn merge_in_between() {
|
||||
use crate::repository::Value;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::value::Value;
|
||||
|
||||
let harness = TenantHarness::create("merge_iterator_merge_in_between")
|
||||
.await
|
||||
@@ -384,8 +389,8 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn delta_merge() {
|
||||
use crate::repository::Value;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::value::Value;
|
||||
|
||||
let harness = TenantHarness::create("merge_iterator_delta_merge")
|
||||
.await
|
||||
@@ -458,10 +463,11 @@ mod tests {
|
||||
// TODO: test layers are loaded only when needed, reducing num of active iterators in k-merge
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn delta_image_mixed_merge() {
|
||||
use crate::repository::Value;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::value::Value;
|
||||
|
||||
let harness = TenantHarness::create("merge_iterator_delta_image_mixed_merge")
|
||||
.await
|
||||
@@ -586,5 +592,6 @@ mod tests {
|
||||
is_send(merge_iter);
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
fn is_send(_: impl Send) {}
|
||||
}
|
||||
|
||||
@@ -125,11 +125,12 @@ use utils::{
|
||||
simple_rcu::{Rcu, RcuReadGuard},
|
||||
};
|
||||
|
||||
use crate::repository::GcResult;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::gc_result::GcResult;
|
||||
use crate::ZERO_PAGE;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::value::Value;
|
||||
|
||||
use self::delete::DeleteTimelineFlow;
|
||||
pub(super) use self::eviction_task::EvictionTaskTenantState;
|
||||
@@ -5822,17 +5823,15 @@ fn is_send() {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::value::Value;
|
||||
use utils::{id::TimelineId, lsn::Lsn};
|
||||
|
||||
use crate::{
|
||||
repository::Value,
|
||||
tenant::{
|
||||
harness::{test_img, TenantHarness},
|
||||
layer_map::LayerMap,
|
||||
storage_layer::{Layer, LayerName},
|
||||
timeline::{DeltaLayerTestDesc, EvictionError},
|
||||
Timeline,
|
||||
},
|
||||
use crate::tenant::{
|
||||
harness::{test_img, TenantHarness},
|
||||
layer_map::LayerMap,
|
||||
storage_layer::{Layer, LayerName},
|
||||
timeline::{DeltaLayerTestDesc, EvictionError},
|
||||
Timeline,
|
||||
};
|
||||
|
||||
#[tokio::test]
|
||||
|
||||
@@ -49,9 +49,10 @@ use pageserver_api::config::tenant_conf_defaults::{
|
||||
DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD,
|
||||
};
|
||||
|
||||
use crate::keyspace::KeySpace;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::value::Value;
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
@@ -1715,20 +1716,32 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// An experimental compaction building block that combines compaction with garbage collection.
|
||||
///
|
||||
/// The current implementation picks all delta + image layers that are below or intersecting with
|
||||
/// the GC horizon without considering retain_lsns. Then, it does a full compaction over all these delta
|
||||
/// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon,
|
||||
/// and create delta layers with all deltas >= gc horizon.
|
||||
pub(crate) async fn compact_with_gc(
|
||||
self: &Arc<Self>,
|
||||
cancel: &CancellationToken,
|
||||
flags: EnumSet<CompactFlags>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
use std::collections::BTreeSet;
|
||||
self.partial_compact_with_gc(None, cancel, flags, ctx).await
|
||||
}
|
||||
|
||||
/// An experimental compaction building block that combines compaction with garbage collection.
|
||||
///
|
||||
/// The current implementation picks all delta + image layers that are below or intersecting with
|
||||
/// the GC horizon without considering retain_lsns. Then, it does a full compaction over all these delta
|
||||
/// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon,
|
||||
/// and create delta layers with all deltas >= gc horizon.
|
||||
///
|
||||
/// If `key_range`, it will only compact the keys within the range, aka partial compaction. This functionality
|
||||
/// is not complete yet, and if it is set, only image layers will be generated.
|
||||
///
|
||||
pub(crate) async fn partial_compact_with_gc(
|
||||
self: &Arc<Self>,
|
||||
compaction_key_range: Option<Range<Key>>,
|
||||
cancel: &CancellationToken,
|
||||
flags: EnumSet<CompactFlags>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
// Block other compaction/GC tasks from running for now. GC-compaction could run along
|
||||
// with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc.
|
||||
// Note that we already acquired the compaction lock when the outer `compact` function gets called.
|
||||
@@ -1749,8 +1762,13 @@ impl Timeline {
|
||||
.await?;
|
||||
|
||||
let dry_run = flags.contains(CompactFlags::DryRun);
|
||||
let partial_compaction = compaction_key_range.is_some();
|
||||
|
||||
info!("running enhanced gc bottom-most compaction, dry_run={dry_run}");
|
||||
if let Some(ref compaction_key_range) = compaction_key_range {
|
||||
info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compaction_key_range={}..{}", compaction_key_range.start, compaction_key_range.end);
|
||||
} else {
|
||||
info!("running enhanced gc bottom-most compaction, dry_run={dry_run}");
|
||||
}
|
||||
|
||||
scopeguard::defer! {
|
||||
info!("done enhanced gc bottom-most compaction");
|
||||
@@ -1762,7 +1780,7 @@ impl Timeline {
|
||||
// The layer selection has the following properties:
|
||||
// 1. If a layer is in the selection, all layers below it are in the selection.
|
||||
// 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection.
|
||||
let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = {
|
||||
let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = if !partial_compaction {
|
||||
let guard = self.layers.read().await;
|
||||
let layers = guard.layer_map()?;
|
||||
let gc_info = self.gc_info.read().unwrap();
|
||||
@@ -1778,7 +1796,7 @@ impl Timeline {
|
||||
retain_lsns_below_horizon.push(*lsn);
|
||||
}
|
||||
}
|
||||
let mut selected_layers = Vec::new();
|
||||
let mut selected_layers: Vec<Layer> = Vec::new();
|
||||
drop(gc_info);
|
||||
// Pick all the layers intersect or below the gc_cutoff, get the largest LSN in the selected layers.
|
||||
let Some(max_layer_lsn) = layers
|
||||
@@ -1803,8 +1821,52 @@ impl Timeline {
|
||||
}
|
||||
retain_lsns_below_horizon.sort();
|
||||
(selected_layers, gc_cutoff, retain_lsns_below_horizon)
|
||||
} else {
|
||||
// In case of partial compaction, we currently only support generating image layers, and therefore,
|
||||
// we pick all layers that are below the lowest retain_lsn and does not intersect with any of the layers.
|
||||
let guard = self.layers.read().await;
|
||||
let layers = guard.layer_map()?;
|
||||
let gc_info = self.gc_info.read().unwrap();
|
||||
let mut min_lsn = gc_info.cutoffs.select_min();
|
||||
for (lsn, _, _) in &gc_info.retain_lsns {
|
||||
if lsn < &min_lsn {
|
||||
min_lsn = *lsn;
|
||||
}
|
||||
}
|
||||
for lsn in gc_info.leases.keys() {
|
||||
if lsn < &min_lsn {
|
||||
min_lsn = *lsn;
|
||||
}
|
||||
}
|
||||
let mut selected_layers = Vec::new();
|
||||
drop(gc_info);
|
||||
// |-------| |-------| |-------|
|
||||
// | Delta | | Delta | | Delta | -- min_lsn could be intersecting with the layers
|
||||
// |-------| |-------| |-------| <- we want to pick all the layers below min_lsn, so that
|
||||
// | Delta | | Delta | | Delta | ...we can remove them after compaction
|
||||
// |-------| |-------| |-------|
|
||||
// Pick all the layers intersect or below the min_lsn, get the largest LSN in the selected layers.
|
||||
let Some(compaction_key_range) = compaction_key_range.as_ref() else {
|
||||
unreachable!()
|
||||
};
|
||||
for desc in layers.iter_historic_layers() {
|
||||
if desc.get_lsn_range().end <= min_lsn
|
||||
&& overlaps_with(&desc.key_range, compaction_key_range)
|
||||
{
|
||||
selected_layers.push(guard.get_from_desc(&desc));
|
||||
}
|
||||
}
|
||||
if selected_layers.is_empty() {
|
||||
info!("no layers to compact with gc");
|
||||
return Ok(());
|
||||
}
|
||||
(selected_layers, min_lsn, Vec::new())
|
||||
};
|
||||
let lowest_retain_lsn = if self.ancestor_timeline.is_some() {
|
||||
if partial_compaction {
|
||||
warn!("partial compaction cannot run on child branches (for now)");
|
||||
return Ok(());
|
||||
}
|
||||
Lsn(self.ancestor_lsn.0 + 1)
|
||||
} else {
|
||||
let res = retain_lsns_below_horizon
|
||||
@@ -1832,23 +1894,18 @@ impl Timeline {
|
||||
|
||||
self.check_compaction_space(&layer_selection).await?;
|
||||
|
||||
// Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
|
||||
// Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
|
||||
let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)
|
||||
// Generate statistics for the compaction
|
||||
for layer in &layer_selection {
|
||||
let desc = layer.layer_desc();
|
||||
if desc.is_delta() {
|
||||
// ignore single-key layer files
|
||||
if desc.key_range.start.next() != desc.key_range.end {
|
||||
let lsn_range = &desc.lsn_range;
|
||||
lsn_split_point.insert(lsn_range.start);
|
||||
lsn_split_point.insert(lsn_range.end);
|
||||
}
|
||||
stat.visit_delta_layer(desc.file_size());
|
||||
} else {
|
||||
stat.visit_image_layer(desc.file_size());
|
||||
}
|
||||
}
|
||||
|
||||
// Step 1: construct a k-merge iterator over all layers.
|
||||
// Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
|
||||
let layer_names: Vec<crate::tenant::storage_layer::LayerName> = layer_selection
|
||||
.iter()
|
||||
.map(|layer| layer.layer_desc().layer_name())
|
||||
@@ -1899,7 +1956,10 @@ impl Timeline {
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
Key::MIN,
|
||||
compaction_key_range
|
||||
.as_ref()
|
||||
.map(|x| x.start)
|
||||
.unwrap_or(Key::MIN),
|
||||
lowest_retain_lsn,
|
||||
self.get_compaction_target_size(),
|
||||
ctx,
|
||||
@@ -1960,55 +2020,71 @@ impl Timeline {
|
||||
} else {
|
||||
let last_key = last_key.as_mut().unwrap();
|
||||
stat.on_unique_key_visited();
|
||||
let retention = self
|
||||
.generate_key_retention(
|
||||
*last_key,
|
||||
&accumulated_values,
|
||||
gc_cutoff,
|
||||
&retain_lsns_below_horizon,
|
||||
COMPACTION_DELTA_THRESHOLD,
|
||||
get_ancestor_image(self, *last_key, ctx).await?,
|
||||
)
|
||||
.await?;
|
||||
// Put the image into the image layer. Currently we have a single big layer for the compaction.
|
||||
retention
|
||||
.pipe_to(
|
||||
*last_key,
|
||||
&mut delta_layer_writer,
|
||||
image_layer_writer.as_mut(),
|
||||
&mut stat,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
let skip_adding_key = if let Some(ref compaction_key_range) = compaction_key_range {
|
||||
!compaction_key_range.contains(last_key)
|
||||
} else {
|
||||
false
|
||||
};
|
||||
if !skip_adding_key {
|
||||
let retention = self
|
||||
.generate_key_retention(
|
||||
*last_key,
|
||||
&accumulated_values,
|
||||
gc_cutoff,
|
||||
&retain_lsns_below_horizon,
|
||||
COMPACTION_DELTA_THRESHOLD,
|
||||
get_ancestor_image(self, *last_key, ctx).await?,
|
||||
)
|
||||
.await?;
|
||||
// Put the image into the image layer. Currently we have a single big layer for the compaction.
|
||||
retention
|
||||
.pipe_to(
|
||||
*last_key,
|
||||
&mut delta_layer_writer,
|
||||
image_layer_writer.as_mut(),
|
||||
&mut stat,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
accumulated_values.clear();
|
||||
*last_key = key;
|
||||
accumulated_values.push((key, lsn, val));
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: move the below part to the loop body
|
||||
let last_key = last_key.expect("no keys produced during compaction");
|
||||
// TODO: move this part to the loop body
|
||||
stat.on_unique_key_visited();
|
||||
let retention = self
|
||||
.generate_key_retention(
|
||||
last_key,
|
||||
&accumulated_values,
|
||||
gc_cutoff,
|
||||
&retain_lsns_below_horizon,
|
||||
COMPACTION_DELTA_THRESHOLD,
|
||||
get_ancestor_image(self, last_key, ctx).await?,
|
||||
)
|
||||
.await?;
|
||||
// Put the image into the image layer. Currently we have a single big layer for the compaction.
|
||||
retention
|
||||
.pipe_to(
|
||||
last_key,
|
||||
&mut delta_layer_writer,
|
||||
image_layer_writer.as_mut(),
|
||||
&mut stat,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let skip_adding_key = if let Some(ref compaction_key_range) = compaction_key_range {
|
||||
!compaction_key_range.contains(&last_key)
|
||||
} else {
|
||||
false
|
||||
};
|
||||
if !skip_adding_key {
|
||||
let retention = self
|
||||
.generate_key_retention(
|
||||
last_key,
|
||||
&accumulated_values,
|
||||
gc_cutoff,
|
||||
&retain_lsns_below_horizon,
|
||||
COMPACTION_DELTA_THRESHOLD,
|
||||
get_ancestor_image(self, last_key, ctx).await?,
|
||||
)
|
||||
.await?;
|
||||
// Put the image into the image layer. Currently we have a single big layer for the compaction.
|
||||
retention
|
||||
.pipe_to(
|
||||
last_key,
|
||||
&mut delta_layer_writer,
|
||||
image_layer_writer.as_mut(),
|
||||
&mut stat,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
// end: move the above part to the loop body
|
||||
|
||||
let discard = |key: &PersistentLayerKey| {
|
||||
let key = key.clone();
|
||||
@@ -2017,8 +2093,12 @@ impl Timeline {
|
||||
|
||||
let produced_image_layers = if let Some(writer) = image_layer_writer {
|
||||
if !dry_run {
|
||||
let end_key = compaction_key_range
|
||||
.as_ref()
|
||||
.map(|x| x.end)
|
||||
.unwrap_or(Key::MAX);
|
||||
writer
|
||||
.finish_with_discard_fn(self, ctx, Key::MAX, discard)
|
||||
.finish_with_discard_fn(self, ctx, end_key, discard)
|
||||
.await?
|
||||
} else {
|
||||
drop(writer);
|
||||
@@ -2037,6 +2117,10 @@ impl Timeline {
|
||||
Vec::new()
|
||||
};
|
||||
|
||||
if partial_compaction && !produced_delta_layers.is_empty() {
|
||||
bail!("implementation error: partial compaction should not be producing delta layers (for now)");
|
||||
}
|
||||
|
||||
let mut compact_to = Vec::new();
|
||||
let mut keep_layers = HashSet::new();
|
||||
let produced_delta_layers_len = produced_delta_layers.len();
|
||||
@@ -2067,6 +2151,28 @@ impl Timeline {
|
||||
}
|
||||
let mut layer_selection = layer_selection;
|
||||
layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
|
||||
if let Some(ref compaction_key_range) = compaction_key_range {
|
||||
// Partial compaction might select more data than it processes, e.g., if
|
||||
// the compaction_key_range only partially overlaps:
|
||||
//
|
||||
// [---compaction_key_range---]
|
||||
// [---A----][----B----][----C----][----D----]
|
||||
//
|
||||
// A,B,C,D are all in the `layer_selection`. The created image layers contain
|
||||
// whatever is needed from B, C, and from `----]` of A, and from `[--` of D.
|
||||
//
|
||||
// In contrast, `[--A-` and `--D----]` have not been processed, so, we must
|
||||
// keep that data.
|
||||
//
|
||||
// The solution for now is to keep A and D completely.
|
||||
// (layer_selection is what we'll remove from the layer map, so,
|
||||
// retain what is _not_ fully covered by compaction_key_range).
|
||||
layer_selection.retain(|x| {
|
||||
let key_range = &x.layer_desc().key_range;
|
||||
key_range.start >= compaction_key_range.start
|
||||
&& key_range.end <= compaction_key_range.end
|
||||
});
|
||||
}
|
||||
|
||||
info!(
|
||||
"gc-compaction statistics: {}",
|
||||
@@ -2148,7 +2254,7 @@ struct ResidentDeltaLayer(ResidentLayer);
|
||||
struct ResidentImageLayer(ResidentLayer);
|
||||
|
||||
impl CompactionJobExecutor for TimelineAdaptor {
|
||||
type Key = crate::repository::Key;
|
||||
type Key = pageserver_api::key::Key;
|
||||
|
||||
type Layer = OwnArc<PersistentLayerDesc>;
|
||||
type DeltaLayer = ResidentDeltaLayer;
|
||||
|
||||
@@ -6,7 +6,7 @@ use std::{
|
||||
use anyhow::Context;
|
||||
use pageserver_api::{models::TimelineState, shard::TenantShardId};
|
||||
use tokio::sync::OwnedMutexGuard;
|
||||
use tracing::{error, info, instrument, Instrument};
|
||||
use tracing::{error, info, info_span, instrument, Instrument};
|
||||
use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint};
|
||||
|
||||
use crate::{
|
||||
@@ -14,10 +14,9 @@ use crate::{
|
||||
task_mgr::{self, TaskKind},
|
||||
tenant::{
|
||||
metadata::TimelineMetadata,
|
||||
remote_timeline_client::{
|
||||
self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
|
||||
},
|
||||
CreateTimelineCause, DeleteTimelineError, Tenant, TimelineOrOffloaded,
|
||||
remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient},
|
||||
CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant,
|
||||
TimelineOrOffloaded,
|
||||
},
|
||||
};
|
||||
|
||||
@@ -176,32 +175,6 @@ async fn remove_maybe_offloaded_timeline_from_tenant(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// It is important that this gets called when DeletionGuard is being held.
|
||||
/// For more context see comments in [`DeleteTimelineFlow::prepare`]
|
||||
async fn upload_new_tenant_manifest(
|
||||
tenant: &Tenant,
|
||||
_: &DeletionGuard, // using it as a witness
|
||||
) -> anyhow::Result<()> {
|
||||
// This is susceptible to race conditions, i.e. we won't continue deletions if there is a crash
|
||||
// between the deletion of the index-part.json and reaching of this code.
|
||||
// So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted.
|
||||
// However, we handle this case in tenant loading code so the next time we attach, the issue is
|
||||
// resolved.
|
||||
let manifest = tenant.tenant_manifest();
|
||||
// TODO: generation support
|
||||
let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
|
||||
remote_timeline_client::upload_tenant_manifest(
|
||||
&tenant.remote_storage,
|
||||
&tenant.tenant_shard_id,
|
||||
generation,
|
||||
&manifest,
|
||||
&tenant.cancel,
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Orchestrates timeline shut down of all timeline tasks, removes its in-memory structures,
|
||||
/// and deletes its data from both disk and s3.
|
||||
/// The sequence of steps:
|
||||
@@ -241,7 +214,8 @@ impl DeleteTimelineFlow {
|
||||
) -> Result<(), DeleteTimelineError> {
|
||||
super::debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?;
|
||||
let allow_offloaded_children = false;
|
||||
let (timeline, mut guard) = Self::prepare(tenant, timeline_id, allow_offloaded_children)?;
|
||||
|
||||
guard.mark_in_progress()?;
|
||||
|
||||
@@ -258,7 +232,32 @@ impl DeleteTimelineFlow {
|
||||
))?
|
||||
});
|
||||
|
||||
let remote_client = timeline.remote_client_maybe_construct(tenant);
|
||||
let remote_client = match timeline.maybe_remote_client() {
|
||||
Some(remote_client) => remote_client,
|
||||
None => {
|
||||
let remote_client = tenant
|
||||
.build_timeline_client(timeline.timeline_id(), tenant.remote_storage.clone());
|
||||
let result = remote_client
|
||||
.download_index_file(&tenant.cancel)
|
||||
.instrument(info_span!("download_index_file"))
|
||||
.await
|
||||
.map_err(|e| DeleteTimelineError::Other(anyhow::anyhow!("error: {:?}", e)))?;
|
||||
let index_part = match result {
|
||||
MaybeDeletedIndexPart::Deleted(p) => {
|
||||
tracing::info!("Timeline already set as deleted in remote index");
|
||||
p
|
||||
}
|
||||
MaybeDeletedIndexPart::IndexPart(p) => p,
|
||||
};
|
||||
let remote_client = Arc::new(remote_client);
|
||||
|
||||
remote_client
|
||||
.init_upload_queue(&index_part)
|
||||
.map_err(DeleteTimelineError::Other)?;
|
||||
remote_client.shutdown().await;
|
||||
remote_client
|
||||
}
|
||||
};
|
||||
set_deleted_in_remote_index(&remote_client).await?;
|
||||
|
||||
fail::fail_point!("timeline-delete-before-schedule", |_| {
|
||||
@@ -342,6 +341,7 @@ impl DeleteTimelineFlow {
|
||||
pub(super) fn prepare(
|
||||
tenant: &Tenant,
|
||||
timeline_id: TimelineId,
|
||||
allow_offloaded_children: bool,
|
||||
) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> {
|
||||
// Note the interaction between this guard and deletion guard.
|
||||
// Here we attempt to lock deletion guard when we're holding a lock on timelines.
|
||||
@@ -354,30 +354,27 @@ impl DeleteTimelineFlow {
|
||||
// T1: acquire deletion lock, do another `DeleteTimelineFlow::run`
|
||||
// For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346`
|
||||
let timelines = tenant.timelines.lock().unwrap();
|
||||
let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap();
|
||||
|
||||
let timeline = match timelines.get(&timeline_id) {
|
||||
Some(t) => TimelineOrOffloaded::Timeline(Arc::clone(t)),
|
||||
None => {
|
||||
let offloaded_timelines = tenant.timelines_offloaded.lock().unwrap();
|
||||
match offloaded_timelines.get(&timeline_id) {
|
||||
Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)),
|
||||
None => return Err(DeleteTimelineError::NotFound),
|
||||
}
|
||||
}
|
||||
None => match timelines_offloaded.get(&timeline_id) {
|
||||
Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)),
|
||||
None => return Err(DeleteTimelineError::NotFound),
|
||||
},
|
||||
};
|
||||
|
||||
// Ensure that there are no child timelines **attached to that pageserver**,
|
||||
// because detach removes files, which will break child branches
|
||||
let children: Vec<TimelineId> = timelines
|
||||
.iter()
|
||||
.filter_map(|(id, entry)| {
|
||||
if entry.get_ancestor_timeline_id() == Some(timeline_id) {
|
||||
Some(*id)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
// Ensure that there are no child timelines, because we are about to remove files,
|
||||
// which will break child branches
|
||||
let mut children = Vec::new();
|
||||
if !allow_offloaded_children {
|
||||
children.extend(timelines_offloaded.iter().filter_map(|(id, entry)| {
|
||||
(entry.ancestor_timeline_id == Some(timeline_id)).then_some(*id)
|
||||
}));
|
||||
}
|
||||
children.extend(timelines.iter().filter_map(|(id, entry)| {
|
||||
(entry.get_ancestor_timeline_id() == Some(timeline_id)).then_some(*id)
|
||||
}));
|
||||
|
||||
if !children.is_empty() {
|
||||
return Err(DeleteTimelineError::HasChildren(children));
|
||||
@@ -455,7 +452,15 @@ impl DeleteTimelineFlow {
|
||||
|
||||
remove_maybe_offloaded_timeline_from_tenant(tenant, timeline, &guard).await?;
|
||||
|
||||
upload_new_tenant_manifest(tenant, &guard).await?;
|
||||
// This is susceptible to race conditions, i.e. we won't continue deletions if there is a crash
|
||||
// between the deletion of the index-part.json and reaching of this code.
|
||||
// So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted.
|
||||
// However, we handle this case in tenant loading code so the next time we attach, the issue is
|
||||
// resolved.
|
||||
tenant
|
||||
.store_tenant_manifest()
|
||||
.await
|
||||
.map_err(|e| DeleteTimelineError::Other(anyhow::anyhow!(e)))?;
|
||||
|
||||
*guard = Self::Finished;
|
||||
|
||||
|
||||
@@ -29,6 +29,9 @@ pub(crate) enum Error {
|
||||
#[error("shutting down, please retry later")]
|
||||
ShuttingDown,
|
||||
|
||||
#[error("archived: {}", .0)]
|
||||
Archived(TimelineId),
|
||||
|
||||
#[error(transparent)]
|
||||
NotFound(crate::tenant::GetTimelineError),
|
||||
|
||||
@@ -79,8 +82,9 @@ impl From<Error> for ApiError {
|
||||
fn from(value: Error) -> Self {
|
||||
match value {
|
||||
Error::NoAncestor => ApiError::Conflict(value.to_string()),
|
||||
Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{}", value)),
|
||||
Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{value}")),
|
||||
Error::ShuttingDown => ApiError::ShuttingDown,
|
||||
Error::Archived(_) => ApiError::BadRequest(anyhow::anyhow!("{value}")),
|
||||
Error::OtherTimelineDetachOngoing(_) | Error::FailedToReparentAll => {
|
||||
ApiError::ResourceUnavailable(value.to_string().into())
|
||||
}
|
||||
@@ -201,12 +205,18 @@ pub(super) async fn prepare(
|
||||
}));
|
||||
};
|
||||
|
||||
if detached.is_archived() != Some(false) {
|
||||
return Err(Archived(detached.timeline_id));
|
||||
}
|
||||
|
||||
if !ancestor_lsn.is_valid() {
|
||||
// rare case, probably wouldn't even load
|
||||
tracing::error!("ancestor is set, but ancestor_lsn is invalid, this timeline needs fixing");
|
||||
return Err(NoAncestor);
|
||||
}
|
||||
|
||||
check_no_archived_children_of_ancestor(tenant, detached, &ancestor, ancestor_lsn)?;
|
||||
|
||||
if ancestor.ancestor_timeline.is_some() {
|
||||
// non-technical requirement; we could flatten N ancestors just as easily but we chose
|
||||
// not to, at least initially
|
||||
@@ -950,3 +960,36 @@ where
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn check_no_archived_children_of_ancestor(
|
||||
tenant: &Tenant,
|
||||
detached: &Arc<Timeline>,
|
||||
ancestor: &Arc<Timeline>,
|
||||
ancestor_lsn: Lsn,
|
||||
) -> Result<(), Error> {
|
||||
let timelines = tenant.timelines.lock().unwrap();
|
||||
let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap();
|
||||
for timeline in reparentable_timelines(timelines.values(), detached, ancestor, ancestor_lsn) {
|
||||
if timeline.is_archived() == Some(true) {
|
||||
return Err(Error::Archived(timeline.timeline_id));
|
||||
}
|
||||
}
|
||||
for timeline_offloaded in timelines_offloaded.values() {
|
||||
if timeline_offloaded.ancestor_timeline_id != Some(ancestor.timeline_id) {
|
||||
continue;
|
||||
}
|
||||
// This forbids the detach ancestor feature if flattened timelines are present,
|
||||
// even if the ancestor_lsn is from after the branchpoint of the detached timeline.
|
||||
// But as per current design, we don't record the ancestor_lsn of flattened timelines.
|
||||
// This is a bit unfortunate, but as of writing this we don't support flattening
|
||||
// anyway. Maybe we can evolve the data model in the future.
|
||||
if let Some(retain_lsn) = timeline_offloaded.ancestor_retain_lsn {
|
||||
let is_earlier = retain_lsn <= ancestor_lsn;
|
||||
if !is_earlier {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
return Err(Error::Archived(timeline_offloaded.timeline_id));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::sync::Arc;
|
||||
use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard};
|
||||
use super::Timeline;
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::{remote_timeline_client, OffloadedTimeline, Tenant, TimelineOrOffloaded};
|
||||
use crate::tenant::{OffloadedTimeline, Tenant, TimelineOrOffloaded};
|
||||
|
||||
pub(crate) async fn offload_timeline(
|
||||
tenant: &Tenant,
|
||||
@@ -12,7 +12,9 @@ pub(crate) async fn offload_timeline(
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
tracing::info!("offloading archived timeline");
|
||||
|
||||
let (timeline, guard) = DeleteTimelineFlow::prepare(tenant, timeline.timeline_id)?;
|
||||
let allow_offloaded_children = true;
|
||||
let (timeline, guard) =
|
||||
DeleteTimelineFlow::prepare(tenant, timeline.timeline_id, allow_offloaded_children)?;
|
||||
|
||||
let TimelineOrOffloaded::Timeline(timeline) = timeline else {
|
||||
tracing::error!("timeline already offloaded, but given timeline object");
|
||||
@@ -63,17 +65,10 @@ pub(crate) async fn offload_timeline(
|
||||
// at the next restart attach it again.
|
||||
// For that to happen, we'd need to make the manifest reflect our *intended* state,
|
||||
// not our actual state of offloaded timelines.
|
||||
let manifest = tenant.tenant_manifest();
|
||||
// TODO: generation support
|
||||
let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
|
||||
remote_timeline_client::upload_tenant_manifest(
|
||||
&tenant.remote_storage,
|
||||
&tenant.tenant_shard_id,
|
||||
generation,
|
||||
&manifest,
|
||||
&tenant.cancel,
|
||||
)
|
||||
.await?;
|
||||
tenant
|
||||
.store_tenant_manifest()
|
||||
.await
|
||||
.map_err(|e| anyhow::anyhow!(e))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -22,6 +22,7 @@ use tokio::{select, sync::watch, time};
|
||||
use tokio_postgres::{replication::ReplicationStream, Client};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, error, info, trace, warn, Instrument};
|
||||
use wal_decoder::models::{FlushUncommittedRecords, InterpretedWalRecord};
|
||||
|
||||
use super::TaskStateUpdate;
|
||||
use crate::{
|
||||
@@ -31,7 +32,6 @@ use crate::{
|
||||
task_mgr::{TaskKind, WALRECEIVER_RUNTIME},
|
||||
tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
|
||||
walingest::WalIngest,
|
||||
walrecord::{decode_wal_record, DecodedWALRecord},
|
||||
};
|
||||
use postgres_backend::is_expected_io_error;
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
@@ -339,11 +339,15 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
return Err(WalReceiverError::Other(anyhow!("LSN not aligned")));
|
||||
}
|
||||
|
||||
// Deserialize WAL record
|
||||
let mut decoded = DecodedWALRecord::default();
|
||||
decode_wal_record(recdata, &mut decoded, modification.tline.pg_version)?;
|
||||
// Deserialize and interpret WAL record
|
||||
let interpreted = InterpretedWalRecord::from_bytes_filtered(
|
||||
recdata,
|
||||
modification.tline.get_shard_identity(),
|
||||
lsn,
|
||||
modification.tline.pg_version,
|
||||
)?;
|
||||
|
||||
if decoded.is_dbase_create_copy(timeline.pg_version)
|
||||
if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
|
||||
&& uncommitted_records > 0
|
||||
{
|
||||
// Special case: legacy PG database creations operate by reading pages from a 'template' database:
|
||||
@@ -360,7 +364,7 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
|
||||
// Ingest the records without immediately committing them.
|
||||
let ingested = walingest
|
||||
.ingest_record(decoded, lsn, &mut modification, &ctx)
|
||||
.ingest_record(interpreted, &mut modification, &ctx)
|
||||
.await
|
||||
.with_context(|| format!("could not ingest record at {lsn}"))?;
|
||||
if !ingested {
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -29,11 +29,11 @@ use crate::metrics::{
|
||||
WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM,
|
||||
WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_TIME,
|
||||
};
|
||||
use crate::repository::Key;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::Context;
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus};
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::future::Future;
|
||||
use std::sync::Arc;
|
||||
@@ -548,9 +548,10 @@ impl PostgresRedoManager {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::PostgresRedoManager;
|
||||
use crate::repository::Key;
|
||||
use crate::{config::PageServerConf, walrecord::NeonWalRecord};
|
||||
use crate::config::PageServerConf;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::str::FromStr;
|
||||
use tracing::Instrument;
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::Context;
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use bytes::BytesMut;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::reltag::SlruKind;
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
|
||||
@@ -238,7 +238,7 @@ pub(crate) fn apply_in_neon(
|
||||
// No-op: this record will never be created in aux v2.
|
||||
warn!("AuxFile record should not be created in aux v2");
|
||||
}
|
||||
#[cfg(test)]
|
||||
#[cfg(feature = "testing")]
|
||||
NeonWalRecord::Test {
|
||||
append,
|
||||
clear,
|
||||
|
||||
@@ -8,10 +8,10 @@ use crate::{
|
||||
metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
|
||||
page_cache::PAGE_SZ,
|
||||
span::debug_assert_current_span_has_tenant_id,
|
||||
walrecord::NeonWalRecord,
|
||||
};
|
||||
use anyhow::Context;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::{reltag::RelTag, shard::TenantShardId};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
#[cfg(feature = "testing")]
|
||||
|
||||
@@ -16,6 +16,7 @@ OBJS = \
|
||||
neon_walreader.o \
|
||||
pagestore_smgr.o \
|
||||
relsize_cache.o \
|
||||
unstable_extensions.o \
|
||||
walproposer.o \
|
||||
walproposer_pg.o \
|
||||
control_plane_connector.o \
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#include "postgres.h"
|
||||
|
||||
#include <curl/curl.h>
|
||||
@@ -508,6 +509,8 @@ NeonXactCallback(XactEvent event, void *arg)
|
||||
static bool
|
||||
RoleIsNeonSuperuser(const char *role_name)
|
||||
{
|
||||
Assert(role_name);
|
||||
|
||||
return strcmp(role_name, "neon_superuser") == 0;
|
||||
}
|
||||
|
||||
@@ -670,7 +673,7 @@ HandleCreateRole(CreateRoleStmt *stmt)
|
||||
static void
|
||||
HandleAlterRole(AlterRoleStmt *stmt)
|
||||
{
|
||||
const char *role_name = stmt->role->rolename;
|
||||
char *role_name;
|
||||
DefElem *dpass;
|
||||
ListCell *option;
|
||||
bool found = false;
|
||||
@@ -678,6 +681,7 @@ HandleAlterRole(AlterRoleStmt *stmt)
|
||||
|
||||
InitRoleTableIfNeeded();
|
||||
|
||||
role_name = get_rolespec_name(stmt->role);
|
||||
if (RoleIsNeonSuperuser(role_name) && !superuser())
|
||||
elog(ERROR, "can't ALTER neon_superuser");
|
||||
|
||||
@@ -689,9 +693,13 @@ HandleAlterRole(AlterRoleStmt *stmt)
|
||||
if (strcmp(defel->defname, "password") == 0)
|
||||
dpass = defel;
|
||||
}
|
||||
|
||||
/* We only care about updates to the password */
|
||||
if (!dpass)
|
||||
{
|
||||
pfree(role_name);
|
||||
return;
|
||||
}
|
||||
|
||||
entry = hash_search(CurrentDdlTable->role_table,
|
||||
role_name,
|
||||
@@ -704,6 +712,8 @@ HandleAlterRole(AlterRoleStmt *stmt)
|
||||
else
|
||||
entry->password = NULL;
|
||||
entry->type = Op_Set;
|
||||
|
||||
pfree(role_name);
|
||||
}
|
||||
|
||||
static void
|
||||
|
||||
@@ -30,6 +30,7 @@
|
||||
#include "neon.h"
|
||||
#include "control_plane_connector.h"
|
||||
#include "logical_replication_monitor.h"
|
||||
#include "unstable_extensions.h"
|
||||
#include "walsender_hooks.h"
|
||||
#if PG_MAJORVERSION_NUM >= 16
|
||||
#include "storage/ipc.h"
|
||||
@@ -424,6 +425,7 @@ _PG_init(void)
|
||||
LogicalFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
|
||||
SlotFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
|
||||
|
||||
InitUnstableExtensionsSupport();
|
||||
InitLogicalReplicationMonitor();
|
||||
InitControlPlaneConnector();
|
||||
|
||||
|
||||
@@ -42,3 +42,4 @@ InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags)
|
||||
MemoryContextSwitchTo(old_context);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
129
pgxn/neon/unstable_extensions.c
Normal file
129
pgxn/neon/unstable_extensions.c
Normal file
@@ -0,0 +1,129 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "postgres.h"
|
||||
|
||||
#include "nodes/plannodes.h"
|
||||
#include "nodes/parsenodes.h"
|
||||
#include "tcop/utility.h"
|
||||
#include "utils/errcodes.h"
|
||||
#include "utils/guc.h"
|
||||
|
||||
#include "neon_pgversioncompat.h"
|
||||
#include "unstable_extensions.h"
|
||||
|
||||
static bool allow_unstable_extensions = false;
|
||||
static char *unstable_extensions = NULL;
|
||||
|
||||
static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL;
|
||||
|
||||
static bool
|
||||
list_contains(char const* comma_separated_list, char const* val)
|
||||
{
|
||||
char const* occ = comma_separated_list;
|
||||
size_t val_len = strlen(val);
|
||||
|
||||
if (val_len == 0)
|
||||
return false;
|
||||
|
||||
while ((occ = strstr(occ, val)) != NULL)
|
||||
{
|
||||
if ((occ == comma_separated_list || occ[-1] == ',')
|
||||
&& (occ[val_len] == '\0' || occ[val_len] == ','))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
occ += val_len;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
CheckUnstableExtension(
|
||||
PlannedStmt *pstmt,
|
||||
const char *queryString,
|
||||
bool readOnlyTree,
|
||||
ProcessUtilityContext context,
|
||||
ParamListInfo params,
|
||||
QueryEnvironment *queryEnv,
|
||||
DestReceiver *dest,
|
||||
QueryCompletion *qc)
|
||||
{
|
||||
Node *parseTree = pstmt->utilityStmt;
|
||||
|
||||
if (allow_unstable_extensions || unstable_extensions == NULL)
|
||||
goto process;
|
||||
|
||||
switch (nodeTag(parseTree))
|
||||
{
|
||||
case T_CreateExtensionStmt:
|
||||
{
|
||||
CreateExtensionStmt *stmt = castNode(CreateExtensionStmt, parseTree);
|
||||
if (list_contains(unstable_extensions, stmt->extname))
|
||||
{
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
|
||||
errmsg("%s extension is in beta and may be unstable or introduce backward-incompatible changes.\nWe recommend testing it in a separate, dedicated Neon project.", stmt->extname),
|
||||
errhint("to proceed with installation, run SET neon.allow_unstable_extensions='true'")));
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
goto process;
|
||||
}
|
||||
|
||||
process:
|
||||
if (PreviousProcessUtilityHook)
|
||||
{
|
||||
PreviousProcessUtilityHook(
|
||||
pstmt,
|
||||
queryString,
|
||||
readOnlyTree,
|
||||
context,
|
||||
params,
|
||||
queryEnv,
|
||||
dest,
|
||||
qc);
|
||||
}
|
||||
else
|
||||
{
|
||||
standard_ProcessUtility(
|
||||
pstmt,
|
||||
queryString,
|
||||
readOnlyTree,
|
||||
context,
|
||||
params,
|
||||
queryEnv,
|
||||
dest,
|
||||
qc);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
InitUnstableExtensionsSupport(void)
|
||||
{
|
||||
DefineCustomBoolVariable(
|
||||
"neon.allow_unstable_extensions",
|
||||
"Allow unstable extensions to be installed and used",
|
||||
NULL,
|
||||
&allow_unstable_extensions,
|
||||
false,
|
||||
PGC_USERSET,
|
||||
0,
|
||||
NULL, NULL, NULL);
|
||||
|
||||
DefineCustomStringVariable(
|
||||
"neon.unstable_extensions",
|
||||
"List of unstable extensions",
|
||||
NULL,
|
||||
&unstable_extensions,
|
||||
NULL,
|
||||
PGC_SUSET,
|
||||
0,
|
||||
NULL, NULL, NULL);
|
||||
|
||||
PreviousProcessUtilityHook = ProcessUtility_hook;
|
||||
ProcessUtility_hook = CheckUnstableExtension;
|
||||
}
|
||||
6
pgxn/neon/unstable_extensions.h
Normal file
6
pgxn/neon/unstable_extensions.h
Normal file
@@ -0,0 +1,6 @@
|
||||
#ifndef __NEON_UNSTABLE_EXTENSIONS_H__
|
||||
#define __NEON_UNSTABLE_EXTENSIONS_H__
|
||||
|
||||
void InitUnstableExtensionsSupport(void);
|
||||
|
||||
#endif
|
||||
295
poetry.lock
generated
295
poetry.lock
generated
@@ -1,4 +1,4 @@
|
||||
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
|
||||
# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
|
||||
|
||||
[[package]]
|
||||
name = "aiohappyeyeballs"
|
||||
@@ -1034,24 +1034,25 @@ test-randomorder = ["pytest-randomly"]
|
||||
|
||||
[[package]]
|
||||
name = "docker"
|
||||
version = "4.2.2"
|
||||
version = "7.1.0"
|
||||
description = "A Python library for the Docker Engine API."
|
||||
optional = false
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "docker-4.2.2-py2.py3-none-any.whl", hash = "sha256:03a46400c4080cb6f7aa997f881ddd84fef855499ece219d75fbdb53289c17ab"},
|
||||
{file = "docker-4.2.2.tar.gz", hash = "sha256:26eebadce7e298f55b76a88c4f8802476c5eaddbdbe38dbc6cce8781c47c9b54"},
|
||||
{file = "docker-7.1.0-py3-none-any.whl", hash = "sha256:c96b93b7f0a746f9e77d325bcfb87422a3d8bd4f03136ae8a85b37f1898d5fc0"},
|
||||
{file = "docker-7.1.0.tar.gz", hash = "sha256:ad8c70e6e3f8926cb8a92619b832b4ea5299e2831c14284663184e200546fa6c"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
pypiwin32 = {version = "223", markers = "sys_platform == \"win32\" and python_version >= \"3.6\""}
|
||||
requests = ">=2.14.2,<2.18.0 || >2.18.0"
|
||||
six = ">=1.4.0"
|
||||
websocket-client = ">=0.32.0"
|
||||
pywin32 = {version = ">=304", markers = "sys_platform == \"win32\""}
|
||||
requests = ">=2.26.0"
|
||||
urllib3 = ">=1.26.0"
|
||||
|
||||
[package.extras]
|
||||
ssh = ["paramiko (>=2.4.2)"]
|
||||
tls = ["cryptography (>=1.3.4)", "idna (>=2.0.0)", "pyOpenSSL (>=17.5.0)"]
|
||||
dev = ["coverage (==7.2.7)", "pytest (==7.4.2)", "pytest-cov (==4.1.0)", "pytest-timeout (==2.1.0)", "ruff (==0.1.8)"]
|
||||
docs = ["myst-parser (==0.18.0)", "sphinx (==5.1.1)"]
|
||||
ssh = ["paramiko (>=2.4.3)"]
|
||||
websockets = ["websocket-client (>=1.3.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "exceptiongroup"
|
||||
@@ -1416,6 +1417,16 @@ files = [
|
||||
{file = "jsondiff-2.0.0.tar.gz", hash = "sha256:2795844ef075ec8a2b8d385c4d59f5ea48b08e7180fce3cb2787be0db00b1fb4"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "jsonnet"
|
||||
version = "0.20.0"
|
||||
description = "Python bindings for Jsonnet - The data templating language"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "jsonnet-0.20.0.tar.gz", hash = "sha256:7e770c7bf3a366b97b650a39430450f77612e74406731eb75c5bd59f3f104d4f"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "jsonpatch"
|
||||
version = "1.32"
|
||||
@@ -1521,6 +1532,21 @@ files = [
|
||||
[package.dependencies]
|
||||
six = "*"
|
||||
|
||||
[[package]]
|
||||
name = "jwcrypto"
|
||||
version = "1.5.6"
|
||||
description = "Implementation of JOSE Web standards"
|
||||
optional = false
|
||||
python-versions = ">= 3.8"
|
||||
files = [
|
||||
{file = "jwcrypto-1.5.6-py3-none-any.whl", hash = "sha256:150d2b0ebbdb8f40b77f543fb44ffd2baeff48788be71f67f03566692fd55789"},
|
||||
{file = "jwcrypto-1.5.6.tar.gz", hash = "sha256:771a87762a0c081ae6166958a954f80848820b2ab066937dc8b8379d65b1b039"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
cryptography = ">=3.4"
|
||||
typing-extensions = ">=4.5.0"
|
||||
|
||||
[[package]]
|
||||
name = "kafka-python"
|
||||
version = "2.0.2"
|
||||
@@ -2328,20 +2354,6 @@ files = [
|
||||
[package.extras]
|
||||
diagrams = ["jinja2", "railroad-diagrams"]
|
||||
|
||||
[[package]]
|
||||
name = "pypiwin32"
|
||||
version = "223"
|
||||
description = ""
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "pypiwin32-223-py3-none-any.whl", hash = "sha256:67adf399debc1d5d14dffc1ab5acacb800da569754fafdc576b2a039485aa775"},
|
||||
{file = "pypiwin32-223.tar.gz", hash = "sha256:71be40c1fbd28594214ecaecb58e7aa8b708eabfa0125c8a109ebd51edbd776a"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
pywin32 = ">=223"
|
||||
|
||||
[[package]]
|
||||
name = "pyrsistent"
|
||||
version = "0.18.1"
|
||||
@@ -2561,81 +2573,91 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "pywin32"
|
||||
version = "301"
|
||||
version = "308"
|
||||
description = "Python for Window Extensions"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "pywin32-301-cp35-cp35m-win32.whl", hash = "sha256:93367c96e3a76dfe5003d8291ae16454ca7d84bb24d721e0b74a07610b7be4a7"},
|
||||
{file = "pywin32-301-cp35-cp35m-win_amd64.whl", hash = "sha256:9635df6998a70282bd36e7ac2a5cef9ead1627b0a63b17c731312c7a0daebb72"},
|
||||
{file = "pywin32-301-cp36-cp36m-win32.whl", hash = "sha256:c866f04a182a8cb9b7855de065113bbd2e40524f570db73ef1ee99ff0a5cc2f0"},
|
||||
{file = "pywin32-301-cp36-cp36m-win_amd64.whl", hash = "sha256:dafa18e95bf2a92f298fe9c582b0e205aca45c55f989937c52c454ce65b93c78"},
|
||||
{file = "pywin32-301-cp37-cp37m-win32.whl", hash = "sha256:98f62a3f60aa64894a290fb7494bfa0bfa0a199e9e052e1ac293b2ad3cd2818b"},
|
||||
{file = "pywin32-301-cp37-cp37m-win_amd64.whl", hash = "sha256:fb3b4933e0382ba49305cc6cd3fb18525df7fd96aa434de19ce0878133bf8e4a"},
|
||||
{file = "pywin32-301-cp38-cp38-win32.whl", hash = "sha256:88981dd3cfb07432625b180f49bf4e179fb8cbb5704cd512e38dd63636af7a17"},
|
||||
{file = "pywin32-301-cp38-cp38-win_amd64.whl", hash = "sha256:8c9d33968aa7fcddf44e47750e18f3d034c3e443a707688a008a2e52bbef7e96"},
|
||||
{file = "pywin32-301-cp39-cp39-win32.whl", hash = "sha256:595d397df65f1b2e0beaca63a883ae6d8b6df1cdea85c16ae85f6d2e648133fe"},
|
||||
{file = "pywin32-301-cp39-cp39-win_amd64.whl", hash = "sha256:87604a4087434cd814ad8973bd47d6524bd1fa9e971ce428e76b62a5e0860fdf"},
|
||||
{file = "pywin32-308-cp310-cp310-win32.whl", hash = "sha256:796ff4426437896550d2981b9c2ac0ffd75238ad9ea2d3bfa67a1abd546d262e"},
|
||||
{file = "pywin32-308-cp310-cp310-win_amd64.whl", hash = "sha256:4fc888c59b3c0bef905ce7eb7e2106a07712015ea1c8234b703a088d46110e8e"},
|
||||
{file = "pywin32-308-cp310-cp310-win_arm64.whl", hash = "sha256:a5ab5381813b40f264fa3495b98af850098f814a25a63589a8e9eb12560f450c"},
|
||||
{file = "pywin32-308-cp311-cp311-win32.whl", hash = "sha256:5d8c8015b24a7d6855b1550d8e660d8daa09983c80e5daf89a273e5c6fb5095a"},
|
||||
{file = "pywin32-308-cp311-cp311-win_amd64.whl", hash = "sha256:575621b90f0dc2695fec346b2d6302faebd4f0f45c05ea29404cefe35d89442b"},
|
||||
{file = "pywin32-308-cp311-cp311-win_arm64.whl", hash = "sha256:100a5442b7332070983c4cd03f2e906a5648a5104b8a7f50175f7906efd16bb6"},
|
||||
{file = "pywin32-308-cp312-cp312-win32.whl", hash = "sha256:587f3e19696f4bf96fde9d8a57cec74a57021ad5f204c9e627e15c33ff568897"},
|
||||
{file = "pywin32-308-cp312-cp312-win_amd64.whl", hash = "sha256:00b3e11ef09ede56c6a43c71f2d31857cf7c54b0ab6e78ac659497abd2834f47"},
|
||||
{file = "pywin32-308-cp312-cp312-win_arm64.whl", hash = "sha256:9b4de86c8d909aed15b7011182c8cab38c8850de36e6afb1f0db22b8959e3091"},
|
||||
{file = "pywin32-308-cp313-cp313-win32.whl", hash = "sha256:1c44539a37a5b7b21d02ab34e6a4d314e0788f1690d65b48e9b0b89f31abbbed"},
|
||||
{file = "pywin32-308-cp313-cp313-win_amd64.whl", hash = "sha256:fd380990e792eaf6827fcb7e187b2b4b1cede0585e3d0c9e84201ec27b9905e4"},
|
||||
{file = "pywin32-308-cp313-cp313-win_arm64.whl", hash = "sha256:ef313c46d4c18dfb82a2431e3051ac8f112ccee1a34f29c263c583c568db63cd"},
|
||||
{file = "pywin32-308-cp37-cp37m-win32.whl", hash = "sha256:1f696ab352a2ddd63bd07430080dd598e6369152ea13a25ebcdd2f503a38f1ff"},
|
||||
{file = "pywin32-308-cp37-cp37m-win_amd64.whl", hash = "sha256:13dcb914ed4347019fbec6697a01a0aec61019c1046c2b905410d197856326a6"},
|
||||
{file = "pywin32-308-cp38-cp38-win32.whl", hash = "sha256:5794e764ebcabf4ff08c555b31bd348c9025929371763b2183172ff4708152f0"},
|
||||
{file = "pywin32-308-cp38-cp38-win_amd64.whl", hash = "sha256:3b92622e29d651c6b783e368ba7d6722b1634b8e70bd376fd7610fe1992e19de"},
|
||||
{file = "pywin32-308-cp39-cp39-win32.whl", hash = "sha256:7873ca4dc60ab3287919881a7d4f88baee4a6e639aa6962de25a98ba6b193341"},
|
||||
{file = "pywin32-308-cp39-cp39-win_amd64.whl", hash = "sha256:71b3322d949b4cc20776436a9c9ba0eeedcbc9c650daa536df63f0ff111bb920"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pyyaml"
|
||||
version = "6.0.1"
|
||||
version = "6.0.2"
|
||||
description = "YAML parser and emitter for Python"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "PyYAML-6.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a"},
|
||||
{file = "PyYAML-6.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f"},
|
||||
{file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"},
|
||||
{file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"},
|
||||
{file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"},
|
||||
{file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"},
|
||||
{file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"},
|
||||
{file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"},
|
||||
{file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"},
|
||||
{file = "PyYAML-6.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab"},
|
||||
{file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"},
|
||||
{file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"},
|
||||
{file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"},
|
||||
{file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"},
|
||||
{file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"},
|
||||
{file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"},
|
||||
{file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"},
|
||||
{file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"},
|
||||
{file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"},
|
||||
{file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd"},
|
||||
{file = "PyYAML-6.0.1-cp36-cp36m-win32.whl", hash = "sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585"},
|
||||
{file = "PyYAML-6.0.1-cp36-cp36m-win_amd64.whl", hash = "sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa"},
|
||||
{file = "PyYAML-6.0.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3"},
|
||||
{file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27"},
|
||||
{file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3"},
|
||||
{file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c"},
|
||||
{file = "PyYAML-6.0.1-cp37-cp37m-win32.whl", hash = "sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba"},
|
||||
{file = "PyYAML-6.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867"},
|
||||
{file = "PyYAML-6.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595"},
|
||||
{file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"},
|
||||
{file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"},
|
||||
{file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"},
|
||||
{file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"},
|
||||
{file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"},
|
||||
{file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"},
|
||||
{file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"},
|
||||
{file = "PyYAML-6.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859"},
|
||||
{file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"},
|
||||
{file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"},
|
||||
{file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"},
|
||||
{file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"},
|
||||
{file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"},
|
||||
{file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"},
|
||||
{file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"},
|
||||
{file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"},
|
||||
{file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"},
|
||||
{file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237"},
|
||||
{file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b"},
|
||||
{file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed"},
|
||||
{file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180"},
|
||||
{file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68"},
|
||||
{file = "PyYAML-6.0.2-cp310-cp310-win32.whl", hash = "sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99"},
|
||||
{file = "PyYAML-6.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e"},
|
||||
{file = "PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774"},
|
||||
{file = "PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee"},
|
||||
{file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c"},
|
||||
{file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317"},
|
||||
{file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85"},
|
||||
{file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4"},
|
||||
{file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e"},
|
||||
{file = "PyYAML-6.0.2-cp311-cp311-win32.whl", hash = "sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5"},
|
||||
{file = "PyYAML-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44"},
|
||||
{file = "PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab"},
|
||||
{file = "PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725"},
|
||||
{file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5"},
|
||||
{file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425"},
|
||||
{file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476"},
|
||||
{file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48"},
|
||||
{file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b"},
|
||||
{file = "PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4"},
|
||||
{file = "PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8"},
|
||||
{file = "PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba"},
|
||||
{file = "PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1"},
|
||||
{file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133"},
|
||||
{file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484"},
|
||||
{file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5"},
|
||||
{file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc"},
|
||||
{file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652"},
|
||||
{file = "PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183"},
|
||||
{file = "PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563"},
|
||||
{file = "PyYAML-6.0.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:24471b829b3bf607e04e88d79542a9d48bb037c2267d7927a874e6c205ca7e9a"},
|
||||
{file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7fded462629cfa4b685c5416b949ebad6cec74af5e2d42905d41e257e0869f5"},
|
||||
{file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d84a1718ee396f54f3a086ea0a66d8e552b2ab2017ef8b420e92edbc841c352d"},
|
||||
{file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9056c1ecd25795207ad294bcf39f2db3d845767be0ea6e6a34d856f006006083"},
|
||||
{file = "PyYAML-6.0.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:82d09873e40955485746739bcb8b4586983670466c23382c19cffecbf1fd8706"},
|
||||
{file = "PyYAML-6.0.2-cp38-cp38-win32.whl", hash = "sha256:43fa96a3ca0d6b1812e01ced1044a003533c47f6ee8aca31724f78e93ccc089a"},
|
||||
{file = "PyYAML-6.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:01179a4a8559ab5de078078f37e5c1a30d76bb88519906844fd7bdea1b7729ff"},
|
||||
{file = "PyYAML-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d"},
|
||||
{file = "PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f"},
|
||||
{file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290"},
|
||||
{file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12"},
|
||||
{file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19"},
|
||||
{file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e"},
|
||||
{file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725"},
|
||||
{file = "PyYAML-6.0.2-cp39-cp39-win32.whl", hash = "sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631"},
|
||||
{file = "PyYAML-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8"},
|
||||
{file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2890,6 +2912,58 @@ files = [
|
||||
[package.dependencies]
|
||||
mpmath = ">=0.19"
|
||||
|
||||
[[package]]
|
||||
name = "testcontainers"
|
||||
version = "4.8.1"
|
||||
description = "Python library for throwaway instances of anything that can run in a Docker container"
|
||||
optional = false
|
||||
python-versions = "<4.0,>=3.9"
|
||||
files = [
|
||||
{file = "testcontainers-4.8.1-py3-none-any.whl", hash = "sha256:d8ae43e8fe34060fcd5c3f494e0b7652b7774beabe94568a2283d0881e94d489"},
|
||||
{file = "testcontainers-4.8.1.tar.gz", hash = "sha256:5ded4820b7227ad526857eb3caaafcabce1bbac05d22ad194849b136ffae3cb0"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
docker = "*"
|
||||
typing-extensions = "*"
|
||||
urllib3 = "*"
|
||||
wrapt = "*"
|
||||
|
||||
[package.extras]
|
||||
arangodb = ["python-arango (>=7.8,<8.0)"]
|
||||
aws = ["boto3", "httpx"]
|
||||
azurite = ["azure-storage-blob (>=12.19,<13.0)"]
|
||||
chroma = ["chromadb-client"]
|
||||
clickhouse = ["clickhouse-driver"]
|
||||
cosmosdb = ["azure-cosmos"]
|
||||
db2 = ["ibm_db_sa", "sqlalchemy"]
|
||||
generic = ["httpx", "redis"]
|
||||
google = ["google-cloud-datastore (>=2)", "google-cloud-pubsub (>=2)"]
|
||||
influxdb = ["influxdb", "influxdb-client"]
|
||||
k3s = ["kubernetes", "pyyaml"]
|
||||
keycloak = ["python-keycloak"]
|
||||
localstack = ["boto3"]
|
||||
mailpit = ["cryptography"]
|
||||
minio = ["minio"]
|
||||
mongodb = ["pymongo"]
|
||||
mssql = ["pymssql", "sqlalchemy"]
|
||||
mysql = ["pymysql[rsa]", "sqlalchemy"]
|
||||
nats = ["nats-py"]
|
||||
neo4j = ["neo4j"]
|
||||
opensearch = ["opensearch-py"]
|
||||
oracle = ["oracledb", "sqlalchemy"]
|
||||
oracle-free = ["oracledb", "sqlalchemy"]
|
||||
qdrant = ["qdrant-client"]
|
||||
rabbitmq = ["pika"]
|
||||
redis = ["redis"]
|
||||
registry = ["bcrypt"]
|
||||
scylla = ["cassandra-driver (==3.29.1)"]
|
||||
selenium = ["selenium"]
|
||||
sftp = ["cryptography"]
|
||||
test-module-import = ["httpx"]
|
||||
trino = ["trino"]
|
||||
weaviate = ["weaviate-client (>=4.5.4,<5.0.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "toml"
|
||||
version = "0.10.2"
|
||||
@@ -2912,6 +2986,20 @@ files = [
|
||||
{file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "types-jwcrypto"
|
||||
version = "1.5.0.20240925"
|
||||
description = "Typing stubs for jwcrypto"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "types-jwcrypto-1.5.0.20240925.tar.gz", hash = "sha256:50e17b790378c96239344476c7bd13b52d0c7eeb6d16c2d53723e48cc6bbf4fe"},
|
||||
{file = "types_jwcrypto-1.5.0.20240925-py3-none-any.whl", hash = "sha256:2d12a2d528240d326075e896aafec7056b9136bf3207fa6ccf3fcb8fbf9e11a1"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
cryptography = "*"
|
||||
|
||||
[[package]]
|
||||
name = "types-psutil"
|
||||
version = "5.9.5.12"
|
||||
@@ -2945,6 +3033,17 @@ files = [
|
||||
{file = "types_pytest_lazy_fixture-0.6.3.3-py3-none-any.whl", hash = "sha256:a56a55649147ff960ff79d4b2c781a4f769351abc1876873f3116d0bd0c96353"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "types-pyyaml"
|
||||
version = "6.0.12.20240917"
|
||||
description = "Typing stubs for PyYAML"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "types-PyYAML-6.0.12.20240917.tar.gz", hash = "sha256:d1405a86f9576682234ef83bcb4e6fff7c9305c8b1fbad5e0bcd4f7dbdc9c587"},
|
||||
{file = "types_PyYAML-6.0.12.20240917-py3-none-any.whl", hash = "sha256:392b267f1c0fe6022952462bf5d6523f31e37f6cea49b14cee7ad634b6301570"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "types-requests"
|
||||
version = "2.31.0.0"
|
||||
@@ -3019,22 +3118,6 @@ brotli = ["brotli (==1.0.9)", "brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotl
|
||||
secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"]
|
||||
socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "websocket-client"
|
||||
version = "1.3.3"
|
||||
description = "WebSocket client for Python with low level API options"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "websocket-client-1.3.3.tar.gz", hash = "sha256:d58c5f284d6a9bf8379dab423259fe8f85b70d5fa5d2916d5791a84594b122b1"},
|
||||
{file = "websocket_client-1.3.3-py3-none-any.whl", hash = "sha256:5d55652dc1d0b3c734f044337d929aaf83f4f9138816ec680c1aefefb4dc4877"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
docs = ["Sphinx (>=3.4)", "sphinx-rtd-theme (>=0.5)"]
|
||||
optional = ["python-socks", "wsaccel"]
|
||||
test = ["websockets"]
|
||||
|
||||
[[package]]
|
||||
name = "websockets"
|
||||
version = "12.0"
|
||||
@@ -3406,4 +3489,4 @@ cffi = ["cffi (>=1.11)"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "0f4804119f417edf8e1fbd6d715d2e8d70ad731334fa9570304a2203f83339cf"
|
||||
content-hash = "13bfc7479aacfe051abb92252b8ddc2e0c429f4607b2d9d8c4b353d2f75c1927"
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::{info, warn};
|
||||
use tracing::{debug, info};
|
||||
|
||||
use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint};
|
||||
use crate::auth::{self, AuthFlow};
|
||||
@@ -21,7 +21,7 @@ pub(crate) async fn authenticate_cleartext(
|
||||
secret: AuthSecret,
|
||||
config: &'static AuthenticationConfig,
|
||||
) -> auth::Result<ComputeCredentials> {
|
||||
warn!("cleartext auth flow override is enabled, proceeding");
|
||||
debug!("cleartext auth flow override is enabled, proceeding");
|
||||
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
|
||||
|
||||
// pause the timer while we communicate with the client
|
||||
@@ -61,7 +61,7 @@ pub(crate) async fn password_hack_no_authentication(
|
||||
info: ComputeUserInfoNoEndpoint,
|
||||
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||
) -> auth::Result<(ComputeUserInfo, Vec<u8>)> {
|
||||
warn!("project not specified, resorting to the password hack auth flow");
|
||||
debug!("project not specified, resorting to the password hack auth flow");
|
||||
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
|
||||
|
||||
// pause the timer while we communicate with the client
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use std::borrow::Cow;
|
||||
use std::future::Future;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, SystemTime};
|
||||
@@ -45,6 +46,7 @@ pub(crate) enum FetchAuthRulesError {
|
||||
RoleJwksNotConfigured,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct AuthRule {
|
||||
pub(crate) id: String,
|
||||
pub(crate) jwks_url: url::Url,
|
||||
@@ -277,7 +279,7 @@ impl JwkCacheEntryLock {
|
||||
|
||||
// get the key from the JWKs if possible. If not, wait for the keys to update.
|
||||
let (jwk, expected_audience) = loop {
|
||||
match guard.find_jwk_and_audience(kid, role_name) {
|
||||
match guard.find_jwk_and_audience(&kid, role_name) {
|
||||
Some(jwk) => break jwk,
|
||||
None if guard.last_retrieved.elapsed() > MIN_RENEW => {
|
||||
let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
|
||||
@@ -312,7 +314,9 @@ impl JwkCacheEntryLock {
|
||||
|
||||
if let Some(aud) = expected_audience {
|
||||
if payload.audience.0.iter().all(|s| s != aud) {
|
||||
return Err(JwtError::InvalidJwtTokenAudience);
|
||||
return Err(JwtError::InvalidClaims(
|
||||
JwtClaimsError::InvalidJwtTokenAudience,
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -320,13 +324,15 @@ impl JwkCacheEntryLock {
|
||||
|
||||
if let Some(exp) = payload.expiration {
|
||||
if now >= exp + CLOCK_SKEW_LEEWAY {
|
||||
return Err(JwtError::JwtTokenHasExpired);
|
||||
return Err(JwtError::InvalidClaims(JwtClaimsError::JwtTokenHasExpired));
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(nbf) = payload.not_before {
|
||||
if nbf >= now + CLOCK_SKEW_LEEWAY {
|
||||
return Err(JwtError::JwtTokenNotYetReadyToUse);
|
||||
return Err(JwtError::InvalidClaims(
|
||||
JwtClaimsError::JwtTokenNotYetReadyToUse,
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -420,8 +426,8 @@ struct JwtHeader<'a> {
|
||||
#[serde(rename = "alg")]
|
||||
algorithm: jose_jwa::Algorithm,
|
||||
/// key id, must be provided for our usecase
|
||||
#[serde(rename = "kid")]
|
||||
key_id: Option<&'a str>,
|
||||
#[serde(rename = "kid", borrow)]
|
||||
key_id: Option<Cow<'a, str>>,
|
||||
}
|
||||
|
||||
/// <https://datatracker.ietf.org/doc/html/rfc7519#section-4.1>
|
||||
@@ -440,17 +446,17 @@ struct JwtPayload<'a> {
|
||||
|
||||
// the following entries are only extracted for the sake of debug logging.
|
||||
/// Issuer of the JWT
|
||||
#[serde(rename = "iss")]
|
||||
issuer: Option<&'a str>,
|
||||
#[serde(rename = "iss", borrow)]
|
||||
issuer: Option<Cow<'a, str>>,
|
||||
/// Subject of the JWT (the user)
|
||||
#[serde(rename = "sub")]
|
||||
subject: Option<&'a str>,
|
||||
#[serde(rename = "sub", borrow)]
|
||||
subject: Option<Cow<'a, str>>,
|
||||
/// Unique token identifier
|
||||
#[serde(rename = "jti")]
|
||||
jwt_id: Option<&'a str>,
|
||||
#[serde(rename = "jti", borrow)]
|
||||
jwt_id: Option<Cow<'a, str>>,
|
||||
/// Unique session identifier
|
||||
#[serde(rename = "sid")]
|
||||
session_id: Option<&'a str>,
|
||||
#[serde(rename = "sid", borrow)]
|
||||
session_id: Option<Cow<'a, str>>,
|
||||
}
|
||||
|
||||
/// `OneOrMany` supports parsing either a single item or an array of items.
|
||||
@@ -585,14 +591,8 @@ pub(crate) enum JwtError {
|
||||
#[error("Provided authentication token is not a valid JWT encoding")]
|
||||
JwtEncoding(#[from] JwtEncodingError),
|
||||
|
||||
#[error("invalid JWT token audience")]
|
||||
InvalidJwtTokenAudience,
|
||||
|
||||
#[error("JWT token has expired")]
|
||||
JwtTokenHasExpired,
|
||||
|
||||
#[error("JWT token is not yet ready to use")]
|
||||
JwtTokenNotYetReadyToUse,
|
||||
#[error(transparent)]
|
||||
InvalidClaims(#[from] JwtClaimsError),
|
||||
|
||||
#[error("invalid P256 key")]
|
||||
InvalidP256Key(jose_jwk::crypto::Error),
|
||||
@@ -644,6 +644,19 @@ pub enum JwtEncodingError {
|
||||
InvalidCompactForm,
|
||||
}
|
||||
|
||||
#[derive(Error, Debug, PartialEq)]
|
||||
#[non_exhaustive]
|
||||
pub enum JwtClaimsError {
|
||||
#[error("invalid JWT token audience")]
|
||||
InvalidJwtTokenAudience,
|
||||
|
||||
#[error("JWT token has expired")]
|
||||
JwtTokenHasExpired,
|
||||
|
||||
#[error("JWT token is not yet ready to use")]
|
||||
JwtTokenNotYetReadyToUse,
|
||||
}
|
||||
|
||||
#[allow(dead_code, reason = "Debug use only")]
|
||||
#[derive(Debug)]
|
||||
pub(crate) enum KeyType {
|
||||
@@ -680,6 +693,8 @@ mod tests {
|
||||
use hyper_util::rt::TokioIo;
|
||||
use rand::rngs::OsRng;
|
||||
use rsa::pkcs8::DecodePrivateKey;
|
||||
use serde::Serialize;
|
||||
use serde_json::json;
|
||||
use signature::Signer;
|
||||
use tokio::net::TcpListener;
|
||||
|
||||
@@ -693,6 +708,7 @@ mod tests {
|
||||
key: jose_jwk::Key::Ec(pk),
|
||||
prm: jose_jwk::Parameters {
|
||||
kid: Some(kid),
|
||||
alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Es256)),
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
@@ -706,24 +722,47 @@ mod tests {
|
||||
key: jose_jwk::Key::Rsa(pk),
|
||||
prm: jose_jwk::Parameters {
|
||||
kid: Some(kid),
|
||||
alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Rs256)),
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
(sk, jwk)
|
||||
}
|
||||
|
||||
fn now() -> u64 {
|
||||
SystemTime::now()
|
||||
.duration_since(SystemTime::UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_secs()
|
||||
}
|
||||
|
||||
fn build_jwt_payload(kid: String, sig: jose_jwa::Signing) -> String {
|
||||
let now = now();
|
||||
let body = typed_json::json! {{
|
||||
"exp": now + 3600,
|
||||
"nbf": now,
|
||||
"aud": ["audience1", "neon", "audience2"],
|
||||
"sub": "user1",
|
||||
"sid": "session1",
|
||||
"jti": "token1",
|
||||
"iss": "neon-testing",
|
||||
}};
|
||||
build_custom_jwt_payload(kid, body, sig)
|
||||
}
|
||||
|
||||
fn build_custom_jwt_payload(
|
||||
kid: String,
|
||||
body: impl Serialize,
|
||||
sig: jose_jwa::Signing,
|
||||
) -> String {
|
||||
let header = JwtHeader {
|
||||
algorithm: jose_jwa::Algorithm::Signing(sig),
|
||||
key_id: Some(&kid),
|
||||
key_id: Some(Cow::Owned(kid)),
|
||||
};
|
||||
let body = typed_json::json! {{
|
||||
"exp": SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs() + 3600,
|
||||
}};
|
||||
|
||||
let header =
|
||||
base64::encode_config(serde_json::to_string(&header).unwrap(), URL_SAFE_NO_PAD);
|
||||
let body = base64::encode_config(body.to_string(), URL_SAFE_NO_PAD);
|
||||
let body = base64::encode_config(serde_json::to_string(&body).unwrap(), URL_SAFE_NO_PAD);
|
||||
|
||||
format!("{header}.{body}")
|
||||
}
|
||||
@@ -738,6 +777,16 @@ mod tests {
|
||||
format!("{payload}.{sig}")
|
||||
}
|
||||
|
||||
fn new_custom_ec_jwt(kid: String, key: &p256::SecretKey, body: impl Serialize) -> String {
|
||||
use p256::ecdsa::{Signature, SigningKey};
|
||||
|
||||
let payload = build_custom_jwt_payload(kid, body, jose_jwa::Signing::Es256);
|
||||
let sig: Signature = SigningKey::from(key).sign(payload.as_bytes());
|
||||
let sig = base64::encode_config(sig.to_bytes(), URL_SAFE_NO_PAD);
|
||||
|
||||
format!("{payload}.{sig}")
|
||||
}
|
||||
|
||||
fn new_rsa_jwt(kid: String, key: rsa::RsaPrivateKey) -> String {
|
||||
use rsa::pkcs1v15::SigningKey;
|
||||
use rsa::signature::SignatureEncoding;
|
||||
@@ -809,37 +858,34 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
|
||||
-----END PRIVATE KEY-----
|
||||
";
|
||||
|
||||
#[tokio::test]
|
||||
async fn renew() {
|
||||
let (rs1, jwk1) = new_rsa_jwk(RS1, "1".into());
|
||||
let (rs2, jwk2) = new_rsa_jwk(RS2, "2".into());
|
||||
let (ec1, jwk3) = new_ec_jwk("3".into());
|
||||
let (ec2, jwk4) = new_ec_jwk("4".into());
|
||||
#[derive(Clone)]
|
||||
struct Fetch(Vec<AuthRule>);
|
||||
|
||||
let foo_jwks = jose_jwk::JwkSet {
|
||||
keys: vec![jwk1, jwk3],
|
||||
};
|
||||
let bar_jwks = jose_jwk::JwkSet {
|
||||
keys: vec![jwk2, jwk4],
|
||||
};
|
||||
impl FetchAuthRules for Fetch {
|
||||
async fn fetch_auth_rules(
|
||||
&self,
|
||||
_ctx: &RequestMonitoring,
|
||||
_endpoint: EndpointId,
|
||||
) -> Result<Vec<AuthRule>, FetchAuthRulesError> {
|
||||
Ok(self.0.clone())
|
||||
}
|
||||
}
|
||||
|
||||
async fn jwks_server(
|
||||
router: impl for<'a> Fn(&'a str) -> Option<Vec<u8>> + Send + Sync + 'static,
|
||||
) -> SocketAddr {
|
||||
let router = Arc::new(router);
|
||||
let service = service_fn(move |req| {
|
||||
let foo_jwks = foo_jwks.clone();
|
||||
let bar_jwks = bar_jwks.clone();
|
||||
let router = Arc::clone(&router);
|
||||
async move {
|
||||
let jwks = match req.uri().path() {
|
||||
"/foo" => &foo_jwks,
|
||||
"/bar" => &bar_jwks,
|
||||
_ => {
|
||||
return Response::builder()
|
||||
.status(404)
|
||||
.body(Full::new(Bytes::new()));
|
||||
}
|
||||
};
|
||||
let body = serde_json::to_vec(jwks).unwrap();
|
||||
Response::builder()
|
||||
.status(200)
|
||||
.body(Full::new(Bytes::from(body)))
|
||||
match router(req.uri().path()) {
|
||||
Some(body) => Response::builder()
|
||||
.status(200)
|
||||
.body(Full::new(Bytes::from(body))),
|
||||
None => Response::builder()
|
||||
.status(404)
|
||||
.body(Full::new(Bytes::new())),
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
@@ -854,84 +900,61 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
|
||||
}
|
||||
});
|
||||
|
||||
let client = reqwest::Client::new();
|
||||
addr
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct Fetch(SocketAddr, Vec<RoleNameInt>);
|
||||
#[tokio::test]
|
||||
async fn check_jwt_happy_path() {
|
||||
let (rs1, jwk1) = new_rsa_jwk(RS1, "rs1".into());
|
||||
let (rs2, jwk2) = new_rsa_jwk(RS2, "rs2".into());
|
||||
let (ec1, jwk3) = new_ec_jwk("ec1".into());
|
||||
let (ec2, jwk4) = new_ec_jwk("ec2".into());
|
||||
|
||||
impl FetchAuthRules for Fetch {
|
||||
async fn fetch_auth_rules(
|
||||
&self,
|
||||
_ctx: &RequestMonitoring,
|
||||
_endpoint: EndpointId,
|
||||
) -> Result<Vec<AuthRule>, FetchAuthRulesError> {
|
||||
Ok(vec![
|
||||
AuthRule {
|
||||
id: "foo".to_owned(),
|
||||
jwks_url: format!("http://{}/foo", self.0).parse().unwrap(),
|
||||
audience: None,
|
||||
role_names: self.1.clone(),
|
||||
},
|
||||
AuthRule {
|
||||
id: "bar".to_owned(),
|
||||
jwks_url: format!("http://{}/bar", self.0).parse().unwrap(),
|
||||
audience: None,
|
||||
role_names: self.1.clone(),
|
||||
},
|
||||
])
|
||||
}
|
||||
}
|
||||
let foo_jwks = jose_jwk::JwkSet {
|
||||
keys: vec![jwk1, jwk3],
|
||||
};
|
||||
let bar_jwks = jose_jwk::JwkSet {
|
||||
keys: vec![jwk2, jwk4],
|
||||
};
|
||||
|
||||
let jwks_addr = jwks_server(move |path| match path {
|
||||
"/foo" => Some(serde_json::to_vec(&foo_jwks).unwrap()),
|
||||
"/bar" => Some(serde_json::to_vec(&bar_jwks).unwrap()),
|
||||
_ => None,
|
||||
})
|
||||
.await;
|
||||
|
||||
let role_name1 = RoleName::from("anonymous");
|
||||
let role_name2 = RoleName::from("authenticated");
|
||||
|
||||
let fetch = Fetch(
|
||||
addr,
|
||||
vec![
|
||||
RoleNameInt::from(&role_name1),
|
||||
RoleNameInt::from(&role_name2),
|
||||
],
|
||||
);
|
||||
let roles = vec![
|
||||
RoleNameInt::from(&role_name1),
|
||||
RoleNameInt::from(&role_name2),
|
||||
];
|
||||
let rules = vec![
|
||||
AuthRule {
|
||||
id: "foo".to_owned(),
|
||||
jwks_url: format!("http://{jwks_addr}/foo").parse().unwrap(),
|
||||
audience: None,
|
||||
role_names: roles.clone(),
|
||||
},
|
||||
AuthRule {
|
||||
id: "bar".to_owned(),
|
||||
jwks_url: format!("http://{jwks_addr}/bar").parse().unwrap(),
|
||||
audience: None,
|
||||
role_names: roles.clone(),
|
||||
},
|
||||
];
|
||||
|
||||
let fetch = Fetch(rules);
|
||||
let jwk_cache = JwkCache::default();
|
||||
|
||||
let endpoint = EndpointId::from("ep");
|
||||
|
||||
let jwk_cache = Arc::new(JwkCacheEntryLock::default());
|
||||
|
||||
let jwt1 = new_rsa_jwt("1".into(), rs1);
|
||||
let jwt2 = new_rsa_jwt("2".into(), rs2);
|
||||
let jwt3 = new_ec_jwt("3".into(), &ec1);
|
||||
let jwt4 = new_ec_jwt("4".into(), &ec2);
|
||||
|
||||
// had the wrong kid, therefore will have the wrong ecdsa signature
|
||||
let bad_jwt = new_ec_jwt("3".into(), &ec2);
|
||||
// this role_name is not accepted
|
||||
let bad_role_name = RoleName::from("cloud_admin");
|
||||
|
||||
let err = jwk_cache
|
||||
.check_jwt(
|
||||
&RequestMonitoring::test(),
|
||||
&bad_jwt,
|
||||
&client,
|
||||
endpoint.clone(),
|
||||
&role_name1,
|
||||
&fetch,
|
||||
)
|
||||
.await
|
||||
.unwrap_err();
|
||||
assert!(err.to_string().contains("signature error"));
|
||||
|
||||
let err = jwk_cache
|
||||
.check_jwt(
|
||||
&RequestMonitoring::test(),
|
||||
&jwt1,
|
||||
&client,
|
||||
endpoint.clone(),
|
||||
&bad_role_name,
|
||||
&fetch,
|
||||
)
|
||||
.await
|
||||
.unwrap_err();
|
||||
assert!(err.to_string().contains("jwk not found"));
|
||||
let jwt1 = new_rsa_jwt("rs1".into(), rs1);
|
||||
let jwt2 = new_rsa_jwt("rs2".into(), rs2);
|
||||
let jwt3 = new_ec_jwt("ec1".into(), &ec1);
|
||||
let jwt4 = new_ec_jwt("ec2".into(), &ec2);
|
||||
|
||||
let tokens = [jwt1, jwt2, jwt3, jwt4];
|
||||
let role_names = [role_name1, role_name2];
|
||||
@@ -940,15 +963,250 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
|
||||
jwk_cache
|
||||
.check_jwt(
|
||||
&RequestMonitoring::test(),
|
||||
token,
|
||||
&client,
|
||||
endpoint.clone(),
|
||||
role,
|
||||
&fetch,
|
||||
token,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// AWS Cognito escapes the `/` in the URL.
|
||||
#[tokio::test]
|
||||
async fn check_jwt_regression_cognito_issuer() {
|
||||
let (key, jwk) = new_ec_jwk("key".into());
|
||||
|
||||
let now = now();
|
||||
let token = new_custom_ec_jwt(
|
||||
"key".into(),
|
||||
&key,
|
||||
typed_json::json! {{
|
||||
"sub": "dd9a73fd-e785-4a13-aae1-e691ce43e89d",
|
||||
// cognito uses `\/`. I cannot replicated that easily here as serde_json will refuse
|
||||
// to write that escape character. instead I will make a bogus URL using `\` instead.
|
||||
"iss": "https:\\\\cognito-idp.us-west-2.amazonaws.com\\us-west-2_abcdefgh",
|
||||
"client_id": "abcdefghijklmnopqrstuvwxyz",
|
||||
"origin_jti": "6759d132-3fe7-446e-9e90-2fe7e8017893",
|
||||
"event_id": "ec9c36ab-b01d-46a0-94e4-87fde6767065",
|
||||
"token_use": "access",
|
||||
"scope": "aws.cognito.signin.user.admin",
|
||||
"auth_time":now,
|
||||
"exp":now + 60,
|
||||
"iat":now,
|
||||
"jti": "b241614b-0b93-4bdc-96db-0a3c7061d9c0",
|
||||
"username": "dd9a73fd-e785-4a13-aae1-e691ce43e89d",
|
||||
}},
|
||||
);
|
||||
|
||||
let jwks = jose_jwk::JwkSet { keys: vec![jwk] };
|
||||
|
||||
let jwks_addr = jwks_server(move |_path| Some(serde_json::to_vec(&jwks).unwrap())).await;
|
||||
|
||||
let role_name = RoleName::from("anonymous");
|
||||
let rules = vec![AuthRule {
|
||||
id: "aws-cognito".to_owned(),
|
||||
jwks_url: format!("http://{jwks_addr}/").parse().unwrap(),
|
||||
audience: None,
|
||||
role_names: vec![RoleNameInt::from(&role_name)],
|
||||
}];
|
||||
|
||||
let fetch = Fetch(rules);
|
||||
let jwk_cache = JwkCache::default();
|
||||
|
||||
let endpoint = EndpointId::from("ep");
|
||||
|
||||
jwk_cache
|
||||
.check_jwt(
|
||||
&RequestMonitoring::test(),
|
||||
endpoint.clone(),
|
||||
&role_name,
|
||||
&fetch,
|
||||
&token,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn check_jwt_invalid_signature() {
|
||||
let (_, jwk) = new_ec_jwk("1".into());
|
||||
let (key, _) = new_ec_jwk("1".into());
|
||||
|
||||
// has a matching kid, but signed by the wrong key
|
||||
let bad_jwt = new_ec_jwt("1".into(), &key);
|
||||
|
||||
let jwks = jose_jwk::JwkSet { keys: vec![jwk] };
|
||||
let jwks_addr = jwks_server(move |path| match path {
|
||||
"/" => Some(serde_json::to_vec(&jwks).unwrap()),
|
||||
_ => None,
|
||||
})
|
||||
.await;
|
||||
|
||||
let role = RoleName::from("authenticated");
|
||||
|
||||
let rules = vec![AuthRule {
|
||||
id: String::new(),
|
||||
jwks_url: format!("http://{jwks_addr}/").parse().unwrap(),
|
||||
audience: None,
|
||||
role_names: vec![RoleNameInt::from(&role)],
|
||||
}];
|
||||
|
||||
let fetch = Fetch(rules);
|
||||
let jwk_cache = JwkCache::default();
|
||||
|
||||
let ep = EndpointId::from("ep");
|
||||
|
||||
let ctx = RequestMonitoring::test();
|
||||
let err = jwk_cache
|
||||
.check_jwt(&ctx, ep, &role, &fetch, &bad_jwt)
|
||||
.await
|
||||
.unwrap_err();
|
||||
assert!(
|
||||
matches!(err, JwtError::Signature(_)),
|
||||
"expected \"signature error\", got {err:?}"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn check_jwt_unknown_role() {
|
||||
let (key, jwk) = new_rsa_jwk(RS1, "1".into());
|
||||
let jwt = new_rsa_jwt("1".into(), key);
|
||||
|
||||
let jwks = jose_jwk::JwkSet { keys: vec![jwk] };
|
||||
let jwks_addr = jwks_server(move |path| match path {
|
||||
"/" => Some(serde_json::to_vec(&jwks).unwrap()),
|
||||
_ => None,
|
||||
})
|
||||
.await;
|
||||
|
||||
let role = RoleName::from("authenticated");
|
||||
let rules = vec![AuthRule {
|
||||
id: String::new(),
|
||||
jwks_url: format!("http://{jwks_addr}/").parse().unwrap(),
|
||||
audience: None,
|
||||
role_names: vec![RoleNameInt::from(&role)],
|
||||
}];
|
||||
|
||||
let fetch = Fetch(rules);
|
||||
let jwk_cache = JwkCache::default();
|
||||
|
||||
let ep = EndpointId::from("ep");
|
||||
|
||||
// this role_name is not accepted
|
||||
let bad_role_name = RoleName::from("cloud_admin");
|
||||
|
||||
let ctx = RequestMonitoring::test();
|
||||
let err = jwk_cache
|
||||
.check_jwt(&ctx, ep, &bad_role_name, &fetch, &jwt)
|
||||
.await
|
||||
.unwrap_err();
|
||||
|
||||
assert!(
|
||||
matches!(err, JwtError::JwkNotFound),
|
||||
"expected \"jwk not found\", got {err:?}"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn check_jwt_invalid_claims() {
|
||||
let (key, jwk) = new_ec_jwk("1".into());
|
||||
|
||||
let jwks = jose_jwk::JwkSet { keys: vec![jwk] };
|
||||
let jwks_addr = jwks_server(move |path| match path {
|
||||
"/" => Some(serde_json::to_vec(&jwks).unwrap()),
|
||||
_ => None,
|
||||
})
|
||||
.await;
|
||||
|
||||
let now = SystemTime::now()
|
||||
.duration_since(SystemTime::UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_secs();
|
||||
|
||||
struct Test {
|
||||
body: serde_json::Value,
|
||||
error: JwtClaimsError,
|
||||
}
|
||||
|
||||
let table = vec![
|
||||
Test {
|
||||
body: json! {{
|
||||
"nbf": now + 60,
|
||||
"aud": "neon",
|
||||
}},
|
||||
error: JwtClaimsError::JwtTokenNotYetReadyToUse,
|
||||
},
|
||||
Test {
|
||||
body: json! {{
|
||||
"exp": now - 60,
|
||||
"aud": ["neon"],
|
||||
}},
|
||||
error: JwtClaimsError::JwtTokenHasExpired,
|
||||
},
|
||||
Test {
|
||||
body: json! {{
|
||||
}},
|
||||
error: JwtClaimsError::InvalidJwtTokenAudience,
|
||||
},
|
||||
Test {
|
||||
body: json! {{
|
||||
"aud": [],
|
||||
}},
|
||||
error: JwtClaimsError::InvalidJwtTokenAudience,
|
||||
},
|
||||
Test {
|
||||
body: json! {{
|
||||
"aud": "foo",
|
||||
}},
|
||||
error: JwtClaimsError::InvalidJwtTokenAudience,
|
||||
},
|
||||
Test {
|
||||
body: json! {{
|
||||
"aud": ["foo"],
|
||||
}},
|
||||
error: JwtClaimsError::InvalidJwtTokenAudience,
|
||||
},
|
||||
Test {
|
||||
body: json! {{
|
||||
"aud": ["foo", "bar"],
|
||||
}},
|
||||
error: JwtClaimsError::InvalidJwtTokenAudience,
|
||||
},
|
||||
];
|
||||
|
||||
let role = RoleName::from("authenticated");
|
||||
|
||||
let rules = vec![AuthRule {
|
||||
id: String::new(),
|
||||
jwks_url: format!("http://{jwks_addr}/").parse().unwrap(),
|
||||
audience: Some("neon".to_string()),
|
||||
role_names: vec![RoleNameInt::from(&role)],
|
||||
}];
|
||||
|
||||
let fetch = Fetch(rules);
|
||||
let jwk_cache = JwkCache::default();
|
||||
|
||||
let ep = EndpointId::from("ep");
|
||||
|
||||
let ctx = RequestMonitoring::test();
|
||||
for test in table {
|
||||
let jwt = new_custom_ec_jwt("1".into(), &key, test.body);
|
||||
|
||||
match jwk_cache
|
||||
.check_jwt(&ctx, ep.clone(), &role, &fetch, &jwt)
|
||||
.await
|
||||
{
|
||||
Err(JwtError::InvalidClaims(error)) if error == test.error => {}
|
||||
Err(err) => {
|
||||
panic!("expected {:?}, got {err:?}", test.error)
|
||||
}
|
||||
Ok(_payload) => {
|
||||
panic!("expected {:?}, got ok", test.error)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -137,9 +137,6 @@ struct ProxyCliArgs {
|
||||
/// size of the threadpool for password hashing
|
||||
#[clap(long, default_value_t = 4)]
|
||||
scram_thread_pool_size: u8,
|
||||
/// Disable dynamic rate limiter and store the metrics to ensure its production behaviour.
|
||||
#[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
|
||||
disable_dynamic_rate_limiter: bool,
|
||||
/// Endpoint rate limiter max number of requests per second.
|
||||
///
|
||||
/// Provided in the form `<Requests Per Second>@<Bucket Duration Size>`.
|
||||
@@ -615,9 +612,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
and metric-collection-interval must be specified"
|
||||
),
|
||||
};
|
||||
if !args.disable_dynamic_rate_limiter {
|
||||
bail!("dynamic rate limiter should be disabled");
|
||||
}
|
||||
|
||||
let config::ConcurrencyLockOptions {
|
||||
shards,
|
||||
|
||||
5
proxy/src/serverless/error.rs
Normal file
5
proxy/src/serverless/error.rs
Normal file
@@ -0,0 +1,5 @@
|
||||
use http::StatusCode;
|
||||
|
||||
pub trait HttpCodeError {
|
||||
fn get_http_status_code(&self) -> StatusCode;
|
||||
}
|
||||
@@ -6,6 +6,7 @@ mod backend;
|
||||
pub mod cancel_set;
|
||||
mod conn_pool;
|
||||
mod conn_pool_lib;
|
||||
mod error;
|
||||
mod http_conn_pool;
|
||||
mod http_util;
|
||||
mod json;
|
||||
|
||||
@@ -28,6 +28,7 @@ use uuid::Uuid;
|
||||
use super::backend::{LocalProxyConnError, PoolingBackend};
|
||||
use super::conn_pool::{AuthData, ConnInfoWithAuth};
|
||||
use super::conn_pool_lib::{self, ConnInfo};
|
||||
use super::error::HttpCodeError;
|
||||
use super::http_util::json_response;
|
||||
use super::json::{json_to_pg_text, pg_text_row_to_json, JsonConversionError};
|
||||
use super::local_conn_pool;
|
||||
@@ -238,7 +239,6 @@ fn get_conn_info(
|
||||
Ok(ConnInfoWithAuth { conn_info, auth })
|
||||
}
|
||||
|
||||
// TODO: return different http error codes
|
||||
pub(crate) async fn handle(
|
||||
config: &'static ProxyConfig,
|
||||
ctx: RequestMonitoring,
|
||||
@@ -319,9 +319,8 @@ pub(crate) async fn handle(
|
||||
"forwarding error to user"
|
||||
);
|
||||
|
||||
// TODO: this shouldn't always be bad request.
|
||||
json_response(
|
||||
StatusCode::BAD_REQUEST,
|
||||
e.get_http_status_code(),
|
||||
json!({
|
||||
"message": message,
|
||||
"code": code,
|
||||
@@ -405,6 +404,25 @@ impl UserFacingError for SqlOverHttpError {
|
||||
}
|
||||
}
|
||||
|
||||
impl HttpCodeError for SqlOverHttpError {
|
||||
fn get_http_status_code(&self) -> StatusCode {
|
||||
match self {
|
||||
SqlOverHttpError::ReadPayload(_) => StatusCode::BAD_REQUEST,
|
||||
SqlOverHttpError::ConnectCompute(h) => match h.get_error_kind() {
|
||||
ErrorKind::User => StatusCode::BAD_REQUEST,
|
||||
_ => StatusCode::INTERNAL_SERVER_ERROR,
|
||||
},
|
||||
SqlOverHttpError::ConnInfo(_) => StatusCode::BAD_REQUEST,
|
||||
SqlOverHttpError::RequestTooLarge(_) => StatusCode::PAYLOAD_TOO_LARGE,
|
||||
SqlOverHttpError::ResponseTooLarge(_) => StatusCode::INSUFFICIENT_STORAGE,
|
||||
SqlOverHttpError::InvalidIsolationLevel => StatusCode::BAD_REQUEST,
|
||||
SqlOverHttpError::Postgres(_) => StatusCode::BAD_REQUEST,
|
||||
SqlOverHttpError::JsonConversion(_) => StatusCode::INTERNAL_SERVER_ERROR,
|
||||
SqlOverHttpError::Cancelled(_) => StatusCode::INTERNAL_SERVER_ERROR,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum ReadPayloadError {
|
||||
#[error("could not read the HTTP request body: {0}")]
|
||||
|
||||
@@ -42,6 +42,13 @@ pytest-repeat = "^0.9.3"
|
||||
websockets = "^12.0"
|
||||
clickhouse-connect = "^0.7.16"
|
||||
kafka-python = "^2.0.2"
|
||||
jwcrypto = "^1.5.6"
|
||||
h2 = "^4.1.0"
|
||||
types-jwcrypto = "^1.5.0.20240925"
|
||||
pyyaml = "^6.0.2"
|
||||
types-pyyaml = "^6.0.12.20240917"
|
||||
testcontainers = "^4.8.1"
|
||||
jsonnet = "^0.20.0"
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
mypy = "==1.3.0"
|
||||
@@ -70,12 +77,14 @@ strict = true
|
||||
|
||||
[[tool.mypy.overrides]]
|
||||
module = [
|
||||
"_jsonnet.*",
|
||||
"asyncpg.*",
|
||||
"pg8000.*",
|
||||
"allure.*",
|
||||
"allure_commons.*",
|
||||
"allure_pytest.*",
|
||||
"kafka.*",
|
||||
"testcontainers.*",
|
||||
]
|
||||
ignore_missing_imports = true
|
||||
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
//!
|
||||
|
||||
use anyhow::Context;
|
||||
use bytes::Bytes;
|
||||
use postgres_backend::QueryError;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
@@ -176,7 +175,7 @@ pub async fn append_logical_message(
|
||||
truncate_lsn: msg.truncate_lsn,
|
||||
proposer_uuid: [0u8; 16],
|
||||
},
|
||||
wal_data: Bytes::from(wal_data),
|
||||
wal_data,
|
||||
});
|
||||
|
||||
let response = tli.process_msg(&append_request).await?;
|
||||
|
||||
@@ -21,18 +21,15 @@ use postgres_backend::QueryError;
|
||||
use pq_proto::BeMessage;
|
||||
use serde::Deserialize;
|
||||
use serde::Serialize;
|
||||
use std::future;
|
||||
use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
use tokio::io::AsyncRead;
|
||||
use tokio::io::AsyncWrite;
|
||||
use tokio::sync::mpsc::channel;
|
||||
use tokio::sync::mpsc::error::TryRecvError;
|
||||
use tokio::sync::mpsc::Receiver;
|
||||
use tokio::sync::mpsc::Sender;
|
||||
use tokio::sync::mpsc::{channel, Receiver, Sender};
|
||||
use tokio::task;
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio::time::Duration;
|
||||
use tokio::time::Instant;
|
||||
use tokio::time::{Duration, MissedTickBehavior};
|
||||
use tracing::*;
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
@@ -444,9 +441,9 @@ async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
}
|
||||
}
|
||||
|
||||
// Send keepalive messages to walproposer, to make sure it receives updates
|
||||
// even when it writes a steady stream of messages.
|
||||
const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1);
|
||||
/// The WAL flush interval. This ensures we periodically flush the WAL and send AppendResponses to
|
||||
/// walproposer, even when it's writing a steady stream of messages.
|
||||
const FLUSH_INTERVAL: Duration = Duration::from_secs(1);
|
||||
|
||||
/// Encapsulates a task which takes messages from msg_rx, processes and pushes
|
||||
/// replies to reply_tx.
|
||||
@@ -494,67 +491,76 @@ impl WalAcceptor {
|
||||
async fn run(&mut self) -> anyhow::Result<()> {
|
||||
let walreceiver_guard = self.tli.get_walreceivers().register(self.conn_id);
|
||||
|
||||
// After this timestamp we will stop processing AppendRequests and send a response
|
||||
// to the walproposer. walproposer sends at least one AppendRequest per second,
|
||||
// we will send keepalives by replying to these requests once per second.
|
||||
let mut next_keepalive = Instant::now();
|
||||
// Periodically flush the WAL.
|
||||
let mut flush_ticker = tokio::time::interval(FLUSH_INTERVAL);
|
||||
flush_ticker.set_missed_tick_behavior(MissedTickBehavior::Delay);
|
||||
flush_ticker.tick().await; // skip the initial, immediate tick
|
||||
|
||||
while let Some(mut next_msg) = self.msg_rx.recv().await {
|
||||
// Update walreceiver state in shmem for reporting.
|
||||
if let ProposerAcceptorMessage::Elected(_) = &next_msg {
|
||||
walreceiver_guard.get().status = WalReceiverStatus::Streaming;
|
||||
}
|
||||
// Tracks unflushed appends.
|
||||
let mut dirty = false;
|
||||
|
||||
let reply_msg = if matches!(next_msg, ProposerAcceptorMessage::AppendRequest(_)) {
|
||||
// Loop through AppendRequests while available to write as many WAL records as
|
||||
// possible without fsyncing.
|
||||
//
|
||||
// Make sure the WAL is flushed before returning, see:
|
||||
// https://github.com/neondatabase/neon/issues/9259
|
||||
//
|
||||
// Note: this will need to be rewritten if we want to read non-AppendRequest messages here.
|
||||
// Otherwise, we might end up in a situation where we read a message, but don't
|
||||
// process it.
|
||||
while let ProposerAcceptorMessage::AppendRequest(append_request) = next_msg {
|
||||
let noflush_msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request);
|
||||
|
||||
if let Some(reply) = self.tli.process_msg(&noflush_msg).await? {
|
||||
if self.reply_tx.send(reply).await.is_err() {
|
||||
break; // disconnected, flush WAL and return on next send/recv
|
||||
}
|
||||
}
|
||||
|
||||
// get out of this loop if keepalive time is reached
|
||||
if Instant::now() >= next_keepalive {
|
||||
loop {
|
||||
let reply = tokio::select! {
|
||||
// Process inbound message.
|
||||
msg = self.msg_rx.recv() => {
|
||||
// If disconnected, break to flush WAL and return.
|
||||
let Some(mut msg) = msg else {
|
||||
break;
|
||||
};
|
||||
|
||||
// Update walreceiver state in shmem for reporting.
|
||||
if let ProposerAcceptorMessage::Elected(_) = &msg {
|
||||
walreceiver_guard.get().status = WalReceiverStatus::Streaming;
|
||||
}
|
||||
|
||||
// continue pulling AppendRequests if available
|
||||
match self.msg_rx.try_recv() {
|
||||
Ok(msg) => next_msg = msg,
|
||||
Err(TryRecvError::Empty) => break,
|
||||
// on disconnect, flush WAL and return on next send/recv
|
||||
Err(TryRecvError::Disconnected) => break,
|
||||
};
|
||||
// Don't flush the WAL on every append, only periodically via flush_ticker.
|
||||
// This batches multiple appends per fsync. If the channel is empty after
|
||||
// sending the reply, we'll schedule an immediate flush.
|
||||
if let ProposerAcceptorMessage::AppendRequest(append_request) = msg {
|
||||
msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request);
|
||||
dirty = true;
|
||||
}
|
||||
|
||||
self.tli.process_msg(&msg).await?
|
||||
}
|
||||
|
||||
// flush all written WAL to the disk
|
||||
self.tli
|
||||
.process_msg(&ProposerAcceptorMessage::FlushWAL)
|
||||
.await?
|
||||
} else {
|
||||
// process message other than AppendRequest
|
||||
self.tli.process_msg(&next_msg).await?
|
||||
// While receiving AppendRequests, flush the WAL periodically and respond with an
|
||||
// AppendResponse to let walproposer know we're still alive.
|
||||
_ = flush_ticker.tick(), if dirty => {
|
||||
dirty = false;
|
||||
self.tli
|
||||
.process_msg(&ProposerAcceptorMessage::FlushWAL)
|
||||
.await?
|
||||
}
|
||||
|
||||
// If there are no pending messages, flush the WAL immediately.
|
||||
//
|
||||
// TODO: this should be done via flush_ticker.reset_immediately(), but that's always
|
||||
// delayed by 1ms due to this bug: https://github.com/tokio-rs/tokio/issues/6866.
|
||||
_ = future::ready(()), if dirty && self.msg_rx.is_empty() => {
|
||||
dirty = false;
|
||||
flush_ticker.reset();
|
||||
self.tli
|
||||
.process_msg(&ProposerAcceptorMessage::FlushWAL)
|
||||
.await?
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(reply) = reply_msg {
|
||||
// Send reply, if any.
|
||||
if let Some(reply) = reply {
|
||||
if self.reply_tx.send(reply).await.is_err() {
|
||||
return Ok(()); // chan closed, streaming terminated
|
||||
break; // disconnected, break to flush WAL and return
|
||||
}
|
||||
// reset keepalive time
|
||||
next_keepalive = Instant::now() + KEEPALIVE_INTERVAL;
|
||||
}
|
||||
}
|
||||
|
||||
// Flush WAL on disconnect, see https://github.com/neondatabase/neon/issues/9259.
|
||||
if dirty {
|
||||
self.tli
|
||||
.process_msg(&ProposerAcceptorMessage::FlushWAL)
|
||||
.await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -151,8 +151,7 @@ impl WalProposer {
|
||||
for _ in 0..cnt {
|
||||
self.disk
|
||||
.lock()
|
||||
.insert_logical_message("prefix", b"message")
|
||||
.expect("failed to generate logical message");
|
||||
.insert_logical_message(c"prefix", b"message");
|
||||
}
|
||||
|
||||
let end_lsn = self.disk.lock().flush_rec_ptr();
|
||||
|
||||
@@ -1,24 +1,7 @@
|
||||
use std::{ffi::CString, sync::Arc};
|
||||
use std::{ffi::CStr, sync::Arc};
|
||||
|
||||
use byteorder::{LittleEndian, WriteBytesExt};
|
||||
use crc32c::crc32c_append;
|
||||
use parking_lot::{Mutex, MutexGuard};
|
||||
use postgres_ffi::{
|
||||
pg_constants::{
|
||||
RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE, XLP_LONG_HEADER, XLR_BLOCK_ID_DATA_LONG,
|
||||
XLR_BLOCK_ID_DATA_SHORT,
|
||||
},
|
||||
v16::{
|
||||
wal_craft_test_export::{XLogLongPageHeaderData, XLogPageHeaderData, XLOG_PAGE_MAGIC},
|
||||
xlog_utils::{
|
||||
XLogSegNoOffsetToRecPtr, XlLogicalMessage, XLOG_RECORD_CRC_OFFS,
|
||||
XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
|
||||
XLP_FIRST_IS_CONTRECORD,
|
||||
},
|
||||
XLogRecord,
|
||||
},
|
||||
WAL_SEGMENT_SIZE, XLOG_BLCKSZ,
|
||||
};
|
||||
use postgres_ffi::v16::wal_generator::WalGenerator;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use super::block_storage::BlockStorage;
|
||||
@@ -35,6 +18,7 @@ impl DiskWalProposer {
|
||||
internal_available_lsn: Lsn(0),
|
||||
prev_lsn: Lsn(0),
|
||||
disk: BlockStorage::new(),
|
||||
wal_generator: WalGenerator::new(),
|
||||
}),
|
||||
})
|
||||
}
|
||||
@@ -51,6 +35,8 @@ pub struct State {
|
||||
prev_lsn: Lsn,
|
||||
// actual WAL storage
|
||||
disk: BlockStorage,
|
||||
// WAL record generator
|
||||
wal_generator: WalGenerator,
|
||||
}
|
||||
|
||||
impl State {
|
||||
@@ -66,6 +52,9 @@ impl State {
|
||||
/// Update the internal available LSN to the given value.
|
||||
pub fn reset_to(&mut self, lsn: Lsn) {
|
||||
self.internal_available_lsn = lsn;
|
||||
self.prev_lsn = Lsn(0); // Safekeeper doesn't care if this is omitted
|
||||
self.wal_generator.lsn = self.internal_available_lsn;
|
||||
self.wal_generator.prev_lsn = self.prev_lsn;
|
||||
}
|
||||
|
||||
/// Get current LSN.
|
||||
@@ -73,242 +62,11 @@ impl State {
|
||||
self.internal_available_lsn
|
||||
}
|
||||
|
||||
/// Generate a new WAL record at the current LSN.
|
||||
pub fn insert_logical_message(&mut self, prefix: &str, msg: &[u8]) -> anyhow::Result<()> {
|
||||
let prefix_cstr = CString::new(prefix)?;
|
||||
let prefix_bytes = prefix_cstr.as_bytes_with_nul();
|
||||
|
||||
let lm = XlLogicalMessage {
|
||||
db_id: 0,
|
||||
transactional: 0,
|
||||
prefix_size: prefix_bytes.len() as ::std::os::raw::c_ulong,
|
||||
message_size: msg.len() as ::std::os::raw::c_ulong,
|
||||
};
|
||||
|
||||
let record_bytes = lm.encode();
|
||||
let rdatas: Vec<&[u8]> = vec![&record_bytes, prefix_bytes, msg];
|
||||
insert_wal_record(self, rdatas, RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE)
|
||||
}
|
||||
}
|
||||
|
||||
fn insert_wal_record(
|
||||
state: &mut State,
|
||||
rdatas: Vec<&[u8]>,
|
||||
rmid: u8,
|
||||
info: u8,
|
||||
) -> anyhow::Result<()> {
|
||||
// bytes right after the header, in the same rdata block
|
||||
let mut scratch = Vec::new();
|
||||
let mainrdata_len: usize = rdatas.iter().map(|rdata| rdata.len()).sum();
|
||||
|
||||
if mainrdata_len > 0 {
|
||||
if mainrdata_len > 255 {
|
||||
scratch.push(XLR_BLOCK_ID_DATA_LONG);
|
||||
// TODO: verify endiness
|
||||
let _ = scratch.write_u32::<LittleEndian>(mainrdata_len as u32);
|
||||
} else {
|
||||
scratch.push(XLR_BLOCK_ID_DATA_SHORT);
|
||||
scratch.push(mainrdata_len as u8);
|
||||
}
|
||||
}
|
||||
|
||||
let total_len: u32 = (XLOG_SIZE_OF_XLOG_RECORD + scratch.len() + mainrdata_len) as u32;
|
||||
let size = maxalign(total_len);
|
||||
assert!(size as usize > XLOG_SIZE_OF_XLOG_RECORD);
|
||||
|
||||
let start_bytepos = recptr_to_bytepos(state.internal_available_lsn);
|
||||
let end_bytepos = start_bytepos + size as u64;
|
||||
|
||||
let start_recptr = bytepos_to_recptr(start_bytepos);
|
||||
let end_recptr = bytepos_to_recptr(end_bytepos);
|
||||
|
||||
assert!(recptr_to_bytepos(start_recptr) == start_bytepos);
|
||||
assert!(recptr_to_bytepos(end_recptr) == end_bytepos);
|
||||
|
||||
let mut crc = crc32c_append(0, &scratch);
|
||||
for rdata in &rdatas {
|
||||
crc = crc32c_append(crc, rdata);
|
||||
}
|
||||
|
||||
let mut header = XLogRecord {
|
||||
xl_tot_len: total_len,
|
||||
xl_xid: 0,
|
||||
xl_prev: state.prev_lsn.0,
|
||||
xl_info: info,
|
||||
xl_rmid: rmid,
|
||||
__bindgen_padding_0: [0u8; 2usize],
|
||||
xl_crc: crc,
|
||||
};
|
||||
|
||||
// now we have the header and can finish the crc
|
||||
let header_bytes = header.encode()?;
|
||||
let crc = crc32c_append(crc, &header_bytes[0..XLOG_RECORD_CRC_OFFS]);
|
||||
header.xl_crc = crc;
|
||||
|
||||
let mut header_bytes = header.encode()?.to_vec();
|
||||
assert!(header_bytes.len() == XLOG_SIZE_OF_XLOG_RECORD);
|
||||
|
||||
header_bytes.extend_from_slice(&scratch);
|
||||
|
||||
// finish rdatas
|
||||
let mut rdatas = rdatas;
|
||||
rdatas.insert(0, &header_bytes);
|
||||
|
||||
write_walrecord_to_disk(state, total_len as u64, rdatas, start_recptr, end_recptr)?;
|
||||
|
||||
state.internal_available_lsn = end_recptr;
|
||||
state.prev_lsn = start_recptr;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_walrecord_to_disk(
|
||||
state: &mut State,
|
||||
total_len: u64,
|
||||
rdatas: Vec<&[u8]>,
|
||||
start: Lsn,
|
||||
end: Lsn,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut curr_ptr = start;
|
||||
let mut freespace = insert_freespace(curr_ptr);
|
||||
let mut written: usize = 0;
|
||||
|
||||
assert!(freespace >= size_of::<u32>());
|
||||
|
||||
for mut rdata in rdatas {
|
||||
while rdata.len() >= freespace {
|
||||
assert!(
|
||||
curr_ptr.segment_offset(WAL_SEGMENT_SIZE) >= XLOG_SIZE_OF_XLOG_SHORT_PHD
|
||||
|| freespace == 0
|
||||
);
|
||||
|
||||
state.write(curr_ptr.0, &rdata[..freespace]);
|
||||
rdata = &rdata[freespace..];
|
||||
written += freespace;
|
||||
curr_ptr = Lsn(curr_ptr.0 + freespace as u64);
|
||||
|
||||
let mut new_page = XLogPageHeaderData {
|
||||
xlp_magic: XLOG_PAGE_MAGIC as u16,
|
||||
xlp_info: XLP_BKP_REMOVABLE,
|
||||
xlp_tli: 1,
|
||||
xlp_pageaddr: curr_ptr.0,
|
||||
xlp_rem_len: (total_len - written as u64) as u32,
|
||||
..Default::default() // Put 0 in padding fields.
|
||||
};
|
||||
if new_page.xlp_rem_len > 0 {
|
||||
new_page.xlp_info |= XLP_FIRST_IS_CONTRECORD;
|
||||
}
|
||||
|
||||
if curr_ptr.segment_offset(WAL_SEGMENT_SIZE) == 0 {
|
||||
new_page.xlp_info |= XLP_LONG_HEADER;
|
||||
let long_page = XLogLongPageHeaderData {
|
||||
std: new_page,
|
||||
xlp_sysid: 0,
|
||||
xlp_seg_size: WAL_SEGMENT_SIZE as u32,
|
||||
xlp_xlog_blcksz: XLOG_BLCKSZ as u32,
|
||||
};
|
||||
let header_bytes = long_page.encode()?;
|
||||
assert!(header_bytes.len() == XLOG_SIZE_OF_XLOG_LONG_PHD);
|
||||
state.write(curr_ptr.0, &header_bytes);
|
||||
curr_ptr = Lsn(curr_ptr.0 + header_bytes.len() as u64);
|
||||
} else {
|
||||
let header_bytes = new_page.encode()?;
|
||||
assert!(header_bytes.len() == XLOG_SIZE_OF_XLOG_SHORT_PHD);
|
||||
state.write(curr_ptr.0, &header_bytes);
|
||||
curr_ptr = Lsn(curr_ptr.0 + header_bytes.len() as u64);
|
||||
}
|
||||
freespace = insert_freespace(curr_ptr);
|
||||
}
|
||||
|
||||
assert!(
|
||||
curr_ptr.segment_offset(WAL_SEGMENT_SIZE) >= XLOG_SIZE_OF_XLOG_SHORT_PHD
|
||||
|| rdata.is_empty()
|
||||
);
|
||||
state.write(curr_ptr.0, rdata);
|
||||
curr_ptr = Lsn(curr_ptr.0 + rdata.len() as u64);
|
||||
written += rdata.len();
|
||||
freespace -= rdata.len();
|
||||
}
|
||||
|
||||
assert!(written == total_len as usize);
|
||||
curr_ptr.0 = maxalign(curr_ptr.0);
|
||||
assert!(curr_ptr == end);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn maxalign<T>(size: T) -> T
|
||||
where
|
||||
T: std::ops::BitAnd<Output = T>
|
||||
+ std::ops::Add<Output = T>
|
||||
+ std::ops::Not<Output = T>
|
||||
+ From<u8>,
|
||||
{
|
||||
(size + T::from(7)) & !T::from(7)
|
||||
}
|
||||
|
||||
fn insert_freespace(ptr: Lsn) -> usize {
|
||||
if ptr.block_offset() == 0 {
|
||||
0
|
||||
} else {
|
||||
(XLOG_BLCKSZ as u64 - ptr.block_offset()) as usize
|
||||
}
|
||||
}
|
||||
|
||||
const XLP_BKP_REMOVABLE: u16 = 0x0004;
|
||||
const USABLE_BYTES_IN_PAGE: u64 = (XLOG_BLCKSZ - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64;
|
||||
const USABLE_BYTES_IN_SEGMENT: u64 = ((WAL_SEGMENT_SIZE / XLOG_BLCKSZ) as u64
|
||||
* USABLE_BYTES_IN_PAGE)
|
||||
- (XLOG_SIZE_OF_XLOG_RECORD - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64;
|
||||
|
||||
fn bytepos_to_recptr(bytepos: u64) -> Lsn {
|
||||
let fullsegs = bytepos / USABLE_BYTES_IN_SEGMENT;
|
||||
let mut bytesleft = bytepos % USABLE_BYTES_IN_SEGMENT;
|
||||
|
||||
let seg_offset = if bytesleft < (XLOG_BLCKSZ - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64 {
|
||||
// fits on first page of segment
|
||||
bytesleft + XLOG_SIZE_OF_XLOG_SHORT_PHD as u64
|
||||
} else {
|
||||
// account for the first page on segment with long header
|
||||
bytesleft -= (XLOG_BLCKSZ - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64;
|
||||
let fullpages = bytesleft / USABLE_BYTES_IN_PAGE;
|
||||
bytesleft %= USABLE_BYTES_IN_PAGE;
|
||||
|
||||
XLOG_BLCKSZ as u64
|
||||
+ fullpages * XLOG_BLCKSZ as u64
|
||||
+ bytesleft
|
||||
+ XLOG_SIZE_OF_XLOG_SHORT_PHD as u64
|
||||
};
|
||||
|
||||
Lsn(XLogSegNoOffsetToRecPtr(
|
||||
fullsegs,
|
||||
seg_offset as u32,
|
||||
WAL_SEGMENT_SIZE,
|
||||
))
|
||||
}
|
||||
|
||||
fn recptr_to_bytepos(ptr: Lsn) -> u64 {
|
||||
let fullsegs = ptr.segment_number(WAL_SEGMENT_SIZE);
|
||||
let offset = ptr.segment_offset(WAL_SEGMENT_SIZE) as u64;
|
||||
|
||||
let fullpages = offset / XLOG_BLCKSZ as u64;
|
||||
let offset = offset % XLOG_BLCKSZ as u64;
|
||||
|
||||
if fullpages == 0 {
|
||||
fullsegs * USABLE_BYTES_IN_SEGMENT
|
||||
+ if offset > 0 {
|
||||
assert!(offset >= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64);
|
||||
offset - XLOG_SIZE_OF_XLOG_SHORT_PHD as u64
|
||||
} else {
|
||||
0
|
||||
}
|
||||
} else {
|
||||
fullsegs * USABLE_BYTES_IN_SEGMENT
|
||||
+ (XLOG_BLCKSZ - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64
|
||||
+ (fullpages - 1) * USABLE_BYTES_IN_PAGE
|
||||
+ if offset > 0 {
|
||||
assert!(offset >= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64);
|
||||
offset - XLOG_SIZE_OF_XLOG_SHORT_PHD as u64
|
||||
} else {
|
||||
0
|
||||
}
|
||||
/// Inserts a logical record in the WAL at the current LSN.
|
||||
pub fn insert_logical_message(&mut self, prefix: &CStr, msg: &[u8]) {
|
||||
let record = self.wal_generator.generate_logical_message(prefix, msg);
|
||||
self.disk.write(self.internal_available_lsn.into(), &record);
|
||||
self.prev_lsn = self.internal_available_lsn;
|
||||
self.internal_available_lsn += record.len() as u64;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4958,16 +4958,7 @@ impl Service {
|
||||
stripe_size,
|
||||
},
|
||||
placement_policy: Some(PlacementPolicy::Attached(0)), // No secondaries, for convenient debug/hacking
|
||||
|
||||
// There is no way to know what the tenant's config was: revert to defaults
|
||||
//
|
||||
// TODO: remove `switch_aux_file_policy` once we finish auxv2 migration
|
||||
//
|
||||
// we write to both v1+v2 storage, so that the test case can use either storage format for testing
|
||||
config: TenantConfig {
|
||||
switch_aux_file_policy: Some(models::AuxFilePolicy::CrossValidation),
|
||||
..TenantConfig::default()
|
||||
},
|
||||
config: TenantConfig::default(),
|
||||
})
|
||||
.await?;
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@ from __future__ import annotations
|
||||
pytest_plugins = (
|
||||
"fixtures.pg_version",
|
||||
"fixtures.parametrize",
|
||||
"fixtures.h2server",
|
||||
"fixtures.httpserver",
|
||||
"fixtures.compute_reconfigure",
|
||||
"fixtures.storage_controller_proxy",
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user