Compare commits

..

5 Commits

Author SHA1 Message Date
Conrad Ludgate
c0e1e1dd74 standardise logging 2025-06-25 15:54:26 +01:00
Conrad Ludgate
010cd34635 simplify error handling 2025-06-25 15:54:26 +01:00
Conrad Ludgate
11ffe4c86c remove unused error retries 2025-06-25 15:54:26 +01:00
Conrad Ludgate
d6a5085664 move h2::handshake outside of hypermechanism 2025-06-25 15:54:26 +01:00
Conrad Ludgate
16d9889a51 move authenticate outside of tokiomechanism 2025-06-25 15:54:26 +01:00
192 changed files with 4323 additions and 11923 deletions

View File

@@ -4,7 +4,6 @@
!Cargo.lock
!Cargo.toml
!Makefile
!postgres.mk
!rust-toolchain.toml
!scripts/ninstall.sh
!docker-compose/run-tests.sh

View File

@@ -94,6 +94,11 @@ jobs:
run: |
make "neon-pg-ext-${{ matrix.postgres-version }}" -j$(sysctl -n hw.ncpu)
- name: Get postgres headers ${{ matrix.postgres-version }}
if: steps.cache_pg.outputs.cache-hit != 'true'
run: |
make postgres-headers-${{ matrix.postgres-version }} -j$(sysctl -n hw.ncpu)
- name: Upload "pg_install/${{ matrix.postgres-version }}" artifact
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
@@ -135,12 +140,6 @@ jobs:
name: pg_install--v17
path: pg_install/v17
# `actions/download-artifact` doesn't preserve permissions:
# https://github.com/actions/download-artifact?tab=readme-ov-file#permission-loss
- name: Make pg_install/v*/bin/* executable
run: |
chmod +x pg_install/v*/bin/*
- name: Cache walproposer-lib
id: cache_walproposer_lib
uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
@@ -168,7 +167,7 @@ jobs:
- name: Build walproposer-lib (only for v17)
if: steps.cache_walproposer_lib.outputs.cache-hit != 'true'
run:
make walproposer-lib -j$(sysctl -n hw.ncpu) PG_INSTALL_CACHED=1
make walproposer-lib -j$(sysctl -n hw.ncpu)
- name: Upload "build/walproposer-lib" artifact
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2

View File

@@ -69,7 +69,7 @@ jobs:
submodules: true
- name: Check for file changes
uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 # v3.0.2
uses: step-security/paths-filter@v3
id: files-changed
with:
token: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -153,7 +153,7 @@ jobs:
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
- name: Benchmark database maintenance
if: ${{ matrix.test_maintenance }}
if: ${{ matrix.test_maintenance == 'true' }}
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}

View File

@@ -53,7 +53,7 @@ jobs:
submodules: true
- name: Check for Postgres changes
uses: dorny/paths-filter@1441771bbfdd59dcd748680ee64ebd8faab1a242 #v3
uses: step-security/paths-filter@v3
id: files_changed
with:
token: ${{ github.token }}

View File

@@ -34,7 +34,7 @@ jobs:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
- uses: step-security/changed-files@3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1
id: python-src
with:
files: |
@@ -45,7 +45,7 @@ jobs:
poetry.lock
pyproject.toml
- uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
- uses: step-security/changed-files@3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1
id: rust-src
with:
files: |

View File

@@ -60,23 +60,22 @@ jobs:
} >> "$GITHUB_ENV"
- name: Run proxy-bench
run: ${PROXY_BENCH_PATH}/run.sh
run: ./${PROXY_BENCH_PATH}/run.sh
- name: Ingest Bench Results # neon repo script
if: always()
if: success()
run: |
mkdir -p $TEST_OUTPUT
python $NEON_DIR/scripts/proxy_bench_results_ingest.py --out $TEST_OUTPUT
- name: Push Metrics to Proxy perf database
if: always()
if: success()
env:
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PROXY_TEST_RESULT_CONNSTR }}"
REPORT_FROM: $TEST_OUTPUT
run: $NEON_DIR/scripts/generate_and_push_perf_report.sh
- name: Docker cleanup
if: always()
run: docker compose down
- name: Notify Failure

6
.gitignore vendored
View File

@@ -6,7 +6,6 @@
/tmp_check_cli
__pycache__/
test_output/
neon_previous/
.vscode
.idea
*.swp
@@ -25,11 +24,6 @@ compaction-suite-results.*
*.o
*.so
*.Po
*.pid
# pgindent typedef lists
*.list
# various files for local testing
/proxy/.subzero
local_proxy.json

3235
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -152,7 +152,6 @@ pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointe
procfs = "0.16"
prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
prost = "0.13.5"
prost-types = "0.13.5"
rand = "0.8"
redis = { version = "0.29.2", features = ["tokio-rustls-comp", "keep-alive"] }
regex = "1.10.2"
@@ -200,7 +199,7 @@ tokio-postgres-rustls = "0.12.0"
tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
tokio-stream = "0.1"
tokio-tar = "0.3"
tokio-util = { version = "0.7.10", features = ["io", "io-util", "rt"] }
tokio-util = { version = "0.7.10", features = ["io", "rt"] }
toml = "0.8"
toml_edit = "0.22"
tonic = { version = "0.13.1", default-features = false, features = ["channel", "codegen", "gzip", "prost", "router", "server", "tls-ring", "tls-native-roots", "zstd"] }

View File

@@ -40,7 +40,6 @@ COPY --chown=nonroot vendor/postgres-v16 vendor/postgres-v16
COPY --chown=nonroot vendor/postgres-v17 vendor/postgres-v17
COPY --chown=nonroot pgxn pgxn
COPY --chown=nonroot Makefile Makefile
COPY --chown=nonroot postgres.mk postgres.mk
COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh
ENV BUILD_TYPE=release

129
Makefile
View File

@@ -4,14 +4,11 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
# managers.
POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/
# Supported PostgreSQL versions
POSTGRES_VERSIONS = v17 v16 v15 v14
# CARGO_BUILD_FLAGS: Extra flags to pass to `cargo build`. `--locked`
# and `--features testing` are popular examples.
#
# CARGO_PROFILE: Set to override the cargo profile to use. By default,
# it is derived from BUILD_TYPE.
# CARGO_PROFILE: You can also set to override the cargo profile to
# use. By default, it is derived from BUILD_TYPE.
# All intermediate build artifacts are stored here.
BUILD_DIR := build
@@ -98,24 +95,91 @@ CACHEDIR_TAG_CONTENTS := "Signature: 8a477f597d28d172789f06886806bc55"
# Top level Makefile to build Neon and PostgreSQL
#
.PHONY: all
all: neon postgres-install neon-pg-ext
all: neon postgres neon-pg-ext
### Neon Rust bits
#
# The 'postgres_ffi' depends on the Postgres headers.
.PHONY: neon
neon: postgres-headers-install walproposer-lib cargo-target-dir
neon: postgres-headers walproposer-lib cargo-target-dir
+@echo "Compiling Neon"
$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS) $(CARGO_PROFILE)
.PHONY: cargo-target-dir
cargo-target-dir:
# https://github.com/rust-lang/cargo/issues/14281
mkdir -p target
test -e target/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > target/CACHEDIR.TAG
### PostgreSQL parts
# Some rules are duplicated for Postgres v14 and 15. We may want to refactor
# to avoid the duplication in the future, but it's tolerable for now.
#
$(BUILD_DIR)/%/config.status:
mkdir -p $(BUILD_DIR)
test -e $(BUILD_DIR)/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > $(BUILD_DIR)/CACHEDIR.TAG
+@echo "Configuring Postgres $* build"
@test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \
echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \
echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \
exit 1; }
mkdir -p $(BUILD_DIR)/$*
VERSION=$*; \
EXTRA_VERSION=$$(cd $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION && git rev-parse HEAD); \
(cd $(BUILD_DIR)/$$VERSION && \
env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION/configure \
CFLAGS='$(PG_CFLAGS)' LDFLAGS='$(PG_LDFLAGS)' \
$(PG_CONFIGURE_OPTS) --with-extra-version=" ($$EXTRA_VERSION)" \
--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$$VERSION > configure.log)
# nicer alias to run 'configure'
# Note: I've been unable to use templates for this part of our configuration.
# I'm not sure why it wouldn't work, but this is the only place (apart from
# the "build-all-versions" entry points) where direct mention of PostgreSQL
# versions is used.
.PHONY: postgres-configure-v17
postgres-configure-v17: $(BUILD_DIR)/v17/config.status
.PHONY: postgres-configure-v16
postgres-configure-v16: $(BUILD_DIR)/v16/config.status
.PHONY: postgres-configure-v15
postgres-configure-v15: $(BUILD_DIR)/v15/config.status
.PHONY: postgres-configure-v14
postgres-configure-v14: $(BUILD_DIR)/v14/config.status
# Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)/<version>/include
.PHONY: postgres-headers-%
postgres-headers-%: postgres-configure-%
+@echo "Installing PostgreSQL $* headers"
$(MAKE) -C $(BUILD_DIR)/$*/src/include MAKELEVEL=0 install
# Compile and install PostgreSQL
.PHONY: postgres-%
postgres-%: postgres-configure-% \
postgres-headers-% # to prevent `make install` conflicts with neon's `postgres-headers`
+@echo "Compiling PostgreSQL $*"
$(MAKE) -C $(BUILD_DIR)/$* MAKELEVEL=0 install
+@echo "Compiling pg_prewarm $*"
$(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_prewarm install
+@echo "Compiling pg_buffercache $*"
$(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_buffercache install
+@echo "Compiling pg_visibility $*"
$(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_visibility install
+@echo "Compiling pageinspect $*"
$(MAKE) -C $(BUILD_DIR)/$*/contrib/pageinspect install
+@echo "Compiling pg_trgm $*"
$(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_trgm install
+@echo "Compiling amcheck $*"
$(MAKE) -C $(BUILD_DIR)/$*/contrib/amcheck install
+@echo "Compiling test_decoding $*"
$(MAKE) -C $(BUILD_DIR)/$*/contrib/test_decoding install
.PHONY: postgres-check-%
postgres-check-%: postgres-%
$(MAKE) -C $(BUILD_DIR)/$* MAKELEVEL=0 check
.PHONY: neon-pg-ext-%
neon-pg-ext-%: postgres-install-%
neon-pg-ext-%: postgres-%
+@echo "Compiling neon-specific Postgres extensions for $*"
mkdir -p $(BUILD_DIR)/pgxn-$*
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
@@ -154,14 +218,39 @@ ifeq ($(UNAME_S),Linux)
pg_crc32c.o
endif
# Shorthand to call neon-pg-ext-% target for all Postgres versions
.PHONY: neon-pg-ext
neon-pg-ext: $(foreach pg_version,$(POSTGRES_VERSIONS),neon-pg-ext-$(pg_version))
neon-pg-ext: \
neon-pg-ext-v14 \
neon-pg-ext-v15 \
neon-pg-ext-v16 \
neon-pg-ext-v17
# shorthand to build all Postgres versions
.PHONY: postgres
postgres: \
postgres-v14 \
postgres-v15 \
postgres-v16 \
postgres-v17
.PHONY: postgres-headers
postgres-headers: \
postgres-headers-v14 \
postgres-headers-v15 \
postgres-headers-v16 \
postgres-headers-v17
.PHONY: postgres-check
postgres-check: \
postgres-check-v14 \
postgres-check-v15 \
postgres-check-v16 \
postgres-check-v17
# This removes everything
.PHONY: distclean
distclean:
$(RM) -r $(POSTGRES_INSTALL_DIR) $(BUILD_DIR)
$(RM) -r $(POSTGRES_INSTALL_DIR)
$(CARGO_CMD_PREFIX) cargo clean
.PHONY: fmt
@@ -209,19 +298,3 @@ neon-pgindent: postgres-v17-pg-bsd-indent neon-pg-ext-v17
.PHONY: setup-pre-commit-hook
setup-pre-commit-hook:
ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit
# Targets for building PostgreSQL are defined in postgres.mk.
#
# But if the caller has indicated that PostgreSQL is already
# installed, by setting the PG_INSTALL_CACHED variable, skip it.
ifdef PG_INSTALL_CACHED
postgres-install: skip-install
$(foreach pg_version,$(POSTGRES_VERSIONS),postgres-install-$(pg_version)): skip-install
postgres-headers-install:
+@echo "Skipping installation of PostgreSQL headers because PG_INSTALL_CACHED is set"
skip-install:
+@echo "Skipping PostgreSQL installation because PG_INSTALL_CACHED is set"
else
include postgres.mk
endif

View File

@@ -165,7 +165,6 @@ RUN curl -fsSL \
&& rm sql_exporter.tar.gz
# protobuf-compiler (protoc)
# Keep the version the same as in compute/compute-node.Dockerfile
ENV PROTOC_VERSION=25.1
RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
&& unzip -q protoc.zip -d protoc \
@@ -180,7 +179,7 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/
&& mv s5cmd /usr/local/bin/s5cmd
# LLVM
ENV LLVM_VERSION=20
ENV LLVM_VERSION=19
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
&& echo "deb http://apt.llvm.org/${DEBIAN_VERSION}/ llvm-toolchain-${DEBIAN_VERSION}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
&& apt update \
@@ -293,7 +292,7 @@ WORKDIR /home/nonroot
# Rust
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
ENV RUSTC_VERSION=1.88.0
ENV RUSTC_VERSION=1.87.0
ENV RUSTUP_HOME="/home/nonroot/.rustup"
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
ARG RUSTFILT_VERSION=0.2.1

View File

@@ -115,9 +115,6 @@ ARG EXTENSIONS=all
FROM $BASE_IMAGE_SHA AS build-deps
ARG DEBIAN_VERSION
# Keep in sync with build-tools.Dockerfile
ENV PROTOC_VERSION=25.1
# Use strict mode for bash to catch errors early
SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
@@ -152,14 +149,8 @@ RUN case $DEBIAN_VERSION in \
libclang-dev \
jsonnet \
$VERSION_INSTALLS \
&& apt clean && rm -rf /var/lib/apt/lists/* \
&& useradd -ms /bin/bash nonroot -b /home \
# Install protoc from binary release, since Debian's versions are too old.
&& curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
&& unzip -q protoc.zip -d protoc \
&& mv protoc/bin/protoc /usr/local/bin/protoc \
&& mv protoc/include/google /usr/local/include/google \
&& rm -rf protoc.zip protoc
&& apt clean && rm -rf /var/lib/apt/lists/* && \
useradd -ms /bin/bash nonroot -b /home
#########################################################################################
#
@@ -1179,7 +1170,7 @@ COPY --from=pgrag-src /ext-src/ /ext-src/
# Install it using virtual environment, because Python 3.11 (the default version on Debian 12 (Bookworm)) complains otherwise
WORKDIR /ext-src/onnxruntime-src
RUN apt update && apt install --no-install-recommends --no-install-suggests -y \
python3 python3-pip python3-venv && \
python3 python3-pip python3-venv protobuf-compiler && \
apt clean && rm -rf /var/lib/apt/lists/* && \
python3 -m venv venv && \
. venv/bin/activate && \
@@ -1572,7 +1563,6 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \
FROM build-deps AS pgaudit-src
ARG PG_VERSION
WORKDIR /ext-src
COPY "compute/patches/pgaudit-parallel_workers-${PG_VERSION}.patch" .
RUN case "${PG_VERSION}" in \
"v14") \
export PGAUDIT_VERSION=1.6.3 \
@@ -1595,8 +1585,7 @@ RUN case "${PG_VERSION}" in \
esac && \
wget https://github.com/pgaudit/pgaudit/archive/refs/tags/${PGAUDIT_VERSION}.tar.gz -O pgaudit.tar.gz && \
echo "${PGAUDIT_CHECKSUM} pgaudit.tar.gz" | sha256sum --check && \
mkdir pgaudit-src && cd pgaudit-src && tar xzf ../pgaudit.tar.gz --strip-components=1 -C . && \
patch -p1 < "/ext-src/pgaudit-parallel_workers-${PG_VERSION}.patch"
mkdir pgaudit-src && cd pgaudit-src && tar xzf ../pgaudit.tar.gz --strip-components=1 -C .
FROM pg-build AS pgaudit-build
COPY --from=pgaudit-src /ext-src/ /ext-src/
@@ -1985,7 +1974,7 @@ RUN apt update && \
locales \
lsof \
procps \
rsyslog-gnutls \
rsyslog \
screen \
tcpdump \
$VERSION_INSTALLS && \

View File

@@ -8,8 +8,6 @@
import 'sql_exporter/compute_logical_snapshot_files.libsonnet',
import 'sql_exporter/compute_logical_snapshots_bytes.libsonnet',
import 'sql_exporter/compute_max_connections.libsonnet',
import 'sql_exporter/compute_pg_oldest_frozen_xid_age.libsonnet',
import 'sql_exporter/compute_pg_oldest_mxid_age.libsonnet',
import 'sql_exporter/compute_receive_lsn.libsonnet',
import 'sql_exporter/compute_subscriptions_count.libsonnet',
import 'sql_exporter/connection_counts.libsonnet',

View File

@@ -1,13 +0,0 @@
{
metric_name: 'compute_pg_oldest_frozen_xid_age',
type: 'gauge',
help: 'Age of oldest XIDs that have not been frozen by VACUUM. An indicator of how long it has been since VACUUM last ran.',
key_labels: [
'database_name',
],
value_label: 'metric',
values: [
'frozen_xid_age',
],
query: importstr 'sql_exporter/compute_pg_oldest_frozen_xid_age.sql',
}

View File

@@ -1,4 +0,0 @@
SELECT datname database_name,
age(datfrozenxid) frozen_xid_age
FROM pg_database
ORDER BY frozen_xid_age DESC LIMIT 10;

View File

@@ -1,13 +0,0 @@
{
metric_name: 'compute_pg_oldest_mxid_age',
type: 'gauge',
help: 'Age of oldest MXIDs that have not been replaced by VACUUM. An indicator of how long it has been since VACUUM last ran.',
key_labels: [
'database_name',
],
value_label: 'metric',
values: [
'min_mxid_age',
],
query: importstr 'sql_exporter/compute_pg_oldest_mxid_age.sql',
}

View File

@@ -1,4 +0,0 @@
SELECT datname database_name,
mxid_age(datminmxid) min_mxid_age
FROM pg_database
ORDER BY min_mxid_age DESC LIMIT 10;

View File

@@ -1,8 +1,8 @@
diff --git a/sql/anon.sql b/sql/anon.sql
index 0cdc769..b450327 100644
index 0cdc769..f6cc950 100644
--- a/sql/anon.sql
+++ b/sql/anon.sql
@@ -1141,3 +1141,15 @@ $$
@@ -1141,3 +1141,8 @@ $$
-- TODO : https://en.wikipedia.org/wiki/L-diversity
-- TODO : https://en.wikipedia.org/wiki/T-closeness
@@ -11,13 +11,6 @@ index 0cdc769..b450327 100644
+
+GRANT ALL ON SCHEMA anon to neon_superuser;
+GRANT ALL ON ALL TABLES IN SCHEMA anon TO neon_superuser;
+
+DO $$
+BEGIN
+ IF current_setting('server_version_num')::int >= 150000 THEN
+ GRANT SET ON PARAMETER anon.transparent_dynamic_masking TO neon_superuser;
+ END IF;
+END $$;
diff --git a/sql/init.sql b/sql/init.sql
index 7da6553..9b6164b 100644
--- a/sql/init.sql

View File

@@ -1,143 +0,0 @@
commit 7220bb3a3f23fa27207d77562dcc286f9a123313
Author: Tristan Partin <tristan.partin@databricks.com>
Date: 2025-06-23 02:09:31 +0000
Disable logging in parallel workers
When a query uses parallel workers, pgaudit will log the same query for
every parallel worker. This is undesireable since it can result in log
amplification for queries that use parallel workers.
Signed-off-by: Tristan Partin <tristan.partin@databricks.com>
diff --git a/expected/pgaudit.out b/expected/pgaudit.out
index baa8011..a601375 100644
--- a/expected/pgaudit.out
+++ b/expected/pgaudit.out
@@ -2563,6 +2563,37 @@ COMMIT;
NOTICE: AUDIT: SESSION,12,4,MISC,COMMIT,,,COMMIT;,<not logged>
DROP TABLE part_test;
NOTICE: AUDIT: SESSION,13,1,DDL,DROP TABLE,,,DROP TABLE part_test;,<not logged>
+--
+-- Test logging in parallel workers
+SET pgaudit.log = 'read';
+SET pgaudit.log_client = on;
+SET pgaudit.log_level = 'notice';
+-- Force parallel execution for testing
+SET max_parallel_workers_per_gather = 2;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET min_parallel_index_scan_size = 0;
+-- Create table with enough data to trigger parallel execution
+CREATE TABLE parallel_test (id int, data text);
+INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data';
+SELECT count(*) FROM parallel_test;
+NOTICE: AUDIT: SESSION,14,1,READ,SELECT,,,SELECT count(*) FROM parallel_test;,<not logged>
+ count
+-------
+ 1000
+(1 row)
+
+-- Cleanup parallel test
+DROP TABLE parallel_test;
+RESET max_parallel_workers_per_gather;
+RESET parallel_tuple_cost;
+RESET parallel_setup_cost;
+RESET min_parallel_table_scan_size;
+RESET min_parallel_index_scan_size;
+RESET pgaudit.log;
+RESET pgaudit.log_client;
+RESET pgaudit.log_level;
-- Cleanup
-- Set client_min_messages up to warning to avoid noise
SET client_min_messages = 'warning';
diff --git a/pgaudit.c b/pgaudit.c
index 5e6fd38..ac9ded2 100644
--- a/pgaudit.c
+++ b/pgaudit.c
@@ -11,6 +11,7 @@
#include "postgres.h"
#include "access/htup_details.h"
+#include "access/parallel.h"
#include "access/sysattr.h"
#include "access/xact.h"
#include "access/relation.h"
@@ -1303,7 +1304,7 @@ pgaudit_ExecutorStart_hook(QueryDesc *queryDesc, int eflags)
{
AuditEventStackItem *stackItem = NULL;
- if (!internalStatement)
+ if (!internalStatement && !IsParallelWorker())
{
/* Push the audit even onto the stack */
stackItem = stack_push();
@@ -1384,7 +1385,7 @@ pgaudit_ExecutorCheckPerms_hook(List *rangeTabls, bool abort)
/* Log DML if the audit role is valid or session logging is enabled */
if ((auditOid != InvalidOid || auditLogBitmap != 0) &&
- !IsAbortedTransactionBlockState())
+ !IsAbortedTransactionBlockState() && !IsParallelWorker())
{
/* If auditLogRows is on, wait for rows processed to be set */
if (auditLogRows && auditEventStack != NULL)
@@ -1438,7 +1439,7 @@ pgaudit_ExecutorRun_hook(QueryDesc *queryDesc, ScanDirection direction, uint64 c
else
standard_ExecutorRun(queryDesc, direction, count, execute_once);
- if (auditLogRows && !internalStatement)
+ if (auditLogRows && !internalStatement && !IsParallelWorker())
{
/* Find an item from the stack by the query memory context */
stackItem = stack_find_context(queryDesc->estate->es_query_cxt);
@@ -1458,7 +1459,7 @@ pgaudit_ExecutorEnd_hook(QueryDesc *queryDesc)
AuditEventStackItem *stackItem = NULL;
AuditEventStackItem *auditEventStackFull = NULL;
- if (auditLogRows && !internalStatement)
+ if (auditLogRows && !internalStatement && !IsParallelWorker())
{
/* Find an item from the stack by the query memory context */
stackItem = stack_find_context(queryDesc->estate->es_query_cxt);
diff --git a/sql/pgaudit.sql b/sql/pgaudit.sql
index cc1374a..1870a60 100644
--- a/sql/pgaudit.sql
+++ b/sql/pgaudit.sql
@@ -1612,6 +1612,36 @@ COMMIT;
DROP TABLE part_test;
+--
+-- Test logging in parallel workers
+SET pgaudit.log = 'read';
+SET pgaudit.log_client = on;
+SET pgaudit.log_level = 'notice';
+
+-- Force parallel execution for testing
+SET max_parallel_workers_per_gather = 2;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET min_parallel_index_scan_size = 0;
+
+-- Create table with enough data to trigger parallel execution
+CREATE TABLE parallel_test (id int, data text);
+INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data';
+
+SELECT count(*) FROM parallel_test;
+
+-- Cleanup parallel test
+DROP TABLE parallel_test;
+RESET max_parallel_workers_per_gather;
+RESET parallel_tuple_cost;
+RESET parallel_setup_cost;
+RESET min_parallel_table_scan_size;
+RESET min_parallel_index_scan_size;
+RESET pgaudit.log;
+RESET pgaudit.log_client;
+RESET pgaudit.log_level;
+
-- Cleanup
-- Set client_min_messages up to warning to avoid noise
SET client_min_messages = 'warning';

View File

@@ -1,143 +0,0 @@
commit 29dc2847f6255541992f18faf8a815dfab79631a
Author: Tristan Partin <tristan.partin@databricks.com>
Date: 2025-06-23 02:09:31 +0000
Disable logging in parallel workers
When a query uses parallel workers, pgaudit will log the same query for
every parallel worker. This is undesireable since it can result in log
amplification for queries that use parallel workers.
Signed-off-by: Tristan Partin <tristan.partin@databricks.com>
diff --git a/expected/pgaudit.out b/expected/pgaudit.out
index b22560b..73f0327 100644
--- a/expected/pgaudit.out
+++ b/expected/pgaudit.out
@@ -2563,6 +2563,37 @@ COMMIT;
NOTICE: AUDIT: SESSION,12,4,MISC,COMMIT,,,COMMIT;,<not logged>
DROP TABLE part_test;
NOTICE: AUDIT: SESSION,13,1,DDL,DROP TABLE,,,DROP TABLE part_test;,<not logged>
+--
+-- Test logging in parallel workers
+SET pgaudit.log = 'read';
+SET pgaudit.log_client = on;
+SET pgaudit.log_level = 'notice';
+-- Force parallel execution for testing
+SET max_parallel_workers_per_gather = 2;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET min_parallel_index_scan_size = 0;
+-- Create table with enough data to trigger parallel execution
+CREATE TABLE parallel_test (id int, data text);
+INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data';
+SELECT count(*) FROM parallel_test;
+NOTICE: AUDIT: SESSION,14,1,READ,SELECT,,,SELECT count(*) FROM parallel_test;,<not logged>
+ count
+-------
+ 1000
+(1 row)
+
+-- Cleanup parallel test
+DROP TABLE parallel_test;
+RESET max_parallel_workers_per_gather;
+RESET parallel_tuple_cost;
+RESET parallel_setup_cost;
+RESET min_parallel_table_scan_size;
+RESET min_parallel_index_scan_size;
+RESET pgaudit.log;
+RESET pgaudit.log_client;
+RESET pgaudit.log_level;
-- Cleanup
-- Set client_min_messages up to warning to avoid noise
SET client_min_messages = 'warning';
diff --git a/pgaudit.c b/pgaudit.c
index 5e6fd38..ac9ded2 100644
--- a/pgaudit.c
+++ b/pgaudit.c
@@ -11,6 +11,7 @@
#include "postgres.h"
#include "access/htup_details.h"
+#include "access/parallel.h"
#include "access/sysattr.h"
#include "access/xact.h"
#include "access/relation.h"
@@ -1303,7 +1304,7 @@ pgaudit_ExecutorStart_hook(QueryDesc *queryDesc, int eflags)
{
AuditEventStackItem *stackItem = NULL;
- if (!internalStatement)
+ if (!internalStatement && !IsParallelWorker())
{
/* Push the audit even onto the stack */
stackItem = stack_push();
@@ -1384,7 +1385,7 @@ pgaudit_ExecutorCheckPerms_hook(List *rangeTabls, bool abort)
/* Log DML if the audit role is valid or session logging is enabled */
if ((auditOid != InvalidOid || auditLogBitmap != 0) &&
- !IsAbortedTransactionBlockState())
+ !IsAbortedTransactionBlockState() && !IsParallelWorker())
{
/* If auditLogRows is on, wait for rows processed to be set */
if (auditLogRows && auditEventStack != NULL)
@@ -1438,7 +1439,7 @@ pgaudit_ExecutorRun_hook(QueryDesc *queryDesc, ScanDirection direction, uint64 c
else
standard_ExecutorRun(queryDesc, direction, count, execute_once);
- if (auditLogRows && !internalStatement)
+ if (auditLogRows && !internalStatement && !IsParallelWorker())
{
/* Find an item from the stack by the query memory context */
stackItem = stack_find_context(queryDesc->estate->es_query_cxt);
@@ -1458,7 +1459,7 @@ pgaudit_ExecutorEnd_hook(QueryDesc *queryDesc)
AuditEventStackItem *stackItem = NULL;
AuditEventStackItem *auditEventStackFull = NULL;
- if (auditLogRows && !internalStatement)
+ if (auditLogRows && !internalStatement && !IsParallelWorker())
{
/* Find an item from the stack by the query memory context */
stackItem = stack_find_context(queryDesc->estate->es_query_cxt);
diff --git a/sql/pgaudit.sql b/sql/pgaudit.sql
index 8052426..7f0667b 100644
--- a/sql/pgaudit.sql
+++ b/sql/pgaudit.sql
@@ -1612,6 +1612,36 @@ COMMIT;
DROP TABLE part_test;
+--
+-- Test logging in parallel workers
+SET pgaudit.log = 'read';
+SET pgaudit.log_client = on;
+SET pgaudit.log_level = 'notice';
+
+-- Force parallel execution for testing
+SET max_parallel_workers_per_gather = 2;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET min_parallel_index_scan_size = 0;
+
+-- Create table with enough data to trigger parallel execution
+CREATE TABLE parallel_test (id int, data text);
+INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data';
+
+SELECT count(*) FROM parallel_test;
+
+-- Cleanup parallel test
+DROP TABLE parallel_test;
+RESET max_parallel_workers_per_gather;
+RESET parallel_tuple_cost;
+RESET parallel_setup_cost;
+RESET min_parallel_table_scan_size;
+RESET min_parallel_index_scan_size;
+RESET pgaudit.log;
+RESET pgaudit.log_client;
+RESET pgaudit.log_level;
+
-- Cleanup
-- Set client_min_messages up to warning to avoid noise
SET client_min_messages = 'warning';

View File

@@ -1,143 +0,0 @@
commit cc708dde7ef2af2a8120d757102d2e34c0463a0f
Author: Tristan Partin <tristan.partin@databricks.com>
Date: 2025-06-23 02:09:31 +0000
Disable logging in parallel workers
When a query uses parallel workers, pgaudit will log the same query for
every parallel worker. This is undesireable since it can result in log
amplification for queries that use parallel workers.
Signed-off-by: Tristan Partin <tristan.partin@databricks.com>
diff --git a/expected/pgaudit.out b/expected/pgaudit.out
index 8772054..9b66ac6 100644
--- a/expected/pgaudit.out
+++ b/expected/pgaudit.out
@@ -2556,6 +2556,37 @@ DROP SERVER fdw_server;
NOTICE: AUDIT: SESSION,11,1,DDL,DROP SERVER,,,DROP SERVER fdw_server;,<not logged>
DROP EXTENSION postgres_fdw;
NOTICE: AUDIT: SESSION,12,1,DDL,DROP EXTENSION,,,DROP EXTENSION postgres_fdw;,<not logged>
+--
+-- Test logging in parallel workers
+SET pgaudit.log = 'read';
+SET pgaudit.log_client = on;
+SET pgaudit.log_level = 'notice';
+-- Force parallel execution for testing
+SET max_parallel_workers_per_gather = 2;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET min_parallel_index_scan_size = 0;
+-- Create table with enough data to trigger parallel execution
+CREATE TABLE parallel_test (id int, data text);
+INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data';
+SELECT count(*) FROM parallel_test;
+NOTICE: AUDIT: SESSION,13,1,READ,SELECT,,,SELECT count(*) FROM parallel_test;,<not logged>
+ count
+-------
+ 1000
+(1 row)
+
+-- Cleanup parallel test
+DROP TABLE parallel_test;
+RESET max_parallel_workers_per_gather;
+RESET parallel_tuple_cost;
+RESET parallel_setup_cost;
+RESET min_parallel_table_scan_size;
+RESET min_parallel_index_scan_size;
+RESET pgaudit.log;
+RESET pgaudit.log_client;
+RESET pgaudit.log_level;
-- Cleanup
-- Set client_min_messages up to warning to avoid noise
SET client_min_messages = 'warning';
diff --git a/pgaudit.c b/pgaudit.c
index 004d1f9..f061164 100644
--- a/pgaudit.c
+++ b/pgaudit.c
@@ -11,6 +11,7 @@
#include "postgres.h"
#include "access/htup_details.h"
+#include "access/parallel.h"
#include "access/sysattr.h"
#include "access/xact.h"
#include "access/relation.h"
@@ -1339,7 +1340,7 @@ pgaudit_ExecutorStart_hook(QueryDesc *queryDesc, int eflags)
{
AuditEventStackItem *stackItem = NULL;
- if (!internalStatement)
+ if (!internalStatement && !IsParallelWorker())
{
/* Push the audit even onto the stack */
stackItem = stack_push();
@@ -1420,7 +1421,7 @@ pgaudit_ExecutorCheckPerms_hook(List *rangeTabls, List *permInfos, bool abort)
/* Log DML if the audit role is valid or session logging is enabled */
if ((auditOid != InvalidOid || auditLogBitmap != 0) &&
- !IsAbortedTransactionBlockState())
+ !IsAbortedTransactionBlockState() && !IsParallelWorker())
{
/* If auditLogRows is on, wait for rows processed to be set */
if (auditLogRows && auditEventStack != NULL)
@@ -1475,7 +1476,7 @@ pgaudit_ExecutorRun_hook(QueryDesc *queryDesc, ScanDirection direction, uint64 c
else
standard_ExecutorRun(queryDesc, direction, count, execute_once);
- if (auditLogRows && !internalStatement)
+ if (auditLogRows && !internalStatement && !IsParallelWorker())
{
/* Find an item from the stack by the query memory context */
stackItem = stack_find_context(queryDesc->estate->es_query_cxt);
@@ -1495,7 +1496,7 @@ pgaudit_ExecutorEnd_hook(QueryDesc *queryDesc)
AuditEventStackItem *stackItem = NULL;
AuditEventStackItem *auditEventStackFull = NULL;
- if (auditLogRows && !internalStatement)
+ if (auditLogRows && !internalStatement && !IsParallelWorker())
{
/* Find an item from the stack by the query memory context */
stackItem = stack_find_context(queryDesc->estate->es_query_cxt);
diff --git a/sql/pgaudit.sql b/sql/pgaudit.sql
index 6aae88b..de6d7fd 100644
--- a/sql/pgaudit.sql
+++ b/sql/pgaudit.sql
@@ -1631,6 +1631,36 @@ DROP USER MAPPING FOR regress_user1 SERVER fdw_server;
DROP SERVER fdw_server;
DROP EXTENSION postgres_fdw;
+--
+-- Test logging in parallel workers
+SET pgaudit.log = 'read';
+SET pgaudit.log_client = on;
+SET pgaudit.log_level = 'notice';
+
+-- Force parallel execution for testing
+SET max_parallel_workers_per_gather = 2;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET min_parallel_index_scan_size = 0;
+
+-- Create table with enough data to trigger parallel execution
+CREATE TABLE parallel_test (id int, data text);
+INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data';
+
+SELECT count(*) FROM parallel_test;
+
+-- Cleanup parallel test
+DROP TABLE parallel_test;
+RESET max_parallel_workers_per_gather;
+RESET parallel_tuple_cost;
+RESET parallel_setup_cost;
+RESET min_parallel_table_scan_size;
+RESET min_parallel_index_scan_size;
+RESET pgaudit.log;
+RESET pgaudit.log_client;
+RESET pgaudit.log_level;
+
-- Cleanup
-- Set client_min_messages up to warning to avoid noise
SET client_min_messages = 'warning';

View File

@@ -1,143 +0,0 @@
commit 8d02e4c6c5e1e8676251b0717a46054267091cb4
Author: Tristan Partin <tristan.partin@databricks.com>
Date: 2025-06-23 02:09:31 +0000
Disable logging in parallel workers
When a query uses parallel workers, pgaudit will log the same query for
every parallel worker. This is undesireable since it can result in log
amplification for queries that use parallel workers.
Signed-off-by: Tristan Partin <tristan.partin@databricks.com>
diff --git a/expected/pgaudit.out b/expected/pgaudit.out
index d696287..4b1059a 100644
--- a/expected/pgaudit.out
+++ b/expected/pgaudit.out
@@ -2568,6 +2568,37 @@ DROP SERVER fdw_server;
NOTICE: AUDIT: SESSION,11,1,DDL,DROP SERVER,,,DROP SERVER fdw_server,<not logged>
DROP EXTENSION postgres_fdw;
NOTICE: AUDIT: SESSION,12,1,DDL,DROP EXTENSION,,,DROP EXTENSION postgres_fdw,<not logged>
+--
+-- Test logging in parallel workers
+SET pgaudit.log = 'read';
+SET pgaudit.log_client = on;
+SET pgaudit.log_level = 'notice';
+-- Force parallel execution for testing
+SET max_parallel_workers_per_gather = 2;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET min_parallel_index_scan_size = 0;
+-- Create table with enough data to trigger parallel execution
+CREATE TABLE parallel_test (id int, data text);
+INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data';
+SELECT count(*) FROM parallel_test;
+NOTICE: AUDIT: SESSION,13,1,READ,SELECT,,,SELECT count(*) FROM parallel_test,<not logged>
+ count
+-------
+ 1000
+(1 row)
+
+-- Cleanup parallel test
+DROP TABLE parallel_test;
+RESET max_parallel_workers_per_gather;
+RESET parallel_tuple_cost;
+RESET parallel_setup_cost;
+RESET min_parallel_table_scan_size;
+RESET min_parallel_index_scan_size;
+RESET pgaudit.log;
+RESET pgaudit.log_client;
+RESET pgaudit.log_level;
-- Cleanup
-- Set client_min_messages up to warning to avoid noise
SET client_min_messages = 'warning';
diff --git a/pgaudit.c b/pgaudit.c
index 1764af1..0e48875 100644
--- a/pgaudit.c
+++ b/pgaudit.c
@@ -11,6 +11,7 @@
#include "postgres.h"
#include "access/htup_details.h"
+#include "access/parallel.h"
#include "access/sysattr.h"
#include "access/xact.h"
#include "access/relation.h"
@@ -1406,7 +1407,7 @@ pgaudit_ExecutorStart_hook(QueryDesc *queryDesc, int eflags)
{
AuditEventStackItem *stackItem = NULL;
- if (!internalStatement)
+ if (!internalStatement && !IsParallelWorker())
{
/* Push the audit event onto the stack */
stackItem = stack_push();
@@ -1489,7 +1490,7 @@ pgaudit_ExecutorCheckPerms_hook(List *rangeTabls, List *permInfos, bool abort)
/* Log DML if the audit role is valid or session logging is enabled */
if ((auditOid != InvalidOid || auditLogBitmap != 0) &&
- !IsAbortedTransactionBlockState())
+ !IsAbortedTransactionBlockState() && !IsParallelWorker())
{
/* If auditLogRows is on, wait for rows processed to be set */
if (auditLogRows && auditEventStack != NULL)
@@ -1544,7 +1545,7 @@ pgaudit_ExecutorRun_hook(QueryDesc *queryDesc, ScanDirection direction, uint64 c
else
standard_ExecutorRun(queryDesc, direction, count, execute_once);
- if (auditLogRows && !internalStatement)
+ if (auditLogRows && !internalStatement && !IsParallelWorker())
{
/* Find an item from the stack by the query memory context */
stackItem = stack_find_context(queryDesc->estate->es_query_cxt);
@@ -1564,7 +1565,7 @@ pgaudit_ExecutorEnd_hook(QueryDesc *queryDesc)
AuditEventStackItem *stackItem = NULL;
AuditEventStackItem *auditEventStackFull = NULL;
- if (auditLogRows && !internalStatement)
+ if (auditLogRows && !internalStatement && !IsParallelWorker())
{
/* Find an item from the stack by the query memory context */
stackItem = stack_find_context(queryDesc->estate->es_query_cxt);
diff --git a/sql/pgaudit.sql b/sql/pgaudit.sql
index e161f01..c873098 100644
--- a/sql/pgaudit.sql
+++ b/sql/pgaudit.sql
@@ -1637,6 +1637,36 @@ DROP USER MAPPING FOR regress_user1 SERVER fdw_server;
DROP SERVER fdw_server;
DROP EXTENSION postgres_fdw;
+--
+-- Test logging in parallel workers
+SET pgaudit.log = 'read';
+SET pgaudit.log_client = on;
+SET pgaudit.log_level = 'notice';
+
+-- Force parallel execution for testing
+SET max_parallel_workers_per_gather = 2;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET min_parallel_index_scan_size = 0;
+
+-- Create table with enough data to trigger parallel execution
+CREATE TABLE parallel_test (id int, data text);
+INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data';
+
+SELECT count(*) FROM parallel_test;
+
+-- Cleanup parallel test
+DROP TABLE parallel_test;
+RESET max_parallel_workers_per_gather;
+RESET parallel_tuple_cost;
+RESET parallel_setup_cost;
+RESET min_parallel_table_scan_size;
+RESET min_parallel_index_scan_size;
+RESET pgaudit.log;
+RESET pgaudit.log_client;
+RESET pgaudit.log_level;
+
-- Cleanup
-- Set client_min_messages up to warning to avoid noise
SET client_min_messages = 'warning';

View File

@@ -27,7 +27,6 @@ fail.workspace = true
flate2.workspace = true
futures.workspace = true
http.workspace = true
hostname-validator = "1.1"
indexmap.workspace = true
itertools.workspace = true
jsonwebtoken.workspace = true
@@ -39,7 +38,6 @@ once_cell.workspace = true
opentelemetry.workspace = true
opentelemetry_sdk.workspace = true
p256 = { version = "0.13", features = ["pem"] }
pageserver_page_api.workspace = true
postgres.workspace = true
regex.workspace = true
reqwest = { workspace = true, features = ["json"] }
@@ -55,7 +53,6 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
tokio-postgres.workspace = true
tokio-util.workspace = true
tokio-stream.workspace = true
tonic.workspace = true
tower-otel.workspace = true
tracing.workspace = true
tracing-opentelemetry.workspace = true

View File

@@ -36,8 +36,6 @@
use std::ffi::OsString;
use std::fs::File;
use std::process::exit;
use std::sync::Arc;
use std::sync::atomic::AtomicU64;
use std::sync::mpsc;
use std::thread;
use std::time::Duration;
@@ -192,9 +190,7 @@ fn main() -> Result<()> {
cgroup: cli.cgroup,
#[cfg(target_os = "linux")]
vm_monitor_addr: cli.vm_monitor_addr,
installed_extensions_collection_interval: Arc::new(AtomicU64::new(
cli.installed_extensions_collection_interval,
)),
installed_extensions_collection_interval: cli.installed_extensions_collection_interval,
},
config,
)?;

View File

@@ -6,7 +6,7 @@ use compute_api::responses::{
LfcPrewarmState, TlsConfig,
};
use compute_api::spec::{
ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverProtocol, PgIdent,
ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent,
};
use futures::StreamExt;
use futures::future::join_all;
@@ -15,17 +15,17 @@ use itertools::Itertools;
use nix::sys::signal::{Signal, kill};
use nix::unistd::Pid;
use once_cell::sync::Lazy;
use pageserver_page_api::{self as page_api, BaseBackupCompression};
use postgres;
use postgres::NoTls;
use postgres::error::SqlState;
use remote_storage::{DownloadError, RemotePath};
use std::collections::{HashMap, HashSet};
use std::net::SocketAddr;
use std::os::unix::fs::{PermissionsExt, symlink};
use std::path::Path;
use std::process::{Command, Stdio};
use std::str::FromStr;
use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::{Arc, Condvar, Mutex, RwLock};
use std::time::{Duration, Instant};
use std::{env, fs};
@@ -36,7 +36,6 @@ use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use utils::measured_stream::MeasuredReader;
use utils::pid_file;
use utils::shard::{ShardCount, ShardIndex, ShardNumber};
use crate::configurator::launch_configurator;
use crate::disk_quota::set_disk_quota;
@@ -70,7 +69,6 @@ pub static BUILD_TAG: Lazy<String> = Lazy::new(|| {
.unwrap_or(BUILD_TAG_DEFAULT)
.to_string()
});
const DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL: u64 = 3600;
/// Static configuration params that don't change after startup. These mostly
/// come from the CLI args, or are derived from them.
@@ -104,7 +102,7 @@ pub struct ComputeNodeParams {
pub remote_ext_base_url: Option<Url>,
/// Interval for installed extensions collection
pub installed_extensions_collection_interval: Arc<AtomicU64>,
pub installed_extensions_collection_interval: u64,
}
/// Compute node info shared across several `compute_ctl` threads.
@@ -127,9 +125,6 @@ pub struct ComputeNode {
// key: ext_archive_name, value: started download time, download_completed?
pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
pub compute_ctl_config: ComputeCtlConfig,
/// Handle to the extension stats collection task
extension_stats_task: Mutex<Option<tokio::task::JoinHandle<()>>>,
}
// store some metrics about download size that might impact startup time
@@ -223,8 +218,7 @@ pub struct ParsedSpec {
pub pageserver_connstr: String,
pub safekeeper_connstrings: Vec<String>,
pub storage_auth_token: Option<String>,
/// k8s dns name and port
pub endpoint_storage_addr: Option<String>,
pub endpoint_storage_addr: Option<SocketAddr>,
pub endpoint_storage_token: Option<String>,
}
@@ -319,10 +313,13 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
.or(Err("invalid timeline id"))?
};
let endpoint_storage_addr: Option<String> = spec
let endpoint_storage_addr: Option<SocketAddr> = spec
.endpoint_storage_addr
.clone()
.or_else(|| spec.cluster.settings.find("neon.endpoint_storage_addr"));
.or_else(|| spec.cluster.settings.find("neon.endpoint_storage_addr"))
.unwrap_or_default()
.parse()
.ok();
let endpoint_storage_token = spec
.endpoint_storage_token
.clone()
@@ -432,7 +429,6 @@ impl ComputeNode {
state_changed: Condvar::new(),
ext_download_progress: RwLock::new(HashMap::new()),
compute_ctl_config: config.compute_ctl_config,
extension_stats_task: Mutex::new(None),
})
}
@@ -520,9 +516,6 @@ impl ComputeNode {
None
};
// Terminate the extension stats collection task
this.terminate_extension_stats_task();
// Terminate the vm_monitor so it releases the file watcher on
// /sys/fs/cgroup/neon-postgres.
// Note: the vm-monitor only runs on linux because it requires cgroups.
@@ -759,15 +752,10 @@ impl ComputeNode {
// Configure and start rsyslog for compliance audit logging
match pspec.spec.audit_log_level {
ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
let remote_tls_endpoint =
std::env::var("AUDIT_LOGGING_TLS_ENDPOINT").unwrap_or("".to_string());
let remote_plain_endpoint =
let remote_endpoint =
std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string());
if remote_plain_endpoint.is_empty() && remote_tls_endpoint.is_empty() {
anyhow::bail!(
"AUDIT_LOGGING_ENDPOINT and AUDIT_LOGGING_TLS_ENDPOINT are both empty"
);
if remote_endpoint.is_empty() {
anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty");
}
let log_directory_path = Path::new(&self.params.pgdata).join("log");
@@ -783,8 +771,7 @@ impl ComputeNode {
log_directory_path.clone(),
endpoint_id,
project_id,
&remote_plain_endpoint,
&remote_tls_endpoint,
&remote_endpoint,
)?;
// Launch a background task to clean up the audit logs
@@ -1011,80 +998,13 @@ impl ComputeNode {
Ok(())
}
/// Fetches a basebackup from the Pageserver using the compute state's Pageserver connstring and
/// unarchives it to `pgdata` directory, replacing any existing contents.
// Get basebackup from the libpq connection to pageserver using `connstr` and
// unarchive it to `pgdata` directory overriding all its previous content.
#[instrument(skip_all, fields(%lsn))]
fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
let spec = compute_state.pspec.as_ref().expect("spec must be set");
let start_time = Instant::now();
let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
let started = Instant::now();
let (connected, size) = match PageserverProtocol::from_connstring(shard0_connstr)? {
PageserverProtocol::Libpq => self.try_get_basebackup_libpq(spec, lsn)?,
PageserverProtocol::Grpc => self.try_get_basebackup_grpc(spec, lsn)?,
};
let mut state = self.state.lock().unwrap();
state.metrics.pageserver_connect_micros =
connected.duration_since(started).as_micros() as u64;
state.metrics.basebackup_bytes = size as u64;
state.metrics.basebackup_ms = started.elapsed().as_millis() as u64;
Ok(())
}
/// Fetches a basebackup via gRPC. The connstring must use grpc://. Returns the timestamp when
/// the connection was established, and the (compressed) size of the basebackup.
fn try_get_basebackup_grpc(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
let shard0_connstr = spec
.pageserver_connstr
.split(',')
.next()
.unwrap()
.to_string();
let shard_index = match spec.pageserver_connstr.split(',').count() as u8 {
0 | 1 => ShardIndex::unsharded(),
count => ShardIndex::new(ShardNumber(0), ShardCount(count)),
};
let (reader, connected) = tokio::runtime::Handle::current().block_on(async move {
let mut client = page_api::Client::new(
shard0_connstr,
spec.tenant_id,
spec.timeline_id,
shard_index,
spec.storage_auth_token.clone(),
None, // NB: base backups use payload compression
)
.await?;
let connected = Instant::now();
let reader = client
.get_base_backup(page_api::GetBaseBackupRequest {
lsn: (lsn != Lsn(0)).then_some(lsn),
compression: BaseBackupCompression::Gzip,
replica: spec.spec.mode != ComputeMode::Primary,
full: false,
})
.await?;
anyhow::Ok((reader, connected))
})?;
let mut reader = MeasuredReader::new(tokio_util::io::SyncIoBridge::new(reader));
// Set `ignore_zeros` so that unpack() reads the entire stream and doesn't just stop at the
// end-of-archive marker. If the server errors, the tar::Builder drop handler will write an
// end-of-archive marker before the error is emitted, and we would not see the error.
let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut reader));
ar.set_ignore_zeros(true);
ar.unpack(&self.params.pgdata)?;
Ok((connected, reader.get_byte_count()))
}
/// Fetches a basebackup via libpq. The connstring must use postgresql://. Returns the timestamp
/// when the connection was established, and the (compressed) size of the basebackup.
fn try_get_basebackup_libpq(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
let mut config = postgres::Config::from_str(shard0_connstr)?;
@@ -1098,14 +1018,16 @@ impl ComputeNode {
}
config.application_name("compute_ctl");
config.options(&format!(
"-c neon.compute_mode={}",
spec.spec.mode.to_type_str()
));
if let Some(spec) = &compute_state.pspec {
config.options(&format!(
"-c neon.compute_mode={}",
spec.spec.mode.to_type_str()
));
}
// Connect to pageserver
let mut client = config.connect(NoTls)?;
let connected = Instant::now();
let pageserver_connect_micros = start_time.elapsed().as_micros() as u64;
let basebackup_cmd = match lsn {
Lsn(0) => {
@@ -1142,13 +1064,16 @@ impl ComputeNode {
// Set `ignore_zeros` so that unpack() reads all the Copy data and
// doesn't stop at the end-of-archive marker. Otherwise, if the server
// sends an Error after finishing the tarball, we will not notice it.
// The tar::Builder drop handler will write an end-of-archive marker
// before emitting the error, and we would not see it otherwise.
let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut bufreader));
ar.set_ignore_zeros(true);
ar.unpack(&self.params.pgdata)?;
Ok((connected, measured_reader.get_byte_count()))
// Report metrics
let mut state = self.state.lock().unwrap();
state.metrics.pageserver_connect_micros = pageserver_connect_micros;
state.metrics.basebackup_bytes = measured_reader.get_byte_count() as u64;
state.metrics.basebackup_ms = start_time.elapsed().as_millis() as u64;
Ok(())
}
// Gets the basebackup in a retry loop
@@ -1685,8 +1610,6 @@ impl ComputeNode {
tls_config = self.compute_ctl_config.tls.clone();
}
self.update_installed_extensions_collection_interval(&spec);
let max_concurrent_connections = self.max_service_connections(compute_state, &spec);
// Merge-apply spec & changes to PostgreSQL state.
@@ -1751,8 +1674,6 @@ impl ComputeNode {
let tls_config = self.tls_config(&spec);
self.update_installed_extensions_collection_interval(&spec);
if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings {
info!("tuning pgbouncer");
@@ -2357,73 +2278,24 @@ LIMIT 100",
}
pub fn spawn_extension_stats_task(&self) {
// Cancel any existing task
if let Some(handle) = self.extension_stats_task.lock().unwrap().take() {
handle.abort();
}
let conf = self.tokio_conn_conf.clone();
let atomic_interval = self.params.installed_extensions_collection_interval.clone();
let mut installed_extensions_collection_interval =
2 * atomic_interval.load(std::sync::atomic::Ordering::SeqCst);
info!(
"[NEON_EXT_SPAWN] Spawning background installed extensions worker with Timeout: {}",
installed_extensions_collection_interval
);
let handle = tokio::spawn(async move {
let installed_extensions_collection_interval =
self.params.installed_extensions_collection_interval;
tokio::spawn(async move {
// An initial sleep is added to ensure that two collections don't happen at the same time.
// The first collection happens during compute startup.
tokio::time::sleep(tokio::time::Duration::from_secs(
installed_extensions_collection_interval,
))
.await;
let mut interval = tokio::time::interval(tokio::time::Duration::from_secs(
installed_extensions_collection_interval,
));
loop {
info!(
"[NEON_EXT_INT_SLEEP]: Interval: {}",
installed_extensions_collection_interval
);
// Sleep at the start of the loop to ensure that two collections don't happen at the same time.
// The first collection happens during compute startup.
tokio::time::sleep(tokio::time::Duration::from_secs(
installed_extensions_collection_interval,
))
.await;
interval.tick().await;
let _ = installed_extensions(conf.clone()).await;
// Acquire a read lock on the compute spec and then update the interval if necessary
installed_extensions_collection_interval = std::cmp::max(
installed_extensions_collection_interval,
2 * atomic_interval.load(std::sync::atomic::Ordering::SeqCst),
);
}
});
// Store the new task handle
*self.extension_stats_task.lock().unwrap() = Some(handle);
}
fn terminate_extension_stats_task(&self) {
if let Some(handle) = self.extension_stats_task.lock().unwrap().take() {
handle.abort();
}
}
fn update_installed_extensions_collection_interval(&self, spec: &ComputeSpec) {
// Update the interval for collecting installed extensions statistics
// If the value is -1, we never suspend so set the value to default collection.
// If the value is 0, it means default, we will just continue to use the default.
if spec.suspend_timeout_seconds == -1 || spec.suspend_timeout_seconds == 0 {
info!(
"[NEON_EXT_INT_UPD] Spec Timeout: {}, New Timeout: {}",
spec.suspend_timeout_seconds, DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL
);
self.params.installed_extensions_collection_interval.store(
DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL,
std::sync::atomic::Ordering::SeqCst,
);
} else {
info!(
"[NEON_EXT_INT_UPD] Spec Timeout: {}",
spec.suspend_timeout_seconds
);
self.params.installed_extensions_collection_interval.store(
spec.suspend_timeout_seconds as u64,
std::sync::atomic::Ordering::SeqCst,
);
}
}
}

View File

@@ -10,13 +10,7 @@ input(type="imfile" File="{log_directory}/*.log"
startmsg.regex="^[[:digit:]]{{4}}-[[:digit:]]{{2}}-[[:digit:]]{{2}} [[:digit:]]{{2}}:[[:digit:]]{{2}}:[[:digit:]]{{2}}.[[:digit:]]{{3}} GMT,")
# the directory to store rsyslog state files
global(
workDirectory="/var/log/rsyslog"
DefaultNetstreamDriverCAFile="/etc/ssl/certs/ca-certificates.crt"
)
# Whether the remote syslog receiver uses tls
set $.remote_syslog_tls = "{remote_syslog_tls}";
global(workDirectory="/var/log/rsyslog")
# Construct json, endpoint_id and project_id as additional metadata
set $.json_log!endpoint_id = "{endpoint_id}";
@@ -27,29 +21,5 @@ set $.json_log!msg = $msg;
template(name="PgAuditLog" type="string"
string="<%PRI%>1 %TIMESTAMP:::date-rfc3339% %HOSTNAME% - - - - %$.json_log%")
# Forward to remote syslog receiver (over TLS)
if ( $syslogtag == 'pgaudit_log' ) then {{
if ( $.remote_syslog_tls == 'true' ) then {{
action(type="omfwd" target="{remote_syslog_host}" port="{remote_syslog_port}" protocol="tcp"
template="PgAuditLog"
queue.type="linkedList"
queue.size="1000"
action.ResumeRetryCount="10"
StreamDriver="gtls"
StreamDriverMode="1"
StreamDriverAuthMode="x509/name"
StreamDriverPermittedPeers="{remote_syslog_host}"
StreamDriver.CheckExtendedKeyPurpose="on"
StreamDriver.PermitExpiredCerts="off"
)
stop
}} else {{
action(type="omfwd" target="{remote_syslog_host}" port="{remote_syslog_port}" protocol="tcp"
template="PgAuditLog"
queue.type="linkedList"
queue.size="1000"
action.ResumeRetryCount="10"
)
stop
}}
}}
# Forward to remote syslog receiver (@@<hostname>:<port>;format
local5.info @@{remote_endpoint};PgAuditLog

View File

@@ -4,9 +4,7 @@ use std::thread;
use std::time::{Duration, SystemTime};
use anyhow::{Result, bail};
use compute_api::spec::{ComputeMode, PageserverProtocol};
use itertools::Itertools as _;
use pageserver_page_api as page_api;
use compute_api::spec::ComputeMode;
use postgres::{NoTls, SimpleQueryMessage};
use tracing::{info, warn};
use utils::id::{TenantId, TimelineId};
@@ -78,17 +76,25 @@ fn acquire_lsn_lease_with_retry(
loop {
// Note: List of pageservers is dynamic, need to re-read configs before each attempt.
let (connstrings, auth) = {
let configs = {
let state = compute.state.lock().unwrap();
let spec = state.pspec.as_ref().expect("spec must be set");
(
spec.pageserver_connstr.clone(),
spec.storage_auth_token.clone(),
)
let conn_strings = spec.pageserver_connstr.split(',');
conn_strings
.map(|connstr| {
let mut config = postgres::Config::from_str(connstr).expect("Invalid connstr");
if let Some(storage_auth_token) = &spec.storage_auth_token {
config.password(storage_auth_token.clone());
}
config
})
.collect::<Vec<_>>()
};
let result =
try_acquire_lsn_lease(&connstrings, auth.as_deref(), tenant_id, timeline_id, lsn);
let result = try_acquire_lsn_lease(tenant_id, timeline_id, lsn, &configs);
match result {
Ok(Some(res)) => {
return Ok(res);
@@ -110,104 +116,68 @@ fn acquire_lsn_lease_with_retry(
}
}
/// Tries to acquire LSN leases on all Pageserver shards.
/// Tries to acquire an LSN lease through PS page_service API.
fn try_acquire_lsn_lease(
connstrings: &str,
auth: Option<&str>,
tenant_id: TenantId,
timeline_id: TimelineId,
lsn: Lsn,
configs: &[postgres::Config],
) -> Result<Option<SystemTime>> {
let connstrings = connstrings.split(',').collect_vec();
let shard_count = connstrings.len();
let mut leases = Vec::new();
for (shard_number, &connstring) in connstrings.iter().enumerate() {
let tenant_shard_id = match shard_count {
0 | 1 => TenantShardId::unsharded(tenant_id),
shard_count => TenantShardId {
tenant_id,
shard_number: ShardNumber(shard_number as u8),
shard_count: ShardCount::new(shard_count as u8),
},
fn get_valid_until(
config: &postgres::Config,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
lsn: Lsn,
) -> Result<Option<SystemTime>> {
let mut client = config.connect(NoTls)?;
let cmd = format!("lease lsn {tenant_shard_id} {timeline_id} {lsn} ");
let res = client.simple_query(&cmd)?;
let msg = match res.first() {
Some(msg) => msg,
None => bail!("empty response"),
};
let row = match msg {
SimpleQueryMessage::Row(row) => row,
_ => bail!("error parsing lsn lease response"),
};
let lease = match PageserverProtocol::from_connstring(connstring)? {
PageserverProtocol::Libpq => {
acquire_lsn_lease_libpq(connstring, auth, tenant_shard_id, timeline_id, lsn)?
}
PageserverProtocol::Grpc => {
acquire_lsn_lease_grpc(connstring, auth, tenant_shard_id, timeline_id, lsn)?
}
};
leases.push(lease);
// Note: this will be None if a lease is explicitly not granted.
let valid_until_str = row.get("valid_until");
let valid_until = valid_until_str.map(|s| {
SystemTime::UNIX_EPOCH
.checked_add(Duration::from_millis(u128::from_str(s).unwrap() as u64))
.expect("Time larger than max SystemTime could handle")
});
Ok(valid_until)
}
Ok(leases.into_iter().min().flatten())
}
let shard_count = configs.len();
/// Acquires an LSN lease on a single shard, using the libpq API. The connstring must use a
/// postgresql:// scheme.
fn acquire_lsn_lease_libpq(
connstring: &str,
auth: Option<&str>,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
lsn: Lsn,
) -> Result<Option<SystemTime>> {
let mut config = postgres::Config::from_str(connstring)?;
if let Some(auth) = auth {
config.password(auth);
}
let mut client = config.connect(NoTls)?;
let cmd = format!("lease lsn {tenant_shard_id} {timeline_id} {lsn} ");
let res = client.simple_query(&cmd)?;
let msg = match res.first() {
Some(msg) => msg,
None => bail!("empty response"),
};
let row = match msg {
SimpleQueryMessage::Row(row) => row,
_ => bail!("error parsing lsn lease response"),
let valid_until = if shard_count > 1 {
configs
.iter()
.enumerate()
.map(|(shard_number, config)| {
let tenant_shard_id = TenantShardId {
tenant_id,
shard_count: ShardCount::new(shard_count as u8),
shard_number: ShardNumber(shard_number as u8),
};
get_valid_until(config, tenant_shard_id, timeline_id, lsn)
})
.collect::<Result<Vec<Option<SystemTime>>>>()?
.into_iter()
.min()
.unwrap()
} else {
get_valid_until(
&configs[0],
TenantShardId::unsharded(tenant_id),
timeline_id,
lsn,
)?
};
// Note: this will be None if a lease is explicitly not granted.
let valid_until_str = row.get("valid_until");
let valid_until = valid_until_str.map(|s| {
SystemTime::UNIX_EPOCH
.checked_add(Duration::from_millis(u128::from_str(s).unwrap() as u64))
.expect("Time larger than max SystemTime could handle")
});
Ok(valid_until)
}
/// Acquires an LSN lease on a single shard, using the gRPC API. The connstring must use a
/// grpc:// scheme.
fn acquire_lsn_lease_grpc(
connstring: &str,
auth: Option<&str>,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
lsn: Lsn,
) -> Result<Option<SystemTime>> {
tokio::runtime::Handle::current().block_on(async move {
let mut client = page_api::Client::new(
connstring.to_string(),
tenant_shard_id.tenant_id,
timeline_id,
tenant_shard_id.to_index(),
auth.map(String::from),
None,
)
.await?;
let req = page_api::LeaseLsnRequest { lsn };
match client.lease_lsn(req).await {
Ok(expires) => Ok(Some(expires)),
// Lease couldn't be acquired because the LSN has been garbage collected.
Err(err) if err.code() == tonic::Code::FailedPrecondition => Ok(None),
Err(err) => Err(err.into()),
}
})
}

View File

@@ -4,10 +4,8 @@ use std::path::Path;
use std::process::Command;
use std::time::Duration;
use std::{fs::OpenOptions, io::Write};
use url::{Host, Url};
use anyhow::{Context, Result, anyhow};
use hostname_validator;
use tracing::{error, info, instrument, warn};
const POSTGRES_LOGS_CONF_PATH: &str = "/etc/rsyslog.d/postgres_logs.conf";
@@ -84,84 +82,18 @@ fn restart_rsyslog() -> Result<()> {
Ok(())
}
fn parse_audit_syslog_address(
remote_plain_endpoint: &str,
remote_tls_endpoint: &str,
) -> Result<(String, u16, String)> {
let tls;
let remote_endpoint = if !remote_tls_endpoint.is_empty() {
tls = "true".to_string();
remote_tls_endpoint
} else {
tls = "false".to_string();
remote_plain_endpoint
};
// Urlify the remote_endpoint, so parsing can be done with url::Url.
let url_str = format!("http://{remote_endpoint}");
let url = Url::parse(&url_str).map_err(|err| {
anyhow!("Error parsing {remote_endpoint}, expected host:port, got {err:?}")
})?;
let is_valid = url.scheme() == "http"
&& url.path() == "/"
&& url.query().is_none()
&& url.fragment().is_none()
&& url.username() == ""
&& url.password().is_none();
if !is_valid {
return Err(anyhow!(
"Invalid address format {remote_endpoint}, expected host:port"
));
}
let host = match url.host() {
Some(Host::Domain(h)) if hostname_validator::is_valid(h) => h.to_string(),
Some(Host::Ipv4(ip4)) => ip4.to_string(),
Some(Host::Ipv6(ip6)) => ip6.to_string(),
_ => return Err(anyhow!("Invalid host")),
};
let port = url
.port()
.ok_or_else(|| anyhow!("Invalid port in {remote_endpoint}"))?;
Ok((host, port, tls))
}
fn generate_audit_rsyslog_config(
log_directory: String,
endpoint_id: &str,
project_id: &str,
remote_syslog_host: &str,
remote_syslog_port: u16,
remote_syslog_tls: &str,
) -> String {
format!(
include_str!("config_template/compute_audit_rsyslog_template.conf"),
log_directory = log_directory,
endpoint_id = endpoint_id,
project_id = project_id,
remote_syslog_host = remote_syslog_host,
remote_syslog_port = remote_syslog_port,
remote_syslog_tls = remote_syslog_tls
)
}
pub fn configure_audit_rsyslog(
log_directory: String,
endpoint_id: &str,
project_id: &str,
remote_endpoint: &str,
remote_tls_endpoint: &str,
) -> Result<()> {
let (remote_syslog_host, remote_syslog_port, remote_syslog_tls) =
parse_audit_syslog_address(remote_endpoint, remote_tls_endpoint).unwrap();
let config_content = generate_audit_rsyslog_config(
log_directory,
endpoint_id,
project_id,
&remote_syslog_host,
remote_syslog_port,
&remote_syslog_tls,
let config_content: String = format!(
include_str!("config_template/compute_audit_rsyslog_template.conf"),
log_directory = log_directory,
endpoint_id = endpoint_id,
project_id = project_id,
remote_endpoint = remote_endpoint
);
info!("rsyslog config_content: {}", config_content);
@@ -326,8 +258,6 @@ pub fn launch_pgaudit_gc(log_directory: String) {
mod tests {
use crate::rsyslog::PostgresLogsRsyslogConfig;
use super::{generate_audit_rsyslog_config, parse_audit_syslog_address};
#[test]
fn test_postgres_logs_config() {
{
@@ -357,146 +287,4 @@ mod tests {
assert!(res.is_err());
}
}
#[test]
fn test_parse_audit_syslog_address() {
{
// host:port format (plaintext)
let parsed = parse_audit_syslog_address("collector.host.tld:5555", "");
assert!(parsed.is_ok());
assert_eq!(
parsed.unwrap(),
(
String::from("collector.host.tld"),
5555,
String::from("false")
)
);
}
{
// host:port format with ipv4 ip address (plaintext)
let parsed = parse_audit_syslog_address("10.0.0.1:5555", "");
assert!(parsed.is_ok());
assert_eq!(
parsed.unwrap(),
(String::from("10.0.0.1"), 5555, String::from("false"))
);
}
{
// host:port format with ipv6 ip address (plaintext)
let parsed =
parse_audit_syslog_address("[7e60:82ed:cb2e:d617:f904:f395:aaca:e252]:5555", "");
assert_eq!(
parsed.unwrap(),
(
String::from("7e60:82ed:cb2e:d617:f904:f395:aaca:e252"),
5555,
String::from("false")
)
);
}
{
// Only TLS host:port defined
let parsed = parse_audit_syslog_address("", "tls.host.tld:5556");
assert_eq!(
parsed.unwrap(),
(String::from("tls.host.tld"), 5556, String::from("true"))
);
}
{
// tls host should take precedence, when both defined
let parsed = parse_audit_syslog_address("plaintext.host.tld:5555", "tls.host.tld:5556");
assert_eq!(
parsed.unwrap(),
(String::from("tls.host.tld"), 5556, String::from("true"))
);
}
{
// host without port (plaintext)
let parsed = parse_audit_syslog_address("collector.host.tld", "");
assert!(parsed.is_err());
}
{
// port without host
let parsed = parse_audit_syslog_address(":5555", "");
assert!(parsed.is_err());
}
{
// valid host with invalid port
let parsed = parse_audit_syslog_address("collector.host.tld:90001", "");
assert!(parsed.is_err());
}
{
// invalid hostname with valid port
let parsed = parse_audit_syslog_address("-collector.host.tld:5555", "");
assert!(parsed.is_err());
}
{
// parse error
let parsed = parse_audit_syslog_address("collector.host.tld:::5555", "");
assert!(parsed.is_err());
}
}
#[test]
fn test_generate_audit_rsyslog_config() {
{
// plaintext version
let log_directory = "/tmp/log".to_string();
let endpoint_id = "ep-test-endpoint-id";
let project_id = "test-project-id";
let remote_syslog_host = "collector.host.tld";
let remote_syslog_port = 5555;
let remote_syslog_tls = "false";
let conf_str = generate_audit_rsyslog_config(
log_directory,
endpoint_id,
project_id,
remote_syslog_host,
remote_syslog_port,
remote_syslog_tls,
);
assert!(conf_str.contains(r#"set $.remote_syslog_tls = "false";"#));
assert!(conf_str.contains(r#"type="omfwd""#));
assert!(conf_str.contains(r#"target="collector.host.tld""#));
assert!(conf_str.contains(r#"port="5555""#));
assert!(conf_str.contains(r#"StreamDriverPermittedPeers="collector.host.tld""#));
}
{
// TLS version
let log_directory = "/tmp/log".to_string();
let endpoint_id = "ep-test-endpoint-id";
let project_id = "test-project-id";
let remote_syslog_host = "collector.host.tld";
let remote_syslog_port = 5556;
let remote_syslog_tls = "true";
let conf_str = generate_audit_rsyslog_config(
log_directory,
endpoint_id,
project_id,
remote_syslog_host,
remote_syslog_port,
remote_syslog_tls,
);
assert!(conf_str.contains(r#"set $.remote_syslog_tls = "true";"#));
assert!(conf_str.contains(r#"type="omfwd""#));
assert!(conf_str.contains(r#"target="collector.host.tld""#));
assert!(conf_str.contains(r#"port="5556""#));
assert!(conf_str.contains(r#"StreamDriverPermittedPeers="collector.host.tld""#));
}
}
}

View File

@@ -3,8 +3,7 @@
"timestamp": "2021-05-23T18:25:43.511Z",
"operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8b",
"suspend_timeout_seconds": 3600,
"cluster": {
"cluster_id": "test-cluster-42",
"name": "Zenith Test",

View File

@@ -16,9 +16,9 @@ use std::time::Duration;
use anyhow::{Context, Result, anyhow, bail};
use clap::Parser;
use compute_api::requests::ComputeClaimsScope;
use compute_api::spec::{ComputeMode, PageserverProtocol};
use compute_api::spec::ComputeMode;
use control_plane::broker::StorageBroker;
use control_plane::endpoint::{ComputeControlPlane, EndpointTerminateMode};
use control_plane::endpoint::{ComputeControlPlane, EndpointTerminateMode, PageserverProtocol};
use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage};
use control_plane::local_env;
use control_plane::local_env::{
@@ -64,9 +64,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
const DEFAULT_BRANCH_NAME: &str = "main";
project_git_version!(GIT_VERSION);
#[allow(dead_code)]
const DEFAULT_PG_VERSION: PgMajorVersion = PgMajorVersion::PG17;
const DEFAULT_PG_VERSION_NUM: &str = "17";
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";
@@ -169,7 +167,7 @@ struct TenantCreateCmdArgs {
#[clap(short = 'c')]
config: Vec<String>,
#[arg(default_value = DEFAULT_PG_VERSION_NUM)]
#[arg(default_value_t = DEFAULT_PG_VERSION)]
#[clap(long, help = "Postgres version to use for the initial timeline")]
pg_version: PgMajorVersion,
@@ -292,7 +290,7 @@ struct TimelineCreateCmdArgs {
#[clap(long, help = "Human-readable alias for the new timeline")]
branch_name: String,
#[arg(default_value = DEFAULT_PG_VERSION_NUM)]
#[arg(default_value_t = DEFAULT_PG_VERSION)]
#[clap(long, help = "Postgres version")]
pg_version: PgMajorVersion,
}
@@ -324,7 +322,7 @@ struct TimelineImportCmdArgs {
#[clap(long, help = "Lsn the basebackup ends at")]
end_lsn: Option<Lsn>,
#[arg(default_value = DEFAULT_PG_VERSION_NUM)]
#[arg(default_value_t = DEFAULT_PG_VERSION)]
#[clap(long, help = "Postgres version of the backup being imported")]
pg_version: PgMajorVersion,
}
@@ -603,7 +601,7 @@ struct EndpointCreateCmdArgs {
)]
config_only: bool,
#[arg(default_value = DEFAULT_PG_VERSION_NUM)]
#[arg(default_value_t = DEFAULT_PG_VERSION)]
#[clap(long, help = "Postgres version")]
pg_version: PgMajorVersion,
@@ -1651,9 +1649,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
// If --safekeepers argument is given, use only the listed
// safekeeper nodes; otherwise all from the env.
let safekeepers = parse_safekeepers(&args.safekeepers)?;
endpoint
.reconfigure(Some(pageservers), None, safekeepers, None)
.await?;
endpoint.reconfigure(pageservers, None, safekeepers).await?;
}
EndpointCmd::Stop(args) => {
let endpoint_id = &args.endpoint_id;

View File

@@ -56,8 +56,8 @@ use compute_api::responses::{
TlsConfig,
};
use compute_api::spec::{
Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PageserverProtocol,
PgIdent, RemoteExtSpec, Role,
Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent,
RemoteExtSpec, Role,
};
use jsonwebtoken::jwk::{
AlgorithmParameters, CommonParameters, EllipticCurve, Jwk, JwkSet, KeyAlgorithm, KeyOperations,
@@ -373,6 +373,29 @@ impl std::fmt::Display for EndpointTerminateMode {
}
}
/// Protocol used to connect to a Pageserver.
#[derive(Clone, Copy, Debug)]
pub enum PageserverProtocol {
Libpq,
Grpc,
}
impl PageserverProtocol {
/// Returns the URL scheme for the protocol, used in connstrings.
pub fn scheme(&self) -> &'static str {
match self {
Self::Libpq => "postgresql",
Self::Grpc => "grpc",
}
}
}
impl Display for PageserverProtocol {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(self.scheme())
}
}
impl Endpoint {
fn from_dir_entry(entry: std::fs::DirEntry, env: &LocalEnv) -> Result<Endpoint> {
if !entry.file_type()?.is_dir() {
@@ -780,7 +803,6 @@ impl Endpoint {
endpoint_storage_addr: Some(endpoint_storage_addr),
endpoint_storage_token: Some(endpoint_storage_token),
autoprewarm: false,
suspend_timeout_seconds: -1, // Only used in neon_local.
};
// this strange code is needed to support respec() in tests
@@ -975,11 +997,12 @@ impl Endpoint {
pub async fn reconfigure(
&self,
pageservers: Option<Vec<(PageserverProtocol, Host, u16)>>,
pageservers: Vec<(PageserverProtocol, Host, u16)>,
stripe_size: Option<ShardStripeSize>,
safekeepers: Option<Vec<NodeId>>,
safekeeper_generation: Option<SafekeeperGeneration>,
) -> Result<()> {
anyhow::ensure!(!pageservers.is_empty(), "no pageservers provided");
let (mut spec, compute_ctl_config) = {
let config_path = self.endpoint_path().join("config.json");
let file = std::fs::File::open(config_path)?;
@@ -991,24 +1014,16 @@ impl Endpoint {
let postgresql_conf = self.read_postgresql_conf()?;
spec.cluster.postgresql_conf = Some(postgresql_conf);
// If pageservers are not specified, don't change them.
if let Some(pageservers) = pageservers {
anyhow::ensure!(!pageservers.is_empty(), "no pageservers provided");
let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
spec.pageserver_connstring = Some(pageserver_connstr);
if stripe_size.is_some() {
spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
}
let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
spec.pageserver_connstring = Some(pageserver_connstr);
if stripe_size.is_some() {
spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
}
// If safekeepers are not specified, don't change them.
if let Some(safekeepers) = safekeepers {
let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?;
spec.safekeeper_connstrings = safekeeper_connstrings;
if let Some(g) = safekeeper_generation {
spec.safekeepers_generation = Some(g.into_inner());
}
}
let client = reqwest::Client::builder()
@@ -1046,24 +1061,6 @@ impl Endpoint {
}
}
pub async fn reconfigure_pageservers(
&self,
pageservers: Vec<(PageserverProtocol, Host, u16)>,
stripe_size: Option<ShardStripeSize>,
) -> Result<()> {
self.reconfigure(Some(pageservers), stripe_size, None, None)
.await
}
pub async fn reconfigure_safekeepers(
&self,
safekeepers: Vec<NodeId>,
generation: SafekeeperGeneration,
) -> Result<()> {
self.reconfigure(None, None, Some(safekeepers), Some(generation))
.await
}
pub async fn stop(
&self,
mode: EndpointTerminateMode,

View File

@@ -12,7 +12,6 @@ use std::{env, fs};
use anyhow::{Context, bail};
use clap::ValueEnum;
use pageserver_api::config::PostHogConfig;
use pem::Pem;
use postgres_backend::AuthType;
use reqwest::{Certificate, Url};
@@ -212,9 +211,7 @@ pub struct NeonStorageControllerConf {
pub use_local_compute_notifications: bool,
pub timeline_safekeeper_count: Option<usize>,
pub posthog_config: Option<PostHogConfig>,
pub timeline_safekeeper_count: Option<i64>,
pub kick_secondary_downloads: Option<bool>,
}
@@ -248,7 +245,6 @@ impl Default for NeonStorageControllerConf {
use_https_safekeeper_api: false,
use_local_compute_notifications: true,
timeline_safekeeper_count: None,
posthog_config: None,
kick_secondary_downloads: None,
}
}

View File

@@ -638,28 +638,10 @@ impl StorageController {
args.push("--timelines-onto-safekeepers".to_string());
}
// neon_local is used in test environments where we often have less than 3 safekeepers.
if self.config.timeline_safekeeper_count.is_some() || self.env.safekeepers.len() < 3 {
let sk_cnt = self
.config
.timeline_safekeeper_count
.unwrap_or(self.env.safekeepers.len());
if let Some(sk_cnt) = self.config.timeline_safekeeper_count {
args.push(format!("--timeline-safekeeper-count={sk_cnt}"));
}
let mut envs = vec![
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
];
if let Some(posthog_config) = &self.config.posthog_config {
envs.push((
"POSTHOG_CONFIG".to_string(),
serde_json::to_string(posthog_config)?,
));
}
println!("Starting storage controller");
background_process::start_process(
@@ -667,7 +649,10 @@ impl StorageController {
&instance_dir,
&self.env.storage_controller_bin(),
args,
envs,
vec![
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
],
background_process::InitialPidFile::Create(self.pid_file(start_args.instance_id)),
&start_args.start_timeout,
|| async {

View File

@@ -4,7 +4,6 @@
"timestamp": "2022-10-12T18:00:00.000Z",
"operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c",
"suspend_timeout_seconds": -1,
"cluster": {
"cluster_id": "docker_compose",

View File

@@ -1,396 +0,0 @@
# Memo: Endpoint Persistent Unlogged Files Storage
Created on 2024-11-05
Implemented on N/A
## Summary
A design for a storage system that allows storage of files required to make
Neon's Endpoints have a better experience at or after a reboot.
## Motivation
Several systems inside PostgreSQL (and Neon) need some persistent storage for
optimal workings across reboots and restarts, but still work without.
Examples are the query-level statistics files of `pg_stat_statements` in
`pg_stat/pg_stat_statements.stat`, and `pg_prewarm`'s `autoprewarm.blocks`.
We need a storage system that can store and manage these files for each
Endpoint, without necessarily granting users access to an unlimited storage
device.
## Goals
- Store known files for Endpoints with reasonable persistence.
_Data loss in this service, while annoying and bad for UX, won't lose any
customer's data._
## Non Goals (if relevant)
- This storage system does not need branching, file versioning, or other such
features. The files are as ephemeral to the timeline of the data as the
Endpoints that host the data.
- This storage system does not need to store _all_ user files, only 'known'
user files.
- This storage system does not need to be hosted fully inside Computes.
_Instead, this will be a separate component similar to Pageserver,
SafeKeeper, the S3 proxy used for dynamically loaded extensions, etc._
## Impacted components
- Compute needs new code to load and store these files in its lifetime.
- Control Plane needs to consider this new storage system when signalling
the deletion of an Endpoint, Timeline, or Tenant.
- Control Plane needs to consider this new storage system when it resets
or re-assigns an endpoint's timeline/branch state.
A new service is created: the Endpoint Persistent Unlogged Files Storage
service. This could be integrated in e.g. Pageserver or Control Plane, or a
separately hosted service.
## Proposed implementation
Endpoint-related data files are managed by a newly designed service (which
optionally is integrated in an existing service like Pageserver or Control
Plane), which stores data directly into S3 or any blob storage of choice.
Upon deletion of the Endpoint, or reassignment of the endpoint to a different
branch, this ephemeral data is dropped: the data stored may not match the
state of the branch's data after reassignment, and on endpoint deletion the
data won't have any use to the user.
Compute gets credentials (JWT token with Tenant, Timeline & Endpoint claims)
which it can use to authenticate to this new service and retrieve and store
data associated with this endpoint. This limited scope reduces leaks of data
across endpoints and timeline resets, and limits the ability of endpoints to
mess with other endpoints' data.
The path of this endpoint data in S3 is initially as follows:
s3://<regional-epufs-bucket>/
tenants/
<hex-tenant-id>/
tenants/
<hex-timeline-id>/
endpoints/
<endpoint-id>/
pgdata/
<file_path_in_pgdatadir>
For other blob storages an equivalent or similar path can be constructed.
### Reliability, failure modes and corner cases (if relevant)
Reliability is important, but not critical to the workings of Neon. The data
stored in this service will, when lost, reduce performance, but won't be a
cause of permanent data loss - only operational metadata is stored.
Most, if not all, blob storage services have sufficiently high persistence
guarantees to cater our need for persistence and uptime. The only concern with
blob storages is that the access latency is generally higher than local disk,
but for the object types stored (cache state, ...) I don't think this will be
much of an issue.
### Interaction/Sequence diagram (if relevant)
In these diagrams you can replace S3 with any persistent storage device of
choice, but S3 is chosen as representative name: The well-known and short name
of AWS' blob storage. Azure Blob Storage should work too, but it has a much
longer name making it less practical for the diagrams.
Write data:
```http
POST /tenants/<tenant-id>/timelines/<tl-id>/endpoints/<endpoint-id>/pgdata/<the-pgdata-path>
Host: epufs.svc.neon.local
<<<
200 OK
{
"version": "<opaque>", # opaque file version token, changes when the file contents change
"size": <bytes>,
}
```
```mermaid
sequenceDiagram
autonumber
participant co as Compute
participant ep as EPUFS
participant s3 as Blob Storage
co-->ep: Connect with credentials
co->>+ep: Store Unlogged Persistent File
opt is authenticated
ep->>s3: Write UPF to S3
end
ep->>-co: OK / Failure / Auth Failure
co-->ep: Cancel connection
```
Read data: (optional with cache-relevant request parameters, e.g. If-Modified-Since)
```http
GET /tenants/<tenant-id>/timelines/<tl-id>/endpoints/<endpoint-id>/pgdata/<the-pgdata-path>
Host: epufs.svc.neon.local
<<<
200 OK
<file data>
```
```mermaid
sequenceDiagram
autonumber
participant co as Compute
participant ep as EPUFS
participant s3 as Blob Storage
co->>+ep: Read Unlogged Persistent File
opt is authenticated
ep->>+s3: Request UPF from storage
s3->>-ep: Receive UPF from storage
end
ep->>-co: OK(response) / Failure(storage, auth, ...)
```
Compute Startup:
```mermaid
sequenceDiagram
autonumber
participant co as Compute
participant ps as Pageserver
participant ep as EPUFS
participant es as Extension server
note over co: Bind endpoint ep-xxx
par Get basebackup
co->>+ps: Request basebackup @ LSN
ps-)ps: Construct basebackup
ps->>-co: Receive basebackup TAR @ LSN
and Get startup-critical Unlogged Persistent Files
co->>+ep: Get all UPFs of endpoint ep-xxx
ep-)ep: Retrieve and gather all UPFs
ep->>-co: TAR of UPFs
and Get startup-critical extensions
loop For every startup-critical extension
co->>es: Get critical extension
es->>co: Receive critical extension
end
end
note over co: Start compute
```
CPlane ops:
```http
DELETE /tenants/<tenant-id>/timelines/<timeline-id>/endpoints/<endpoint-id>
Host: epufs.svc.neon.local
<<<
200 OK
{
"tenant": "<tenant-id>",
"timeline": "<timeline-id>",
"endpoint": "<endpoint-id>",
"deleted": {
"files": <count>,
"bytes": <count>,
},
}
```
```http
DELETE /tenants/<tenant-id>/timelines/<timeline-id>
Host: epufs.svc.neon.local
<<<
200 OK
{
"tenant": "<tenant-id>",
"timeline": "<timeline-id>",
"deleted": {
"files": <count>,
"bytes": <count>,
},
}
```
```http
DELETE /tenants/<tenant-id>
Host: epufs.svc.neon.local
<<<
200 OK
{
"tenant": "<tenant-id>",
"deleted": {
"files": <count>,
"bytes": <count>,
},
}
```
```mermaid
sequenceDiagram
autonumber
participant cp as Control Plane
participant ep as EPUFS
participant s3 as Blob Storage
alt Tenant deleted
cp-)ep: Tenant deleted
loop For every object associated with removed tenant
ep->>s3: Remove data of deleted tenant from Storage
end
opt
ep-)cp: Tenant cleanup complete
end
alt Timeline deleted
cp-)ep: Timeline deleted
loop For every object associated with removed timeline
ep->>s3: Remove data of deleted timeline from Storage
end
opt
ep-)cp: Timeline cleanup complete
end
else Endpoint reassigned or removed
cp->>+ep: Endpoint reassigned
loop For every object associated with reassigned/removed endpoint
ep->>s3: Remove data from Storage
end
ep->>-cp: Cleanup complete
end
```
### Scalability (if relevant)
Provisionally: As this service is going to be part of compute startup, this
service should be able to quickly respond to all requests. Therefore this
service is deployed to every AZ we host Computes in, and Computes communicate
(generally) only to the EPUFS endpoint of the AZ they're hosted in.
Local caching of frequently restarted endpoints' data or metadata may be
needed for best performance. However, due to the regional nature of stored
data but zonal nature of the service deployment, we should be careful when we
implement any local caching, as it is possible that computes in AZ 1 will
update data originally written and thus cached by AZ 2. Cache version tests
and invalidation is therefore required if we want to roll out caching to this
service, which is too broad a scope for an MVC. This is why caching is left
out of scope for this RFC, and should be considered separately after this RFC
is implemented.
### Security implications (if relevant)
This service must be able to authenticate users at least by Tenant ID,
Timeline ID and Endpoint ID. This will use the existing JWT infrastructure of
Compute, which will be upgraded to the extent needed to support Timeline- and
Endpoint-based claims.
The service requires unlimited access to (a prefix of) a blob storage bucket,
and thus must be hosted outside the Compute VM sandbox.
A service that generates pre-signed request URLs for Compute to download the
data from that URL is likely problematic, too: Compute would be able to write
unlimited data to the bucket, or exfiltrate this signed URL to get read/write
access to specific objects in this bucket, which would still effectively give
users access to the S3 bucket (but with improved access logging).
There may be a use case for transferring data associated with one endpoint to
another endpoint (e.g. to make one endpoint warm its caches with the state of
another endpoint), but that's not currently in scope, and specific needs may
be solved through out-of-line communication of data or pre-signed URLs.
### Unresolved questions (if relevant)
Caching of files is not in the implementation scope of the document, but
should at some future point be considered to maximize performance.
## Alternative implementation (if relevant)
Several ideas have come up to solve this issue:
### Use AUXfile
One prevalent idea was to WAL-log the files using our AUXfile mechanism.
Benefits:
+ We already have this storage mechanism
Demerits:
- It isn't available on read replicas
- Additional WAL will be consumed during shutdown and after the shutdown
checkpoint, which needs PG modifications to work without panics.
- It increases the data we need to manage in our versioned storage, thus
causing higher storage costs with higher retention due to duplication at
the storage layer.
### Sign URLs for read/write operations, instead of proxying them
Benefits:
+ The service can be implemented with a much reduced IO budget
Demerits:
- Users could get access to these signed credentials
- Not all blob storage services may implement URL signing
### Give endpoints each their own directly accessed block volume
Benefits:
+ Easier to integrate for PostgreSQL
Demerits:
- Little control on data size and contents
- Potentially problematic as we'd need to store data all across the pgdata
directory.
- EBS is not a good candidate
- Attaches in 10s of seconds, if not more; i.e. too cold to start
- Shared EBS volumes are a no-go, as you'd have to schedule the endpoint
with users of the same EBS volumes, which can't work with VM migration
- EBS storage costs are very high (>80$/kilotenant when using a
volume/tenant)
- EBS volumes can't be mounted across AZ boundaries
- Bucket per endpoint is unfeasible
- S3 buckets are priced at $20/month per 1k, which we could better spend
on developers.
- Allocating service accounts takes time (100s of ms), and service accounts
are a limited resource, too; so they're not a good candidate to allocate
on a per-endpoint basis.
- Giving credentials limited to prefix has similar issues as the pre-signed
URL approach.
- Bucket DNS lookup will fill DNS caches and put pressure on DNS lookup
much more than our current systems would.
- Volumes bound by hypervisor are unlikely
- This requires significant investment and increased software on the
hypervisor.
- It is unclear if we can attach volumes after boot, i.e. for pooled
instances.
### Put the files into a table
Benefits:
+ Mostly already available in PostgreSQL
Demerits:
- Uses WAL
- Can't be used after shutdown checkpoint
- Needs a RW endpoint, and table & catalog access to write to this data
- Gets hit with DB size limitations
- Depending on user acces:
- Inaccessible:
The user doesn't have control over database size caused by
these systems.
- Accessible:
The user can corrupt these files and cause the system to crash while
user-corrupted files are present, thus increasing on-call overhead.
## Definition of Done (if relevant)
This project is done if we have:
- One S3 bucket equivalent per region, which stores this per-endpoint data.
- A new service endpoint in at least every AZ, which indirectly grants
endpoints access to the data stored for these endpoints in these buckets.
- Compute writes & reads temp-data at shutdown and startup, respectively, for
at least the pg_prewarm or lfc_prewarm state files.
- Cleanup of endpoint data is triggered when the endpoint is deleted or is
detached from its current timeline.

View File

@@ -1,179 +0,0 @@
# Storage Feature Flags
In this RFC, we will describe how we will implement per-tenant feature flags.
## PostHog as Feature Flag Service
Before we start, let's talk about how current feature flag services work. PostHog is the feature flag service we are currently using across multiple user-facing components in the company. PostHog has two modes of operation: HTTP evaluation and server-side local evaluation.
Let's assume we have a storage feature flag called gc-compaction and we want to roll it out to scale-tier users with resident size >= 10GB and <= 100GB.
### Define User Profiles
The first step is to synchronize our user profiles to the PostHog service. We can simply assume that each tenant is a user in PostHog. Each user profile has some properties associated with it. In our case, it will be: plan type (free, scale, enterprise, etc); resident size (in bytes); primary pageserver (string); region (string).
### Define Feature Flags
We would create a feature flag called gc-compaction in PostHog with 4 variants: disabled, stage-1, stage-2, fully-enabled. We will flip the feature flags from disabled to fully-enabled stage by stage for some percentage of our users.
### Option 1: HTTP Evaluation Mode
When using PostHog's HTTP evaluation mode, the client will make request to the PostHog service, asking for the value of a feature flag for a specific user.
* Control plane will report the plan type to PostHog each time it attaches a tenant to the storcon or when the user upgrades/downgrades. It calls the PostHog profile API to associate tenant ID with the plan type. Assume we have X active tenants and such attach or plan change event happens each week, that would be 4X profile update requests per month.
* Pageservers will report the resident size and the primary pageserver to the PostHog service. Assume we report resident size every 24 hours, that would be 30X requests per month.
* Each tenant will request the state of the feature flag every 1 hour, that's 720X requests per month.
* The Rust client would be easy to implement as we only need to call the `/decide` API on PostHog.
Using the HTTP evaluation mode we will issue 754X requests a month.
### Option 2: Local Evaluation Mode
When using PostHog's HTTP evaluation mode, the client (usually the server in a browser/server architecture) will poll the feature flag configuration every 30s (default in the Python client) from PostHog. Such configuration contains data like:
<details>
<summary>Example JSON response from the PostHog local evaluation API</summary>
```
[
{
"id": 1,
"name": "Beta Feature",
"key": "person-flag",
"is_simple_flag": True,
"active": True,
"filters": {
"groups": [
{
"properties": [
{
"key": "location",
"operator": "exact",
"value": ["Straße"],
"type": "person",
}
],
"rollout_percentage": 100,
},
{
"properties": [
{
"key": "star",
"operator": "exact",
"value": ["ſun"],
"type": "person",
}
],
"rollout_percentage": 100,
},
],
},
}
]
```
</details>
Note that the API only contains information like "under what condition => rollout percentage". The user is responsible to provide the properties required to the client for local evaluation, and the PostHog service (web UI) cannot know if a feature is enabled for the tenant or not until the client uses the `capture` API to report the result back. To control the rollout percentage, the user ID gets mapped to a float number in `[0, 1)` on a consistent hash ring. All values <= the percentage will get the feature enabled or set to the desired value.
To use the local evaluation mode, the system needs:
* Assume each pageserver will poll PostHog for the local evaluation JSON every 5 minutes (instead of the 30s default as it's too frequent). That's 8640Y per month, Y is the number of pageservers. Local evaluation requests cost 10x more than the normal decide request, so that's 86400Y request units to bill.
* Storcon needs to store the plan type in the database and pass that information to the pageserver when attaching the tenant.
* Storcon also needs to update PostHog with the active tenants, for example, when the tenant gets detached/attached. Assume each active tenant gets detached/attached every week, that would be 4X requests per month.
* We do not need to update bill type or resident size to PostHog as all these are evaluated locally.
* After each local evaluation of the feature flag, we need to call PostHog's capture event API to update the result of the evaluation that the feature is enabled. We can do this when the flag gets changed compared with the last cached state in memory. That would be at least 4X (assume we do deployment every week so the cache gets cleared) and maybe an additional multiplifier of 10 assume we have 10 active features.
In this case, we will issue 86400Y + 40X requests per month.
Assume X = 1,000,000 and Y = 100,
| | HTTP Evaluation | Local Evaluation |
|---|---|---|
| Latency of propagating the conditions/properties for feature flag | 24 hours | available locally |
| Latency of applying the feature flag | 1 hour | 5 minutes |
| Can properties be reported from different services | Yes | No |
| Do we need to sync billing info etc to pageserver | No | Yes |
| Cost | 75400$ / month | 4864$ / month |
# Our Solution
We will use PostHog _only_ as an UI to configure the feature flags. Whether a feature is enabled or not can only be queried through storcon/pageserver instead of using the PostHog UI. (We could report it back to PostHog via `capture_event` but it costs $$$.) This allows us to ramp up the feature flag functionality fast at first. At the same time, it would also give us the option to migrate to our own solution once we want to have more properties and more complex evaluation rules in our system.
* We will create several fake users (tenants) in PostHog that contains all the properties we will use for evaluating a feature flag (i.e., resident size, billing type, pageserver id, etc.)
* We will use PostHog's local evaluation API to poll the configuration of the feature flags and evaluate them locally on each of the pageserver.
* The evaluation result will not be reported back to PostHog.
* Storcon needs to pull some information from cplane database.
* To know if a feature is currently enabled or not, we need to call the storcon/pageserver API; and we won't be able to know if a feature has been enabled on a tenant before easily: we need to look at the Grafana logs.
We only need to pay for the 86400Y local evaluation requests (that would be setting Y=0 in solution 2 => $864/month, and even less if we proxy it through storcon).
## Implementation
* Pageserver: implement a PostHog local evaluation client. The client will be shared across all tenants on the pageserver with a single API: `evaluate(tenant_id, feature_flag, properties) -> json`.
* Storcon: if we need plan type as the evaluation condition, pull it from cplane database.
* Storcon/Pageserver: implement an HTTP API `:tenant_id/feature/:feature` to retrieve the current feature flag status.
* Storcon/Pageserver: a loop to update the feature flag spec on both storcon and pageserver. Pageserver loop will only be activated if storcon does not push the specs to the pageserver.
## Difference from Tenant Config
* Feature flags can be modified by percentage, and the default config for each feature flag can be modified in UI without going through the release process.
* Feature flags are more flexible and won't be persisted anywhere and will be passed as plain JSON over the wire so that do not need to handle backward/forward compatibility as in tenant config.
* The expectation of tenant config is that once we add a flag we cannot remove it (or it will be hard to remove), but feature flags are more flexible.
# Final Implementation
* We added a new crate `posthog_lite_client` that supports local feature evaluations.
* We set up two projects "Storage (staging)" and "Storage (production)" in the PostHog console.
* Each pageserver reports 10 fake tenants to PostHog so that we can get all combinations of regions (and other properties) in the PostHog UI.
* Supported properties: AZ, neon_region, pageserver, tenant_id.
* You may use "Pageserver Feature Flags" dashboard to see the evaluation status.
* The feature flag spec is polled on storcon every 30s (in each of the region) and storcon will propagate the spec to the pageservers.
* The pageserver housekeeping loop updates the tenant-specific properties (e.g., remote size) for evaluation.
Each tenant has a `feature_resolver` object. After you add a feature flag in the PostHog console, you can retrieve it with:
```rust
// Boolean flag
self
.feature_resolver
.evaluate_boolean("flag")
.is_ok()
// Multivariate flag
self
.feature_resolver
.evaluate_multivariate("gc-comapction-strategy")
.ok();
```
The user needs to handle the case where the evaluation result is an error. This can occur in a variety of cases:
* During the pageserver start, the feature flag spec has not been retrieved.
* No condition group is matched.
* The feature flag spec contains an operand/operation not supported by the lite PostHog library.
For boolean flags, the return value is `Result<(), Error>`. `Ok(())` means the flag is evaluated to true. Otherwise,
there is either an error in evaluation or it does not match any groups.
For multivariate flags, the return value is `Result<String, Error>`. `Ok(variant)` indicates the flag is evaluated
to a variant. Otherwise, there is either an error in evaluation or it does not match any groups.
The evaluation logic is documented in the PostHog lite library. It compares the consistent hash of a flag key + tenant_id
with the rollout percentage and determines which tenant to roll out a specific feature.
Users can use the feature flag evaluation API to get the flag evaluation result of a specific tenant for debugging purposes.
```
curl http://localhost:9898/v1/tenant/:tenant_id/feature_flag?flag=:key&as=multivariate/boolean"
```
By default, the storcon pushes the feature flag specs to the pageservers every 30 seconds, which means that a change in feature flag in the
PostHog UI will propagate to the pageservers within 30 seconds.
# Future Works
* Support dynamic tenant properties like logical size as the evaluation condition.
* Support properties like `plan_type` (needs cplane to pass it down).
* Report feature flag evaluation result back to PostHog (if the cost is okay).
* Fast feature flag evaluation cache on critical paths (e.g., cache a feature flag result in `AtomicBool` and use it on the read path).

View File

@@ -1,399 +0,0 @@
# Compute rolling restart with prewarm
Created on 2025-03-17
Implemented on _TBD_
Author: Alexey Kondratov (@ololobus)
## Summary
This RFC describes an approach to reduce performance degradation due to missing caches after compute node restart, i.e.:
1. Rolling restart of the running instance via 'warm' replica.
2. Auto-prewarm compute caches after unplanned restart or scale-to-zero.
## Motivation
Neon currently implements several features that guarantee high uptime of compute nodes:
1. Storage high-availability (HA), i.e. each tenant shard has a secondary pageserver location, so we can quickly switch over compute to it in case of primary pageserver failure.
2. Fast compute provisioning, i.e. we have a fleet of pre-created empty computes, that are ready to serve workload, so restarting unresponsive compute is very fast.
3. Preemptive NeonVM compute provisioning in case of k8s node unavailability.
This helps us to be well-within the uptime SLO of 99.95% most of the time. Problems begin when we go up to multi-TB workloads and 32-64 CU computes.
During restart, compute loses all caches: LFC, shared buffers, file system cache. Depending on the workload, it can take a lot of time to warm up the caches,
so that performance could be degraded and might be even unacceptable for certain workloads. The latter means that although current approach works well for small to
medium workloads, we still have to do some additional work to avoid performance degradation after restart of large instances.
## Non Goals
- Details of the persistence storage for prewarm data are out of scope, there is a separate RFC for that: <https://github.com/neondatabase/neon/pull/9661>.
- Complete compute/Postgres HA setup and flow. Although it was originally in scope of this RFC, during preliminary research it appeared to be a rabbit hole, so it's worth of a separate RFC.
- Low-level implementation details for Postgres replica-to-primary promotion. There are a lot of things to think and care about: how to start walproposer, [logical replication failover](https://www.postgresql.org/docs/current/logical-replication-failover.html), and so on, but it's worth of at least a separate one-pager design document if not RFC.
## Impacted components
Postgres, compute_ctl, Control plane, Endpoint storage for unlogged storage of compute files.
For the latter, we will need to implement a uniform abstraction layer on top of S3, ABS, etc., but
S3 is used in text interchangeably with 'endpoint storage' for simplicity.
## Proposed implementation
### compute_ctl spec changes and auto-prewarm
We are going to extend the current compute spec with the following attributes
```rust
struct ComputeSpec {
/// [All existing attributes]
...
/// Whether to do auto-prewarm at start or not.
/// Default to `false`.
pub lfc_auto_prewarm: bool
/// Interval in seconds between automatic dumps of
/// LFC state into S3. Default `None`, which means 'off'.
pub lfc_dump_interval_sec: Option<i32>
}
```
When `lfc_dump_interval_sec` is set to `N`, `compute_ctl` will periodically dump the LFC state
and store it in S3, so that it could be used either for auto-prewarm after restart or by replica
during the rolling restart. For enabling periodic dumping, we should consider the following value
`lfc_dump_interval_sec=300` (5 minutes), same as in the upstream's `pg_prewarm.autoprewarm_interval`.
When `lfc_auto_prewarm` is set to `true`, `compute_ctl` will start prewarming the LFC upon restart
iif some of the previous states is present in S3.
### compute_ctl API
1. `POST /store_lfc_state` -- dump LFC state using Postgres SQL interface and store result in S3.
This has to be a blocking call, i.e. it will return only after the state is stored in S3.
If there is any concurrent request in progress, we should return `429 Too Many Requests`,
and let the caller to retry.
2. `GET /dump_lfc_state` -- dump LFC state using Postgres SQL interface and return it as is
in text format suitable for the future restore/prewarm. This API is not strictly needed at
the end state, but could be useful for a faster prototyping of a complete rolling restart flow
with prewarm, as it doesn't require persistent for LFC state storage.
3. `POST /restore_lfc_state` -- restore/prewarm LFC state with request
```yaml
RestoreLFCStateRequest:
oneOf:
- type: object
required:
- lfc_state
properties:
lfc_state:
type: string
description: Raw LFC content dumped with GET `/dump_lfc_state`
- type: object
required:
- lfc_cache_key
properties:
lfc_cache_key:
type: string
description: |
endpoint_id of the source endpoint on the same branch
to use as a 'donor' for LFC content. Compute will look up
LFC content dump in S3 using this key and do prewarm.
```
where `lfc_state` and `lfc_cache_key` are mutually exclusive.
The actual prewarming will happen asynchronously, so the caller need to check the
prewarm status using the compute's standard `GET /status` API.
4. `GET /status` -- extend existing API with following attributes
```rust
struct ComputeStatusResponse {
// [All existing attributes]
...
pub prewarm_state: PrewarmState
}
/// Compute prewarm state. Will be stored in the shared Compute state
/// in compute_ctl
struct PrewarmState {
pub status: PrewarmStatus
/// Total number of pages to prewarm
pub pages_total: i64
/// Number of pages prewarmed so far
pub pages_processed: i64
/// Optional prewarm error
pub error: Option<String>
}
pub enum PrewarmStatus {
/// Prewarming was never requested on this compute
Off,
/// Prewarming was requested, but not started yet
Pending,
/// Prewarming is in progress. The caller should follow
/// `PrewarmState::progress`.
InProgress,
/// Prewarming has been successfully completed
Completed,
/// Prewarming failed. The caller should look at
/// `PrewarmState::error` for the reason.
Failed,
/// It is intended to be used by auto-prewarm if none of
/// the previous LFC states is available in S3.
/// This is a distinct state from the `Failed` because
/// technically it's not a failure and could happen if
/// compute was restart before it dumped anything into S3,
/// or just after the initial rollout of the feature.
Skipped,
}
```
5. `POST /promote` -- this is a **blocking** API call to promote compute replica into primary.
This API should be very similar to the existing `POST /configure` API, i.e. accept the
spec (primary spec, because originally compute was started as replica). It's a distinct
API method because semantics and response codes are different:
- If promotion is done successfully, it will return `200 OK`.
- If compute is already primary, the call will be no-op and `compute_ctl`
will return `412 Precondition Failed`.
- If, for some reason, second request reaches compute that is in progress of promotion,
it will respond with `429 Too Many Requests`.
- If compute hit any permanent failure during promotion `500 Internal Server Error`
will be returned.
### Control plane operations
The complete flow will be present as a sequence diagram in the next section, but here
we just want to list some important steps that have to be done by control plane during
the rolling restart via warm replica, but without much of low-level implementation details.
1. Register the 'intent' of the instance restart, but not yet interrupt any workload at
primary and also accept new connections. This may require some endpoint state machine
changes, e.g. introduction of the `pending_restart` state. Being in this state also
**mustn't prevent any other operations except restart**: suspend, live-reconfiguration
(e.g. due to notify-attach call from the storage controller), deletion.
2. Start new replica compute on the same timeline and start prewarming it. This process
may take quite a while, so the same concurrency considerations as in 1. should be applied
here as well.
3. When warm replica is ready, control plane should:
3.1. Terminate the primary compute. Starting from here, **this is a critical section**,
if anything goes off, the only option is to start the primary normally and proceed
with auto-prewarm.
3.2. Send cache invalidation message to all proxies, notifying them that all new connections
should request and wait for the new connection details. At this stage, proxy has to also
drop any existing connections to the old primary, so they didn't do stale reads.
3.3. Attach warm replica compute to the primary endpoint inside control plane metadata
database.
3.4. Promote replica to primary.
3.5. When everything is done, finalize the endpoint state to be just `active`.
### Complete rolling restart flow
```mermaid
sequenceDiagram
autonumber
participant proxy as Neon proxy
participant cplane as Control plane
participant primary as Compute (primary)
box Compute (replica)
participant ctl as compute_ctl
participant pg as Postgres
end
box Endpoint unlogged storage
participant s3proxy as Endpoint storage service
participant s3 as S3/ABS/etc.
end
cplane ->> primary: POST /store_lfc_state
primary -->> cplane: 200 OK
cplane ->> ctl: POST /restore_lfc_state
activate ctl
ctl -->> cplane: 202 Accepted
activate cplane
cplane ->> ctl: GET /status: poll prewarm status
ctl ->> s3proxy: GET /read_file
s3proxy ->> s3: read file
s3 -->> s3proxy: file content
s3proxy -->> ctl: 200 OK: file content
proxy ->> cplane: GET /proxy_wake_compute
cplane -->> proxy: 200 OK: old primary conninfo
ctl ->> pg: prewarm LFC
activate pg
pg -->> ctl: prewarm is completed
deactivate pg
ctl -->> cplane: 200 OK: prewarm is completed
deactivate ctl
deactivate cplane
cplane -->> cplane: reassign replica compute to endpoint,<br>start terminating the old primary compute
activate cplane
cplane ->> proxy: invalidate caches
proxy ->> cplane: GET /proxy_wake_compute
cplane -x primary: POST /terminate
primary -->> cplane: 200 OK
note over primary: old primary<br>compute terminated
cplane ->> ctl: POST /promote
activate ctl
ctl ->> pg: pg_ctl promote
activate pg
pg -->> ctl: done
deactivate pg
ctl -->> cplane: 200 OK
deactivate ctl
cplane -->> cplane: finalize operation
cplane -->> proxy: 200 OK: new primary conninfo
deactivate cplane
```
### Network bandwidth and prewarm speed
It's currently known that pageserver can sustain about 3000 RPS per shard for a few running computes.
Large tenants are usually split into 8 shards, so the final formula may look like this:
```text
8 shards * 3000 RPS * 8 KB =~ 190 MB/s
```
so depending on the LFC size, prewarming will take at least:
- ~5s for 1 GB
- ~50s for 10 GB
- ~5m for 100 GB
- \>1h for 1 TB
In total, one pageserver is normally capped by 30k RPS, so it obviously can't sustain many computes
doing prewarm at the same time. Later, we may need an additional mechanism for computes to throttle
the prewarming requests gracefully.
### Reliability, failure modes and corner cases
We consider following failures while implementing this RFC:
1. Compute got interrupted/crashed/restarted during prewarm. The caller -- control plane -- should
detect that and start prewarm from the beginning.
2. Control plane promotion request timed out or hit network issues. If it never reached the
compute, control plane should just repeat it. If it did reach the compute, then during
retry control plane can hit `409` as previous request triggered the promotion already.
In this case, control plane need to retry until either `200` or
permanent error `500` is returned.
3. Compute got interrupted/crashed/restarted during promotion. At restart it will ask for
a spec from control plane, and its content should signal compute to start as **primary**,
so it's expected that control plane will continue polling for certain period of time and
will discover that compute is ready to accept connections if restart is fast enough.
4. Any other unexpected failure or timeout during prewarming. This **failure mustn't be fatal**,
control plane has to report failure, terminate replica and keep primary running.
5. Any other unexpected failure or timeout during promotion. Unfortunately, at this moment
we already have the primary node stopped, so the only option is to start primary again
and proceed with auto-prewarm.
6. Any unexpected failure during auto-prewarm. This **failure mustn't be fatal**,
`compute_ctl` has to report the failure, but do not crash the compute.
7. Control plane failed to confirm that old primary has terminated. This can happen, especially
in the future HA setup. In this case, control plane has to ensure that it sent VM deletion
and pod termination requests to k8s, so long-term we do not have two running primaries
on the same timeline.
### Security implications
There are two security implications to consider:
1. Access to `compute_ctl` API. It has to be accessible from the outside of compute, so all
new API methods have to be exposed on the **external** HTTP port and **must** be authenticated
with JWT.
2. Read/write only your own LFC state data in S3. Although it's not really a security concern,
since LFC state is just a mapping of blocks present in LFC at certain moment in time;
it still has to be highly restricted, so that i) only computes on the same timeline can
read S3 state; ii) each compute can only write to the path that contains it's `endpoint_id`.
Both of this must be validated by Endpoint storage service using the JWT token provided by `compute_ctl`.
### Unresolved questions
#### Billing, metrics and monitoring
Currently, we only label computes with `endpoint_id` after attaching them to the endpoint.
In this proposal, this means that temporary replica will remain unlabelled until it's promoted
to primary. We can also hide it from users in the control plane API, but what to do with
billing and monitoring is still unclear.
We can probably mark it as 'billable' and tag with `project_id`, so it will be billed, but
not interfere in any way with the current primary monitoring.
Another thing to consider is how logs and metrics export will switch to the new compute.
It's expected that OpenTelemetry collector will auto-discover the new compute and start
scraping metrics from it.
#### Auto-prewarm
It's still an open question whether we need auto-prewarm at all. The author's gut-feeling is
that yes, we need it, but might be not for all workloads, so it could end up exposed as a
user-controllable knob on the endpoint. There are two arguments for that:
1. Auto-prewarm existing in upstream's `pg_prewarm`, _probably for a reason_.
2. There are still could be 2 flows when we cannot perform the rolling restart via the warm
replica: i) any failure or interruption during promotion; ii) wake up after scale-to-zero.
The latter might be challenged as well, i.e. one can argue that auto-prewarm may and will
compete with user-workload for storage resources. This is correct, but it might as well
reduce the time to get warm LFC and good performance.
#### Low-level details of the replica promotion
There are many things to consider here, but three items just off the top of my head:
1. How to properly start the `walproposer` inside Postgres.
2. What to do with logical replication. Currently, we do not include logical replication slots
inside basebackup, because nobody advances them at replica, so they just prevent the WAL
deletion. Yet, we do need to have them at primary after promotion. Starting with Postgres 17,
there is a new feature called
[logical replication failover](https://www.postgresql.org/docs/current/logical-replication-failover.html)
and `synchronized_standby_slots` setting, but we need a plan for the older versions. Should we
request a new basebackup during promotion?
3. How do we guarantee that replica will receive all the latest WAL from safekeepers? Do some
'shallow' version of sync safekeepers without data copying? Or just a standard version of
sync safekeepers?
## Alternative implementation
The proposal already assumes one of the alternatives -- do not have any persistent storage for
LFC state. This is possible to implement faster with the proposed API, but it means that
we do not implement auto-prewarm yet.
## Definition of Done
At the end of implementing this RFC we should have two high-level settings that enable:
1. Auto-prewarm of user computes upon restart.
2. Perform primary compute restart via the warm replica promotion.
It also has to be decided what's the criteria for enabling one or both of these flows for
certain clients.

View File

@@ -12,7 +12,6 @@ jsonwebtoken.workspace = true
serde.workspace = true
serde_json.workspace = true
regex.workspace = true
url.workspace = true
utils = { path = "../utils" }
remote_storage = { version = "0.1", path = "../remote_storage/" }

View File

@@ -4,14 +4,11 @@
//! provide it by calling the compute_ctl's `/compute_ctl` endpoint, or
//! compute_ctl can fetch it by calling the control plane's API.
use std::collections::HashMap;
use std::fmt::Display;
use anyhow::anyhow;
use indexmap::IndexMap;
use regex::Regex;
use remote_storage::RemotePath;
use serde::{Deserialize, Serialize};
use url::Url;
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
@@ -184,11 +181,6 @@ pub struct ComputeSpec {
/// Download LFC state from endpoint_storage and pass it to Postgres on startup
#[serde(default)]
pub autoprewarm: bool,
/// Suspend timeout in seconds.
///
/// We use this value to derive other values, such as the installed extensions metric.
pub suspend_timeout_seconds: i64,
}
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
@@ -437,47 +429,6 @@ pub struct JwksSettings {
pub jwt_audience: Option<String>,
}
/// Protocol used to connect to a Pageserver. Parsed from the connstring scheme.
#[derive(Clone, Copy, Debug, Default)]
pub enum PageserverProtocol {
/// The original protocol based on libpq and COPY. Uses postgresql:// or postgres:// scheme.
#[default]
Libpq,
/// A newer, gRPC-based protocol. Uses grpc:// scheme.
Grpc,
}
impl PageserverProtocol {
/// Parses the protocol from a connstring scheme. Defaults to Libpq if no scheme is given.
/// Errors if the connstring is an invalid URL.
pub fn from_connstring(connstring: &str) -> anyhow::Result<Self> {
let scheme = match Url::parse(connstring) {
Ok(url) => url.scheme().to_lowercase(),
Err(url::ParseError::RelativeUrlWithoutBase) => return Ok(Self::default()),
Err(err) => return Err(anyhow!("invalid connstring URL: {err}")),
};
match scheme.as_str() {
"postgresql" | "postgres" => Ok(Self::Libpq),
"grpc" => Ok(Self::Grpc),
scheme => Err(anyhow!("invalid protocol scheme: {scheme}")),
}
}
/// Returns the URL scheme for the protocol, for use in connstrings.
pub fn scheme(&self) -> &'static str {
match self {
Self::Libpq => "postgresql",
Self::Grpc => "grpc",
}
}
}
impl Display for PageserverProtocol {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(self.scheme())
}
}
#[cfg(test)]
mod tests {
use std::fs::File;

View File

@@ -3,7 +3,6 @@
"timestamp": "2021-05-23T18:25:43.511Z",
"operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8b",
"suspend_timeout_seconds": 3600,
"cluster": {
"cluster_id": "test-cluster-42",

View File

@@ -19,7 +19,6 @@ byteorder.workspace = true
utils.workspace = true
postgres_ffi_types.workspace = true
postgres_versioninfo.workspace = true
posthog_client_lite.workspace = true
enum-map.workspace = true
strum.workspace = true
strum_macros.workspace = true
@@ -30,13 +29,12 @@ humantime-serde.workspace = true
chrono = { workspace = true, features = ["serde"] }
itertools.workspace = true
storage_broker.workspace = true
camino = { workspace = true, features = ["serde1"] }
camino = {workspace = true, features = ["serde1"]}
remote_storage.workspace = true
postgres_backend.workspace = true
nix = { workspace = true, optional = true }
nix = {workspace = true, optional = true}
reqwest.workspace = true
rand.workspace = true
tracing.workspace = true
tracing-utils.workspace = true
once_cell.workspace = true

View File

@@ -4,7 +4,6 @@ use camino::Utf8PathBuf;
mod tests;
use const_format::formatcp;
use posthog_client_lite::PostHogClientConfig;
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
@@ -64,66 +63,25 @@ impl Display for NodeMetadata {
}
}
/// PostHog integration config. This is used in pageserver, storcon, and neon_local.
/// Ensure backward compatibility when adding new fields.
/// PostHog integration config.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct PostHogConfig {
/// PostHog project ID
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub project_id: Option<String>,
pub project_id: String,
/// Server-side (private) API key
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub server_api_key: Option<String>,
pub server_api_key: String,
/// Client-side (public) API key
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub client_api_key: Option<String>,
pub client_api_key: String,
/// Private API URL
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub private_api_url: Option<String>,
pub private_api_url: String,
/// Public API URL
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub public_api_url: Option<String>,
/// Refresh interval for the feature flag spec.
/// The storcon will push the feature flag spec to the pageserver. If the pageserver does not receive
/// the spec for `refresh_interval`, it will fetch the spec from the PostHog API.
#[serde(default)]
pub public_api_url: String,
/// Refresh interval for the feature flag spec
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(with = "humantime_serde")]
pub refresh_interval: Option<Duration>,
}
impl PostHogConfig {
pub fn try_into_posthog_config(self) -> Result<PostHogClientConfig, &'static str> {
let Some(project_id) = self.project_id else {
return Err("project_id is required");
};
let Some(server_api_key) = self.server_api_key else {
return Err("server_api_key is required");
};
let Some(client_api_key) = self.client_api_key else {
return Err("client_api_key is required");
};
let Some(private_api_url) = self.private_api_url else {
return Err("private_api_url is required");
};
let Some(public_api_url) = self.public_api_url else {
return Err("public_api_url is required");
};
Ok(PostHogClientConfig {
project_id,
server_api_key,
client_api_key,
private_api_url,
public_api_url,
})
}
}
/// `pageserver.toml`
///
/// We use serde derive with `#[serde(default)]` to generate a deserializer
@@ -409,9 +367,6 @@ pub struct BasebackupCacheConfig {
// TODO(diko): support max_entry_size_bytes.
// pub max_entry_size_bytes: u64,
pub max_size_entries: usize,
/// Size of the channel used to send prepare requests to the basebackup cache worker.
/// If exceeded, new prepare requests will be dropped.
pub prepare_channel_size: usize,
}
impl Default for BasebackupCacheConfig {
@@ -420,8 +375,7 @@ impl Default for BasebackupCacheConfig {
cleanup_period: Duration::from_secs(60),
max_total_size_bytes: 1024 * 1024 * 1024, // 1 GiB
// max_entry_size_bytes: 16 * 1024 * 1024, // 16 MiB
max_size_entries: 10000,
prepare_channel_size: 100,
max_size_entries: 1000,
}
}
}

View File

@@ -420,7 +420,6 @@ impl From<NodeSchedulingPolicy> for String {
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
pub enum SkSchedulingPolicy {
Active,
Activating,
Pause,
Decomissioned,
}
@@ -431,7 +430,6 @@ impl FromStr for SkSchedulingPolicy {
fn from_str(s: &str) -> Result<Self, Self::Err> {
Ok(match s {
"active" => Self::Active,
"activating" => Self::Activating,
"pause" => Self::Pause,
"decomissioned" => Self::Decomissioned,
_ => {
@@ -448,7 +446,6 @@ impl From<SkSchedulingPolicy> for String {
use SkSchedulingPolicy::*;
match value {
Active => "active",
Activating => "activating",
Pause => "pause",
Decomissioned => "decomissioned",
}
@@ -549,11 +546,6 @@ pub struct TimelineImportRequest {
pub sk_set: Vec<NodeId>,
}
#[derive(serde::Serialize, serde::Deserialize, Clone)]
pub struct TimelineSafekeeperMigrateRequest {
pub new_sk_set: Vec<NodeId>,
}
#[cfg(test)]
mod test {
use serde_json;

View File

@@ -21,9 +21,7 @@ use utils::{completion, serde_system_time};
use crate::config::Ratio;
use crate::key::{CompactKey, Key};
use crate::shard::{
DEFAULT_STRIPE_SIZE, ShardCount, ShardIdentity, ShardStripeSize, TenantShardId,
};
use crate::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId};
/// The state of a tenant in this pageserver.
///
@@ -477,7 +475,7 @@ pub struct TenantShardSplitResponse {
}
/// Parameters that apply to all shards in a tenant. Used during tenant creation.
#[derive(Clone, Copy, Serialize, Deserialize, Debug)]
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct ShardParameters {
pub count: ShardCount,
@@ -499,15 +497,6 @@ impl Default for ShardParameters {
}
}
impl From<ShardIdentity> for ShardParameters {
fn from(identity: ShardIdentity) -> Self {
Self {
count: identity.count,
stripe_size: identity.stripe_size,
}
}
}
#[derive(Debug, Default, Clone, Eq, PartialEq)]
pub enum FieldPatch<T> {
Upsert(T),

View File

@@ -37,7 +37,6 @@ use std::hash::{Hash, Hasher};
pub use ::utils::shard::*;
use postgres_ffi_types::forknum::INIT_FORKNUM;
use serde::{Deserialize, Serialize};
use utils::critical;
use crate::key::Key;
use crate::models::ShardParameters;
@@ -180,7 +179,7 @@ impl ShardIdentity {
/// For use when creating ShardIdentity instances for new shards, where a creation request
/// specifies the ShardParameters that apply to all shards.
pub fn from_params(number: ShardNumber, params: ShardParameters) -> Self {
pub fn from_params(number: ShardNumber, params: &ShardParameters) -> Self {
Self {
number,
count: params.count,
@@ -189,17 +188,6 @@ impl ShardIdentity {
}
}
/// Asserts that the given shard identities are equal. Changes to shard parameters will likely
/// result in data corruption.
pub fn assert_equal(&self, other: ShardIdentity) {
if self != &other {
// TODO: for now, we're conservative and just log errors in production. Turn this into a
// real assertion when we're confident it doesn't misfire, and also reject requests that
// attempt to change it with an error response.
critical!("shard identity mismatch: {self:?} != {other:?}");
}
}
fn is_broken(&self) -> bool {
self.layout == LAYOUT_BROKEN
}

View File

@@ -78,13 +78,7 @@ pub fn is_expected_io_error(e: &io::Error) -> bool {
use io::ErrorKind::*;
matches!(
e.kind(),
HostUnreachable
| NetworkUnreachable
| BrokenPipe
| ConnectionRefused
| ConnectionAborted
| ConnectionReset
| TimedOut,
BrokenPipe | ConnectionRefused | ConnectionAborted | ConnectionReset | TimedOut
)
}

View File

@@ -1,22 +1,17 @@
//! A background loop that fetches feature flags from PostHog and updates the feature store.
use std::{
sync::Arc,
time::{Duration, SystemTime},
};
use std::{sync::Arc, time::Duration};
use arc_swap::ArcSwap;
use tokio_util::sync::CancellationToken;
use tracing::{Instrument, info_span};
use crate::{
CaptureEvent, FeatureStore, LocalEvaluationResponse, PostHogClient, PostHogClientConfig,
};
use crate::{CaptureEvent, FeatureStore, PostHogClient, PostHogClientConfig};
/// A background loop that fetches feature flags from PostHog and updates the feature store.
pub struct FeatureResolverBackgroundLoop {
posthog_client: PostHogClient,
feature_store: ArcSwap<(SystemTime, Arc<FeatureStore>)>,
feature_store: ArcSwap<FeatureStore>,
cancel: CancellationToken,
}
@@ -24,35 +19,11 @@ impl FeatureResolverBackgroundLoop {
pub fn new(config: PostHogClientConfig, shutdown_pageserver: CancellationToken) -> Self {
Self {
posthog_client: PostHogClient::new(config),
feature_store: ArcSwap::new(Arc::new((
SystemTime::UNIX_EPOCH,
Arc::new(FeatureStore::new()),
))),
feature_store: ArcSwap::new(Arc::new(FeatureStore::new())),
cancel: shutdown_pageserver,
}
}
/// Update the feature store with a new feature flag spec bypassing the normal refresh loop.
pub fn update(&self, spec: String) -> anyhow::Result<()> {
let resp: LocalEvaluationResponse = serde_json::from_str(&spec)?;
self.update_feature_store_nofail(resp, "http_propagate");
Ok(())
}
fn update_feature_store_nofail(&self, resp: LocalEvaluationResponse, source: &'static str) {
let project_id = self.posthog_client.config.project_id.parse::<u64>().ok();
match FeatureStore::new_with_flags(resp.flags, project_id) {
Ok(feature_store) => {
self.feature_store
.store(Arc::new((SystemTime::now(), Arc::new(feature_store))));
tracing::info!("Feature flag updated from {}", source);
}
Err(e) => {
tracing::warn!("Cannot process feature flag spec from {}: {}", source, e);
}
}
}
pub fn spawn(
self: Arc<Self>,
handle: &tokio::runtime::Handle,
@@ -76,17 +47,6 @@ impl FeatureResolverBackgroundLoop {
_ = ticker.tick() => {}
_ = cancel.cancelled() => break
}
{
let last_update = this.feature_store.load().0;
if let Ok(elapsed) = last_update.elapsed() {
if elapsed < refresh_period {
tracing::debug!(
"Skipping feature flag refresh because it's too soon"
);
continue;
}
}
}
let resp = match this
.posthog_client
.get_feature_flags_local_evaluation()
@@ -98,7 +58,16 @@ impl FeatureResolverBackgroundLoop {
continue;
}
};
this.update_feature_store_nofail(resp, "refresh_loop");
let project_id = this.posthog_client.config.project_id.parse::<u64>().ok();
match FeatureStore::new_with_flags(resp.flags, project_id) {
Ok(feature_store) => {
this.feature_store.store(Arc::new(feature_store));
tracing::info!("Feature flag updated");
}
Err(e) => {
tracing::warn!("Cannot process feature flag spec: {}", e);
}
}
}
tracing::info!("PostHog feature resolver stopped");
}
@@ -123,6 +92,6 @@ impl FeatureResolverBackgroundLoop {
}
pub fn feature_store(&self) -> Arc<FeatureStore> {
self.feature_store.load().1.clone()
self.feature_store.load_full()
}
}

View File

@@ -544,8 +544,17 @@ impl PostHogClient {
self.config.server_api_key.starts_with("phs_")
}
/// Get the raw JSON spec, same as `get_feature_flags_local_evaluation` but without parsing.
pub async fn get_feature_flags_local_evaluation_raw(&self) -> anyhow::Result<String> {
/// Fetch the feature flag specs from the server.
///
/// This is unfortunately an undocumented API at:
/// - <https://posthog.com/docs/api/feature-flags#get-api-projects-project_id-feature_flags-local_evaluation>
/// - <https://posthog.com/docs/feature-flags/local-evaluation>
///
/// The handling logic in [`FeatureStore`] mostly follows the Python API implementation.
/// See `_compute_flag_locally` in <https://github.com/PostHog/posthog-python/blob/master/posthog/client.py>
pub async fn get_feature_flags_local_evaluation(
&self,
) -> anyhow::Result<LocalEvaluationResponse> {
// BASE_URL/api/projects/:project_id/feature_flags/local_evaluation
// with bearer token of self.server_api_key
// OR
@@ -579,22 +588,7 @@ impl PostHogClient {
body
));
}
Ok(body)
}
/// Fetch the feature flag specs from the server.
///
/// This is unfortunately an undocumented API at:
/// - <https://posthog.com/docs/api/feature-flags#get-api-projects-project_id-feature_flags-local_evaluation>
/// - <https://posthog.com/docs/feature-flags/local-evaluation>
///
/// The handling logic in [`FeatureStore`] mostly follows the Python API implementation.
/// See `_compute_flag_locally` in <https://github.com/PostHog/posthog-python/blob/master/posthog/client.py>
pub async fn get_feature_flags_local_evaluation(
&self,
) -> Result<LocalEvaluationResponse, anyhow::Error> {
let raw = self.get_feature_flags_local_evaluation_raw().await?;
Ok(serde_json::from_str(&raw)?)
Ok(serde_json::from_str(&body)?)
}
/// Capture an event. This will only be used to report the feature flag usage back to PostHog, though

View File

@@ -52,7 +52,7 @@ pub(crate) async fn hi(str: &[u8], salt: &[u8], iterations: u32) -> [u8; 32] {
}
// yield every ~250us
// hopefully reduces tail latencies
if i.is_multiple_of(1024) {
if i % 1024 == 0 {
yield_now().await
}
}

View File

@@ -90,7 +90,7 @@ pub struct InnerClient {
}
impl InnerClient {
pub fn start(&mut self) -> Result<PartialQuery<'_>, Error> {
pub fn start(&mut self) -> Result<PartialQuery, Error> {
self.responses.waiting += 1;
Ok(PartialQuery(Some(self)))
}
@@ -227,7 +227,7 @@ impl Client {
&mut self,
statement: &str,
params: I,
) -> Result<RowStream<'_>, Error>
) -> Result<RowStream, Error>
where
S: AsRef<str>,
I: IntoIterator<Item = Option<S>>,
@@ -262,7 +262,7 @@ impl Client {
pub(crate) async fn simple_query_raw(
&mut self,
query: &str,
) -> Result<SimpleQueryStream<'_>, Error> {
) -> Result<SimpleQueryStream, Error> {
simple_query::simple_query(self.inner_mut(), query).await
}

View File

@@ -1,12 +1,13 @@
use std::net::IpAddr;
use postgres_protocol2::message::backend::Message;
use tokio::io::{AsyncRead, AsyncWrite};
use tokio::net::TcpStream;
use tokio::sync::mpsc;
use crate::client::SocketConfig;
use crate::codec::BackendMessage;
use crate::config::Host;
use crate::config::{Host, SslMode};
use crate::connect_raw::connect_raw;
use crate::connect_socket::connect_socket;
use crate::connect_tls::connect_tls;
@@ -46,13 +47,7 @@ where
{
let socket = connect_socket(host_addr, host, port, config.connect_timeout).await?;
let stream = connect_tls(socket, config.ssl_mode, tls).await?;
let RawConnection {
stream,
parameters,
delayed_notice,
process_id,
secret_key,
} = connect_raw(stream, config).await?;
let raw = connect_raw(stream, config).await?;
let socket_config = SocketConfig {
host_addr,
@@ -61,24 +56,46 @@ where
connect_timeout: config.connect_timeout,
};
let (client_tx, conn_rx) = mpsc::unbounded_channel();
let (conn_tx, client_rx) = mpsc::channel(4);
let client = Client::new(
client_tx,
client_rx,
socket_config,
config.ssl_mode,
process_id,
secret_key,
);
// delayed notices are always sent as "Async" messages.
let delayed = delayed_notice
.into_iter()
.map(|m| BackendMessage::Async(Message::NoticeResponse(m)))
.collect();
let connection = Connection::new(stream, delayed, parameters, conn_tx, conn_rx);
Ok((client, connection))
Ok(raw.into_managed_conn(socket_config, config.ssl_mode))
}
impl<S, T> RawConnection<S, T>
where
S: AsyncRead + AsyncWrite + Unpin,
T: AsyncRead + AsyncWrite + Unpin,
{
pub fn into_managed_conn(
self,
socket_config: SocketConfig,
ssl_mode: SslMode,
) -> (Client, Connection<S, T>) {
let RawConnection {
stream,
parameters,
delayed_notice,
process_id,
secret_key,
} = self;
let (client_tx, conn_rx) = mpsc::unbounded_channel();
let (conn_tx, client_rx) = mpsc::channel(4);
let client = Client::new(
client_tx,
client_rx,
socket_config,
ssl_mode,
process_id,
secret_key,
);
// delayed notices are always sent as "Async" messages.
let delayed = delayed_notice
.into_iter()
.map(|m| BackendMessage::Async(Message::NoticeResponse(m)))
.collect();
let connection = Connection::new(stream, delayed, parameters, conn_tx, conn_rx);
(client, connection)
}
}

View File

@@ -12,11 +12,7 @@ mod private {
/// This trait is "sealed", and cannot be implemented outside of this crate.
pub trait GenericClient: private::Sealed {
/// Like `Client::query_raw_txt`.
async fn query_raw_txt<S, I>(
&mut self,
statement: &str,
params: I,
) -> Result<RowStream<'_>, Error>
async fn query_raw_txt<S, I>(&mut self, statement: &str, params: I) -> Result<RowStream, Error>
where
S: AsRef<str> + Sync + Send,
I: IntoIterator<Item = Option<S>> + Sync + Send,
@@ -26,11 +22,7 @@ pub trait GenericClient: private::Sealed {
impl private::Sealed for Client {}
impl GenericClient for Client {
async fn query_raw_txt<S, I>(
&mut self,
statement: &str,
params: I,
) -> Result<RowStream<'_>, Error>
async fn query_raw_txt<S, I>(&mut self, statement: &str, params: I) -> Result<RowStream, Error>
where
S: AsRef<str> + Sync + Send,
I: IntoIterator<Item = Option<S>> + Sync + Send,
@@ -43,11 +35,7 @@ impl GenericClient for Client {
impl private::Sealed for Transaction<'_> {}
impl GenericClient for Transaction<'_> {
async fn query_raw_txt<S, I>(
&mut self,
statement: &str,
params: I,
) -> Result<RowStream<'_>, Error>
async fn query_raw_txt<S, I>(&mut self, statement: &str, params: I) -> Result<RowStream, Error>
where
S: AsRef<str> + Sync + Send,
I: IntoIterator<Item = Option<S>> + Sync + Send,

View File

@@ -47,7 +47,7 @@ impl<'a> Transaction<'a> {
&mut self,
statement: &str,
params: I,
) -> Result<RowStream<'_>, Error>
) -> Result<RowStream, Error>
where
S: AsRef<str>,
I: IntoIterator<Item = Option<S>>,

View File

@@ -210,7 +210,7 @@ pub struct TimelineStatus {
}
/// Request to switch membership configuration.
#[derive(Clone, Serialize, Deserialize)]
#[derive(Serialize, Deserialize)]
#[serde(transparent)]
pub struct TimelineMembershipSwitchRequest {
pub mconf: Configuration,
@@ -221,8 +221,6 @@ pub struct TimelineMembershipSwitchRequest {
pub struct TimelineMembershipSwitchResponse {
pub previous_conf: Configuration,
pub current_conf: Configuration,
pub term: Term,
pub flush_lsn: Lsn,
}
#[derive(Clone, Copy, Serialize, Deserialize)]

View File

@@ -24,28 +24,12 @@ macro_rules! critical {
if cfg!(debug_assertions) {
panic!($($arg)*);
}
// Increment both metrics
$crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical();
let backtrace = std::backtrace::Backtrace::capture();
tracing::error!("CRITICAL: {}\n{backtrace}", format!($($arg)*));
}};
}
#[macro_export]
macro_rules! critical_timeline {
($tenant_shard_id:expr, $timeline_id:expr, $($arg:tt)*) => {{
if cfg!(debug_assertions) {
panic!($($arg)*);
}
// Increment both metrics
$crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical();
$crate::logging::HADRON_CRITICAL_STORAGE_EVENT_COUNT_METRIC.inc(&$tenant_shard_id.to_string(), &$timeline_id.to_string());
let backtrace = std::backtrace::Backtrace::capture();
tracing::error!("CRITICAL: [tenant_shard_id: {}, timeline_id: {}] {}\n{backtrace}",
$tenant_shard_id, $timeline_id, format!($($arg)*));
}};
}
#[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)]
#[strum(serialize_all = "snake_case")]
pub enum LogFormat {
@@ -77,36 +61,6 @@ pub struct TracingEventCountMetric {
trace: IntCounter,
}
// Begin Hadron: Add a HadronCriticalStorageEventCountMetric metric that is sliced by tenant_id and timeline_id
pub struct HadronCriticalStorageEventCountMetric {
critical: IntCounterVec,
}
pub static HADRON_CRITICAL_STORAGE_EVENT_COUNT_METRIC: Lazy<HadronCriticalStorageEventCountMetric> =
Lazy::new(|| {
let vec = metrics::register_int_counter_vec!(
"hadron_critical_storage_event_count",
"Number of critical storage events, by tenant_id and timeline_id",
&["tenant_shard_id", "timeline_id"]
)
.expect("failed to define metric");
HadronCriticalStorageEventCountMetric::new(vec)
});
impl HadronCriticalStorageEventCountMetric {
fn new(vec: IntCounterVec) -> Self {
Self { critical: vec }
}
// Allow public access from `critical!` macro.
pub fn inc(&self, tenant_shard_id: &str, timeline_id: &str) {
self.critical
.with_label_values(&[tenant_shard_id, timeline_id])
.inc();
}
}
// End Hadron
pub static TRACING_EVENT_COUNT_METRIC: Lazy<TracingEventCountMetric> = Lazy::new(|| {
let vec = metrics::register_int_counter_vec!(
"libmetrics_tracing_event_count",

View File

@@ -86,14 +86,6 @@ pub enum GateError {
GateClosed,
}
impl GateError {
pub fn is_cancel(&self) -> bool {
match self {
GateError::GateClosed => true,
}
}
}
impl Default for Gate {
fn default() -> Self {
Self {

View File

@@ -844,13 +844,4 @@ impl Client {
.await
.map_err(Error::ReceiveBody)
}
pub async fn update_feature_flag_spec(&self, spec: String) -> Result<()> {
let uri = format!("{}/v1/feature_flag_spec", self.mgmt_api_endpoint);
self.request(Method::POST, uri, spec)
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
}

View File

@@ -9,14 +9,12 @@ anyhow.workspace = true
bytes.workspace = true
futures.workspace = true
pageserver_api.workspace = true
postgres_ffi_types.workspace = true
postgres_ffi.workspace = true
prost.workspace = true
prost-types.workspace = true
strum.workspace = true
strum_macros.workspace = true
thiserror.workspace = true
tokio.workspace = true
tokio-util.workspace = true
tonic.workspace = true
utils.workspace = true
workspace_hack.workspace = true

View File

@@ -35,8 +35,6 @@
syntax = "proto3";
package page_api;
import "google/protobuf/timestamp.proto";
service PageService {
// Returns whether a relation exists.
rpc CheckRelExists(CheckRelExistsRequest) returns (CheckRelExistsResponse);
@@ -66,10 +64,6 @@ service PageService {
// Fetches an SLRU segment.
rpc GetSlruSegment (GetSlruSegmentRequest) returns (GetSlruSegmentResponse);
// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't garbage
// collect the LSN until the lease expires. Must be acquired on all relevant shards.
rpc LeaseLsn (LeaseLsnRequest) returns (LeaseLsnResponse);
}
// The LSN a request should read at.
@@ -116,19 +110,6 @@ message GetBaseBackupRequest {
bool replica = 2;
// If true, include relation files in the base backup. Mainly for debugging and tests.
bool full = 3;
// Compression algorithm to use. Base backups send a compressed payload instead of using gRPC
// compression, so that we can cache compressed backups on the server.
BaseBackupCompression compression = 4;
}
// Base backup compression algorithms.
enum BaseBackupCompression {
// Unknown algorithm. Used when clients send an unsupported algorithm.
BASE_BACKUP_COMPRESSION_UNKNOWN = 0;
// No compression.
BASE_BACKUP_COMPRESSION_NONE = 1;
// GZIP compression.
BASE_BACKUP_COMPRESSION_GZIP = 2;
}
// Base backup response chunk, returned as an ordered stream.
@@ -258,17 +239,3 @@ message GetSlruSegmentRequest {
message GetSlruSegmentResponse {
bytes segment = 1;
}
// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't garbage
// collect the LSN until the lease expires. Must be acquired on all relevant shards.
message LeaseLsnRequest {
// The LSN to lease. Can't be 0 or below the current GC cutoff.
uint64 lsn = 1;
}
// Lease acquisition response. If the lease could not be granted because the LSN has already been
// garbage collected, a FailedPrecondition status will be returned instead.
message LeaseLsnResponse {
// The lease expiration time.
google.protobuf.Timestamp expires = 1;
}

View File

@@ -1,7 +1,8 @@
use anyhow::Result;
use futures::{Stream, StreamExt as _, TryStreamExt as _};
use tokio::io::AsyncRead;
use tokio_util::io::StreamReader;
use std::convert::TryInto;
use bytes::Bytes;
use futures::TryStreamExt;
use futures::{Stream, StreamExt};
use tonic::metadata::AsciiMetadataValue;
use tonic::metadata::errors::InvalidMetadataValue;
use tonic::transport::Channel;
@@ -11,6 +12,8 @@ use utils::id::TenantId;
use utils::id::TimelineId;
use utils::shard::ShardIndex;
use anyhow::Result;
use crate::model;
use crate::proto;
@@ -66,7 +69,6 @@ impl tonic::service::Interceptor for AuthInterceptor {
Ok(req)
}
}
#[derive(Clone)]
pub struct Client {
client: proto::PageServiceClient<
@@ -93,6 +95,7 @@ impl Client {
if let Some(compression) = compression {
// TODO: benchmark this (including network latency).
// TODO: consider enabling compression by default.
client = client
.accept_compressed(compression)
.send_compressed(compression);
@@ -118,15 +121,22 @@ impl Client {
pub async fn get_base_backup(
&mut self,
req: model::GetBaseBackupRequest,
) -> Result<impl AsyncRead + use<>, tonic::Status> {
let req = proto::GetBaseBackupRequest::from(req);
let chunks = self.client.get_base_backup(req).await?.into_inner();
let reader = StreamReader::new(
chunks
.map_ok(|resp| resp.chunk)
.map_err(std::io::Error::other),
);
Ok(reader)
) -> Result<impl Stream<Item = Result<Bytes, tonic::Status>> + 'static, tonic::Status> {
let proto_req = proto::GetBaseBackupRequest::from(req);
let response_stream: Streaming<proto::GetBaseBackupResponseChunk> =
self.client.get_base_backup(proto_req).await?.into_inner();
// TODO: Consider dechunking internally
let domain_stream = response_stream.map(|chunk_res| {
chunk_res.and_then(|proto_chunk| {
proto_chunk.try_into().map_err(|e| {
tonic::Status::internal(format!("Failed to convert response chunk: {e}"))
})
})
});
Ok(domain_stream)
}
/// Returns the total size of a database, as # of bytes.
@@ -187,17 +197,4 @@ impl Client {
let response = self.client.get_slru_segment(proto_req).await?;
Ok(response.into_inner().try_into()?)
}
/// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't
/// garbage collect the LSN until the lease expires. Must be acquired on all relevant shards.
///
/// Returns the lease expiration time, or a FailedPrecondition status if the lease could not be
/// acquired because the LSN has already been garbage collected.
pub async fn lease_lsn(
&mut self,
req: model::LeaseLsnRequest,
) -> Result<model::LeaseLsnResponse, tonic::Status> {
let req = proto::LeaseLsnRequest::from(req);
Ok(self.client.lease_lsn(req).await?.into_inner().try_into()?)
}
}

View File

@@ -16,11 +16,10 @@
//! stream combinators without dealing with errors, and avoids validating the same message twice.
use std::fmt::Display;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use bytes::Bytes;
use postgres_ffi_types::Oid;
// TODO: split out Lsn, RelTag, SlruKind and other basic types to a separate crate, to avoid
use postgres_ffi::Oid;
// TODO: split out Lsn, RelTag, SlruKind, Oid and other basic types to a separate crate, to avoid
// pulling in all of their other crate dependencies when building the client.
use utils::lsn::Lsn;
@@ -192,21 +191,15 @@ pub struct GetBaseBackupRequest {
pub replica: bool,
/// If true, include relation files in the base backup. Mainly for debugging and tests.
pub full: bool,
/// Compression algorithm to use. Base backups send a compressed payload instead of using gRPC
/// compression, so that we can cache compressed backups on the server.
pub compression: BaseBackupCompression,
}
impl TryFrom<proto::GetBaseBackupRequest> for GetBaseBackupRequest {
type Error = ProtocolError;
fn try_from(pb: proto::GetBaseBackupRequest) -> Result<Self, Self::Error> {
Ok(Self {
impl From<proto::GetBaseBackupRequest> for GetBaseBackupRequest {
fn from(pb: proto::GetBaseBackupRequest) -> Self {
Self {
lsn: (pb.lsn != 0).then_some(Lsn(pb.lsn)),
replica: pb.replica,
full: pb.full,
compression: pb.compression.try_into()?,
})
}
}
}
@@ -216,55 +209,10 @@ impl From<GetBaseBackupRequest> for proto::GetBaseBackupRequest {
lsn: request.lsn.unwrap_or_default().0,
replica: request.replica,
full: request.full,
compression: request.compression.into(),
}
}
}
/// Base backup compression algorithm.
#[derive(Clone, Copy, Debug)]
pub enum BaseBackupCompression {
None,
Gzip,
}
impl TryFrom<proto::BaseBackupCompression> for BaseBackupCompression {
type Error = ProtocolError;
fn try_from(pb: proto::BaseBackupCompression) -> Result<Self, Self::Error> {
match pb {
proto::BaseBackupCompression::Unknown => Err(ProtocolError::invalid("compression", pb)),
proto::BaseBackupCompression::None => Ok(Self::None),
proto::BaseBackupCompression::Gzip => Ok(Self::Gzip),
}
}
}
impl TryFrom<i32> for BaseBackupCompression {
type Error = ProtocolError;
fn try_from(compression: i32) -> Result<Self, Self::Error> {
proto::BaseBackupCompression::try_from(compression)
.map_err(|_| ProtocolError::invalid("compression", compression))
.and_then(Self::try_from)
}
}
impl From<BaseBackupCompression> for proto::BaseBackupCompression {
fn from(compression: BaseBackupCompression) -> Self {
match compression {
BaseBackupCompression::None => Self::None,
BaseBackupCompression::Gzip => Self::Gzip,
}
}
}
impl From<BaseBackupCompression> for i32 {
fn from(compression: BaseBackupCompression) -> Self {
proto::BaseBackupCompression::from(compression).into()
}
}
pub type GetBaseBackupResponseChunk = Bytes;
impl TryFrom<proto::GetBaseBackupResponseChunk> for GetBaseBackupResponseChunk {
@@ -704,54 +652,3 @@ impl From<GetSlruSegmentResponse> for proto::GetSlruSegmentResponse {
// SlruKind is defined in pageserver_api::reltag.
pub type SlruKind = pageserver_api::reltag::SlruKind;
/// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't garbage
/// collect the LSN until the lease expires.
pub struct LeaseLsnRequest {
/// The LSN to lease.
pub lsn: Lsn,
}
impl TryFrom<proto::LeaseLsnRequest> for LeaseLsnRequest {
type Error = ProtocolError;
fn try_from(pb: proto::LeaseLsnRequest) -> Result<Self, Self::Error> {
if pb.lsn == 0 {
return Err(ProtocolError::Missing("lsn"));
}
Ok(Self { lsn: Lsn(pb.lsn) })
}
}
impl From<LeaseLsnRequest> for proto::LeaseLsnRequest {
fn from(request: LeaseLsnRequest) -> Self {
Self { lsn: request.lsn.0 }
}
}
/// Lease expiration time. If the lease could not be granted because the LSN has already been
/// garbage collected, a FailedPrecondition status will be returned instead.
pub type LeaseLsnResponse = SystemTime;
impl TryFrom<proto::LeaseLsnResponse> for LeaseLsnResponse {
type Error = ProtocolError;
fn try_from(pb: proto::LeaseLsnResponse) -> Result<Self, Self::Error> {
let expires = pb.expires.ok_or(ProtocolError::Missing("expires"))?;
UNIX_EPOCH
.checked_add(Duration::new(expires.seconds as u64, expires.nanos as u32))
.ok_or_else(|| ProtocolError::invalid("expires", expires))
}
}
impl From<LeaseLsnResponse> for proto::LeaseLsnResponse {
fn from(response: LeaseLsnResponse) -> Self {
let expires = response.duration_since(UNIX_EPOCH).unwrap_or_default();
Self {
expires: Some(prost_types::Timestamp {
seconds: expires.as_secs() as i64,
nanos: expires.subsec_nanos() as i32,
}),
}
}
}

View File

@@ -317,7 +317,6 @@ impl Client for LibpqClient {
/// A gRPC Pageserver client.
struct GrpcClient {
inner: page_api::Client,
compression: page_api::BaseBackupCompression,
}
impl GrpcClient {
@@ -332,14 +331,10 @@ impl GrpcClient {
ttid.timeline_id,
ShardIndex::unsharded(),
None,
None, // NB: uses payload compression
compression.then_some(tonic::codec::CompressionEncoding::Zstd),
)
.await?;
let compression = match compression {
true => page_api::BaseBackupCompression::Gzip,
false => page_api::BaseBackupCompression::None,
};
Ok(Self { inner, compression })
Ok(Self { inner })
}
}
@@ -353,8 +348,10 @@ impl Client for GrpcClient {
lsn,
replica: false,
full: false,
compression: self.compression,
};
Ok(Box::pin(self.inner.get_base_backup(req).await?))
let stream = self.inner.get_base_backup(req).await?;
Ok(Box::pin(StreamReader::new(
stream.map_err(std::io::Error::other),
)))
}
}

View File

@@ -14,7 +14,6 @@ use std::fmt::Write as FmtWrite;
use std::time::{Instant, SystemTime};
use anyhow::{Context, anyhow};
use async_compression::tokio::write::GzipEncoder;
use bytes::{BufMut, Bytes, BytesMut};
use fail::fail_point;
use pageserver_api::key::{Key, rel_block_to_key};
@@ -26,7 +25,8 @@ use postgres_ffi::{
};
use postgres_ffi_types::constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
use postgres_ffi_types::forknum::{INIT_FORKNUM, MAIN_FORKNUM};
use tokio::io::{self, AsyncWrite, AsyncWriteExt as _};
use tokio::io;
use tokio::io::AsyncWrite;
use tokio_tar::{Builder, EntryType, Header};
use tracing::*;
use utils::lsn::Lsn;
@@ -97,7 +97,6 @@ impl From<BasebackupError> for tonic::Status {
/// * When working without safekeepers. In this situation it is important to match the lsn
/// we are taking basebackup on with the lsn that is used in pageserver's walreceiver
/// to start the replication.
#[allow(clippy::too_many_arguments)]
pub async fn send_basebackup_tarball<'a, W>(
write: &'a mut W,
timeline: &'a Timeline,
@@ -105,7 +104,6 @@ pub async fn send_basebackup_tarball<'a, W>(
prev_lsn: Option<Lsn>,
full_backup: bool,
replica: bool,
gzip_level: Option<async_compression::Level>,
ctx: &'a RequestContext,
) -> Result<(), BasebackupError>
where
@@ -124,7 +122,7 @@ where
// prev_lsn value; that happens if the timeline was just branched from
// an old LSN and it doesn't have any WAL of its own yet. We will set
// prev_lsn to Lsn(0) if we cannot provide the correct value.
let (backup_prev, lsn) = if let Some(req_lsn) = req_lsn {
let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn {
// Backup was requested at a particular LSN. The caller should've
// already checked that it's a valid LSN.
@@ -145,7 +143,7 @@ where
};
// Consolidate the derived and the provided prev_lsn values
let prev_record_lsn = if let Some(provided_prev_lsn) = prev_lsn {
let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
if backup_prev != Lsn(0) && backup_prev != provided_prev_lsn {
return Err(BasebackupError::Server(anyhow!(
"backup_prev {backup_prev} != provided_prev_lsn {provided_prev_lsn}"
@@ -157,55 +155,30 @@ where
};
info!(
"taking basebackup lsn={lsn}, prev_lsn={prev_record_lsn} \
(full_backup={full_backup}, replica={replica}, gzip={gzip_level:?})",
);
let span = info_span!("send_tarball", backup_lsn=%lsn);
let io_concurrency = IoConcurrency::spawn_from_conf(
timeline.conf.get_vectored_concurrent_io,
timeline
.gate
.enter()
.map_err(|_| BasebackupError::Shutdown)?,
"taking basebackup lsn={}, prev_lsn={} (full_backup={}, replica={})",
backup_lsn, prev_lsn, full_backup, replica
);
if let Some(gzip_level) = gzip_level {
let mut encoder = GzipEncoder::with_quality(write, gzip_level);
Basebackup {
ar: Builder::new_non_terminated(&mut encoder),
timeline,
lsn,
prev_record_lsn,
full_backup,
replica,
ctx,
io_concurrency,
}
let basebackup = Basebackup {
ar: Builder::new_non_terminated(write),
timeline,
lsn: backup_lsn,
prev_record_lsn: prev_lsn,
full_backup,
replica,
ctx,
io_concurrency: IoConcurrency::spawn_from_conf(
timeline.conf.get_vectored_concurrent_io,
timeline
.gate
.enter()
.map_err(|_| BasebackupError::Shutdown)?,
),
};
basebackup
.send_tarball()
.instrument(span)
.await?;
encoder
.shutdown()
.await
.map_err(|err| BasebackupError::Client(err, "gzip"))?;
} else {
Basebackup {
ar: Builder::new_non_terminated(write),
timeline,
lsn,
prev_record_lsn,
full_backup,
replica,
ctx,
io_concurrency,
}
.send_tarball()
.instrument(span)
.await?;
}
Ok(())
.instrument(info_span!("send_tarball", backup_lsn=%backup_lsn))
.await
}
/// This is short-living object only for the time of tarball creation,

View File

@@ -1,12 +1,13 @@
use std::{collections::HashMap, sync::Arc};
use anyhow::Context;
use async_compression::tokio::write::GzipEncoder;
use camino::{Utf8Path, Utf8PathBuf};
use metrics::core::{AtomicU64, GenericCounter};
use pageserver_api::{config::BasebackupCacheConfig, models::TenantState};
use tokio::{
io::{AsyncWriteExt, BufWriter},
sync::mpsc::{Receiver, Sender, error::TrySendError},
sync::mpsc::{UnboundedReceiver, UnboundedSender},
};
use tokio_util::sync::CancellationToken;
use utils::{
@@ -19,8 +20,8 @@ use crate::{
basebackup::send_basebackup_tarball,
context::{DownloadBehavior, RequestContext},
metrics::{
BASEBACKUP_CACHE_ENTRIES, BASEBACKUP_CACHE_PREPARE, BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE,
BASEBACKUP_CACHE_READ, BASEBACKUP_CACHE_SIZE,
BASEBACKUP_CACHE_ENTRIES, BASEBACKUP_CACHE_PREPARE, BASEBACKUP_CACHE_READ,
BASEBACKUP_CACHE_SIZE,
},
task_mgr::TaskKind,
tenant::{
@@ -35,8 +36,8 @@ pub struct BasebackupPrepareRequest {
pub lsn: Lsn,
}
pub type BasebackupPrepareSender = Sender<BasebackupPrepareRequest>;
pub type BasebackupPrepareReceiver = Receiver<BasebackupPrepareRequest>;
pub type BasebackupPrepareSender = UnboundedSender<BasebackupPrepareRequest>;
pub type BasebackupPrepareReceiver = UnboundedReceiver<BasebackupPrepareRequest>;
#[derive(Clone)]
struct CacheEntry {
@@ -60,65 +61,40 @@ struct CacheEntry {
/// and ~1 RPS for get requests.
pub struct BasebackupCache {
data_dir: Utf8PathBuf,
config: Option<BasebackupCacheConfig>,
entries: std::sync::Mutex<HashMap<TenantTimelineId, CacheEntry>>,
prepare_sender: BasebackupPrepareSender,
read_hit_count: GenericCounter<AtomicU64>,
read_miss_count: GenericCounter<AtomicU64>,
read_err_count: GenericCounter<AtomicU64>,
prepare_skip_count: GenericCounter<AtomicU64>,
}
impl BasebackupCache {
/// Create a new BasebackupCache instance.
/// Also returns a BasebackupPrepareReceiver which is needed to start
/// the background task.
/// The cache is initialized from the data_dir in the background task.
/// The cache will return `None` for any get requests until the initialization is complete.
/// The background task is spawned separately using [`Self::spawn_background_task`]
/// to avoid a circular dependency between the cache and the tenant manager.
pub fn new(
/// Creates a BasebackupCache and spawns the background task.
/// The initialization of the cache is performed in the background and does not
/// block the caller. The cache will return `None` for any get requests until
/// initialization is complete.
pub fn spawn(
runtime_handle: &tokio::runtime::Handle,
data_dir: Utf8PathBuf,
config: Option<BasebackupCacheConfig>,
) -> (Arc<Self>, BasebackupPrepareReceiver) {
let chan_size = config.as_ref().map(|c| c.max_size_entries).unwrap_or(1);
let (prepare_sender, prepare_receiver) = tokio::sync::mpsc::channel(chan_size);
prepare_receiver: BasebackupPrepareReceiver,
tenant_manager: Arc<TenantManager>,
cancel: CancellationToken,
) -> Arc<Self> {
let cache = Arc::new(BasebackupCache {
data_dir,
config,
entries: std::sync::Mutex::new(HashMap::new()),
prepare_sender,
read_hit_count: BASEBACKUP_CACHE_READ.with_label_values(&["hit"]),
read_miss_count: BASEBACKUP_CACHE_READ.with_label_values(&["miss"]),
read_err_count: BASEBACKUP_CACHE_READ.with_label_values(&["error"]),
prepare_skip_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["skip"]),
});
(cache, prepare_receiver)
}
/// Spawns the background task.
/// The background task initializes the cache from the disk,
/// processes prepare requests, and cleans up outdated cache entries.
/// Noop if the cache is disabled (config is None).
pub fn spawn_background_task(
self: Arc<Self>,
runtime_handle: &tokio::runtime::Handle,
prepare_receiver: BasebackupPrepareReceiver,
tenant_manager: Arc<TenantManager>,
cancel: CancellationToken,
) {
if let Some(config) = self.config.clone() {
if let Some(config) = config {
let background = BackgroundTask {
c: self,
c: cache.clone(),
config,
tenant_manager,
@@ -133,45 +109,8 @@ impl BasebackupCache {
};
runtime_handle.spawn(background.run(prepare_receiver));
}
}
/// Send a basebackup prepare request to the background task.
/// The basebackup will be prepared asynchronously, it does not block the caller.
/// The request will be skipped if any cache limits are exceeded.
pub fn send_prepare(&self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, lsn: Lsn) {
let req = BasebackupPrepareRequest {
tenant_shard_id,
timeline_id,
lsn,
};
BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE.inc();
let res = self.prepare_sender.try_send(req);
if let Err(e) = res {
BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE.dec();
self.prepare_skip_count.inc();
match e {
TrySendError::Full(_) => {
// Basebackup prepares are pretty rare, normally we should not hit this.
tracing::info!(
tenant_id = %tenant_shard_id.tenant_id,
%timeline_id,
%lsn,
"Basebackup prepare channel is full, skipping the request"
);
}
TrySendError::Closed(_) => {
// Normal during shutdown, not critical.
tracing::info!(
tenant_id = %tenant_shard_id.tenant_id,
%timeline_id,
%lsn,
"Basebackup prepare channel is closed, skipping the request"
);
}
}
}
cache
}
/// Gets a basebackup entry from the cache.
@@ -184,10 +123,6 @@ impl BasebackupCache {
timeline_id: TimelineId,
lsn: Lsn,
) -> Option<tokio::fs::File> {
if !self.is_enabled() {
return None;
}
// Fast path. Check if the entry exists using the in-memory state.
let tti = TenantTimelineId::new(tenant_id, timeline_id);
if self.entries.lock().unwrap().get(&tti).map(|e| e.lsn) != Some(lsn) {
@@ -215,10 +150,6 @@ impl BasebackupCache {
}
}
pub fn is_enabled(&self) -> bool {
self.config.is_some()
}
// Private methods.
fn entry_filename(tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn) -> String {
@@ -436,7 +367,6 @@ impl BackgroundTask {
loop {
tokio::select! {
Some(req) = prepare_receiver.recv() => {
BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE.dec();
if let Err(err) = self.prepare_basebackup(
req.tenant_shard_id,
req.timeline_id,
@@ -664,6 +594,13 @@ impl BackgroundTask {
let file = tokio::fs::File::create(entry_tmp_path).await?;
let mut writer = BufWriter::new(file);
let mut encoder = GzipEncoder::with_quality(
&mut writer,
// Level::Best because compression is not on the hot path of basebackup requests.
// The decompression is almost not affected by the compression level.
async_compression::Level::Best,
);
// We may receive a request before the WAL record is applied to the timeline.
// Wait for the requested LSN to be applied.
timeline
@@ -676,19 +613,17 @@ impl BackgroundTask {
.await?;
send_basebackup_tarball(
&mut writer,
&mut encoder,
timeline,
Some(req_lsn),
None,
false,
false,
// Level::Best because compression is not on the hot path of basebackup requests.
// The decompression is almost not affected by the compression level.
Some(async_compression::Level::Best),
&ctx,
)
.await?;
encoder.shutdown().await?;
writer.flush().await?;
writer.into_inner().sync_all().await?;

View File

@@ -569,10 +569,8 @@ fn start_pageserver(
pageserver::l0_flush::L0FlushGlobalState::new(conf.l0_flush.clone());
// Scan the local 'tenants/' directory and start loading the tenants
let (basebackup_cache, basebackup_prepare_receiver) = BasebackupCache::new(
conf.basebackup_cache_dir(),
conf.basebackup_cache_config.clone(),
);
let (basebackup_prepare_sender, basebackup_prepare_receiver) =
tokio::sync::mpsc::unbounded_channel();
let deletion_queue_client = deletion_queue.new_client();
let background_purges = mgr::BackgroundPurges::default();
@@ -584,7 +582,7 @@ fn start_pageserver(
remote_storage: remote_storage.clone(),
deletion_queue_client,
l0_flush_global_state,
basebackup_cache: Arc::clone(&basebackup_cache),
basebackup_prepare_sender,
feature_resolver: feature_resolver.clone(),
},
shutdown_pageserver.clone(),
@@ -592,8 +590,10 @@ fn start_pageserver(
let tenant_manager = Arc::new(tenant_manager);
BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(tenant_manager.clone(), order))?;
basebackup_cache.spawn_background_task(
let basebackup_cache = BasebackupCache::spawn(
BACKGROUND_RUNTIME.handle(),
conf.basebackup_cache_dir(),
conf.basebackup_cache_config.clone(),
basebackup_prepare_receiver,
Arc::clone(&tenant_manager),
shutdown_pageserver.child_token(),
@@ -806,6 +806,7 @@ fn start_pageserver(
} else {
None
},
basebackup_cache,
);
// Spawn a Pageserver gRPC server task. It will spawn separate tasks for

View File

@@ -37,7 +37,7 @@ async fn main() -> anyhow::Result<()> {
not_modified_since: Lsn(23),
},
batch_key: 42,
message: format!("message {msg}"),
message: format!("message {}", msg),
}));
let Ok(res) = tokio::time::timeout(Duration::from_secs(10), fut).await else {
eprintln!("pipe seems full");

View File

@@ -762,40 +762,4 @@ mod tests {
let result = PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir);
assert_eq!(result.is_ok(), is_valid);
}
#[test]
fn test_config_posthog_config_is_valid() {
let input = r#"
control_plane_api = "http://localhost:6666"
[posthog_config]
server_api_key = "phs_AAA"
client_api_key = "phc_BBB"
project_id = "000"
private_api_url = "https://us.posthog.com"
public_api_url = "https://us.i.posthog.com"
"#;
let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input)
.expect("posthogconfig is valid");
let workdir = Utf8PathBuf::from("/nonexistent");
PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
.expect("parse_and_validate");
}
#[test]
fn test_config_posthog_incomplete_config_is_valid() {
let input = r#"
control_plane_api = "http://localhost:6666"
[posthog_config]
server_api_key = "phs_AAA"
private_api_url = "https://us.posthog.com"
public_api_url = "https://us.i.posthog.com"
"#;
let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input)
.expect("posthogconfig is valid");
let workdir = Utf8PathBuf::from("/nonexistent");
PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
.expect("parse_and_validate");
}
}

View File

@@ -99,7 +99,7 @@ pub(super) async fn upload_metrics_bucket(
// Compose object path
let datetime: DateTime<Utc> = SystemTime::now().into();
let ts_prefix = datetime.format("year=%Y/month=%m/day=%d/hour=%H/%H:%M:%SZ");
let ts_prefix = datetime.format("year=%Y/month=%m/day=%d/%H:%M:%SZ");
let path = RemotePath::from_string(&format!("{ts_prefix}_{node_id}.ndjson.gz"))?;
// Set up a gzip writer into a buffer
@@ -109,7 +109,7 @@ pub(super) async fn upload_metrics_bucket(
// Serialize and write into compressed buffer
let started_at = std::time::Instant::now();
for res in serialize_in_chunks_ndjson(CHUNK_SIZE, metrics, idempotency_keys) {
for res in serialize_in_chunks(CHUNK_SIZE, metrics, idempotency_keys) {
let (_chunk, body) = res?;
gzip_writer.write_all(&body).await?;
}
@@ -216,86 +216,6 @@ fn serialize_in_chunks<'a>(
}
}
/// Serializes the input metrics as NDJSON in chunks of chunk_size. Each event
/// is serialized as a separate JSON object on its own line. The provided
/// idempotency keys are injected into the corresponding metric events (reused
/// across different metrics sinks), and must have the same length as input.
fn serialize_in_chunks_ndjson<'a>(
chunk_size: usize,
input: &'a [NewRawMetric],
idempotency_keys: &'a [IdempotencyKey<'a>],
) -> impl ExactSizeIterator<Item = Result<(&'a [NewRawMetric], bytes::Bytes), serde_json::Error>> + 'a
{
use bytes::BufMut;
assert_eq!(input.len(), idempotency_keys.len());
struct Iter<'a> {
inner: std::slice::Chunks<'a, NewRawMetric>,
idempotency_keys: std::slice::Iter<'a, IdempotencyKey<'a>>,
chunk_size: usize,
// write to a BytesMut so that we can cheaply clone the frozen Bytes for retries
buffer: bytes::BytesMut,
// chunk amount of events are reused to produce the serialized document
scratch: Vec<Event<Ids, Name>>,
}
impl<'a> Iterator for Iter<'a> {
type Item = Result<(&'a [NewRawMetric], bytes::Bytes), serde_json::Error>;
fn next(&mut self) -> Option<Self::Item> {
let chunk = self.inner.next()?;
if self.scratch.is_empty() {
// first round: create events with N strings
self.scratch.extend(
chunk
.iter()
.zip(&mut self.idempotency_keys)
.map(|(raw_metric, key)| raw_metric.as_event(key)),
);
} else {
// next rounds: update_in_place to reuse allocations
assert_eq!(self.scratch.len(), self.chunk_size);
itertools::izip!(self.scratch.iter_mut(), chunk, &mut self.idempotency_keys)
.for_each(|(slot, raw_metric, key)| raw_metric.update_in_place(slot, key));
}
// Serialize each event as NDJSON (one JSON object per line)
for event in self.scratch[..chunk.len()].iter() {
let res = serde_json::to_writer((&mut self.buffer).writer(), event);
if let Err(e) = res {
return Some(Err(e));
}
// Add newline after each event to follow NDJSON format
self.buffer.put_u8(b'\n');
}
Some(Ok((chunk, self.buffer.split().freeze())))
}
fn size_hint(&self) -> (usize, Option<usize>) {
self.inner.size_hint()
}
}
impl ExactSizeIterator for Iter<'_> {}
let buffer = bytes::BytesMut::new();
let inner = input.chunks(chunk_size);
let idempotency_keys = idempotency_keys.iter();
let scratch = Vec::new();
Iter {
inner,
idempotency_keys,
chunk_size,
buffer,
scratch,
}
}
trait RawMetricExt {
fn as_event(&self, key: &IdempotencyKey<'_>) -> Event<Ids, Name>;
fn update_in_place(&self, event: &mut Event<Ids, Name>, key: &IdempotencyKey<'_>);
@@ -559,43 +479,6 @@ mod tests {
}
}
#[test]
fn chunked_serialization_ndjson() {
let examples = metric_samples();
assert!(examples.len() > 1);
let now = Utc::now();
let idempotency_keys = (0..examples.len())
.map(|i| FixedGen::new(now, "1", i as u16).generate())
.collect::<Vec<_>>();
// Parse NDJSON format - each line is a separate JSON object
let parse_ndjson = |body: &[u8]| -> Vec<Event<Ids, Name>> {
let body_str = std::str::from_utf8(body).unwrap();
body_str
.trim_end_matches('\n')
.lines()
.filter(|line| !line.is_empty())
.map(|line| serde_json::from_str::<Event<Ids, Name>>(line).unwrap())
.collect()
};
let correct = serialize_in_chunks_ndjson(examples.len(), &examples, &idempotency_keys)
.map(|res| res.unwrap().1)
.flat_map(|body| parse_ndjson(&body))
.collect::<Vec<_>>();
for chunk_size in 1..examples.len() {
let actual = serialize_in_chunks_ndjson(chunk_size, &examples, &idempotency_keys)
.map(|res| res.unwrap().1)
.flat_map(|body| parse_ndjson(&body))
.collect::<Vec<_>>();
// if these are equal, it means that multi-chunking version works as well
assert_eq!(correct, actual);
}
}
#[derive(Clone, Copy)]
struct FixedGen<'a>(chrono::DateTime<chrono::Utc>, &'a str, u16);

View File

@@ -3,16 +3,15 @@ use std::{collections::HashMap, sync::Arc, time::Duration};
use arc_swap::ArcSwap;
use pageserver_api::config::NodeMetadata;
use posthog_client_lite::{
CaptureEvent, FeatureResolverBackgroundLoop, PostHogEvaluationError,
CaptureEvent, FeatureResolverBackgroundLoop, PostHogClientConfig, PostHogEvaluationError,
PostHogFlagFilterPropertyValue,
};
use rand::Rng;
use remote_storage::RemoteStorageKind;
use serde_json::json;
use tokio_util::sync::CancellationToken;
use utils::id::TenantId;
use crate::{config::PageServerConf, metrics::FEATURE_FLAG_EVALUATION, tenant::TenantShard};
use crate::{config::PageServerConf, metrics::FEATURE_FLAG_EVALUATION};
const DEFAULT_POSTHOG_REFRESH_INTERVAL: Duration = Duration::from_secs(600);
@@ -32,13 +31,6 @@ impl FeatureResolver {
}
}
pub fn update(&self, spec: String) -> anyhow::Result<()> {
if let Some(inner) = &self.inner {
inner.update(spec)?;
}
Ok(())
}
pub fn spawn(
conf: &PageServerConf,
shutdown_pageserver: CancellationToken,
@@ -46,24 +38,16 @@ impl FeatureResolver {
) -> anyhow::Result<Self> {
// DO NOT block in this function: make it return as fast as possible to avoid startup delays.
if let Some(posthog_config) = &conf.posthog_config {
let posthog_client_config = match posthog_config.clone().try_into_posthog_config() {
Ok(config) => config,
Err(e) => {
tracing::warn!(
"invalid posthog config, skipping posthog integration: {}",
e
);
return Ok(FeatureResolver {
inner: None,
internal_properties: None,
force_overrides_for_testing: Arc::new(ArcSwap::new(Arc::new(
HashMap::new(),
))),
});
}
};
let inner =
FeatureResolverBackgroundLoop::new(posthog_client_config, shutdown_pageserver);
let inner = FeatureResolverBackgroundLoop::new(
PostHogClientConfig {
server_api_key: posthog_config.server_api_key.clone(),
client_api_key: posthog_config.client_api_key.clone(),
project_id: posthog_config.project_id.clone(),
private_api_url: posthog_config.private_api_url.clone(),
public_api_url: posthog_config.public_api_url.clone(),
},
shutdown_pageserver,
);
let inner = Arc::new(inner);
// The properties shared by all tenants on this pageserver.
@@ -139,7 +123,6 @@ impl FeatureResolver {
}
Arc::new(properties)
};
let fake_tenants = {
let mut tenants = Vec::new();
for i in 0..10 {
@@ -149,16 +132,9 @@ impl FeatureResolver {
conf.id,
i
);
let tenant_properties = PerTenantProperties {
remote_size_mb: Some(rand::thread_rng().gen_range(100.0..1000000.00)),
}
.into_posthog_properties();
let properties = Self::collect_properties_inner(
distinct_id.clone(),
Some(&internal_properties),
&tenant_properties,
);
tenants.push(CaptureEvent {
event: "initial_tenant_report".to_string(),
@@ -192,7 +168,6 @@ impl FeatureResolver {
fn collect_properties_inner(
tenant_id: String,
internal_properties: Option<&HashMap<String, PostHogFlagFilterPropertyValue>>,
tenant_properties: &HashMap<String, PostHogFlagFilterPropertyValue>,
) -> HashMap<String, PostHogFlagFilterPropertyValue> {
let mut properties = HashMap::new();
if let Some(internal_properties) = internal_properties {
@@ -204,9 +179,6 @@ impl FeatureResolver {
"tenant_id".to_string(),
PostHogFlagFilterPropertyValue::String(tenant_id),
);
for (key, value) in tenant_properties.iter() {
properties.insert(key.clone(), value.clone());
}
properties
}
@@ -214,13 +186,8 @@ impl FeatureResolver {
pub(crate) fn collect_properties(
&self,
tenant_id: TenantId,
tenant_properties: &HashMap<String, PostHogFlagFilterPropertyValue>,
) -> HashMap<String, PostHogFlagFilterPropertyValue> {
Self::collect_properties_inner(
tenant_id.to_string(),
self.internal_properties.as_deref(),
tenant_properties,
)
Self::collect_properties_inner(tenant_id.to_string(), self.internal_properties.as_deref())
}
/// Evaluate a multivariate feature flag. Currently, we do not support any properties.
@@ -232,7 +199,6 @@ impl FeatureResolver {
&self,
flag_key: &str,
tenant_id: TenantId,
tenant_properties: &HashMap<String, PostHogFlagFilterPropertyValue>,
) -> Result<String, PostHogEvaluationError> {
let force_overrides = self.force_overrides_for_testing.load();
if let Some(value) = force_overrides.get(flag_key) {
@@ -243,7 +209,7 @@ impl FeatureResolver {
let res = inner.feature_store().evaluate_multivariate(
flag_key,
&tenant_id.to_string(),
&self.collect_properties(tenant_id, tenant_properties),
&self.collect_properties(tenant_id),
);
match &res {
Ok(value) => {
@@ -276,7 +242,6 @@ impl FeatureResolver {
&self,
flag_key: &str,
tenant_id: TenantId,
tenant_properties: &HashMap<String, PostHogFlagFilterPropertyValue>,
) -> Result<(), PostHogEvaluationError> {
let force_overrides = self.force_overrides_for_testing.load();
if let Some(value) = force_overrides.get(flag_key) {
@@ -291,7 +256,7 @@ impl FeatureResolver {
let res = inner.feature_store().evaluate_boolean(
flag_key,
&tenant_id.to_string(),
&self.collect_properties(tenant_id, tenant_properties),
&self.collect_properties(tenant_id),
);
match &res {
Ok(()) => {
@@ -337,78 +302,3 @@ impl FeatureResolver {
.store(Arc::new(force_overrides));
}
}
struct PerTenantProperties {
pub remote_size_mb: Option<f64>,
}
impl PerTenantProperties {
pub fn into_posthog_properties(self) -> HashMap<String, PostHogFlagFilterPropertyValue> {
let mut properties = HashMap::new();
if let Some(remote_size_mb) = self.remote_size_mb {
properties.insert(
"tenant_remote_size_mb".to_string(),
PostHogFlagFilterPropertyValue::Number(remote_size_mb),
);
}
properties
}
}
#[derive(Clone)]
pub struct TenantFeatureResolver {
inner: FeatureResolver,
tenant_id: TenantId,
cached_tenant_properties: Arc<ArcSwap<HashMap<String, PostHogFlagFilterPropertyValue>>>,
}
impl TenantFeatureResolver {
pub fn new(inner: FeatureResolver, tenant_id: TenantId) -> Self {
Self {
inner,
tenant_id,
cached_tenant_properties: Arc::new(ArcSwap::new(Arc::new(HashMap::new()))),
}
}
pub fn evaluate_multivariate(&self, flag_key: &str) -> Result<String, PostHogEvaluationError> {
self.inner.evaluate_multivariate(
flag_key,
self.tenant_id,
&self.cached_tenant_properties.load(),
)
}
pub fn evaluate_boolean(&self, flag_key: &str) -> Result<(), PostHogEvaluationError> {
self.inner.evaluate_boolean(
flag_key,
self.tenant_id,
&self.cached_tenant_properties.load(),
)
}
pub fn collect_properties(&self) -> HashMap<String, PostHogFlagFilterPropertyValue> {
self.inner
.collect_properties(self.tenant_id, &self.cached_tenant_properties.load())
}
pub fn is_feature_flag_boolean(&self, flag_key: &str) -> Result<bool, PostHogEvaluationError> {
self.inner.is_feature_flag_boolean(flag_key)
}
pub fn update_cached_tenant_properties(&self, tenant_shard: &TenantShard) {
let mut remote_size_mb = None;
for timeline in tenant_shard.list_timelines() {
let size = timeline.metrics.resident_physical_size_get();
if size == 0 {
remote_size_mb = None;
}
if let Some(ref mut remote_size_mb) = remote_size_mb {
*remote_size_mb += size as f64 / 1024.0 / 1024.0;
}
}
self.cached_tenant_properties.store(Arc::new(
PerTenantProperties { remote_size_mb }.into_posthog_properties(),
));
}
}

View File

@@ -1893,13 +1893,9 @@ async fn update_tenant_config_handler(
let location_conf = LocationConf::attached_single(
new_tenant_conf.clone(),
tenant.get_generation(),
ShardParameters::from(tenant.get_shard_identity()),
&ShardParameters::default(),
);
tenant
.get_shard_identity()
.assert_equal(location_conf.shard); // not strictly necessary since we construct it above
crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
.await
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
@@ -1941,13 +1937,9 @@ async fn patch_tenant_config_handler(
let location_conf = LocationConf::attached_single(
updated,
tenant.get_generation(),
ShardParameters::from(tenant.get_shard_identity()),
&ShardParameters::default(),
);
tenant
.get_shard_identity()
.assert_equal(location_conf.shard); // not strictly necessary since we construct it above
crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
.await
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
@@ -2438,7 +2430,6 @@ async fn timeline_offload_handler(
.map_err(|e| {
match e {
OffloadError::Cancelled => ApiError::ResourceUnavailable("Timeline shutting down".into()),
OffloadError::AlreadyInProgress => ApiError::Conflict("Timeline already being offloaded or deleted".into()),
_ => ApiError::InternalServerError(anyhow!(e))
}
})?;
@@ -3698,25 +3689,23 @@ async fn tenant_evaluate_feature_flag(
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
// TODO: the properties we get here might be stale right after it is collected. But such races are rare (updated every 10s)
// and we don't need to worry about it for now.
let properties = tenant.feature_resolver.collect_properties();
let properties = tenant.feature_resolver.collect_properties(tenant_shard_id.tenant_id);
if as_type.as_deref() == Some("boolean") {
let result = tenant.feature_resolver.evaluate_boolean(&flag);
let result = tenant.feature_resolver.evaluate_boolean(&flag, tenant_shard_id.tenant_id);
let result = result.map(|_| true).map_err(|e| e.to_string());
json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
} else if as_type.as_deref() == Some("multivariate") {
let result = tenant.feature_resolver.evaluate_multivariate(&flag).map_err(|e| e.to_string());
let result = tenant.feature_resolver.evaluate_multivariate(&flag, tenant_shard_id.tenant_id).map_err(|e| e.to_string());
json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
} else {
// Auto infer the type of the feature flag.
let is_boolean = tenant.feature_resolver.is_feature_flag_boolean(&flag).map_err(|e| ApiError::InternalServerError(anyhow::anyhow!("{e}")))?;
if is_boolean {
let result = tenant.feature_resolver.evaluate_boolean(&flag);
let result = tenant.feature_resolver.evaluate_boolean(&flag, tenant_shard_id.tenant_id);
let result = result.map(|_| true).map_err(|e| e.to_string());
json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
} else {
let result = tenant.feature_resolver.evaluate_multivariate(&flag).map_err(|e| e.to_string());
let result = tenant.feature_resolver.evaluate_multivariate(&flag, tenant_shard_id.tenant_id).map_err(|e| e.to_string());
json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
}
}
@@ -3754,20 +3743,6 @@ async fn force_override_feature_flag_for_testing_delete(
json_response(StatusCode::OK, ())
}
async fn update_feature_flag_spec(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
check_permission(&request, None)?;
let body = json_request(&mut request).await?;
let state = get_state(&request);
state
.feature_resolver
.update(body)
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
}
/// Common functionality of all the HTTP API handlers.
///
/// - Adds a tracing span to each request (by `request_span`)
@@ -4153,8 +4128,5 @@ pub fn make_router(
.delete("/v1/feature_flag/:flag_key", |r| {
testing_api_handler("force override feature flag - delete", r, force_override_feature_flag_for_testing_delete)
})
.post("/v1/feature_flag_spec", |r| {
api_handler(r, update_feature_flag_spec)
})
.any(handler_404))
}

View File

@@ -4439,14 +4439,6 @@ pub(crate) static BASEBACKUP_CACHE_SIZE: Lazy<UIntGauge> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE: Lazy<UIntGauge> = Lazy::new(|| {
register_uint_gauge!(
"pageserver_basebackup_cache_prepare_queue_size",
"Number of requests in the basebackup prepare channel"
)
.expect("failed to define a metric")
});
static PAGESERVER_CONFIG_IGNORED_ITEMS: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_config_ignored_items",

View File

@@ -12,9 +12,9 @@ use std::task::{Context, Poll};
use std::time::{Duration, Instant, SystemTime};
use std::{io, str};
use anyhow::{Context as _, bail};
use anyhow::{Context as _, anyhow, bail};
use async_compression::tokio::write::GzipEncoder;
use bytes::{Buf as _, BufMut as _, BytesMut};
use chrono::Utc;
use futures::future::BoxFuture;
use futures::{FutureExt, Stream};
use itertools::Itertools;
@@ -63,6 +63,7 @@ use utils::{failpoint_support, span_record};
use crate::auth::check_permission;
use crate::basebackup::{self, BasebackupError};
use crate::basebackup_cache::BasebackupCache;
use crate::config::PageServerConf;
use crate::context::{
DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
@@ -137,6 +138,7 @@ pub fn spawn(
perf_trace_dispatch: Option<Dispatch>,
tcp_listener: tokio::net::TcpListener,
tls_config: Option<Arc<rustls::ServerConfig>>,
basebackup_cache: Arc<BasebackupCache>,
) -> Listener {
let cancel = CancellationToken::new();
let libpq_ctx = RequestContext::todo_child(
@@ -158,6 +160,7 @@ pub fn spawn(
conf.pg_auth_type,
tls_config,
conf.page_service_pipelining.clone(),
basebackup_cache,
libpq_ctx,
cancel.clone(),
)
@@ -216,6 +219,7 @@ pub async fn libpq_listener_main(
auth_type: AuthType,
tls_config: Option<Arc<rustls::ServerConfig>>,
pipelining_config: PageServicePipeliningConfig,
basebackup_cache: Arc<BasebackupCache>,
listener_ctx: RequestContext,
listener_cancel: CancellationToken,
) -> Connections {
@@ -259,6 +263,7 @@ pub async fn libpq_listener_main(
auth_type,
tls_config.clone(),
pipelining_config.clone(),
Arc::clone(&basebackup_cache),
connection_ctx,
connections_cancel.child_token(),
gate_guard,
@@ -301,6 +306,7 @@ async fn page_service_conn_main(
auth_type: AuthType,
tls_config: Option<Arc<rustls::ServerConfig>>,
pipelining_config: PageServicePipeliningConfig,
basebackup_cache: Arc<BasebackupCache>,
connection_ctx: RequestContext,
cancel: CancellationToken,
gate_guard: GateGuard,
@@ -366,6 +372,7 @@ async fn page_service_conn_main(
pipelining_config,
conf.get_vectored_concurrent_io,
perf_span_fields,
basebackup_cache,
connection_ctx,
cancel.clone(),
gate_guard,
@@ -419,6 +426,8 @@ struct PageServerHandler {
pipelining_config: PageServicePipeliningConfig,
get_vectored_concurrent_io: GetVectoredConcurrentIo,
basebackup_cache: Arc<BasebackupCache>,
gate_guard: GateGuard,
}
@@ -904,6 +913,7 @@ impl PageServerHandler {
pipelining_config: PageServicePipeliningConfig,
get_vectored_concurrent_io: GetVectoredConcurrentIo,
perf_span_fields: ConnectionPerfSpanFields,
basebackup_cache: Arc<BasebackupCache>,
connection_ctx: RequestContext,
cancel: CancellationToken,
gate_guard: GateGuard,
@@ -917,6 +927,7 @@ impl PageServerHandler {
cancel,
pipelining_config,
get_vectored_concurrent_io,
basebackup_cache,
gate_guard,
}
}
@@ -2602,16 +2613,26 @@ impl PageServerHandler {
prev_lsn,
full_backup,
replica,
None,
&ctx,
)
.await?;
} else {
let mut writer = BufWriter::new(pgb.copyout_writer());
let cached = timeline
.get_cached_basebackup_if_enabled(lsn, prev_lsn, full_backup, replica, gzip)
.await;
let cached = {
// Basebackup is cached only for this combination of parameters.
if timeline.is_basebackup_cache_enabled()
&& gzip
&& lsn.is_some()
&& prev_lsn.is_none()
{
self.basebackup_cache
.get(tenant_id, timeline_id, lsn.unwrap())
.await
} else {
None
}
};
if let Some(mut cached) = cached {
from_cache = true;
@@ -2620,6 +2641,31 @@ impl PageServerHandler {
.map_err(|err| {
BasebackupError::Client(err, "handle_basebackup_request,cached,copy")
})?;
} else if gzip {
let mut encoder = GzipEncoder::with_quality(
&mut writer,
// NOTE using fast compression because it's on the critical path
// for compute startup. For an empty database, we get
// <100KB with this method. The Level::Best compression method
// gives us <20KB, but maybe we should add basebackup caching
// on compute shutdown first.
async_compression::Level::Fastest,
);
basebackup::send_basebackup_tarball(
&mut encoder,
&timeline,
lsn,
prev_lsn,
full_backup,
replica,
&ctx,
)
.await?;
// shutdown the encoder to ensure the gzip footer is written
encoder
.shutdown()
.await
.map_err(|e| QueryError::Disconnected(ConnectionError::Io(e)))?;
} else {
basebackup::send_basebackup_tarball(
&mut writer,
@@ -2628,11 +2674,6 @@ impl PageServerHandler {
prev_lsn,
full_backup,
replica,
// NB: using fast compression because it's on the critical path for compute
// startup. For an empty database, we get <100KB with this method. The
// Level::Best compression method gives us <20KB, but maybe we should add
// basebackup caching on compute shutdown first.
gzip.then_some(async_compression::Level::Fastest),
&ctx,
)
.await?;
@@ -3512,7 +3553,7 @@ impl proto::PageService for GrpcPageServiceHandler {
if timeline.is_archived() == Some(true) {
return Err(tonic::Status::failed_precondition("timeline is archived"));
}
let req: page_api::GetBaseBackupRequest = req.into_inner().try_into()?;
let req: page_api::GetBaseBackupRequest = req.into_inner().into();
span_record!(lsn=?req.lsn);
@@ -3538,50 +3579,20 @@ impl proto::PageService for GrpcPageServiceHandler {
let span = Span::current();
let (mut simplex_read, mut simplex_write) = tokio::io::simplex(CHUNK_SIZE);
let jh = tokio::spawn(async move {
let gzip_level = match req.compression {
page_api::BaseBackupCompression::None => None,
// NB: using fast compression because it's on the critical path for compute
// startup. For an empty database, we get <100KB with this method. The
// Level::Best compression method gives us <20KB, but maybe we should add
// basebackup caching on compute shutdown first.
page_api::BaseBackupCompression::Gzip => Some(async_compression::Level::Fastest),
};
// Check for a cached basebackup.
let cached = timeline
.get_cached_basebackup_if_enabled(
req.lsn,
None,
req.full,
req.replica,
gzip_level.is_some(),
)
.await;
let result = if let Some(mut cached) = cached {
// If we have a cached basebackup, send it.
tokio::io::copy(&mut cached, &mut simplex_write)
.await
.map(|_| ())
.map_err(|err| BasebackupError::Client(err, "cached,copy"))
} else {
basebackup::send_basebackup_tarball(
&mut simplex_write,
&timeline,
req.lsn,
None,
req.full,
req.replica,
gzip_level,
&ctx,
)
.instrument(span) // propagate request span
.await
};
simplex_write
.shutdown()
.await
.map_err(|err| BasebackupError::Client(err, "simplex_write"))?;
let result = basebackup::send_basebackup_tarball(
&mut simplex_write,
&timeline,
req.lsn,
None,
req.full,
req.replica,
&ctx,
)
.instrument(span) // propagate request span
.await;
simplex_write.shutdown().await.map_err(|err| {
BasebackupError::Server(anyhow!("simplex shutdown failed: {err}"))
})?;
result
});
@@ -3761,36 +3772,6 @@ impl proto::PageService for GrpcPageServiceHandler {
let resp: page_api::GetSlruSegmentResponse = resp.segment;
Ok(tonic::Response::new(resp.into()))
}
#[instrument(skip_all, fields(lsn))]
async fn lease_lsn(
&self,
req: tonic::Request<proto::LeaseLsnRequest>,
) -> Result<tonic::Response<proto::LeaseLsnResponse>, tonic::Status> {
let timeline = self.get_request_timeline(&req).await?;
let ctx = self.ctx.with_scope_timeline(&timeline);
// Validate and convert the request, and decorate the span.
let req: page_api::LeaseLsnRequest = req.into_inner().try_into()?;
span_record!(lsn=%req.lsn);
// Attempt to acquire a lease. Return FailedPrecondition if the lease could not be granted.
let lease_length = timeline.get_lsn_lease_length();
let expires = match timeline.renew_lsn_lease(req.lsn, lease_length, &ctx) {
Ok(lease) => lease.valid_until,
Err(err) => return Err(tonic::Status::failed_precondition(format!("{err}"))),
};
// TODO: is this spammy? Move it compute-side?
info!(
"acquired lease for {} until {}",
req.lsn,
chrono::DateTime::<Utc>::from(expires).to_rfc3339()
);
Ok(tonic::Response::new(expires.into()))
}
}
/// gRPC middleware layer that handles observability concerns:

View File

@@ -3015,7 +3015,7 @@ mod tests {
// This shard will get the even blocks
let shard = ShardIdentity::from_params(
ShardNumber(0),
ShardParameters {
&ShardParameters {
count: ShardCount(2),
stripe_size: ShardStripeSize(1),
},

View File

@@ -80,13 +80,13 @@ use self::timeline::uninit::{TimelineCreateGuard, TimelineExclusionError, Uninit
use self::timeline::{
EvictionTaskTenantState, GcCutoffs, TimelineDeleteProgress, TimelineResources, WaitLsnError,
};
use crate::basebackup_cache::BasebackupCache;
use crate::basebackup_cache::BasebackupPrepareSender;
use crate::config::PageServerConf;
use crate::context;
use crate::context::RequestContextBuilder;
use crate::context::{DownloadBehavior, RequestContext};
use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
use crate::feature_resolver::{FeatureResolver, TenantFeatureResolver};
use crate::feature_resolver::FeatureResolver;
use crate::l0_flush::L0FlushGlobalState;
use crate::metrics::{
BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, CONCURRENT_INITDBS,
@@ -162,7 +162,7 @@ pub struct TenantSharedResources {
pub remote_storage: GenericRemoteStorage,
pub deletion_queue_client: DeletionQueueClient,
pub l0_flush_global_state: L0FlushGlobalState,
pub basebackup_cache: Arc<BasebackupCache>,
pub basebackup_prepare_sender: BasebackupPrepareSender,
pub feature_resolver: FeatureResolver,
}
@@ -331,7 +331,7 @@ pub struct TenantShard {
deletion_queue_client: DeletionQueueClient,
/// A channel to send async requests to prepare a basebackup for the basebackup cache.
basebackup_cache: Arc<BasebackupCache>,
basebackup_prepare_sender: BasebackupPrepareSender,
/// Cached logical sizes updated updated on each [`TenantShard::gather_size_inputs`].
cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
@@ -386,7 +386,7 @@ pub struct TenantShard {
l0_flush_global_state: L0FlushGlobalState,
pub(crate) feature_resolver: TenantFeatureResolver,
pub(crate) feature_resolver: FeatureResolver,
}
impl std::fmt::Debug for TenantShard {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -1363,7 +1363,7 @@ impl TenantShard {
remote_storage,
deletion_queue_client,
l0_flush_global_state,
basebackup_cache,
basebackup_prepare_sender,
feature_resolver,
} = resources;
@@ -1380,7 +1380,7 @@ impl TenantShard {
remote_storage.clone(),
deletion_queue_client,
l0_flush_global_state,
basebackup_cache,
basebackup_prepare_sender,
feature_resolver,
));
@@ -3263,7 +3263,7 @@ impl TenantShard {
};
let gc_compaction_strategy = self
.feature_resolver
.evaluate_multivariate("gc-comapction-strategy")
.evaluate_multivariate("gc-comapction-strategy", self.tenant_shard_id.tenant_id)
.ok();
let span = if let Some(gc_compaction_strategy) = gc_compaction_strategy {
info_span!("gc_compact_timeline", timeline_id = %timeline.timeline_id, strategy = %gc_compaction_strategy)
@@ -3285,7 +3285,6 @@ impl TenantShard {
.or_else(|err| match err {
// Ignore this, we likely raced with unarchival.
OffloadError::NotArchived => Ok(()),
OffloadError::AlreadyInProgress => Ok(()),
err => Err(err),
})?;
}
@@ -3409,9 +3408,6 @@ impl TenantShard {
if let Some(ref walredo_mgr) = self.walredo_mgr {
walredo_mgr.maybe_quiesce(WALREDO_IDLE_TIMEOUT);
}
// Update the feature resolver with the latest tenant-spcific data.
self.feature_resolver.update_cached_tenant_properties(self);
}
pub fn timeline_has_no_attached_children(&self, timeline_id: TimelineId) -> bool {
@@ -3876,10 +3872,6 @@ impl TenantShard {
&self.tenant_shard_id
}
pub(crate) fn get_shard_identity(&self) -> ShardIdentity {
self.shard_identity
}
pub(crate) fn get_shard_stripe_size(&self) -> ShardStripeSize {
self.shard_identity.stripe_size
}
@@ -4388,7 +4380,7 @@ impl TenantShard {
remote_storage: GenericRemoteStorage,
deletion_queue_client: DeletionQueueClient,
l0_flush_global_state: L0FlushGlobalState,
basebackup_cache: Arc<BasebackupCache>,
basebackup_prepare_sender: BasebackupPrepareSender,
feature_resolver: FeatureResolver,
) -> TenantShard {
assert!(!attached_conf.location.generation.is_none());
@@ -4493,11 +4485,8 @@ impl TenantShard {
ongoing_timeline_detach: std::sync::Mutex::default(),
gc_block: Default::default(),
l0_flush_global_state,
basebackup_cache,
feature_resolver: TenantFeatureResolver::new(
feature_resolver,
tenant_shard_id.tenant_id,
),
basebackup_prepare_sender,
feature_resolver,
}
}
@@ -4536,10 +4525,6 @@ impl TenantShard {
Ok(toml_edit::de::from_str::<LocationConf>(&config)?)
}
/// Stores a tenant location config to disk.
///
/// NB: make sure to call `ShardIdentity::assert_equal` before persisting a new config, to avoid
/// changes to shard parameters that may result in data corruption.
#[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
pub(super) async fn persist_tenant_config(
conf: &'static PageServerConf,
@@ -5429,7 +5414,7 @@ impl TenantShard {
pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(),
l0_compaction_trigger: self.l0_compaction_trigger.clone(),
l0_flush_global_state: self.l0_flush_global_state.clone(),
basebackup_cache: self.basebackup_cache.clone(),
basebackup_prepare_sender: self.basebackup_prepare_sender.clone(),
feature_resolver: self.feature_resolver.clone(),
}
}
@@ -6015,7 +6000,7 @@ pub(crate) mod harness {
) -> anyhow::Result<Arc<TenantShard>> {
let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));
let (basebackup_cache, _) = BasebackupCache::new(Utf8PathBuf::new(), None);
let (basebackup_requst_sender, _) = tokio::sync::mpsc::unbounded_channel();
let tenant = Arc::new(TenantShard::new(
TenantState::Attaching,
@@ -6023,7 +6008,7 @@ pub(crate) mod harness {
AttachedTenantConf::try_from(LocationConf::attached_single(
self.tenant_conf.clone(),
self.generation,
ShardParameters::default(),
&ShardParameters::default(),
))
.unwrap(),
self.shard_identity,
@@ -6033,7 +6018,7 @@ pub(crate) mod harness {
self.deletion_queue.new_client(),
// TODO: ideally we should run all unit tests with both configs
L0FlushGlobalState::new(L0FlushConfig::default()),
basebackup_cache,
basebackup_requst_sender,
FeatureResolver::new_disabled(),
));
@@ -11444,11 +11429,11 @@ mod tests {
if left != right {
eprintln!("---LEFT---");
for left in left.iter() {
eprintln!("{left}");
eprintln!("{}", left);
}
eprintln!("---RIGHT---");
for right in right.iter() {
eprintln!("{right}");
eprintln!("{}", right);
}
assert_eq!(left, right);
}

View File

@@ -12,7 +12,6 @@
use pageserver_api::models;
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
use serde::{Deserialize, Serialize};
use utils::critical;
use utils::generation::Generation;
#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
@@ -137,7 +136,7 @@ impl LocationConf {
pub(crate) fn attached_single(
tenant_conf: pageserver_api::models::TenantConfig,
generation: Generation,
shard_params: models::ShardParameters,
shard_params: &models::ShardParameters,
) -> Self {
Self {
mode: LocationMode::Attached(AttachedLocationConfig {
@@ -172,16 +171,6 @@ impl LocationConf {
}
}
// This should never happen.
// TODO: turn this into a proper assertion.
if stripe_size != self.shard.stripe_size {
critical!(
"stripe size mismatch: {} != {}",
self.shard.stripe_size,
stripe_size,
);
}
self.shard.stripe_size = stripe_size;
}

View File

@@ -880,9 +880,6 @@ impl TenantManager {
// phase of writing config and/or waiting for flush, before returning.
match fast_path_taken {
Some(FastPathModified::Attached(tenant)) => {
tenant
.shard_identity
.assert_equal(new_location_config.shard);
TenantShard::persist_tenant_config(
self.conf,
&tenant_shard_id,
@@ -917,10 +914,7 @@ impl TenantManager {
return Ok(Some(tenant));
}
Some(FastPathModified::Secondary(secondary_tenant)) => {
secondary_tenant
.shard_identity
.assert_equal(new_location_config.shard);
Some(FastPathModified::Secondary(_secondary_tenant)) => {
TenantShard::persist_tenant_config(
self.conf,
&tenant_shard_id,
@@ -954,10 +948,6 @@ impl TenantManager {
match slot_guard.get_old_value() {
Some(TenantSlot::Attached(tenant)) => {
tenant
.shard_identity
.assert_equal(new_location_config.shard);
// The case where we keep a Tenant alive was covered above in the special case
// for Attached->Attached transitions in the same generation. By this point,
// if we see an attached tenant we know it will be discarded and should be
@@ -991,13 +981,9 @@ impl TenantManager {
// rather than assuming it to be empty.
spawn_mode = SpawnMode::Eager;
}
Some(TenantSlot::Secondary(secondary_tenant)) => {
secondary_tenant
.shard_identity
.assert_equal(new_location_config.shard);
Some(TenantSlot::Secondary(state)) => {
info!("Shutting down secondary tenant");
secondary_tenant.shutdown().await;
state.shutdown().await;
}
Some(TenantSlot::InProgress(_)) => {
// This should never happen: acquire_slot should error out
@@ -2214,7 +2200,7 @@ impl TenantManager {
selector: ShardSelector,
) -> ShardResolveResult {
let tenants = self.tenants.read().unwrap();
let mut want_shard: Option<ShardIndex> = None;
let mut want_shard = None;
let mut any_in_progress = None;
match &*tenants {
@@ -2239,23 +2225,14 @@ impl TenantManager {
return ShardResolveResult::Found(tenant.clone());
}
ShardSelector::Page(key) => {
// Each time we find an attached slot with a different shard count,
// recompute the expected shard number: during shard splits we might
// have multiple shards with the old shard count.
if want_shard.is_none()
|| want_shard.unwrap().shard_count != tenant.shard_identity.count
{
want_shard = Some(ShardIndex {
shard_number: tenant.shard_identity.get_shard_number(&key),
shard_count: tenant.shard_identity.count,
});
// First slot we see for this tenant, calculate the expected shard number
// for the key: we will use this for checking if this and subsequent
// slots contain the key, rather than recalculating the hash each time.
if want_shard.is_none() {
want_shard = Some(tenant.shard_identity.get_shard_number(&key));
}
if Some(ShardIndex {
shard_number: tenant.shard_identity.number,
shard_count: tenant.shard_identity.count,
}) == want_shard
{
if Some(tenant.shard_identity.number) == want_shard {
return ShardResolveResult::Found(tenant.clone());
}
}
@@ -2914,18 +2891,14 @@ mod tests {
use std::collections::BTreeMap;
use std::sync::Arc;
use camino::Utf8PathBuf;
use storage_broker::BrokerClientChannel;
use tracing::Instrument;
use super::super::harness::TenantHarness;
use super::TenantsMap;
use crate::{
basebackup_cache::BasebackupCache,
tenant::{
TenantSharedResources,
mgr::{BackgroundPurges, TenantManager, TenantSlot},
},
use crate::tenant::{
TenantSharedResources,
mgr::{BackgroundPurges, TenantManager, TenantSlot},
};
#[tokio::test(start_paused = true)]
@@ -2951,7 +2924,9 @@ mod tests {
// Invoke remove_tenant_from_memory with a cleanup hook that blocks until we manually
// permit it to proceed: that will stick the tenant in InProgress
let (basebackup_cache, _) = BasebackupCache::new(Utf8PathBuf::new(), None);
let (basebackup_prepare_sender, _) = tokio::sync::mpsc::unbounded_channel::<
crate::basebackup_cache::BasebackupPrepareRequest,
>();
let tenant_manager = TenantManager {
tenants: std::sync::RwLock::new(TenantsMap::Open(tenants)),
@@ -2965,7 +2940,7 @@ mod tests {
l0_flush_global_state: crate::l0_flush::L0FlushGlobalState::new(
h.conf.l0_flush.clone(),
),
basebackup_cache,
basebackup_prepare_sender,
feature_resolver: crate::feature_resolver::FeatureResolver::new_disabled(),
},
cancel: tokio_util::sync::CancellationToken::new(),

View File

@@ -101,7 +101,7 @@ pub(crate) struct SecondaryTenant {
// Secondary mode does not need the full shard identity or the pageserver_api::models::TenantConfig. However,
// storing these enables us to report our full LocationConf, enabling convenient reconciliation
// by the control plane (see [`Self::get_location_conf`])
pub(crate) shard_identity: ShardIdentity,
shard_identity: ShardIdentity,
tenant_conf: std::sync::Mutex<pageserver_api::models::TenantConfig>,
// Internal state used by the Downloader.

View File

@@ -55,11 +55,11 @@ pub struct BatchLayerWriter {
}
impl BatchLayerWriter {
pub fn new(conf: &'static PageServerConf) -> Self {
Self {
pub async fn new(conf: &'static PageServerConf) -> anyhow::Result<Self> {
Ok(Self {
generated_layer_writers: Vec::new(),
conf,
}
})
}
pub fn add_unfinished_image_writer(
@@ -182,7 +182,7 @@ impl BatchLayerWriter {
/// An image writer that takes images and produces multiple image layers.
#[must_use]
pub struct SplitImageLayerWriter<'a> {
inner: Option<ImageLayerWriter>,
inner: ImageLayerWriter,
target_layer_size: u64,
lsn: Lsn,
conf: &'static PageServerConf,
@@ -196,7 +196,7 @@ pub struct SplitImageLayerWriter<'a> {
impl<'a> SplitImageLayerWriter<'a> {
#[allow(clippy::too_many_arguments)]
pub fn new(
pub async fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
@@ -205,19 +205,30 @@ impl<'a> SplitImageLayerWriter<'a> {
target_layer_size: u64,
gate: &'a utils::sync::gate::Gate,
cancel: CancellationToken,
) -> Self {
Self {
ctx: &RequestContext,
) -> anyhow::Result<Self> {
Ok(Self {
target_layer_size,
inner: None,
inner: ImageLayerWriter::new(
conf,
timeline_id,
tenant_shard_id,
&(start_key..Key::MAX),
lsn,
gate,
cancel.clone(),
ctx,
)
.await?,
conf,
timeline_id,
tenant_shard_id,
batches: BatchLayerWriter::new(conf),
batches: BatchLayerWriter::new(conf).await?,
lsn,
start_key,
gate,
cancel,
}
})
}
pub async fn put_image(
@@ -226,31 +237,12 @@ impl<'a> SplitImageLayerWriter<'a> {
img: Bytes,
ctx: &RequestContext,
) -> Result<(), PutError> {
if self.inner.is_none() {
self.inner = Some(
ImageLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
&(self.start_key..Key::MAX),
self.lsn,
self.gate,
self.cancel.clone(),
ctx,
)
.await
.map_err(PutError::Other)?,
);
}
let inner = self.inner.as_mut().unwrap();
// The current estimation is an upper bound of the space that the key/image could take
// because we did not consider compression in this estimation. The resulting image layer
// could be smaller than the target size.
let addition_size_estimation = KEY_SIZE as u64 + img.len() as u64;
if inner.num_keys() >= 1
&& inner.estimated_size() + addition_size_estimation >= self.target_layer_size
if self.inner.num_keys() >= 1
&& self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
{
let next_image_writer = ImageLayerWriter::new(
self.conf,
@@ -264,7 +256,7 @@ impl<'a> SplitImageLayerWriter<'a> {
)
.await
.map_err(PutError::Other)?;
let prev_image_writer = std::mem::replace(inner, next_image_writer);
let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer);
self.batches.add_unfinished_image_writer(
prev_image_writer,
self.start_key..key,
@@ -272,7 +264,7 @@ impl<'a> SplitImageLayerWriter<'a> {
);
self.start_key = key;
}
inner.put_image(key, img, ctx).await
self.inner.put_image(key, img, ctx).await
}
pub(crate) async fn finish_with_discard_fn<D, F>(
@@ -289,10 +281,8 @@ impl<'a> SplitImageLayerWriter<'a> {
let Self {
mut batches, inner, ..
} = self;
if let Some(inner) = inner {
if inner.num_keys() != 0 {
batches.add_unfinished_image_writer(inner, self.start_key..end_key, self.lsn);
}
if inner.num_keys() != 0 {
batches.add_unfinished_image_writer(inner, self.start_key..end_key, self.lsn);
}
batches.finish_with_discard_fn(tline, ctx, discard_fn).await
}
@@ -329,7 +319,7 @@ pub struct SplitDeltaLayerWriter<'a> {
}
impl<'a> SplitDeltaLayerWriter<'a> {
pub fn new(
pub async fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
@@ -337,8 +327,8 @@ impl<'a> SplitDeltaLayerWriter<'a> {
target_layer_size: u64,
gate: &'a utils::sync::gate::Gate,
cancel: CancellationToken,
) -> Self {
Self {
) -> anyhow::Result<Self> {
Ok(Self {
target_layer_size,
inner: None,
conf,
@@ -346,10 +336,10 @@ impl<'a> SplitDeltaLayerWriter<'a> {
tenant_shard_id,
lsn_range,
last_key_written: Key::MIN,
batches: BatchLayerWriter::new(conf),
batches: BatchLayerWriter::new(conf).await?,
gate,
cancel,
}
})
}
pub async fn put_value(
@@ -507,7 +497,10 @@ mod tests {
4 * 1024 * 1024,
&tline.gate,
tline.cancel.clone(),
);
&ctx,
)
.await
.unwrap();
let mut delta_writer = SplitDeltaLayerWriter::new(
tenant.conf,
@@ -517,7 +510,9 @@ mod tests {
4 * 1024 * 1024,
&tline.gate,
tline.cancel.clone(),
);
)
.await
.unwrap();
image_writer
.put_image(get_key(0), get_img(0), &ctx)
@@ -583,7 +578,10 @@ mod tests {
4 * 1024 * 1024,
&tline.gate,
tline.cancel.clone(),
);
&ctx,
)
.await
.unwrap();
let mut delta_writer = SplitDeltaLayerWriter::new(
tenant.conf,
tline.timeline_id,
@@ -592,7 +590,9 @@ mod tests {
4 * 1024 * 1024,
&tline.gate,
tline.cancel.clone(),
);
)
.await
.unwrap();
const N: usize = 2000;
for i in 0..N {
let i = i as u32;
@@ -679,7 +679,10 @@ mod tests {
4 * 1024,
&tline.gate,
tline.cancel.clone(),
);
&ctx,
)
.await
.unwrap();
let mut delta_writer = SplitDeltaLayerWriter::new(
tenant.conf,
@@ -689,7 +692,9 @@ mod tests {
4 * 1024,
&tline.gate,
tline.cancel.clone(),
);
)
.await
.unwrap();
image_writer
.put_image(get_key(0), get_img(0), &ctx)
@@ -765,7 +770,9 @@ mod tests {
4 * 1024 * 1024,
&tline.gate,
tline.cancel.clone(),
);
)
.await
.unwrap();
for i in 0..N {
let i = i as u32;

View File

@@ -17,17 +17,14 @@ use tracing::*;
use utils::backoff::exponential_backoff_duration;
use utils::completion::Barrier;
use utils::pausable_failpoint;
use utils::sync::gate::GateError;
use crate::context::{DownloadBehavior, RequestContext};
use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS};
use crate::task_mgr::{self, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS, TaskKind};
use crate::tenant::blob_io::WriteBlobError;
use crate::tenant::throttle::Stats;
use crate::tenant::timeline::CompactionError;
use crate::tenant::timeline::compaction::CompactionOutcome;
use crate::tenant::{TenantShard, TenantState};
use crate::virtual_file::owned_buffers_io::write::FlushTaskError;
/// Semaphore limiting concurrent background tasks (across all tenants).
///
@@ -316,20 +313,7 @@ pub(crate) fn log_compaction_error(
let timeline = root_cause
.downcast_ref::<PageReconstructError>()
.is_some_and(|e| e.is_stopping());
let buffered_writer_flush_task_canelled = root_cause
.downcast_ref::<FlushTaskError>()
.is_some_and(|e| e.is_cancel());
let write_blob_cancelled = root_cause
.downcast_ref::<WriteBlobError>()
.is_some_and(|e| e.is_cancel());
let gate_closed = root_cause
.downcast_ref::<GateError>()
.is_some_and(|e| e.is_cancel());
let is_stopping = upload_queue
|| timeline
|| buffered_writer_flush_task_canelled
|| write_blob_cancelled
|| gate_closed;
let is_stopping = upload_queue || timeline;
if is_stopping {
Level::INFO

View File

@@ -78,7 +78,7 @@ use utils::rate_limit::RateLimit;
use utils::seqwait::SeqWait;
use utils::simple_rcu::{Rcu, RcuReadGuard};
use utils::sync::gate::{Gate, GateGuard};
use utils::{completion, critical_timeline, fs_ext, pausable_failpoint};
use utils::{completion, critical, fs_ext, pausable_failpoint};
#[cfg(test)]
use wal_decoder::models::value::Value;
use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};
@@ -95,18 +95,18 @@ use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer};
use super::tasks::log_compaction_error;
use super::upload_queue::NotInitialized;
use super::{
AttachedTenantConf, GcError, HeatMapTimeline, MaybeOffloaded,
AttachedTenantConf, BasebackupPrepareSender, GcError, HeatMapTimeline, MaybeOffloaded,
debug_assert_current_span_has_tenant_and_timeline_id,
};
use crate::PERF_TRACE_TARGET;
use crate::aux_file::AuxFileSizeEstimator;
use crate::basebackup_cache::BasebackupCache;
use crate::basebackup_cache::BasebackupPrepareRequest;
use crate::config::PageServerConf;
use crate::context::{
DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
};
use crate::disk_usage_eviction_task::{DiskUsageEvictionInfo, EvictionCandidate, finite_f32};
use crate::feature_resolver::TenantFeatureResolver;
use crate::feature_resolver::FeatureResolver;
use crate::keyspace::{KeyPartitioning, KeySpace};
use crate::l0_flush::{self, L0FlushGlobalState};
use crate::metrics::{
@@ -201,8 +201,8 @@ pub struct TimelineResources {
pub pagestream_throttle_metrics: Arc<crate::metrics::tenant_throttling::Pagestream>,
pub l0_compaction_trigger: Arc<Notify>,
pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
pub basebackup_cache: Arc<BasebackupCache>,
pub feature_resolver: TenantFeatureResolver,
pub basebackup_prepare_sender: BasebackupPrepareSender,
pub feature_resolver: FeatureResolver,
}
pub struct Timeline {
@@ -448,9 +448,9 @@ pub struct Timeline {
wait_lsn_log_slow: tokio::sync::Semaphore,
/// A channel to send async requests to prepare a basebackup for the basebackup cache.
basebackup_cache: Arc<BasebackupCache>,
basebackup_prepare_sender: BasebackupPrepareSender,
feature_resolver: TenantFeatureResolver,
feature_resolver: FeatureResolver,
}
pub(crate) enum PreviousHeatmap {
@@ -763,7 +763,7 @@ pub(crate) enum CreateImageLayersError {
PageReconstructError(#[source] PageReconstructError),
#[error(transparent)]
Other(anyhow::Error),
Other(#[from] anyhow::Error),
}
impl From<layer_manager::Shutdown> for CreateImageLayersError {
@@ -2500,37 +2500,6 @@ impl Timeline {
.unwrap_or(self.conf.default_tenant_conf.basebackup_cache_enabled)
}
/// Try to get a basebackup from the on-disk cache.
pub(crate) async fn get_cached_basebackup(&self, lsn: Lsn) -> Option<tokio::fs::File> {
self.basebackup_cache
.get(self.tenant_shard_id.tenant_id, self.timeline_id, lsn)
.await
}
/// Convenience method to attempt fetching a basebackup for the timeline if enabled and safe for
/// the given request parameters.
///
/// TODO: consider moving this onto GrpcPageServiceHandler once the libpq handler is gone.
pub async fn get_cached_basebackup_if_enabled(
&self,
lsn: Option<Lsn>,
prev_lsn: Option<Lsn>,
full: bool,
replica: bool,
gzip: bool,
) -> Option<tokio::fs::File> {
if !self.is_basebackup_cache_enabled() || !self.basebackup_cache.is_enabled() {
return None;
}
// We have to know which LSN to fetch the basebackup for.
let lsn = lsn?;
// We only cache gzipped, non-full basebackups for primary computes with automatic prev_lsn.
if prev_lsn.is_some() || full || replica || !gzip {
return None;
}
self.get_cached_basebackup(lsn).await
}
/// Prepare basebackup for the given LSN and store it in the basebackup cache.
/// The method is asynchronous and returns immediately.
/// The actual basebackup preparation is performed in the background
@@ -2552,8 +2521,17 @@ impl Timeline {
return;
}
self.basebackup_cache
.send_prepare(self.tenant_shard_id, self.timeline_id, lsn);
let res = self
.basebackup_prepare_sender
.send(BasebackupPrepareRequest {
tenant_shard_id: self.tenant_shard_id,
timeline_id: self.timeline_id,
lsn,
});
if let Err(e) = res {
// May happen during shutdown, it's not critical.
info!("Failed to send shutdown checkpoint: {e:#}");
}
}
}
@@ -3110,7 +3088,7 @@ impl Timeline {
wait_lsn_log_slow: tokio::sync::Semaphore::new(1),
basebackup_cache: resources.basebackup_cache,
basebackup_prepare_sender: resources.basebackup_prepare_sender,
feature_resolver: resources.feature_resolver,
};
@@ -4680,16 +4658,6 @@ impl Timeline {
mut layer_flush_start_rx: tokio::sync::watch::Receiver<(u64, Lsn)>,
ctx: &RequestContext,
) {
// Always notify waiters about the flush loop exiting since the loop might stop
// when the timeline hasn't been cancelled.
let scopeguard_rx = layer_flush_start_rx.clone();
scopeguard::defer! {
let (flush_counter, _) = *scopeguard_rx.borrow();
let _ = self
.layer_flush_done_tx
.send_replace((flush_counter, Err(FlushLayerError::Cancelled)));
}
// Subscribe to L0 delta layer updates, for compaction backpressure.
let mut watch_l0 = match self
.layers
@@ -4719,6 +4687,9 @@ impl Timeline {
let result = loop {
if self.cancel.is_cancelled() {
info!("dropping out of flush loop for timeline shutdown");
// Note: we do not bother transmitting into [`layer_flush_done_tx`], because
// anyone waiting on that will respect self.cancel as well: they will stop
// waiting at the same time we as drop out of this loop.
return;
}
@@ -4729,7 +4700,7 @@ impl Timeline {
}
// Fetch the next layer to flush, if any.
let (layer, l0_count, frozen_count, frozen_size, open_layer_size) = {
let (layer, l0_count, frozen_count, frozen_size) = {
let layers = self.layers.read(LayerManagerLockHolder::FlushLoop).await;
let Ok(lm) = layers.layer_map() else {
info!("dropping out of flush loop for timeline shutdown");
@@ -4742,13 +4713,8 @@ impl Timeline {
.iter()
.map(|l| l.estimated_in_mem_size())
.sum();
let open_layer_size: u64 = lm
.open_layer
.as_ref()
.map(|l| l.estimated_in_mem_size())
.unwrap_or(0);
let layer = lm.frozen_layers.front().cloned();
(layer, l0_count, frozen_count, frozen_size, open_layer_size)
(layer, l0_count, frozen_count, frozen_size)
// drop 'layers' lock
};
let Some(layer) = layer else {
@@ -4761,7 +4727,7 @@ impl Timeline {
if l0_count >= stall_threshold {
warn!(
"stalling layer flushes for compaction backpressure at {l0_count} \
L0 layers ({frozen_count} frozen layers with {frozen_size} bytes, {open_layer_size} bytes in open layer)"
L0 layers ({frozen_count} frozen layers with {frozen_size} bytes)"
);
let stall_timer = self
.metrics
@@ -4814,7 +4780,7 @@ impl Timeline {
let delay = flush_duration.as_secs_f64();
info!(
"delaying layer flush by {delay:.3}s for compaction backpressure at \
{l0_count} L0 layers ({frozen_count} frozen layers with {frozen_size} bytes, {open_layer_size} bytes in open layer)"
{l0_count} L0 layers ({frozen_count} frozen layers with {frozen_size} bytes)"
);
let _delay_timer = self
.metrics
@@ -5313,7 +5279,6 @@ impl Timeline {
ctx: &RequestContext,
img_range: Range<Key>,
io_concurrency: IoConcurrency,
progress: Option<(usize, usize)>,
) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
let mut wrote_keys = false;
@@ -5390,15 +5355,11 @@ impl Timeline {
}
}
let progress_report = progress
.map(|(idx, total)| format!("({idx}/{total}) "))
.unwrap_or_default();
if wrote_keys {
// Normal path: we have written some data into the new image layer for this
// partition, so flush it to disk.
info!(
"{} produced image layer for rel {}",
progress_report,
"produced image layer for rel {}",
ImageLayerName {
key_range: img_range.clone(),
lsn
@@ -5408,12 +5369,7 @@ impl Timeline {
unfinished_image_layer: image_layer_writer,
})
} else {
tracing::debug!(
"{} no data in range {}-{}",
progress_report,
img_range.start,
img_range.end
);
tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
Ok(ImageLayerCreationOutcome::Empty)
}
}
@@ -5605,7 +5561,7 @@ impl Timeline {
self.should_check_if_image_layers_required(lsn)
};
let mut batch_image_writer = BatchLayerWriter::new(self.conf);
let mut batch_image_writer = BatchLayerWriter::new(self.conf).await?;
let mut all_generated = true;
@@ -5648,8 +5604,7 @@ impl Timeline {
}
}
let total = partition_parts.len();
for (idx, partition) in partition_parts.iter().enumerate() {
for partition in partition_parts.iter() {
if self.cancel.is_cancelled() {
return Err(CreateImageLayersError::Cancelled);
}
@@ -5710,8 +5665,7 @@ impl Timeline {
self.cancel.clone(),
ctx,
)
.await
.map_err(CreateImageLayersError::Other)?;
.await?;
fail_point!("image-layer-writer-fail-before-finish", |_| {
Err(CreateImageLayersError::Other(anyhow::anyhow!(
@@ -5734,7 +5688,6 @@ impl Timeline {
ctx,
img_range.clone(),
io_concurrency,
Some((idx, total)),
)
.await?
} else {
@@ -5807,10 +5760,7 @@ impl Timeline {
}
}
let image_layers = batch_image_writer
.finish(self, ctx)
.await
.map_err(CreateImageLayersError::Other)?;
let image_layers = batch_image_writer.finish(self, ctx).await?;
let mut guard = self.layers.write(LayerManagerLockHolder::Compaction).await;
@@ -6824,11 +6774,7 @@ impl Timeline {
Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled),
Err(walredo::Error::Other(err)) => {
if fire_critical_error {
critical_timeline!(
self.tenant_shard_id,
self.timeline_id,
"walredo failure during page reconstruction: {err:?}"
);
critical!("walredo failure during page reconstruction: {err:?}");
}
return Err(PageReconstructError::WalRedo(
err.context("reconstruct a page image"),

View File

@@ -9,7 +9,7 @@ use std::ops::{Deref, Range};
use std::sync::Arc;
use std::time::{Duration, Instant};
use super::layer_manager::LayerManagerLockHolder;
use super::layer_manager::{LayerManagerLockHolder, LayerManagerReadGuard};
use super::{
CompactFlags, CompactOptions, CompactionError, CreateImageLayersError, DurationRecorder,
GetVectoredError, ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration,
@@ -36,7 +36,7 @@ use serde::Serialize;
use tokio::sync::{OwnedSemaphorePermit, Semaphore};
use tokio_util::sync::CancellationToken;
use tracing::{Instrument, debug, error, info, info_span, trace, warn};
use utils::critical_timeline;
use utils::critical;
use utils::id::TimelineId;
use utils::lsn::Lsn;
use wal_decoder::models::record::NeonWalRecord;
@@ -101,11 +101,7 @@ pub enum GcCompactionQueueItem {
/// Whether the compaction is triggered automatically (determines whether we need to update L2 LSN)
auto: bool,
},
SubCompactionJob {
i: usize,
total: usize,
options: CompactOptions,
},
SubCompactionJob(CompactOptions),
Notify(GcCompactionJobId, Option<Lsn>),
}
@@ -167,7 +163,7 @@ impl GcCompactionQueueItem {
running,
job_id: id.0,
}),
GcCompactionQueueItem::SubCompactionJob { options, .. } => Some(CompactInfoResponse {
GcCompactionQueueItem::SubCompactionJob(options) => Some(CompactInfoResponse {
compact_key_range: options.compact_key_range,
compact_lsn_range: options.compact_lsn_range,
sub_compaction: options.sub_compaction,
@@ -493,7 +489,7 @@ impl GcCompactionQueue {
.map(|job| job.compact_lsn_range.end)
.max()
.unwrap();
for (i, job) in jobs.into_iter().enumerate() {
for job in jobs {
// Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions`
// until we do further refactors to allow directly call `compact_with_gc`.
let mut flags: EnumSet<CompactFlags> = EnumSet::default();
@@ -511,11 +507,7 @@ impl GcCompactionQueue {
compact_lsn_range: Some(job.compact_lsn_range.into()),
sub_compaction_max_job_size_mb: None,
};
pending_tasks.push(GcCompactionQueueItem::SubCompactionJob {
options,
i,
total: jobs_len,
});
pending_tasks.push(GcCompactionQueueItem::SubCompactionJob(options));
}
if !auto {
@@ -659,7 +651,7 @@ impl GcCompactionQueue {
}
}
}
GcCompactionQueueItem::SubCompactionJob { options, i, total } => {
GcCompactionQueueItem::SubCompactionJob(options) => {
// TODO: error handling, clear the queue if any task fails?
let _gc_guard = match gc_block.start().await {
Ok(guard) => guard,
@@ -671,7 +663,6 @@ impl GcCompactionQueue {
)));
}
};
info!("running gc-compaction subcompaction job {}/{}", i, total);
let res = timeline.compact_with_options(cancel, options, ctx).await;
let compaction_result = match res {
Ok(res) => res,
@@ -1319,7 +1310,7 @@ impl Timeline {
|| cfg!(feature = "testing")
|| self
.feature_resolver
.evaluate_boolean("image-compaction-boundary")
.evaluate_boolean("image-compaction-boundary", self.tenant_shard_id.tenant_id)
.is_ok()
{
let last_repartition_lsn = self.partitioning.read().1;
@@ -1390,11 +1381,7 @@ impl Timeline {
GetVectoredError::MissingKey(_),
) = err
{
critical_timeline!(
self.tenant_shard_id,
self.timeline_id,
"missing key during compaction: {err:?}"
);
critical!("missing key during compaction: {err:?}");
}
})?;
@@ -1422,11 +1409,7 @@ impl Timeline {
// Alert on critical errors that indicate data corruption.
Err(err) if err.is_critical() => {
critical_timeline!(
self.tenant_shard_id,
self.timeline_id,
"could not compact, repartitioning keyspace failed: {err:?}"
);
critical!("could not compact, repartitioning keyspace failed: {err:?}");
}
// Log other errors. No partitioning? This is normal, if the timeline was just created
@@ -1608,15 +1591,13 @@ impl Timeline {
let started = Instant::now();
let mut replace_image_layers = Vec::new();
let total = layers_to_rewrite.len();
for (i, layer) in layers_to_rewrite.into_iter().enumerate() {
for layer in layers_to_rewrite {
if self.cancel.is_cancelled() {
return Err(CompactionError::ShuttingDown);
}
info!(layer=%layer, "rewriting layer after shard split: {}/{}", i, total);
info!(layer=%layer, "rewriting layer after shard split");
let mut image_layer_writer = ImageLayerWriter::new(
self.conf,
self.timeline_id,
@@ -1798,14 +1779,20 @@ impl Timeline {
} = {
let phase1_span = info_span!("compact_level0_phase1");
let ctx = ctx.attached_child();
let stats = CompactLevel0Phase1StatsBuilder {
let mut stats = CompactLevel0Phase1StatsBuilder {
version: Some(2),
tenant_id: Some(self.tenant_shard_id),
timeline_id: Some(self.timeline_id),
..Default::default()
};
let begin = tokio::time::Instant::now();
let phase1_layers_locked = self.layers.read(LayerManagerLockHolder::Compaction).await;
let now = tokio::time::Instant::now();
stats.read_lock_acquisition_micros =
DurationRecorder::Recorded(RecordedDuration(now - begin), now);
self.compact_level0_phase1(
phase1_layers_locked,
stats,
target_file_size,
force_compaction_ignore_threshold,
@@ -1826,19 +1813,16 @@ impl Timeline {
}
/// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment.
async fn compact_level0_phase1(
self: &Arc<Self>,
async fn compact_level0_phase1<'a>(
self: &'a Arc<Self>,
guard: LayerManagerReadGuard<'a>,
mut stats: CompactLevel0Phase1StatsBuilder,
target_file_size: u64,
force_compaction_ignore_threshold: bool,
ctx: &RequestContext,
) -> Result<CompactLevel0Phase1Result, CompactionError> {
let begin = tokio::time::Instant::now();
let guard = self.layers.read(LayerManagerLockHolder::Compaction).await;
let now = tokio::time::Instant::now();
stats.read_lock_acquisition_micros =
DurationRecorder::Recorded(RecordedDuration(now - begin), now);
stats.read_lock_held_spawn_blocking_startup_micros =
stats.read_lock_acquisition_micros.till_now(); // set by caller
let layers = guard.layer_map()?;
let level0_deltas = layers.level0_deltas();
stats.level0_deltas_count = Some(level0_deltas.len());
@@ -1873,12 +1857,6 @@ impl Timeline {
.map(|x| guard.get_from_desc(x))
.collect::<Vec<_>>();
drop_layer_manager_rlock(guard);
// The is the last LSN that we have seen for L0 compaction in the timeline. This LSN might be updated
// by the time we finish the compaction. So we need to get it here.
let l0_last_record_lsn = self.get_last_record_lsn();
// Gather the files to compact in this iteration.
//
// Start with the oldest Level 0 delta file, and collect any other
@@ -1966,7 +1944,9 @@ impl Timeline {
// we don't accidentally use it later in the function.
drop(level0_deltas);
stats.compaction_prerequisites_micros = stats.read_lock_acquisition_micros.till_now();
stats.read_lock_held_prerequisites_micros = stats
.read_lock_held_spawn_blocking_startup_micros
.till_now();
// TODO: replace with streaming k-merge
let all_keys = {
@@ -1988,7 +1968,7 @@ impl Timeline {
all_keys
};
stats.read_lock_held_key_sort_micros = stats.compaction_prerequisites_micros.till_now();
stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now();
// Determine N largest holes where N is number of compacted layers. The vec is sorted by key range start.
//
@@ -2022,6 +2002,7 @@ impl Timeline {
}
}
let max_holes = deltas_to_compact.len();
let last_record_lsn = self.get_last_record_lsn();
let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128;
let min_hole_coverage_size = 3; // TODO: something more flexible?
// min-heap (reserve space for one more element added before eviction)
@@ -2040,12 +2021,8 @@ impl Timeline {
// has not so much sense, because largest holes will corresponds field1/field2 changes.
// But we are mostly interested to eliminate holes which cause generation of excessive image layers.
// That is why it is better to measure size of hole as number of covering image layers.
let coverage_size = {
// TODO: optimize this with copy-on-write layer map.
let guard = self.layers.read(LayerManagerLockHolder::Compaction).await;
let layers = guard.layer_map()?;
layers.image_coverage(&key_range, l0_last_record_lsn).len()
};
let coverage_size =
layers.image_coverage(&key_range, last_record_lsn).len();
if coverage_size >= min_hole_coverage_size {
heap.push(Hole {
key_range,
@@ -2064,6 +2041,7 @@ impl Timeline {
holes
};
stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now();
drop_layer_manager_rlock(guard);
if self.cancel.is_cancelled() {
return Err(CompactionError::ShuttingDown);
@@ -2404,8 +2382,9 @@ struct CompactLevel0Phase1StatsBuilder {
tenant_id: Option<TenantShardId>,
timeline_id: Option<TimelineId>,
read_lock_acquisition_micros: DurationRecorder,
read_lock_held_spawn_blocking_startup_micros: DurationRecorder,
read_lock_held_key_sort_micros: DurationRecorder,
compaction_prerequisites_micros: DurationRecorder,
read_lock_held_prerequisites_micros: DurationRecorder,
read_lock_held_compute_holes_micros: DurationRecorder,
read_lock_drop_micros: DurationRecorder,
write_layer_files_micros: DurationRecorder,
@@ -2420,8 +2399,9 @@ struct CompactLevel0Phase1Stats {
tenant_id: TenantShardId,
timeline_id: TimelineId,
read_lock_acquisition_micros: RecordedDuration,
read_lock_held_spawn_blocking_startup_micros: RecordedDuration,
read_lock_held_key_sort_micros: RecordedDuration,
compaction_prerequisites_micros: RecordedDuration,
read_lock_held_prerequisites_micros: RecordedDuration,
read_lock_held_compute_holes_micros: RecordedDuration,
read_lock_drop_micros: RecordedDuration,
write_layer_files_micros: RecordedDuration,
@@ -2446,12 +2426,16 @@ impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
.read_lock_acquisition_micros
.into_recorded()
.ok_or_else(|| anyhow!("read_lock_acquisition_micros not set"))?,
read_lock_held_spawn_blocking_startup_micros: value
.read_lock_held_spawn_blocking_startup_micros
.into_recorded()
.ok_or_else(|| anyhow!("read_lock_held_spawn_blocking_startup_micros not set"))?,
read_lock_held_key_sort_micros: value
.read_lock_held_key_sort_micros
.into_recorded()
.ok_or_else(|| anyhow!("read_lock_held_key_sort_micros not set"))?,
compaction_prerequisites_micros: value
.compaction_prerequisites_micros
read_lock_held_prerequisites_micros: value
.read_lock_held_prerequisites_micros
.into_recorded()
.ok_or_else(|| anyhow!("read_lock_held_prerequisites_micros not set"))?,
read_lock_held_compute_holes_micros: value
@@ -3519,16 +3503,22 @@ impl Timeline {
// Only create image layers when there is no ancestor branches. TODO: create covering image layer
// when some condition meet.
let mut image_layer_writer = if !has_data_below {
Some(SplitImageLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
job_desc.compaction_key_range.start,
lowest_retain_lsn,
self.get_compaction_target_size(),
&self.gate,
self.cancel.clone(),
))
Some(
SplitImageLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
job_desc.compaction_key_range.start,
lowest_retain_lsn,
self.get_compaction_target_size(),
&self.gate,
self.cancel.clone(),
ctx,
)
.await
.context("failed to create image layer writer")
.map_err(CompactionError::Other)?,
)
} else {
None
};
@@ -3541,7 +3531,10 @@ impl Timeline {
self.get_compaction_target_size(),
&self.gate,
self.cancel.clone(),
);
)
.await
.context("failed to create delta layer writer")
.map_err(CompactionError::Other)?;
#[derive(Default)]
struct RewritingLayers {
@@ -4337,8 +4330,7 @@ impl TimelineAdaptor {
self.timeline.cancel.clone(),
ctx,
)
.await
.map_err(CreateImageLayersError::Other)?;
.await?;
fail_point!("image-layer-writer-fail-before-finish", |_| {
Err(CreateImageLayersError::Other(anyhow::anyhow!(
@@ -4347,10 +4339,7 @@ impl TimelineAdaptor {
});
let keyspace = KeySpace {
ranges: self
.get_keyspace(key_range, lsn, ctx)
.await
.map_err(CreateImageLayersError::Other)?,
ranges: self.get_keyspace(key_range, lsn, ctx).await?,
};
// TODO set proper (stateful) start. The create_image_layer_for_rel_blocks function mostly
let outcome = self
@@ -4362,7 +4351,6 @@ impl TimelineAdaptor {
ctx,
key_range.clone(),
IoConcurrency::sequential(),
None,
)
.await?;
@@ -4370,13 +4358,9 @@ impl TimelineAdaptor {
unfinished_image_layer,
} = outcome
{
let (desc, path) = unfinished_image_layer
.finish(ctx)
.await
.map_err(CreateImageLayersError::Other)?;
let (desc, path) = unfinished_image_layer.finish(ctx).await?;
let image_layer =
Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)
.map_err(CreateImageLayersError::Other)?;
Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?;
self.new_images.push(image_layer);
}

View File

@@ -241,17 +241,8 @@ impl DeleteTimelineFlow {
{
Ok(r) => r,
Err(DownloadError::NotFound) => {
// Deletion is already complete.
// As we came here, we will need to remove the timeline from the tenant though.
// Deletion is already complete
tracing::info!("Timeline already deleted in remote storage");
if let TimelineOrOffloaded::Offloaded(_) = &timeline {
// We only supoprt this for offloaded timelines, as we don't know which state non-offloaded timelines are in.
tracing::info!(
"Timeline with gone index part is offloaded timeline. Removing from tenant."
);
remove_maybe_offloaded_timeline_from_tenant(tenant, &timeline, &guard)
.await?;
}
return Ok(());
}
Err(e) => {

View File

@@ -182,7 +182,6 @@ pub(crate) async fn generate_tombstone_image_layer(
detached: &Arc<Timeline>,
ancestor: &Arc<Timeline>,
ancestor_lsn: Lsn,
historic_layers_to_copy: &Vec<Layer>,
ctx: &RequestContext,
) -> Result<Option<ResidentLayer>, Error> {
tracing::info!(
@@ -200,20 +199,6 @@ pub(crate) async fn generate_tombstone_image_layer(
let image_lsn = ancestor_lsn;
{
for layer in historic_layers_to_copy {
let desc = layer.layer_desc();
if !desc.is_delta
&& desc.lsn_range.start == image_lsn
&& overlaps_with(&key_range, &desc.key_range)
{
tracing::info!(
layer=%layer, "will copy tombstone from ancestor instead of creating a new one"
);
return Ok(None);
}
}
let layers = detached
.layers
.read(LayerManagerLockHolder::DetachAncestor)
@@ -465,8 +450,7 @@ pub(super) async fn prepare(
Vec::with_capacity(straddling_branchpoint.len() + rest_of_historic.len() + 1);
if let Some(tombstone_layer) =
generate_tombstone_image_layer(detached, &ancestor, ancestor_lsn, &rest_of_historic, ctx)
.await?
generate_tombstone_image_layer(detached, &ancestor, ancestor_lsn, ctx).await?
{
new_layers.push(tombstone_layer.into());
}
@@ -901,7 +885,7 @@ async fn remote_copy(
}
tracing::info!("Deleting orphan layer file to make way for hard linking");
// Delete orphan layer file and try again, to ensure this layer has a well understood source
std::fs::remove_file(&adoptee_path)
std::fs::remove_file(adopted_path)
.map_err(|e| Error::launder(e.into(), Error::Prepare))?;
std::fs::hard_link(adopted_path, &adoptee_path)
.map_err(|e| Error::launder(e.into(), Error::Prepare))?;

View File

@@ -887,7 +887,7 @@ mod tests {
.expect("we still have it");
}
fn make_relation_key_for_shard(shard: ShardNumber, params: ShardParameters) -> Key {
fn make_relation_key_for_shard(shard: ShardNumber, params: &ShardParameters) -> Key {
rel_block_to_key(
RelTag {
spcnode: 1663,
@@ -917,14 +917,14 @@ mod tests {
let child0 = Arc::new_cyclic(|myself| StubTimeline {
gate: Default::default(),
id: timeline_id,
shard: ShardIdentity::from_params(ShardNumber(0), child_params),
shard: ShardIdentity::from_params(ShardNumber(0), &child_params),
per_timeline_state: PerTimelineState::default(),
myself: myself.clone(),
});
let child1 = Arc::new_cyclic(|myself| StubTimeline {
gate: Default::default(),
id: timeline_id,
shard: ShardIdentity::from_params(ShardNumber(1), child_params),
shard: ShardIdentity::from_params(ShardNumber(1), &child_params),
per_timeline_state: PerTimelineState::default(),
myself: myself.clone(),
});
@@ -937,7 +937,7 @@ mod tests {
let handle = cache
.get(
timeline_id,
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), child_params)),
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
&StubManager {
shards: vec![parent.clone()],
},
@@ -961,7 +961,7 @@ mod tests {
let handle = cache
.get(
timeline_id,
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), child_params)),
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
&StubManager {
shards: vec![], // doesn't matter what's in here, the cache is fully loaded
},
@@ -978,7 +978,7 @@ mod tests {
let parent_handle = cache
.get(
timeline_id,
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(0), child_params)),
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(0), &child_params)),
&StubManager {
shards: vec![parent.clone()],
},
@@ -995,7 +995,7 @@ mod tests {
let handle = cache
.get(
timeline_id,
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), child_params)),
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
&StubManager {
shards: vec![child0.clone(), child1.clone()], // <====== this changed compared to previous loop
},

View File

@@ -19,8 +19,6 @@ pub(crate) enum OffloadError {
NotArchived,
#[error(transparent)]
RemoteStorage(anyhow::Error),
#[error("Offload or deletion already in progress")]
AlreadyInProgress,
#[error("Unexpected offload error: {0}")]
Other(anyhow::Error),
}
@@ -46,26 +44,20 @@ pub(crate) async fn offload_timeline(
timeline.timeline_id,
TimelineDeleteGuardKind::Offload,
);
let (timeline, guard) = match delete_guard_res {
Ok(timeline_and_guard) => timeline_and_guard,
Err(DeleteTimelineError::HasChildren(children)) => {
let is_archived = timeline.is_archived();
if is_archived == Some(true) {
tracing::error!("timeline is archived but has non-archived children: {children:?}");
return Err(OffloadError::NotArchived);
}
tracing::info!(
?is_archived,
"timeline is not archived and has unarchived children"
);
if let Err(DeleteTimelineError::HasChildren(children)) = delete_guard_res {
let is_archived = timeline.is_archived();
if is_archived == Some(true) {
tracing::error!("timeline is archived but has non-archived children: {children:?}");
return Err(OffloadError::NotArchived);
}
Err(DeleteTimelineError::AlreadyInProgress(_)) => {
tracing::info!("timeline offload or deletion already in progress");
return Err(OffloadError::AlreadyInProgress);
}
Err(e) => return Err(OffloadError::Other(anyhow::anyhow!(e))),
tracing::info!(
?is_archived,
"timeline is not archived and has unarchived children"
);
return Err(OffloadError::NotArchived);
};
let (timeline, guard) =
delete_guard_res.map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?;
let TimelineOrOffloaded::Timeline(timeline) = timeline else {
tracing::error!("timeline already offloaded, but given timeline object");

View File

@@ -25,7 +25,7 @@ use tokio_postgres::replication::ReplicationStream;
use tokio_postgres::{Client, SimpleQueryMessage, SimpleQueryRow};
use tokio_util::sync::CancellationToken;
use tracing::{Instrument, debug, error, info, trace, warn};
use utils::critical_timeline;
use utils::critical;
use utils::id::NodeId;
use utils::lsn::Lsn;
use utils::pageserver_feedback::PageserverFeedback;
@@ -368,13 +368,9 @@ pub(super) async fn handle_walreceiver_connection(
match raw_wal_start_lsn.cmp(&expected_wal_start) {
std::cmp::Ordering::Greater => {
let msg = format!(
"Gap in streamed WAL: [{expected_wal_start}, {raw_wal_start_lsn}"
);
critical_timeline!(
timeline.tenant_shard_id,
timeline.timeline_id,
"{msg}"
"Gap in streamed WAL: [{expected_wal_start}, {raw_wal_start_lsn})"
);
critical!("{msg}");
return Err(WalReceiverError::Other(anyhow!(msg)));
}
std::cmp::Ordering::Less => {
@@ -387,11 +383,7 @@ pub(super) async fn handle_walreceiver_connection(
"Received record with next_record_lsn multiple times ({} < {})",
first_rec.next_record_lsn, expected_wal_start
);
critical_timeline!(
timeline.tenant_shard_id,
timeline.timeline_id,
"{msg}"
);
critical!("{msg}");
return Err(WalReceiverError::Other(anyhow!(msg)));
}
}
@@ -460,11 +452,7 @@ pub(super) async fn handle_walreceiver_connection(
// TODO: we can't differentiate cancellation errors with
// anyhow::Error, so just ignore it if we're cancelled.
if !cancellation.is_cancelled() && !timeline.is_stopping() {
critical_timeline!(
timeline.tenant_shard_id,
timeline.timeline_id,
"{err:?}"
);
critical!("{err:?}")
}
})?;

View File

@@ -40,7 +40,7 @@ use tracing::*;
use utils::bin_ser::{DeserializeError, SerializeError};
use utils::lsn::Lsn;
use utils::rate_limit::RateLimit;
use utils::{critical_timeline, failpoint_support};
use utils::{critical, failpoint_support};
use wal_decoder::models::record::NeonWalRecord;
use wal_decoder::models::*;
@@ -418,30 +418,18 @@ impl WalIngest {
// as there has historically been cases where PostgreSQL has cleared spurious VM pages. See:
// https://github.com/neondatabase/neon/pull/10634.
let Some(vm_size) = get_relsize(modification, vm_rel, ctx).await? else {
critical_timeline!(
modification.tline.tenant_shard_id,
modification.tline.timeline_id,
"clear_vm_bits for unknown VM relation {vm_rel}"
);
critical!("clear_vm_bits for unknown VM relation {vm_rel}");
return Ok(());
};
if let Some(blknum) = new_vm_blk {
if blknum >= vm_size {
critical_timeline!(
modification.tline.tenant_shard_id,
modification.tline.timeline_id,
"new_vm_blk {blknum} not in {vm_rel} of size {vm_size}"
);
critical!("new_vm_blk {blknum} not in {vm_rel} of size {vm_size}");
new_vm_blk = None;
}
}
if let Some(blknum) = old_vm_blk {
if blknum >= vm_size {
critical_timeline!(
modification.tline.tenant_shard_id,
modification.tline.timeline_id,
"old_vm_blk {blknum} not in {vm_rel} of size {vm_size}"
);
critical!("old_vm_blk {blknum} not in {vm_rel} of size {vm_size}");
old_vm_blk = None;
}
}

View File

@@ -1295,8 +1295,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
if (iteration_hits != 0)
{
/* chunk offset (#
of pages) into the LFC file */
/* chunk offset (# of pages) into the LFC file */
off_t first_read_offset = (off_t) entry_offset * lfc_blocks_per_chunk;
int nwrite = iov_last_used - first_block_in_chunk_read;
/* offset of first IOV */
@@ -1314,6 +1313,16 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
lfc_disable("read");
return -1;
}
/*
* We successfully read the pages we know were valid when we
* started reading; now mark those pages as read
*/
for (int i = first_block_in_chunk_read; i < iov_last_used; i++)
{
if (BITMAP_ISSET(chunk_mask, i))
BITMAP_SET(mask, buf_offset + i);
}
}
/* Place entry to the head of LRU list */
@@ -1331,15 +1340,6 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
{
lfc_ctl->time_read += io_time_us;
inc_page_cache_read_wait(io_time_us);
/*
* We successfully read the pages we know were valid when we
* started reading; now mark those pages as read
*/
for (int i = first_block_in_chunk_read; i < iov_last_used; i++)
{
if (BITMAP_ISSET(chunk_mask, i))
BITMAP_SET(mask, buf_offset + i);
}
}
CriticalAssert(entry->access_count > 0);

View File

@@ -87,14 +87,6 @@ static const struct config_enum_entry running_xacts_overflow_policies[] = {
{NULL, 0, false}
};
static const struct config_enum_entry debug_compare_local_modes[] = {
{"none", DEBUG_COMPARE_LOCAL_NONE, false},
{"prefetch", DEBUG_COMPARE_LOCAL_PREFETCH, false},
{"lfc", DEBUG_COMPARE_LOCAL_LFC, false},
{"all", DEBUG_COMPARE_LOCAL_ALL, false},
{NULL, 0, false}
};
/*
* XXX: These private to procarray.c, but we need them here.
*/
@@ -527,16 +519,6 @@ _PG_init(void)
GUC_UNIT_KB,
NULL, NULL, NULL);
DefineCustomEnumVariable(
"neon.debug_compare_local",
"Debug mode for compaing content of pages in prefetch ring/LFC/PS and local disk",
NULL,
&debug_compare_local,
DEBUG_COMPARE_LOCAL_NONE,
debug_compare_local_modes,
PGC_POSTMASTER,
0,
NULL, NULL, NULL);
/*
* Important: This must happen after other parts of the extension are
* loaded, otherwise any settings to GUCs that were set before the

View File

@@ -177,22 +177,6 @@ extern StringInfoData nm_pack_request(NeonRequest *msg);
extern NeonResponse *nm_unpack_response(StringInfo s);
extern char *nm_to_string(NeonMessage *msg);
/*
* If debug_compare_local>DEBUG_COMPARE_LOCAL_NONE, we pass through all the SMGR API
* calls to md.c, and *also* do the calls to the Page Server. On every
* read, compare the versions we read from local disk and Page Server,
* and Assert that they are identical.
*/
typedef enum
{
DEBUG_COMPARE_LOCAL_NONE, /* normal mode - pages are storted locally only for unlogged relations */
DEBUG_COMPARE_LOCAL_PREFETCH, /* if page is found in prefetch ring, then compare it with local and return */
DEBUG_COMPARE_LOCAL_LFC, /* if page is found in LFC or prefetch ring, then compare it with local and return */
DEBUG_COMPARE_LOCAL_ALL /* always fetch page from PS and compare it with local */
} DebugCompareLocalMode;
extern int debug_compare_local;
/*
* API
*/

View File

@@ -76,11 +76,21 @@
typedef PGAlignedBlock PGIOAlignedBlock;
#endif
/*
* If DEBUG_COMPARE_LOCAL is defined, we pass through all the SMGR API
* calls to md.c, and *also* do the calls to the Page Server. On every
* read, compare the versions we read from local disk and Page Server,
* and Assert that they are identical.
*/
/* #define DEBUG_COMPARE_LOCAL */
#ifdef DEBUG_COMPARE_LOCAL
#include "access/nbtree.h"
#include "storage/bufpage.h"
#include "access/xlog_internal.h"
static char *hexdump_page(char *page);
#endif
#define IS_LOCAL_REL(reln) (\
NInfoGetDbOid(InfoFromSMgrRel(reln)) != 0 && \
@@ -98,8 +108,6 @@ typedef enum
UNLOGGED_BUILD_NOT_PERMANENT
} UnloggedBuildPhase;
int debug_compare_local;
static NRelFileInfo unlogged_build_rel_info;
static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
@@ -470,10 +478,9 @@ neon_init(void)
old_redo_read_buffer_filter = redo_read_buffer_filter;
redo_read_buffer_filter = neon_redo_read_buffer_filter;
if (debug_compare_local)
{
mdinit();
}
#ifdef DEBUG_COMPARE_LOCAL
mdinit();
#endif
}
/*
@@ -796,16 +803,13 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
case RELPERSISTENCE_TEMP:
case RELPERSISTENCE_UNLOGGED:
if (debug_compare_local)
{
mdcreate(reln, forkNum, forkNum == INIT_FORKNUM || isRedo);
if (forkNum == MAIN_FORKNUM)
mdcreate(reln, INIT_FORKNUM, true);
}
else
{
mdcreate(reln, forkNum, isRedo);
}
#ifdef DEBUG_COMPARE_LOCAL
mdcreate(reln, forkNum, forkNum == INIT_FORKNUM || isRedo);
if (forkNum == MAIN_FORKNUM)
mdcreate(reln, INIT_FORKNUM, true);
#else
mdcreate(reln, forkNum, isRedo);
#endif
return;
default:
@@ -844,11 +848,10 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
else
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
if (debug_compare_local)
{
if (IS_LOCAL_REL(reln))
mdcreate(reln, forkNum, isRedo);
}
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
mdcreate(reln, forkNum, isRedo);
#endif
}
/*
@@ -874,7 +877,7 @@ neon_unlink(NRelFileInfoBackend rinfo, ForkNumber forkNum, bool isRedo)
{
/*
* Might or might not exist locally, depending on whether it's an unlogged
* or permanent relation (or if debug_compare_local is set). Try to
* or permanent relation (or if DEBUG_COMPARE_LOCAL is set). Try to
* unlink, it won't do any harm if the file doesn't exist.
*/
mdunlink(rinfo, forkNum, isRedo);
@@ -970,11 +973,10 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer);
if (debug_compare_local)
{
if (IS_LOCAL_REL(reln))
mdextend(reln, forkNum, blkno, buffer, skipFsync);
}
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
mdextend(reln, forkNum, blkno, buffer, skipFsync);
#endif
/*
* smgr_extend is often called with an all-zeroes page, so
@@ -1049,11 +1051,10 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
relpath(reln->smgr_rlocator, forkNum),
InvalidBlockNumber)));
if (debug_compare_local)
{
if (IS_LOCAL_REL(reln))
mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync);
}
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync);
#endif
/* Don't log any pages if we're not allowed to do so. */
if (!XLogInsertAllowed())
@@ -1264,11 +1265,10 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
communicator_prefetch_pump_state();
if (debug_compare_local)
{
if (IS_LOCAL_REL(reln))
mdwriteback(reln, forknum, blocknum, nblocks);
}
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
mdwriteback(reln, forknum, blocknum, nblocks);
#endif
}
/*
@@ -1282,6 +1282,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
communicator_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL);
}
#ifdef DEBUG_COMPARE_LOCAL
static void
compare_with_local(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void* buffer, XLogRecPtr request_lsn)
{
@@ -1363,6 +1364,7 @@ compare_with_local(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, voi
}
}
}
#endif
#if PG_MAJORVERSION_NUM < 17
@@ -1415,28 +1417,22 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
if (communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present))
{
/* Prefetch hit */
if (debug_compare_local >= DEBUG_COMPARE_LOCAL_PREFETCH)
{
compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
}
if (debug_compare_local <= DEBUG_COMPARE_LOCAL_PREFETCH)
{
return;
}
#ifdef DEBUG_COMPARE_LOCAL
compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
#else
return;
#endif
}
/* Try to read from local file cache */
if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer))
{
MyNeonCounters->file_cache_hits_total++;
if (debug_compare_local >= DEBUG_COMPARE_LOCAL_LFC)
{
compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
}
if (debug_compare_local <= DEBUG_COMPARE_LOCAL_LFC)
{
return;
}
#ifdef DEBUG_COMPARE_LOCAL
compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
#else
return;
#endif
}
neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);
@@ -1446,15 +1442,15 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
*/
communicator_prefetch_pump_state();
if (debug_compare_local)
{
compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
}
#ifdef DEBUG_COMPARE_LOCAL
compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
#endif
}
#endif /* PG_MAJORVERSION_NUM <= 16 */
#if PG_MAJORVERSION_NUM >= 17
#ifdef DEBUG_COMPARE_LOCAL
static void
compare_with_localv(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void** buffers, BlockNumber nblocks, neon_request_lsns* request_lsns, bits8* read_pages)
{
@@ -1469,6 +1465,7 @@ compare_with_localv(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, vo
}
}
}
#endif
static void
@@ -1519,19 +1516,13 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
blocknum, request_lsns, nblocks,
buffers, read_pages);
if (debug_compare_local >= DEBUG_COMPARE_LOCAL_PREFETCH)
{
compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
}
if (debug_compare_local <= DEBUG_COMPARE_LOCAL_PREFETCH && prefetch_result == nblocks)
{
#ifdef DEBUG_COMPARE_LOCAL
compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
memset(read_pages, 0, sizeof(read_pages));
#else
if (prefetch_result == nblocks)
return;
}
if (debug_compare_local > DEBUG_COMPARE_LOCAL_PREFETCH)
{
memset(read_pages, 0, sizeof(read_pages));
}
#endif
/* Try to read from local file cache */
lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers,
@@ -1540,19 +1531,14 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
if (lfc_result > 0)
MyNeonCounters->file_cache_hits_total += lfc_result;
if (debug_compare_local >= DEBUG_COMPARE_LOCAL_LFC)
{
compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
}
if (debug_compare_local <= DEBUG_COMPARE_LOCAL_LFC && prefetch_result + lfc_result == nblocks)
{
/* Read all blocks from LFC, so we're done */
#ifdef DEBUG_COMPARE_LOCAL
compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
memset(read_pages, 0, sizeof(read_pages));
#else
/* Read all blocks from LFC, so we're done */
if (prefetch_result + lfc_result == nblocks)
return;
}
if (debug_compare_local > DEBUG_COMPARE_LOCAL_LFC)
{
memset(read_pages, 0, sizeof(read_pages));
}
#endif
communicator_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns,
buffers, nblocks, read_pages);
@@ -1562,14 +1548,14 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
*/
communicator_prefetch_pump_state();
if (debug_compare_local)
{
memset(read_pages, 0xFF, sizeof(read_pages));
compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
}
#ifdef DEBUG_COMPARE_LOCAL
memset(read_pages, 0xFF, sizeof(read_pages));
compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
#endif
}
#endif
#ifdef DEBUG_COMPARE_LOCAL
static char *
hexdump_page(char *page)
{
@@ -1588,6 +1574,7 @@ hexdump_page(char *page)
return result.data;
}
#endif
#if PG_MAJORVERSION_NUM < 17
/*
@@ -1609,8 +1596,12 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
switch (reln->smgr_relpersistence)
{
case 0:
#ifndef DEBUG_COMPARE_LOCAL
/* This is a bit tricky. Check if the relation exists locally */
if (mdexists(reln, debug_compare_local ? INIT_FORKNUM : forknum))
if (mdexists(reln, forknum))
#else
if (mdexists(reln, INIT_FORKNUM))
#endif
{
/* It exists locally. Guess it's unlogged then. */
#if PG_MAJORVERSION_NUM >= 17
@@ -1665,17 +1656,14 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
communicator_prefetch_pump_state();
if (debug_compare_local)
{
if (IS_LOCAL_REL(reln))
{
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
#if PG_MAJORVERSION_NUM >= 17
mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
#else
mdwrite(reln, forknum, blocknum, buffer, skipFsync);
mdwrite(reln, forknum, blocknum, buffer, skipFsync);
#endif
}
}
#endif
}
#endif
@@ -1689,8 +1677,12 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
switch (reln->smgr_relpersistence)
{
case 0:
#ifndef DEBUG_COMPARE_LOCAL
/* This is a bit tricky. Check if the relation exists locally */
if (mdexists(reln, debug_compare_local ? INIT_FORKNUM : forknum))
if (mdexists(reln, forknum))
#else
if (mdexists(reln, INIT_FORKNUM))
#endif
{
/* It exists locally. Guess it's unlogged then. */
mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
@@ -1728,11 +1720,10 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
communicator_prefetch_pump_state();
if (debug_compare_local)
{
if (IS_LOCAL_REL(reln))
mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
}
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
#endif
}
#endif
@@ -1871,11 +1862,10 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, Blo
*/
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forknum);
if (debug_compare_local)
{
if (IS_LOCAL_REL(reln))
mdtruncate(reln, forknum, old_blocks, nblocks);
}
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
mdtruncate(reln, forknum, old_blocks, nblocks);
#endif
}
/*
@@ -1914,11 +1904,10 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
communicator_prefetch_pump_state();
if (debug_compare_local)
{
if (IS_LOCAL_REL(reln))
mdimmedsync(reln, forknum);
}
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
mdimmedsync(reln, forknum);
#endif
}
#if PG_MAJORVERSION_NUM >= 17
@@ -1945,11 +1934,10 @@ neon_registersync(SMgrRelation reln, ForkNumber forknum)
neon_log(SmgrTrace, "[NEON_SMGR] registersync noop");
if (debug_compare_local)
{
if (IS_LOCAL_REL(reln))
mdimmedsync(reln, forknum);
}
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
mdimmedsync(reln, forknum);
#endif
}
#endif
@@ -1990,11 +1978,10 @@ neon_start_unlogged_build(SMgrRelation reln)
case RELPERSISTENCE_UNLOGGED:
unlogged_build_rel_info = InfoFromSMgrRel(reln);
unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT;
if (debug_compare_local)
{
if (!IsParallelWorker())
mdcreate(reln, INIT_FORKNUM, true);
}
#ifdef DEBUG_COMPARE_LOCAL
if (!IsParallelWorker())
mdcreate(reln, INIT_FORKNUM, true);
#endif
return;
default:
@@ -2022,7 +2009,11 @@ neon_start_unlogged_build(SMgrRelation reln)
*/
if (!IsParallelWorker())
{
mdcreate(reln, debug_compare_local ? INIT_FORKNUM : MAIN_FORKNUM, false);
#ifndef DEBUG_COMPARE_LOCAL
mdcreate(reln, MAIN_FORKNUM, false);
#else
mdcreate(reln, INIT_FORKNUM, true);
#endif
}
}
@@ -2116,14 +2107,14 @@ neon_end_unlogged_build(SMgrRelation reln)
lfc_invalidate(InfoFromNInfoB(rinfob), forknum, nblocks);
mdclose(reln, forknum);
if (!debug_compare_local)
{
/* use isRedo == true, so that we drop it immediately */
mdunlink(rinfob, forknum, true);
}
#ifndef DEBUG_COMPARE_LOCAL
/* use isRedo == true, so that we drop it immediately */
mdunlink(rinfob, forknum, true);
#endif
}
if (debug_compare_local)
mdunlink(rinfob, INIT_FORKNUM, true);
#ifdef DEBUG_COMPARE_LOCAL
mdunlink(rinfob, INIT_FORKNUM, true);
#endif
}
NRelFileInfoInvalidate(unlogged_build_rel_info);
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;

View File

@@ -1,121 +0,0 @@
# Sub-makefile for compiling PostgreSQL as part of Neon. This is
# included from the main Makefile, and is not meant to be called
# directly.
#
# CI workflows and Dockerfiles can take advantage of the following
# properties for caching:
#
# - Compiling the targets in this file only builds the PostgreSQL sources
# under the vendor/ subdirectory, nothing else from the repository.
# - All outputs go to POSTGRES_INSTALL_DIR (by default 'pg_install',
# see parent Makefile)
# - intermediate build artifacts go to BUILD_DIR
#
#
# Variables passed from the parent Makefile that control what gets
# installed and where:
# - POSTGRES_VERSIONS
# - POSTGRES_INSTALL_DIR
# - BUILD_DIR
#
# Variables passed from the parent Makefile that affect the build
# process and the resulting binaries:
# - PG_CONFIGURE_OPTS
# - PG_CFLAGS
# - PG_LDFLAGS
# - EXTRA_PATH_OVERRIDES
###
### Main targets
###
### These are called from the main Makefile, and can also be called
### directly from command line
# Compile and install a specific PostgreSQL version
postgres-install-%: postgres-configure-% \
postgres-headers-install-% # to prevent `make install` conflicts with neon's `postgres-headers`
# Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)/<version>/include
#
# This is implicitly part of the 'postgres-install-%' target, but this can be handy
# if you want to install just the headers without building PostgreSQL, e.g. for building
# extensions.
postgres-headers-install-%: postgres-configure-%
+@echo "Installing PostgreSQL $* headers"
$(MAKE) -C $(BUILD_DIR)/$*/src/include MAKELEVEL=0 install
# Run Postgres regression tests
postgres-check-%: postgres-install-%
$(MAKE) -C $(BUILD_DIR)/$* MAKELEVEL=0 check
###
### Shorthands for the main targets, for convenience
###
# Same as the above main targets, but for all supported PostgreSQL versions
# For example, 'make postgres-install' is equivalent to
# 'make postgres-install-v14 postgres-install-v15 postgres-install-v16 postgres-install-v17'
all_version_targets=postgres-install postgres-headers-install postgres-check
.PHONY: $(all_version_targets)
$(all_version_targets): postgres-%: $(foreach pg_version,$(POSTGRES_VERSIONS),postgres-%-$(pg_version))
.PHONY: postgres
postgres: postgres-install
.PHONY: postgres-headers
postgres-headers: postgres-headers-install
# 'postgres-v17' is an alias for 'postgres-install-v17' etc.
$(foreach pg_version,$(POSTGRES_VERSIONS),postgres-$(pg_version)): postgres-%: postgres-install-%
###
### Intermediate targets
###
### These are not intended to be called directly, but are dependencies for the
### main targets.
# Run 'configure'
$(BUILD_DIR)/%/config.status:
mkdir -p $(BUILD_DIR)
test -e $(BUILD_DIR)/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > $(BUILD_DIR)/CACHEDIR.TAG
+@echo "Configuring Postgres $* build"
@test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \
echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \
echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \
exit 1; }
mkdir -p $(BUILD_DIR)/$*
VERSION=$*; \
EXTRA_VERSION=$$(cd $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION && git rev-parse HEAD); \
(cd $(BUILD_DIR)/$$VERSION && \
env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION/configure \
CFLAGS='$(PG_CFLAGS)' LDFLAGS='$(PG_LDFLAGS)' \
$(PG_CONFIGURE_OPTS) --with-extra-version=" ($$EXTRA_VERSION)" \
--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$$VERSION > configure.log)
# nicer alias to run 'configure'.
#
# This tries to accomplish this rule:
#
# postgres-configure-%: $(BUILD_DIR)/%/config.status
#
# XXX: I'm not sure why the above rule doesn't work directly. But this accomplishses
# the same thing
$(foreach pg_version,$(POSTGRES_VERSIONS),postgres-configure-$(pg_version)): postgres-configure-%: FORCE $(BUILD_DIR)/%/config.status
# Compile and install PostgreSQL (and a few contrib modules used in tests)
postgres-install-%: postgres-configure-% \
postgres-headers-install-% # to prevent `make install` conflicts with neon's `postgres-headers-install`
+@echo "Compiling PostgreSQL $*"
$(MAKE) -C $(BUILD_DIR)/$* MAKELEVEL=0 install
$(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_prewarm install
$(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_buffercache install
$(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_visibility install
$(MAKE) -C $(BUILD_DIR)/$*/contrib/pageinspect install
$(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_trgm install
$(MAKE) -C $(BUILD_DIR)/$*/contrib/amcheck install
$(MAKE) -C $(BUILD_DIR)/$*/contrib/test_decoding install
.PHONY: FORCE
FORCE:

View File

@@ -5,9 +5,8 @@ edition = "2024"
license.workspace = true
[features]
default = ["rest_broker"]
default = []
testing = ["dep:tokio-postgres"]
rest_broker = ["subzero-core", "jsonpath_lib", "ouroboros"]
[dependencies]
ahash.workspace = true
@@ -104,9 +103,6 @@ uuid.workspace = true
x509-cert.workspace = true
redis.workspace = true
zerocopy.workspace = true
subzero-core = { git = "https://github.com/neondatabase-labs/subzero", rev = "0b3d3278f5f9ac9311a7280cb1676de80e021f06", features = ["postgresql"], optional = true }
jsonpath_lib = { version = "0.3.0", optional = true }
ouroboros = { version = "0.18", optional = true }
# jwt stuff
jose-jwa = "0.1.2"

View File

@@ -138,69 +138,3 @@ Now from client you can start a new session:
```sh
PGSSLROOTCERT=./server.crt psql "postgresql://proxy:password@endpoint.local.neon.build:4432/postgres?sslmode=verify-full"
```
## auth broker setup:
Create a postgres instance:
```sh
docker run \
--detach \
--name proxy-postgres \
--env POSTGRES_HOST_AUTH_METHOD=trust \
--env POSTGRES_USER=authenticated \
--env POSTGRES_DB=database \
--publish 5432:5432 \
postgres:17-bookworm
```
Create a configuration file called `local_proxy.json` in the root of the repo (used also by the auth broker to validate JWTs)
```sh
{
"jwks": [
{
"id": "1",
"role_names": ["authenticator", "authenticated", "anon"],
"jwks_url": "https://climbing-minnow-11.clerk.accounts.dev/.well-known/jwks.json",
"provider_name": "foo",
"jwt_audience": null
}
]
}
```
Start the local proxy:
```sh
cargo run --bin local_proxy --features testing -- \
--disable-pg-session-jwt \
--http 0.0.0.0:7432
```
Start the auth broker:
```sh
LOGFMT=text OTEL_SDK_DISABLED=true cargo run --bin proxy --features testing -- \
-c server.crt -k server.key \
--is-auth-broker true \
--is-rest-broker true \
--wss 0.0.0.0:8080 \
--http 0.0.0.0:7002 \
--auth-backend local
```
Create a JWT in your auth provider (e.g. Clerk) and set it in the `NEON_JWT` environment variable.
```sh
export NEON_JWT="..."
```
Run a query against the auth broker:
```sh
curl -k "https://foo.local.neon.build:8080/sql" \
-H "Authorization: Bearer $NEON_JWT" \
-H "neon-connection-string: postgresql://authenticator@foo.local.neon.build/database" \
-d '{"query":"select 1","params":[]}'
```
Make a rest request against the auth broker (rest broker):
```sh
curl -k "https://foo.local.neon.build:8080/database/rest/v1/items?select=id,name&id=eq.1" \
-H "Authorization: Bearer $NEON_JWT"
```

View File

@@ -164,20 +164,21 @@ async fn authenticate(
})?
.map_err(ConsoleRedirectError::from)?;
if auth_config.ip_allowlist_check_enabled
&& let Some(allowed_ips) = &db_info.allowed_ips
&& !auth::check_peer_addr_is_in_list(&ctx.peer_addr(), allowed_ips)
{
return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr()));
if auth_config.ip_allowlist_check_enabled {
if let Some(allowed_ips) = &db_info.allowed_ips {
if !auth::check_peer_addr_is_in_list(&ctx.peer_addr(), allowed_ips) {
return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr()));
}
}
}
// Check if the access over the public internet is allowed, otherwise block. Note that
// the console redirect is not behind the VPC service endpoint, so we don't need to check
// the VPC endpoint ID.
if let Some(public_access_allowed) = db_info.public_access_allowed
&& !public_access_allowed
{
return Err(auth::AuthError::NetworkNotAllowed);
if let Some(public_access_allowed) = db_info.public_access_allowed {
if !public_access_allowed {
return Err(auth::AuthError::NetworkNotAllowed);
}
}
client.write_message(BeMessage::NoticeResponse("Connecting to database."));

View File

@@ -399,36 +399,36 @@ impl JwkCacheEntryLock {
tracing::debug!(?payload, "JWT signature valid with claims");
if let Some(aud) = expected_audience
&& payload.audience.0.iter().all(|s| s != aud)
{
return Err(JwtError::InvalidClaims(
JwtClaimsError::InvalidJwtTokenAudience,
));
if let Some(aud) = expected_audience {
if payload.audience.0.iter().all(|s| s != aud) {
return Err(JwtError::InvalidClaims(
JwtClaimsError::InvalidJwtTokenAudience,
));
}
}
let now = SystemTime::now();
if let Some(exp) = payload.expiration
&& now >= exp + CLOCK_SKEW_LEEWAY
{
return Err(JwtError::InvalidClaims(JwtClaimsError::JwtTokenHasExpired(
exp.duration_since(SystemTime::UNIX_EPOCH)
.unwrap_or_default()
.as_secs(),
)));
}
if let Some(nbf) = payload.not_before
&& nbf >= now + CLOCK_SKEW_LEEWAY
{
return Err(JwtError::InvalidClaims(
JwtClaimsError::JwtTokenNotYetReadyToUse(
nbf.duration_since(SystemTime::UNIX_EPOCH)
if let Some(exp) = payload.expiration {
if now >= exp + CLOCK_SKEW_LEEWAY {
return Err(JwtError::InvalidClaims(JwtClaimsError::JwtTokenHasExpired(
exp.duration_since(SystemTime::UNIX_EPOCH)
.unwrap_or_default()
.as_secs(),
),
));
)));
}
}
if let Some(nbf) = payload.not_before {
if nbf >= now + CLOCK_SKEW_LEEWAY {
return Err(JwtError::InvalidClaims(
JwtClaimsError::JwtTokenNotYetReadyToUse(
nbf.duration_since(SystemTime::UNIX_EPOCH)
.unwrap_or_default()
.as_secs(),
),
));
}
}
Ok(ComputeCredentialKeys::JwtPayload(payloadb))

View File

@@ -171,6 +171,7 @@ impl ComputeUserInfo {
pub(crate) enum ComputeCredentialKeys {
AuthKeys(AuthKeys),
JwtPayload(Vec<u8>),
None,
}
impl TryFrom<ComputeUserInfoMaybeEndpoint> for ComputeUserInfo {
@@ -345,13 +346,15 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> {
Err(e) => {
// The password could have been changed, so we invalidate the cache.
// We should only invalidate the cache if the TTL might have expired.
if e.is_password_failed()
&& let ControlPlaneClient::ProxyV1(api) = &*api
&& let Some(ep) = &user_info.endpoint_id
{
api.caches
.project_info
.maybe_invalidate_role_secret(ep, &user_info.user);
if e.is_password_failed() {
#[allow(irrefutable_let_patterns)]
if let ControlPlaneClient::ProxyV1(api) = &*api {
if let Some(ep) = &user_info.endpoint_id {
api.caches
.project_info
.maybe_invalidate_role_secret(ep, &user_info.user);
}
}
}
Err(e)

View File

@@ -1,40 +1,43 @@
use std::net::SocketAddr;
use std::pin::pin;
use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use anyhow::bail;
use anyhow::{Context, bail, ensure};
use arc_swap::ArcSwapOption;
use camino::Utf8PathBuf;
use camino::{Utf8Path, Utf8PathBuf};
use clap::Parser;
use compute_api::spec::LocalProxySpec;
use futures::future::Either;
use thiserror::Error;
use tokio::net::TcpListener;
use tokio::sync::Notify;
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info};
use tracing::{debug, error, info, warn};
use utils::sentry_init::init_sentry;
use utils::{pid_file, project_build_tag, project_git_version};
use crate::auth::backend::jwt::JwkCache;
use crate::auth::backend::local::LocalBackend;
use crate::auth::backend::local::{JWKS_ROLE_MAP, LocalBackend};
use crate::auth::{self};
use crate::cancellation::CancellationHandler;
#[cfg(feature = "rest_broker")]
use crate::config::RestConfig;
use crate::config::{
self, AuthenticationConfig, ComputeConfig, HttpConfig, ProxyConfig, RetryConfig,
refresh_config_loop,
};
use crate::control_plane::locks::ApiLocks;
use crate::control_plane::messages::{EndpointJwksResponse, JwksSettings};
use crate::ext::TaskExt;
use crate::http::health_server::AppMetrics;
use crate::intern::RoleNameInt;
use crate::metrics::{Metrics, ThreadPoolMetrics};
use crate::rate_limiter::{EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo};
use crate::scram::threadpool::ThreadPool;
use crate::serverless::cancel_set::CancelSet;
use crate::serverless::{self, GlobalConnPoolOptions};
use crate::tls::client_config::compute_client_config_with_root_certs;
use crate::types::RoleName;
use crate::url::ApiUrl;
project_git_version!(GIT_VERSION);
@@ -79,11 +82,6 @@ struct LocalProxyCliArgs {
/// Path of the local proxy PID file
#[clap(long, default_value = "./local_proxy.pid")]
pid_path: Utf8PathBuf,
/// Disable pg_session_jwt extension installation
/// This is useful for testing the local proxy with vanilla postgres.
#[clap(long, default_value = "false")]
#[cfg(feature = "testing")]
disable_pg_session_jwt: bool,
}
#[derive(clap::Args, Clone, Copy, Debug)]
@@ -279,18 +277,12 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig
accept_jwts: true,
console_redirect_confirmation_timeout: Duration::ZERO,
},
#[cfg(feature = "rest_broker")]
rest_config: RestConfig {
is_rest_broker: false,
db_schema_cache: None,
},
proxy_protocol_v2: config::ProxyProtocolV2::Rejected,
handshake_timeout: Duration::from_secs(10),
region: "local".into(),
wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?,
connect_compute_locks,
connect_to_compute: compute_config,
#[cfg(feature = "testing")]
disable_pg_session_jwt: args.disable_pg_session_jwt,
})))
}
@@ -302,3 +294,132 @@ fn build_auth_backend(args: &LocalProxyCliArgs) -> &'static auth::Backend<'stati
Box::leak(Box::new(auth_backend))
}
#[derive(Error, Debug)]
enum RefreshConfigError {
#[error(transparent)]
Read(#[from] std::io::Error),
#[error(transparent)]
Parse(#[from] serde_json::Error),
#[error(transparent)]
Validate(anyhow::Error),
#[error(transparent)]
Tls(anyhow::Error),
}
async fn refresh_config_loop(config: &ProxyConfig, path: Utf8PathBuf, rx: Arc<Notify>) {
let mut init = true;
loop {
rx.notified().await;
match refresh_config_inner(config, &path).await {
Ok(()) => {}
// don't log for file not found errors if this is the first time we are checking
// for computes that don't use local_proxy, this is not an error.
Err(RefreshConfigError::Read(e))
if init && e.kind() == std::io::ErrorKind::NotFound =>
{
debug!(error=?e, ?path, "could not read config file");
}
Err(RefreshConfigError::Tls(e)) => {
error!(error=?e, ?path, "could not read TLS certificates");
}
Err(e) => {
error!(error=?e, ?path, "could not read config file");
}
}
init = false;
}
}
async fn refresh_config_inner(
config: &ProxyConfig,
path: &Utf8Path,
) -> Result<(), RefreshConfigError> {
let bytes = tokio::fs::read(&path).await?;
let data: LocalProxySpec = serde_json::from_slice(&bytes)?;
let mut jwks_set = vec![];
fn parse_jwks_settings(jwks: compute_api::spec::JwksSettings) -> anyhow::Result<JwksSettings> {
let mut jwks_url = url::Url::from_str(&jwks.jwks_url).context("parsing JWKS url")?;
ensure!(
jwks_url.has_authority()
&& (jwks_url.scheme() == "http" || jwks_url.scheme() == "https"),
"Invalid JWKS url. Must be HTTP",
);
ensure!(
jwks_url.host().is_some_and(|h| h != url::Host::Domain("")),
"Invalid JWKS url. No domain listed",
);
// clear username, password and ports
jwks_url
.set_username("")
.expect("url can be a base and has a valid host and is not a file. should not error");
jwks_url
.set_password(None)
.expect("url can be a base and has a valid host and is not a file. should not error");
// local testing is hard if we need to have a specific restricted port
if cfg!(not(feature = "testing")) {
jwks_url.set_port(None).expect(
"url can be a base and has a valid host and is not a file. should not error",
);
}
// clear query params
jwks_url.set_fragment(None);
jwks_url.query_pairs_mut().clear().finish();
if jwks_url.scheme() != "https" {
// local testing is hard if we need to set up https support.
if cfg!(not(feature = "testing")) {
jwks_url
.set_scheme("https")
.expect("should not error to set the scheme to https if it was http");
} else {
warn!(scheme = jwks_url.scheme(), "JWKS url is not HTTPS");
}
}
Ok(JwksSettings {
id: jwks.id,
jwks_url,
_provider_name: jwks.provider_name,
jwt_audience: jwks.jwt_audience,
role_names: jwks
.role_names
.into_iter()
.map(RoleName::from)
.map(|s| RoleNameInt::from(&s))
.collect(),
})
}
for jwks in data.jwks.into_iter().flatten() {
jwks_set.push(parse_jwks_settings(jwks).map_err(RefreshConfigError::Validate)?);
}
info!("successfully loaded new config");
JWKS_ROLE_MAP.store(Some(Arc::new(EndpointJwksResponse { jwks: jwks_set })));
if let Some(tls_config) = data.tls {
let tls_config = tokio::task::spawn_blocking(move || {
crate::tls::server_config::configure_tls(
tls_config.key_path.as_ref(),
tls_config.cert_path.as_ref(),
None,
false,
)
})
.await
.propagate_task_panic()
.map_err(RefreshConfigError::Tls)?;
config.tls_config.store(Some(Arc::new(tls_config)));
}
Ok(())
}

Some files were not shown because too many files have changed in this diff Show More