Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-14 08:52:56 +00:00)
Merge remote-tracking branch 'origin/main' into allow-tenant_create-with-tenant-token
(Minor) Conflicts:
pageserver/src/http/routes.rs
test_runner/regress/test_auth.py
@@ -57,14 +57,14 @@ runs:
if ! which allure; then
ALLURE_ZIP=allure-${ALLURE_VERSION}.zip
wget -q https://github.com/allure-framework/allure2/releases/download/${ALLURE_VERSION}/${ALLURE_ZIP}
echo "${ALLURE_ZIP_MD5} ${ALLURE_ZIP}" | md5sum -c
echo "${ALLURE_ZIP_SHA256} ${ALLURE_ZIP}" | sha256sum --check
unzip -q ${ALLURE_ZIP}
echo "$(pwd)/allure-${ALLURE_VERSION}/bin" >> $GITHUB_PATH
rm -f ${ALLURE_ZIP}
fi
env:
ALLURE_VERSION: 2.22.0
ALLURE_ZIP_MD5: d5c9f0989b896482536956340a7d5ec9
ALLURE_VERSION: 2.22.1
ALLURE_ZIP_SHA256: fdc7a62d94b14c5e0bf25198ae1feded6b005fdbed864b4d3cb4e5e901720b0b

# Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this
- name: Acquire lock
10 .github/actions/run-python-test-set/action.yml vendored
@@ -36,14 +36,6 @@ inputs:
description: 'Region name for real s3 tests'
required: false
default: ''
real_s3_access_key_id:
description: 'Access key id'
required: false
default: ''
real_s3_secret_access_key:
description: 'Secret access key'
required: false
default: ''
rerun_flaky:
description: 'Whether to rerun flaky tests'
required: false
@@ -104,8 +96,6 @@ runs:
COMPATIBILITY_POSTGRES_DISTRIB_DIR: /tmp/neon-previous/pg_install
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: ${{ inputs.build_type }}
AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }}
AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }}
COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage')
ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
41 .github/workflows/build_and_test.yml vendored
@@ -264,7 +264,7 @@ jobs:
export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
export REMOTE_STORAGE_S3_REGION=eu-central-1
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test pagination_tests -- s3_pagination_should_work --exact
${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3

- name: Install rust binaries
run: |
@@ -346,10 +346,8 @@ jobs:
test_selection: regress
needs_postgres_source: true
run_with_real_s3: true
real_s3_bucket: ci-tests-s3
real_s3_region: us-west-2
real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}"
real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}"
real_s3_bucket: neon-github-ci-tests
real_s3_region: eu-central-1
rerun_flaky: true
pg_version: ${{ matrix.pg_version }}
env:
@@ -409,9 +407,7 @@ jobs:
uses: ./.github/actions/allure-report-generate

- uses: actions/github-script@v6
if: >
!cancelled() &&
github.event_name == 'pull_request'
if: ${{ !cancelled() }}
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
retries: 5
@@ -421,7 +417,7 @@ jobs:
reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}",
}

const script = require("./scripts/pr-comment-test-report.js")
const script = require("./scripts/comment-test-report.js")
await script({
github,
context,
@@ -496,19 +492,24 @@ jobs:
env:
COMMIT_URL: ${{ github.server_url }}/${{ github.repository }}/commit/${{ github.event.pull_request.head.sha || github.sha }}
run: |
scripts/coverage \
--dir=/tmp/coverage report \
scripts/coverage --dir=/tmp/coverage \
report \
--input-objects=/tmp/coverage/binaries.list \
--commit-url=${COMMIT_URL} \
--format=github

scripts/coverage --dir=/tmp/coverage \
report \
--input-objects=/tmp/coverage/binaries.list \
--format=lcov

- name: Upload coverage report
id: upload-coverage-report
env:
BUCKET: neon-github-public-dev
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
run: |
aws s3 cp --only-show-errors --recursive /tmp/coverage/report s3://neon-github-public-dev/code-coverage/${COMMIT_SHA}
aws s3 cp --only-show-errors --recursive /tmp/coverage/report s3://${BUCKET}/code-coverage/${COMMIT_SHA}

REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/index.html
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
@@ -663,6 +664,9 @@ jobs:
project: nrdv0s4kcs
push: true
tags: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:depot-${{needs.tag.outputs.build-tag}}
build-args: |
GIT_VERSION=${{ github.sha }}
REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com

compute-tools-image:
runs-on: [ self-hosted, gen3, large ]
@@ -777,7 +781,7 @@ jobs:
run:
shell: sh -eu {0}
env:
VM_BUILDER_VERSION: v0.4.6
VM_BUILDER_VERSION: v0.8.0

steps:
- name: Checkout
@@ -787,21 +791,18 @@ jobs:

- name: Downloading vm-builder
run: |
curl -L https://github.com/neondatabase/neonvm/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
chmod +x vm-builder

# Note: we need a separate pull step here because otherwise vm-builder will try to pull, and
# it won't have the proper authentication (written at v0.6.0)
- name: Pulling compute-node image
run: |
docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

- name: Building VM compute-node rootfs
run: |
docker build -t temp-vm-compute-node --build-arg SRC_IMAGE=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -f Dockerfile.vm-compute-node .

- name: Build vm image
run: |
# note: as of 2023-01-12, vm-builder requires a trailing ":latest" for local images
./vm-builder -use-inittab -src=temp-vm-compute-node:latest -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
./vm-builder -enable-file-cache -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

- name: Pushing vm-compute-node image
run: |
650 Cargo.lock generated
File diff suppressed because it is too large
30 Cargo.toml
@@ -3,12 +3,26 @@ members = [
"compute_tools",
"control_plane",
"pageserver",
"pageserver/ctl",
"proxy",
"safekeeper",
"storage_broker",
"workspace_hack",
"trace",
"libs/*",
"libs/compute_api",
"libs/pageserver_api",
"libs/postgres_ffi",
"libs/safekeeper_api",
"libs/utils",
"libs/consumption_metrics",
"libs/postgres_backend",
"libs/pq_proto",
"libs/tenant_size_model",
"libs/metrics",
"libs/postgres_connection",
"libs/remote_storage",
"libs/tracing-utils",
"libs/postgres_ffi/wal_craft",
]

[workspace.package]
@@ -22,7 +36,7 @@ async-stream = "0.3"
async-trait = "0.1"
atty = "0.2.14"
aws-config = { version = "0.55", default-features = false, features=["rustls"] }
aws-sdk-s3 = "0.25"
aws-sdk-s3 = "0.27"
aws-smithy-http = "0.55"
aws-credential-types = "0.55"
aws-types = "0.55"
@@ -126,11 +140,11 @@ env_logger = "0.10"
log = "0.4"

## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" }
tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }

## Other git libraries
@@ -166,7 +180,7 @@ tonic-build = "0.9"

# This is only needed for proxy's tests.
# TODO: we should probably fork `tokio-postgres-rustls` instead.
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" }

# Changes the MAX_THREADS limit from 4096 to 32768.
# This is a temporary workaround for using tracing from many threads in safekeepers code,
@@ -47,8 +47,7 @@ RUN set -e \
&& mold -run cargo build \
--bin pg_sni_router \
--bin pageserver \
--bin pageserver_binutils \
--bin draw_timeline_dir \
--bin pagectl \
--bin safekeeper \
--bin storage_broker \
--bin proxy \
@@ -73,8 +72,7 @@ RUN set -e \

COPY --from=build --chown=neon:neon /home/nonroot/target/release/pg_sni_router /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver_binutils /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/draw_timeline_dir /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin

@@ -517,6 +517,22 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405
cargo pgx install --release && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control

#########################################################################################
#
# Layer "pg-pgx-ulid-build"
# Compile "pgx_ulid" extension
#
#########################################################################################

FROM rust-extensions-build AS pg-pgx-ulid-build

RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \
echo "908b7358e6f846e87db508ae5349fb56a88ee6305519074b12f3d5b0ff09f791 pgx_ulid.tar.gz" | sha256sum --check && \
mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
sed -i 's/pgx = "=0.7.3"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgx install --release && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control

#########################################################################################
#
# Layer "neon-pg-ext-build"
@@ -547,6 +563,7 @@ COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
COPY pgxn/ pgxn/

RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -556,6 +573,10 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/neon_utils \
-s install && \
make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/hnsw \
-s install

#########################################################################################
@@ -632,6 +653,7 @@ RUN apt update && \
libxml2 \
libxslt1.1 \
libzstd1 \
libcurl4-openssl-dev \
procps && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
@@ -1,70 +0,0 @@
# Note: this file *mostly* just builds on Dockerfile.compute-node

ARG SRC_IMAGE
ARG VM_INFORMANT_VERSION=v0.1.14
# on libcgroup update, make sure to check bootstrap.sh for changes
ARG LIBCGROUP_VERSION=v2.0.3

# Pull VM informant, to copy from later
FROM neondatabase/vm-informant:$VM_INFORMANT_VERSION as informant

# Build cgroup-tools
#
# At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically
# libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-informant
# requires cgroup v2, so we'll build cgroup-tools ourselves.
FROM debian:bullseye-slim as libcgroup-builder
ARG LIBCGROUP_VERSION

RUN set -exu \
&& apt update \
&& apt install --no-install-recommends -y \
git \
ca-certificates \
automake \
cmake \
make \
gcc \
byacc \
flex \
libtool \
libpam0g-dev \
&& git clone --depth 1 -b $LIBCGROUP_VERSION https://github.com/libcgroup/libcgroup \
&& INSTALL_DIR="/libcgroup-install" \
&& mkdir -p "$INSTALL_DIR/bin" "$INSTALL_DIR/include" \
&& cd libcgroup \
# extracted from bootstrap.sh, with modified flags:
&& (test -d m4 || mkdir m4) \
&& autoreconf -fi \
&& rm -rf autom4te.cache \
&& CFLAGS="-O3" ./configure --prefix="$INSTALL_DIR" --sysconfdir=/etc --localstatedir=/var --enable-opaque-hierarchy="name=systemd" \
# actually build the thing...
&& make install

# Combine, starting from non-VM compute node image.
FROM $SRC_IMAGE as base

# Temporarily set user back to root so we can run adduser, set inittab
USER root
RUN adduser vm-informant --disabled-password --no-create-home

RUN set -e \
&& rm -f /etc/inittab \
&& touch /etc/inittab

RUN set -e \
&& echo "::sysinit:cgconfigparser -l /etc/cgconfig.conf -s 1664" >> /etc/inittab \
&& CONNSTR="dbname=postgres user=cloud_admin sslmode=disable" \
&& ARGS="--auto-restart --cgroup=neon-postgres --pgconnstr=\"$CONNSTR\"" \
&& echo "::respawn:su vm-informant -c '/usr/local/bin/vm-informant $ARGS'" >> /etc/inittab

USER postgres

ADD vm-cgconfig.conf /etc/cgconfig.conf
COPY --from=informant /usr/bin/vm-informant /usr/local/bin/vm-informant

COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/
COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/

ENTRYPOINT ["/usr/sbin/cgexec", "-g", "*:neon-postgres", "/usr/local/bin/compute_ctl"]
8 Makefile
@@ -138,6 +138,11 @@ neon-pg-ext-%: postgres-%
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install
+@echo "Compiling hnsw $*"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/hnsw-$*
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile install

.PHONY: neon-pg-ext-clean-%
neon-pg-ext-clean-%:
@@ -153,6 +158,9 @@ neon-pg-ext-clean-%:
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
-C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile clean

.PHONY: neon-pg-ext
neon-pg-ext: \
@@ -28,18 +28,19 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati
* On Ubuntu or Debian, this set of packages should be sufficient to build the code:
```bash
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler
libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \
libcurl4-openssl-dev
```
* On Fedora, these packages are needed:
```bash
dnf install flex bison readline-devel zlib-devel openssl-devel \
libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
protobuf-devel
protobuf-devel libcurl-devel
```
* On Arch based systems, these packages are needed:
```bash
pacman -S base-devel readline zlib libseccomp openssl clang \
postgresql-libs cmake postgresql protobuf
postgresql-libs cmake postgresql protobuf curl
```

Building Neon requires 3.15+ version of `protoc` (protobuf-compiler). If your distribution provides an older version, you can install a newer version from [here](https://github.com/protocolbuffers/protobuf/releases).
@@ -59,6 +59,9 @@ fn main() -> Result<()> {

let matches = cli().get_matches();

let http_port = *matches
.get_one::<u16>("http-port")
.expect("http-port is required");
let pgdata = matches
.get_one::<String>("pgdata")
.expect("PGDATA path is required");
@@ -178,7 +181,8 @@ fn main() -> Result<()> {

// Launch http service first, so we were able to serve control-plane
// requests, while configuration is still in progress.
let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");
let _http_handle =
launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");

if !spec_set {
// No spec provided, hang waiting for it.
@@ -286,6 +290,14 @@ fn cli() -> clap::Command {
let version = option_env!("CARGO_PKG_VERSION").unwrap_or("unknown");
clap::Command::new("compute_ctl")
.version(version)
.arg(
Arg::new("http-port")
.long("http-port")
.value_name("HTTP_PORT")
.default_value("3080")
.value_parser(clap::value_parser!(u16))
.required(false),
)
.arg(
Arg::new("connstr")
.short('C')
@@ -1,19 +1,3 @@
|
||||
//
|
||||
// XXX: This starts to be scarry similar to the `PostgresNode` from `control_plane`,
|
||||
// but there are several things that makes `PostgresNode` usage inconvenient in the
|
||||
// cloud:
|
||||
// - it inherits from `LocalEnv`, which contains **all-all** the information about
|
||||
// a complete service running
|
||||
// - it uses `PageServerNode` with information about http endpoint, which we do not
|
||||
// need in the cloud again
|
||||
// - many tiny pieces like, for example, we do not use `pg_ctl` in the cloud
|
||||
//
|
||||
// Thus, to use `PostgresNode` in the cloud, we need to 'mock' a bunch of required
|
||||
// attributes (not required for the cloud). Yet, it is still tempting to unify these
|
||||
// `PostgresNode` and `ComputeNode` and use one in both places.
|
||||
//
|
||||
// TODO: stabilize `ComputeNode` and think about using it in the `control_plane`.
|
||||
//
|
||||
use std::fs;
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
use std::path::Path;
|
||||
@@ -106,26 +90,38 @@ pub struct ParsedSpec {
|
||||
impl TryFrom<ComputeSpec> for ParsedSpec {
|
||||
type Error = String;
|
||||
fn try_from(spec: ComputeSpec) -> Result<Self, String> {
|
||||
// Extract the options from the spec file that are needed to connect to
|
||||
// the storage system.
|
||||
//
|
||||
// For backwards-compatibility, the top-level fields in the spec file
|
||||
// may be empty. In that case, we need to dig them from the GUCs in the
|
||||
// cluster.settings field.
|
||||
let pageserver_connstr = spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("neon.pageserver_connstring")
|
||||
.pageserver_connstring
|
||||
.clone()
|
||||
.or_else(|| spec.cluster.settings.find("neon.pageserver_connstring"))
|
||||
.ok_or("pageserver connstr should be provided")?;
|
||||
let storage_auth_token = spec.storage_auth_token.clone();
|
||||
let tenant_id: TenantId = spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("neon.tenant_id")
|
||||
.ok_or("tenant id should be provided")
|
||||
.map(|s| TenantId::from_str(&s))?
|
||||
.or(Err("invalid tenant id"))?;
|
||||
let timeline_id: TimelineId = spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("neon.timeline_id")
|
||||
.ok_or("timeline id should be provided")
|
||||
.map(|s| TimelineId::from_str(&s))?
|
||||
.or(Err("invalid timeline id"))?;
|
||||
let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id {
|
||||
tenant_id
|
||||
} else {
|
||||
spec.cluster
|
||||
.settings
|
||||
.find("neon.tenant_id")
|
||||
.ok_or("tenant id should be provided")
|
||||
.map(|s| TenantId::from_str(&s))?
|
||||
.or(Err("invalid tenant id"))?
|
||||
};
|
||||
let timeline_id: TimelineId = if let Some(timeline_id) = spec.timeline_id {
|
||||
timeline_id
|
||||
} else {
|
||||
spec.cluster
|
||||
.settings
|
||||
.find("neon.timeline_id")
|
||||
.ok_or("timeline id should be provided")
|
||||
.map(|s| TimelineId::from_str(&s))?
|
||||
.or(Err("invalid timeline id"))?
|
||||
};
|
||||
|
||||
Ok(ParsedSpec {
|
||||
spec,
|
||||
@@ -295,8 +291,8 @@ impl ComputeNode {
|
||||
update_pg_hba(pgdata_path)?;
|
||||
|
||||
match spec.mode {
|
||||
ComputeMode::Primary | ComputeMode::Static(..) => {}
|
||||
ComputeMode::Replica => {
|
||||
ComputeMode::Primary => {}
|
||||
ComputeMode::Replica | ComputeMode::Static(..) => {
|
||||
add_standby_signal(pgdata_path)?;
|
||||
}
|
||||
}
|
||||
@@ -362,6 +358,8 @@ impl ComputeNode {
|
||||
};
|
||||
|
||||
// Proceed with post-startup configuration. Note, that order of operations is important.
|
||||
// Disable DDL forwarding because control plane already knows about these roles/databases.
|
||||
client.simple_query("SET neon.forward_ddl = false")?;
|
||||
let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
|
||||
handle_roles(spec, &mut client)?;
|
||||
handle_databases(spec, &mut client)?;
|
||||
@@ -374,7 +372,7 @@ impl ComputeNode {
|
||||
|
||||
info!(
|
||||
"finished configuration of compute for project {}",
|
||||
spec.cluster.cluster_id
|
||||
spec.cluster.cluster_id.as_deref().unwrap_or("None")
|
||||
);
|
||||
|
||||
Ok(())
|
||||
@@ -403,7 +401,9 @@ impl ComputeNode {
|
||||
self.pg_reload_conf(&mut client)?;
|
||||
|
||||
// Proceed with post-startup configuration. Note, that order of operations is important.
|
||||
// Disable DDL forwarding because control plane already knows about these roles/databases.
|
||||
if spec.mode == ComputeMode::Primary {
|
||||
client.simple_query("SET neon.forward_ddl = false")?;
|
||||
handle_roles(&spec, &mut client)?;
|
||||
handle_databases(&spec, &mut client)?;
|
||||
handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
|
||||
@@ -430,7 +430,7 @@ impl ComputeNode {
|
||||
let spec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
info!(
|
||||
"starting compute for project {}, operation {}, tenant {}, timeline {}",
|
||||
spec.spec.cluster.cluster_id,
|
||||
spec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
|
||||
spec.spec.operation_uuid.as_deref().unwrap_or("None"),
|
||||
spec.tenant_id,
|
||||
spec.timeline_id,
|
||||
|
||||
@@ -5,6 +5,7 @@ use std::path::Path;
|
||||
|
||||
use anyhow::Result;
|
||||
|
||||
use crate::pg_helpers::escape_conf_value;
|
||||
use crate::pg_helpers::PgOptionsSerialize;
|
||||
use compute_api::spec::{ComputeMode, ComputeSpec};
|
||||
|
||||
@@ -36,10 +37,44 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
|
||||
// File::create() destroys the file content if it exists.
|
||||
let mut file = File::create(path)?;
|
||||
|
||||
writeln!(file, "# Managed by compute_ctl: begin")?;
|
||||
// Write the postgresql.conf content from the spec file as is.
|
||||
if let Some(conf) = &spec.cluster.postgresql_conf {
|
||||
writeln!(file, "{}", conf)?;
|
||||
}
|
||||
|
||||
write!(file, "{}", &spec.cluster.settings.as_pg_settings())?;
|
||||
|
||||
// Add options for connecting to storage
|
||||
writeln!(file, "# Neon storage settings")?;
|
||||
if let Some(s) = &spec.pageserver_connstring {
|
||||
writeln!(
|
||||
file,
|
||||
"neon.pageserver_connstring='{}'",
|
||||
escape_conf_value(s)
|
||||
)?;
|
||||
}
|
||||
if !spec.safekeeper_connstrings.is_empty() {
|
||||
writeln!(
|
||||
file,
|
||||
"neon.safekeepers='{}'",
|
||||
escape_conf_value(&spec.safekeeper_connstrings.join(","))
|
||||
)?;
|
||||
}
|
||||
if let Some(s) = &spec.tenant_id {
|
||||
writeln!(
|
||||
file,
|
||||
"neon.tenant_id='{}'",
|
||||
escape_conf_value(&s.to_string())
|
||||
)?;
|
||||
}
|
||||
if let Some(s) = &spec.timeline_id {
|
||||
writeln!(
|
||||
file,
|
||||
"neon.timeline_id='{}'",
|
||||
escape_conf_value(&s.to_string())
|
||||
)?;
|
||||
}
|
||||
|
||||
match spec.mode {
|
||||
ComputeMode::Primary => {}
|
||||
ComputeMode::Static(lsn) => {
|
||||
@@ -53,7 +88,12 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
|
||||
}
|
||||
}
|
||||
|
||||
writeln!(file, "# Managed by compute_ctl: end")?;
|
||||
// If there are any extra options in the 'settings' field, append those
|
||||
if spec.cluster.settings.is_some() {
|
||||
writeln!(file, "# Managed by compute_ctl: begin")?;
|
||||
write!(file, "{}", spec.cluster.settings.as_pg_settings())?;
|
||||
writeln!(file, "# Managed by compute_ctl: end")?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -220,8 +220,8 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
|
||||
|
||||
// Main Hyper HTTP server function that runs it and blocks waiting on it forever.
|
||||
#[tokio::main]
|
||||
async fn serve(state: Arc<ComputeNode>) {
|
||||
let addr = SocketAddr::from(([0, 0, 0, 0], 3080));
|
||||
async fn serve(port: u16, state: Arc<ComputeNode>) {
|
||||
let addr = SocketAddr::from(([0, 0, 0, 0], port));
|
||||
|
||||
let make_service = make_service_fn(move |_conn| {
|
||||
let state = state.clone();
|
||||
@@ -256,10 +256,10 @@ async fn serve(state: Arc<ComputeNode>) {
|
||||
}
|
||||
|
||||
/// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`.
|
||||
pub fn launch_http_server(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
|
||||
pub fn launch_http_server(port: u16, state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
|
||||
let state = Arc::clone(state);
|
||||
|
||||
Ok(thread::Builder::new()
|
||||
.name("http-endpoint".into())
|
||||
.spawn(move || serve(state))?)
|
||||
.spawn(move || serve(port, state))?)
|
||||
}
|
||||
|
||||
@@ -33,5 +33,7 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
|
||||
.init();
|
||||
tracing::info!("logging and tracing started");
|
||||
|
||||
utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -23,7 +23,7 @@ fn escape_literal(s: &str) -> String {
|
||||
|
||||
/// Escape a string so that it can be used in postgresql.conf.
|
||||
/// Same as escape_literal, currently.
|
||||
fn escape_conf_value(s: &str) -> String {
|
||||
pub fn escape_conf_value(s: &str) -> String {
|
||||
s.replace('\'', "''").replace('\\', "\\\\")
|
||||
}
|
||||
|
||||
@@ -121,9 +121,8 @@ impl RoleExt for Role {
|
||||
/// string of arguments.
|
||||
fn to_pg_options(&self) -> String {
|
||||
// XXX: consider putting LOGIN as a default option somewhere higher, e.g. in control-plane.
|
||||
// For now, we do not use generic `options` for roles. Once used, add
|
||||
// `self.options.as_pg_options()` somewhere here.
|
||||
let mut params: String = "LOGIN".to_string();
|
||||
let mut params: String = self.options.as_pg_options();
|
||||
params.push_str(" LOGIN");
|
||||
|
||||
if let Some(pass) = &self.encrypted_password {
|
||||
// Some time ago we supported only md5 and treated all encrypted_password as md5.
|
||||
|
||||
@@ -62,7 +62,7 @@ fn do_control_plane_request(
|
||||
}
|
||||
}
|
||||
|
||||
/// Request spec from the control-plane by compute_id. If `NEON_CONSOLE_JWT`
|
||||
/// Request spec from the control-plane by compute_id. If `NEON_CONTROL_PLANE_TOKEN`
|
||||
/// env variable is set, it will be used for authorization.
|
||||
pub fn get_spec_from_control_plane(
|
||||
base_uri: &str,
|
||||
|
||||
@@ -16,7 +16,7 @@ mod pg_helpers_tests {
|
||||
);
|
||||
assert_eq!(
|
||||
spec.cluster.roles.first().unwrap().to_pg_options(),
|
||||
"LOGIN PASSWORD 'md56b1d16b78004bbd51fa06af9eda75972'"
|
||||
" LOGIN PASSWORD 'md56b1d16b78004bbd51fa06af9eda75972'"
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -41,7 +41,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
|
||||
const DEFAULT_BRANCH_NAME: &str = "main";
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
const DEFAULT_PG_VERSION: &str = "14";
|
||||
const DEFAULT_PG_VERSION: &str = "15";
|
||||
|
||||
fn default_conf() -> String {
|
||||
format!(
|
||||
@@ -476,10 +476,11 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
|
||||
|
||||
println!("Creating endpoint for imported timeline ...");
|
||||
cplane.new_endpoint(
|
||||
tenant_id,
|
||||
name,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
None,
|
||||
None,
|
||||
pg_version,
|
||||
ComputeMode::Primary,
|
||||
)?;
|
||||
@@ -591,7 +592,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
|
||||
|
||||
table.add_row([
|
||||
endpoint_id.as_str(),
|
||||
&endpoint.address.to_string(),
|
||||
&endpoint.pg_address.to_string(),
|
||||
&endpoint.timeline_id.to_string(),
|
||||
branch_name,
|
||||
lsn_str.as_str(),
|
||||
@@ -620,8 +621,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
|
||||
.get_branch_timeline_id(branch_name, tenant_id)
|
||||
.ok_or_else(|| anyhow!("Found no timeline id for branch name '{branch_name}'"))?;
|
||||
|
||||
let port: Option<u16> = sub_args.get_one::<u16>("port").copied();
|
||||
|
||||
let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied();
|
||||
let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied();
|
||||
let pg_version = sub_args
|
||||
.get_one::<u32>("pg-version")
|
||||
.copied()
|
||||
@@ -639,14 +640,38 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
|
||||
(Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
|
||||
};
|
||||
|
||||
cplane.new_endpoint(tenant_id, &endpoint_id, timeline_id, port, pg_version, mode)?;
|
||||
cplane.new_endpoint(
|
||||
&endpoint_id,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
pg_port,
|
||||
http_port,
|
||||
pg_version,
|
||||
mode,
|
||||
)?;
|
||||
}
|
||||
"start" => {
|
||||
let port: Option<u16> = sub_args.get_one::<u16>("port").copied();
|
||||
let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied();
|
||||
let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied();
|
||||
let endpoint_id = sub_args
|
||||
.get_one::<String>("endpoint_id")
|
||||
.ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;
|
||||
|
||||
// If --safekeepers argument is given, use only the listed safekeeper nodes.
|
||||
let safekeepers =
|
||||
if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
|
||||
let mut safekeepers: Vec<NodeId> = Vec::new();
|
||||
for sk_id in safekeepers_str.split(',').map(str::trim) {
|
||||
let sk_id = NodeId(u64::from_str(sk_id).map_err(|_| {
|
||||
anyhow!("invalid node ID \"{sk_id}\" in --safekeepers list")
|
||||
})?);
|
||||
safekeepers.push(sk_id);
|
||||
}
|
||||
safekeepers
|
||||
} else {
|
||||
env.safekeepers.iter().map(|sk| sk.id).collect()
|
||||
};
|
||||
|
||||
let endpoint = cplane.endpoints.get(endpoint_id.as_str());
|
||||
|
||||
let auth_token = if matches!(env.pageserver.pg_auth_type, AuthType::NeonJWT) {
|
||||
@@ -673,7 +698,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
|
||||
_ => {}
|
||||
}
|
||||
println!("Starting existing endpoint {endpoint_id}...");
|
||||
endpoint.start(&auth_token)?;
|
||||
endpoint.start(&auth_token, safekeepers)?;
|
||||
} else {
|
||||
let branch_name = sub_args
|
||||
.get_one::<String>("branch-name")
|
||||
@@ -709,14 +734,15 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
|
||||
println!("Starting new endpoint {endpoint_id} (PostgreSQL v{pg_version}) on timeline {timeline_id} ...");
|
||||
|
||||
let ep = cplane.new_endpoint(
|
||||
tenant_id,
|
||||
endpoint_id,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
port,
|
||||
pg_port,
|
||||
http_port,
|
||||
pg_version,
|
||||
mode,
|
||||
)?;
|
||||
ep.start(&auth_token)?;
|
||||
ep.start(&auth_token, safekeepers)?;
|
||||
}
|
||||
}
|
||||
"stop" => {
|
||||
@@ -944,11 +970,22 @@ fn cli() -> Command {
|
||||
.value_parser(value_parser!(u32))
|
||||
.default_value(DEFAULT_PG_VERSION);
|
||||
|
||||
let port_arg = Arg::new("port")
|
||||
.long("port")
|
||||
let pg_port_arg = Arg::new("pg-port")
|
||||
.long("pg-port")
|
||||
.required(false)
|
||||
.value_parser(value_parser!(u16))
|
||||
.value_name("port");
|
||||
.value_name("pg-port");
|
||||
|
||||
let http_port_arg = Arg::new("http-port")
|
||||
.long("http-port")
|
||||
.required(false)
|
||||
.value_parser(value_parser!(u16))
|
||||
.value_name("http-port");
|
||||
|
||||
let safekeepers_arg = Arg::new("safekeepers")
|
||||
.long("safekeepers")
|
||||
.required(false)
|
||||
.value_name("safekeepers");
|
||||
|
||||
let stop_mode_arg = Arg::new("stop-mode")
|
||||
.short('m')
|
||||
@@ -1093,7 +1130,8 @@ fn cli() -> Command {
|
||||
.arg(branch_name_arg.clone())
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(lsn_arg.clone())
|
||||
.arg(port_arg.clone())
|
||||
.arg(pg_port_arg.clone())
|
||||
.arg(http_port_arg.clone())
|
||||
.arg(
|
||||
Arg::new("config-only")
|
||||
.help("Don't do basebackup, create endpoint directory with only config files")
|
||||
@@ -1109,9 +1147,11 @@ fn cli() -> Command {
|
||||
.arg(branch_name_arg)
|
||||
.arg(timeline_id_arg)
|
||||
.arg(lsn_arg)
|
||||
.arg(port_arg)
|
||||
.arg(pg_port_arg)
|
||||
.arg(http_port_arg)
|
||||
.arg(pg_version_arg)
|
||||
.arg(hot_standby_arg)
|
||||
.arg(safekeepers_arg)
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("stop")
|
||||
|
||||
@@ -1,3 +1,9 @@
|
||||
//! Code to manage the storage broker
|
||||
//!
|
||||
//! In the local test environment, the data for each safekeeper is stored in
|
||||
//!
|
||||
//! .neon/safekeepers/<safekeeper id>
|
||||
//!
|
||||
use anyhow::Context;
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
@@ -1,40 +1,71 @@
|
||||
//! Code to manage compute endpoints
|
||||
//!
|
||||
//! In the local test environment, the data for each endpoint is stored in
|
||||
//!
|
||||
//! .neon/endpoints/<endpoint id>
|
||||
//!
|
||||
//! Some basic information about the endpoint, like the tenant and timeline IDs,
|
||||
//! are stored in the `endpoint.json` file. The `endpoint.json` file is created
|
||||
//! when the endpoint is created, and doesn't change afterwards.
|
||||
//!
|
||||
//! The endpoint is managed by the `compute_ctl` binary. When an endpoint is
|
||||
//! started, we launch `compute_ctl` It synchronizes the safekeepers, downloads
|
||||
//! the basebackup from the pageserver to initialize the the data directory, and
|
||||
//! finally launches the PostgreSQL process. It watches the PostgreSQL process
|
||||
//! until it exits.
|
||||
//!
|
||||
//! When an endpoint is created, a `postgresql.conf` file is also created in
|
||||
//! the endpoint's directory. The file can be modified before starting PostgreSQL.
|
||||
//! However, the `postgresql.conf` file in the endpoint directory is not used directly
|
||||
//! by PostgreSQL. It is passed to `compute_ctl`, and `compute_ctl` writes another
|
||||
//! copy of it in the data directory.
|
||||
//!
|
||||
//! Directory contents:
|
||||
//!
|
||||
//! ```ignore
|
||||
//! .neon/endpoints/main/
|
||||
//! compute.log - log output of `compute_ctl` and `postgres`
|
||||
//! endpoint.json - serialized `EndpointConf` struct
|
||||
//! postgresql.conf - postgresql settings
|
||||
//! spec.json - passed to `compute_ctl`
|
||||
//! pgdata/
|
||||
//! postgresql.conf - copy of postgresql.conf created by `compute_ctl`
|
||||
//! zenith.signal
|
||||
//! <other PostgreSQL files>
|
||||
//! ```
|
||||
//!
|
||||
use std::collections::BTreeMap;
|
||||
use std::fs::{self, File};
|
||||
use std::io::Write;
|
||||
use std::net::SocketAddr;
|
||||
use std::net::TcpStream;
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
use std::path::PathBuf;
|
||||
use std::process::{Command, Stdio};
|
||||
use std::str::FromStr;
|
||||
use std::process::Command;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
use crate::pageserver::PageServerNode;
|
||||
use crate::postgresql_conf::PostgresConf;
|
||||
|
||||
use compute_api::spec::ComputeMode;
|
||||
use compute_api::responses::{ComputeState, ComputeStatus};
|
||||
use compute_api::spec::{Cluster, ComputeMode, ComputeSpec};
|
||||
|
||||
// contents of a endpoint.json file
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||
pub struct EndpointConf {
|
||||
name: String,
|
||||
endpoint_id: String,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
tenant_id: TenantId,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
timeline_id: TimelineId,
|
||||
mode: ComputeMode,
|
||||
port: u16,
|
||||
pg_port: u16,
|
||||
http_port: u16,
|
||||
pg_version: u32,
|
||||
}
|
||||
|
||||
@@ -57,11 +88,11 @@ impl ComputeControlPlane {
|
||||
let pageserver = Arc::new(PageServerNode::from_env(&env));
|
||||
|
||||
let mut endpoints = BTreeMap::default();
|
||||
for endpoint_dir in fs::read_dir(env.endpoints_path())
|
||||
for endpoint_dir in std::fs::read_dir(env.endpoints_path())
|
||||
.with_context(|| format!("failed to list {}", env.endpoints_path().display()))?
|
||||
{
|
||||
let ep = Endpoint::from_dir_entry(endpoint_dir?, &env, &pageserver)?;
|
||||
endpoints.insert(ep.name.clone(), Arc::new(ep));
|
||||
endpoints.insert(ep.endpoint_id.clone(), Arc::new(ep));
|
||||
}
|
||||
|
||||
Ok(ComputeControlPlane {
|
||||
@@ -76,25 +107,28 @@ impl ComputeControlPlane {
|
||||
1 + self
|
||||
.endpoints
|
||||
.values()
|
||||
.map(|ep| ep.address.port())
|
||||
.map(|ep| std::cmp::max(ep.pg_address.port(), ep.http_address.port()))
|
||||
.max()
|
||||
.unwrap_or(self.base_port)
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn new_endpoint(
|
||||
&mut self,
|
||||
endpoint_id: &str,
|
||||
tenant_id: TenantId,
|
||||
name: &str,
|
||||
timeline_id: TimelineId,
|
||||
port: Option<u16>,
|
||||
pg_port: Option<u16>,
|
||||
http_port: Option<u16>,
|
||||
pg_version: u32,
|
||||
mode: ComputeMode,
|
||||
) -> Result<Arc<Endpoint>> {
|
||||
let port = port.unwrap_or_else(|| self.get_port());
|
||||
|
||||
let pg_port = pg_port.unwrap_or_else(|| self.get_port());
|
||||
let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
|
||||
let ep = Arc::new(Endpoint {
|
||||
name: name.to_owned(),
|
||||
address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
|
||||
endpoint_id: endpoint_id.to_owned(),
|
||||
pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
|
||||
http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port),
|
||||
env: self.env.clone(),
|
||||
pageserver: Arc::clone(&self.pageserver),
|
||||
timeline_id,
|
||||
@@ -102,21 +136,27 @@ impl ComputeControlPlane {
|
||||
tenant_id,
|
||||
pg_version,
|
||||
});
|
||||
ep.create_pgdata()?;
|
||||
|
||||
ep.create_endpoint_dir()?;
|
||||
std::fs::write(
|
||||
ep.endpoint_path().join("endpoint.json"),
|
||||
serde_json::to_string_pretty(&EndpointConf {
|
||||
name: name.to_string(),
|
||||
endpoint_id: endpoint_id.to_string(),
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
mode,
|
||||
port,
|
||||
http_port,
|
||||
pg_port,
|
||||
pg_version,
|
||||
})?,
|
||||
)?;
|
||||
ep.setup_pg_conf()?;
|
||||
std::fs::write(
|
||||
ep.endpoint_path().join("postgresql.conf"),
|
||||
ep.setup_pg_conf()?.to_string(),
|
||||
)?;
|
||||
|
||||
self.endpoints.insert(ep.name.clone(), Arc::clone(&ep));
|
||||
self.endpoints
|
||||
.insert(ep.endpoint_id.clone(), Arc::clone(&ep));
|
||||
|
||||
Ok(ep)
|
||||
}
|
||||
@@ -127,13 +167,15 @@ impl ComputeControlPlane {
|
||||
#[derive(Debug)]
|
||||
pub struct Endpoint {
|
||||
/// used as the directory name
|
||||
name: String,
|
||||
endpoint_id: String,
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub mode: ComputeMode,
|
||||
|
||||
// port and address of the Postgres server
|
||||
pub address: SocketAddr,
|
||||
// port and address of the Postgres server and `compute_ctl`'s HTTP API
|
||||
pub pg_address: SocketAddr,
|
||||
pub http_address: SocketAddr,
|
||||
|
||||
// postgres major version in the format: 14, 15, etc.
|
||||
pg_version: u32,
|
||||
|
||||
@@ -158,16 +200,16 @@ impl Endpoint {
|
||||
|
||||
// parse data directory name
|
||||
let fname = entry.file_name();
|
||||
let name = fname.to_str().unwrap().to_string();
|
||||
let endpoint_id = fname.to_str().unwrap().to_string();
|
||||
|
||||
// Read the endpoint.json file
|
||||
let conf: EndpointConf =
|
||||
serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?;
|
||||
|
||||
// ok now
|
||||
Ok(Endpoint {
|
||||
address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.port),
|
||||
name,
|
||||
pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port),
|
||||
http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port),
|
||||
endpoint_id,
|
||||
env: env.clone(),
|
||||
pageserver: Arc::clone(pageserver),
|
||||
timeline_id: conf.timeline_id,
|
||||
@@ -177,104 +219,17 @@ impl Endpoint {
|
||||
})
|
||||
}
|
||||
|
||||
fn sync_safekeepers(&self, auth_token: &Option<String>, pg_version: u32) -> Result<Lsn> {
|
||||
let pg_path = self.env.pg_bin_dir(pg_version)?.join("postgres");
|
||||
let mut cmd = Command::new(pg_path);
|
||||
|
||||
cmd.arg("--sync-safekeepers")
|
||||
.env_clear()
|
||||
.env(
|
||||
"LD_LIBRARY_PATH",
|
||||
self.env.pg_lib_dir(pg_version)?.to_str().unwrap(),
|
||||
)
|
||||
.env(
|
||||
"DYLD_LIBRARY_PATH",
|
||||
self.env.pg_lib_dir(pg_version)?.to_str().unwrap(),
|
||||
)
|
||||
.env("PGDATA", self.pgdata().to_str().unwrap())
|
||||
.stdout(Stdio::piped())
|
||||
// Comment this to avoid capturing stderr (useful if command hangs)
|
||||
.stderr(Stdio::piped());
|
||||
|
||||
if let Some(token) = auth_token {
|
||||
cmd.env("NEON_AUTH_TOKEN", token);
|
||||
}
|
||||
|
||||
let sync_handle = cmd
|
||||
.spawn()
|
||||
.expect("postgres --sync-safekeepers failed to start");
|
||||
|
||||
let sync_output = sync_handle
|
||||
.wait_with_output()
|
||||
.expect("postgres --sync-safekeepers failed");
|
||||
if !sync_output.status.success() {
|
||||
anyhow::bail!(
|
||||
"sync-safekeepers failed: '{}'",
|
||||
String::from_utf8_lossy(&sync_output.stderr)
|
||||
);
|
||||
}
|
||||
|
||||
let lsn = Lsn::from_str(std::str::from_utf8(&sync_output.stdout)?.trim())?;
|
||||
println!("Safekeepers synced on {}", lsn);
|
||||
Ok(lsn)
|
||||
}
|
||||
|
||||
/// Get basebackup from the pageserver as a tar archive and extract it
|
||||
/// to the `self.pgdata()` directory.
|
||||
fn do_basebackup(&self, lsn: Option<Lsn>) -> Result<()> {
|
||||
println!(
|
||||
"Extracting base backup to create postgres instance: path={} port={}",
|
||||
self.pgdata().display(),
|
||||
self.address.port()
|
||||
);
|
||||
|
||||
let sql = if let Some(lsn) = lsn {
|
||||
format!("basebackup {} {} {}", self.tenant_id, self.timeline_id, lsn)
|
||||
} else {
|
||||
format!("basebackup {} {}", self.tenant_id, self.timeline_id)
|
||||
};
|
||||
|
||||
let mut client = self
|
||||
.pageserver
|
||||
.page_server_psql_client()
|
||||
.context("connecting to page server failed")?;
|
||||
|
||||
let copyreader = client
|
||||
.copy_out(sql.as_str())
|
||||
.context("page server 'basebackup' command failed")?;
|
||||
|
||||
// Read the archive directly from the `CopyOutReader`
|
||||
//
|
||||
// Set `ignore_zeros` so that unpack() reads all the Copy data and
|
||||
// doesn't stop at the end-of-archive marker. Otherwise, if the server
|
||||
// sends an Error after finishing the tarball, we will not notice it.
|
||||
let mut ar = tar::Archive::new(copyreader);
|
||||
ar.set_ignore_zeros(true);
|
||||
ar.unpack(&self.pgdata())
|
||||
.context("extracting base backup failed")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn create_pgdata(&self) -> Result<()> {
|
||||
fs::create_dir_all(self.pgdata()).with_context(|| {
|
||||
fn create_endpoint_dir(&self) -> Result<()> {
|
||||
std::fs::create_dir_all(self.endpoint_path()).with_context(|| {
|
||||
format!(
|
||||
"could not create data directory {}",
|
||||
self.pgdata().display()
|
||||
"could not create endpoint directory {}",
|
||||
self.endpoint_path().display()
|
||||
)
|
||||
})?;
|
||||
fs::set_permissions(self.pgdata().as_path(), fs::Permissions::from_mode(0o700))
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"could not set permissions in data directory {}",
|
||||
self.pgdata().display()
|
||||
)
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
// Write postgresql.conf with default configuration
|
||||
// and PG_VERSION file to the data directory of a new endpoint.
|
||||
fn setup_pg_conf(&self) -> Result<()> {
|
||||
// Generate postgresql.conf with default configuration
|
||||
fn setup_pg_conf(&self) -> Result<PostgresConf> {
|
||||
let mut conf = PostgresConf::new();
|
||||
conf.append("max_wal_senders", "10");
|
||||
conf.append("wal_log_hints", "off");
|
||||
@@ -287,25 +242,14 @@ impl Endpoint {
|
||||
// wal_sender_timeout is the maximum time to wait for WAL replication.
|
||||
// It also defines how often the walreciever will send a feedback message to the wal sender.
|
||||
conf.append("wal_sender_timeout", "5s");
|
||||
conf.append("listen_addresses", &self.address.ip().to_string());
|
||||
conf.append("port", &self.address.port().to_string());
|
||||
conf.append("listen_addresses", &self.pg_address.ip().to_string());
|
||||
conf.append("port", &self.pg_address.port().to_string());
|
||||
conf.append("wal_keep_size", "0");
|
||||
// walproposer panics when basebackup is invalid, it is pointless to restart in this case.
|
||||
conf.append("restart_after_crash", "off");
|
||||
|
||||
// Configure the Neon Postgres extension to fetch pages from pageserver
|
||||
let pageserver_connstr = {
|
||||
let config = &self.pageserver.pg_connection_config;
|
||||
let (host, port) = (config.host(), config.port());
|
||||
|
||||
// NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere.
|
||||
format!("postgresql://no_user@{host}:{port}")
|
||||
};
|
||||
// Load the 'neon' extension
|
||||
conf.append("shared_preload_libraries", "neon");
|
||||
conf.append_line("");
|
||||
conf.append("neon.pageserver_connstring", &pageserver_connstr);
|
||||
conf.append("neon.tenant_id", &self.tenant_id.to_string());
|
||||
conf.append("neon.timeline_id", &self.timeline_id.to_string());
|
||||
|
||||
conf.append_line("");
|
||||
// Replication-related configurations, such as WAL sending
|
||||
@@ -390,46 +334,11 @@ impl Endpoint {
|
||||
}
|
||||
}
|
||||
|
||||
let mut file = File::create(self.pgdata().join("postgresql.conf"))?;
|
||||
file.write_all(conf.to_string().as_bytes())?;
|
||||
|
||||
let mut file = File::create(self.pgdata().join("PG_VERSION"))?;
|
||||
file.write_all(self.pg_version.to_string().as_bytes())?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn load_basebackup(&self, auth_token: &Option<String>) -> Result<()> {
|
||||
let backup_lsn = match &self.mode {
|
||||
ComputeMode::Primary => {
|
||||
if !self.env.safekeepers.is_empty() {
|
||||
// LSN 0 means that it is bootstrap and we need to download just
|
||||
// latest data from the pageserver. That is a bit clumsy but whole bootstrap
|
||||
// procedure evolves quite actively right now, so let's think about it again
|
||||
// when things would be more stable (TODO).
|
||||
let lsn = self.sync_safekeepers(auth_token, self.pg_version)?;
|
||||
if lsn == Lsn(0) {
|
||||
None
|
||||
} else {
|
||||
Some(lsn)
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
ComputeMode::Static(lsn) => Some(*lsn),
|
||||
ComputeMode::Replica => {
|
||||
None // Take the latest snapshot available to start with
|
||||
}
|
||||
};
|
||||
|
||||
self.do_basebackup(backup_lsn)?;
|
||||
|
||||
Ok(())
|
||||
Ok(conf)
|
||||
}
|
||||
|
||||
pub fn endpoint_path(&self) -> PathBuf {
|
||||
self.env.endpoints_path().join(&self.name)
|
||||
self.env.endpoints_path().join(&self.endpoint_id)
|
||||
}
|
||||
|
||||
pub fn pgdata(&self) -> PathBuf {
|
||||
@@ -439,7 +348,7 @@ impl Endpoint {
|
||||
pub fn status(&self) -> &str {
|
||||
let timeout = Duration::from_millis(300);
|
||||
let has_pidfile = self.pgdata().join("postmaster.pid").exists();
|
||||
let can_connect = TcpStream::connect_timeout(&self.address, timeout).is_ok();
|
||||
let can_connect = TcpStream::connect_timeout(&self.pg_address, timeout).is_ok();
|
||||
|
||||
match (has_pidfile, can_connect) {
|
||||
(true, true) => "running",
|
||||
@@ -457,8 +366,6 @@ impl Endpoint {
|
||||
&[
|
||||
"-D",
|
||||
self.pgdata().to_str().unwrap(),
|
||||
"-l",
|
||||
self.pgdata().join("pg.log").to_str().unwrap(),
|
||||
"-w", //wait till pg_ctl actually does what was asked
|
||||
],
|
||||
args,
|
||||
@@ -494,36 +401,183 @@ impl Endpoint {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn start(&self, auth_token: &Option<String>) -> Result<()> {
|
||||
pub fn start(&self, auth_token: &Option<String>, safekeepers: Vec<NodeId>) -> Result<()> {
|
||||
if self.status() == "running" {
|
||||
anyhow::bail!("The endpoint is already running");
|
||||
}
|
||||
|
||||
// 1. We always start Postgres from scratch, so
|
||||
// if old dir exists, preserve 'postgresql.conf' and drop the directory
|
||||
let postgresql_conf_path = self.pgdata().join("postgresql.conf");
|
||||
let postgresql_conf = fs::read(&postgresql_conf_path).with_context(|| {
|
||||
format!(
|
||||
"failed to read config file in {}",
|
||||
postgresql_conf_path.to_str().unwrap()
|
||||
)
|
||||
})?;
|
||||
fs::remove_dir_all(self.pgdata())?;
|
||||
self.create_pgdata()?;
|
||||
// Slurp the endpoints/<endpoint id>/postgresql.conf file into
|
||||
// memory. We will include it in the spec file that we pass to
|
||||
// `compute_ctl`, and `compute_ctl` will write it to the postgresql.conf
|
||||
// in the data directory.
|
||||
let postgresql_conf_path = self.endpoint_path().join("postgresql.conf");
|
||||
let postgresql_conf = match std::fs::read(&postgresql_conf_path) {
|
||||
Ok(content) => String::from_utf8(content)?,
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => "".to_string(),
|
||||
Err(e) => {
|
||||
return Err(anyhow::Error::new(e).context(format!(
|
||||
"failed to read config file in {}",
|
||||
postgresql_conf_path.to_str().unwrap()
|
||||
)))
|
||||
}
|
||||
};
|
||||
|
||||
// 2. Bring back config files
|
||||
fs::write(&postgresql_conf_path, postgresql_conf)?;
|
||||
|
||||
// 3. Load basebackup
|
||||
self.load_basebackup(auth_token)?;
|
||||
|
||||
if self.mode != ComputeMode::Primary {
|
||||
File::create(self.pgdata().join("standby.signal"))?;
|
||||
// We always start the compute node from scratch, so if the Postgres
|
||||
// data dir exists from a previous launch, remove it first.
|
||||
if self.pgdata().exists() {
|
||||
std::fs::remove_dir_all(self.pgdata())?;
|
||||
}
|
||||
|
||||
// 4. Finally start postgres
|
||||
println!("Starting postgres at '{}'", self.connstr());
|
||||
self.pg_ctl(&["start"], auth_token)
|
||||
let pageserver_connstring = {
|
||||
let config = &self.pageserver.pg_connection_config;
|
||||
let (host, port) = (config.host(), config.port());
|
||||
|
||||
// NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere.
|
||||
format!("postgresql://no_user@{host}:{port}")
|
||||
};
|
||||
let mut safekeeper_connstrings = Vec::new();
|
||||
if self.mode == ComputeMode::Primary {
|
||||
for sk_id in safekeepers {
|
||||
let sk = self
|
||||
.env
|
||||
.safekeepers
|
||||
.iter()
|
||||
.find(|node| node.id == sk_id)
|
||||
.ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?;
|
||||
safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.pg_port));
|
||||
}
|
||||
}
|
||||
|
||||
// Create spec file
|
||||
let spec = ComputeSpec {
|
||||
format_version: 1.0,
|
||||
operation_uuid: None,
|
||||
cluster: Cluster {
|
||||
cluster_id: None, // project ID: not used
|
||||
name: None, // project name: not used
|
||||
state: None,
|
||||
roles: vec![],
|
||||
databases: vec![],
|
||||
settings: None,
|
||||
postgresql_conf: Some(postgresql_conf),
|
||||
},
|
||||
delta_operations: None,
|
||||
tenant_id: Some(self.tenant_id),
|
||||
timeline_id: Some(self.timeline_id),
|
||||
mode: self.mode,
|
||||
pageserver_connstring: Some(pageserver_connstring),
|
||||
safekeeper_connstrings,
|
||||
storage_auth_token: auth_token.clone(),
|
||||
};
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
|
||||
|
||||
// Open log file. We'll redirect the stdout and stderr of `compute_ctl` to it.
|
||||
let logfile = std::fs::OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(self.endpoint_path().join("compute.log"))?;
|
||||
|
||||
// Launch compute_ctl
|
||||
println!("Starting postgres node at '{}'", self.connstr());
|
||||
let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
|
||||
cmd.args(["--http-port", &self.http_address.port().to_string()])
|
||||
.args(["--pgdata", self.pgdata().to_str().unwrap()])
|
||||
.args(["--connstr", &self.connstr()])
|
||||
.args([
|
||||
"--spec-path",
|
||||
self.endpoint_path().join("spec.json").to_str().unwrap(),
|
||||
])
|
||||
.args([
|
||||
"--pgbin",
|
||||
self.env
|
||||
.pg_bin_dir(self.pg_version)?
|
||||
.join("postgres")
|
||||
.to_str()
|
||||
.unwrap(),
|
||||
])
|
||||
.stdin(std::process::Stdio::null())
|
||||
.stderr(logfile.try_clone()?)
|
||||
.stdout(logfile);
|
||||
let _child = cmd.spawn()?;
|
||||
|
||||
// Wait for it to start
|
||||
let mut attempt = 0;
|
||||
const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100);
|
||||
const MAX_ATTEMPTS: u32 = 10 * 30; // Wait up to 30 s
|
||||
loop {
|
||||
attempt += 1;
|
||||
match self.get_status() {
|
||||
Ok(state) => {
|
||||
match state.status {
|
||||
ComputeStatus::Init => {
|
||||
if attempt == MAX_ATTEMPTS {
|
||||
bail!("compute startup timed out; still in Init state");
|
||||
}
|
||||
// keep retrying
|
||||
}
|
||||
ComputeStatus::Running => {
|
||||
// All good!
|
||||
break;
|
||||
}
|
||||
ComputeStatus::Failed => {
|
||||
bail!(
|
||||
"compute startup failed: {}",
|
||||
state
|
||||
.error
|
||||
.as_deref()
|
||||
.unwrap_or("<no error from compute_ctl>")
|
||||
);
|
||||
}
|
||||
ComputeStatus::Empty
|
||||
| ComputeStatus::ConfigurationPending
|
||||
| ComputeStatus::Configuration => {
|
||||
bail!("unexpected compute status: {:?}", state.status)
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
if attempt == MAX_ATTEMPTS {
|
||||
return Err(e).context(
|
||||
"timed out waiting to connect to compute_ctl HTTP; last error: {e}",
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
std::thread::sleep(ATTEMPT_INTERVAL);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Call the /status HTTP API
|
||||
pub fn get_status(&self) -> Result<ComputeState> {
|
||||
let client = reqwest::blocking::Client::new();
|
||||
|
||||
let response = client
|
||||
.request(
|
||||
reqwest::Method::GET,
|
||||
format!(
|
||||
"http://{}:{}/status",
|
||||
self.http_address.ip(),
|
||||
self.http_address.port()
|
||||
),
|
||||
)
|
||||
.send()?;
|
||||
|
||||
// Interpret the response
|
||||
let status = response.status();
|
||||
if !(status.is_client_error() || status.is_server_error()) {
|
||||
Ok(response.json()?)
|
||||
} else {
|
||||
// reqwest does not export its error construction utility functions, so let's craft the message ourselves
|
||||
let url = response.url().to_owned();
|
||||
let msg = match response.text() {
|
||||
Ok(err_body) => format!("Error: {}", err_body),
|
||||
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
|
||||
};
|
||||
Err(anyhow::anyhow!(msg))
|
||||
}
|
||||
}
|
||||
|
||||
pub fn stop(&self, destroy: bool) -> Result<()> {
|
||||
@@ -540,7 +594,7 @@ impl Endpoint {
|
||||
"Destroying postgres data directory '{}'",
|
||||
self.pgdata().to_str().unwrap()
|
||||
);
|
||||
fs::remove_dir_all(self.endpoint_path())?;
|
||||
std::fs::remove_dir_all(self.endpoint_path())?;
|
||||
} else {
|
||||
self.pg_ctl(&["stop"], &None)?;
|
||||
}
|
||||
@@ -549,10 +603,10 @@ impl Endpoint {
|
||||
|
||||
pub fn connstr(&self) -> String {
|
||||
format!(
|
||||
"host={} port={} user={} dbname={}",
|
||||
self.address.ip(),
|
||||
self.address.port(),
|
||||
"postgresql://{}@{}:{}/{}",
|
||||
"cloud_admin",
|
||||
self.pg_address.ip(),
|
||||
self.pg_address.port(),
|
||||
"postgres"
|
||||
)
|
||||
}
|
||||
|
||||
@@ -24,7 +24,7 @@ use utils::{
|
||||
|
||||
use crate::safekeeper::SafekeeperNode;
|
||||
|
||||
pub const DEFAULT_PG_VERSION: u32 = 14;
|
||||
pub const DEFAULT_PG_VERSION: u32 = 15;
|
||||
|
||||
//
|
||||
// This data structures represents neon_local CLI config
|
||||
@@ -37,7 +37,7 @@ pub const DEFAULT_PG_VERSION: u32 = 14;
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||
pub struct LocalEnv {
|
||||
// Base directory for all the nodes (the pageserver, safekeepers and
|
||||
// compute nodes).
|
||||
// compute endpoints).
|
||||
//
|
||||
// This is not stored in the config file. Rather, this is the path where the
|
||||
// config file itself is. It is read from the NEON_REPO_DIR env variable or
|
||||
|
||||
@@ -1,3 +1,9 @@
|
||||
//! Code to manage pageservers
|
||||
//!
|
||||
//! In the local test environment, the pageserver stores its data directly in
|
||||
//!
|
||||
//! .neon/
|
||||
//!
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
use std::fs::File;
|
||||
@@ -369,6 +375,11 @@ impl PageServerNode {
|
||||
evictions_low_residence_duration_metric_threshold: settings
|
||||
.remove("evictions_low_residence_duration_metric_threshold")
|
||||
.map(|x| x.to_string()),
|
||||
gc_feedback: settings
|
||||
.remove("gc_feedback")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'gc_feedback' as bool")?,
|
||||
};
|
||||
|
||||
// If tenant ID was not specified, generate one
|
||||
@@ -463,6 +474,11 @@ impl PageServerNode {
|
||||
evictions_low_residence_duration_metric_threshold: settings
|
||||
.remove("evictions_low_residence_duration_metric_threshold")
|
||||
.map(|x| x.to_string()),
|
||||
gc_feedback: settings
|
||||
.remove("gc_feedback")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'gc_feedback' as bool")?,
|
||||
}
|
||||
};
|
||||
|
||||
@@ -499,6 +515,9 @@ impl PageServerNode {
|
||||
ancestor_timeline_id: Option<TimelineId>,
|
||||
pg_version: Option<u32>,
|
||||
) -> anyhow::Result<TimelineInfo> {
|
||||
// If timeline ID was not specified, generate one
|
||||
let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate());
|
||||
|
||||
self.http_request(
|
||||
Method::POST,
|
||||
format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
|
||||
|
||||
@@ -1,3 +1,9 @@
|
||||
//! Code to manage safekeepers
|
||||
//!
|
||||
//! In the local test environment, the data for each safekeeper is stored in
|
||||
//!
|
||||
//! .neon/safekeepers/<safekeeper id>
|
||||
//!
|
||||
use std::io::Write;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Child;
|
||||
|
||||
@@ -1,6 +1,14 @@
|
||||
#!/bin/bash
|
||||
set -eux
|
||||
|
||||
# Generate a random tenant or timeline ID
|
||||
#
|
||||
# Takes a variable name as argument. The result is stored in that variable.
|
||||
generate_id() {
|
||||
local -n resvar=$1
|
||||
printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM
|
||||
}
|
||||
|
||||
PG_VERSION=${PG_VERSION:-14}
|
||||
|
||||
SPEC_FILE_ORG=/var/db/postgres/specs/spec.json
|
||||
@@ -13,29 +21,29 @@ done
|
||||
echo "Page server is ready."
|
||||
|
||||
echo "Create a tenant and timeline"
|
||||
generate_id tenant_id
|
||||
PARAMS=(
|
||||
-sb
|
||||
-X POST
|
||||
-H "Content-Type: application/json"
|
||||
-d "{}"
|
||||
-d "{\"new_tenant_id\": \"${tenant_id}\"}"
|
||||
http://pageserver:9898/v1/tenant/
|
||||
)
|
||||
tenant_id=$(curl "${PARAMS[@]}" | sed 's/"//g')
|
||||
result=$(curl "${PARAMS[@]}")
|
||||
echo $result | jq .
|
||||
|
||||
generate_id timeline_id
|
||||
PARAMS=(
|
||||
-sb
|
||||
-X POST
|
||||
-H "Content-Type: application/json"
|
||||
-d "{\"tenant_id\":\"${tenant_id}\", \"pg_version\": ${PG_VERSION}}"
|
||||
-d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}"
|
||||
"http://pageserver:9898/v1/tenant/${tenant_id}/timeline/"
|
||||
)
|
||||
result=$(curl "${PARAMS[@]}")
|
||||
echo $result | jq .
|
||||
|
||||
echo "Overwrite tenant id and timeline id in spec file"
|
||||
tenant_id=$(echo ${result} | jq -r .tenant_id)
|
||||
timeline_id=$(echo ${result} | jq -r .timeline_id)
|
||||
|
||||
sed "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE_ORG} > ${SPEC_FILE}
|
||||
sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE}
|
||||
|
||||
|
||||
@@ -52,9 +52,7 @@ completion, or shield the rest of the code from surprise cancellations
|
||||
by spawning a separate task. The code that handles incoming HTTP
|
||||
requests, for example, spawns a separate task for each request,
|
||||
because Hyper will drop the request-handling Future if the HTTP
|
||||
connection is lost. (FIXME: our HTTP handlers do not do that
|
||||
currently, but we should fix that. See [issue
|
||||
3478](https://github.com/neondatabase/neon/issues/3478)).
|
||||
connection is lost.
|
||||
|
||||
|
||||
#### How to cancel, then?
|
||||
|
||||
@@ -5,13 +5,13 @@ use serde::{Deserialize, Serialize, Serializer};
|
||||
|
||||
use crate::spec::ComputeSpec;
|
||||
|
||||
#[derive(Serialize, Debug)]
|
||||
#[derive(Serialize, Debug, Deserialize)]
|
||||
pub struct GenericAPIError {
|
||||
pub error: String,
|
||||
}
|
||||
|
||||
/// Response of the /status API
|
||||
#[derive(Serialize, Debug)]
|
||||
#[derive(Serialize, Debug, Deserialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub struct ComputeStatusResponse {
|
||||
pub start_time: DateTime<Utc>,
|
||||
@@ -23,7 +23,7 @@ pub struct ComputeStatusResponse {
|
||||
pub error: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
#[derive(Deserialize, Serialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub struct ComputeState {
|
||||
pub status: ComputeStatus,
|
||||
@@ -33,7 +33,7 @@ pub struct ComputeState {
|
||||
pub error: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Clone, Copy, Debug, PartialEq, Eq)]
|
||||
#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum ComputeStatus {
|
||||
// Spec wasn't provided at start, waiting for it to be
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
//! and connect it to the storage nodes.
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
/// String type alias representing Postgres identifier and
|
||||
@@ -14,7 +15,7 @@ pub type PgIdent = String;
|
||||
/// Cluster spec or configuration represented as an optional number of
|
||||
/// delta operations + final cluster state description.
|
||||
#[serde_as]
|
||||
#[derive(Clone, Debug, Default, Deserialize)]
|
||||
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
|
||||
pub struct ComputeSpec {
|
||||
pub format_version: f32,
|
||||
|
||||
@@ -26,9 +27,32 @@ pub struct ComputeSpec {
|
||||
pub cluster: Cluster,
|
||||
pub delta_operations: Option<Vec<DeltaOp>>,
|
||||
|
||||
// Information needed to connect to the storage layer.
|
||||
//
|
||||
// `tenant_id`, `timeline_id` and `pageserver_connstring` are always needed.
|
||||
//
|
||||
// Depending on `mode`, this can be a primary read-write node, a read-only
|
||||
// replica, or a read-only node pinned at an older LSN.
|
||||
// `safekeeper_connstrings` must be set for a primary.
|
||||
//
|
||||
// For backwards compatibility, the control plane may leave out all of
|
||||
// these, and instead set the "neon.tenant_id", "neon.timeline_id",
|
||||
// etc. GUCs in cluster.settings. TODO: Once the control plane has been
|
||||
// updated to fill these fields, we can make these non optional.
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub tenant_id: Option<TenantId>,
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub timeline_id: Option<TimelineId>,
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub pageserver_connstring: Option<String>,
|
||||
#[serde(default)]
|
||||
pub safekeeper_connstrings: Vec<String>,
|
||||
|
||||
#[serde(default)]
|
||||
pub mode: ComputeMode,
|
||||
|
||||
/// If set, 'storage_auth_token' is used as the password to authenticate to
|
||||
/// the pageserver and safekeepers.
|
||||
pub storage_auth_token: Option<String>,
|
||||
}
|
||||
|
||||
@@ -47,13 +71,19 @@ pub enum ComputeMode {
|
||||
Replica,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default, Deserialize)]
|
||||
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
|
||||
pub struct Cluster {
|
||||
pub cluster_id: String,
|
||||
pub name: String,
|
||||
pub cluster_id: Option<String>,
|
||||
pub name: Option<String>,
|
||||
pub state: Option<String>,
|
||||
pub roles: Vec<Role>,
|
||||
pub databases: Vec<Database>,
|
||||
|
||||
/// Desired contents of 'postgresql.conf' file. (The 'compute_ctl'
|
||||
/// tool may add additional settings to the final file.)
|
||||
pub postgresql_conf: Option<String>,
|
||||
|
||||
/// Additional settings that will be appended to the 'postgresql.conf' file.
|
||||
pub settings: GenericOptions,
|
||||
}
|
||||
|
||||
@@ -63,7 +93,7 @@ pub struct Cluster {
|
||||
/// - DROP ROLE
|
||||
/// - ALTER ROLE name RENAME TO new_name
|
||||
/// - ALTER DATABASE name RENAME TO new_name
|
||||
#[derive(Clone, Debug, Deserialize)]
|
||||
#[derive(Clone, Debug, Deserialize, Serialize)]
|
||||
pub struct DeltaOp {
|
||||
pub action: String,
|
||||
pub name: PgIdent,
|
||||
@@ -72,7 +102,7 @@ pub struct DeltaOp {
|
||||
|
||||
/// Rust representation of Postgres role info with only those fields
|
||||
/// that matter for us.
|
||||
#[derive(Clone, Debug, Deserialize)]
|
||||
#[derive(Clone, Debug, Deserialize, Serialize)]
|
||||
pub struct Role {
|
||||
pub name: PgIdent,
|
||||
pub encrypted_password: Option<String>,
|
||||
@@ -81,7 +111,7 @@ pub struct Role {
|
||||
|
||||
/// Rust representation of Postgres database info with only those fields
|
||||
/// that matter for us.
|
||||
#[derive(Clone, Debug, Deserialize)]
|
||||
#[derive(Clone, Debug, Deserialize, Serialize)]
|
||||
pub struct Database {
|
||||
pub name: PgIdent,
|
||||
pub owner: PgIdent,
|
||||
@@ -91,7 +121,7 @@ pub struct Database {
|
||||
/// Common type representing both SQL statement params with or without value,
|
||||
/// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config
|
||||
/// options like `wal_level = logical`.
|
||||
#[derive(Clone, Debug, Deserialize)]
|
||||
#[derive(Clone, Debug, Deserialize, Serialize)]
|
||||
pub struct GenericOption {
|
||||
pub name: String,
|
||||
pub value: Option<String>,
|
||||
|
||||
@@ -18,7 +18,29 @@ use crate::reltag::RelTag;
|
||||
use anyhow::bail;
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
|
||||
/// A state of a tenant in pageserver's memory.
|
||||
/// The state of a tenant in this pageserver.
|
||||
///
|
||||
/// ```mermaid
|
||||
/// stateDiagram-v2
|
||||
///
|
||||
/// [*] --> Loading: spawn_load()
|
||||
/// [*] --> Attaching: spawn_attach()
|
||||
///
|
||||
/// Loading --> Activating: activate()
|
||||
/// Attaching --> Activating: activate()
|
||||
/// Activating --> Active: infallible
|
||||
///
|
||||
/// Loading --> Broken: load() failure
|
||||
/// Attaching --> Broken: attach() failure
|
||||
///
|
||||
/// Active --> Stopping: set_stopping(), part of shutdown & detach
|
||||
/// Stopping --> Broken: late error in remove_tenant_from_memory
|
||||
///
|
||||
/// Broken --> [*]: ignore / detach / shutdown
|
||||
/// Stopping --> [*]: remove_from_memory complete
|
||||
///
|
||||
/// Active --> Broken: cfg(testing)-only tenant break point
|
||||
/// ```
|
||||
#[derive(
|
||||
Clone,
|
||||
PartialEq,
|
||||
@@ -26,51 +48,73 @@ use bytes::{BufMut, Bytes, BytesMut};
|
||||
serde::Serialize,
|
||||
serde::Deserialize,
|
||||
strum_macros::Display,
|
||||
strum_macros::EnumString,
|
||||
strum_macros::EnumVariantNames,
|
||||
strum_macros::AsRefStr,
|
||||
strum_macros::IntoStaticStr,
|
||||
)]
|
||||
#[serde(tag = "slug", content = "data")]
|
||||
pub enum TenantState {
|
||||
/// This tenant is being loaded from local disk
|
||||
/// This tenant is being loaded from local disk.
|
||||
///
|
||||
/// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
|
||||
Loading,
|
||||
/// This tenant is being downloaded from cloud storage.
|
||||
/// This tenant is being attached to the pageserver.
|
||||
///
|
||||
/// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
|
||||
Attaching,
|
||||
/// Tenant is fully operational
|
||||
/// The tenant is transitioning from Loading/Attaching to Active.
|
||||
///
|
||||
/// While in this state, the individual timelines are being activated.
|
||||
///
|
||||
/// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
|
||||
Activating(ActivatingFrom),
|
||||
/// The tenant has finished activating and is open for business.
|
||||
///
|
||||
/// Transitions out of this state are possible through `set_stopping()` and `set_broken()`.
|
||||
Active,
|
||||
/// A tenant is recognized by pageserver, but it is being detached or the
|
||||
/// The tenant is recognized by pageserver, but it is being detached or the
|
||||
/// system is being shut down.
|
||||
///
|
||||
/// Transitions out of this state are possible through `set_broken()`.
|
||||
Stopping,
|
||||
/// A tenant is recognized by the pageserver, but can no longer be used for
|
||||
/// any operations, because it failed to be activated.
|
||||
/// The tenant is recognized by the pageserver, but can no longer be used for
|
||||
/// any operations.
|
||||
///
|
||||
/// If the tenant fails to load or attach, it will transition to this state
|
||||
/// and it is guaranteed that no background tasks are running in its name.
|
||||
///
|
||||
/// The other way to transition into this state is from `Stopping` state
|
||||
/// through `set_broken()` called from `remove_tenant_from_memory()`. That happens
|
||||
/// if the cleanup future executed by `remove_tenant_from_memory()` fails.
|
||||
Broken { reason: String, backtrace: String },
|
||||
}
|
||||
|
||||
impl TenantState {
|
||||
pub fn attachment_status(&self) -> TenantAttachmentStatus {
|
||||
use TenantAttachmentStatus::*;
|
||||
|
||||
// Below TenantState::Activating is used as "transient" or "transparent" state for
|
||||
// attachment_status determining.
|
||||
match self {
|
||||
// The attach procedure writes the marker file before adding the Attaching tenant to the tenants map.
|
||||
// So, technically, we can return Attached here.
|
||||
// However, as soon as Console observes Attached, it will proceed with the Postgres-level health check.
|
||||
// But, our attach task might still be fetching the remote timelines, etc.
|
||||
// So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
|
||||
Self::Attaching => Maybe,
|
||||
Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe,
|
||||
// tenant mgr startup distinguishes attaching from loading via marker file.
|
||||
// If it's loading, there is no attach marker file, i.e., attach had finished in the past.
|
||||
Self::Loading => Attached,
|
||||
Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached,
|
||||
// We only reach Active after successful load / attach.
|
||||
// So, call atttachment status Attached.
|
||||
Self::Active => Attached,
|
||||
// If the (initial or resumed) attach procedure fails, the tenant becomes Broken.
|
||||
// However, it also becomes Broken if the regular load fails.
|
||||
// We would need a separate TenantState variant to distinguish these cases.
|
||||
// However, there's no practical difference from Console's perspective.
|
||||
// It will run a Postgres-level health check as soon as it observes Attached.
|
||||
// That will fail on Broken tenants.
|
||||
// Console can then rollback the attach, or, wait for operator to fix the Broken tenant.
|
||||
Self::Broken { .. } => Attached,
|
||||
// From Console's perspective there's no practical difference
|
||||
// because attachment_status is polled by console only during attach operation execution.
|
||||
Self::Broken { reason, .. } => Failed {
|
||||
reason: reason.to_owned(),
|
||||
},
|
||||
// Why is Stopping a Maybe case? Because, during pageserver shutdown,
|
||||
// we set the Stopping state irrespective of whether the tenant
|
||||
// has finished attaching or not.
|
||||
@@ -98,8 +142,17 @@ impl std::fmt::Debug for TenantState {
|
||||
}
|
||||
}
|
||||
|
||||
/// The only [`TenantState`] variants we could be `TenantState::Activating` from.
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
pub enum ActivatingFrom {
|
||||
/// Arrived to [`TenantState::Activating`] from [`TenantState::Loading`]
|
||||
Loading,
|
||||
/// Arrived to [`TenantState::Activating`] from [`TenantState::Attaching`]
|
||||
Attaching,
|
||||
}
|
||||
|
||||
/// A state of a timeline in pageserver's memory.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
pub enum TimelineState {
|
||||
/// The timeline is recognized by the pageserver but is not yet operational.
|
||||
/// In particular, the walreceiver connection loop is not running for this timeline.
|
||||
@@ -112,15 +165,14 @@ pub enum TimelineState {
|
||||
/// It cannot transition back into any other state.
|
||||
Stopping,
|
||||
/// The timeline is broken and not operational (previous states: Loading or Active).
|
||||
Broken,
|
||||
Broken { reason: String, backtrace: String },
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TimelineCreateRequest {
|
||||
#[serde(default)]
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub new_timeline_id: Option<TimelineId>,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub new_timeline_id: TimelineId,
|
||||
#[serde(default)]
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub ancestor_timeline_id: Option<TimelineId>,
|
||||
@@ -170,6 +222,7 @@ pub struct TenantConfig {
|
||||
pub eviction_policy: Option<serde_json::Value>,
|
||||
pub min_resident_size_override: Option<u64>,
|
||||
pub evictions_low_residence_duration_metric_threshold: Option<String>,
|
||||
pub gc_feedback: Option<bool>,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
@@ -228,17 +281,41 @@ impl TenantConfigRequest {
|
||||
eviction_policy: None,
|
||||
min_resident_size_override: None,
|
||||
evictions_low_residence_duration_metric_threshold: None,
|
||||
gc_feedback: None,
|
||||
};
|
||||
TenantConfigRequest { tenant_id, config }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct TenantAttachRequest {
|
||||
pub config: TenantAttachConfig,
|
||||
}
|
||||
|
||||
/// Newtype to enforce deny_unknown_fields on TenantConfig for
|
||||
/// its usage inside `TenantAttachRequest`.
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantAttachConfig {
|
||||
#[serde(flatten)]
|
||||
allowing_unknown_fields: TenantConfig,
|
||||
}
|
||||
|
||||
impl std::ops::Deref for TenantAttachConfig {
|
||||
type Target = TenantConfig;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.allowing_unknown_fields
|
||||
}
|
||||
}
|
||||
|
||||
/// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
#[serde(tag = "slug", content = "data", rename_all = "snake_case")]
|
||||
pub enum TenantAttachmentStatus {
|
||||
Maybe,
|
||||
Attached,
|
||||
Failed { reason: String },
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
@@ -732,7 +809,9 @@ mod tests {
|
||||
"slug": "Active",
|
||||
},
|
||||
"current_physical_size": 42,
|
||||
"attachment_status": "attached",
|
||||
"attachment_status": {
|
||||
"slug":"attached",
|
||||
}
|
||||
});
|
||||
|
||||
let original_broken = TenantInfo {
|
||||
@@ -754,7 +833,9 @@ mod tests {
|
||||
}
|
||||
},
|
||||
"current_physical_size": 42,
|
||||
"attachment_status": "attached",
|
||||
"attachment_status": {
|
||||
"slug":"attached",
|
||||
}
|
||||
});
|
||||
|
||||
assert_eq!(
|
||||
@@ -795,5 +876,68 @@ mod tests {
|
||||
"expect unknown field `unknown_field` error, got: {}",
|
||||
err
|
||||
);
|
||||
|
||||
let attach_request = json!({
|
||||
"config": {
|
||||
"unknown_field": "unknown_value".to_string(),
|
||||
},
|
||||
});
|
||||
let err = serde_json::from_value::<TenantAttachRequest>(attach_request).unwrap_err();
|
||||
assert!(
|
||||
err.to_string().contains("unknown field `unknown_field`"),
|
||||
"expect unknown field `unknown_field` error, got: {}",
|
||||
err
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn tenantstatus_activating_serde() {
|
||||
let states = [
|
||||
TenantState::Activating(ActivatingFrom::Loading),
|
||||
TenantState::Activating(ActivatingFrom::Attaching),
|
||||
];
|
||||
let expected = "[{\"slug\":\"Activating\",\"data\":\"Loading\"},{\"slug\":\"Activating\",\"data\":\"Attaching\"}]";
|
||||
|
||||
let actual = serde_json::to_string(&states).unwrap();
|
||||
|
||||
assert_eq!(actual, expected);
|
||||
|
||||
let parsed = serde_json::from_str::<Vec<TenantState>>(&actual).unwrap();
|
||||
|
||||
assert_eq!(states.as_slice(), &parsed);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn tenantstatus_activating_strum() {
|
||||
// tests added, because we use these for metrics
|
||||
let examples = [
|
||||
(line!(), TenantState::Loading, "Loading"),
|
||||
(line!(), TenantState::Attaching, "Attaching"),
|
||||
(
|
||||
line!(),
|
||||
TenantState::Activating(ActivatingFrom::Loading),
|
||||
"Activating",
|
||||
),
|
||||
(
|
||||
line!(),
|
||||
TenantState::Activating(ActivatingFrom::Attaching),
|
||||
"Activating",
|
||||
),
|
||||
(line!(), TenantState::Active, "Active"),
|
||||
(line!(), TenantState::Stopping, "Stopping"),
|
||||
(
|
||||
line!(),
|
||||
TenantState::Broken {
|
||||
reason: "Example".into(),
|
||||
backtrace: "Looooong backtrace".into(),
|
||||
},
|
||||
"Broken",
|
||||
),
|
||||
];
|
||||
|
||||
for (line, rendered, expected) in examples {
|
||||
let actual: &'static str = rendered.into();
|
||||
assert_eq!(actual, expected, "example on {line}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -24,7 +24,6 @@ workspace_hack.workspace = true
|
||||
[dev-dependencies]
|
||||
env_logger.workspace = true
|
||||
postgres.workspace = true
|
||||
wal_craft = { path = "wal_craft" }
|
||||
|
||||
[build-dependencies]
|
||||
anyhow.workspace = true
|
||||
|
||||
@@ -33,6 +33,7 @@ macro_rules! postgres_ffi {
|
||||
}
|
||||
pub mod controlfile_utils;
|
||||
pub mod nonrelfile_utils;
|
||||
pub mod wal_craft_test_export;
|
||||
pub mod waldecoder_handler;
|
||||
pub mod xlog_utils;
|
||||
|
||||
@@ -45,8 +46,15 @@ macro_rules! postgres_ffi {
|
||||
};
|
||||
}
|
||||
|
||||
postgres_ffi!(v14);
|
||||
postgres_ffi!(v15);
|
||||
#[macro_export]
|
||||
macro_rules! for_all_postgres_versions {
|
||||
($macro:tt) => {
|
||||
$macro!(v14);
|
||||
$macro!(v15);
|
||||
};
|
||||
}
|
||||
|
||||
for_all_postgres_versions! { postgres_ffi }
|
||||
|
||||
pub mod pg_constants;
|
||||
pub mod relfile_utils;
|
||||
|
||||
6
libs/postgres_ffi/src/wal_craft_test_export.rs
Normal file
6
libs/postgres_ffi/src/wal_craft_test_export.rs
Normal file
@@ -0,0 +1,6 @@
|
||||
//! This module is for WAL craft to test with postgres_ffi. Should not import any thing in normal usage.
|
||||
|
||||
pub use super::PG_MAJORVERSION;
|
||||
pub use super::xlog_utils::*;
|
||||
pub use super::bindings::*;
|
||||
pub use crate::WAL_SEGMENT_SIZE;
|
||||
@@ -481,220 +481,4 @@ pub fn encode_logical_message(prefix: &str, message: &str) -> Vec<u8> {
|
||||
wal
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::super::PG_MAJORVERSION;
|
||||
use super::*;
|
||||
use regex::Regex;
|
||||
use std::cmp::min;
|
||||
use std::fs;
|
||||
use std::{env, str::FromStr};
|
||||
use utils::const_assert;
|
||||
|
||||
fn init_logging() {
|
||||
let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(
|
||||
format!("wal_craft=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"),
|
||||
))
|
||||
.is_test(true)
|
||||
.try_init();
|
||||
}
|
||||
|
||||
fn test_end_of_wal<C: wal_craft::Crafter>(test_name: &str) {
|
||||
use wal_craft::*;
|
||||
|
||||
let pg_version = PG_MAJORVERSION[1..3].parse::<u32>().unwrap();
|
||||
|
||||
// Craft some WAL
|
||||
let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("..")
|
||||
.join("..");
|
||||
let cfg = Conf {
|
||||
pg_version,
|
||||
pg_distrib_dir: top_path.join("pg_install"),
|
||||
datadir: top_path.join(format!("test_output/{}-{PG_MAJORVERSION}", test_name)),
|
||||
};
|
||||
if cfg.datadir.exists() {
|
||||
fs::remove_dir_all(&cfg.datadir).unwrap();
|
||||
}
|
||||
cfg.initdb().unwrap();
|
||||
let srv = cfg.start_server().unwrap();
|
||||
let (intermediate_lsns, expected_end_of_wal_partial) =
|
||||
C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
|
||||
let intermediate_lsns: Vec<Lsn> = intermediate_lsns
|
||||
.iter()
|
||||
.map(|&lsn| u64::from(lsn).into())
|
||||
.collect();
|
||||
let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into();
|
||||
srv.kill();
|
||||
|
||||
// Check find_end_of_wal on the initial WAL
|
||||
let last_segment = cfg
|
||||
.wal_dir()
|
||||
.read_dir()
|
||||
.unwrap()
|
||||
.map(|f| f.unwrap().file_name().into_string().unwrap())
|
||||
.filter(|fname| IsXLogFileName(fname))
|
||||
.max()
|
||||
.unwrap();
|
||||
check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal);
|
||||
for start_lsn in intermediate_lsns
|
||||
.iter()
|
||||
.chain(std::iter::once(&expected_end_of_wal))
|
||||
{
|
||||
// Erase all WAL before `start_lsn` to ensure it's not used by `find_end_of_wal`.
|
||||
// We assume that `start_lsn` is non-decreasing.
|
||||
info!(
|
||||
"Checking with start_lsn={}, erasing WAL before it",
|
||||
start_lsn
|
||||
);
|
||||
for file in fs::read_dir(cfg.wal_dir()).unwrap().flatten() {
|
||||
let fname = file.file_name().into_string().unwrap();
|
||||
if !IsXLogFileName(&fname) {
|
||||
continue;
|
||||
}
|
||||
let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE);
|
||||
let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
|
||||
if seg_start_lsn > u64::from(*start_lsn) {
|
||||
continue;
|
||||
}
|
||||
let mut f = File::options().write(true).open(file.path()).unwrap();
|
||||
const ZEROS: [u8; WAL_SEGMENT_SIZE] = [0u8; WAL_SEGMENT_SIZE];
|
||||
f.write_all(
|
||||
&ZEROS[0..min(
|
||||
WAL_SEGMENT_SIZE,
|
||||
(u64::from(*start_lsn) - seg_start_lsn) as usize,
|
||||
)],
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
check_end_of_wal(&cfg, &last_segment, *start_lsn, expected_end_of_wal);
|
||||
}
|
||||
}
|
||||
|
||||
fn check_pg_waldump_end_of_wal(
|
||||
cfg: &wal_craft::Conf,
|
||||
last_segment: &str,
|
||||
expected_end_of_wal: Lsn,
|
||||
) {
|
||||
// Get the actual end of WAL by pg_waldump
|
||||
let waldump_output = cfg
|
||||
.pg_waldump("000000010000000000000001", last_segment)
|
||||
.unwrap()
|
||||
.stderr;
|
||||
let waldump_output = std::str::from_utf8(&waldump_output).unwrap();
|
||||
let caps = match Regex::new(r"invalid record length at (.+):")
|
||||
.unwrap()
|
||||
.captures(waldump_output)
|
||||
{
|
||||
Some(caps) => caps,
|
||||
None => {
|
||||
error!("Unable to parse pg_waldump's stderr:\n{}", waldump_output);
|
||||
panic!();
|
||||
}
|
||||
};
|
||||
let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
|
||||
info!(
|
||||
"waldump erred on {}, expected wal end at {}",
|
||||
waldump_wal_end, expected_end_of_wal
|
||||
);
|
||||
assert_eq!(waldump_wal_end, expected_end_of_wal);
|
||||
}
|
||||
|
||||
fn check_end_of_wal(
|
||||
cfg: &wal_craft::Conf,
|
||||
last_segment: &str,
|
||||
start_lsn: Lsn,
|
||||
expected_end_of_wal: Lsn,
|
||||
) {
|
||||
// Check end_of_wal on non-partial WAL segment (we treat it as fully populated)
|
||||
// let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
|
||||
// info!(
|
||||
// "find_end_of_wal returned wal_end={} with non-partial WAL segment",
|
||||
// wal_end
|
||||
// );
|
||||
// assert_eq!(wal_end, expected_end_of_wal_non_partial);
|
||||
|
||||
// Rename file to partial to actually find last valid lsn, then rename it back.
|
||||
fs::rename(
|
||||
cfg.wal_dir().join(last_segment),
|
||||
cfg.wal_dir().join(format!("{}.partial", last_segment)),
|
||||
)
|
||||
.unwrap();
|
||||
let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
|
||||
info!(
|
||||
"find_end_of_wal returned wal_end={} with partial WAL segment",
|
||||
wal_end
|
||||
);
|
||||
assert_eq!(wal_end, expected_end_of_wal);
|
||||
fs::rename(
|
||||
cfg.wal_dir().join(format!("{}.partial", last_segment)),
|
||||
cfg.wal_dir().join(last_segment),
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
const_assert!(WAL_SEGMENT_SIZE == 16 * 1024 * 1024);
|
||||
|
||||
#[test]
|
||||
pub fn test_find_end_of_wal_simple() {
|
||||
init_logging();
|
||||
test_end_of_wal::<wal_craft::Simple>("test_find_end_of_wal_simple");
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_find_end_of_wal_crossing_segment_followed_by_small_one() {
|
||||
init_logging();
|
||||
test_end_of_wal::<wal_craft::WalRecordCrossingSegmentFollowedBySmallOne>(
|
||||
"test_find_end_of_wal_crossing_segment_followed_by_small_one",
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_find_end_of_wal_last_crossing_segment() {
|
||||
init_logging();
|
||||
test_end_of_wal::<wal_craft::LastWalRecordCrossingSegment>(
|
||||
"test_find_end_of_wal_last_crossing_segment",
|
||||
);
|
||||
}
|
||||
|
||||
/// Check the math in update_next_xid
|
||||
///
|
||||
/// NOTE: These checks are sensitive to the value of XID_CHECKPOINT_INTERVAL,
|
||||
/// currently 1024.
|
||||
#[test]
|
||||
pub fn test_update_next_xid() {
|
||||
let checkpoint_buf = [0u8; std::mem::size_of::<CheckPoint>()];
|
||||
let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap();
|
||||
|
||||
checkpoint.nextXid = FullTransactionId { value: 10 };
|
||||
assert_eq!(checkpoint.nextXid.value, 10);
|
||||
|
||||
// The input XID gets rounded up to the next XID_CHECKPOINT_INTERVAL
|
||||
// boundary
|
||||
checkpoint.update_next_xid(100);
|
||||
assert_eq!(checkpoint.nextXid.value, 1024);
|
||||
|
||||
// No change
|
||||
checkpoint.update_next_xid(500);
|
||||
assert_eq!(checkpoint.nextXid.value, 1024);
|
||||
checkpoint.update_next_xid(1023);
|
||||
assert_eq!(checkpoint.nextXid.value, 1024);
|
||||
|
||||
// The function returns the *next* XID, given the highest XID seen so
|
||||
// far. So when we pass 1024, the nextXid gets bumped up to the next
|
||||
// XID_CHECKPOINT_INTERVAL boundary.
|
||||
checkpoint.update_next_xid(1024);
|
||||
assert_eq!(checkpoint.nextXid.value, 2048);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_encode_logical_message() {
|
||||
let expected = [
|
||||
64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255,
|
||||
38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114,
|
||||
101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
|
||||
];
|
||||
let actual = encode_logical_message("prefix", "message");
|
||||
assert_eq!(expected, actual[..]);
|
||||
}
|
||||
}
|
||||
// If you need to craft WAL and write tests for this module, put it at wal_craft crate.
|
||||
|
||||
@@ -15,3 +15,7 @@ postgres_ffi.workspace = true
|
||||
tempfile.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
regex.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
@@ -10,6 +10,20 @@ use std::process::Command;
|
||||
use std::time::{Duration, Instant};
|
||||
use tempfile::{tempdir, TempDir};
|
||||
|
||||
macro_rules! xlog_utils_test {
|
||||
($version:ident) => {
|
||||
#[path = "."]
|
||||
mod $version {
|
||||
pub use postgres_ffi::$version::wal_craft_test_export::*;
|
||||
#[allow(clippy::duplicate_mod)]
|
||||
#[cfg(test)]
|
||||
mod xlog_utils_test;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
postgres_ffi::for_all_postgres_versions! { xlog_utils_test }
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct Conf {
|
||||
pub pg_version: u32,
|
||||
|
||||
219
libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
Normal file
219
libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
Normal file
@@ -0,0 +1,219 @@
|
||||
//! Tests for postgres_ffi xlog_utils module. Put it here to break cyclic dependency.
|
||||
|
||||
use super::*;
|
||||
use crate::{error, info};
|
||||
use regex::Regex;
|
||||
use std::cmp::min;
|
||||
use std::fs::{self, File};
|
||||
use std::io::Write;
|
||||
use std::{env, str::FromStr};
|
||||
use utils::const_assert;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
fn init_logging() {
|
||||
let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(
|
||||
format!("crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"),
|
||||
))
|
||||
.is_test(true)
|
||||
.try_init();
|
||||
}
|
||||
|
||||
fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
|
||||
use crate::*;
|
||||
|
||||
let pg_version = PG_MAJORVERSION[1..3].parse::<u32>().unwrap();
|
||||
|
||||
// Craft some WAL
|
||||
let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("..")
|
||||
.join("..")
|
||||
.join("..");
|
||||
let cfg = Conf {
|
||||
pg_version,
|
||||
pg_distrib_dir: top_path.join("pg_install"),
|
||||
datadir: top_path.join(format!("test_output/{}-{PG_MAJORVERSION}", test_name)),
|
||||
};
|
||||
if cfg.datadir.exists() {
|
||||
fs::remove_dir_all(&cfg.datadir).unwrap();
|
||||
}
|
||||
cfg.initdb().unwrap();
|
||||
let srv = cfg.start_server().unwrap();
|
||||
let (intermediate_lsns, expected_end_of_wal_partial) =
|
||||
C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
|
||||
let intermediate_lsns: Vec<Lsn> = intermediate_lsns
|
||||
.iter()
|
||||
.map(|&lsn| u64::from(lsn).into())
|
||||
.collect();
|
||||
let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into();
|
||||
srv.kill();
|
||||
|
||||
// Check find_end_of_wal on the initial WAL
|
||||
let last_segment = cfg
|
||||
.wal_dir()
|
||||
.read_dir()
|
||||
.unwrap()
|
||||
.map(|f| f.unwrap().file_name().into_string().unwrap())
|
||||
.filter(|fname| IsXLogFileName(fname))
|
||||
.max()
|
||||
.unwrap();
|
||||
check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal);
|
||||
for start_lsn in intermediate_lsns
|
||||
.iter()
|
||||
.chain(std::iter::once(&expected_end_of_wal))
|
||||
{
|
||||
// Erase all WAL before `start_lsn` to ensure it's not used by `find_end_of_wal`.
|
||||
// We assume that `start_lsn` is non-decreasing.
|
||||
info!(
|
||||
"Checking with start_lsn={}, erasing WAL before it",
|
||||
start_lsn
|
||||
);
|
||||
for file in fs::read_dir(cfg.wal_dir()).unwrap().flatten() {
|
||||
let fname = file.file_name().into_string().unwrap();
|
||||
if !IsXLogFileName(&fname) {
|
||||
continue;
|
||||
}
|
||||
let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE);
|
||||
let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
|
||||
if seg_start_lsn > u64::from(*start_lsn) {
|
||||
continue;
|
||||
}
|
||||
let mut f = File::options().write(true).open(file.path()).unwrap();
|
||||
const ZEROS: [u8; WAL_SEGMENT_SIZE] = [0u8; WAL_SEGMENT_SIZE];
|
||||
f.write_all(
|
||||
&ZEROS[0..min(
|
||||
WAL_SEGMENT_SIZE,
|
||||
(u64::from(*start_lsn) - seg_start_lsn) as usize,
|
||||
)],
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
check_end_of_wal(&cfg, &last_segment, *start_lsn, expected_end_of_wal);
|
||||
}
|
||||
}
|
||||
|
||||
fn check_pg_waldump_end_of_wal(
|
||||
cfg: &crate::Conf,
|
||||
last_segment: &str,
|
||||
expected_end_of_wal: Lsn,
|
||||
) {
|
||||
// Get the actual end of WAL by pg_waldump
|
||||
let waldump_output = cfg
|
||||
.pg_waldump("000000010000000000000001", last_segment)
|
||||
.unwrap()
|
||||
.stderr;
|
||||
let waldump_output = std::str::from_utf8(&waldump_output).unwrap();
|
||||
let caps = match Regex::new(r"invalid record length at (.+):")
|
||||
.unwrap()
|
||||
.captures(waldump_output)
|
||||
{
|
||||
Some(caps) => caps,
|
||||
None => {
|
||||
error!("Unable to parse pg_waldump's stderr:\n{}", waldump_output);
|
||||
panic!();
|
||||
}
|
||||
};
|
||||
let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
|
||||
info!(
|
||||
"waldump erred on {}, expected wal end at {}",
|
||||
waldump_wal_end, expected_end_of_wal
|
||||
);
|
||||
assert_eq!(waldump_wal_end, expected_end_of_wal);
|
||||
}
|
||||
|
||||
fn check_end_of_wal(
|
||||
cfg: &crate::Conf,
|
||||
last_segment: &str,
|
||||
start_lsn: Lsn,
|
||||
expected_end_of_wal: Lsn,
|
||||
) {
|
||||
// Check end_of_wal on non-partial WAL segment (we treat it as fully populated)
|
||||
// let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
|
||||
// info!(
|
||||
// "find_end_of_wal returned wal_end={} with non-partial WAL segment",
|
||||
// wal_end
|
||||
// );
|
||||
// assert_eq!(wal_end, expected_end_of_wal_non_partial);
|
||||
|
||||
// Rename file to partial to actually find last valid lsn, then rename it back.
|
||||
fs::rename(
|
||||
cfg.wal_dir().join(last_segment),
|
||||
cfg.wal_dir().join(format!("{}.partial", last_segment)),
|
||||
)
|
||||
.unwrap();
|
||||
let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
|
||||
info!(
|
||||
"find_end_of_wal returned wal_end={} with partial WAL segment",
|
||||
wal_end
|
||||
);
|
||||
assert_eq!(wal_end, expected_end_of_wal);
|
||||
fs::rename(
|
||||
cfg.wal_dir().join(format!("{}.partial", last_segment)),
|
||||
cfg.wal_dir().join(last_segment),
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
const_assert!(WAL_SEGMENT_SIZE == 16 * 1024 * 1024);
|
||||
|
||||
#[test]
|
||||
pub fn test_find_end_of_wal_simple() {
|
||||
init_logging();
|
||||
test_end_of_wal::<crate::Simple>("test_find_end_of_wal_simple");
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_find_end_of_wal_crossing_segment_followed_by_small_one() {
|
||||
init_logging();
|
||||
test_end_of_wal::<crate::WalRecordCrossingSegmentFollowedBySmallOne>(
|
||||
"test_find_end_of_wal_crossing_segment_followed_by_small_one",
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_find_end_of_wal_last_crossing_segment() {
|
||||
init_logging();
|
||||
test_end_of_wal::<crate::LastWalRecordCrossingSegment>(
|
||||
"test_find_end_of_wal_last_crossing_segment",
|
||||
);
|
||||
}
|
||||
|
||||
/// Check the math in update_next_xid
|
||||
///
|
||||
/// NOTE: These checks are sensitive to the value of XID_CHECKPOINT_INTERVAL,
|
||||
/// currently 1024.
|
||||
#[test]
|
||||
pub fn test_update_next_xid() {
|
||||
let checkpoint_buf = [0u8; std::mem::size_of::<CheckPoint>()];
|
||||
let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap();
|
||||
|
||||
checkpoint.nextXid = FullTransactionId { value: 10 };
|
||||
assert_eq!(checkpoint.nextXid.value, 10);
|
||||
|
||||
// The input XID gets rounded up to the next XID_CHECKPOINT_INTERVAL
|
||||
// boundary
|
||||
checkpoint.update_next_xid(100);
|
||||
assert_eq!(checkpoint.nextXid.value, 1024);
|
||||
|
||||
// No change
|
||||
checkpoint.update_next_xid(500);
|
||||
assert_eq!(checkpoint.nextXid.value, 1024);
|
||||
checkpoint.update_next_xid(1023);
|
||||
assert_eq!(checkpoint.nextXid.value, 1024);
|
||||
|
||||
// The function returns the *next* XID, given the highest XID seen so
|
||||
// far. So when we pass 1024, the nextXid gets bumped up to the next
|
||||
// XID_CHECKPOINT_INTERVAL boundary.
|
||||
checkpoint.update_next_xid(1024);
|
||||
assert_eq!(checkpoint.nextXid.value, 2048);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_encode_logical_message() {
|
||||
let expected = [
|
||||
64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255,
|
||||
38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114,
|
||||
101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
|
||||
];
|
||||
let actual = encode_logical_message("prefix", "message");
|
||||
assert_eq!(expected, actual[..]);
|
||||
}
|
||||
@@ -111,6 +111,8 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
) -> Result<Download, DownloadError>;
|
||||
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;
|
||||
|
||||
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
|
||||
}
|
||||
|
||||
pub struct Download {
|
||||
@@ -223,6 +225,14 @@ impl GenericRemoteStorage {
|
||||
Self::Unreliable(s) => s.delete(path).await,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.delete_objects(paths).await,
|
||||
Self::AwsS3(s) => s.delete_objects(paths).await,
|
||||
Self::Unreliable(s) => s.delete_objects(paths).await,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl GenericRemoteStorage {
|
||||
|
||||
@@ -17,7 +17,7 @@ use tokio::{
|
||||
io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
|
||||
};
|
||||
use tracing::*;
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
|
||||
|
||||
use crate::{Download, DownloadError, RemotePath};
|
||||
|
||||
@@ -101,19 +101,35 @@ impl RemoteStorage for LocalFs {
|
||||
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
|
||||
None => Cow::Borrowed(&self.storage_root),
|
||||
};
|
||||
Ok(get_all_files(path.as_ref(), false)
|
||||
|
||||
let prefixes_to_filter = get_all_files(path.as_ref(), false)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?
|
||||
.into_iter()
|
||||
.map(|path| {
|
||||
path.strip_prefix(&self.storage_root)
|
||||
.context("Failed to strip preifix")
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let mut prefixes = Vec::with_capacity(prefixes_to_filter.len());
|
||||
|
||||
// filter out empty directories to mirror s3 behavior.
|
||||
for prefix in prefixes_to_filter {
|
||||
if prefix.is_dir()
|
||||
&& is_directory_empty(&prefix)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
prefixes.push(
|
||||
prefix
|
||||
.strip_prefix(&self.storage_root)
|
||||
.context("Failed to strip prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.expect(
|
||||
"We list files for storage root, hence should be able to remote the prefix",
|
||||
)
|
||||
})
|
||||
.collect())
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
Ok(prefixes)
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
@@ -291,11 +307,25 @@ impl RemoteStorage for LocalFs {
|
||||
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
||||
let file_path = path.with_base(&self.storage_root);
|
||||
if file_path.exists() && file_path.is_file() {
|
||||
Ok(fs::remove_file(file_path).await?)
|
||||
} else {
|
||||
bail!("File {file_path:?} either does not exist or is not a file")
|
||||
if !file_path.exists() {
|
||||
// See https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObject.html
|
||||
// > If there isn't a null version, Amazon S3 does not remove any objects but will still respond that the command was successful.
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if !file_path.is_file() {
|
||||
anyhow::bail!("{file_path:?} is not a file");
|
||||
}
|
||||
Ok(fs::remove_file(file_path)
|
||||
.await
|
||||
.map_err(|e| anyhow::anyhow!(e))?)
|
||||
}
|
||||
|
||||
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
|
||||
for path in paths {
|
||||
self.delete(path).await?
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -320,7 +350,7 @@ where
|
||||
let file_type = dir_entry.file_type().await?;
|
||||
let entry_path = dir_entry.path();
|
||||
if file_type.is_symlink() {
|
||||
debug!("{entry_path:?} us a symlink, skipping")
|
||||
debug!("{entry_path:?} is a symlink, skipping")
|
||||
} else if file_type.is_dir() {
|
||||
if recursive {
|
||||
paths.extend(get_all_files(&entry_path, true).await?.into_iter())
|
||||
@@ -595,15 +625,11 @@ mod fs_tests {
|
||||
storage.delete(&upload_target).await?;
|
||||
assert!(storage.list().await?.is_empty());
|
||||
|
||||
match storage.delete(&upload_target).await {
|
||||
Ok(()) => panic!("Should not allow deleting non-existing storage files"),
|
||||
Err(e) => {
|
||||
let error_string = e.to_string();
|
||||
assert!(error_string.contains("does not exist"));
|
||||
let expected_path = upload_target.with_base(&storage.storage_root);
|
||||
assert!(error_string.contains(expected_path.to_str().unwrap()));
|
||||
}
|
||||
}
|
||||
storage
|
||||
.delete(&upload_target)
|
||||
.await
|
||||
.expect("Should allow deleting non-existing storage files");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -17,6 +17,7 @@ use aws_sdk_s3::{
|
||||
error::SdkError,
|
||||
operation::get_object::GetObjectError,
|
||||
primitives::ByteStream,
|
||||
types::{Delete, ObjectIdentifier},
|
||||
Client,
|
||||
};
|
||||
use aws_smithy_http::body::SdkBody;
|
||||
@@ -81,12 +82,24 @@ pub(super) mod metrics {
|
||||
.inc();
|
||||
}
|
||||
|
||||
pub fn inc_delete_objects(count: u64) {
|
||||
S3_REQUESTS_COUNT
|
||||
.with_label_values(&["delete_object"])
|
||||
.inc_by(count);
|
||||
}
|
||||
|
||||
pub fn inc_delete_object_fail() {
|
||||
S3_REQUESTS_FAIL_COUNT
|
||||
.with_label_values(&["delete_object"])
|
||||
.inc();
|
||||
}
|
||||
|
||||
pub fn inc_delete_objects_fail(count: u64) {
|
||||
S3_REQUESTS_FAIL_COUNT
|
||||
.with_label_values(&["delete_object"])
|
||||
.inc_by(count);
|
||||
}
|
||||
|
||||
pub fn inc_list_objects() {
|
||||
S3_REQUESTS_COUNT.with_label_values(&["list_objects"]).inc();
|
||||
}
|
||||
@@ -396,6 +409,34 @@ impl RemoteStorage for S3Bucket {
|
||||
})
|
||||
.await
|
||||
}
|
||||
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.context("Concurrency limiter semaphore got closed during S3 delete")?;
|
||||
|
||||
let mut delete_objects = Vec::with_capacity(paths.len());
|
||||
for path in paths {
|
||||
let obj_id = ObjectIdentifier::builder()
|
||||
.set_key(Some(self.relative_path_to_s3_object(path)))
|
||||
.build();
|
||||
delete_objects.push(obj_id);
|
||||
}
|
||||
|
||||
metrics::inc_delete_objects(paths.len() as u64);
|
||||
self.client
|
||||
.delete_objects()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.delete(Delete::builder().set_objects(Some(delete_objects)).build())
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
metrics::inc_delete_objects_fail(paths.len() as u64);
|
||||
e
|
||||
})?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
||||
let _guard = self
|
||||
|
||||
@@ -119,4 +119,11 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
self.attempt(RemoteOp::Delete(path.clone()))?;
|
||||
self.inner.delete(path).await
|
||||
}
|
||||
|
||||
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
|
||||
for path in paths {
|
||||
self.delete(path).await?
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,6 +7,7 @@ use std::sync::Arc;
|
||||
use std::time::UNIX_EPOCH;
|
||||
|
||||
use anyhow::Context;
|
||||
use once_cell::sync::OnceCell;
|
||||
use remote_storage::{
|
||||
GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
|
||||
};
|
||||
@@ -14,8 +15,12 @@ use test_context::{test_context, AsyncTestContext};
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::{debug, error, info};
|
||||
|
||||
static LOGGING_DONE: OnceCell<()> = OnceCell::new();
|
||||
|
||||
const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
|
||||
|
||||
const BASE_PREFIX: &str = "test/";
|
||||
|
||||
/// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries.
|
||||
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
|
||||
/// See the client creation in [`create_s3_client`] for details on the required env vars.
|
||||
@@ -38,20 +43,20 @@ const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_
|
||||
///
|
||||
/// Lastly, the test attempts to clean up and remove all uploaded S3 files.
|
||||
/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
|
||||
#[test_context(MaybeEnabledS3)]
|
||||
#[test_context(MaybeEnabledS3WithTestBlobs)]
|
||||
#[tokio::test]
|
||||
async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
|
||||
async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3WithTestBlobs) -> anyhow::Result<()> {
|
||||
let ctx = match ctx {
|
||||
MaybeEnabledS3::Enabled(ctx) => ctx,
|
||||
MaybeEnabledS3::Disabled => return Ok(()),
|
||||
MaybeEnabledS3::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"),
|
||||
MaybeEnabledS3WithTestBlobs::Enabled(ctx) => ctx,
|
||||
MaybeEnabledS3WithTestBlobs::Disabled => return Ok(()),
|
||||
MaybeEnabledS3WithTestBlobs::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"),
|
||||
};
|
||||
|
||||
let test_client = Arc::clone(&ctx.client_with_excessive_pagination);
|
||||
let test_client = Arc::clone(&ctx.enabled.client);
|
||||
let expected_remote_prefixes = ctx.remote_prefixes.clone();
|
||||
|
||||
let base_prefix =
|
||||
RemotePath::new(Path::new(ctx.base_prefix_str)).context("common_prefix construction")?;
|
||||
let base_prefix = RemotePath::new(Path::new(ctx.enabled.base_prefix))
|
||||
.context("common_prefix construction")?;
|
||||
let root_remote_prefixes = test_client
|
||||
.list_prefixes(None)
|
||||
.await
|
||||
@@ -83,27 +88,122 @@ async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3) -> anyhow::Result<(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test_context(MaybeEnabledS3)]
|
||||
#[tokio::test]
|
||||
async fn s3_delete_non_exising_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
|
||||
let ctx = match ctx {
|
||||
MaybeEnabledS3::Enabled(ctx) => ctx,
|
||||
MaybeEnabledS3::Disabled => return Ok(()),
|
||||
};
|
||||
|
||||
let path = RemotePath::new(&PathBuf::from(format!(
|
||||
"{}/for_sure_there_is_nothing_there_really",
|
||||
ctx.base_prefix,
|
||||
)))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
ctx.client.delete(&path).await.expect("should succeed");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test_context(MaybeEnabledS3)]
|
||||
#[tokio::test]
|
||||
async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
|
||||
let ctx = match ctx {
|
||||
MaybeEnabledS3::Enabled(ctx) => ctx,
|
||||
MaybeEnabledS3::Disabled => return Ok(()),
|
||||
};
|
||||
|
||||
let path1 = RemotePath::new(&PathBuf::from(format!("{}/path1", ctx.base_prefix,)))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
let path2 = RemotePath::new(&PathBuf::from(format!("{}/path2", ctx.base_prefix,)))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
let data1 = "remote blob data1".as_bytes();
|
||||
let data1_len = data1.len();
|
||||
let data2 = "remote blob data2".as_bytes();
|
||||
let data2_len = data2.len();
|
||||
ctx.client
|
||||
.upload(std::io::Cursor::new(data1), data1_len, &path1, None)
|
||||
.await?;
|
||||
|
||||
ctx.client
|
||||
.upload(std::io::Cursor::new(data2), data2_len, &path2, None)
|
||||
.await?;
|
||||
|
||||
ctx.client.delete_objects(&[path1, path2]).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn ensure_logging_ready() {
|
||||
LOGGING_DONE.get_or_init(|| {
|
||||
utils::logging::init(
|
||||
utils::logging::LogFormat::Test,
|
||||
utils::logging::TracingErrorLayerEnablement::Disabled,
|
||||
)
|
||||
.expect("logging init failed");
|
||||
});
|
||||
}
|
||||
|
||||
struct EnabledS3 {
|
||||
client: Arc<GenericRemoteStorage>,
|
||||
base_prefix: &'static str,
|
||||
}
|
||||
|
||||
impl EnabledS3 {
|
||||
async fn setup(max_keys_in_list_response: Option<i32>) -> Self {
|
||||
let client = create_s3_client(max_keys_in_list_response)
|
||||
.context("S3 client creation")
|
||||
.expect("S3 client creation failed");
|
||||
|
||||
EnabledS3 {
|
||||
client,
|
||||
base_prefix: BASE_PREFIX,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
enum MaybeEnabledS3 {
|
||||
Enabled(EnabledS3),
|
||||
Disabled,
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl AsyncTestContext for MaybeEnabledS3 {
|
||||
async fn setup() -> Self {
|
||||
ensure_logging_ready();
|
||||
|
||||
if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
|
||||
info!(
|
||||
"`{}` env variable is not set, skipping the test",
|
||||
ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME
|
||||
);
|
||||
return Self::Disabled;
|
||||
}
|
||||
|
||||
Self::Enabled(EnabledS3::setup(None).await)
|
||||
}
|
||||
}
|
||||
|
||||
enum MaybeEnabledS3WithTestBlobs {
|
||||
Enabled(S3WithTestBlobs),
|
||||
Disabled,
|
||||
UploadsFailed(anyhow::Error, S3WithTestBlobs),
|
||||
}
|
||||
|
||||
struct S3WithTestBlobs {
|
||||
client_with_excessive_pagination: Arc<GenericRemoteStorage>,
|
||||
base_prefix_str: &'static str,
|
||||
enabled: EnabledS3,
|
||||
remote_prefixes: HashSet<RemotePath>,
|
||||
remote_blobs: HashSet<RemotePath>,
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl AsyncTestContext for MaybeEnabledS3 {
|
||||
impl AsyncTestContext for MaybeEnabledS3WithTestBlobs {
|
||||
async fn setup() -> Self {
|
||||
utils::logging::init(
|
||||
utils::logging::LogFormat::Test,
|
||||
utils::logging::TracingErrorLayerEnablement::Disabled,
|
||||
)
|
||||
.expect("logging init failed");
|
||||
ensure_logging_ready();
|
||||
if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
|
||||
info!(
|
||||
"`{}` env variable is not set, skipping the test",
|
||||
@@ -115,23 +215,14 @@ impl AsyncTestContext for MaybeEnabledS3 {
|
||||
let max_keys_in_list_response = 10;
|
||||
let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());
|
||||
|
||||
let client_with_excessive_pagination = create_s3_client(max_keys_in_list_response)
|
||||
.context("S3 client creation")
|
||||
.expect("S3 client creation failed");
|
||||
let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await;
|
||||
|
||||
let base_prefix_str = "test/";
|
||||
match upload_s3_data(
|
||||
&client_with_excessive_pagination,
|
||||
base_prefix_str,
|
||||
upload_tasks_count,
|
||||
)
|
||||
.await
|
||||
{
|
||||
match upload_s3_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await {
|
||||
ControlFlow::Continue(uploads) => {
|
||||
info!("Remote objects created successfully");
|
||||
|
||||
Self::Enabled(S3WithTestBlobs {
|
||||
client_with_excessive_pagination,
|
||||
base_prefix_str,
|
||||
enabled,
|
||||
remote_prefixes: uploads.prefixes,
|
||||
remote_blobs: uploads.blobs,
|
||||
})
|
||||
@@ -139,8 +230,7 @@ impl AsyncTestContext for MaybeEnabledS3 {
|
||||
ControlFlow::Break(uploads) => Self::UploadsFailed(
|
||||
anyhow::anyhow!("One or multiple blobs failed to upload to S3"),
|
||||
S3WithTestBlobs {
|
||||
client_with_excessive_pagination,
|
||||
base_prefix_str,
|
||||
enabled,
|
||||
remote_prefixes: uploads.prefixes,
|
||||
remote_blobs: uploads.blobs,
|
||||
},
|
||||
@@ -152,13 +242,15 @@ impl AsyncTestContext for MaybeEnabledS3 {
|
||||
match self {
|
||||
Self::Disabled => {}
|
||||
Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => {
|
||||
cleanup(&ctx.client_with_excessive_pagination, ctx.remote_blobs).await;
|
||||
cleanup(&ctx.enabled.client, ctx.remote_blobs).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn create_s3_client(max_keys_per_list_response: i32) -> anyhow::Result<Arc<GenericRemoteStorage>> {
|
||||
fn create_s3_client(
|
||||
max_keys_per_list_response: Option<i32>,
|
||||
) -> anyhow::Result<Arc<GenericRemoteStorage>> {
|
||||
let remote_storage_s3_bucket = env::var("REMOTE_STORAGE_S3_BUCKET")
|
||||
.context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?;
|
||||
let remote_storage_s3_region = env::var("REMOTE_STORAGE_S3_REGION")
|
||||
@@ -176,7 +268,7 @@ fn create_s3_client(max_keys_per_list_response: i32) -> anyhow::Result<Arc<Gener
|
||||
prefix_in_bucket: Some(format!("pagination_should_work_test_{random_prefix_part}/")),
|
||||
endpoint: None,
|
||||
concurrency_limit: NonZeroUsize::new(100).unwrap(),
|
||||
max_keys_per_list_response: Some(max_keys_per_list_response),
|
||||
max_keys_per_list_response,
|
||||
}),
|
||||
};
|
||||
Ok(Arc::new(
|
||||
33
libs/utils/src/completion.rs
Normal file
@@ -0,0 +1,33 @@
use std::sync::Arc;

use tokio::sync::{mpsc, Mutex};

/// While a reference is kept around, the associated [`Barrier::wait`] will wait.
///
/// Can be cloned, moved and kept around in futures as "guard objects".
#[derive(Clone)]
pub struct Completion(mpsc::Sender<()>);

/// Barrier will wait until all clones of [`Completion`] have been dropped.
#[derive(Clone)]
pub struct Barrier(Arc<Mutex<mpsc::Receiver<()>>>);

impl Barrier {
    pub async fn wait(self) {
        self.0.lock().await.recv().await;
    }

    pub async fn maybe_wait(barrier: Option<Barrier>) {
        if let Some(b) = barrier {
            b.wait().await
        }
    }
}

/// Create new Guard and Barrier pair.
pub fn channel() -> (Completion, Barrier) {
    let (tx, rx) = mpsc::channel::<()>(1);
    let rx = Mutex::new(rx);
    let rx = Arc::new(rx);
    (Completion(tx), Barrier(rx))
}
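The `Completion`/`Barrier` pair above is a small once-only gate: `Barrier::wait` returns only after every clone of the matching `Completion` has been dropped. A minimal usage sketch, assuming the `utils` crate from this change is available as a dependency (the gated task and its printout are illustrative only):

```rust
use utils::completion;

#[tokio::main]
async fn main() {
    let (completion, barrier) = completion::channel();

    // A job that must not start until startup work is finished.
    let gated_job = tokio::spawn({
        let barrier = barrier.clone();
        async move {
            barrier.wait().await;
            println!("background job released");
        }
    });

    // ... perform startup work while at least one `Completion` clone is alive ...

    // Dropping the last clone of `completion` releases every waiter at once.
    drop(completion);
    gated_job.await.unwrap();
}
```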
@@ -1,6 +1,8 @@
/// Extensions to `std::fs` types.
use std::{fs, io, path::Path};

use anyhow::Context;

pub trait PathExt {
    /// Returns an error if `self` is not a directory.
    fn is_empty_dir(&self) -> io::Result<bool>;
@@ -15,10 +17,19 @@ where
    }
}

pub async fn is_directory_empty(path: impl AsRef<Path>) -> anyhow::Result<bool> {
    let mut dir = tokio::fs::read_dir(&path)
        .await
        .context(format!("read_dir({})", path.as_ref().display()))?;
    Ok(dir.next_entry().await?.is_none())
}

#[cfg(test)]
mod test {
    use std::path::PathBuf;

    use crate::fs_ext::is_directory_empty;

    #[test]
    fn is_empty_dir() {
        use super::PathExt;
@@ -42,4 +53,26 @@ mod test {
        std::fs::remove_file(&file_path).unwrap();
        assert!(file_path.is_empty_dir().is_err());
    }

    #[tokio::test]
    async fn is_empty_dir_async() {
        let dir = tempfile::tempdir().unwrap();
        let dir_path = dir.path();

        // test positive case
        assert!(
            is_directory_empty(dir_path).await.expect("test failure"),
            "new tempdir should be empty"
        );

        // invoke on a file to ensure it returns an error
        let file_path: PathBuf = dir_path.join("testfile");
        let f = std::fs::File::create(&file_path).unwrap();
        drop(f);
        assert!(is_directory_empty(&file_path).await.is_err());

        // do it again on a path, we know to be nonexistent
        std::fs::remove_file(&file_path).unwrap();
        assert!(is_directory_empty(file_path).await.is_err());
    }
}

@@ -1,23 +1,20 @@
|
||||
use crate::auth::{Claims, JwtAuth};
|
||||
use crate::http::error;
|
||||
use anyhow::{anyhow, Context};
|
||||
use crate::http::error::{api_error_handler, route_error_handler, ApiError};
|
||||
use anyhow::Context;
|
||||
use hyper::header::{HeaderName, AUTHORIZATION};
|
||||
use hyper::http::HeaderValue;
|
||||
use hyper::Method;
|
||||
use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server};
|
||||
use hyper::{header::CONTENT_TYPE, Body, Request, Response};
|
||||
use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
|
||||
use once_cell::sync::Lazy;
|
||||
use routerify::ext::RequestExt;
|
||||
use routerify::{Middleware, RequestInfo, Router, RouterBuilder, RouterService};
|
||||
use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
|
||||
use tokio::task::JoinError;
|
||||
use tracing::{self, debug, info, info_span, warn, Instrument};
|
||||
|
||||
use std::future::Future;
|
||||
use std::net::TcpListener;
|
||||
use std::str::FromStr;
|
||||
|
||||
use super::error::ApiError;
|
||||
|
||||
static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"libmetrics_metric_handler_requests_total",
|
||||
@@ -35,8 +32,18 @@ struct RequestId(String);
|
||||
/// Adds a tracing info_span! instrumentation around the handler events,
|
||||
/// logs the request start and end events for non-GET requests and non-200 responses.
|
||||
///
|
||||
/// Usage: Replace `my_handler` with `|r| request_span(r, my_handler)`
|
||||
///
|
||||
/// Use this to distinguish between logs of different HTTP requests: every request handler wrapped
|
||||
/// in this type will get request info logged in the wrapping span, including the unique request ID.
|
||||
/// with this will get request info logged in the wrapping span, including the unique request ID.
|
||||
///
|
||||
/// This also handles errors, logging them and converting them to an HTTP error response.
|
||||
///
|
||||
/// NB: If the client disconnects, Hyper will drop the Future, without polling it to
|
||||
/// completion. In other words, the handler must be async cancellation safe! request_span
|
||||
/// prints a warning to the log when that happens, so that you have some trace of it in
|
||||
/// the log.
|
||||
///
|
||||
///
|
||||
/// There could be other ways to implement similar functionality:
|
||||
///
|
||||
@@ -54,60 +61,56 @@ struct RequestId(String);
|
||||
/// tries to achive with its `.instrument` used in the current approach.
|
||||
///
|
||||
/// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced.
|
||||
pub struct RequestSpan<E, R, H>(pub H)
|
||||
pub async fn request_span<R, H>(request: Request<Body>, handler: H) -> R::Output
|
||||
where
|
||||
E: Into<Box<dyn std::error::Error + Send + Sync>> + 'static,
|
||||
R: Future<Output = Result<Response<Body>, E>> + Send + 'static,
|
||||
H: Fn(Request<Body>) -> R + Send + Sync + 'static;
|
||||
|
||||
impl<E, R, H> RequestSpan<E, R, H>
|
||||
where
|
||||
E: Into<Box<dyn std::error::Error + Send + Sync>> + 'static,
|
||||
R: Future<Output = Result<Response<Body>, E>> + Send + 'static,
|
||||
H: Fn(Request<Body>) -> R + Send + Sync + 'static,
|
||||
R: Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
|
||||
H: FnOnce(Request<Body>) -> R + Send + Sync + 'static,
|
||||
{
|
||||
/// Creates a tracing span around inner request handler and executes the request handler in the contex of that span.
|
||||
/// Use as `|r| RequestSpan(my_handler).handle(r)` instead of `my_handler` as the request handler to get the span enabled.
|
||||
pub async fn handle(self, request: Request<Body>) -> Result<Response<Body>, E> {
|
||||
let request_id = request.context::<RequestId>().unwrap_or_default().0;
|
||||
let method = request.method();
|
||||
let path = request.uri().path();
|
||||
let request_span = info_span!("request", %method, %path, %request_id);
|
||||
let request_id = request.context::<RequestId>().unwrap_or_default().0;
|
||||
let method = request.method();
|
||||
let path = request.uri().path();
|
||||
let request_span = info_span!("request", %method, %path, %request_id);
|
||||
|
||||
let log_quietly = method == Method::GET;
|
||||
async move {
|
||||
let cancellation_guard = RequestCancelled::warn_when_dropped_without_responding();
|
||||
if log_quietly {
|
||||
debug!("Handling request");
|
||||
} else {
|
||||
info!("Handling request");
|
||||
}
|
||||
|
||||
// Note that we reuse `error::handler` here and not returning and error at all,
|
||||
// yet cannot use `!` directly in the method signature due to `routerify::RouterBuilder` limitation.
|
||||
// Usage of the error handler also means that we expect only the `ApiError` errors to be raised in this call.
|
||||
//
|
||||
// Panics are not handled separately, there's a `tracing_panic_hook` from another module to do that globally.
|
||||
let res = (self.0)(request).await;
|
||||
|
||||
cancellation_guard.disarm();
|
||||
|
||||
match res {
|
||||
Ok(response) => {
|
||||
let response_status = response.status();
|
||||
if log_quietly && response_status.is_success() {
|
||||
debug!("Request handled, status: {response_status}");
|
||||
} else {
|
||||
info!("Request handled, status: {response_status}");
|
||||
}
|
||||
Ok(response)
|
||||
}
|
||||
Err(e) => Ok(error::handler(e.into()).await),
|
||||
}
|
||||
let log_quietly = method == Method::GET;
|
||||
async move {
|
||||
let cancellation_guard = RequestCancelled::warn_when_dropped_without_responding();
|
||||
if log_quietly {
|
||||
debug!("Handling request");
|
||||
} else {
|
||||
info!("Handling request");
|
||||
}
|
||||
|
||||
// No special handling for panics here. There's a `tracing_panic_hook` from another
|
||||
// module to do that globally.
|
||||
let res = handler(request).await;
|
||||
|
||||
cancellation_guard.disarm();
|
||||
|
||||
// Log the result if needed.
|
||||
//
|
||||
// We also convert any errors into an Ok response with HTTP error code here.
|
||||
// `make_router` sets a last-resort error handler that would do the same, but
|
||||
// we prefer to do it here, before we exit the request span, so that the error
|
||||
// is still logged with the span.
|
||||
//
|
||||
// (Because we convert errors to Ok response, we never actually return an error,
|
||||
// and we could declare the function to return the never type (`!`). However,
|
||||
// using `routerify::RouterBuilder` requires a proper error type.)
|
||||
match res {
|
||||
Ok(response) => {
|
||||
let response_status = response.status();
|
||||
if log_quietly && response_status.is_success() {
|
||||
debug!("Request handled, status: {response_status}");
|
||||
} else {
|
||||
info!("Request handled, status: {response_status}");
|
||||
}
|
||||
Ok(response)
|
||||
}
|
||||
Err(err) => Ok(api_error_handler(err)),
|
||||
}
|
||||
.instrument(request_span)
|
||||
.await
|
||||
}
|
||||
.instrument(request_span)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Drop guard to WARN in case the request was dropped before completion.
|
||||
@@ -207,10 +210,8 @@ pub fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
|
||||
.middleware(Middleware::post_with_info(
|
||||
add_request_id_header_to_response,
|
||||
))
|
||||
.get("/metrics", |r| {
|
||||
RequestSpan(prometheus_metrics_handler).handle(r)
|
||||
})
|
||||
.err_handler(error::handler)
|
||||
.get("/metrics", |r| request_span(r, prometheus_metrics_handler))
|
||||
.err_handler(route_error_handler)
|
||||
}
|
||||
|
||||
pub fn attach_openapi_ui(
|
||||
@@ -220,12 +221,14 @@ pub fn attach_openapi_ui(
|
||||
ui_mount_path: &'static str,
|
||||
) -> RouterBuilder<hyper::Body, ApiError> {
|
||||
router_builder
|
||||
.get(spec_mount_path, move |r| {
|
||||
RequestSpan(move |_| async move { Ok(Response::builder().body(Body::from(spec)).unwrap()) })
|
||||
.handle(r)
|
||||
})
|
||||
.get(ui_mount_path, move |r| RequestSpan( move |_| async move {
|
||||
Ok(Response::builder().body(Body::from(format!(r#"
|
||||
.get(spec_mount_path,
|
||||
move |r| request_span(r, move |_| async move {
|
||||
Ok(Response::builder().body(Body::from(spec)).unwrap())
|
||||
})
|
||||
)
|
||||
.get(ui_mount_path,
|
||||
move |r| request_span(r, move |_| async move {
|
||||
Ok(Response::builder().body(Body::from(format!(r#"
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
@@ -255,7 +258,8 @@ pub fn attach_openapi_ui(
|
||||
</body>
|
||||
</html>
|
||||
"#, spec_mount_path))).unwrap())
|
||||
}).handle(r))
|
||||
})
|
||||
)
|
||||
}
|
||||
|
||||
fn parse_token(header_value: &str) -> Result<&str, ApiError> {
|
||||
@@ -343,40 +347,6 @@ pub fn check_permission_with(
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Start listening for HTTP requests on given socket.
|
||||
///
|
||||
/// 'shutdown_future' can be used to stop. If the Future becomes
|
||||
/// ready, we stop listening for new requests, and the function returns.
|
||||
///
|
||||
pub fn serve_thread_main<S>(
|
||||
router_builder: RouterBuilder<hyper::Body, ApiError>,
|
||||
listener: TcpListener,
|
||||
shutdown_future: S,
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
S: Future<Output = ()> + Send + Sync,
|
||||
{
|
||||
info!("Starting an HTTP endpoint at {}", listener.local_addr()?);
|
||||
|
||||
// Create a Service from the router above to handle incoming requests.
|
||||
let service = RouterService::new(router_builder.build().map_err(|err| anyhow!(err))?).unwrap();
|
||||
|
||||
// Enter a single-threaded tokio runtime bound to the current thread
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()?;
|
||||
|
||||
let _guard = runtime.enter();
|
||||
|
||||
let server = Server::from_tcp(listener)?
|
||||
.serve(service)
|
||||
.with_graceful_shutdown(shutdown_future);
|
||||
|
||||
runtime.block_on(server)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
@@ -21,7 +21,7 @@ pub enum ApiError {
    Conflict(String),

    #[error("Precondition failed: {0}")]
    PreconditionFailed(&'static str),
    PreconditionFailed(Box<str>),

    #[error(transparent)]
    InternalServerError(anyhow::Error),
@@ -83,13 +83,24 @@ impl HttpErrorBody {
    }
}

pub async fn handler(err: routerify::RouteError) -> Response<Body> {
    let api_error = err
        .downcast::<ApiError>()
        .expect("handler should always return api error");
pub async fn route_error_handler(err: routerify::RouteError) -> Response<Body> {
    match err.downcast::<ApiError>() {
        Ok(api_error) => api_error_handler(*api_error),
        Err(other_error) => {
            // We expect all the request handlers to return an ApiError, so this should
            // not be reached. But just in case.
            error!("Error processing HTTP request: {other_error:?}");
            HttpErrorBody::response_from_msg_and_status(
                other_error.to_string(),
                StatusCode::INTERNAL_SERVER_ERROR,
            )
        }
    }
}

pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
    // Print a stack trace for Internal Server errors
    if let ApiError::InternalServerError(_) = api_error.as_ref() {
    if let ApiError::InternalServerError(_) = api_error {
        error!("Error processing HTTP request: {api_error:?}");
    } else {
        error!("Error processing HTTP request: {api_error:#}");

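Switching `PreconditionFailed` from `&'static str` to `Box<str>` lets callers build the message at runtime without leaking memory. A standalone sketch of the pattern, using a hypothetical error enum with `thiserror` assumed as a dependency (not the pageserver's actual `ApiError`):

```rust
use thiserror::Error;

#[derive(Debug, Error)]
enum SketchError {
    // Box<str> keeps the variant small while still owning a dynamic message.
    #[error("Precondition failed: {0}")]
    PreconditionFailed(Box<str>),
}

fn timeline_has_children(timeline_id: u32) -> SketchError {
    SketchError::PreconditionFailed(
        format!("timeline {timeline_id} still has child timelines").into_boxed_str(),
    )
}

fn main() {
    let err = timeline_has_children(42);
    assert_eq!(
        err.to_string(),
        "Precondition failed: timeline 42 still has child timelines"
    );
}
```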
@@ -8,12 +8,26 @@ use super::error::ApiError;
pub async fn json_request<T: for<'de> Deserialize<'de>>(
    request: &mut Request<Body>,
) -> Result<T, ApiError> {
    let whole_body = hyper::body::aggregate(request.body_mut())
    json_request_or_empty_body(request)
        .await?
        .context("missing request body")
        .map_err(ApiError::BadRequest)
}

/// Will be removed as part of https://github.com/neondatabase/neon/issues/4282
pub async fn json_request_or_empty_body<T: for<'de> Deserialize<'de>>(
    request: &mut Request<Body>,
) -> Result<Option<T>, ApiError> {
    let body = hyper::body::aggregate(request.body_mut())
        .await
        .context("Failed to read request body")
        .map_err(ApiError::BadRequest)?;
    serde_json::from_reader(whole_body.reader())
    if body.remaining() == 0 {
        return Ok(None);
    }
    serde_json::from_reader(body.reader())
        .context("Failed to parse json request")
        .map(Some)
        .map_err(ApiError::BadRequest)
}

@@ -60,6 +60,9 @@ pub mod tracing_span_assert

pub mod rate_limit;

/// Simple once-barrier and a guard which keeps barrier awaiting.
pub mod completion;

mod failpoint_macro_helpers {

    /// use with fail::cfg("$name", "return(2000)")

@@ -33,7 +33,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
        min_lsn = min(min_lsn, lsn_range.start);
        max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1));

        updates.insert_historic(Arc::new(layer));
        updates.insert_historic(layer.get_persistent_layer_desc(), Arc::new(layer));
    }

    println!("min: {min_lsn}, max: {max_lsn}");
@@ -215,7 +215,7 @@ fn bench_sequential(c: &mut Criterion) {
            is_incremental: false,
            short_id: format!("Layer {}", i),
        };
        updates.insert_historic(Arc::new(layer));
        updates.insert_historic(layer.get_persistent_layer_desc(), Arc::new(layer));
    }
    updates.flush();
    println!("Finished layer map init in {:?}", now.elapsed());

18
pageserver/ctl/Cargo.toml
Normal file
@@ -0,0 +1,18 @@
[package]
name = "pagectl"
version = "0.1.0"
edition.workspace = true
license.workspace = true

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
anyhow.workspace = true
bytes.workspace = true
clap = { workspace = true, features = ["string"] }
git-version.workspace = true
pageserver = { path = ".." }
postgres_ffi.workspace = true
utils.workspace = true
svg_fmt.workspace = true
workspace_hack.workspace = true
@@ -12,7 +12,7 @@
//! Example use:
//! ```
//! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
//!   $ grep "__" | cargo run --release --bin draw_timeline_dir > out.svg
//!   $ grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
//! $ firefox out.svg
//! ```
//!
@@ -62,7 +62,7 @@ fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
    (keys, lsns)
}

fn main() -> Result<()> {
pub fn main() -> Result<()> {
    // Parse layer filenames from stdin
    let mut ranges: Vec<(Range<Key>, Range<Lsn>)> = vec![];
    let stdin = io::stdin();
@@ -6,7 +6,7 @@ use anyhow::Result;
use std::cmp::Ordering;
use std::collections::BinaryHeap;
use std::ops::Range;
use std::{env, fs, path::Path, path::PathBuf, str, str::FromStr};
use std::{fs, path::Path, str};

use pageserver::page_cache::PAGE_SZ;
use pageserver::repository::{Key, KEY_SIZE};
@@ -18,12 +18,14 @@ use pageserver::virtual_file::VirtualFile;

use utils::{bin_ser::BeSer, lsn::Lsn};

use crate::AnalyzeLayerMapCmd;

const MIN_HOLE_LENGTH: i128 = (128 * 1024 * 1024 / PAGE_SZ) as i128;
const DEFAULT_MAX_HOLES: usize = 10;

/// Wrapper for key range to provide reverse ordering by range length for BinaryHeap
#[derive(PartialEq, Eq)]
struct Hole(Range<Key>);
pub struct Hole(Range<Key>);

impl Ord for Hole {
    fn cmp(&self, other: &Self) -> Ordering {
@@ -39,11 +41,11 @@ impl PartialOrd for Hole {
    }
}

struct LayerFile {
    key_range: Range<Key>,
    lsn_range: Range<Lsn>,
    is_delta: bool,
    holes: Vec<Hole>,
pub(crate) struct LayerFile {
    pub key_range: Range<Key>,
    pub lsn_range: Range<Lsn>,
    pub is_delta: bool,
    pub holes: Vec<Hole>,
}

impl LayerFile {
@@ -67,7 +69,7 @@ impl LayerFile {
    }
}

fn parse_filename(name: &str) -> Option<LayerFile> {
pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
    let split: Vec<&str> = name.split("__").collect();
    if split.len() != 2 {
        return None;
@@ -127,18 +129,9 @@ fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
    Ok(holes)
}

fn main() -> Result<()> {
    let args: Vec<String> = env::args().collect();
    if args.len() < 2 {
        println!("Usage: layer_map_analyzer PAGESERVER_DATA_DIR [MAX_HOLES]");
        return Ok(());
    }
    let storage_path = PathBuf::from_str(&args[1])?;
    let max_holes = if args.len() > 2 {
        args[2].parse::<usize>().unwrap()
    } else {
        DEFAULT_MAX_HOLES
    };
pub(crate) fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
    let storage_path = &cmd.path;
    let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES);

    // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
    pageserver::virtual_file::init(10);
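The `Hole` wrapper above inverts the ordering by range length so that Rust's max-heap `BinaryHeap` behaves like a min-heap: pushing every gap and popping whenever the heap exceeds `max_holes` leaves only the largest holes. A self-contained sketch of that technique, using plain `u64` ranges in place of the pageserver `Key` type:

```rust
use std::cmp::Ordering;
use std::collections::BinaryHeap;
use std::ops::Range;

/// Orders holes by *descending* length, so the heap's "greatest" element
/// is the shortest hole and `pop` always evicts it.
#[derive(PartialEq, Eq)]
struct Hole(Range<u64>);

impl Ord for Hole {
    fn cmp(&self, other: &Self) -> Ordering {
        let self_len = self.0.end - self.0.start;
        let other_len = other.0.end - other.0.start;
        other_len.cmp(&self_len)
    }
}

impl PartialOrd for Hole {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

/// Keep only the `max_holes` longest gaps out of an arbitrary stream of gaps.
fn largest_holes(gaps: impl IntoIterator<Item = Range<u64>>, max_holes: usize) -> Vec<Range<u64>> {
    let mut heap = BinaryHeap::new();
    for gap in gaps {
        heap.push(Hole(gap));
        if heap.len() > max_holes {
            heap.pop(); // discards the currently shortest hole
        }
    }
    heap.into_iter().map(|hole| hole.0).collect()
}

fn main() {
    let kept = largest_holes(vec![0..10, 20..25, 30..100, 200..201], 2);
    assert_eq!(kept.len(), 2);
    assert!(kept.contains(&(30..100)));
    assert!(kept.contains(&(0..10)));
}
```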
169
pageserver/ctl/src/layers.rs
Normal file
@@ -0,0 +1,169 @@
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use anyhow::Result;
|
||||
use clap::Subcommand;
|
||||
use pageserver::tenant::block_io::BlockCursor;
|
||||
use pageserver::tenant::disk_btree::DiskBtreeReader;
|
||||
use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary};
|
||||
use pageserver::{page_cache, virtual_file};
|
||||
use pageserver::{
|
||||
repository::{Key, KEY_SIZE},
|
||||
tenant::{
|
||||
block_io::FileBlockReader, disk_btree::VisitDirection,
|
||||
storage_layer::delta_layer::DELTA_KEY_SIZE,
|
||||
},
|
||||
virtual_file::VirtualFile,
|
||||
};
|
||||
use std::fs;
|
||||
use utils::bin_ser::BeSer;
|
||||
|
||||
use crate::layer_map_analyzer::parse_filename;
|
||||
|
||||
#[derive(Subcommand)]
|
||||
pub(crate) enum LayerCmd {
|
||||
/// List all tenants and timelines under the pageserver path
|
||||
///
|
||||
/// Example: `cargo run --bin pagectl layer list .neon/`
|
||||
List { path: PathBuf },
|
||||
/// List all layers of a given tenant and timeline
|
||||
///
|
||||
/// Example: `cargo run --bin pagectl layer list .neon/`
|
||||
ListLayer {
|
||||
path: PathBuf,
|
||||
tenant: String,
|
||||
timeline: String,
|
||||
},
|
||||
/// Dump all information of a layer file
|
||||
DumpLayer {
|
||||
path: PathBuf,
|
||||
tenant: String,
|
||||
timeline: String,
|
||||
/// The id from list-layer command
|
||||
id: usize,
|
||||
},
|
||||
}
|
||||
|
||||
fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
|
||||
use pageserver::tenant::blob_io::BlobCursor;
|
||||
use pageserver::tenant::block_io::BlockReader;
|
||||
|
||||
let path = path.as_ref();
|
||||
virtual_file::init(10);
|
||||
page_cache::init(100);
|
||||
let file = FileBlockReader::new(VirtualFile::open(path)?);
|
||||
let summary_blk = file.read_blk(0)?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
actual_summary.index_start_blk,
|
||||
actual_summary.index_root_blk,
|
||||
&file,
|
||||
);
|
||||
// TODO(chi): dedup w/ `delta_layer.rs` by exposing the API.
|
||||
let mut all = vec![];
|
||||
tree_reader.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
|key, value_offset| {
|
||||
let curr = Key::from_slice(&key[..KEY_SIZE]);
|
||||
all.push((curr, BlobRef(value_offset)));
|
||||
true
|
||||
},
|
||||
)?;
|
||||
let mut cursor = BlockCursor::new(&file);
|
||||
for (k, v) in all {
|
||||
let value = cursor.read_blob(v.pos())?;
|
||||
println!("key:{} value_len:{}", k, value.len());
|
||||
}
|
||||
// TODO(chi): special handling for last key?
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
match cmd {
|
||||
LayerCmd::List { path } => {
|
||||
for tenant in fs::read_dir(path.join("tenants"))? {
|
||||
let tenant = tenant?;
|
||||
if !tenant.file_type()?.is_dir() {
|
||||
continue;
|
||||
}
|
||||
println!("tenant {}", tenant.file_name().to_string_lossy());
|
||||
for timeline in fs::read_dir(tenant.path().join("timelines"))? {
|
||||
let timeline = timeline?;
|
||||
if !timeline.file_type()?.is_dir() {
|
||||
continue;
|
||||
}
|
||||
println!("- timeline {}", timeline.file_name().to_string_lossy());
|
||||
}
|
||||
}
|
||||
}
|
||||
LayerCmd::ListLayer {
|
||||
path,
|
||||
tenant,
|
||||
timeline,
|
||||
} => {
|
||||
let timeline_path = path
|
||||
.join("tenants")
|
||||
.join(tenant)
|
||||
.join("timelines")
|
||||
.join(timeline);
|
||||
let mut idx = 0;
|
||||
for layer in fs::read_dir(timeline_path)? {
|
||||
let layer = layer?;
|
||||
if let Some(layer_file) = parse_filename(&layer.file_name().into_string().unwrap())
|
||||
{
|
||||
println!(
|
||||
"[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}",
|
||||
idx,
|
||||
layer_file.key_range.start,
|
||||
layer_file.key_range.end,
|
||||
layer_file.lsn_range.start,
|
||||
layer_file.lsn_range.end,
|
||||
layer_file.is_delta,
|
||||
);
|
||||
idx += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
LayerCmd::DumpLayer {
|
||||
path,
|
||||
tenant,
|
||||
timeline,
|
||||
id,
|
||||
} => {
|
||||
let timeline_path = path
|
||||
.join("tenants")
|
||||
.join(tenant)
|
||||
.join("timelines")
|
||||
.join(timeline);
|
||||
let mut idx = 0;
|
||||
for layer in fs::read_dir(timeline_path)? {
|
||||
let layer = layer?;
|
||||
if let Some(layer_file) = parse_filename(&layer.file_name().into_string().unwrap())
|
||||
{
|
||||
if *id == idx {
|
||||
// TODO(chi): dedup code
|
||||
println!(
|
||||
"[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}",
|
||||
idx,
|
||||
layer_file.key_range.start,
|
||||
layer_file.key_range.end,
|
||||
layer_file.lsn_range.start,
|
||||
layer_file.lsn_range.end,
|
||||
layer_file.is_delta,
|
||||
);
|
||||
|
||||
if layer_file.is_delta {
|
||||
read_delta_file(layer.path())?;
|
||||
} else {
|
||||
anyhow::bail!("not supported yet :(");
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
idx += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
179
pageserver/ctl/src/main.rs
Normal file
@@ -0,0 +1,179 @@
|
||||
//! A helper tool to manage pageserver binary files.
|
||||
//! Accepts a file as an argument, attempts to parse it with all ways possible
|
||||
//! and prints its interpreted context.
|
||||
//!
|
||||
//! Separate, `metadata` subcommand allows to print and update pageserver's metadata file.
|
||||
|
||||
mod draw_timeline_dir;
|
||||
mod layer_map_analyzer;
|
||||
mod layers;
|
||||
|
||||
use clap::{Parser, Subcommand};
|
||||
use layers::LayerCmd;
|
||||
use pageserver::{
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
page_cache,
|
||||
task_mgr::TaskKind,
|
||||
tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
|
||||
virtual_file,
|
||||
};
|
||||
use postgres_ffi::ControlFileData;
|
||||
use std::path::{Path, PathBuf};
|
||||
use utils::{lsn::Lsn, project_git_version};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(
|
||||
version = GIT_VERSION,
|
||||
about = "Neon Pageserver binutils",
|
||||
long_about = "Reads pageserver (and related) binary files management utility"
|
||||
)]
|
||||
#[command(propagate_version = true)]
|
||||
struct CliOpts {
|
||||
#[command(subcommand)]
|
||||
command: Commands,
|
||||
}
|
||||
|
||||
#[derive(Subcommand)]
|
||||
enum Commands {
|
||||
Metadata(MetadataCmd),
|
||||
PrintLayerFile(PrintLayerFileCmd),
|
||||
DrawTimeline {},
|
||||
AnalyzeLayerMap(AnalyzeLayerMapCmd),
|
||||
#[command(subcommand)]
|
||||
Layer(LayerCmd),
|
||||
}
|
||||
|
||||
/// Read and update pageserver metadata file
|
||||
#[derive(Parser)]
|
||||
struct MetadataCmd {
|
||||
/// Input metadata file path
|
||||
metadata_path: PathBuf,
|
||||
/// Replace disk consistent Lsn
|
||||
disk_consistent_lsn: Option<Lsn>,
|
||||
/// Replace previous record Lsn
|
||||
prev_record_lsn: Option<Lsn>,
|
||||
/// Replace latest gc cuttoff
|
||||
latest_gc_cuttoff: Option<Lsn>,
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
struct PrintLayerFileCmd {
|
||||
/// Pageserver data path
|
||||
path: PathBuf,
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
struct AnalyzeLayerMapCmd {
|
||||
/// Pageserver data path
|
||||
path: PathBuf,
|
||||
/// Max holes
|
||||
max_holes: Option<usize>,
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
let cli = CliOpts::parse();
|
||||
|
||||
match cli.command {
|
||||
Commands::Layer(cmd) => {
|
||||
layers::main(&cmd)?;
|
||||
}
|
||||
Commands::Metadata(cmd) => {
|
||||
handle_metadata(&cmd)?;
|
||||
}
|
||||
Commands::DrawTimeline {} => {
|
||||
draw_timeline_dir::main()?;
|
||||
}
|
||||
Commands::AnalyzeLayerMap(cmd) => {
|
||||
layer_map_analyzer::main(&cmd)?;
|
||||
}
|
||||
Commands::PrintLayerFile(cmd) => {
|
||||
if let Err(e) = read_pg_control_file(&cmd.path) {
|
||||
println!(
|
||||
"Failed to read input file as a pg control one: {e:#}\n\
|
||||
Attempting to read it as layer file"
|
||||
);
|
||||
print_layerfile(&cmd.path)?;
|
||||
}
|
||||
}
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
|
||||
let control_file = ControlFileData::decode(&std::fs::read(control_file_path)?)?;
|
||||
println!("{control_file:?}");
|
||||
let control_file_initdb = Lsn(control_file.checkPoint);
|
||||
println!(
|
||||
"pg_initdb_lsn: {}, aligned: {}",
|
||||
control_file_initdb,
|
||||
control_file_initdb.align()
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn print_layerfile(path: &Path) -> anyhow::Result<()> {
|
||||
// Basic initialization of things that don't change after startup
|
||||
virtual_file::init(10);
|
||||
page_cache::init(100);
|
||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
dump_layerfile_from_path(path, true, &ctx)
|
||||
}
|
||||
|
||||
fn handle_metadata(
|
||||
MetadataCmd {
|
||||
metadata_path: path,
|
||||
disk_consistent_lsn,
|
||||
prev_record_lsn,
|
||||
latest_gc_cuttoff,
|
||||
}: &MetadataCmd,
|
||||
) -> Result<(), anyhow::Error> {
|
||||
let metadata_bytes = std::fs::read(path)?;
|
||||
let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
|
||||
println!("Current metadata:\n{meta:?}");
|
||||
let mut update_meta = false;
|
||||
if let Some(disk_consistent_lsn) = disk_consistent_lsn {
|
||||
meta = TimelineMetadata::new(
|
||||
*disk_consistent_lsn,
|
||||
meta.prev_record_lsn(),
|
||||
meta.ancestor_timeline(),
|
||||
meta.ancestor_lsn(),
|
||||
meta.latest_gc_cutoff_lsn(),
|
||||
meta.initdb_lsn(),
|
||||
meta.pg_version(),
|
||||
);
|
||||
update_meta = true;
|
||||
}
|
||||
if let Some(prev_record_lsn) = prev_record_lsn {
|
||||
meta = TimelineMetadata::new(
|
||||
meta.disk_consistent_lsn(),
|
||||
Some(*prev_record_lsn),
|
||||
meta.ancestor_timeline(),
|
||||
meta.ancestor_lsn(),
|
||||
meta.latest_gc_cutoff_lsn(),
|
||||
meta.initdb_lsn(),
|
||||
meta.pg_version(),
|
||||
);
|
||||
update_meta = true;
|
||||
}
|
||||
if let Some(latest_gc_cuttoff) = latest_gc_cuttoff {
|
||||
meta = TimelineMetadata::new(
|
||||
meta.disk_consistent_lsn(),
|
||||
meta.prev_record_lsn(),
|
||||
meta.ancestor_timeline(),
|
||||
meta.ancestor_lsn(),
|
||||
*latest_gc_cuttoff,
|
||||
meta.initdb_lsn(),
|
||||
meta.pg_version(),
|
||||
);
|
||||
update_meta = true;
|
||||
}
|
||||
|
||||
if update_meta {
|
||||
let metadata_bytes = meta.to_bytes()?;
|
||||
std::fs::write(path, metadata_bytes)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -9,6 +9,7 @@ use clap::{Arg, ArgAction, Command};
|
||||
use fail::FailScenario;
|
||||
use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
|
||||
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
|
||||
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tracing::*;
|
||||
|
||||
@@ -18,9 +19,7 @@ use pageserver::{
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
http, page_cache, page_service, task_mgr,
|
||||
task_mgr::TaskKind,
|
||||
task_mgr::{
|
||||
BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
|
||||
},
|
||||
task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME},
|
||||
tenant::mgr,
|
||||
virtual_file,
|
||||
};
|
||||
@@ -276,7 +275,18 @@ fn start_pageserver(
|
||||
let pageserver_listener = tcp_listener::bind(pg_addr)?;
|
||||
|
||||
// Launch broker client
|
||||
WALRECEIVER_RUNTIME.block_on(pageserver::broker_client::init_broker_client(conf))?;
|
||||
// The storage_broker::connect call needs to happen inside a tokio runtime thread.
|
||||
let broker_client = WALRECEIVER_RUNTIME
|
||||
.block_on(async {
|
||||
// Note: we do not attempt connecting here (but validate endpoints sanity).
|
||||
storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)
|
||||
})
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"create broker client for uri={:?} keepalive_interval={:?}",
|
||||
&conf.broker_endpoint, conf.broker_keepalive_interval,
|
||||
)
|
||||
})?;
|
||||
|
||||
// Initialize authentication for incoming connections
|
||||
let http_auth;
|
||||
@@ -325,8 +335,118 @@ fn start_pageserver(
|
||||
// Set up remote storage client
|
||||
let remote_storage = create_remote_storage_client(conf)?;
|
||||
|
||||
// Startup staging or optimizing:
|
||||
//
|
||||
// We want to minimize downtime for `page_service` connections, and trying not to overload
|
||||
// BACKGROUND_RUNTIME by doing initial compactions and initial logical sizes at the same time.
|
||||
//
|
||||
// init_done_rx will notify when all initial load operations have completed.
|
||||
//
|
||||
// background_jobs_can_start (same name used to hold off background jobs from starting at
|
||||
// consumer side) will be dropped once we can start the background jobs. Currently it is behind
|
||||
// completing all initial logical size calculations (init_logical_size_done_rx) and a timeout
|
||||
// (background_task_maximum_delay).
|
||||
let (init_done_tx, init_done_rx) = utils::completion::channel();
|
||||
|
||||
let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel();
|
||||
|
||||
let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel();
|
||||
|
||||
let order = pageserver::InitializationOrder {
|
||||
initial_tenant_load: Some(init_done_tx),
|
||||
initial_logical_size_can_start: init_done_rx.clone(),
|
||||
initial_logical_size_attempt: init_logical_size_done_tx,
|
||||
background_jobs_can_start: background_jobs_barrier.clone(),
|
||||
};
|
||||
|
||||
// Scan the local 'tenants/' directory and start loading the tenants
|
||||
BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(conf, remote_storage.clone()))?;
|
||||
let init_started_at = std::time::Instant::now();
|
||||
let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
|
||||
|
||||
BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
|
||||
conf,
|
||||
broker_client.clone(),
|
||||
remote_storage.clone(),
|
||||
order,
|
||||
))?;
|
||||
|
||||
BACKGROUND_RUNTIME.spawn({
|
||||
let init_done_rx = init_done_rx;
|
||||
let shutdown_pageserver = shutdown_pageserver.clone();
|
||||
let drive_init = async move {
|
||||
// NOTE: unlike many futures in pageserver, this one is cancellation-safe
|
||||
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed"));
|
||||
|
||||
init_done_rx.wait().await;
|
||||
// initial logical sizes can now start, as they were waiting on init_done_rx.
|
||||
|
||||
scopeguard::ScopeGuard::into_inner(guard);
|
||||
|
||||
let init_done = std::time::Instant::now();
|
||||
let elapsed = init_done - init_started_at;
|
||||
|
||||
tracing::info!(
|
||||
elapsed_millis = elapsed.as_millis(),
|
||||
"Initial load completed"
|
||||
);
|
||||
|
||||
let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait());
|
||||
|
||||
let timeout = conf.background_task_maximum_delay;
|
||||
|
||||
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
|
||||
|
||||
let init_sizes_done = tokio::select! {
|
||||
_ = &mut init_sizes_done => {
|
||||
let now = std::time::Instant::now();
|
||||
tracing::info!(
|
||||
from_init_done_millis = (now - init_done).as_millis(),
|
||||
from_init_millis = (now - init_started_at).as_millis(),
|
||||
"Initial logical sizes completed"
|
||||
);
|
||||
None
|
||||
}
|
||||
_ = tokio::time::sleep(timeout) => {
|
||||
tracing::info!(
|
||||
timeout_millis = timeout.as_millis(),
|
||||
"Initial logical size timeout elapsed; starting background jobs"
|
||||
);
|
||||
Some(init_sizes_done)
|
||||
}
|
||||
};
|
||||
|
||||
scopeguard::ScopeGuard::into_inner(guard);
|
||||
|
||||
// allow background jobs to start
|
||||
drop(background_jobs_can_start);
|
||||
|
||||
if let Some(init_sizes_done) = init_sizes_done {
|
||||
// ending up here is not a bug; at the latest logical sizes will be queried by
|
||||
// consumption metrics.
|
||||
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
|
||||
init_sizes_done.await;
|
||||
|
||||
scopeguard::ScopeGuard::into_inner(guard);
|
||||
|
||||
let now = std::time::Instant::now();
|
||||
tracing::info!(
|
||||
from_init_done_millis = (now - init_done).as_millis(),
|
||||
from_init_millis = (now - init_started_at).as_millis(),
|
||||
"Initial logical sizes completed after timeout (background jobs already started)"
|
||||
);
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
async move {
|
||||
let mut drive_init = std::pin::pin!(drive_init);
|
||||
// just race these tasks
|
||||
tokio::select! {
|
||||
_ = shutdown_pageserver.cancelled() => {},
|
||||
_ = &mut drive_init => {},
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// shared state between the disk-usage backed eviction background task and the http endpoint
|
||||
// that allows triggering disk-usage based eviction manually. note that the http endpoint
|
||||
@@ -339,6 +459,7 @@ fn start_pageserver(
|
||||
conf,
|
||||
remote_storage.clone(),
|
||||
disk_usage_eviction_state.clone(),
|
||||
background_jobs_barrier.clone(),
|
||||
)?;
|
||||
}
|
||||
|
||||
@@ -351,6 +472,7 @@ fn start_pageserver(
|
||||
conf,
|
||||
launch_ts,
|
||||
http_auth,
|
||||
broker_client.clone(),
|
||||
remote_storage,
|
||||
disk_usage_eviction_state,
|
||||
)?
|
||||
@@ -375,6 +497,7 @@ fn start_pageserver(
|
||||
);
|
||||
|
||||
if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
|
||||
let background_jobs_barrier = background_jobs_barrier;
|
||||
let metrics_ctx = RequestContext::todo_child(
|
||||
TaskKind::MetricsCollection,
|
||||
// This task itself shouldn't download anything.
|
||||
@@ -390,6 +513,18 @@ fn start_pageserver(
|
||||
"consumption metrics collection",
|
||||
true,
|
||||
async move {
|
||||
// first wait until background jobs are cleared to launch.
|
||||
//
|
||||
// this is because we only process active tenants and timelines, and the
|
||||
// Timeline::get_current_logical_size will spawn the logical size calculation,
|
||||
// which will not be rate-limited.
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return Ok(()); },
|
||||
_ = background_jobs_barrier.wait() => {}
|
||||
};
|
||||
|
||||
pageserver::consumption_metrics::collect_metrics(
|
||||
metric_collection_endpoint,
|
||||
conf.metric_collection_interval,
|
||||
@@ -427,6 +562,7 @@ fn start_pageserver(
|
||||
async move {
|
||||
page_service::libpq_listener_main(
|
||||
conf,
|
||||
broker_client,
|
||||
pg_auth,
|
||||
pageserver_listener,
|
||||
conf.pg_auth_type,
|
||||
@@ -437,6 +573,8 @@ fn start_pageserver(
|
||||
);
|
||||
}
|
||||
|
||||
let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
|
||||
|
||||
// All started up! Now just sit and wait for shutdown signal.
|
||||
ShutdownSignals::handle(|signal| match signal {
|
||||
Signal::Quit => {
|
||||
@@ -452,6 +590,11 @@ fn start_pageserver(
|
||||
"Got {}. Terminating gracefully in fast shutdown mode",
|
||||
signal.name()
|
||||
);
|
||||
|
||||
// This cancels the `shutdown_pageserver` cancellation tree.
|
||||
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
|
||||
// The plan is to change that over time.
|
||||
shutdown_pageserver.take();
|
||||
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0));
|
||||
unreachable!()
|
||||
}
|
||||
|
||||
@@ -1,174 +0,0 @@
|
||||
//! A helper tool to manage pageserver binary files.
|
||||
//! Accepts a file as an argument, attempts to parse it with all ways possible
|
||||
//! and prints its interpreted context.
|
||||
//!
|
||||
//! Separate, `metadata` subcommand allows to print and update pageserver's metadata file.
|
||||
use std::{
|
||||
path::{Path, PathBuf},
|
||||
str::FromStr,
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use clap::{value_parser, Arg, Command};
|
||||
|
||||
use pageserver::{
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
page_cache,
|
||||
task_mgr::TaskKind,
|
||||
tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
|
||||
virtual_file,
|
||||
};
|
||||
use postgres_ffi::ControlFileData;
|
||||
use utils::{lsn::Lsn, project_git_version};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
const METADATA_SUBCOMMAND: &str = "metadata";
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
let arg_matches = cli().get_matches();
|
||||
|
||||
match arg_matches.subcommand() {
|
||||
Some((subcommand_name, subcommand_matches)) => {
|
||||
let path = subcommand_matches
|
||||
.get_one::<PathBuf>("metadata_path")
|
||||
.context("'metadata_path' argument is missing")?
|
||||
.to_path_buf();
|
||||
anyhow::ensure!(
|
||||
subcommand_name == METADATA_SUBCOMMAND,
|
||||
"Unknown subcommand {subcommand_name}"
|
||||
);
|
||||
handle_metadata(&path, subcommand_matches)?;
|
||||
}
|
||||
None => {
|
||||
let path = arg_matches
|
||||
.get_one::<PathBuf>("path")
|
||||
.context("'path' argument is missing")?
|
||||
.to_path_buf();
|
||||
println!(
|
||||
"No subcommand specified, attempting to guess the format for file {}",
|
||||
path.display()
|
||||
);
|
||||
if let Err(e) = read_pg_control_file(&path) {
|
||||
println!(
|
||||
"Failed to read input file as a pg control one: {e:#}\n\
|
||||
Attempting to read it as layer file"
|
||||
);
|
||||
print_layerfile(&path)?;
|
||||
}
|
||||
}
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
|
||||
let control_file = ControlFileData::decode(&std::fs::read(control_file_path)?)?;
|
||||
println!("{control_file:?}");
|
||||
let control_file_initdb = Lsn(control_file.checkPoint);
|
||||
println!(
|
||||
"pg_initdb_lsn: {}, aligned: {}",
|
||||
control_file_initdb,
|
||||
control_file_initdb.align()
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn print_layerfile(path: &Path) -> anyhow::Result<()> {
|
||||
// Basic initialization of things that don't change after startup
|
||||
virtual_file::init(10);
|
||||
page_cache::init(100);
|
||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
dump_layerfile_from_path(path, true, &ctx)
|
||||
}
|
||||
|
||||
fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), anyhow::Error> {
|
||||
let metadata_bytes = std::fs::read(path)?;
|
||||
let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
|
||||
println!("Current metadata:\n{meta:?}");
|
||||
let mut update_meta = false;
|
||||
if let Some(disk_consistent_lsn) = arg_matches.get_one::<String>("disk_consistent_lsn") {
|
||||
meta = TimelineMetadata::new(
|
||||
Lsn::from_str(disk_consistent_lsn)?,
|
||||
meta.prev_record_lsn(),
|
||||
meta.ancestor_timeline(),
|
||||
meta.ancestor_lsn(),
|
||||
meta.latest_gc_cutoff_lsn(),
|
||||
meta.initdb_lsn(),
|
||||
meta.pg_version(),
|
||||
);
|
||||
update_meta = true;
|
||||
}
|
||||
if let Some(prev_record_lsn) = arg_matches.get_one::<String>("prev_record_lsn") {
|
||||
meta = TimelineMetadata::new(
|
||||
meta.disk_consistent_lsn(),
|
||||
Some(Lsn::from_str(prev_record_lsn)?),
|
||||
meta.ancestor_timeline(),
|
||||
meta.ancestor_lsn(),
|
||||
meta.latest_gc_cutoff_lsn(),
|
||||
meta.initdb_lsn(),
|
||||
meta.pg_version(),
|
||||
);
|
||||
update_meta = true;
|
||||
}
|
||||
if let Some(latest_gc_cuttoff) = arg_matches.get_one::<String>("latest_gc_cuttoff") {
|
||||
meta = TimelineMetadata::new(
|
||||
meta.disk_consistent_lsn(),
|
||||
meta.prev_record_lsn(),
|
||||
meta.ancestor_timeline(),
|
||||
meta.ancestor_lsn(),
|
||||
Lsn::from_str(latest_gc_cuttoff)?,
|
||||
meta.initdb_lsn(),
|
||||
meta.pg_version(),
|
||||
);
|
||||
update_meta = true;
|
||||
}
|
||||
|
||||
if update_meta {
|
||||
let metadata_bytes = meta.to_bytes()?;
|
||||
std::fs::write(path, metadata_bytes)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn cli() -> Command {
|
||||
Command::new("Neon Pageserver binutils")
|
||||
.about("Reads pageserver (and related) binary files management utility")
|
||||
.version(GIT_VERSION)
|
||||
.arg(
|
||||
Arg::new("path")
|
||||
.help("Input file path")
|
||||
.value_parser(value_parser!(PathBuf))
|
||||
.required(false),
|
||||
)
|
||||
.subcommand(
|
||||
Command::new(METADATA_SUBCOMMAND)
|
||||
.about("Read and update pageserver metadata file")
|
||||
.arg(
|
||||
Arg::new("metadata_path")
|
||||
.help("Input metadata file path")
|
||||
.value_parser(value_parser!(PathBuf))
|
||||
.required(false),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("disk_consistent_lsn")
|
||||
.long("disk_consistent_lsn")
|
||||
.help("Replace disk consistent Lsn"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("prev_record_lsn")
|
||||
.long("prev_record_lsn")
|
||||
.help("Replace previous record Lsn"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("latest_gc_cuttoff")
|
||||
.long("latest_gc_cuttoff")
|
||||
.help("Replace latest gc cuttoff"),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn verify_cli() {
|
||||
cli().debug_assert();
|
||||
}
|
||||
@@ -1,48 +0,0 @@
|
||||
//! The broker client instance of the pageserver, created during pageserver startup.
|
||||
//! Used by each timelines' [`walreceiver`].
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
|
||||
use anyhow::Context;
|
||||
use once_cell::sync::OnceCell;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use tracing::*;
|
||||
|
||||
static BROKER_CLIENT: OnceCell<BrokerClientChannel> = OnceCell::new();
|
||||
|
||||
///
|
||||
/// Initialize the broker client. This must be called once at page server startup.
|
||||
///
|
||||
pub async fn init_broker_client(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
||||
let broker_endpoint = conf.broker_endpoint.clone();
|
||||
|
||||
// Note: we do not attempt connecting here (but validate endpoints sanity).
|
||||
let broker_client =
|
||||
storage_broker::connect(broker_endpoint.clone(), conf.broker_keepalive_interval).context(
|
||||
format!(
|
||||
"Failed to create broker client to {}",
|
||||
&conf.broker_endpoint
|
||||
),
|
||||
)?;
|
||||
|
||||
if BROKER_CLIENT.set(broker_client).is_err() {
|
||||
panic!("broker already initialized");
|
||||
}
|
||||
|
||||
info!(
|
||||
"Initialized broker client with endpoints: {}",
|
||||
broker_endpoint
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Get a handle to the broker client
|
||||
///
|
||||
pub fn get_broker_client() -> &'static BrokerClientChannel {
|
||||
BROKER_CLIENT.get().expect("broker client not initialized")
|
||||
}
|
||||
|
||||
pub fn is_broker_client_initialized() -> bool {
|
||||
BROKER_CLIENT.get().is_some()
|
||||
}
|
||||
@@ -63,6 +63,7 @@ pub mod defaults {
|
||||
pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "1 hour";
|
||||
pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
|
||||
pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
|
||||
pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
|
||||
|
||||
///
|
||||
/// Default built-in configuration file.
|
||||
@@ -91,9 +92,10 @@ pub mod defaults {
|
||||
#cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}'
|
||||
#synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}'
|
||||
|
||||
|
||||
#disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}}
|
||||
|
||||
#background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}'
|
||||
|
||||
# [tenant_config]
|
||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
|
||||
@@ -108,7 +110,7 @@ pub mod defaults {
|
||||
|
||||
#min_resident_size_override = .. # in bytes
|
||||
#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
|
||||
|
||||
#gc_feedback = false
|
||||
# [remote_storage]
|
||||
|
||||
"###
|
||||
@@ -187,6 +189,15 @@ pub struct PageServerConf {
|
||||
pub test_remote_failures: u64,
|
||||
|
||||
pub ondemand_download_behavior_treat_error_as_warn: bool,
|
||||
|
||||
/// How long will background tasks be delayed at most after initial load of tenants.
|
||||
///
|
||||
/// Our largest initialization completions are in the range of 100-200s, so perhaps 10s works
|
||||
/// as we now isolate initial loading, initial logical size calculation and background tasks.
|
||||
/// Smaller nodes will have background tasks "not running" for this long unless every timeline
|
||||
/// has it's initial logical size calculated. Not running background tasks for some seconds is
|
||||
/// not terrible.
|
||||
pub background_task_maximum_delay: Duration,
|
||||
}
|
||||
|
||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||
@@ -259,6 +270,8 @@ struct PageServerConfigBuilder {
|
||||
test_remote_failures: BuilderValue<u64>,
|
||||
|
||||
ondemand_download_behavior_treat_error_as_warn: BuilderValue<bool>,
|
||||
|
||||
background_task_maximum_delay: BuilderValue<Duration>,
|
||||
}
|
||||
|
||||
impl Default for PageServerConfigBuilder {
|
||||
@@ -316,6 +329,11 @@ impl Default for PageServerConfigBuilder {
|
||||
test_remote_failures: Set(0),
|
||||
|
||||
ondemand_download_behavior_treat_error_as_warn: Set(false),
|
||||
|
||||
background_task_maximum_delay: Set(humantime::parse_duration(
|
||||
DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY,
|
||||
)
|
||||
.unwrap()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -440,6 +458,10 @@ impl PageServerConfigBuilder {
|
||||
BuilderValue::Set(ondemand_download_behavior_treat_error_as_warn);
|
||||
}
|
||||
|
||||
pub fn background_task_maximum_delay(&mut self, delay: Duration) {
|
||||
self.background_task_maximum_delay = BuilderValue::Set(delay);
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let concurrent_tenant_size_logical_size_queries = self
|
||||
.concurrent_tenant_size_logical_size_queries
|
||||
@@ -522,6 +544,9 @@ impl PageServerConfigBuilder {
|
||||
.ok_or(anyhow!(
|
||||
"missing ondemand_download_behavior_treat_error_as_warn"
|
||||
))?,
|
||||
background_task_maximum_delay: self
|
||||
.background_task_maximum_delay
|
||||
.ok_or(anyhow!("missing background_task_maximum_delay"))?,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -710,6 +735,7 @@ impl PageServerConf {
|
||||
)
|
||||
},
|
||||
"ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
|
||||
"background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?),
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
@@ -828,6 +854,14 @@ impl PageServerConf {
|
||||
)?);
|
||||
}
|
||||
|
||||
if let Some(gc_feedback) = item.get("gc_feedback") {
|
||||
t_conf.gc_feedback = Some(
|
||||
gc_feedback
|
||||
.as_bool()
|
||||
.with_context(|| "configure option gc_feedback is not a bool".to_string())?,
|
||||
);
|
||||
}
|
||||
|
||||
Ok(t_conf)
|
||||
}
|
||||
|
||||
@@ -869,6 +903,7 @@ impl PageServerConf {
|
||||
disk_usage_based_eviction: None,
|
||||
test_remote_failures: 0,
|
||||
ondemand_download_behavior_treat_error_as_warn: false,
|
||||
background_task_maximum_delay: Duration::ZERO,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1028,6 +1063,7 @@ metric_collection_endpoint = 'http://localhost:80/metrics'
|
||||
synthetic_size_calculation_interval = '333 s'
|
||||
|
||||
log_format = 'json'
|
||||
background_task_maximum_delay = '334 s'
|
||||
|
||||
"#;
|
||||
|
||||
@@ -1086,6 +1122,9 @@ log_format = 'json'
|
||||
disk_usage_based_eviction: None,
|
||||
test_remote_failures: 0,
|
||||
ondemand_download_behavior_treat_error_as_warn: false,
|
||||
background_task_maximum_delay: humantime::parse_duration(
|
||||
defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY
|
||||
)?,
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -1140,6 +1179,7 @@ log_format = 'json'
|
||||
disk_usage_based_eviction: None,
|
||||
test_remote_failures: 0,
|
||||
ondemand_download_behavior_treat_error_as_warn: false,
|
||||
background_task_maximum_delay: Duration::from_secs(334),
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
|
||||
@@ -88,6 +88,7 @@
|
||||
use crate::task_mgr::TaskKind;
|
||||
|
||||
// The main structure of this module, see module-level comment.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct RequestContext {
|
||||
task_kind: TaskKind,
|
||||
download_behavior: DownloadBehavior,
|
||||
@@ -95,7 +96,7 @@ pub struct RequestContext {
|
||||
|
||||
/// Desired behavior if the operation requires an on-demand download
|
||||
/// to proceed.
|
||||
#[derive(Clone, Copy, PartialEq, Eq)]
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
|
||||
pub enum DownloadBehavior {
|
||||
/// Download the layer file. It can take a while.
|
||||
Download,
|
||||
|
||||
@@ -54,6 +54,7 @@ use serde::{Deserialize, Serialize};
|
||||
use tokio::time::Instant;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, error, info, instrument, warn, Instrument};
|
||||
use utils::completion;
|
||||
use utils::serde_percent::Percent;
|
||||
|
||||
use crate::{
|
||||
@@ -82,6 +83,7 @@ pub fn launch_disk_usage_global_eviction_task(
|
||||
conf: &'static PageServerConf,
|
||||
storage: GenericRemoteStorage,
|
||||
state: Arc<State>,
|
||||
background_jobs_barrier: completion::Barrier,
|
||||
) -> anyhow::Result<()> {
|
||||
let Some(task_config) = &conf.disk_usage_based_eviction else {
|
||||
info!("disk usage based eviction task not configured");
|
||||
@@ -98,14 +100,16 @@ pub fn launch_disk_usage_global_eviction_task(
|
||||
"disk usage based eviction",
|
||||
false,
|
||||
async move {
|
||||
disk_usage_eviction_task(
|
||||
&state,
|
||||
task_config,
|
||||
storage,
|
||||
&conf.tenants_path(),
|
||||
task_mgr::shutdown_token(),
|
||||
)
|
||||
.await;
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
|
||||
// wait until initial load is complete, because we cannot evict from loading tenants.
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return Ok(()); },
|
||||
_ = background_jobs_barrier.wait() => { }
|
||||
};
|
||||
|
||||
disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
|
||||
.await;
|
||||
info!("disk usage based eviction task finishing");
|
||||
Ok(())
|
||||
},
|
||||
|
||||
@@ -215,7 +215,7 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/NotFoundError"
|
||||
"412":
|
||||
description: Tenant is missing
|
||||
description: Tenant is missing, or timeline has children
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
@@ -363,11 +363,30 @@ paths:
|
||||
* MUST NOT ASSUME that the request has been lost, based on the observation
|
||||
that a subsequent tenant status request returns 404. The request may
|
||||
still be in flight. It must be retried.
|
||||
|
||||
The client SHOULD supply a `TenantConfig` for the tenant in the request body.
|
||||
Settings specified in the config override the pageserver's defaults.
|
||||
It is guaranteed that the config settings are applied before the pageserver
|
||||
starts operating on the tenant. E.g., if the config specifies a specific
|
||||
PITR interval for a tenant, then that setting will be in effect before the
|
||||
pageserver starts the garbage collection loop. This enables a client to
|
||||
guarantee a specific PITR setting across detach/attach cycles.
|
||||
The pageserver will reject the request if it cannot parse the config, or
|
||||
if there are any unknown fields in it.
|
||||
|
||||
If the client does not supply a config, the pageserver will use its defaults.
|
||||
This behavior is deprecated: https://github.com/neondatabase/neon/issues/4282
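As an illustration of the request body described above, here is a hedged sketch built with serde_json; the pitr_interval option is shown only as an example of a TenantConfig setting.

// Hedged sketch: an attach request body carrying an inline tenant config.
// "pitr_interval" is an illustrative option; see the TenantConfig schema for
// the authoritative field list.
fn main() {
    let body = serde_json::json!({
        "config": {
            "pitr_interval": "7 days"
        }
    });
    println!("{}", serde_json::to_string_pretty(&body).unwrap());
}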
|
||||
requestBody:
|
||||
required: false
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TenantAttachRequest"
|
||||
responses:
|
||||
"202":
|
||||
description: Tenant attaching scheduled
|
||||
"400":
|
||||
description: Error when no tenant id found in path parameters
|
||||
description: Bad Request
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
@@ -660,6 +679,8 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
type: object
|
||||
required:
|
||||
- new_timeline_id
|
||||
properties:
|
||||
new_timeline_id:
|
||||
type: string
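With new_timeline_id now a required property, a creation request must always carry it. A hedged sketch of such a body follows; the hex id and pg_version are placeholders.

// Hedged sketch: timeline creation body with the now-required new_timeline_id.
// The hex id below is a placeholder, not a real timeline id.
fn main() {
    let body = serde_json::json!({
        "new_timeline_id": "0123456789abcdef0123456789abcdef",
        "pg_version": 15
    });
    println!("{body}");
}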
|
||||
@@ -908,12 +929,28 @@ components:
|
||||
writing to the tenant's S3 state, so, DO NOT ATTACH the
|
||||
tenant to any other pageserver, or we risk split-brain.
|
||||
- `attached` means that the attach operation has completed,
|
||||
maybe successfully, maybe not. Perform a health check at
|
||||
the Postgres level to determine the health of the tenant.
|
||||
successfully
|
||||
- `failed` means that the attach has failed. For the reason, check the corresponding `reason` field.
|
||||
`failed` is a terminal state; retrying the attach call won't resolve the issue.
|
||||
For example, this can be caused by S3 being unreachable. A retry may be implemented
|
||||
with a call to detach, though it would be better not to automate it and to inspect the failed state
|
||||
manually before proceeding with a retry.
|
||||
|
||||
See the tenant `/attach` endpoint for more information.
|
||||
type: string
|
||||
enum: [ "maybe", "attached" ]
|
||||
type: object
|
||||
required:
|
||||
- slug
|
||||
- data
|
||||
properties:
|
||||
slug:
|
||||
type: string
|
||||
enum: [ "maybe", "attached", "failed" ]
|
||||
data:
|
||||
type: object
|
||||
properties:
|
||||
reason:
|
||||
type: string
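For the failed state described above, the status payload pairs the slug with a reason under data. A hedged sketch (the reason text is made up):

// Hedged sketch of a tenant attachment status in the "failed" state,
// following the slug/data shape above. The reason string is illustrative.
fn main() {
    let status = serde_json::json!({
        "slug": "failed",
        "data": { "reason": "s3 unreachable during attach" }
    });
    println!("{status}");
}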
|
||||
|
||||
TenantCreateRequest:
|
||||
allOf:
|
||||
- $ref: '#/components/schemas/TenantConfig'
|
||||
@@ -924,6 +961,13 @@ components:
|
||||
new_tenant_id:
|
||||
type: string
|
||||
format: hex
|
||||
TenantAttachRequest:
|
||||
type: object
|
||||
required:
|
||||
- config
|
||||
properties:
|
||||
config:
|
||||
$ref: '#/components/schemas/TenantConfig'
|
||||
TenantConfigRequest:
|
||||
allOf:
|
||||
- $ref: '#/components/schemas/TenantConfig'
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
//!
|
||||
//! Management HTTP API
|
||||
//!
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
@@ -5,12 +8,14 @@ use anyhow::{anyhow, Context, Result};
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use metrics::launch_timestamp::LaunchTimestamp;
|
||||
use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest;
|
||||
use pageserver_api::models::{DownloadRemoteLayersTaskSpawnRequest, TenantAttachRequest};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use tenant_size_model::{SizeResult, StorageModel};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::http::endpoint::RequestSpan;
|
||||
use utils::http::endpoint::request_span;
|
||||
use utils::http::json::json_request_or_empty_body;
|
||||
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
|
||||
|
||||
use super::models::{
|
||||
@@ -23,7 +28,9 @@ use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::mgr::{TenantMapInsertError, TenantStateError};
|
||||
use crate::tenant::mgr::{
|
||||
GetTenantError, SetNewTenantConfigError, TenantMapInsertError, TenantStateError,
|
||||
};
|
||||
use crate::tenant::size::ModelInputs;
|
||||
use crate::tenant::storage_layer::LayerAccessStatsReset;
|
||||
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
|
||||
@@ -42,7 +49,6 @@ use utils::{
|
||||
};
|
||||
|
||||
// Imports only used for testing APIs
|
||||
#[cfg(feature = "testing")]
|
||||
use super::models::ConfigureFailpointsRequest;
|
||||
|
||||
struct State {
|
||||
@@ -50,6 +56,7 @@ struct State {
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
allowlist_routes: Vec<Uri>,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
||||
}
|
||||
|
||||
@@ -58,6 +65,7 @@ impl State {
|
||||
conf: &'static PageServerConf,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
||||
) -> anyhow::Result<Self> {
|
||||
let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
|
||||
@@ -69,6 +77,7 @@ impl State {
|
||||
auth,
|
||||
allowlist_routes,
|
||||
remote_storage,
|
||||
broker_client,
|
||||
disk_usage_eviction_state,
|
||||
})
|
||||
}
|
||||
@@ -139,14 +148,45 @@ impl From<TenantStateError> for ApiError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<GetTenantError> for ApiError {
|
||||
fn from(tse: GetTenantError) -> ApiError {
|
||||
match tse {
|
||||
GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)),
|
||||
e @ GetTenantError::NotActive(_) => {
|
||||
// Why is this not `ApiError::NotFound`?
|
||||
// Because we must be careful to never return 404 for a tenant if it does
|
||||
// in fact exist locally. If we did, the caller could draw the conclusion
|
||||
// that it can attach the tenant to another PS and we'd be in split-brain.
|
||||
//
|
||||
// (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
|
||||
ApiError::InternalServerError(anyhow::Error::new(e))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<SetNewTenantConfigError> for ApiError {
|
||||
fn from(e: SetNewTenantConfigError) -> ApiError {
|
||||
match e {
|
||||
SetNewTenantConfigError::GetTenant(tid) => {
|
||||
ApiError::NotFound(anyhow!("tenant {}", tid))
|
||||
}
|
||||
e @ SetNewTenantConfigError::Persist(_) => {
|
||||
ApiError::InternalServerError(anyhow::Error::new(e))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<crate::tenant::DeleteTimelineError> for ApiError {
|
||||
fn from(value: crate::tenant::DeleteTimelineError) -> Self {
|
||||
use crate::tenant::DeleteTimelineError::*;
|
||||
match value {
|
||||
NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found")),
|
||||
HasChildren => ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Cannot delete timeline which has child timelines"
|
||||
)),
|
||||
HasChildren(children) => ApiError::PreconditionFailed(
|
||||
format!("Cannot delete timeline which has child timelines: {children:?}")
|
||||
.into_boxed_str(),
|
||||
),
|
||||
Other(e) => ApiError::InternalServerError(e),
|
||||
}
|
||||
}
|
||||
@@ -158,9 +198,9 @@ impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
|
||||
match value {
|
||||
// Report Precondition failed so client can distinguish between
|
||||
// "tenant is missing" case from "timeline is missing"
|
||||
Tenant(TenantStateError::NotFound(..)) => {
|
||||
ApiError::PreconditionFailed("Requested tenant is missing")
|
||||
}
|
||||
Tenant(GetTenantError::NotFound(..)) => ApiError::PreconditionFailed(
|
||||
"Requested tenant is missing".to_owned().into_boxed_str(),
|
||||
),
|
||||
Tenant(t) => ApiError::from(t),
|
||||
Timeline(t) => ApiError::from(t),
|
||||
}
|
||||
@@ -253,23 +293,29 @@ fn build_timeline_info_common(
|
||||
}
|
||||
|
||||
// healthcheck handler
|
||||
async fn status_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn status_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
let config = get_config(&request);
|
||||
json_response(StatusCode::OK, StatusResponse { id: config.id })
|
||||
}
|
||||
|
||||
async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn timeline_create_handler(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let request_data: TimelineCreateRequest = json_request(&mut request).await?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let new_timeline_id = request_data
|
||||
.new_timeline_id
|
||||
.unwrap_or_else(TimelineId::generate);
|
||||
let new_timeline_id = request_data.new_timeline_id;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error);
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
async {
|
||||
let tenant = mgr::get_tenant(tenant_id, true).await?;
|
||||
match tenant.create_timeline(
|
||||
@@ -277,6 +323,7 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
|
||||
request_data.ancestor_timeline_id.map(TimelineId::from),
|
||||
request_data.ancestor_start_lsn,
|
||||
request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
|
||||
state.broker_client.clone(),
|
||||
&ctx,
|
||||
)
|
||||
.await {
|
||||
@@ -290,11 +337,14 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
|
||||
Err(err) => Err(ApiError::InternalServerError(err)),
|
||||
}
|
||||
}
|
||||
.instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
|
||||
.instrument(info_span!("timeline_create", tenant = %tenant_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
|
||||
.await
|
||||
}
|
||||
|
||||
async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn timeline_list_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let include_non_incremental_logical_size: Option<bool> =
|
||||
parse_query_param(&request, "include-non-incremental-logical-size")?;
|
||||
@@ -328,7 +378,10 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
json_response(StatusCode::OK, response_data)
|
||||
}
|
||||
|
||||
async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn timeline_detail_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let include_non_incremental_logical_size: Option<bool> =
|
||||
@@ -362,7 +415,10 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
|
||||
json_response(StatusCode::OK, timeline_info)
|
||||
}
|
||||
|
||||
async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn get_lsn_by_timestamp_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
@@ -386,11 +442,19 @@ async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response
|
||||
json_response(StatusCode::OK, result)
|
||||
}
|
||||
|
||||
// TODO: it makes sense to provide the tenant config right away, the same way it is handled in tenant_create
|
||||
async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn tenant_attach_handler(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let maybe_body: Option<TenantAttachRequest> = json_request_or_empty_body(&mut request).await?;
|
||||
let tenant_conf = match maybe_body {
|
||||
Some(request) => TenantConfOpt::try_from(&*request.config).map_err(ApiError::BadRequest)?,
|
||||
None => TenantConfOpt::default(),
|
||||
};
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||
|
||||
info!("Handling tenant attach {tenant_id}");
|
||||
@@ -401,9 +465,8 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
mgr::attach_tenant(
|
||||
state.conf,
|
||||
tenant_id,
|
||||
// XXX: Attach should provide the config, especially during tenant migration.
|
||||
// See https://github.com/neondatabase/neon/issues/1555
|
||||
TenantConfOpt::default(),
|
||||
tenant_conf,
|
||||
state.broker_client.clone(),
|
||||
remote_storage.clone(),
|
||||
&ctx,
|
||||
)
|
||||
@@ -418,7 +481,10 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
|
||||
async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn timeline_delete_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
@@ -429,10 +495,14 @@ async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body
|
||||
.instrument(info_span!("timeline_delete", tenant = %tenant_id, timeline = %timeline_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
// FIXME: this needs to be an error for the console to retry it. Ideally, Accepted should be used and retried until a 404.
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
|
||||
async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn tenant_detach_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
let detach_ignored: Option<bool> = parse_query_param(&request, "detach_ignored")?;
|
||||
@@ -446,21 +516,33 @@ async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn tenant_load_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn tenant_load_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||
|
||||
let state = get_state(&request);
|
||||
mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone(), &ctx)
|
||||
.instrument(info_span!("load", tenant = %tenant_id))
|
||||
.await?;
|
||||
mgr::load_tenant(
|
||||
state.conf,
|
||||
tenant_id,
|
||||
state.broker_client.clone(),
|
||||
state.remote_storage.clone(),
|
||||
&ctx,
|
||||
)
|
||||
.instrument(info_span!("load", tenant = %tenant_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
|
||||
async fn tenant_ignore_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn tenant_ignore_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
@@ -473,7 +555,10 @@ async fn tenant_ignore_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn tenant_list_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
|
||||
let response_data = mgr::list_tenants()
|
||||
@@ -493,7 +578,10 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
json_response(StatusCode::OK, response_data)
|
||||
}
|
||||
|
||||
async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn tenant_status(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
@@ -507,7 +595,7 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
|
||||
}
|
||||
|
||||
let state = tenant.current_state();
|
||||
Ok(TenantInfo {
|
||||
Result::<_, ApiError>::Ok(TenantInfo {
|
||||
id: tenant_id,
|
||||
state: state.clone(),
|
||||
current_physical_size: Some(current_physical_size),
|
||||
@@ -515,8 +603,7 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
|
||||
})
|
||||
}
|
||||
.instrument(info_span!("tenant_status_handler", tenant = %tenant_id))
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, tenant_info)
|
||||
}
|
||||
@@ -534,7 +621,10 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
|
||||
/// Note: we don't update the cached size and prometheus metric here.
|
||||
/// The retention period might be different, and it's nice to have a method to just calculate it
|
||||
/// without modifying anything anyway.
|
||||
async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn tenant_size_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
|
||||
@@ -599,7 +689,10 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
)
|
||||
}
|
||||
|
||||
async fn layer_map_info_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn layer_map_info_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let reset: LayerAccessStatsReset =
|
||||
@@ -613,7 +706,10 @@ async fn layer_map_info_handler(request: Request<Body>) -> Result<Response<Body>
|
||||
json_response(StatusCode::OK, layer_map_info)
|
||||
}
|
||||
|
||||
async fn layer_download_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn layer_download_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
@@ -636,7 +732,10 @@ async fn layer_download_handler(request: Request<Body>) -> Result<Response<Body>
|
||||
}
|
||||
}
|
||||
|
||||
async fn evict_timeline_layer_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn evict_timeline_layer_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
@@ -714,7 +813,10 @@ pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>,
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn tenant_create_handler(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let request_data: TenantCreateRequest = json_request(&mut request).await?;
|
||||
let target_tenant_id = request_data.new_tenant_id;
|
||||
check_permission(&request, Some(target_tenant_id))?;
|
||||
@@ -735,6 +837,7 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
state.conf,
|
||||
tenant_conf,
|
||||
target_tenant_id,
|
||||
state.broker_client.clone(),
|
||||
state.remote_storage.clone(),
|
||||
&ctx,
|
||||
)
|
||||
@@ -760,7 +863,10 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
)
|
||||
}
|
||||
|
||||
async fn get_tenant_config_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn get_tenant_config_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
@@ -786,6 +892,7 @@ async fn get_tenant_config_handler(request: Request<Body>) -> Result<Response<Bo
|
||||
|
||||
async fn update_tenant_config_handler(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let request_data: TenantConfigRequest = json_request(&mut request).await?;
|
||||
let tenant_id = request_data.tenant_id;
|
||||
@@ -803,21 +910,25 @@ async fn update_tenant_config_handler(
|
||||
}
|
||||
|
||||
/// Testing helper to transition a tenant to [`crate::tenant::TenantState::Broken`].
|
||||
#[cfg(feature = "testing")]
|
||||
async fn handle_tenant_break(r: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn handle_tenant_break(
|
||||
r: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?;
|
||||
|
||||
let tenant = crate::tenant::mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;
|
||||
|
||||
tenant.set_broken("broken from test".to_owned());
|
||||
tenant.set_broken("broken from test".to_owned()).await;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn failpoints_handler(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
if !fail::has_failpoints() {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"Cannot manage failpoints because pageserver was compiled without failpoints support"
|
||||
@@ -850,7 +961,10 @@ async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>
|
||||
}
|
||||
|
||||
// Run GC immediately on given timeline.
|
||||
async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn timeline_gc_handler(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
@@ -869,8 +983,10 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
|
||||
}
|
||||
|
||||
// Run compaction immediately on given timeline.
|
||||
#[cfg(feature = "testing")]
|
||||
async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn timeline_compact_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
@@ -891,8 +1007,10 @@ async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Bod
|
||||
}
|
||||
|
||||
// Run checkpoint immediately on given timeline.
|
||||
#[cfg(feature = "testing")]
|
||||
async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn timeline_checkpoint_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
@@ -916,6 +1034,7 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
|
||||
|
||||
async fn timeline_download_remote_layers_handler_post(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
@@ -931,6 +1050,7 @@ async fn timeline_download_remote_layers_handler_post(
|
||||
|
||||
async fn timeline_download_remote_layers_handler_get(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
@@ -954,7 +1074,10 @@ async fn active_timeline_of_active_tenant(
|
||||
.map_err(ApiError::NotFound)
|
||||
}
|
||||
|
||||
async fn always_panic_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn always_panic_handler(
|
||||
req: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
// Deliberately cause a panic to exercise the panic hook registered via std::panic::set_hook().
|
||||
// For pageserver, the relevant panic hook is `tracing_panic_hook`, and the `sentry` crate's wrapper around it.
|
||||
// Use catch_unwind to ensure that neither tokio nor hyper is distracted by our panic.
|
||||
@@ -965,7 +1088,10 @@ async fn always_panic_handler(req: Request<Body>) -> Result<Response<Body>, ApiE
|
||||
json_response(StatusCode::NO_CONTENT, ())
|
||||
}
|
||||
|
||||
async fn disk_usage_eviction_run(mut r: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn disk_usage_eviction_run(
|
||||
mut r: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&r, None)?;
|
||||
|
||||
#[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize)]
|
||||
@@ -1055,8 +1181,10 @@ async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
)
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
async fn post_tracing_event_handler(mut r: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn post_tracing_event_handler(
|
||||
mut r: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
#[derive(Debug, serde::Deserialize)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
enum Level {
|
||||
@@ -1086,10 +1214,90 @@ async fn post_tracing_event_handler(mut r: Request<Body>) -> Result<Response<Bod
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
/// Common functionality of all the HTTP API handlers.
|
||||
///
|
||||
/// - Adds a tracing span to each request (by `request_span`)
|
||||
/// - Logs the request depending on the request method (by `request_span`)
|
||||
/// - Logs the response if it was not successful (by `request_span`)
|
||||
/// - Shields the handler function from async cancellations. Hyper can drop the handler
|
||||
/// Future if the connection to the client is lost, but most of the pageserver code is
|
||||
/// not async cancellation safe. This converts the dropped future into a graceful cancellation
|
||||
/// request with a CancellationToken.
|
||||
async fn api_handler<R, H>(request: Request<Body>, handler: H) -> Result<Response<Body>, ApiError>
|
||||
where
|
||||
R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
|
||||
H: FnOnce(Request<Body>, CancellationToken) -> R + Send + Sync + 'static,
|
||||
{
|
||||
// Spawn a new task to handle the request, to protect the handler from unexpected
|
||||
// async cancellations. Most pageserver functions are not async cancellation safe.
|
||||
// We arm a drop-guard, so that if Hyper drops the Future, we signal the task
|
||||
// with the cancellation token.
|
||||
let token = CancellationToken::new();
|
||||
let cancel_guard = token.clone().drop_guard();
|
||||
let result = request_span(request, move |r| async {
|
||||
let handle = tokio::spawn(
|
||||
async {
|
||||
let token_cloned = token.clone();
|
||||
let result = handler(r, token).await;
|
||||
if token_cloned.is_cancelled() {
|
||||
info!("Cancelled request finished");
|
||||
}
|
||||
result
|
||||
}
|
||||
.in_current_span(),
|
||||
);
|
||||
|
||||
match handle.await {
|
||||
Ok(result) => result,
|
||||
Err(e) => {
|
||||
// The handler task panicked. We have a global panic handler that logs the
|
||||
// panic with its backtrace, so no need to log that here. Only log a brief
|
||||
// message to make it clear that we returned the error to the client.
|
||||
error!("HTTP request handler task panicked: {e:#}");
|
||||
|
||||
// Don't return an Error here, because then fallback error handler that was
|
||||
// installed in make_router() will print the error. Instead, construct the
|
||||
// HTTP error response and return that.
|
||||
Ok(
|
||||
ApiError::InternalServerError(anyhow!("HTTP request handler task panicked"))
|
||||
.into_response(),
|
||||
)
|
||||
}
|
||||
}
|
||||
})
|
||||
.await;
|
||||
|
||||
cancel_guard.disarm();
|
||||
|
||||
result
|
||||
}
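The shielding described in the doc comment boils down to spawning the real work and arming a drop guard on a CancellationToken. A minimal standalone sketch of that pattern (not the pageserver's api_handler itself):

// Minimal sketch of the cancellation-shielding idea: run the work in its own task;
// if the caller drops our future, the DropGuard cancels the token so the task can
// notice and wind down instead of being dropped mid-await.
use tokio_util::sync::CancellationToken;

async fn shielded<F, Fut, T>(work: F) -> T
where
    F: FnOnce(CancellationToken) -> Fut,
    Fut: std::future::Future<Output = T> + Send + 'static,
    T: Send + 'static,
{
    let token = CancellationToken::new();
    let guard = token.clone().drop_guard();
    let task = tokio::spawn(work(token));
    let result = task.await.expect("handler task panicked");
    guard.disarm();
    result
}

#[tokio::main]
async fn main() {
    let out = shielded(|cancel| async move {
        if cancel.is_cancelled() { "cancelled" } else { "done" }
    })
    .await;
    println!("{out}");
}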
|
||||
|
||||
/// Like api_handler, but returns an error response if the server is built without
|
||||
/// the 'testing' feature.
|
||||
async fn testing_api_handler<R, H>(
|
||||
desc: &str,
|
||||
request: Request<Body>,
|
||||
handler: H,
|
||||
) -> Result<Response<Body>, ApiError>
|
||||
where
|
||||
R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
|
||||
H: FnOnce(Request<Body>, CancellationToken) -> R + Send + Sync + 'static,
|
||||
{
|
||||
if cfg!(feature = "testing") {
|
||||
api_handler(request, handler).await
|
||||
} else {
|
||||
std::future::ready(Err(ApiError::BadRequest(anyhow!(
|
||||
"Cannot {desc} because pageserver was compiled without testing APIs",
|
||||
))))
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
pub fn make_router(
|
||||
conf: &'static PageServerConf,
|
||||
launch_ts: &'static LaunchTimestamp,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
broker_client: BrokerClientChannel,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
||||
) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
|
||||
@@ -1114,121 +1322,99 @@ pub fn make_router(
|
||||
.expect("construct launch timestamp header middleware"),
|
||||
);
|
||||
|
||||
macro_rules! testing_api {
|
||||
($handler_desc:literal, $handler:path $(,)?) => {{
|
||||
#[cfg(not(feature = "testing"))]
|
||||
async fn cfg_disabled(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
Err(ApiError::BadRequest(anyhow!(concat!(
|
||||
"Cannot ",
|
||||
$handler_desc,
|
||||
" because pageserver was compiled without testing APIs",
|
||||
))))
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
let handler = $handler;
|
||||
#[cfg(not(feature = "testing"))]
|
||||
let handler = cfg_disabled;
|
||||
|
||||
move |r| RequestSpan(handler).handle(r)
|
||||
}};
|
||||
}
|
||||
|
||||
Ok(router
|
||||
.data(Arc::new(
|
||||
State::new(conf, auth, remote_storage, disk_usage_eviction_state)
|
||||
.context("Failed to initialize router state")?,
|
||||
State::new(
|
||||
conf,
|
||||
auth,
|
||||
remote_storage,
|
||||
broker_client,
|
||||
disk_usage_eviction_state,
|
||||
)
|
||||
.context("Failed to initialize router state")?,
|
||||
))
|
||||
.get("/v1/status", |r| RequestSpan(status_handler).handle(r))
|
||||
.put(
|
||||
"/v1/failpoints",
|
||||
testing_api!("manage failpoints", failpoints_handler),
|
||||
)
|
||||
.get("/v1/tenant", |r| RequestSpan(tenant_list_handler).handle(r))
|
||||
.post("/v1/tenant", |r| {
|
||||
RequestSpan(tenant_create_handler).handle(r)
|
||||
})
|
||||
.get("/v1/tenant/:tenant_id", |r| {
|
||||
RequestSpan(tenant_status).handle(r)
|
||||
.get("/v1/status", |r| api_handler(r, status_handler))
|
||||
.put("/v1/failpoints", |r| {
|
||||
testing_api_handler("manage failpoints", r, failpoints_handler)
|
||||
})
|
||||
.get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
|
||||
.post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
|
||||
.get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
|
||||
.get("/v1/tenant/:tenant_id/synthetic_size", |r| {
|
||||
RequestSpan(tenant_size_handler).handle(r)
|
||||
api_handler(r, tenant_size_handler)
|
||||
})
|
||||
.put("/v1/tenant/config", |r| {
|
||||
RequestSpan(update_tenant_config_handler).handle(r)
|
||||
api_handler(r, update_tenant_config_handler)
|
||||
})
|
||||
.get("/v1/tenant/:tenant_id/config", |r| {
|
||||
RequestSpan(get_tenant_config_handler).handle(r)
|
||||
api_handler(r, get_tenant_config_handler)
|
||||
})
|
||||
.get("/v1/tenant/:tenant_id/timeline", |r| {
|
||||
RequestSpan(timeline_list_handler).handle(r)
|
||||
api_handler(r, timeline_list_handler)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_id/timeline", |r| {
|
||||
RequestSpan(timeline_create_handler).handle(r)
|
||||
api_handler(r, timeline_create_handler)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_id/attach", |r| {
|
||||
RequestSpan(tenant_attach_handler).handle(r)
|
||||
api_handler(r, tenant_attach_handler)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_id/detach", |r| {
|
||||
RequestSpan(tenant_detach_handler).handle(r)
|
||||
api_handler(r, tenant_detach_handler)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_id/load", |r| {
|
||||
RequestSpan(tenant_load_handler).handle(r)
|
||||
api_handler(r, tenant_load_handler)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_id/ignore", |r| {
|
||||
RequestSpan(tenant_ignore_handler).handle(r)
|
||||
api_handler(r, tenant_ignore_handler)
|
||||
})
|
||||
.get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
|
||||
RequestSpan(timeline_detail_handler).handle(r)
|
||||
api_handler(r, timeline_detail_handler)
|
||||
})
|
||||
.get(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp",
|
||||
|r| RequestSpan(get_lsn_by_timestamp_handler).handle(r),
|
||||
|r| api_handler(r, get_lsn_by_timestamp_handler),
|
||||
)
|
||||
.put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| {
|
||||
RequestSpan(timeline_gc_handler).handle(r)
|
||||
api_handler(r, timeline_gc_handler)
|
||||
})
|
||||
.put("/v1/tenant/:tenant_id/timeline/:timeline_id/compact", |r| {
|
||||
testing_api_handler("run timeline compaction", r, timeline_compact_handler)
|
||||
})
|
||||
.put(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/compact",
|
||||
testing_api!("run timeline compaction", timeline_compact_handler),
|
||||
)
|
||||
.put(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint",
|
||||
testing_api!("run timeline checkpoint", timeline_checkpoint_handler),
|
||||
|r| testing_api_handler("run timeline checkpoint", r, timeline_checkpoint_handler),
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
|
||||
|r| RequestSpan(timeline_download_remote_layers_handler_post).handle(r),
|
||||
|r| api_handler(r, timeline_download_remote_layers_handler_post),
|
||||
)
|
||||
.get(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
|
||||
|r| RequestSpan(timeline_download_remote_layers_handler_get).handle(r),
|
||||
|r| api_handler(r, timeline_download_remote_layers_handler_get),
|
||||
)
|
||||
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
|
||||
RequestSpan(timeline_delete_handler).handle(r)
|
||||
api_handler(r, timeline_delete_handler)
|
||||
})
|
||||
.get("/v1/tenant/:tenant_id/timeline/:timeline_id/layer", |r| {
|
||||
RequestSpan(layer_map_info_handler).handle(r)
|
||||
api_handler(r, layer_map_info_handler)
|
||||
})
|
||||
.get(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
|
||||
|r| RequestSpan(layer_download_handler).handle(r),
|
||||
|r| api_handler(r, layer_download_handler),
|
||||
)
|
||||
.delete(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
|
||||
|r| RequestSpan(evict_timeline_layer_handler).handle(r),
|
||||
|r| api_handler(r, evict_timeline_layer_handler),
|
||||
)
|
||||
.put("/v1/disk_usage_eviction/run", |r| {
|
||||
RequestSpan(disk_usage_eviction_run).handle(r)
|
||||
api_handler(r, disk_usage_eviction_run)
|
||||
})
|
||||
.put("/v1/tenant/:tenant_id/break", |r| {
|
||||
testing_api_handler("set tenant state to broken", r, handle_tenant_break)
|
||||
})
|
||||
.get("/v1/panic", |r| api_handler(r, always_panic_handler))
|
||||
.post("/v1/tracing/event", |r| {
|
||||
testing_api_handler("emit a tracing event", r, post_tracing_event_handler)
|
||||
})
|
||||
.put(
|
||||
"/v1/tenant/:tenant_id/break",
|
||||
testing_api!("set tenant state to broken", handle_tenant_break),
|
||||
)
|
||||
.get("/v1/panic", |r| RequestSpan(always_panic_handler).handle(r))
|
||||
.post(
|
||||
"/v1/tracing/event",
|
||||
testing_api!("emit a tracing event", post_tracing_event_handler),
|
||||
)
|
||||
.any(handler_404))
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@ use std::ops::Range;
|
||||
///
|
||||
/// Represents a set of Keys, in a compact form.
|
||||
///
|
||||
#[derive(Clone, Debug)]
|
||||
#[derive(Clone, Debug, Default)]
|
||||
pub struct KeySpace {
|
||||
/// Contiguous ranges of keys that belong to the key space. In key order,
|
||||
/// and with no overlap.
|
||||
@@ -61,6 +61,18 @@ impl KeySpace {
|
||||
|
||||
KeyPartitioning { parts }
|
||||
}
|
||||
|
||||
///
|
||||
/// Check whether the key space overlaps the given range
|
||||
///
|
||||
pub fn overlaps(&self, range: &Range<Key>) -> bool {
|
||||
match self.ranges.binary_search_by_key(&range.end, |r| r.start) {
|
||||
Ok(0) => false,
|
||||
Err(0) => false,
|
||||
Ok(index) => self.ranges[index - 1].end > range.start,
|
||||
Err(index) => self.ranges[index - 1].end > range.start,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
@@ -129,3 +141,226 @@ impl KeySpaceAccum {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// A helper object, to collect a set of keys and key ranges into a KeySpace
|
||||
/// object. Key ranges may be inserted in any order and can overlap.
|
||||
///
|
||||
#[derive(Clone, Debug, Default)]
|
||||
pub struct KeySpaceRandomAccum {
|
||||
ranges: Vec<Range<Key>>,
|
||||
}
|
||||
|
||||
impl KeySpaceRandomAccum {
|
||||
pub fn new() -> Self {
|
||||
Self { ranges: Vec::new() }
|
||||
}
|
||||
|
||||
pub fn add_key(&mut self, key: Key) {
|
||||
self.add_range(singleton_range(key))
|
||||
}
|
||||
|
||||
pub fn add_range(&mut self, range: Range<Key>) {
|
||||
self.ranges.push(range);
|
||||
}
|
||||
|
||||
pub fn to_keyspace(mut self) -> KeySpace {
|
||||
let mut ranges = Vec::new();
|
||||
if !self.ranges.is_empty() {
|
||||
self.ranges.sort_by_key(|r| r.start);
|
||||
let mut start = self.ranges.first().unwrap().start;
|
||||
let mut end = self.ranges.first().unwrap().end;
|
||||
for r in self.ranges {
|
||||
assert!(r.start >= start);
|
||||
if r.start > end {
|
||||
ranges.push(start..end);
|
||||
start = r.start;
|
||||
end = r.end;
|
||||
} else if r.end > end {
|
||||
end = r.end;
|
||||
}
|
||||
}
|
||||
ranges.push(start..end);
|
||||
}
|
||||
KeySpace { ranges }
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::fmt::Write;
|
||||
|
||||
// Helper function to create a key range.
|
||||
//
|
||||
// Make the tests below less verbose.
|
||||
fn kr(irange: Range<i128>) -> Range<Key> {
|
||||
Key::from_i128(irange.start)..Key::from_i128(irange.end)
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
fn dump_keyspace(ks: &KeySpace) {
|
||||
for r in ks.ranges.iter() {
|
||||
println!(" {}..{}", r.start.to_i128(), r.end.to_i128());
|
||||
}
|
||||
}
|
||||
|
||||
fn assert_ks_eq(actual: &KeySpace, expected: Vec<Range<Key>>) {
|
||||
if actual.ranges != expected {
|
||||
let mut msg = String::new();
|
||||
|
||||
writeln!(msg, "expected:").unwrap();
|
||||
for r in &expected {
|
||||
writeln!(msg, " {}..{}", r.start.to_i128(), r.end.to_i128()).unwrap();
|
||||
}
|
||||
writeln!(msg, "got:").unwrap();
|
||||
for r in &actual.ranges {
|
||||
writeln!(msg, " {}..{}", r.start.to_i128(), r.end.to_i128()).unwrap();
|
||||
}
|
||||
panic!("{}", msg);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn keyspace_add_range() {
|
||||
// two separate ranges
|
||||
//
|
||||
// #####
|
||||
// #####
|
||||
let mut ks = KeySpaceRandomAccum::default();
|
||||
ks.add_range(kr(0..10));
|
||||
ks.add_range(kr(20..30));
|
||||
assert_ks_eq(&ks.to_keyspace(), vec![kr(0..10), kr(20..30)]);
|
||||
|
||||
// two separate ranges, added in reverse order
|
||||
//
|
||||
// #####
|
||||
// #####
|
||||
let mut ks = KeySpaceRandomAccum::default();
|
||||
ks.add_range(kr(20..30));
|
||||
ks.add_range(kr(0..10));
|
||||
|
||||
// add range that is adjacent to the end of an existing range
|
||||
//
|
||||
// #####
|
||||
// #####
|
||||
ks.add_range(kr(0..10));
|
||||
ks.add_range(kr(10..30));
|
||||
assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
|
||||
|
||||
// add range that is adjacent to the start of an existing range
|
||||
//
|
||||
// #####
|
||||
// #####
|
||||
let mut ks = KeySpaceRandomAccum::default();
|
||||
ks.add_range(kr(10..30));
|
||||
ks.add_range(kr(0..10));
|
||||
assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
|
||||
|
||||
// add range that overlaps with the end of an existing range
|
||||
//
|
||||
// #####
|
||||
// #####
|
||||
let mut ks = KeySpaceRandomAccum::default();
|
||||
ks.add_range(kr(0..10));
|
||||
ks.add_range(kr(5..30));
|
||||
assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
|
||||
|
||||
// add range that overlaps with the start of an existing range
|
||||
//
|
||||
// #####
|
||||
// #####
|
||||
let mut ks = KeySpaceRandomAccum::default();
|
||||
ks.add_range(kr(5..30));
|
||||
ks.add_range(kr(0..10));
|
||||
assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
|
||||
|
||||
// add range that is fully covered by an existing range
|
||||
//
|
||||
// #########
|
||||
// #####
|
||||
let mut ks = KeySpaceRandomAccum::default();
|
||||
ks.add_range(kr(0..30));
|
||||
ks.add_range(kr(10..20));
|
||||
assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
|
||||
|
||||
// add range that extends an existing range from both ends
|
||||
//
|
||||
// #####
|
||||
// #########
|
||||
let mut ks = KeySpaceRandomAccum::default();
|
||||
ks.add_range(kr(10..20));
|
||||
ks.add_range(kr(0..30));
|
||||
assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
|
||||
|
||||
// add a range that overlaps with two existing ranges, joining them
|
||||
//
|
||||
// ##### #####
|
||||
// #######
|
||||
let mut ks = KeySpaceRandomAccum::default();
|
||||
ks.add_range(kr(0..10));
|
||||
ks.add_range(kr(20..30));
|
||||
ks.add_range(kr(5..25));
|
||||
assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn keyspace_overlaps() {
|
||||
let mut ks = KeySpaceRandomAccum::default();
|
||||
ks.add_range(kr(10..20));
|
||||
ks.add_range(kr(30..40));
|
||||
let ks = ks.to_keyspace();
|
||||
|
||||
// ##### #####
|
||||
// xxxx
|
||||
assert!(!ks.overlaps(&kr(0..5)));
|
||||
|
||||
// ##### #####
|
||||
// xxxx
|
||||
assert!(!ks.overlaps(&kr(5..9)));
|
||||
|
||||
// ##### #####
|
||||
// xxxx
|
||||
assert!(!ks.overlaps(&kr(5..10)));
|
||||
|
||||
// ##### #####
|
||||
// xxxx
|
||||
assert!(ks.overlaps(&kr(5..11)));
|
||||
|
||||
// ##### #####
|
||||
// xxxx
|
||||
assert!(ks.overlaps(&kr(10..15)));
|
||||
|
||||
// ##### #####
|
||||
// xxxx
|
||||
assert!(ks.overlaps(&kr(15..20)));
|
||||
|
||||
// ##### #####
|
||||
// xxxx
|
||||
assert!(ks.overlaps(&kr(15..25)));
|
||||
|
||||
// ##### #####
|
||||
// xxxx
|
||||
assert!(!ks.overlaps(&kr(22..28)));
|
||||
|
||||
// ##### #####
|
||||
// xxxx
|
||||
assert!(!ks.overlaps(&kr(25..30)));
|
||||
|
||||
// ##### #####
|
||||
// xxxx
|
||||
assert!(ks.overlaps(&kr(35..35)));
|
||||
|
||||
// ##### #####
|
||||
// xxxx
|
||||
assert!(!ks.overlaps(&kr(40..45)));
|
||||
|
||||
// ##### #####
|
||||
// xxxx
|
||||
assert!(!ks.overlaps(&kr(45..50)));
|
||||
|
||||
// ##### #####
|
||||
// xxxxxxxxxxx
|
||||
assert!(ks.overlaps(&kr(0..30))); // XXXXX This fails currently!
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
mod auth;
|
||||
pub mod basebackup;
|
||||
pub mod broker_client;
|
||||
pub mod config;
|
||||
pub mod consumption_metrics;
|
||||
pub mod context;
|
||||
@@ -36,7 +35,7 @@ use tracing::info;
|
||||
/// backwards-compatible changes to the metadata format.
|
||||
pub const STORAGE_FORMAT_VERSION: u16 = 3;
|
||||
|
||||
pub const DEFAULT_PG_VERSION: u32 = 14;
|
||||
pub const DEFAULT_PG_VERSION: u32 = 15;
|
||||
|
||||
// Magic constants used to identify different kinds of files
|
||||
pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
|
||||
@@ -46,6 +45,7 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
|
||||
|
||||
pub use crate::metrics::preinitialize_metrics;
|
||||
|
||||
#[tracing::instrument]
|
||||
pub async fn shutdown_pageserver(exit_code: i32) {
|
||||
// Shut down the libpq endpoint task. This prevents new connections from
|
||||
// being accepted.
|
||||
@@ -58,12 +58,6 @@ pub async fn shutdown_pageserver(exit_code: i32) {
|
||||
// the checkpoint and GC tasks.
|
||||
tenant::mgr::shutdown_all_tenants().await;
|
||||
|
||||
// Stop syncing with remote storage.
|
||||
//
|
||||
// FIXME: Does this wait for the sync tasks to finish syncing what's queued up?
|
||||
// Should it?
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::RemoteUploadTask), None, None).await;
|
||||
|
||||
// Shut down the HTTP endpoint last, so that you can still check the server's
|
||||
// status while it's shutting down.
|
||||
// FIXME: We should probably stop accepting commands like attach/detach earlier.
|
||||
@@ -138,6 +132,29 @@ pub fn is_uninit_mark(path: &Path) -> bool {
|
||||
}
|
||||
}
|
||||
|
||||
/// During pageserver startup, we need to order operations not to exhaust tokio worker threads by
|
||||
/// blocking.
|
||||
///
|
||||
/// Instances of this value exist only during startup; otherwise `None` is provided, meaning no
|
||||
/// delaying is needed.
|
||||
#[derive(Clone)]
|
||||
pub struct InitializationOrder {
|
||||
/// Each initial tenant load task carries this until completion.
|
||||
pub initial_tenant_load: Option<utils::completion::Completion>,
|
||||
|
||||
/// Barrier for when we can start initial logical size calculations.
|
||||
pub initial_logical_size_can_start: utils::completion::Barrier,
|
||||
|
||||
/// Each timeline owns a clone of this to be consumed on the initial logical size calculation
|
||||
/// attempt. It is important to drop this once the attempt has completed.
|
||||
pub initial_logical_size_attempt: utils::completion::Completion,
|
||||
|
||||
/// Barrier for when we can start any background jobs.
|
||||
///
|
||||
/// This can be broken up later on, but right now there is just one class of a background job.
|
||||
pub background_jobs_can_start: utils::completion::Barrier,
|
||||
}
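The ordering described above can be pictured with plain tokio primitives: each initial load holds a completion handle, and the background-jobs barrier opens once every handle is dropped. A rough analogue, not the utils::completion API itself:

// Rough analogue of the startup ordering above, using an mpsc channel instead of
// the crate's utils::completion types: every "initial load" holds a Sender, and
// recv() returning None acts as the barrier once all Senders are dropped.
use tokio::sync::mpsc;

#[tokio::main]
async fn main() {
    let (completion, mut barrier) = mpsc::channel::<()>(1);
    for tenant in 0..3 {
        let held = completion.clone();
        tokio::spawn(async move {
            // ... initial load work for this tenant would run here ...
            println!("tenant {tenant} loaded");
            drop(held); // releasing the completion
        });
    }
    drop(completion); // drop the original handle as well
    // Barrier: recv() yields None only after every Sender has been dropped.
    let _ = barrier.recv().await;
    println!("background jobs can start");
}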
|
||||
|
||||
#[cfg(test)]
|
||||
mod backoff_defaults_tests {
|
||||
use super::*;
|
||||
|
||||
@@ -84,6 +84,16 @@ pub static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static READ_NUM_FS_LAYERS: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_read_num_fs_layers",
|
||||
"Number of persistent layers accessed for processing a read request, including those in the cache",
|
||||
&["tenant_id", "timeline_id"],
|
||||
vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// Metrics collected on operations on the storage repository.
|
||||
static RECONSTRUCT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
@@ -95,6 +105,25 @@ static RECONSTRUCT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_materialized_cache_hits_direct_total",
|
||||
"Number of cache hits from materialized page cache without redo",
|
||||
&["tenant_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static GET_RECONSTRUCT_DATA_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_getpage_get_reconstruct_data_seconds",
|
||||
"Time spent in get_reconstruct_value_data",
|
||||
&["tenant_id", "timeline_id"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_materialized_cache_hits_total",
|
||||
@@ -354,6 +383,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
|
||||
0.001000, // 1000 usec
|
||||
0.030, // 30 ms
|
||||
1.000, // 1000 ms
|
||||
30.000, // 30000 ms
|
||||
];
|
||||
|
||||
const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[
|
||||
@@ -622,7 +652,7 @@ pub static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
pub static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_wal_redo_wait_seconds",
|
||||
"Time spent waiting for access to the WAL redo process",
|
||||
"Time spent waiting for access to the Postgres WAL redo process",
|
||||
redo_histogram_time_buckets!(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
@@ -631,7 +661,7 @@ pub static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_wal_redo_records_histogram",
|
||||
"Histogram of number of records replayed per redo",
|
||||
"Histogram of number of records replayed per redo in the Postgres WAL redo process",
|
||||
redo_histogram_count_buckets!(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
@@ -640,7 +670,7 @@ pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
pub static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_wal_redo_bytes_histogram",
|
||||
"Histogram of number of records replayed per redo",
|
||||
"Histogram of number of records replayed per redo sent to Postgres",
|
||||
redo_bytes_histogram_count_buckets!(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
@@ -723,7 +753,9 @@ pub struct TimelineMetrics {
|
||||
tenant_id: String,
|
||||
timeline_id: String,
|
||||
pub reconstruct_time_histo: Histogram,
|
||||
pub get_reconstruct_data_time_histo: Histogram,
|
||||
pub materialized_page_cache_hit_counter: GenericCounter<AtomicU64>,
|
||||
pub materialized_page_cache_hit_upon_request_counter: GenericCounter<AtomicU64>,
|
||||
pub flush_time_histo: StorageTimeMetrics,
|
||||
pub compact_time_histo: StorageTimeMetrics,
|
||||
pub create_images_time_histo: StorageTimeMetrics,
|
||||
@@ -734,6 +766,7 @@ pub struct TimelineMetrics {
|
||||
pub last_record_gauge: IntGauge,
|
||||
pub wait_lsn_time_histo: Histogram,
|
||||
pub resident_physical_size_gauge: UIntGauge,
|
||||
pub read_num_fs_layers: Histogram,
|
||||
/// copy of LayeredTimeline.current_logical_size
|
||||
pub current_logical_size_gauge: UIntGauge,
|
||||
pub num_persistent_files_created: IntCounter,
|
||||
@@ -753,6 +786,9 @@ impl TimelineMetrics {
|
||||
let reconstruct_time_histo = RECONSTRUCT_TIME
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let get_reconstruct_data_time_histo = GET_RECONSTRUCT_DATA_TIME
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
@@ -794,6 +830,12 @@ impl TimelineMetrics {
|
||||
let evictions = EVICTIONS
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let read_num_fs_layers = READ_NUM_FS_LAYERS
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let materialized_page_cache_hit_upon_request_counter = MATERIALIZED_PAGE_CACHE_HIT_DIRECT
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let evictions_with_low_residence_duration =
|
||||
evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);
|
||||
|
||||
@@ -801,7 +843,9 @@ impl TimelineMetrics {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
reconstruct_time_histo,
|
||||
get_reconstruct_data_time_histo,
|
||||
materialized_page_cache_hit_counter,
|
||||
materialized_page_cache_hit_upon_request_counter,
|
||||
flush_time_histo,
|
||||
compact_time_histo,
|
||||
create_images_time_histo,
|
||||
@@ -819,6 +863,7 @@ impl TimelineMetrics {
|
||||
evictions_with_low_residence_duration: std::sync::RwLock::new(
|
||||
evictions_with_low_residence_duration,
|
||||
),
|
||||
read_num_fs_layers,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -828,7 +873,9 @@ impl Drop for TimelineMetrics {
|
||||
let tenant_id = &self.tenant_id;
|
||||
let timeline_id = &self.timeline_id;
|
||||
let _ = RECONSTRUCT_TIME.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = GET_RECONSTRUCT_DATA_TIME.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = MATERIALIZED_PAGE_CACHE_HIT.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = MATERIALIZED_PAGE_CACHE_HIT_DIRECT.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
|
||||
@@ -836,6 +883,8 @@ impl Drop for TimelineMetrics {
|
||||
let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = READ_NUM_FS_LAYERS.remove_label_values(&[tenant_id, timeline_id]);
|
||||
|
||||
self.evictions_with_low_residence_duration
|
||||
.write()
|
||||
.unwrap()
|
||||
|
||||
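The hunks above add new per-timeline metrics and extend TimelineMetrics and its Drop impl accordingly. As a rough, self-contained sketch of that register/lookup/remove lifecycle, assuming the prometheus and once_cell crates already used in this file (the metric name and struct below are made up for illustration, not the pageserver's actual types):

    use once_cell::sync::Lazy;
    use prometheus::{register_int_counter_vec, IntCounter, IntCounterVec};

    // Hypothetical per-tenant/per-timeline counter, registered once globally.
    static EXAMPLE_HITS: Lazy<IntCounterVec> = Lazy::new(|| {
        register_int_counter_vec!(
            "example_cache_hits_total",
            "Number of cache hits, labelled per tenant and timeline",
            &["tenant_id", "timeline_id"]
        )
        .expect("failed to define a metric")
    });

    struct ExampleTimelineMetrics {
        tenant_id: String,
        timeline_id: String,
        hits: IntCounter,
    }

    impl ExampleTimelineMetrics {
        fn new(tenant_id: &str, timeline_id: &str) -> Self {
            // Resolve the labelled child once so hot paths avoid label lookups.
            let hits = EXAMPLE_HITS
                .get_metric_with_label_values(&[tenant_id, timeline_id])
                .unwrap();
            Self {
                tenant_id: tenant_id.to_owned(),
                timeline_id: timeline_id.to_owned(),
                hits,
            }
        }
    }

    impl Drop for ExampleTimelineMetrics {
        fn drop(&mut self) {
            // Drop the labelled series so a deleted timeline stops being exported.
            let _ = EXAMPLE_HITS.remove_label_values(&[&self.tenant_id, &self.timeline_id]);
        }
    }

    fn main() {
        let metrics = ExampleTimelineMetrics::new("tenant-a", "timeline-1");
        metrics.hits.inc();
        drop(metrics);
    }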
@@ -50,7 +50,9 @@ use crate::import_datadir::import_wal_from_tar;
|
||||
use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant;
|
||||
use crate::tenant::mgr;
|
||||
use crate::tenant::mgr::GetTenantError;
|
||||
use crate::tenant::{Tenant, Timeline};
|
||||
use crate::trace::Tracer;
|
||||
|
||||
@@ -172,6 +174,7 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()
|
||||
///
|
||||
pub async fn libpq_listener_main(
|
||||
conf: &'static PageServerConf,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
listener: TcpListener,
|
||||
auth_type: AuthType,
|
||||
@@ -213,7 +216,14 @@ pub async fn libpq_listener_main(
|
||||
None,
|
||||
"serving compute connection task",
|
||||
false,
|
||||
page_service_conn_main(conf, local_auth, socket, auth_type, connection_ctx),
|
||||
page_service_conn_main(
|
||||
conf,
|
||||
broker_client.clone(),
|
||||
local_auth,
|
||||
socket,
|
||||
auth_type,
|
||||
connection_ctx,
|
||||
),
|
||||
);
|
||||
}
|
||||
Err(err) => {
|
||||
@@ -230,6 +240,7 @@ pub async fn libpq_listener_main(
|
||||
|
||||
async fn page_service_conn_main(
|
||||
conf: &'static PageServerConf,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
socket: tokio::net::TcpStream,
|
||||
auth_type: AuthType,
|
||||
@@ -266,7 +277,7 @@ async fn page_service_conn_main(
|
||||
// and create a child per-query context when it invokes process_query.
|
||||
// But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
|
||||
// and create the per-query context in process_query ourselves.
|
||||
let mut conn_handler = PageServerHandler::new(conf, auth, connection_ctx);
|
||||
let mut conn_handler = PageServerHandler::new(conf, broker_client, auth, connection_ctx);
|
||||
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
|
||||
|
||||
match pgbackend
|
||||
@@ -324,6 +335,7 @@ impl PageRequestMetrics {
|
||||
|
||||
struct PageServerHandler {
|
||||
_conf: &'static PageServerConf,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
claims: Option<Claims>,
|
||||
|
||||
@@ -337,11 +349,13 @@ struct PageServerHandler {
|
||||
impl PageServerHandler {
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
connection_ctx: RequestContext,
|
||||
) -> Self {
|
||||
PageServerHandler {
|
||||
_conf: conf,
|
||||
broker_client,
|
||||
auth,
|
||||
claims: None,
|
||||
connection_ctx,
|
||||
@@ -494,7 +508,12 @@ impl PageServerHandler {
|
||||
|
||||
let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb)));
|
||||
timeline
|
||||
.import_basebackup_from_tar(&mut copyin_reader, base_lsn, &ctx)
|
||||
.import_basebackup_from_tar(
|
||||
&mut copyin_reader,
|
||||
base_lsn,
|
||||
self.broker_client.clone(),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Read the end of the tar archive.
|
||||
@@ -1131,7 +1150,9 @@ enum GetActiveTenantError {
|
||||
wait_time: Duration,
|
||||
},
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
NotFound(GetTenantError),
|
||||
#[error(transparent)]
|
||||
WaitTenantActive(tenant::WaitToBecomeActiveError),
|
||||
}
|
||||
|
||||
impl From<GetActiveTenantError> for QueryError {
|
||||
@@ -1140,7 +1161,8 @@ impl From<GetActiveTenantError> for QueryError {
|
||||
GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
|
||||
ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
|
||||
),
|
||||
GetActiveTenantError::Other(e) => QueryError::Other(e),
|
||||
GetActiveTenantError::WaitTenantActive(e) => QueryError::Other(anyhow::Error::new(e)),
|
||||
GetActiveTenantError::NotFound(e) => QueryError::Other(anyhow::Error::new(e)),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1156,13 +1178,16 @@ async fn get_active_tenant_with_timeout(
|
||||
) -> Result<Arc<Tenant>, GetActiveTenantError> {
|
||||
let tenant = match mgr::get_tenant(tenant_id, false).await {
|
||||
Ok(tenant) => tenant,
|
||||
Err(e) => return Err(GetActiveTenantError::Other(e.into())),
|
||||
Err(e @ GetTenantError::NotFound(_)) => return Err(GetActiveTenantError::NotFound(e)),
|
||||
Err(GetTenantError::NotActive(_)) => {
|
||||
unreachable!("we're calling get_tenant with active=false")
|
||||
}
|
||||
};
|
||||
let wait_time = Duration::from_secs(30);
|
||||
match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await {
|
||||
Ok(Ok(())) => Ok(tenant),
|
||||
// no .context(), the error message is good enough and some tests depend on it
|
||||
Ok(Err(wait_error)) => Err(GetActiveTenantError::Other(wait_error)),
|
||||
Ok(Err(e)) => Err(GetActiveTenantError::WaitTenantActive(e)),
|
||||
Err(_) => {
|
||||
let latest_state = tenant.current_state();
|
||||
if latest_state == TenantState::Active {
|
||||
@@ -1177,13 +1202,34 @@ async fn get_active_tenant_with_timeout(
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
enum GetActiveTimelineError {
|
||||
#[error(transparent)]
|
||||
Tenant(GetActiveTenantError),
|
||||
#[error(transparent)]
|
||||
Timeline(anyhow::Error),
|
||||
}
|
||||
|
||||
impl From<GetActiveTimelineError> for QueryError {
|
||||
fn from(e: GetActiveTimelineError) -> Self {
|
||||
match e {
|
||||
GetActiveTimelineError::Tenant(e) => e.into(),
|
||||
GetActiveTimelineError::Timeline(e) => QueryError::Other(e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Shorthand for getting a reference to a Timeline of an Active tenant.
|
||||
async fn get_active_tenant_timeline(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Arc<Timeline>, GetActiveTenantError> {
|
||||
let tenant = get_active_tenant_with_timeout(tenant_id, ctx).await?;
|
||||
let timeline = tenant.get_timeline(timeline_id, true)?;
|
||||
) -> Result<Arc<Timeline>, GetActiveTimelineError> {
|
||||
let tenant = get_active_tenant_with_timeout(tenant_id, ctx)
|
||||
.await
|
||||
.map_err(GetActiveTimelineError::Tenant)?;
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(GetActiveTimelineError::Timeline)?;
|
||||
Ok(timeline)
|
||||
}
|
||||
|
||||
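The page_service changes above replace broad anyhow errors with layered thiserror enums (GetTenantError, GetActiveTenantError, GetActiveTimelineError) so callers can still match on the failure kind. A minimal sketch of that wrapping pattern, using stand-in tenant and timeline types rather than the real pageserver ones:

    use std::sync::Arc;

    // Stand-ins for the pageserver's Tenant and Timeline types.
    struct Tenant;
    struct Timeline;

    #[derive(Debug, thiserror::Error)]
    enum GetActiveTenantError {
        #[error("tenant not found")]
        NotFound,
        #[error(transparent)]
        Other(#[from] anyhow::Error),
    }

    #[derive(Debug, thiserror::Error)]
    enum GetActiveTimelineError {
        #[error(transparent)]
        Tenant(GetActiveTenantError),
        #[error(transparent)]
        Timeline(anyhow::Error),
    }

    fn get_tenant() -> Result<Arc<Tenant>, GetActiveTenantError> {
        // Pretend the tenant map does not know this tenant.
        Err(GetActiveTenantError::NotFound)
    }

    fn get_timeline(_tenant: &Tenant) -> anyhow::Result<Arc<Timeline>> {
        Ok(Arc::new(Timeline))
    }

    // Each layer wraps the inner error in its own variant instead of flattening
    // everything into anyhow, so callers can still distinguish failure kinds.
    fn get_active_tenant_timeline() -> Result<Arc<Timeline>, GetActiveTimelineError> {
        let tenant = get_tenant().map_err(GetActiveTimelineError::Tenant)?;
        let timeline = get_timeline(&tenant).map_err(GetActiveTimelineError::Timeline)?;
        Ok(timeline)
    }

    fn main() {
        match get_active_tenant_timeline() {
            Ok(_) => println!("got timeline"),
            Err(GetActiveTimelineError::Tenant(e)) => println!("tenant error: {e}"),
            Err(GetActiveTimelineError::Timeline(e)) => println!("timeline error: {e}"),
        }
    }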
@@ -1600,9 +1600,7 @@ pub fn create_test_timeline(
|
||||
pg_version: u32,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<std::sync::Arc<Timeline>> {
|
||||
let tline = tenant
|
||||
.create_empty_timeline(timeline_id, Lsn(8), pg_version, ctx)?
|
||||
.initialize(ctx)?;
|
||||
let tline = tenant.create_test_timeline(timeline_id, Lsn(8), pg_version, ctx)?;
|
||||
let mut m = tline.begin_modification(Lsn(8));
|
||||
m.init_empty()?;
|
||||
m.commit()?;
|
||||
|
||||
@@ -257,6 +257,9 @@ pub enum TaskKind {
|
||||
// task that handles attaching a tenant
|
||||
Attach,
|
||||
|
||||
// Used mostly for background deletion from s3
|
||||
TimelineDeletionWorker,
|
||||
|
||||
// task that handles metrics collection
|
||||
MetricsCollection,
|
||||
|
||||
@@ -476,18 +479,35 @@ pub async fn shutdown_tasks(
|
||||
&& (timeline_id.is_none() || task_mut.timeline_id == timeline_id)
|
||||
{
|
||||
task.cancel.cancel();
|
||||
victim_tasks.push(Arc::clone(task));
|
||||
victim_tasks.push((
|
||||
Arc::clone(task),
|
||||
task.kind,
|
||||
task_mut.tenant_id,
|
||||
task_mut.timeline_id,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for task in victim_tasks {
|
||||
let log_all = kind.is_none() && tenant_id.is_none() && timeline_id.is_none();
|
||||
|
||||
for (task, task_kind, tenant_id, timeline_id) in victim_tasks {
|
||||
let join_handle = {
|
||||
let mut task_mut = task.mutable.lock().unwrap();
|
||||
task_mut.join_handle.take()
|
||||
};
|
||||
if let Some(mut join_handle) = join_handle {
|
||||
if log_all {
|
||||
if tenant_id.is_none() {
|
||||
// there are quite few of these
|
||||
info!(name = task.name, kind = ?task_kind, "stopping global task");
|
||||
} else {
|
||||
// warn to catch these in tests; there shouldn't be any
|
||||
warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
|
||||
}
|
||||
}
|
||||
let completed = tokio::select! {
|
||||
biased;
|
||||
_ = &mut join_handle => { true },
|
||||
_ = tokio::time::sleep(std::time::Duration::from_secs(1)) => {
|
||||
// allow some time to elapse before logging to cut down the number of log
|
||||
|
||||
File diff suppressed because it is too large

@@ -99,6 +99,7 @@ pub struct TenantConf {
|
||||
// See the corresponding metric's help string.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub evictions_low_residence_duration_metric_threshold: Duration,
|
||||
pub gc_feedback: bool,
|
||||
}
|
||||
|
||||
/// Same as TenantConf, but this struct preserves the information about
|
||||
@@ -175,6 +176,10 @@ pub struct TenantConfOpt {
|
||||
#[serde(with = "humantime_serde")]
|
||||
#[serde(default)]
|
||||
pub evictions_low_residence_duration_metric_threshold: Option<Duration>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
pub gc_feedback: Option<bool>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
@@ -242,6 +247,7 @@ impl TenantConfOpt {
|
||||
evictions_low_residence_duration_metric_threshold: self
|
||||
.evictions_low_residence_duration_metric_threshold
|
||||
.unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold),
|
||||
gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -278,6 +284,7 @@ impl Default for TenantConf {
|
||||
DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
|
||||
)
|
||||
.expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
|
||||
gc_feedback: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -372,6 +379,7 @@ impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
|
||||
))?,
|
||||
);
|
||||
}
|
||||
tenant_conf.gc_feedback = request_data.gc_feedback;
|
||||
|
||||
Ok(tenant_conf)
|
||||
}
|
||||
|
||||
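gc_feedback follows the same overlay pattern as the other tenant settings above: an optional per-tenant value that falls back to the global default when merged. A small, self-contained sketch of that merge with only two illustrative fields rather than the full TenantConf:

    use std::time::Duration;

    // Global defaults that apply to every tenant.
    #[derive(Clone, Debug)]
    struct TenantConf {
        evictions_low_residence_duration_metric_threshold: Duration,
        gc_feedback: bool,
    }

    // Per-tenant overrides: every field is optional and falls back to the global value.
    #[derive(Clone, Debug, Default)]
    struct TenantConfOpt {
        evictions_low_residence_duration_metric_threshold: Option<Duration>,
        gc_feedback: Option<bool>,
    }

    impl TenantConfOpt {
        fn merge(&self, global_conf: &TenantConf) -> TenantConf {
            TenantConf {
                evictions_low_residence_duration_metric_threshold: self
                    .evictions_low_residence_duration_metric_threshold
                    .unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold),
                gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback),
            }
        }
    }

    fn main() {
        let global = TenantConf {
            evictions_low_residence_duration_metric_threshold: Duration::from_secs(24 * 60 * 60),
            gc_feedback: false,
        };
        let overrides = TenantConfOpt {
            gc_feedback: Some(true),
            ..Default::default()
        };
        let effective = overrides.merge(&global);
        assert!(effective.gc_feedback);
    }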
@@ -51,7 +51,9 @@ use crate::keyspace::KeyPartitioning;
|
||||
use crate::repository::Key;
|
||||
use crate::tenant::storage_layer::InMemoryLayer;
|
||||
use crate::tenant::storage_layer::Layer;
|
||||
use anyhow::Context;
|
||||
use anyhow::Result;
|
||||
use std::collections::HashMap;
|
||||
use std::collections::VecDeque;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
@@ -61,6 +63,8 @@ use historic_layer_coverage::BufferedHistoricLayerCoverage;
|
||||
pub use historic_layer_coverage::Replacement;
|
||||
|
||||
use super::storage_layer::range_eq;
|
||||
use super::storage_layer::PersistentLayerDesc;
|
||||
use super::storage_layer::PersistentLayerKey;
|
||||
|
||||
///
|
||||
/// LayerMap tracks what layers exist on a timeline.
|
||||
@@ -86,11 +90,16 @@ pub struct LayerMap<L: ?Sized> {
|
||||
pub frozen_layers: VecDeque<Arc<InMemoryLayer>>,
|
||||
|
||||
/// Index of the historic layers optimized for search
|
||||
historic: BufferedHistoricLayerCoverage<Arc<L>>,
|
||||
historic: BufferedHistoricLayerCoverage<Arc<PersistentLayerDesc>>,
|
||||
|
||||
/// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient.
|
||||
/// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree.
|
||||
l0_delta_layers: Vec<Arc<L>>,
|
||||
l0_delta_layers: Vec<Arc<PersistentLayerDesc>>,
|
||||
|
||||
/// Mapping from persistent layer key to the actual layer object. Currently, it stores delta, image, and
|
||||
/// remote layers. In future refactors, this will be eventually moved out of LayerMap into Timeline, and
|
||||
/// RemoteLayer will be removed.
|
||||
mapping: HashMap<PersistentLayerKey, Arc<L>>,
|
||||
}
|
||||
|
||||
impl<L: ?Sized> Default for LayerMap<L> {
|
||||
@@ -101,6 +110,7 @@ impl<L: ?Sized> Default for LayerMap<L> {
|
||||
frozen_layers: VecDeque::default(),
|
||||
l0_delta_layers: Vec::default(),
|
||||
historic: BufferedHistoricLayerCoverage::default(),
|
||||
mapping: HashMap::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -125,8 +135,9 @@ where
|
||||
///
|
||||
/// Insert an on-disk layer.
|
||||
///
|
||||
pub fn insert_historic(&mut self, layer: Arc<L>) {
|
||||
self.layer_map.insert_historic_noflush(layer)
|
||||
// TODO remove the `layer` argument when `mapping` is refactored out of `LayerMap`
|
||||
pub fn insert_historic(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
|
||||
self.layer_map.insert_historic_noflush(layer_desc, layer)
|
||||
}
|
||||
|
||||
///
|
||||
@@ -134,8 +145,8 @@ where
|
||||
///
|
||||
/// This should be called when the corresponding file on disk has been deleted.
|
||||
///
|
||||
pub fn remove_historic(&mut self, layer: Arc<L>) {
|
||||
self.layer_map.remove_historic_noflush(layer)
|
||||
pub fn remove_historic(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
|
||||
self.layer_map.remove_historic_noflush(layer_desc, layer)
|
||||
}
|
||||
|
||||
/// Replaces existing layer iff it is the `expected`.
|
||||
@@ -150,12 +161,15 @@ where
|
||||
/// that we can replace values only by updating a hashmap.
|
||||
pub fn replace_historic(
|
||||
&mut self,
|
||||
expected_desc: PersistentLayerDesc,
|
||||
expected: &Arc<L>,
|
||||
new_desc: PersistentLayerDesc,
|
||||
new: Arc<L>,
|
||||
) -> anyhow::Result<Replacement<Arc<L>>> {
|
||||
fail::fail_point!("layermap-replace-notfound", |_| Ok(Replacement::NotFound));
|
||||
|
||||
self.layer_map.replace_historic_noflush(expected, new)
|
||||
self.layer_map
|
||||
.replace_historic_noflush(expected_desc, expected, new_desc, new)
|
||||
}
|
||||
|
||||
// We will flush on drop anyway, but this method makes it
|
||||
@@ -230,6 +244,7 @@ where
|
||||
(None, None) => None,
|
||||
(None, Some(image)) => {
|
||||
let lsn_floor = image.get_lsn_range().start;
|
||||
let image = self.get_layer_from_mapping(&image.key()).clone();
|
||||
Some(SearchResult {
|
||||
layer: image,
|
||||
lsn_floor,
|
||||
@@ -237,6 +252,7 @@ where
|
||||
}
|
||||
(Some(delta), None) => {
|
||||
let lsn_floor = delta.get_lsn_range().start;
|
||||
let delta = self.get_layer_from_mapping(&delta.key()).clone();
|
||||
Some(SearchResult {
|
||||
layer: delta,
|
||||
lsn_floor,
|
||||
@@ -247,6 +263,7 @@ where
|
||||
let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end;
|
||||
let image_exact_match = img_lsn + 1 == end_lsn;
|
||||
if image_is_newer || image_exact_match {
|
||||
let image = self.get_layer_from_mapping(&image.key()).clone();
|
||||
Some(SearchResult {
|
||||
layer: image,
|
||||
lsn_floor: img_lsn,
|
||||
@@ -254,6 +271,7 @@ where
|
||||
} else {
|
||||
let lsn_floor =
|
||||
std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1);
|
||||
let delta = self.get_layer_from_mapping(&delta.key()).clone();
|
||||
Some(SearchResult {
|
||||
layer: delta,
|
||||
lsn_floor,
|
||||
@@ -273,16 +291,33 @@ where
|
||||
///
|
||||
/// Helper function for BatchedUpdates::insert_historic
|
||||
///
|
||||
pub(self) fn insert_historic_noflush(&mut self, layer: Arc<L>) {
|
||||
/// TODO(chi): remove L generic so that we do not need to pass layer object.
|
||||
pub(self) fn insert_historic_noflush(
|
||||
&mut self,
|
||||
layer_desc: PersistentLayerDesc,
|
||||
layer: Arc<L>,
|
||||
) {
|
||||
self.mapping.insert(layer_desc.key(), layer.clone());
|
||||
|
||||
// TODO: See #3869, resulting #4088, attempted fix and repro #4094
|
||||
self.historic.insert(
|
||||
historic_layer_coverage::LayerKey::from(&*layer),
|
||||
Arc::clone(&layer),
|
||||
);
|
||||
|
||||
if Self::is_l0(&layer) {
|
||||
self.l0_delta_layers.push(layer);
|
||||
self.l0_delta_layers.push(layer_desc.clone().into());
|
||||
}
|
||||
|
||||
self.historic.insert(
|
||||
historic_layer_coverage::LayerKey::from(&*layer),
|
||||
layer_desc.into(),
|
||||
);
|
||||
}
|
||||
|
||||
fn get_layer_from_mapping(&self, key: &PersistentLayerKey) -> &Arc<L> {
|
||||
let layer = self
|
||||
.mapping
|
||||
.get(key)
|
||||
.with_context(|| format!("{key:?}"))
|
||||
.expect("inconsistent layer mapping");
|
||||
layer
|
||||
}
|
||||
|
||||
///
|
||||
@@ -290,14 +325,16 @@ where
|
||||
///
|
||||
/// Helper function for BatchedUpdates::remove_historic
|
||||
///
|
||||
pub fn remove_historic_noflush(&mut self, layer: Arc<L>) {
|
||||
pub fn remove_historic_noflush(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
|
||||
self.historic
|
||||
.remove(historic_layer_coverage::LayerKey::from(&*layer));
|
||||
|
||||
if Self::is_l0(&layer) {
|
||||
let len_before = self.l0_delta_layers.len();
|
||||
self.l0_delta_layers
|
||||
.retain(|other| !Self::compare_arced_layers(other, &layer));
|
||||
let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
|
||||
l0_delta_layers.retain(|other| {
|
||||
!Self::compare_arced_layers(self.get_layer_from_mapping(&other.key()), &layer)
|
||||
});
|
||||
self.l0_delta_layers = l0_delta_layers;
|
||||
// this assertion is related to use of Arc::ptr_eq in Self::compare_arced_layers,
|
||||
// there's a chance that the comparison fails at runtime due to it comparing (pointer,
|
||||
// vtable) pairs.
|
||||
@@ -307,11 +344,14 @@ where
|
||||
"failed to locate removed historic layer from l0_delta_layers"
|
||||
);
|
||||
}
|
||||
self.mapping.remove(&layer_desc.key());
|
||||
}
|
||||
|
||||
pub(self) fn replace_historic_noflush(
|
||||
&mut self,
|
||||
expected_desc: PersistentLayerDesc,
|
||||
expected: &Arc<L>,
|
||||
new_desc: PersistentLayerDesc,
|
||||
new: Arc<L>,
|
||||
) -> anyhow::Result<Replacement<Arc<L>>> {
|
||||
let key = historic_layer_coverage::LayerKey::from(&**expected);
|
||||
@@ -332,10 +372,9 @@ where
|
||||
|
||||
let l0_index = if expected_l0 {
|
||||
// find the index in case replace worked, we need to replace that as well
|
||||
let pos = self
|
||||
.l0_delta_layers
|
||||
.iter()
|
||||
.position(|slot| Self::compare_arced_layers(slot, expected));
|
||||
let pos = self.l0_delta_layers.iter().position(|slot| {
|
||||
Self::compare_arced_layers(self.get_layer_from_mapping(&slot.key()), expected)
|
||||
});
|
||||
|
||||
if pos.is_none() {
|
||||
return Ok(Replacement::NotFound);
|
||||
@@ -345,16 +384,28 @@ where
|
||||
None
|
||||
};
|
||||
|
||||
let replaced = self.historic.replace(&key, new.clone(), |existing| {
|
||||
Self::compare_arced_layers(existing, expected)
|
||||
let new_desc = Arc::new(new_desc);
|
||||
let replaced = self.historic.replace(&key, new_desc.clone(), |existing| {
|
||||
**existing == expected_desc
|
||||
});
|
||||
|
||||
if let Replacement::Replaced { .. } = &replaced {
|
||||
self.mapping.remove(&expected_desc.key());
|
||||
self.mapping.insert(new_desc.key(), new);
|
||||
if let Some(index) = l0_index {
|
||||
self.l0_delta_layers[index] = new;
|
||||
self.l0_delta_layers[index] = new_desc;
|
||||
}
|
||||
}
|
||||
|
||||
let replaced = match replaced {
|
||||
Replacement::Replaced { in_buffered } => Replacement::Replaced { in_buffered },
|
||||
Replacement::NotFound => Replacement::NotFound,
|
||||
Replacement::RemovalBuffered => Replacement::RemovalBuffered,
|
||||
Replacement::Unexpected(x) => {
|
||||
Replacement::Unexpected(self.get_layer_from_mapping(&x.key()).clone())
|
||||
}
|
||||
};
|
||||
|
||||
Ok(replaced)
|
||||
}
|
||||
|
||||
@@ -383,7 +434,7 @@ where
|
||||
let start = key.start.to_i128();
|
||||
let end = key.end.to_i128();
|
||||
|
||||
let layer_covers = |layer: Option<Arc<L>>| match layer {
|
||||
let layer_covers = |layer: Option<Arc<PersistentLayerDesc>>| match layer {
|
||||
Some(layer) => layer.get_lsn_range().start >= lsn.start,
|
||||
None => false,
|
||||
};
|
||||
@@ -404,7 +455,9 @@ where
|
||||
}
|
||||
|
||||
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<L>> {
|
||||
self.historic.iter()
|
||||
self.historic
|
||||
.iter()
|
||||
.map(|x| self.get_layer_from_mapping(&x.key()).clone())
|
||||
}
|
||||
|
||||
///
|
||||
@@ -436,14 +489,24 @@ where
|
||||
// Loop through the change events and push intervals
|
||||
for (change_key, change_val) in version.image_coverage.range(start..end) {
|
||||
let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
|
||||
coverage.push((kr, current_val.take()));
|
||||
coverage.push((
|
||||
kr,
|
||||
current_val
|
||||
.take()
|
||||
.map(|l| self.get_layer_from_mapping(&l.key()).clone()),
|
||||
));
|
||||
current_key = change_key;
|
||||
current_val = change_val.clone();
|
||||
}
|
||||
|
||||
// Add the final interval
|
||||
let kr = Key::from_i128(current_key)..Key::from_i128(end);
|
||||
coverage.push((kr, current_val.take()));
|
||||
coverage.push((
|
||||
kr,
|
||||
current_val
|
||||
.take()
|
||||
.map(|l| self.get_layer_from_mapping(&l.key()).clone()),
|
||||
));
|
||||
|
||||
Ok(coverage)
|
||||
}
|
||||
@@ -532,7 +595,9 @@ where
|
||||
let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
|
||||
let lr = lsn.start..val.get_lsn_range().start;
|
||||
if !kr.is_empty() {
|
||||
let base_count = Self::is_reimage_worthy(&val, key) as usize;
|
||||
let base_count =
|
||||
Self::is_reimage_worthy(self.get_layer_from_mapping(&val.key()), key)
|
||||
as usize;
|
||||
let new_limit = limit.map(|l| l - base_count);
|
||||
let max_stacked_deltas_underneath =
|
||||
self.count_deltas(&kr, &lr, new_limit)?;
|
||||
@@ -555,7 +620,9 @@ where
|
||||
let lr = lsn.start..val.get_lsn_range().start;
|
||||
|
||||
if !kr.is_empty() {
|
||||
let base_count = Self::is_reimage_worthy(&val, key) as usize;
|
||||
let base_count =
|
||||
Self::is_reimage_worthy(self.get_layer_from_mapping(&val.key()), key)
|
||||
as usize;
|
||||
let new_limit = limit.map(|l| l - base_count);
|
||||
let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?;
|
||||
max_stacked_deltas = std::cmp::max(
|
||||
@@ -706,7 +773,11 @@ where
|
||||
|
||||
/// Return all L0 delta layers
|
||||
pub fn get_level0_deltas(&self) -> Result<Vec<Arc<L>>> {
|
||||
Ok(self.l0_delta_layers.clone())
|
||||
Ok(self
|
||||
.l0_delta_layers
|
||||
.iter()
|
||||
.map(|x| self.get_layer_from_mapping(&x.key()).clone())
|
||||
.collect())
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer map
|
||||
@@ -809,12 +880,17 @@ mod tests {
|
||||
let layer = LayerDescriptor::from(layer);
|
||||
|
||||
// same skeleton construction; see scenario below
|
||||
let not_found: Arc<dyn Layer> = Arc::new(layer.clone());
|
||||
let new_version: Arc<dyn Layer> = Arc::new(layer);
|
||||
let not_found = Arc::new(layer.clone());
|
||||
let new_version = Arc::new(layer);
|
||||
|
||||
let mut map = LayerMap::default();
|
||||
|
||||
let res = map.batch_update().replace_historic(¬_found, new_version);
|
||||
let res = map.batch_update().replace_historic(
|
||||
not_found.get_persistent_layer_desc(),
|
||||
¬_found,
|
||||
new_version.get_persistent_layer_desc(),
|
||||
new_version,
|
||||
);
|
||||
|
||||
assert!(matches!(res, Ok(Replacement::NotFound)), "{res:?}");
|
||||
}
|
||||
@@ -823,8 +899,8 @@ mod tests {
|
||||
let name = LayerFileName::from_str(layer_name).unwrap();
|
||||
let skeleton = LayerDescriptor::from(name);
|
||||
|
||||
let remote: Arc<dyn Layer> = Arc::new(skeleton.clone());
|
||||
let downloaded: Arc<dyn Layer> = Arc::new(skeleton);
|
||||
let remote = Arc::new(skeleton.clone());
|
||||
let downloaded = Arc::new(skeleton);
|
||||
|
||||
let mut map = LayerMap::default();
|
||||
|
||||
@@ -834,12 +910,18 @@ mod tests {
|
||||
|
||||
let expected_in_counts = (1, usize::from(expected_l0));
|
||||
|
||||
map.batch_update().insert_historic(remote.clone());
|
||||
map.batch_update()
|
||||
.insert_historic(remote.get_persistent_layer_desc(), remote.clone());
|
||||
assert_eq!(count_layer_in(&map, &remote), expected_in_counts);
|
||||
|
||||
let replaced = map
|
||||
.batch_update()
|
||||
.replace_historic(&remote, downloaded.clone())
|
||||
.replace_historic(
|
||||
remote.get_persistent_layer_desc(),
|
||||
&remote,
|
||||
downloaded.get_persistent_layer_desc(),
|
||||
downloaded.clone(),
|
||||
)
|
||||
.expect("name derived attributes are the same");
|
||||
assert!(
|
||||
matches!(replaced, Replacement::Replaced { .. }),
|
||||
@@ -847,11 +929,12 @@ mod tests {
|
||||
);
|
||||
assert_eq!(count_layer_in(&map, &downloaded), expected_in_counts);
|
||||
|
||||
map.batch_update().remove_historic(downloaded.clone());
|
||||
map.batch_update()
|
||||
.remove_historic(downloaded.get_persistent_layer_desc(), downloaded.clone());
|
||||
assert_eq!(count_layer_in(&map, &downloaded), (0, 0));
|
||||
}
|
||||
|
||||
fn count_layer_in(map: &LayerMap<dyn Layer>, layer: &Arc<dyn Layer>) -> (usize, usize) {
|
||||
fn count_layer_in<L: Layer + ?Sized>(map: &LayerMap<L>, layer: &Arc<L>) -> (usize, usize) {
|
||||
let historic = map
|
||||
.iter_historic_layers()
|
||||
.filter(|x| LayerMap::compare_arced_layers(x, layer))
|
||||
|
||||
@@ -204,6 +204,35 @@ fn test_off_by_one() {
|
||||
assert_eq!(version.image_coverage.query(5), None);
|
||||
}
|
||||
|
||||
/// White-box regression test, checking for incorrect removal of node at key.end
|
||||
#[test]
|
||||
fn test_regression() {
|
||||
let mut map = HistoricLayerCoverage::<String>::new();
|
||||
map.insert(
|
||||
LayerKey {
|
||||
key: 0..5,
|
||||
lsn: 0..5,
|
||||
is_image: false,
|
||||
},
|
||||
"Layer 1".to_string(),
|
||||
);
|
||||
map.insert(
|
||||
LayerKey {
|
||||
key: 0..5,
|
||||
lsn: 1..2,
|
||||
is_image: false,
|
||||
},
|
||||
"Layer 2".to_string(),
|
||||
);
|
||||
|
||||
// If an insertion operation improperly deletes the endpoint of a previous layer
|
||||
// (which is more likely to happen with layers that collide on key.end), we will
|
||||
// end up with an infinite layer, covering the entire keyspace. Here we assert
|
||||
// that there's no layer at key 100 because we didn't insert any layer there.
|
||||
let version = map.get_version(100).unwrap();
|
||||
assert_eq!(version.delta_coverage.query(100), None);
|
||||
}
|
||||
|
||||
/// Cover edge cases where layers begin or end on the same key
|
||||
#[test]
|
||||
fn test_key_collision() {
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use std::ops::Range;
|
||||
|
||||
// TODO the `im` crate has 20x more downloads and also has
|
||||
// persistent/immutable BTree. It also runs a bit faster but
|
||||
// results are not the same on some tests.
|
||||
// NOTE the `im` crate has 20x more downloads and also has
|
||||
// persistent/immutable BTree. But it's bugged so rpds is a
|
||||
// better choice https://github.com/neondatabase/neon/issues/3395
|
||||
use rpds::RedBlackTreeMapSync;
|
||||
|
||||
/// Data structure that can efficiently:
|
||||
@@ -10,19 +10,22 @@ use rpds::RedBlackTreeMapSync;
|
||||
/// - iterate the latest layers in a key range
|
||||
/// - insert layers in non-decreasing lsn.start order
|
||||
///
|
||||
/// The struct is parameterized over Value for easier
|
||||
/// testing, but in practice it's some sort of layer.
|
||||
/// For a detailed explanation and justification of this approach, see:
|
||||
/// https://neon.tech/blog/persistent-structures-in-neons-wal-indexing
|
||||
///
|
||||
/// NOTE The struct is parameterized over Value for easier
|
||||
/// testing, but in practice it's some sort of layer.
|
||||
pub struct LayerCoverage<Value> {
|
||||
/// For every change in coverage (as we sweep the key space)
|
||||
/// we store (lsn.end, value).
|
||||
///
|
||||
/// We use an immutable/persistent tree so that we can keep historic
|
||||
/// versions of this coverage without cloning the whole thing and
|
||||
/// incurring quadratic memory cost. See HistoricLayerCoverage.
|
||||
/// NOTE We use an immutable/persistent tree so that we can keep historic
|
||||
/// versions of this coverage without cloning the whole thing and
|
||||
/// incurring quadratic memory cost. See HistoricLayerCoverage.
|
||||
///
|
||||
/// We use the Sync version of the map because we want Self to
|
||||
/// be Sync. Using nonsync might be faster, if we can work with
|
||||
/// that.
|
||||
/// NOTE We use the Sync version of the map because we want Self to
|
||||
/// be Sync. Using nonsync might be faster, if we can work with
|
||||
/// that.
|
||||
nodes: RedBlackTreeMapSync<i128, Option<(u64, Value)>>,
|
||||
}
|
||||
|
||||
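The NOTE above is the crux of this structure: a persistent map makes keeping one coverage version per LSN cheap, because cloning shares structure instead of copying the tree. A tiny sketch using the rpds map (shown with the plain Rc-backed type; the pageserver uses the RedBlackTreeMapSync alias so the containing struct is Sync):

    use rpds::RedBlackTreeMap;

    fn main() {
        // Coverage "nodes": key-space break point -> Option<(lsn.end, value)>.
        let mut v1: RedBlackTreeMap<i128, Option<(u64, String)>> = RedBlackTreeMap::new();
        v1.insert_mut(0, Some((5, "layer A".to_string())));
        v1.insert_mut(10, None);

        // Keeping a historic version is cheap: the clone shares structure with the
        // original, which is what lets HistoricLayerCoverage keep one version per
        // LSN without quadratic memory cost.
        let v0 = v1.clone();
        v1.insert_mut(0, Some((8, "layer B".to_string())));

        assert_eq!(v0.get(&0), Some(&Some((5, "layer A".to_string()))));
        assert_eq!(v1.get(&0), Some(&Some((8, "layer B".to_string()))));
    }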
@@ -41,6 +44,13 @@ impl<Value: Clone> LayerCoverage<Value> {
|
||||
|
||||
/// Helper function to subdivide the key range without changing any values
|
||||
///
|
||||
/// This operation has no semantic effect by itself. It only helps us pin in
|
||||
/// place the part of the coverage we don't want to change when inserting.
|
||||
///
|
||||
/// As an analogy, think of a polygon. If you add a vertex along one of the
|
||||
/// segments, the polygon is still the same, but it behaves differently when
|
||||
/// we move or delete one of the other points.
|
||||
///
|
||||
/// Complexity: O(log N)
|
||||
fn add_node(&mut self, key: i128) {
|
||||
let value = match self.nodes.range(..=key).last() {
|
||||
@@ -74,7 +84,7 @@ impl<Value: Clone> LayerCoverage<Value> {
|
||||
let mut to_update = Vec::new();
|
||||
let mut to_remove = Vec::new();
|
||||
let mut prev_covered = false;
|
||||
for (k, node) in self.nodes.range(key.clone()) {
|
||||
for (k, node) in self.nodes.range(key) {
|
||||
let needs_cover = match node {
|
||||
None => true,
|
||||
Some((h, _)) => h < &lsn.end,
|
||||
@@ -87,9 +97,8 @@ impl<Value: Clone> LayerCoverage<Value> {
|
||||
}
|
||||
prev_covered = needs_cover;
|
||||
}
|
||||
if !prev_covered {
|
||||
to_remove.push(key.end);
|
||||
}
|
||||
// TODO check if the nodes inserted at key.start and key.end are safe
|
||||
// to remove. It's fine to keep them but they could be redundant.
|
||||
for k in to_update {
|
||||
self.nodes.insert_mut(k, Some((lsn.end, value.clone())));
|
||||
}
|
||||
|
||||
325
pageserver/src/tenant/manifest.rs
Normal file
325
pageserver/src/tenant/manifest.rs
Normal file
@@ -0,0 +1,325 @@
|
||||
//! This module contains the encoding and decoding of the local manifest file.
|
||||
//!
|
||||
//! MANIFEST is a write-ahead log which is stored locally to each timeline. It
|
||||
//! records the state of the storage engine. It contains a snapshot of the
|
||||
//! state and all operations following that snapshot. The file begins with a
|
||||
//! header recording MANIFEST version number. After that, it contains a snapshot.
|
||||
//! The snapshot is followed by a list of operations. Each operation is a list
|
||||
//! of records. Each record is either an addition or a removal of a layer.
|
||||
//!
|
||||
//! With MANIFEST, we can:
|
||||
//!
|
||||
//! 1. recover state quickly by reading the file, potentially boosting the
|
||||
//! startup speed.
|
||||
//! 2. ensure all operations are atomic and avoid corruption, solving issues
|
||||
//! like redundant image layers and preparing us for future compaction
|
||||
//! strategies.
|
||||
//!
|
||||
//! There is also a format for storing all layer files on S3, called
|
||||
//! `index_part.json`. Compared with index_part, MANIFEST is a WAL which
|
||||
//! records all operations as logs, and therefore we can easily replay the
|
||||
//! operations when recovering from a crash, while ensuring those operations
|
||||
//! are atomic upon restart.
|
||||
//!
|
||||
//! Currently, this is not used in the system. Future refactors will ensure
|
||||
//! the storage state will be recorded in this file, and the system can be
|
||||
//! recovered from this file. This is tracked in
|
||||
//! https://github.com/neondatabase/neon/issues/4418
|
||||
|
||||
use std::io::{self, Read, Write};
|
||||
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use anyhow::Result;
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use crc32c::crc32c;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::log::warn;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use super::storage_layer::PersistentLayerDesc;
|
||||
|
||||
pub struct Manifest {
|
||||
file: VirtualFile,
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
|
||||
pub struct Snapshot {
|
||||
pub layers: Vec<PersistentLayerDesc>,
|
||||
}
|
||||
|
||||
/// serde by default encodes this as a tagged enum, and therefore it will be something
|
||||
/// like `{ "AddLayer": { ... } }`.
|
||||
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
|
||||
pub enum Record {
|
||||
AddLayer(PersistentLayerDesc),
|
||||
RemoveLayer(PersistentLayerDesc),
|
||||
}
|
||||
|
||||
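For reference, serde's default externally tagged representation wraps the payload in the variant name, which is what the comment on Record alludes to. A stand-alone sketch, assuming serde with the derive feature plus serde_json (as the manifest itself uses), with a made-up payload type instead of PersistentLayerDesc:

    use serde::{Deserialize, Serialize};

    // Hypothetical stand-in for PersistentLayerDesc.
    #[derive(Serialize, Deserialize, Debug, PartialEq)]
    struct LayerDescStandIn {
        file_name: String,
        file_size: u64,
    }

    #[derive(Serialize, Deserialize, Debug, PartialEq)]
    enum Record {
        AddLayer(LayerDescStandIn),
        RemoveLayer(LayerDescStandIn),
    }

    fn main() -> serde_json::Result<()> {
        let rec = Record::AddLayer(LayerDescStandIn {
            file_name: "delta-0001".to_string(),
            file_size: 8192,
        });
        let json = serde_json::to_string(&rec)?;
        // Externally tagged: the variant name wraps the payload.
        assert_eq!(json, r#"{"AddLayer":{"file_name":"delta-0001","file_size":8192}}"#);
        let decoded: Record = serde_json::from_str(&json)?;
        assert_eq!(decoded, rec);
        Ok(())
    }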
/// `echo neon.manifest | sha1sum` and take the leading 8 bytes.
|
||||
const MANIFEST_MAGIC_NUMBER: u64 = 0xf5c44592b806109c;
|
||||
const MANIFEST_VERSION: u64 = 1;
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
|
||||
pub struct ManifestHeader {
|
||||
magic_number: u64,
|
||||
version: u64,
|
||||
}
|
||||
|
||||
const MANIFEST_HEADER_LEN: usize = 16;
|
||||
|
||||
impl ManifestHeader {
|
||||
fn encode(&self) -> BytesMut {
|
||||
let mut buf = BytesMut::with_capacity(MANIFEST_HEADER_LEN);
|
||||
buf.put_u64(self.magic_number);
|
||||
buf.put_u64(self.version);
|
||||
buf
|
||||
}
|
||||
|
||||
fn decode(mut buf: &[u8]) -> Self {
|
||||
assert!(buf.len() == MANIFEST_HEADER_LEN, "invalid header");
|
||||
Self {
|
||||
magic_number: buf.get_u64(),
|
||||
version: buf.get_u64(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
|
||||
pub enum Operation {
|
||||
/// A snapshot of the current state.
|
||||
///
|
||||
/// Lsn field represents the LSN that is persisted to disk for this snapshot.
|
||||
Snapshot(Snapshot, Lsn),
|
||||
/// An atomic operation that changes the state.
|
||||
///
|
||||
/// Lsn field represents the LSN that is persisted to disk after the operation is done.
|
||||
/// This will only change when new L0 is flushed to the disk.
|
||||
Operation(Vec<Record>, Lsn),
|
||||
}
|
||||
|
||||
struct RecordHeader {
|
||||
size: u32,
|
||||
checksum: u32,
|
||||
}
|
||||
|
||||
const RECORD_HEADER_LEN: usize = 8;
|
||||
|
||||
impl RecordHeader {
|
||||
fn encode(&self) -> BytesMut {
|
||||
let mut buf = BytesMut::with_capacity(RECORD_HEADER_LEN);
|
||||
buf.put_u32(self.size);
|
||||
buf.put_u32(self.checksum);
|
||||
buf
|
||||
}
|
||||
|
||||
fn decode(mut buf: &[u8]) -> Self {
|
||||
assert!(buf.len() == RECORD_HEADER_LEN, "invalid header");
|
||||
Self {
|
||||
size: buf.get_u32(),
|
||||
checksum: buf.get_u32(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum ManifestLoadError {
|
||||
#[error("manifest header is corrupted")]
|
||||
CorruptedManifestHeader,
|
||||
#[error("unsupported manifest version: got {0}, expected {1}")]
|
||||
UnsupportedVersion(u64, u64),
|
||||
#[error("error when decoding record: {0}")]
|
||||
DecodeRecord(serde_json::Error),
|
||||
#[error("I/O error: {0}")]
|
||||
Io(io::Error),
|
||||
}
|
||||
|
||||
#[must_use = "Should check if the manifest is partially corrupted"]
|
||||
pub struct ManifestPartiallyCorrupted(bool);
|
||||
|
||||
impl Manifest {
|
||||
/// Create a new manifest by writing the manifest header and a snapshot record to the given file.
|
||||
pub fn init(file: VirtualFile, snapshot: Snapshot, lsn: Lsn) -> Result<Self> {
|
||||
let mut manifest = Self { file };
|
||||
manifest.append_manifest_header(ManifestHeader {
|
||||
magic_number: MANIFEST_MAGIC_NUMBER,
|
||||
version: MANIFEST_VERSION,
|
||||
})?;
|
||||
manifest.append_operation(Operation::Snapshot(snapshot, lsn))?;
|
||||
Ok(manifest)
|
||||
}
|
||||
|
||||
/// Load a manifest. Returns the manifest and a list of operations. If the manifest is corrupted,
|
||||
/// the bool flag will be set to true and the user is responsible for reconstructing a new manifest and
|
||||
/// backing up the current one.
|
||||
pub fn load(
|
||||
mut file: VirtualFile,
|
||||
) -> Result<(Self, Vec<Operation>, ManifestPartiallyCorrupted), ManifestLoadError> {
|
||||
let mut buf = vec![];
|
||||
file.read_to_end(&mut buf).map_err(ManifestLoadError::Io)?;
|
||||
|
||||
// Read manifest header
|
||||
let mut buf = Bytes::from(buf);
|
||||
if buf.remaining() < MANIFEST_HEADER_LEN {
|
||||
return Err(ManifestLoadError::CorruptedManifestHeader);
|
||||
}
|
||||
let header = ManifestHeader::decode(&buf[..MANIFEST_HEADER_LEN]);
|
||||
buf.advance(MANIFEST_HEADER_LEN);
|
||||
if header.version != MANIFEST_VERSION {
|
||||
return Err(ManifestLoadError::UnsupportedVersion(
|
||||
header.version,
|
||||
MANIFEST_VERSION,
|
||||
));
|
||||
}
|
||||
|
||||
// Read operations
|
||||
let mut operations = Vec::new();
|
||||
let corrupted = loop {
|
||||
if buf.remaining() == 0 {
|
||||
break false;
|
||||
}
|
||||
if buf.remaining() < RECORD_HEADER_LEN {
|
||||
warn!("incomplete header when decoding manifest, could be corrupted");
|
||||
break true;
|
||||
}
|
||||
let RecordHeader { size, checksum } = RecordHeader::decode(&buf[..RECORD_HEADER_LEN]);
|
||||
let size = size as usize;
|
||||
buf.advance(RECORD_HEADER_LEN);
|
||||
if buf.remaining() < size {
|
||||
warn!("incomplete data when decoding manifest, could be corrupted");
|
||||
break true;
|
||||
}
|
||||
let data = &buf[..size];
|
||||
if crc32c(data) != checksum {
|
||||
warn!("checksum mismatch when decoding manifest, could be corrupted");
|
||||
break true;
|
||||
}
|
||||
// if the following decode fails, we cannot use the manifest or safely ignore any record.
|
||||
operations.push(serde_json::from_slice(data).map_err(ManifestLoadError::DecodeRecord)?);
|
||||
buf.advance(size);
|
||||
};
|
||||
Ok((
|
||||
Self { file },
|
||||
operations,
|
||||
ManifestPartiallyCorrupted(corrupted),
|
||||
))
|
||||
}
|
||||
|
||||
fn append_data(&mut self, data: &[u8]) -> Result<()> {
|
||||
if data.len() >= u32::MAX as usize {
|
||||
panic!("data too large");
|
||||
}
|
||||
let header = RecordHeader {
|
||||
size: data.len() as u32,
|
||||
checksum: crc32c(data),
|
||||
};
|
||||
let header = header.encode();
|
||||
self.file.write_all(&header)?;
|
||||
self.file.write_all(data)?;
|
||||
self.file.sync_all()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn append_manifest_header(&mut self, header: ManifestHeader) -> Result<()> {
|
||||
let encoded = header.encode();
|
||||
self.file.write_all(&encoded)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Add an operation to the manifest. The operation will be appended to the end of the file,
|
||||
/// and the file will be fsynced.
|
||||
pub fn append_operation(&mut self, operation: Operation) -> Result<()> {
|
||||
let encoded = Vec::from(serde_json::to_string(&operation)?);
|
||||
self.append_data(&encoded)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::fs::OpenOptions;
|
||||
|
||||
use crate::repository::Key;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_read_manifest() {
|
||||
let testdir = crate::config::PageServerConf::test_repo_dir("test_read_manifest");
|
||||
std::fs::create_dir_all(&testdir).unwrap();
|
||||
let file = VirtualFile::create(&testdir.join("MANIFEST")).unwrap();
|
||||
let layer1 = PersistentLayerDesc::new_test(Key::from_i128(0)..Key::from_i128(233));
|
||||
let layer2 = PersistentLayerDesc::new_test(Key::from_i128(233)..Key::from_i128(2333));
|
||||
let layer3 = PersistentLayerDesc::new_test(Key::from_i128(2333)..Key::from_i128(23333));
|
||||
let layer4 = PersistentLayerDesc::new_test(Key::from_i128(23333)..Key::from_i128(233333));
|
||||
|
||||
// Write a manifest with a snapshot and some operations
|
||||
let snapshot = Snapshot {
|
||||
layers: vec![layer1, layer2],
|
||||
};
|
||||
let mut manifest = Manifest::init(file, snapshot.clone(), Lsn::from(0)).unwrap();
|
||||
manifest
|
||||
.append_operation(Operation::Operation(
|
||||
vec![Record::AddLayer(layer3.clone())],
|
||||
Lsn::from(1),
|
||||
))
|
||||
.unwrap();
|
||||
drop(manifest);
|
||||
|
||||
// Open the second time and write
|
||||
let file = VirtualFile::open_with_options(
|
||||
&testdir.join("MANIFEST"),
|
||||
OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.create_new(false)
|
||||
.truncate(false),
|
||||
)
|
||||
.unwrap();
|
||||
let (mut manifest, operations, corrupted) = Manifest::load(file).unwrap();
|
||||
assert!(!corrupted.0);
|
||||
assert_eq!(operations.len(), 2);
|
||||
assert_eq!(
|
||||
&operations[0],
|
||||
&Operation::Snapshot(snapshot.clone(), Lsn::from(0))
|
||||
);
|
||||
assert_eq!(
|
||||
&operations[1],
|
||||
&Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1))
|
||||
);
|
||||
manifest
|
||||
.append_operation(Operation::Operation(
|
||||
vec![
|
||||
Record::RemoveLayer(layer3.clone()),
|
||||
Record::AddLayer(layer4.clone()),
|
||||
],
|
||||
Lsn::from(2),
|
||||
))
|
||||
.unwrap();
|
||||
drop(manifest);
|
||||
|
||||
// Open the third time and verify
|
||||
let file = VirtualFile::open_with_options(
|
||||
&testdir.join("MANIFEST"),
|
||||
OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.create_new(false)
|
||||
.truncate(false),
|
||||
)
|
||||
.unwrap();
|
||||
let (_manifest, operations, corrupted) = Manifest::load(file).unwrap();
|
||||
assert!(!corrupted.0);
|
||||
assert_eq!(operations.len(), 3);
|
||||
assert_eq!(&operations[0], &Operation::Snapshot(snapshot, Lsn::from(0)));
|
||||
assert_eq!(
|
||||
&operations[1],
|
||||
&Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1))
|
||||
);
|
||||
assert_eq!(
|
||||
&operations[2],
|
||||
&Operation::Operation(
|
||||
vec![Record::RemoveLayer(layer3), Record::AddLayer(layer4)],
|
||||
Lsn::from(2)
|
||||
)
|
||||
);
|
||||
}
|
||||
}
|
||||
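The on-disk framing used by append_data and load above is a u32 length plus a crc32c checksum in front of each JSON-encoded operation; replay stops at the first truncated or corrupted record. A self-contained sketch of that framing, assuming the same bytes and crc32c crates but not the real Manifest or VirtualFile types:

    use bytes::{Buf, BufMut, BytesMut};
    use crc32c::crc32c;

    // Frame one record the way the manifest does: u32 payload length, crc32c
    // checksum, then the JSON-encoded operation bytes.
    fn append_record(log: &mut BytesMut, payload: &[u8]) {
        assert!(payload.len() < u32::MAX as usize, "data too large");
        log.put_u32(payload.len() as u32);
        log.put_u32(crc32c(payload));
        log.put_slice(payload);
    }

    // Replay the log, stopping at the first incomplete or corrupted record.
    // Returns the decoded payloads and whether the tail looked corrupted.
    fn replay(mut log: &[u8]) -> (Vec<Vec<u8>>, bool) {
        let mut records = Vec::new();
        loop {
            if log.remaining() == 0 {
                return (records, false);
            }
            if log.remaining() < 8 {
                return (records, true); // truncated header
            }
            let size = log.get_u32() as usize;
            let checksum = log.get_u32();
            if log.remaining() < size || crc32c(&log[..size]) != checksum {
                return (records, true); // truncated or corrupted payload
            }
            records.push(log[..size].to_vec());
            log.advance(size);
        }
    }

    fn main() {
        let mut log = BytesMut::new();
        append_record(&mut log, br#"{"AddLayer":{}}"#);
        append_record(&mut log, br#"{"RemoveLayer":{}}"#);
        let (records, corrupted) = replay(&log);
        assert_eq!(records.len(), 2);
        assert!(!corrupted);
    }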
@@ -10,6 +10,7 @@ use tokio::fs;
|
||||
use anyhow::Context;
|
||||
use once_cell::sync::Lazy;
|
||||
use tokio::sync::RwLock;
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::*;
|
||||
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
@@ -20,7 +21,7 @@ use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::task_mgr::{self, TaskKind};
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState};
|
||||
use crate::IGNORED_TENANT_FILE_NAME;
|
||||
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};
|
||||
|
||||
use utils::fs_ext::PathExt;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -58,10 +59,12 @@ static TENANTS: Lazy<RwLock<TenantsMap>> = Lazy::new(|| RwLock::new(TenantsMap::
|
||||
/// Initialize repositories with locally available timelines.
|
||||
/// Timelines that are only partially available locally (remote storage has more data than this pageserver)
|
||||
/// are scheduled for download and added to the tenant once download is completed.
|
||||
#[instrument(skip(conf, remote_storage))]
|
||||
#[instrument(skip_all)]
|
||||
pub async fn init_tenant_mgr(
|
||||
conf: &'static PageServerConf,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
init_order: InitializationOrder,
|
||||
) -> anyhow::Result<()> {
|
||||
// Scan local filesystem for attached tenants
|
||||
let tenants_dir = conf.tenants_path();
|
||||
@@ -116,7 +119,9 @@ pub async fn init_tenant_mgr(
|
||||
match schedule_local_tenant_processing(
|
||||
conf,
|
||||
&tenant_dir_path,
|
||||
broker_client.clone(),
|
||||
remote_storage.clone(),
|
||||
Some(init_order.clone()),
|
||||
&ctx,
|
||||
) {
|
||||
Ok(tenant) => {
|
||||
@@ -150,7 +155,9 @@ pub async fn init_tenant_mgr(
|
||||
pub fn schedule_local_tenant_processing(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_path: &Path,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
init_order: Option<InitializationOrder>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Tenant>> {
|
||||
anyhow::ensure!(
|
||||
@@ -186,7 +193,7 @@ pub fn schedule_local_tenant_processing(
|
||||
let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
|
||||
info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
|
||||
if let Some(remote_storage) = remote_storage {
|
||||
match Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx) {
|
||||
match Tenant::spawn_attach(conf, tenant_id, broker_client, remote_storage, ctx) {
|
||||
Ok(tenant) => tenant,
|
||||
Err(e) => {
|
||||
error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
|
||||
@@ -204,7 +211,14 @@ pub fn schedule_local_tenant_processing(
|
||||
} else {
|
||||
info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
|
||||
// Start loading the tenant into memory. It will initially be in Loading state.
|
||||
Tenant::spawn_load(conf, tenant_id, remote_storage, ctx)
|
||||
Tenant::spawn_load(
|
||||
conf,
|
||||
tenant_id,
|
||||
broker_client,
|
||||
remote_storage,
|
||||
init_order,
|
||||
ctx,
|
||||
)
|
||||
};
|
||||
Ok(tenant)
|
||||
}
|
||||
@@ -219,6 +233,7 @@ pub fn schedule_local_tenant_processing(
|
||||
/// That could be easily misinterpreted by control plane, the consumer of the
|
||||
/// management API. For example, it could attach the tenant on a different pageserver.
|
||||
/// We would then be in split-brain once this pageserver restarts.
|
||||
#[instrument]
|
||||
pub async fn shutdown_all_tenants() {
|
||||
// Prevent new tenants from being created.
|
||||
let tenants_to_shut_down = {
|
||||
@@ -235,39 +250,51 @@ pub async fn shutdown_all_tenants() {
|
||||
tenants_clone
|
||||
}
|
||||
TenantsMap::ShuttingDown(_) => {
|
||||
// TODO: it is possible that detach and shutdown happen at the same time. as a
|
||||
// result, during shutdown we do not wait for detach.
|
||||
error!("already shutting down, this function isn't supposed to be called more than once");
|
||||
return;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let mut tenants_to_freeze_and_flush = Vec::with_capacity(tenants_to_shut_down.len());
|
||||
for (_, tenant) in tenants_to_shut_down {
|
||||
if tenant.is_active() {
|
||||
// updates tenant state, forbidding new GC and compaction iterations from starting
|
||||
tenant.set_stopping();
|
||||
tenants_to_freeze_and_flush.push(tenant);
|
||||
let mut join_set = JoinSet::new();
|
||||
for (tenant_id, tenant) in tenants_to_shut_down {
|
||||
join_set.spawn(
|
||||
async move {
|
||||
let freeze_and_flush = true;
|
||||
|
||||
match tenant.shutdown(freeze_and_flush).await {
|
||||
Ok(()) => debug!("tenant successfully stopped"),
|
||||
Err(super::ShutdownError::AlreadyStopping) => {
|
||||
warn!("tenant was already shutting down")
|
||||
}
|
||||
}
|
||||
}
|
||||
.instrument(info_span!("shutdown", %tenant_id)),
|
||||
);
|
||||
}
|
||||
|
||||
let mut panicked = 0;
|
||||
|
||||
while let Some(res) = join_set.join_next().await {
|
||||
match res {
|
||||
Ok(()) => {}
|
||||
Err(join_error) if join_error.is_cancelled() => {
|
||||
unreachable!("we are not cancelling any of the futures");
|
||||
}
|
||||
Err(join_error) if join_error.is_panic() => {
|
||||
// cannot really do anything, as this panic is likely a bug
|
||||
panicked += 1;
|
||||
}
|
||||
Err(join_error) => {
|
||||
warn!("unknown kind of JoinError: {join_error}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Shut down all existing walreceiver connections and stop accepting the new ones.
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await;
|
||||
|
||||
// Ok, no background tasks running anymore. Flush any remaining data in
|
||||
// memory to disk.
|
||||
//
|
||||
// We assume that any incoming connections that might request pages from
|
||||
// the tenant have already been terminated by the caller, so there
|
||||
// should be no more activity in any of the repositories.
|
||||
//
|
||||
// On error, log it but continue with the shutdown for other tenants.
|
||||
for tenant in tenants_to_freeze_and_flush {
|
||||
let tenant_id = tenant.tenant_id();
|
||||
debug!("shutdown tenant {tenant_id}");
|
||||
|
||||
if let Err(err) = tenant.freeze_and_flush().await {
|
||||
error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}");
|
||||
}
|
||||
if panicked > 0 {
|
||||
warn!(panicked, "observed panicks while shutting down tenants");
|
||||
}
|
||||
}
|
||||
|
||||
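shutdown_all_tenants now fans the per-tenant shutdowns out on a tokio JoinSet and tallies panics instead of aborting on the first failure. A minimal sketch of that pattern, assuming tokio with its rt, time, and macros features plus tracing (the tenant ids and the sleep stand in for the real shutdown future):

    use tokio::task::JoinSet;
    use tracing::{info_span, warn, Instrument};

    #[tokio::main]
    async fn main() {
        let tenant_ids = vec!["tenant-a", "tenant-b", "tenant-c"];

        // One shutdown future per tenant, each under its own tracing span.
        let mut join_set = JoinSet::new();
        for tenant_id in tenant_ids {
            join_set.spawn(
                async move {
                    // Stand-in for `tenant.shutdown(freeze_and_flush).await`.
                    tokio::time::sleep(std::time::Duration::from_millis(10)).await;
                }
                .instrument(info_span!("shutdown", %tenant_id)),
            );
        }

        // Join them all, counting panics rather than propagating them.
        let mut panicked = 0;
        while let Some(res) = join_set.join_next().await {
            match res {
                Ok(()) => {}
                Err(join_error) if join_error.is_panic() => panicked += 1,
                Err(join_error) => warn!("unexpected JoinError: {join_error}"),
            }
        }
        if panicked > 0 {
            warn!(panicked, "observed panics while shutting down tenants");
        }
    }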
@@ -275,6 +302,7 @@ pub async fn create_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: TenantConfOpt,
|
||||
tenant_id: TenantId,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Arc<Tenant>, TenantMapInsertError> {
|
||||
@@ -287,7 +315,7 @@ pub async fn create_tenant(
|
||||
// See https://github.com/neondatabase/neon/issues/4233
|
||||
|
||||
let created_tenant =
|
||||
schedule_local_tenant_processing(conf, &tenant_directory, remote_storage, ctx)?;
|
||||
schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, ctx)?;
|
||||
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
|
||||
// See https://github.com/neondatabase/neon/issues/4233
|
||||
|
||||
@@ -300,11 +328,19 @@ pub async fn create_tenant(
|
||||
}).await
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum SetNewTenantConfigError {
|
||||
#[error(transparent)]
|
||||
GetTenant(#[from] GetTenantError),
|
||||
#[error(transparent)]
|
||||
Persist(anyhow::Error),
|
||||
}
|
||||
|
||||
pub async fn set_new_tenant_config(
|
||||
conf: &'static PageServerConf,
|
||||
new_tenant_conf: TenantConfOpt,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<(), TenantStateError> {
|
||||
) -> Result<(), SetNewTenantConfigError> {
|
||||
info!("configuring tenant {tenant_id}");
|
||||
let tenant = get_tenant(tenant_id, true).await?;
|
||||
|
||||
@@ -314,23 +350,32 @@ pub async fn set_new_tenant_config(
|
||||
&tenant_config_path,
|
||||
new_tenant_conf,
|
||||
false,
|
||||
)?;
|
||||
)
|
||||
.map_err(SetNewTenantConfigError::Persist)?;
|
||||
tenant.set_new_tenant_config(new_tenant_conf);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum GetTenantError {
|
||||
#[error("Tenant {0} not found")]
|
||||
NotFound(TenantId),
|
||||
#[error("Tenant {0} is not active")]
|
||||
NotActive(TenantId),
|
||||
}
|
||||
|
||||
/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
|
||||
/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
|
||||
pub async fn get_tenant(
|
||||
tenant_id: TenantId,
|
||||
active_only: bool,
|
||||
) -> Result<Arc<Tenant>, TenantStateError> {
|
||||
) -> Result<Arc<Tenant>, GetTenantError> {
|
||||
let m = TENANTS.read().await;
|
||||
let tenant = m
|
||||
.get(&tenant_id)
|
||||
.ok_or(TenantStateError::NotFound(tenant_id))?;
|
||||
.ok_or(GetTenantError::NotFound(tenant_id))?;
|
||||
if active_only && !tenant.is_active() {
|
||||
Err(TenantStateError::NotActive(tenant_id))
|
||||
Err(GetTenantError::NotActive(tenant_id))
|
||||
} else {
|
||||
Ok(Arc::clone(tenant))
|
||||
}
|
||||
@@ -339,7 +384,7 @@ pub async fn get_tenant(
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum DeleteTimelineError {
|
||||
#[error("Tenant {0}")]
|
||||
Tenant(#[from] TenantStateError),
|
||||
Tenant(#[from] GetTenantError),
|
||||
|
||||
#[error("Timeline {0}")]
|
||||
Timeline(#[from] crate::tenant::DeleteTimelineError),
|
||||
@@ -351,7 +396,9 @@ pub async fn delete_timeline(
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), DeleteTimelineError> {
|
||||
let tenant = get_tenant(tenant_id, true).await?;
|
||||
tenant.delete_timeline(timeline_id, ctx).await?;
|
||||
tenant
|
||||
.prepare_and_schedule_delete_timeline(timeline_id, ctx)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -404,6 +451,7 @@ pub async fn detach_tenant(
|
||||
pub async fn load_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), TenantMapInsertError> {
|
||||
@@ -415,7 +463,7 @@ pub async fn load_tenant(
|
||||
.with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?;
|
||||
}
|
||||
|
||||
let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, remote_storage, ctx)
|
||||
let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, None, ctx)
|
||||
.with_context(|| {
|
||||
format!("Failed to schedule tenant processing in path {tenant_path:?}")
|
||||
})?;
|
||||
@@ -472,6 +520,7 @@ pub async fn attach_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
tenant_conf: TenantConfOpt,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), TenantMapInsertError> {
|
||||
@@ -487,7 +536,7 @@ pub async fn attach_tenant(
|
||||
.context("check for attach marker file existence")?;
|
||||
anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");
|
||||
|
||||
let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, Some(remote_storage), ctx)?;
|
||||
let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, ctx)?;
|
||||
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
|
||||
// See https://github.com/neondatabase/neon/issues/4233
|
||||
|
||||
@@ -563,25 +612,26 @@ where
|
||||
// The exclusive lock here ensures we don't miss the tenant state updates before trying another removal.
// Tenant-wide cleanup operations may take some time (removing the entire tenant directory), so we want to
// avoid holding the lock for the entire process.
|
||||
{
|
||||
let tenants_accessor = TENANTS.write().await;
|
||||
match tenants_accessor.get(&tenant_id) {
|
||||
Some(tenant) => match tenant.current_state() {
|
||||
TenantState::Attaching
|
||||
| TenantState::Loading
|
||||
| TenantState::Broken { .. }
|
||||
| TenantState::Active => tenant.set_stopping(),
|
||||
TenantState::Stopping => return Err(TenantStateError::IsStopping(tenant_id)),
|
||||
},
|
||||
None => return Err(TenantStateError::NotFound(tenant_id)),
|
||||
let tenant = {
|
||||
TENANTS
|
||||
.write()
|
||||
.await
|
||||
.get(&tenant_id)
|
||||
.cloned()
|
||||
.ok_or(TenantStateError::NotFound(tenant_id))?
|
||||
};
|
||||
|
||||
let freeze_and_flush = false;
|
||||
|
||||
// shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
|
||||
// that we can continue safely to cleanup.
|
||||
match tenant.shutdown(freeze_and_flush).await {
|
||||
Ok(()) => {}
|
||||
Err(super::ShutdownError::AlreadyStopping) => {
|
||||
return Err(TenantStateError::IsStopping(tenant_id))
|
||||
}
|
||||
}
|
||||
|
||||
// shutdown all tenant and timeline tasks: gc, compaction, page service)
|
||||
// No new tasks will be started for this tenant because it's in `Stopping` state.
|
||||
// Hence, once we're done here, the `tenant_cleanup` callback can mutate tenant on-disk state freely.
|
||||
task_mgr::shutdown_tasks(None, Some(tenant_id), None).await;
|
||||
|
||||
match tenant_cleanup
|
||||
.await
|
||||
.with_context(|| format!("Failed to run cleanup for tenant {tenant_id}"))
|
||||
@@ -597,7 +647,7 @@ where
|
||||
let tenants_accessor = TENANTS.read().await;
|
||||
match tenants_accessor.get(&tenant_id) {
|
||||
Some(tenant) => {
|
||||
tenant.set_broken(e.to_string());
|
||||
tenant.set_broken(e.to_string()).await;
|
||||
}
|
||||
None => {
|
||||
warn!("Tenant {tenant_id} got removed from memory");
|
||||
@@ -663,7 +713,6 @@ pub async fn immediate_gc(
|
||||
Ok(wait_task_done)
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
pub async fn immediate_compact(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
|
||||
@@ -19,14 +19,8 @@ fn parallel_worker(paths: &[PathBuf], next_path_idx: &AtomicUsize) -> io::Result
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {
|
||||
const PARALLEL_PATH_THRESHOLD: usize = 1;
|
||||
if paths.len() <= PARALLEL_PATH_THRESHOLD {
|
||||
for path in paths {
|
||||
fsync_path(path)?;
|
||||
}
|
||||
return Ok(());
|
||||
}
|
||||
fn fsync_in_thread_pool(paths: &[PathBuf]) -> io::Result<()> {
|
||||
// TODO: remove this function in favor of `par_fsync_async` once we asyncify everything.
|
||||
|
||||
/// Use at most this number of threads.
|
||||
/// Increasing this limit will
|
||||
@@ -36,11 +30,11 @@ pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {
|
||||
let num_threads = paths.len().min(MAX_NUM_THREADS);
|
||||
let next_path_idx = AtomicUsize::new(0);
|
||||
|
||||
crossbeam_utils::thread::scope(|s| -> io::Result<()> {
|
||||
std::thread::scope(|s| -> io::Result<()> {
|
||||
let mut handles = vec![];
|
||||
// Spawn `num_threads - 1`, as the current thread is also a worker.
|
||||
for _ in 1..num_threads {
|
||||
handles.push(s.spawn(|_| parallel_worker(paths, &next_path_idx)));
|
||||
handles.push(s.spawn(|| parallel_worker(paths, &next_path_idx)));
|
||||
}
|
||||
|
||||
parallel_worker(paths, &next_path_idx)?;
|
||||
@@ -51,5 +45,41 @@ pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {
|
||||
|
||||
Ok(())
|
||||
})
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
/// Parallel fsync of all the given files. Can be used in a non-async context, as it runs the work on its own worker threads.
pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {
if paths.len() == 1 {
fsync_path(&paths[0])?;
return Ok(());
}

fsync_in_thread_pool(paths)
}

/// Parallel fsync, asynchronously: each fsync runs as a `spawn_blocking` task on the tokio
/// blocking pool, with at most `MAX_CONCURRENT_FSYNC` of them in flight at a time.
pub async fn par_fsync_async(paths: &[PathBuf]) -> io::Result<()> {
|
||||
const MAX_CONCURRENT_FSYNC: usize = 64;
|
||||
let mut next = paths.iter().peekable();
|
||||
let mut js = tokio::task::JoinSet::new();
|
||||
loop {
|
||||
while js.len() < MAX_CONCURRENT_FSYNC && next.peek().is_some() {
|
||||
let next = next.next().expect("just peeked");
|
||||
let next = next.to_owned();
|
||||
js.spawn_blocking(move || fsync_path(&next));
|
||||
}
|
||||
|
||||
// now the joinset has been filled up, wait for next to complete
|
||||
if let Some(res) = js.join_next().await {
|
||||
res??;
|
||||
} else {
|
||||
// last item had already completed
|
||||
assert!(
|
||||
next.peek().is_none(),
|
||||
"joinset emptied, we shouldn't have more work"
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
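For context only (not part of the commit): a minimal sketch of how a caller might drive the two helpers above; the directory and file names are placeholders.

// Hypothetical caller in the same crate as par_fsync / par_fsync_async.
async fn example_fsync(dir: &std::path::Path) -> std::io::Result<()> {
    let paths: Vec<std::path::PathBuf> = vec![dir.join("layer-a"), dir.join("layer-b")];
    // Async path: each fsync becomes a spawn_blocking task, at most MAX_CONCURRENT_FSYNC at a time.
    par_fsync_async(&paths).await?;
    // Sync path, for callers that are not async:
    par_fsync(&paths)?;
    Ok(())
}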
@@ -210,13 +210,15 @@ use chrono::{NaiveDateTime, Utc};
|
||||
pub use download::{is_temp_download_file, list_remote_timelines};
|
||||
use scopeguard::ScopeGuard;
|
||||
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::path::Path;
|
||||
use std::sync::atomic::{AtomicU32, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
|
||||
use std::ops::DerefMut;
|
||||
use tokio::runtime::Runtime;
|
||||
use tracing::{debug, error, info, warn};
|
||||
use tracing::{debug, error, info, instrument, warn};
|
||||
use tracing::{info_span, Instrument};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
@@ -225,7 +227,9 @@ use crate::metrics::{
|
||||
RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
|
||||
REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
|
||||
};
|
||||
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use crate::tenant::upload_queue::Delete;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
task_mgr,
|
||||
@@ -259,7 +263,7 @@ const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
|
||||
|
||||
pub enum MaybeDeletedIndexPart {
|
||||
IndexPart(IndexPart),
|
||||
Deleted,
|
||||
Deleted(IndexPart),
|
||||
}
|
||||
|
||||
/// Errors that can arise when calling [`RemoteTimelineClient::stop`].
|
||||
@@ -361,11 +365,42 @@ impl RemoteTimelineClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Initialize the queue in the stopped state. Used on the startup path
/// to continue a deletion operation that was interrupted by a pageserver crash or restart.
|
||||
pub fn init_upload_queue_stopped_to_continue_deletion(
|
||||
&self,
|
||||
index_part: &IndexPart,
|
||||
) -> anyhow::Result<()> {
|
||||
// FIXME: consider newtype for DeletedIndexPart.
|
||||
let deleted_at = index_part.deleted_at.ok_or(anyhow::anyhow!(
|
||||
"bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted"
|
||||
))?;
|
||||
|
||||
{
|
||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||
upload_queue.initialize_with_current_remote_index_part(index_part)?;
|
||||
self.update_remote_physical_size_gauge(Some(index_part));
|
||||
}
|
||||
// also locks upload queue, without dropping the guard above it will be a deadlock
|
||||
self.stop().expect("initialized line above");
|
||||
|
||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||
|
||||
upload_queue
|
||||
.stopped_mut()
|
||||
.expect("stopped above")
|
||||
.deleted_at = SetDeletedFlagProgress::Successful(deleted_at);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
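To show where this new initializer fits, here is a sketch of the intended startup flow (not part of the commit; the function name is invented and the surrounding startup code is simplified):

// Hypothetical startup helper with `client: Arc<RemoteTimelineClient>` already constructed.
async fn example_resume_timeline(client: Arc<RemoteTimelineClient>) -> anyhow::Result<()> {
    match client.download_index_file().await? {
        MaybeDeletedIndexPart::IndexPart(_index_part) => {
            // Normal case: initialize the regular upload queue from the index (not shown in this hunk).
        }
        MaybeDeletedIndexPart::Deleted(index_part) => {
            // deleted_at was already persisted remotely: resume the interrupted deletion.
            client.init_upload_queue_stopped_to_continue_deletion(&index_part)?;
            client.delete_all().await?;
        }
    }
    Ok(())
}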
pub fn last_uploaded_consistent_lsn(&self) -> Option<Lsn> {
|
||||
match &*self.upload_queue.lock().unwrap() {
|
||||
UploadQueue::Uninitialized => None,
|
||||
UploadQueue::Initialized(q) => Some(q.last_uploaded_consistent_lsn),
|
||||
UploadQueue::Stopped(q) => Some(q.last_uploaded_consistent_lsn),
|
||||
UploadQueue::Stopped(q) => {
|
||||
Some(q.upload_queue_for_deletion.last_uploaded_consistent_lsn)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -420,7 +455,7 @@ impl RemoteTimelineClient {
|
||||
.await?;
|
||||
|
||||
if index_part.deleted_at.is_some() {
|
||||
Ok(MaybeDeletedIndexPart::Deleted)
|
||||
Ok(MaybeDeletedIndexPart::Deleted(index_part))
|
||||
} else {
|
||||
Ok(MaybeDeletedIndexPart::IndexPart(index_part))
|
||||
}
|
||||
@@ -622,7 +657,11 @@ impl RemoteTimelineClient {
|
||||
|
||||
// schedule the actual deletions
|
||||
for name in names {
|
||||
let op = UploadOp::Delete(RemoteOpFileKind::Layer, name.clone());
|
||||
let op = UploadOp::Delete(Delete {
|
||||
file_kind: RemoteOpFileKind::Layer,
|
||||
layer_file_name: name.clone(),
|
||||
scheduled_from_timeline_delete: false,
|
||||
});
|
||||
self.calls_unfinished_metric_begin(&op);
|
||||
upload_queue.queued_operations.push_back(op);
|
||||
info!("scheduled layer file deletion {}", name.file_name());
|
||||
@@ -639,18 +678,11 @@ impl RemoteTimelineClient {
|
||||
/// Wait for all previously scheduled uploads/deletions to complete
|
||||
///
|
||||
pub async fn wait_completion(self: &Arc<Self>) -> anyhow::Result<()> {
|
||||
let (sender, mut receiver) = tokio::sync::watch::channel(());
|
||||
let barrier_op = UploadOp::Barrier(sender);
|
||||
|
||||
{
|
||||
let mut receiver = {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
upload_queue.queued_operations.push_back(barrier_op);
|
||||
// Don't count this kind of operation!
|
||||
|
||||
// Launch the task immediately, if possible
|
||||
self.launch_queued_tasks(upload_queue);
|
||||
}
|
||||
self.schedule_barrier(upload_queue)
|
||||
};
|
||||
|
||||
if receiver.changed().await.is_err() {
|
||||
anyhow::bail!("wait_completion aborted because upload queue was stopped");
|
||||
@@ -658,6 +690,22 @@ impl RemoteTimelineClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn schedule_barrier(
|
||||
self: &Arc<Self>,
|
||||
upload_queue: &mut UploadQueueInitialized,
|
||||
) -> tokio::sync::watch::Receiver<()> {
|
||||
let (sender, receiver) = tokio::sync::watch::channel(());
|
||||
let barrier_op = UploadOp::Barrier(sender);
|
||||
|
||||
upload_queue.queued_operations.push_back(barrier_op);
|
||||
// Don't count this kind of operation!
|
||||
|
||||
// Launch the task immediately, if possible
|
||||
self.launch_queued_tasks(upload_queue);
|
||||
|
||||
receiver
|
||||
}
|
||||
|
||||
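A sketch of the caller-facing behaviour that wait_completion keeps after this refactor (illustrative only, not part of the commit):

// Hypothetical flush helper; `client` is an Arc<RemoteTimelineClient>.
async fn example_wait_for_uploads(client: Arc<RemoteTimelineClient>) -> anyhow::Result<()> {
    // Everything scheduled before this point (layer uploads, deletions, index uploads)
    // completes before the barrier's watch channel fires.
    client.wait_completion().await?;
    Ok(())
}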
/// Set the deleted_at field in the remote index file.
|
||||
///
|
||||
/// This fails if the upload queue has not been `stop()`ed.
|
||||
@@ -665,6 +713,7 @@ impl RemoteTimelineClient {
|
||||
/// The caller is responsible for calling `stop()` AND for waiting
|
||||
/// for any ongoing upload tasks to finish after `stop()` has succeeded.
|
||||
/// Check method [`RemoteTimelineClient::stop`] for details.
|
||||
#[instrument(skip_all)]
|
||||
pub(crate) async fn persist_index_part_with_deleted_flag(
|
||||
self: &Arc<Self>,
|
||||
) -> Result<(), PersistIndexPartWithDeletedFlagError> {
|
||||
@@ -674,15 +723,7 @@ impl RemoteTimelineClient {
|
||||
// We must be in stopped state because otherwise
|
||||
// we can have inprogress index part upload that can overwrite the file
|
||||
// with missing is_deleted flag that we going to set below
|
||||
let stopped = match &mut *locked {
|
||||
UploadQueue::Uninitialized => {
|
||||
return Err(anyhow::anyhow!("is not Stopped but Uninitialized").into())
|
||||
}
|
||||
UploadQueue::Initialized(_) => {
|
||||
return Err(anyhow::anyhow!("is not Stopped but Initialized").into())
|
||||
}
|
||||
UploadQueue::Stopped(stopped) => stopped,
|
||||
};
|
||||
let stopped = locked.stopped_mut()?;
|
||||
|
||||
match stopped.deleted_at {
|
||||
SetDeletedFlagProgress::NotRunning => (), // proceed
|
||||
@@ -696,27 +737,17 @@ impl RemoteTimelineClient {
|
||||
let deleted_at = Utc::now().naive_utc();
|
||||
stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at);
|
||||
|
||||
let mut index_part = IndexPart::new(
|
||||
stopped.latest_files.clone(),
|
||||
stopped.last_uploaded_consistent_lsn,
|
||||
stopped
|
||||
.latest_metadata
|
||||
.to_bytes()
|
||||
.context("serialize metadata")?,
|
||||
);
|
||||
let mut index_part = IndexPart::try_from(&stopped.upload_queue_for_deletion)
|
||||
.context("IndexPart serialize")?;
|
||||
index_part.deleted_at = Some(deleted_at);
|
||||
index_part
|
||||
};
|
||||
|
||||
let undo_deleted_at = scopeguard::guard(Arc::clone(self), |self_clone| {
|
||||
let mut locked = self_clone.upload_queue.lock().unwrap();
|
||||
let stopped = match &mut *locked {
|
||||
UploadQueue::Uninitialized | UploadQueue::Initialized(_) => unreachable!(
|
||||
"there's no way out of Stopping, and we checked it's Stopping above: {:?}",
|
||||
locked.as_str(),
|
||||
),
|
||||
UploadQueue::Stopped(stopped) => stopped,
|
||||
};
|
||||
let stopped = locked
|
||||
.stopped_mut()
|
||||
.expect("there's no way out of Stopping, and we checked it's Stopping above");
|
||||
stopped.deleted_at = SetDeletedFlagProgress::NotRunning;
|
||||
});
|
||||
|
||||
@@ -751,13 +782,10 @@ impl RemoteTimelineClient {
|
||||
ScopeGuard::into_inner(undo_deleted_at);
|
||||
{
|
||||
let mut locked = self.upload_queue.lock().unwrap();
|
||||
let stopped = match &mut *locked {
|
||||
UploadQueue::Uninitialized | UploadQueue::Initialized(_) => unreachable!(
|
||||
"there's no way out of Stopping, and we checked it's Stopping above: {:?}",
|
||||
locked.as_str(),
|
||||
),
|
||||
UploadQueue::Stopped(stopped) => stopped,
|
||||
};
|
||||
|
||||
let stopped = locked
|
||||
.stopped_mut()
|
||||
.expect("there's no way out of Stopping, and we checked it's Stopping above");
|
||||
stopped.deleted_at = SetDeletedFlagProgress::Successful(
|
||||
index_part_with_deleted_at
|
||||
.deleted_at
|
||||
@@ -768,6 +796,92 @@ impl RemoteTimelineClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Prerequisites: the upload queue must be in the stopped state and `deleted_at` must be successfully set.
/// The function deletes layer files one by one, then lists the prefix to see whether anything was leaked,
/// deletes any leaked files, and finally deletes the index file.
|
||||
pub(crate) async fn delete_all(self: &Arc<Self>) -> anyhow::Result<()> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
let (mut receiver, deletions_queued) = {
|
||||
let mut deletions_queued = 0;
|
||||
|
||||
let mut locked = self.upload_queue.lock().unwrap();
|
||||
let stopped = locked.stopped_mut()?;
|
||||
|
||||
if !matches!(stopped.deleted_at, SetDeletedFlagProgress::Successful(_)) {
|
||||
anyhow::bail!("deleted_at is not set")
|
||||
}
|
||||
|
||||
debug_assert!(stopped.upload_queue_for_deletion.no_pending_work());
|
||||
|
||||
stopped
|
||||
.upload_queue_for_deletion
|
||||
.queued_operations
|
||||
.reserve(stopped.upload_queue_for_deletion.latest_files.len());
|
||||
|
||||
// schedule the actual deletions
|
||||
for name in stopped.upload_queue_for_deletion.latest_files.keys() {
|
||||
let op = UploadOp::Delete(Delete {
|
||||
file_kind: RemoteOpFileKind::Layer,
|
||||
layer_file_name: name.clone(),
|
||||
scheduled_from_timeline_delete: true,
|
||||
});
|
||||
self.calls_unfinished_metric_begin(&op);
|
||||
stopped
|
||||
.upload_queue_for_deletion
|
||||
.queued_operations
|
||||
.push_back(op);
|
||||
|
||||
info!("scheduled layer file deletion {}", name.file_name());
|
||||
deletions_queued += 1;
|
||||
}
|
||||
|
||||
self.launch_queued_tasks(&mut stopped.upload_queue_for_deletion);
|
||||
|
||||
(
|
||||
self.schedule_barrier(&mut stopped.upload_queue_for_deletion),
|
||||
deletions_queued,
|
||||
)
|
||||
};
|
||||
|
||||
receiver.changed().await?;
|
||||
|
||||
// Do not delete the index part yet: it is needed for a possible retry. If we removed it first
// and the retry arrived at a different pageserver, there would be no trace of it in remote storage.
|
||||
let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
|
||||
let timeline_storage_path = self.conf.remote_path(&timeline_path)?;
|
||||
|
||||
let remaining = self
|
||||
.storage_impl
|
||||
.list_prefixes(Some(&timeline_storage_path))
|
||||
.await?;
|
||||
|
||||
let remaining: Vec<RemotePath> = remaining
|
||||
.into_iter()
|
||||
.filter(|p| p.object_name() != Some(IndexPart::FILE_NAME))
|
||||
.collect();
|
||||
|
||||
if !remaining.is_empty() {
|
||||
warn!(
|
||||
"Found {} files not bound to index_file.json, proceeding with their deletion",
|
||||
remaining.len()
|
||||
);
|
||||
for file in remaining {
|
||||
warn!("Removing {}", file.object_name().unwrap_or_default());
|
||||
self.storage_impl.delete(&file).await?;
|
||||
}
|
||||
}
|
||||
|
||||
let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
|
||||
|
||||
debug!("deleting index part");
|
||||
self.storage_impl.delete(&index_file_path).await?;
|
||||
|
||||
info!(deletions_queued, "done deleting, including index_part.json");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
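Putting the pieces of this file together, the remote side of a timeline deletion now runs roughly as follows (a sketch under the stated prerequisites, not part of the commit):

// Hypothetical orchestration; assumes the upload queue has already been stopped via stop().
async fn example_delete_remote_timeline(client: Arc<RemoteTimelineClient>) -> anyhow::Result<()> {
    // 1. Persist deleted_at in index_part.json; this is what makes the deletion resumable.
    client.persist_index_part_with_deleted_flag().await?;
    // 2. Delete layer files, sweep any leaked objects under the timeline prefix,
    //    and delete index_part.json last.
    client.delete_all().await?;
    Ok(())
}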
///
|
||||
/// Pick next tasks from the queue, and start as many of them as possible without violating
|
||||
/// the ordering constraints.
|
||||
@@ -786,7 +900,7 @@ impl RemoteTimelineClient {
|
||||
// have finished.
|
||||
upload_queue.inprogress_tasks.is_empty()
|
||||
}
|
||||
UploadOp::Delete(_, _) => {
|
||||
UploadOp::Delete(_) => {
|
||||
// Wait for preceding uploads to finish. Concurrent deletions are OK, though.
|
||||
upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len()
|
||||
}
|
||||
@@ -817,7 +931,7 @@ impl RemoteTimelineClient {
|
||||
UploadOp::UploadMetadata(_, _) => {
|
||||
upload_queue.num_inprogress_metadata_uploads += 1;
|
||||
}
|
||||
UploadOp::Delete(_, _) => {
|
||||
UploadOp::Delete(_) => {
|
||||
upload_queue.num_inprogress_deletions += 1;
|
||||
}
|
||||
UploadOp::Barrier(sender) => {
|
||||
@@ -891,7 +1005,6 @@ impl RemoteTimelineClient {
|
||||
unreachable!("we never launch an upload task if the queue is uninitialized, and once it is initialized, we never go back")
|
||||
}
|
||||
}
|
||||
self.calls_unfinished_metric_end(&task.op);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -937,16 +1050,16 @@ impl RemoteTimelineClient {
|
||||
}
|
||||
res
|
||||
}
|
||||
UploadOp::Delete(metric_file_kind, ref layer_file_name) => {
|
||||
UploadOp::Delete(delete) => {
|
||||
let path = &self
|
||||
.conf
|
||||
.timeline_path(&self.timeline_id, &self.tenant_id)
|
||||
.join(layer_file_name.file_name());
|
||||
.join(delete.layer_file_name.file_name());
|
||||
delete::delete_layer(self.conf, &self.storage_impl, path)
|
||||
.measure_remote_op(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
*metric_file_kind,
|
||||
delete.file_kind,
|
||||
RemoteOpKind::Delete,
|
||||
Arc::clone(&self.metrics),
|
||||
)
|
||||
@@ -1012,11 +1125,24 @@ impl RemoteTimelineClient {
|
||||
let mut upload_queue_guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = match upload_queue_guard.deref_mut() {
|
||||
UploadQueue::Uninitialized => panic!("callers are responsible for ensuring this is only called on an initialized queue"),
|
||||
UploadQueue::Stopped(_) => {
|
||||
UploadQueue::Stopped(stopped) => {
|
||||
// Special care is needed for deletions: if it was an earlier deletion (not scheduled from timeline deletion),
// then stop() already took care of it, so we just return.
// For deletions that come from delete_all we still want to maintain metrics, launch follow-up tasks, etc.
|
||||
match &task.op {
|
||||
UploadOp::Delete(delete) if delete.scheduled_from_timeline_delete => Some(&mut stopped.upload_queue_for_deletion),
|
||||
_ => None
|
||||
}
|
||||
},
|
||||
UploadQueue::Initialized(qi) => { Some(qi) }
|
||||
};
|
||||
|
||||
let upload_queue = match upload_queue {
|
||||
Some(upload_queue) => upload_queue,
|
||||
None => {
|
||||
info!("another concurrent task already stopped the queue");
|
||||
return;
|
||||
}, // nothing to do
|
||||
UploadQueue::Initialized(qi) => { qi }
|
||||
}
|
||||
};
|
||||
|
||||
upload_queue.inprogress_tasks.remove(&task.task_id);
|
||||
@@ -1029,7 +1155,7 @@ impl RemoteTimelineClient {
|
||||
upload_queue.num_inprogress_metadata_uploads -= 1;
|
||||
upload_queue.last_uploaded_consistent_lsn = lsn; // XXX monotonicity check?
|
||||
}
|
||||
UploadOp::Delete(_, _) => {
|
||||
UploadOp::Delete(_) => {
|
||||
upload_queue.num_inprogress_deletions -= 1;
|
||||
}
|
||||
UploadOp::Barrier(_) => unreachable!(),
|
||||
@@ -1063,8 +1189,8 @@ impl RemoteTimelineClient {
|
||||
reason: "metadata uploads are tiny",
|
||||
},
|
||||
),
|
||||
UploadOp::Delete(file_kind, _) => (
|
||||
*file_kind,
|
||||
UploadOp::Delete(delete) => (
|
||||
delete.file_kind,
|
||||
RemoteOpKind::Delete,
|
||||
DontTrackSize {
|
||||
reason: "should we track deletes? positive or negative sign?",
|
||||
@@ -1111,32 +1237,36 @@ impl RemoteTimelineClient {
|
||||
info!("another concurrent task already shut down the queue");
|
||||
Ok(())
|
||||
}
|
||||
UploadQueue::Initialized(UploadQueueInitialized {
|
||||
latest_files,
|
||||
latest_metadata,
|
||||
last_uploaded_consistent_lsn,
|
||||
..
|
||||
}) => {
|
||||
UploadQueue::Initialized(initialized) => {
|
||||
info!("shutting down upload queue");
|
||||
|
||||
// Replace the queue with the Stopped state, taking ownership of the old
|
||||
// Initialized queue. We will do some checks on it, and then drop it.
|
||||
let qi = {
|
||||
// take or clone what we need
|
||||
let latest_files = std::mem::take(latest_files);
|
||||
let last_uploaded_consistent_lsn = *last_uploaded_consistent_lsn;
|
||||
// this could be Copy
|
||||
let latest_metadata = latest_metadata.clone();
|
||||
|
||||
let stopped = UploadQueueStopped {
|
||||
latest_files,
|
||||
last_uploaded_consistent_lsn,
|
||||
latest_metadata,
|
||||
deleted_at: SetDeletedFlagProgress::NotRunning,
|
||||
// Here we preserve a working version of the upload queue for possible use during deletions.
// An in-place replace of Initialized with Stopped could be done with the help of https://github.com/Sgeo/take_mut,
// but for this use case it doesn't really make sense to bring in unsafe code just for this one call site.
// Deletion is not really perf-sensitive, so there shouldn't be any problem with cloning a fraction of it.
|
||||
let upload_queue_for_deletion = UploadQueueInitialized {
|
||||
task_counter: 0,
|
||||
latest_files: initialized.latest_files.clone(),
|
||||
latest_files_changes_since_metadata_upload_scheduled: 0,
|
||||
latest_metadata: initialized.latest_metadata.clone(),
|
||||
last_uploaded_consistent_lsn: initialized.last_uploaded_consistent_lsn,
|
||||
num_inprogress_layer_uploads: 0,
|
||||
num_inprogress_metadata_uploads: 0,
|
||||
num_inprogress_deletions: 0,
|
||||
inprogress_tasks: HashMap::default(),
|
||||
queued_operations: VecDeque::default(),
|
||||
};
|
||||
|
||||
let upload_queue =
|
||||
std::mem::replace(&mut *guard, UploadQueue::Stopped(stopped));
|
||||
let upload_queue = std::mem::replace(
|
||||
&mut *guard,
|
||||
UploadQueue::Stopped(UploadQueueStopped {
|
||||
upload_queue_for_deletion,
|
||||
deleted_at: SetDeletedFlagProgress::NotRunning,
|
||||
}),
|
||||
);
|
||||
if let UploadQueue::Initialized(qi) = upload_queue {
|
||||
qi
|
||||
} else {
|
||||
@@ -1144,8 +1274,6 @@ impl RemoteTimelineClient {
|
||||
}
|
||||
};
|
||||
|
||||
assert!(qi.latest_files.is_empty(), "do not use this anymore");
|
||||
|
||||
// consistency check
|
||||
assert_eq!(
|
||||
qi.num_inprogress_layer_uploads
|
||||
@@ -1264,9 +1392,7 @@ mod tests {
|
||||
let harness = TenantHarness::create(test_name)?;
|
||||
let (tenant, ctx) = runtime.block_on(harness.load());
|
||||
// create an empty timeline directory
|
||||
let timeline =
|
||||
tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
|
||||
let _ = timeline.initialize(&ctx).unwrap();
|
||||
let _ = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
|
||||
|
||||
let remote_fs_dir = harness.conf.workdir.join("remote_fs");
|
||||
std::fs::create_dir_all(remote_fs_dir)?;
|
||||
@@ -1410,7 +1536,7 @@ mod tests {
|
||||
// Download back the index.json, and check that the list of files is correct
|
||||
let index_part = match runtime.block_on(client.download_index_file())? {
|
||||
MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
|
||||
MaybeDeletedIndexPart::Deleted => panic!("unexpectedly got deleted index part"),
|
||||
MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
|
||||
};
|
||||
|
||||
assert_file_list(
|
||||
|
||||
@@ -7,9 +7,11 @@ use std::collections::{HashMap, HashSet};
|
||||
use chrono::NaiveDateTime;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use utils::bin_ser::SerializeError;
|
||||
|
||||
use crate::tenant::metadata::TimelineMetadata;
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
use crate::tenant::upload_queue::UploadQueueInitialized;
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
@@ -115,6 +117,21 @@ impl IndexPart {
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&UploadQueueInitialized> for IndexPart {
|
||||
type Error = SerializeError;
|
||||
|
||||
fn try_from(upload_queue: &UploadQueueInitialized) -> Result<Self, Self::Error> {
|
||||
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
|
||||
let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
|
||||
|
||||
Ok(Self::new(
|
||||
upload_queue.latest_files.clone(),
|
||||
disk_consistent_lsn,
|
||||
metadata_bytes,
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
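A sketch of how the deletion path can use this conversion (illustrative, not part of the commit; assumes it runs inside this module where the index fields are visible):

// Hypothetical helper mirroring the construction step in persist_index_part_with_deleted_flag.
fn example_deleted_index_part(queue: &UploadQueueInitialized) -> Result<IndexPart, SerializeError> {
    // Copies latest_files / disk_consistent_lsn and serializes latest_metadata.
    let mut index_part = IndexPart::try_from(queue)?;
    // The caller can then stamp additional fields before uploading, e.g. deleted_at.
    index_part.deleted_at = Some(chrono::Utc::now().naive_utc());
    Ok(index_part)
}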
/// Serialized form of [`LayerFileMetadata`].
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)]
|
||||
pub struct IndexLayerMetadata {
|
||||
|
||||
@@ -4,6 +4,7 @@ pub mod delta_layer;
|
||||
mod filename;
|
||||
mod image_layer;
|
||||
mod inmemory_layer;
|
||||
mod layer_desc;
|
||||
mod remote_layer;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
@@ -37,6 +38,7 @@ pub use delta_layer::{DeltaLayer, DeltaLayerWriter};
|
||||
pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
|
||||
pub use image_layer::{ImageLayer, ImageLayerWriter};
|
||||
pub use inmemory_layer::InMemoryLayer;
|
||||
pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
|
||||
pub use remote_layer::RemoteLayer;
|
||||
|
||||
use super::layer_map::BatchedUpdates;
|
||||
@@ -406,14 +408,23 @@ pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i>;
|
||||
/// An image layer is a snapshot of all the data in a key-range, at a single
|
||||
/// LSN.
|
||||
pub trait PersistentLayer: Layer {
|
||||
fn get_tenant_id(&self) -> TenantId;
|
||||
/// Get the layer descriptor.
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc;
|
||||
|
||||
fn get_tenant_id(&self) -> TenantId {
|
||||
self.layer_desc().tenant_id
|
||||
}
|
||||
|
||||
/// Identify the timeline this layer belongs to
|
||||
fn get_timeline_id(&self) -> TimelineId;
|
||||
fn get_timeline_id(&self) -> TimelineId {
|
||||
self.layer_desc().timeline_id
|
||||
}
|
||||
|
||||
/// File name used for this layer, both in the pageserver's local filesystem
|
||||
/// state as well as in the remote storage.
|
||||
fn filename(&self) -> LayerFileName;
|
||||
fn filename(&self) -> LayerFileName {
|
||||
self.layer_desc().filename()
|
||||
}
|
||||
|
||||
// Path to the layer file in the local filesystem.
|
||||
// `None` for `RemoteLayer`.
|
||||
@@ -443,7 +454,9 @@ pub trait PersistentLayer: Layer {
|
||||
///
|
||||
/// Should not change over the lifetime of the layer object because
/// current_physical_size is computed as the sum of this value across all layers.
|
||||
fn file_size(&self) -> u64;
|
||||
fn file_size(&self) -> u64 {
|
||||
self.layer_desc().file_size
|
||||
}
|
||||
|
||||
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo;
|
||||
|
||||
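The net effect of the hunks above is that the identity accessors become default methods that read from the single descriptor. A rough illustration of what callers see (not part of the commit; `describe` is a made-up helper):

// Hypothetical helper over any persistent layer; all four accessors now route through layer_desc().
fn describe(layer: &dyn PersistentLayer) -> String {
    format!(
        "{}/{} {} ({} bytes)",
        layer.get_tenant_id(),
        layer.get_timeline_id(),
        layer.filename().file_name(),
        layer.file_size(),
    )
}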
@@ -472,6 +485,20 @@ pub struct LayerDescriptor {
|
||||
pub short_id: String,
|
||||
}
|
||||
|
||||
impl LayerDescriptor {
|
||||
/// `LayerDescriptor` is only used for testing purposes, so it does not matter whether it is an image or a delta layer,
/// and the tenant / timeline IDs do not matter either.
|
||||
pub fn get_persistent_layer_desc(&self) -> PersistentLayerDesc {
|
||||
PersistentLayerDesc::new_delta(
|
||||
TenantId::from_array([0; 16]),
|
||||
TimelineId::from_array([0; 16]),
|
||||
self.key.clone(),
|
||||
self.lsn.clone(),
|
||||
233,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl Layer for LayerDescriptor {
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
self.key.clone()
|
||||
@@ -542,7 +569,7 @@ impl From<LayerFileName> for LayerDescriptor {
|
||||
///
|
||||
/// This is used by DeltaLayer and ImageLayer. Normally, this holds a reference to the
|
||||
/// global config, and paths to layer files are constructed using the tenant/timeline
|
||||
/// path from the config. But in the 'pageserver_binutils' binary, we need to construct a Layer
|
||||
/// path from the config. But in the 'pagectl' binary, we need to construct a Layer
|
||||
/// struct for a file on disk, without having a page server running, so that we have no
|
||||
/// config. In that case, we use the Path variant to hold the full path to the file on
|
||||
/// disk.
|
||||
|
||||
@@ -56,8 +56,8 @@ use utils::{
|
||||
};
|
||||
|
||||
use super::{
|
||||
DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerFileName, LayerIter,
|
||||
LayerKeyIter, PathOrConf,
|
||||
DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter,
|
||||
PathOrConf, PersistentLayerDesc,
|
||||
};
|
||||
|
||||
///
|
||||
@@ -89,10 +89,10 @@ impl From<&DeltaLayer> for Summary {
|
||||
magic: DELTA_FILE_MAGIC,
|
||||
format_version: STORAGE_FORMAT_VERSION,
|
||||
|
||||
tenant_id: layer.tenant_id,
|
||||
timeline_id: layer.timeline_id,
|
||||
key_range: layer.key_range.clone(),
|
||||
lsn_range: layer.lsn_range.clone(),
|
||||
tenant_id: layer.desc.tenant_id,
|
||||
timeline_id: layer.desc.timeline_id,
|
||||
key_range: layer.desc.key_range.clone(),
|
||||
lsn_range: layer.desc.lsn_range.clone(),
|
||||
|
||||
index_start_blk: 0,
|
||||
index_root_blk: 0,
|
||||
@@ -110,7 +110,7 @@ const WILL_INIT: u64 = 1;
|
||||
/// reading/deserializing records themselves.
|
||||
///
|
||||
#[derive(Debug, Serialize, Deserialize, Copy, Clone)]
|
||||
struct BlobRef(u64);
|
||||
pub struct BlobRef(pub u64);
|
||||
|
||||
impl BlobRef {
|
||||
pub fn will_init(&self) -> bool {
|
||||
@@ -180,12 +180,7 @@ impl DeltaKey {
|
||||
pub struct DeltaLayer {
|
||||
path_or_conf: PathOrConf,
|
||||
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub key_range: Range<Key>,
|
||||
pub lsn_range: Range<Lsn>,
|
||||
|
||||
pub file_size: u64,
|
||||
pub desc: PersistentLayerDesc,
|
||||
|
||||
access_stats: LayerAccessStats,
|
||||
|
||||
@@ -197,9 +192,9 @@ impl std::fmt::Debug for DeltaLayer {
|
||||
use super::RangeDisplayDebug;
|
||||
|
||||
f.debug_struct("DeltaLayer")
|
||||
.field("key_range", &RangeDisplayDebug(&self.key_range))
|
||||
.field("lsn_range", &self.lsn_range)
|
||||
.field("file_size", &self.file_size)
|
||||
.field("key_range", &RangeDisplayDebug(&self.desc.key_range))
|
||||
.field("lsn_range", &self.desc.lsn_range)
|
||||
.field("file_size", &self.desc.file_size)
|
||||
.field("inner", &self.inner)
|
||||
.finish()
|
||||
}
|
||||
@@ -228,30 +223,16 @@ impl std::fmt::Debug for DeltaLayerInner {
|
||||
}
|
||||
|
||||
impl Layer for DeltaLayer {
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
self.key_range.clone()
|
||||
}
|
||||
|
||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
self.lsn_range.clone()
|
||||
}
|
||||
fn is_incremental(&self) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
fn short_id(&self) -> String {
|
||||
self.filename().file_name()
|
||||
}
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
println!(
|
||||
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
self.key_range.start,
|
||||
self.key_range.end,
|
||||
self.lsn_range.start,
|
||||
self.lsn_range.end
|
||||
self.desc.tenant_id,
|
||||
self.desc.timeline_id,
|
||||
self.desc.key_range.start,
|
||||
self.desc.key_range.end,
|
||||
self.desc.lsn_range.start,
|
||||
self.desc.lsn_range.end
|
||||
);
|
||||
|
||||
if !verbose {
|
||||
@@ -324,10 +305,10 @@ impl Layer for DeltaLayer {
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
ensure!(lsn_range.start >= self.lsn_range.start);
|
||||
ensure!(lsn_range.start >= self.desc.lsn_range.start);
|
||||
let mut need_image = true;
|
||||
|
||||
ensure!(self.key_range.contains(&key));
|
||||
ensure!(self.desc.key_range.contains(&key));
|
||||
|
||||
{
|
||||
// Open the file and lock the metadata in memory
|
||||
@@ -402,19 +383,31 @@ impl Layer for DeltaLayer {
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
}
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
self.layer_desc().key_range.clone()
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
self.layer_desc().lsn_range.clone()
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn is_incremental(&self) -> bool {
|
||||
self.layer_desc().is_incremental
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn short_id(&self) -> String {
|
||||
self.layer_desc().short_id()
|
||||
}
|
||||
}
|
||||
|
||||
impl PersistentLayer for DeltaLayer {
|
||||
fn get_tenant_id(&self) -> TenantId {
|
||||
self.tenant_id
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> TimelineId {
|
||||
self.timeline_id
|
||||
}
|
||||
|
||||
fn filename(&self) -> LayerFileName {
|
||||
self.layer_name().into()
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc {
|
||||
&self.desc
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
@@ -444,10 +437,6 @@ impl PersistentLayer for DeltaLayer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn file_size(&self) -> u64 {
|
||||
self.file_size
|
||||
}
|
||||
|
||||
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
let layer_file_name = self.filename().file_name();
|
||||
let lsn_range = self.get_lsn_range();
|
||||
@@ -456,7 +445,7 @@ impl PersistentLayer for DeltaLayer {
|
||||
|
||||
HistoricLayerInfo::Delta {
|
||||
layer_file_name,
|
||||
layer_file_size: self.file_size,
|
||||
layer_file_size: self.desc.file_size,
|
||||
lsn_start: lsn_range.start,
|
||||
lsn_end: lsn_range.end,
|
||||
remote: false,
|
||||
@@ -602,11 +591,13 @@ impl DeltaLayer {
|
||||
) -> DeltaLayer {
|
||||
DeltaLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
key_range: filename.key_range.clone(),
|
||||
lsn_range: filename.lsn_range.clone(),
|
||||
file_size,
|
||||
desc: PersistentLayerDesc::new_delta(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
filename.key_range.clone(),
|
||||
filename.lsn_range.clone(),
|
||||
file_size,
|
||||
),
|
||||
access_stats,
|
||||
inner: RwLock::new(DeltaLayerInner {
|
||||
loaded: false,
|
||||
@@ -619,7 +610,7 @@ impl DeltaLayer {
|
||||
|
||||
/// Create a DeltaLayer struct representing an existing file on disk.
|
||||
///
|
||||
/// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary.
|
||||
/// This variant is only used for debugging purposes, by the 'pagectl' binary.
|
||||
pub fn new_for_path(path: &Path, file: File) -> Result<Self> {
|
||||
let mut summary_buf = Vec::new();
|
||||
summary_buf.resize(PAGE_SZ, 0);
|
||||
@@ -632,11 +623,13 @@ impl DeltaLayer {
|
||||
|
||||
Ok(DeltaLayer {
|
||||
path_or_conf: PathOrConf::Path(path.to_path_buf()),
|
||||
timeline_id: summary.timeline_id,
|
||||
tenant_id: summary.tenant_id,
|
||||
key_range: summary.key_range,
|
||||
lsn_range: summary.lsn_range,
|
||||
file_size: metadata.len(),
|
||||
desc: PersistentLayerDesc::new_delta(
|
||||
summary.tenant_id,
|
||||
summary.timeline_id,
|
||||
summary.key_range,
|
||||
summary.lsn_range,
|
||||
metadata.len(),
|
||||
),
|
||||
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
|
||||
inner: RwLock::new(DeltaLayerInner {
|
||||
loaded: false,
|
||||
@@ -648,18 +641,14 @@ impl DeltaLayer {
|
||||
}
|
||||
|
||||
fn layer_name(&self) -> DeltaFileName {
|
||||
DeltaFileName {
|
||||
key_range: self.key_range.clone(),
|
||||
lsn_range: self.lsn_range.clone(),
|
||||
}
|
||||
self.desc.delta_file_name()
|
||||
}
|
||||
|
||||
/// Path to the layer file in pageserver workdir.
|
||||
pub fn path(&self) -> PathBuf {
|
||||
Self::path_for(
|
||||
&self.path_or_conf,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
self.desc.timeline_id,
|
||||
self.desc.tenant_id,
|
||||
&self.layer_name(),
|
||||
)
|
||||
}
|
||||
@@ -803,11 +792,13 @@ impl DeltaLayerWriterInner {
|
||||
// set inner.file here. The first read will have to re-open it.
|
||||
let layer = DeltaLayer {
|
||||
path_or_conf: PathOrConf::Conf(self.conf),
|
||||
tenant_id: self.tenant_id,
|
||||
timeline_id: self.timeline_id,
|
||||
key_range: self.key_start..key_end,
|
||||
lsn_range: self.lsn_range.clone(),
|
||||
file_size: metadata.len(),
|
||||
desc: PersistentLayerDesc::new_delta(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
self.key_start..key_end,
|
||||
self.lsn_range.clone(),
|
||||
metadata.len(),
|
||||
),
|
||||
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
|
||||
inner: RwLock::new(DeltaLayerInner {
|
||||
loaded: false,
|
||||
|
||||
@@ -9,6 +9,8 @@ use std::str::FromStr;
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use super::PersistentLayerDesc;
|
||||
|
||||
// Note: Timeline::load_layer_map() relies on this sort order
|
||||
#[derive(PartialEq, Eq, Clone, Hash)]
|
||||
pub struct DeltaFileName {
|
||||
@@ -153,7 +155,7 @@ impl Ord for ImageFileName {
|
||||
impl ImageFileName {
|
||||
pub fn lsn_as_range(&self) -> Range<Lsn> {
|
||||
// Saves from having to copypaste this all over
|
||||
self.lsn..(self.lsn + 1)
|
||||
PersistentLayerDesc::image_layer_lsn_range(self.lsn)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -52,8 +52,8 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use super::filename::{ImageFileName, LayerFileName};
|
||||
use super::{Layer, LayerAccessStatsReset, LayerIter, PathOrConf};
|
||||
use super::filename::ImageFileName;
|
||||
use super::{Layer, LayerAccessStatsReset, LayerIter, PathOrConf, PersistentLayerDesc};
|
||||
|
||||
///
|
||||
/// Header stored in the beginning of the file
|
||||
@@ -84,9 +84,9 @@ impl From<&ImageLayer> for Summary {
|
||||
Self {
|
||||
magic: IMAGE_FILE_MAGIC,
|
||||
format_version: STORAGE_FORMAT_VERSION,
|
||||
tenant_id: layer.tenant_id,
|
||||
timeline_id: layer.timeline_id,
|
||||
key_range: layer.key_range.clone(),
|
||||
tenant_id: layer.desc.tenant_id,
|
||||
timeline_id: layer.desc.timeline_id,
|
||||
key_range: layer.desc.key_range.clone(),
|
||||
lsn: layer.lsn,
|
||||
|
||||
index_start_blk: 0,
|
||||
@@ -104,12 +104,9 @@ impl From<&ImageLayer> for Summary {
|
||||
/// and it needs to be loaded before using it in queries.
|
||||
pub struct ImageLayer {
|
||||
path_or_conf: PathOrConf,
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub key_range: Range<Key>,
|
||||
pub file_size: u64,
|
||||
|
||||
// This entry contains an image of all pages as of this LSN
|
||||
pub desc: PersistentLayerDesc,
|
||||
// This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
|
||||
pub lsn: Lsn,
|
||||
|
||||
access_stats: LayerAccessStats,
|
||||
@@ -122,8 +119,8 @@ impl std::fmt::Debug for ImageLayer {
|
||||
use super::RangeDisplayDebug;
|
||||
|
||||
f.debug_struct("ImageLayer")
|
||||
.field("key_range", &RangeDisplayDebug(&self.key_range))
|
||||
.field("file_size", &self.file_size)
|
||||
.field("key_range", &RangeDisplayDebug(&self.desc.key_range))
|
||||
.field("file_size", &self.desc.file_size)
|
||||
.field("lsn", &self.lsn)
|
||||
.field("inner", &self.inner)
|
||||
.finish()
|
||||
@@ -153,27 +150,15 @@ impl std::fmt::Debug for ImageLayerInner {
|
||||
}
|
||||
|
||||
impl Layer for ImageLayer {
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
self.key_range.clone()
|
||||
}
|
||||
|
||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
// End-bound is exclusive
|
||||
self.lsn..(self.lsn + 1)
|
||||
}
|
||||
fn is_incremental(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
fn short_id(&self) -> String {
|
||||
self.filename().file_name()
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
println!(
|
||||
"----- image layer for ten {} tli {} key {}-{} at {} ----",
|
||||
self.tenant_id, self.timeline_id, self.key_range.start, self.key_range.end, self.lsn
|
||||
self.desc.tenant_id,
|
||||
self.desc.timeline_id,
|
||||
self.desc.key_range.start,
|
||||
self.desc.key_range.end,
|
||||
self.lsn
|
||||
);
|
||||
|
||||
if !verbose {
|
||||
@@ -203,7 +188,7 @@ impl Layer for ImageLayer {
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
assert!(self.key_range.contains(&key));
|
||||
assert!(self.desc.key_range.contains(&key));
|
||||
assert!(lsn_range.start >= self.lsn);
|
||||
assert!(lsn_range.end >= self.lsn);
|
||||
|
||||
@@ -230,24 +215,37 @@ impl Layer for ImageLayer {
|
||||
Ok(ValueReconstructResult::Missing)
|
||||
}
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
self.layer_desc().key_range.clone()
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
self.layer_desc().lsn_range.clone()
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn is_incremental(&self) -> bool {
|
||||
self.layer_desc().is_incremental
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn short_id(&self) -> String {
|
||||
self.layer_desc().short_id()
|
||||
}
|
||||
}
|
||||
|
||||
impl PersistentLayer for ImageLayer {
|
||||
fn filename(&self) -> LayerFileName {
|
||||
self.layer_name().into()
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc {
|
||||
&self.desc
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
Some(self.path())
|
||||
}
|
||||
|
||||
fn get_tenant_id(&self) -> TenantId {
|
||||
self.tenant_id
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> TimelineId {
|
||||
self.timeline_id
|
||||
}
|
||||
fn iter(&self, _ctx: &RequestContext) -> Result<LayerIter<'_>> {
|
||||
unimplemented!();
|
||||
}
|
||||
@@ -258,17 +256,13 @@ impl PersistentLayer for ImageLayer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn file_size(&self) -> u64 {
|
||||
self.file_size
|
||||
}
|
||||
|
||||
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
let layer_file_name = self.filename().file_name();
|
||||
let lsn_range = self.get_lsn_range();
|
||||
|
||||
HistoricLayerInfo::Image {
|
||||
layer_file_name,
|
||||
layer_file_size: self.file_size,
|
||||
layer_file_size: self.desc.file_size,
|
||||
lsn_start: lsn_range.start,
|
||||
remote: false,
|
||||
access_stats: self.access_stats.as_api_model(reset),
|
||||
@@ -405,11 +399,15 @@ impl ImageLayer {
|
||||
) -> ImageLayer {
|
||||
ImageLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
key_range: filename.key_range.clone(),
|
||||
desc: PersistentLayerDesc::new_img(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
filename.key_range.clone(),
|
||||
filename.lsn,
|
||||
false,
|
||||
file_size,
|
||||
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
|
||||
lsn: filename.lsn,
|
||||
file_size,
|
||||
access_stats,
|
||||
inner: RwLock::new(ImageLayerInner {
|
||||
loaded: false,
|
||||
@@ -422,7 +420,7 @@ impl ImageLayer {
|
||||
|
||||
/// Create an ImageLayer struct representing an existing file on disk.
|
||||
///
|
||||
/// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary.
|
||||
/// This variant is only used for debugging purposes, by the 'pagectl' binary.
|
||||
pub fn new_for_path(path: &Path, file: File) -> Result<ImageLayer> {
|
||||
let mut summary_buf = Vec::new();
|
||||
summary_buf.resize(PAGE_SZ, 0);
|
||||
@@ -433,11 +431,15 @@ impl ImageLayer {
|
||||
.context("get file metadata to determine size")?;
|
||||
Ok(ImageLayer {
|
||||
path_or_conf: PathOrConf::Path(path.to_path_buf()),
|
||||
timeline_id: summary.timeline_id,
|
||||
tenant_id: summary.tenant_id,
|
||||
key_range: summary.key_range,
|
||||
desc: PersistentLayerDesc::new_img(
|
||||
summary.tenant_id,
|
||||
summary.timeline_id,
|
||||
summary.key_range,
|
||||
summary.lsn,
|
||||
false,
|
||||
metadata.len(),
|
||||
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
|
||||
lsn: summary.lsn,
|
||||
file_size: metadata.len(),
|
||||
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
|
||||
inner: RwLock::new(ImageLayerInner {
|
||||
file: None,
|
||||
@@ -449,18 +451,15 @@ impl ImageLayer {
|
||||
}
|
||||
|
||||
fn layer_name(&self) -> ImageFileName {
|
||||
ImageFileName {
|
||||
key_range: self.key_range.clone(),
|
||||
lsn: self.lsn,
|
||||
}
|
||||
self.desc.image_file_name()
|
||||
}
|
||||
|
||||
/// Path to the layer file in pageserver workdir.
|
||||
pub fn path(&self) -> PathBuf {
|
||||
Self::path_for(
|
||||
&self.path_or_conf,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
self.desc.timeline_id,
|
||||
self.desc.tenant_id,
|
||||
&self.layer_name(),
|
||||
)
|
||||
}
|
||||
@@ -484,6 +483,7 @@ struct ImageLayerWriterInner {
|
||||
tenant_id: TenantId,
|
||||
key_range: Range<Key>,
|
||||
lsn: Lsn,
|
||||
is_incremental: bool,
|
||||
|
||||
blob_writer: WriteBlobWriter<VirtualFile>,
|
||||
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
|
||||
@@ -499,6 +499,7 @@ impl ImageLayerWriterInner {
|
||||
tenant_id: TenantId,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
is_incremental: bool,
|
||||
) -> anyhow::Result<Self> {
|
||||
// Create the file initially with a temporary filename.
|
||||
// We'll atomically rename it to the final name when we're done.
|
||||
@@ -533,6 +534,7 @@ impl ImageLayerWriterInner {
|
||||
lsn,
|
||||
tree: tree_builder,
|
||||
blob_writer,
|
||||
is_incremental,
|
||||
};
|
||||
|
||||
Ok(writer)
|
||||
@@ -588,16 +590,22 @@ impl ImageLayerWriterInner {
|
||||
.metadata()
|
||||
.context("get metadata to determine file size")?;
|
||||
|
||||
let desc = PersistentLayerDesc::new_img(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
self.key_range.clone(),
|
||||
self.lsn,
|
||||
self.is_incremental, // for now, image layer ALWAYS covers the full range
|
||||
metadata.len(),
|
||||
);
|
||||
|
||||
// Note: Because we open the file in write-only mode, we cannot
|
||||
// reuse the same VirtualFile for reading later. That's why we don't
|
||||
// set inner.file here. The first read will have to re-open it.
|
||||
let layer = ImageLayer {
|
||||
path_or_conf: PathOrConf::Conf(self.conf),
|
||||
timeline_id: self.timeline_id,
|
||||
tenant_id: self.tenant_id,
|
||||
key_range: self.key_range.clone(),
|
||||
desc,
|
||||
lsn: self.lsn,
|
||||
file_size: metadata.len(),
|
||||
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
|
||||
inner: RwLock::new(ImageLayerInner {
|
||||
loaded: false,
|
||||
@@ -667,6 +675,7 @@ impl ImageLayerWriter {
|
||||
tenant_id: TenantId,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
is_incremental: bool,
|
||||
) -> anyhow::Result<ImageLayerWriter> {
|
||||
Ok(Self {
|
||||
inner: Some(ImageLayerWriterInner::new(
|
||||
@@ -675,6 +684,7 @@ impl ImageLayerWriter {
|
||||
tenant_id,
|
||||
key_range,
|
||||
lsn,
|
||||
is_incremental,
|
||||
)?),
|
||||
})
|
||||
}
|
||||
|
||||
191
pageserver/src/tenant/storage_layer/layer_desc.rs
Normal file
@@ -0,0 +1,191 @@
|
||||
use anyhow::Result;
|
||||
use std::ops::Range;
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use crate::{context::RequestContext, repository::Key};
|
||||
|
||||
use super::{DeltaFileName, ImageFileName, LayerFileName};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the
|
||||
/// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides
|
||||
/// a unified way to generate layer information like file name.
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
|
||||
pub struct PersistentLayerDesc {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub key_range: Range<Key>,
|
||||
/// For image layer, this is `[lsn, lsn+1)`.
|
||||
pub lsn_range: Range<Lsn>,
|
||||
/// Whether this is a delta layer.
|
||||
pub is_delta: bool,
|
||||
/// Whether this layer only contains page images for part of the keys in the range. In the current implementation, this should
/// always be equal to `is_delta`. If we land the partial image layer PR someday, an image layer could also be
/// incremental.
|
||||
pub is_incremental: bool,
|
||||
/// File size
|
||||
pub file_size: u64,
|
||||
}
|
||||
|
||||
/// A unique identifier of a persistent layer within the context of one timeline.
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
|
||||
pub struct PersistentLayerKey {
|
||||
pub key_range: Range<Key>,
|
||||
pub lsn_range: Range<Lsn>,
|
||||
pub is_delta: bool,
|
||||
}
|
||||
|
||||
impl PersistentLayerDesc {
|
||||
pub fn key(&self) -> PersistentLayerKey {
|
||||
PersistentLayerKey {
|
||||
key_range: self.key_range.clone(),
|
||||
lsn_range: self.lsn_range.clone(),
|
||||
is_delta: self.is_delta,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn short_id(&self) -> String {
|
||||
self.filename().file_name()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn new_test(key_range: Range<Key>) -> Self {
|
||||
Self {
|
||||
tenant_id: TenantId::generate(),
|
||||
timeline_id: TimelineId::generate(),
|
||||
key_range,
|
||||
lsn_range: Lsn(0)..Lsn(1),
|
||||
is_delta: false,
|
||||
is_incremental: false,
|
||||
file_size: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_img(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
key_range: Range<Key>,
|
||||
lsn: Lsn,
|
||||
is_incremental: bool,
|
||||
file_size: u64,
|
||||
) -> Self {
|
||||
Self {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
key_range,
|
||||
lsn_range: Self::image_layer_lsn_range(lsn),
|
||||
is_delta: false,
|
||||
is_incremental,
|
||||
file_size,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_delta(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
key_range: Range<Key>,
|
||||
lsn_range: Range<Lsn>,
|
||||
file_size: u64,
|
||||
) -> Self {
|
||||
Self {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
key_range,
|
||||
lsn_range,
|
||||
is_delta: true,
|
||||
is_incremental: true,
|
||||
file_size,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the LSN that the image layer covers.
|
||||
pub fn image_layer_lsn(&self) -> Lsn {
|
||||
assert!(!self.is_delta);
|
||||
assert!(self.lsn_range.start + 1 == self.lsn_range.end);
|
||||
self.lsn_range.start
|
||||
}
|
||||
|
||||
/// Get the LSN range corresponding to a single image layer LSN.
|
||||
pub fn image_layer_lsn_range(lsn: Lsn) -> Range<Lsn> {
|
||||
lsn..(lsn + 1)
|
||||
}
|
||||
|
||||
/// Get a delta file name for this layer.
|
||||
///
|
||||
/// Panic: if this is not a delta layer.
|
||||
pub fn delta_file_name(&self) -> DeltaFileName {
|
||||
assert!(self.is_delta);
|
||||
DeltaFileName {
|
||||
key_range: self.key_range.clone(),
|
||||
lsn_range: self.lsn_range.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get an image file name for this layer.
///
/// Panics if this is not an image layer, or if the LSN range is invalid.
|
||||
pub fn image_file_name(&self) -> ImageFileName {
|
||||
assert!(!self.is_delta);
|
||||
assert!(self.lsn_range.start + 1 == self.lsn_range.end);
|
||||
ImageFileName {
|
||||
key_range: self.key_range.clone(),
|
||||
lsn: self.lsn_range.start,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn filename(&self) -> LayerFileName {
|
||||
if self.is_delta {
|
||||
self.delta_file_name().into()
|
||||
} else {
|
||||
self.image_file_name().into()
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: remove this in the future once we refactor timeline APIs.
|
||||
|
||||
pub fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
self.lsn_range.clone()
|
||||
}
|
||||
|
||||
pub fn get_key_range(&self) -> Range<Key> {
|
||||
self.key_range.clone()
|
||||
}
|
||||
|
||||
pub fn get_timeline_id(&self) -> TimelineId {
|
||||
self.timeline_id
|
||||
}
|
||||
|
||||
pub fn get_tenant_id(&self) -> TenantId {
|
||||
self.tenant_id
|
||||
}
|
||||
|
||||
pub fn is_incremental(&self) -> bool {
|
||||
self.is_incremental
|
||||
}
|
||||
|
||||
pub fn is_delta(&self) -> bool {
|
||||
self.is_delta
|
||||
}
|
||||
|
||||
pub fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
println!(
|
||||
"----- layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
self.key_range.start,
|
||||
self.key_range.end,
|
||||
self.lsn_range.start,
|
||||
self.lsn_range.end
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn file_size(&self) -> u64 {
|
||||
self.file_size
|
||||
}
|
||||
}
|
||||
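Since layer_desc.rs is new in this commit, a quick construction sketch may help (illustrative only; the IDs, key range and sizes are placeholders):

// Hypothetical test-style snippet inside the pageserver crate.
fn example_descs(tenant_id: TenantId, timeline_id: TimelineId, keys: Range<Key>) {
    // Image layers get the single-LSN range [lsn, lsn+1).
    let img = PersistentLayerDesc::new_img(tenant_id, timeline_id, keys.clone(), Lsn(0x10), false, 4096);
    assert_eq!(img.get_lsn_range(), Lsn(0x10)..Lsn(0x11));

    // Delta layers carry an explicit LSN range and are always incremental.
    let delta = PersistentLayerDesc::new_delta(tenant_id, timeline_id, keys, Lsn(0x10)..Lsn(0x20), 8192);
    assert!(delta.is_delta() && delta.is_incremental());

    // Both kinds derive their on-disk file name from the descriptor alone.
    println!("{} / {}", img.short_id(), delta.short_id());
}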
@@ -18,11 +18,10 @@ use utils::{
lsn::Lsn,
};

use super::filename::{DeltaFileName, ImageFileName, LayerFileName};
use super::image_layer::ImageLayer;
use super::filename::{DeltaFileName, ImageFileName};
use super::{
DeltaLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter,
LayerResidenceStatus, PersistentLayer,
DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter,
LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
};

/// RemoteLayer is a not yet downloaded [`ImageLayer`] or
@@ -34,19 +33,10 @@ use super::{
///
/// See: [`crate::context::RequestContext`] for authorization to download
pub struct RemoteLayer {
tenantid: TenantId,
timelineid: TimelineId,
key_range: Range<Key>,
lsn_range: Range<Lsn>,

pub file_name: LayerFileName,
pub desc: PersistentLayerDesc,

pub layer_metadata: LayerFileMetadata,

is_delta: bool,

is_incremental: bool,

access_stats: LayerAccessStats,

pub(crate) ongoing_download: Arc<tokio::sync::Semaphore>,
@@ -66,22 +56,14 @@ pub struct RemoteLayer {
impl std::fmt::Debug for RemoteLayer {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("RemoteLayer")
.field("file_name", &self.file_name)
.field("file_name", &self.desc.filename())
.field("layer_metadata", &self.layer_metadata)
.field("is_incremental", &self.is_incremental)
.field("is_incremental", &self.desc.is_incremental)
.finish()
}
}

impl Layer for RemoteLayer {
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
self.key_range.clone()
|
||||
}
|
||||
|
||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
self.lsn_range.clone()
|
||||
}
|
||||
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
_key: Key,
|
||||
@@ -95,53 +77,45 @@ impl Layer for RemoteLayer {
|
||||
);
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
self.is_incremental
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
println!(
|
||||
"----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
|
||||
self.tenantid,
|
||||
self.timelineid,
|
||||
self.key_range.start,
|
||||
self.key_range.end,
|
||||
self.lsn_range.start,
|
||||
self.lsn_range.end
|
||||
self.desc.tenant_id,
|
||||
self.desc.timeline_id,
|
||||
self.desc.key_range.start,
|
||||
self.desc.key_range.end,
|
||||
self.desc.lsn_range.start,
|
||||
self.desc.lsn_range.end
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
self.layer_desc().key_range.clone()
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
self.layer_desc().lsn_range.clone()
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn is_incremental(&self) -> bool {
|
||||
self.layer_desc().is_incremental
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn short_id(&self) -> String {
|
||||
self.filename().file_name()
|
||||
self.layer_desc().short_id()
|
||||
}
|
||||
}
|
||||
|
||||
impl PersistentLayer for RemoteLayer {
|
||||
fn get_tenant_id(&self) -> TenantId {
|
||||
self.tenantid
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> TimelineId {
|
||||
self.timelineid
|
||||
}
|
||||
|
||||
fn filename(&self) -> LayerFileName {
|
||||
if self.is_delta {
|
||||
DeltaFileName {
|
||||
key_range: self.key_range.clone(),
|
||||
lsn_range: self.lsn_range.clone(),
|
||||
}
|
||||
.into()
|
||||
} else {
|
||||
ImageFileName {
|
||||
key_range: self.key_range.clone(),
|
||||
lsn: self.lsn_range.start,
|
||||
}
|
||||
.into()
|
||||
}
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc {
|
||||
&self.desc
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
@@ -168,15 +142,11 @@ impl PersistentLayer for RemoteLayer {
|
||||
true
|
||||
}
|
||||
|
||||
fn file_size(&self) -> u64 {
|
||||
self.layer_metadata.file_size()
|
||||
}
|
||||
|
||||
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
let layer_file_name = self.filename().file_name();
|
||||
let lsn_range = self.get_lsn_range();
|
||||
|
||||
if self.is_delta {
|
||||
if self.desc.is_delta {
|
||||
HistoricLayerInfo::Delta {
|
||||
layer_file_name,
|
||||
layer_file_size: self.layer_metadata.file_size(),
|
||||
@@ -210,13 +180,14 @@ impl RemoteLayer {
|
||||
access_stats: LayerAccessStats,
|
||||
) -> RemoteLayer {
|
||||
RemoteLayer {
|
||||
tenantid,
|
||||
timelineid,
|
||||
key_range: fname.key_range.clone(),
|
||||
lsn_range: fname.lsn_as_range(),
|
||||
is_delta: false,
|
||||
is_incremental: false,
|
||||
file_name: fname.to_owned().into(),
|
||||
desc: PersistentLayerDesc::new_img(
|
||||
tenantid,
|
||||
timelineid,
|
||||
fname.key_range.clone(),
|
||||
fname.lsn,
|
||||
false,
|
||||
layer_metadata.file_size(),
|
||||
),
|
||||
layer_metadata: layer_metadata.clone(),
|
||||
ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
|
||||
download_replacement_failure: std::sync::atomic::AtomicBool::default(),
|
||||
@@ -232,13 +203,13 @@ impl RemoteLayer {
|
||||
access_stats: LayerAccessStats,
|
||||
) -> RemoteLayer {
|
||||
RemoteLayer {
|
||||
tenantid,
|
||||
timelineid,
|
||||
key_range: fname.key_range.clone(),
|
||||
lsn_range: fname.lsn_range.clone(),
|
||||
is_delta: true,
|
||||
is_incremental: true,
|
||||
file_name: fname.to_owned().into(),
|
||||
desc: PersistentLayerDesc::new_delta(
|
||||
tenantid,
|
||||
timelineid,
|
||||
fname.key_range.clone(),
|
||||
fname.lsn_range.clone(),
|
||||
layer_metadata.file_size(),
|
||||
),
|
||||
layer_metadata: layer_metadata.clone(),
|
||||
ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
|
||||
download_replacement_failure: std::sync::atomic::AtomicBool::default(),
|
||||
@@ -256,15 +227,12 @@ impl RemoteLayer {
|
||||
where
|
||||
L: ?Sized + Layer,
|
||||
{
|
||||
if self.is_delta {
|
||||
let fname = DeltaFileName {
|
||||
key_range: self.key_range.clone(),
|
||||
lsn_range: self.lsn_range.clone(),
|
||||
};
|
||||
if self.desc.is_delta {
|
||||
let fname = self.desc.delta_file_name();
|
||||
Arc::new(DeltaLayer::new(
|
||||
conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
self.desc.timeline_id,
|
||||
self.desc.tenant_id,
|
||||
&fname,
|
||||
file_size,
|
||||
self.access_stats.clone_for_residence_change(
|
||||
@@ -273,14 +241,11 @@ impl RemoteLayer {
|
||||
),
|
||||
))
|
||||
} else {
|
||||
let fname = ImageFileName {
|
||||
key_range: self.key_range.clone(),
|
||||
lsn: self.lsn_range.start,
|
||||
};
|
||||
let fname = self.desc.image_file_name();
|
||||
Arc::new(ImageLayer::new(
|
||||
conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
self.desc.timeline_id,
|
||||
self.desc.tenant_id,
|
||||
&fname,
|
||||
file_size,
|
||||
self.access_stats.clone_for_residence_change(
|
||||
|
||||
@@ -9,13 +9,17 @@ use crate::context::{DownloadBehavior, RequestContext};
use crate::metrics::TENANT_TASK_EVENTS;
use crate::task_mgr;
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::mgr;
use crate::tenant::{Tenant, TenantState};
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::id::TenantId;
use utils::completion;

pub fn start_background_loops(tenant_id: TenantId) {
/// Start per tenant background loops: compaction and gc.
pub fn start_background_loops(
tenant: &Arc<Tenant>,
background_jobs_can_start: Option<&completion::Barrier>,
) {
let tenant_id = tenant.tenant_id;
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::Compaction,
|
||||
@@ -23,11 +27,20 @@ pub fn start_background_loops(tenant_id: TenantId) {
|
||||
None,
|
||||
&format!("compactor for tenant {tenant_id}"),
|
||||
false,
|
||||
async move {
|
||||
compaction_loop(tenant_id)
|
||||
.instrument(info_span!("compaction_loop", tenant_id = %tenant_id))
|
||||
.await;
|
||||
Ok(())
|
||||
{
|
||||
let tenant = Arc::clone(tenant);
|
||||
let background_jobs_can_start = background_jobs_can_start.cloned();
|
||||
async move {
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return Ok(()) },
|
||||
_ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
|
||||
};
|
||||
compaction_loop(tenant, cancel)
|
||||
.instrument(info_span!("compaction_loop", tenant_id = %tenant_id))
|
||||
.await;
|
||||
Ok(())
|
||||
}
|
||||
},
|
||||
);
|
||||
task_mgr::spawn(
|
||||
@@ -37,11 +50,20 @@ pub fn start_background_loops(tenant_id: TenantId) {
|
||||
None,
|
||||
&format!("garbage collector for tenant {tenant_id}"),
|
||||
false,
|
||||
async move {
|
||||
gc_loop(tenant_id)
|
||||
.instrument(info_span!("gc_loop", tenant_id = %tenant_id))
|
||||
.await;
|
||||
Ok(())
|
||||
{
|
||||
let tenant = Arc::clone(tenant);
|
||||
let background_jobs_can_start = background_jobs_can_start.cloned();
|
||||
async move {
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return Ok(()) },
|
||||
_ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
|
||||
};
|
||||
gc_loop(tenant, cancel)
|
||||
.instrument(info_span!("gc_loop", tenant_id = %tenant_id))
|
||||
.await;
|
||||
Ok(())
|
||||
}
|
||||
},
|
||||
);
|
||||
}
|
||||
@@ -49,27 +71,26 @@ pub fn start_background_loops(tenant_id: TenantId) {
|
||||
///
|
||||
/// Compaction task's main loop
|
||||
///
|
||||
async fn compaction_loop(tenant_id: TenantId) {
|
||||
async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
let wait_duration = Duration::from_secs(2);
|
||||
info!("starting");
|
||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||
async {
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
|
||||
let mut first = true;
|
||||
loop {
|
||||
trace!("waking up");
|
||||
|
||||
let tenant = tokio::select! {
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => {
|
||||
info!("received cancellation request");
|
||||
return;
|
||||
},
|
||||
tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result {
|
||||
tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
|
||||
ControlFlow::Break(()) => return,
|
||||
ControlFlow::Continue(tenant) => tenant,
|
||||
ControlFlow::Continue(()) => (),
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
let period = tenant.get_compaction_period();
|
||||
|
||||
@@ -119,29 +140,29 @@ async fn compaction_loop(tenant_id: TenantId) {
|
||||
///
|
||||
/// GC task's main loop
|
||||
///
|
||||
async fn gc_loop(tenant_id: TenantId) {
|
||||
async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
let wait_duration = Duration::from_secs(2);
|
||||
info!("starting");
|
||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||
async {
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
// GC might require downloading, to find the cutoff LSN that corresponds to the
|
||||
// cutoff specified as time.
|
||||
let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
|
||||
let ctx =
|
||||
RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
|
||||
let mut first = true;
|
||||
loop {
|
||||
trace!("waking up");
|
||||
|
||||
let tenant = tokio::select! {
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => {
|
||||
info!("received cancellation request");
|
||||
return;
|
||||
},
|
||||
tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result {
|
||||
tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
|
||||
ControlFlow::Break(()) => return,
|
||||
ControlFlow::Continue(tenant) => tenant,
|
||||
ControlFlow::Continue(()) => (),
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
let period = tenant.get_gc_period();
|
||||
|
||||
@@ -161,7 +182,9 @@ async fn gc_loop(tenant_id: TenantId) {
|
||||
Duration::from_secs(10)
|
||||
} else {
|
||||
// Run gc
|
||||
let res = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await;
|
||||
let res = tenant
|
||||
.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx)
|
||||
.await;
|
||||
if let Err(e) = res {
|
||||
error!("Gc failed, retrying in {:?}: {e:?}", wait_duration);
|
||||
wait_duration
|
||||
@@ -187,23 +210,10 @@ async fn gc_loop(tenant_id: TenantId) {
|
||||
trace!("GC loop stopped.");
|
||||
}
|
||||
|
||||
async fn wait_for_active_tenant(
|
||||
tenant_id: TenantId,
|
||||
wait: Duration,
|
||||
) -> ControlFlow<(), Arc<Tenant>> {
|
||||
let tenant = loop {
|
||||
match mgr::get_tenant(tenant_id, false).await {
|
||||
Ok(tenant) => break tenant,
|
||||
Err(e) => {
|
||||
error!("Failed to get a tenant {tenant_id}: {e:#}");
|
||||
tokio::time::sleep(wait).await;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
async fn wait_for_active_tenant(tenant: &Arc<Tenant>) -> ControlFlow<()> {
|
||||
// if the tenant has a proper status already, no need to wait for anything
|
||||
if tenant.current_state() == TenantState::Active {
|
||||
ControlFlow::Continue(tenant)
|
||||
ControlFlow::Continue(())
|
||||
} else {
|
||||
let mut tenant_state_updates = tenant.subscribe_for_state_updates();
|
||||
loop {
|
||||
@@ -213,7 +223,7 @@ async fn wait_for_active_tenant(
|
||||
match new_state {
|
||||
TenantState::Active => {
|
||||
debug!("Tenant state changed to active, continuing the task loop");
|
||||
return ControlFlow::Continue(tenant);
|
||||
return ControlFlow::Continue(());
|
||||
}
|
||||
state => {
|
||||
debug!("Not running the task loop, tenant is not active: {state:?}");
|
||||
|
||||
@@ -22,8 +22,7 @@ use tracing::*;
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
use std::cmp::{max, min, Ordering};
|
||||
use std::collections::BinaryHeap;
|
||||
use std::collections::HashMap;
|
||||
use std::collections::{BinaryHeap, HashMap};
|
||||
use std::fs;
|
||||
use std::ops::{Deref, Range};
|
||||
use std::path::{Path, PathBuf};
|
||||
@@ -32,7 +31,6 @@ use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
|
||||
use std::sync::{Arc, Mutex, MutexGuard, RwLock, Weak};
|
||||
use std::time::{Duration, Instant, SystemTime};
|
||||
|
||||
use crate::broker_client::{get_broker_client, is_broker_client_initialized};
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata};
|
||||
use crate::tenant::storage_layer::{
|
||||
@@ -48,7 +46,7 @@ use crate::tenant::{
|
||||
};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::keyspace::{KeyPartitioning, KeySpace};
|
||||
use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceRandomAccum};
|
||||
use crate::metrics::{TimelineMetrics, UNEXPECTED_ONDEMAND_DOWNLOADS};
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
|
||||
@@ -59,6 +57,7 @@ use pageserver_api::reltag::RelTag;
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
use postgres_ffi::to_pg_timestamp;
|
||||
use utils::{
|
||||
completion,
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::{AtomicLsn, Lsn, RecordLsn},
|
||||
seqwait::SeqWait,
|
||||
@@ -123,6 +122,17 @@ pub struct Timeline {
|
||||
|
||||
pub(super) layers: RwLock<LayerMap<dyn PersistentLayer>>,
|
||||
|
||||
/// Set of key ranges which should be covered by image layers to
/// allow GC to remove old layers. This set is created by GC and its cutoff LSN is also stored.
/// It is used by the compaction task when it checks whether a new image layer should be created.
/// A newly created image layer doesn't help to remove the delta layer until the
/// newly created image layer falls off the PITR horizon. So on the next GC cycle,
/// gc_timeline may still want the new image layer to be created. To avoid redundant
/// image layer creation we should check whether an image layer already exists but is beyond the PITR horizon.
/// This is why we need to remember the GC cutoff LSN.
///
wanted_image_layers: Mutex<Option<(Lsn, KeySpace)>>,
|
||||
|
||||
last_freeze_at: AtomicLsn,
|
||||
// Atomic would be more appropriate here.
|
||||
last_freeze_ts: RwLock<Instant>,
|
||||
@@ -186,8 +196,9 @@ pub struct Timeline {
|
||||
/// Layer removal lock.
|
||||
/// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
|
||||
/// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`],
|
||||
/// and [`Tenant::delete_timeline`].
|
||||
pub(super) layer_removal_cs: tokio::sync::Mutex<()>,
|
||||
/// and [`Tenant::delete_timeline`]. This is an `Arc<Mutex>` lock because we need an owned
|
||||
/// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`).
|
||||
pub(super) layer_removal_cs: Arc<tokio::sync::Mutex<()>>,
|
||||
|
||||
// Needed to ensure that we can't create a branch at a point that was already garbage collected
|
||||
pub latest_gc_cutoff_lsn: Rcu<Lsn>,
|
||||
@@ -217,7 +228,7 @@ pub struct Timeline {
|
||||
/// or None if WAL receiver has not received anything for this timeline
|
||||
/// yet.
|
||||
pub last_received_wal: Mutex<Option<WalReceiverInfo>>,
|
||||
pub walreceiver: WalReceiver,
|
||||
pub walreceiver: Mutex<Option<WalReceiver>>,
|
||||
|
||||
/// Relation size cache
|
||||
pub rel_size_cache: RwLock<HashMap<RelTag, (Lsn, BlockNumber)>>,
|
||||
@@ -226,7 +237,18 @@ pub struct Timeline {
|
||||
|
||||
state: watch::Sender<TimelineState>,
|
||||
|
||||
/// Prevent two tasks from deleting the timeline at the same time. If held, the
|
||||
/// timeline is being deleted. If 'true', the timeline has already been deleted.
|
||||
pub delete_lock: Arc<tokio::sync::Mutex<bool>>,
|
||||
|
||||
eviction_task_timeline_state: tokio::sync::Mutex<EvictionTaskTimelineState>,
|
||||
|
||||
/// Barrier to wait before doing initial logical size calculation. Used only during startup.
|
||||
initial_logical_size_can_start: Option<completion::Barrier>,
|
||||
|
||||
/// Completion shared between all timelines loaded during startup; used to delay heavier
|
||||
/// background tasks until some logical sizes have been calculated.
|
||||
initial_logical_size_attempt: Mutex<Option<completion::Completion>>,
|
||||
}
|
||||
|
||||
/// Internal structure to hold all data needed for logical size calculation.
|
||||
@@ -511,7 +533,12 @@ impl Timeline {
|
||||
Some((cached_lsn, cached_img)) => {
|
||||
match cached_lsn.cmp(&lsn) {
|
||||
Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check
|
||||
Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image
|
||||
Ordering::Equal => {
|
||||
self.metrics
|
||||
.materialized_page_cache_hit_upon_request_counter
|
||||
.inc();
|
||||
return Ok(cached_img); // exact LSN match, return the image
|
||||
}
|
||||
Ordering::Greater => {
|
||||
unreachable!("the returned lsn should never be after the requested lsn")
|
||||
}
|
||||
@@ -526,8 +553,10 @@ impl Timeline {
|
||||
img: cached_page_img,
|
||||
};
|
||||
|
||||
let timer = self.metrics.get_reconstruct_data_time_histo.start_timer();
|
||||
self.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
|
||||
.await?;
|
||||
timer.stop_and_record();
|
||||
|
||||
self.metrics
|
||||
.reconstruct_time_histo
|
||||
@@ -612,17 +641,27 @@ impl Timeline {
|
||||
.await
|
||||
{
|
||||
Ok(()) => Ok(()),
|
||||
seqwait_error => {
|
||||
Err(e) => {
|
||||
// don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo
|
||||
drop(_timer);
|
||||
let walreceiver_status = self.walreceiver.status().await;
|
||||
seqwait_error.with_context(|| format!(
|
||||
"Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, {}",
|
||||
lsn,
|
||||
self.get_last_record_lsn(),
|
||||
self.get_disk_consistent_lsn(),
|
||||
walreceiver_status.map(|status| status.to_human_readable_string())
|
||||
.unwrap_or_else(|| "WalReceiver status: Not active".to_string()),
|
||||
))
|
||||
let walreceiver_status = {
|
||||
match &*self.walreceiver.lock().unwrap() {
|
||||
None => "stopping or stopped".to_string(),
|
||||
Some(walreceiver) => match walreceiver.status() {
|
||||
Some(status) => status.to_human_readable_string(),
|
||||
None => "Not active".to_string(),
|
||||
},
|
||||
}
|
||||
};
|
||||
Err(anyhow::Error::new(e).context({
|
||||
format!(
|
||||
"Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}",
|
||||
lsn,
|
||||
self.get_last_record_lsn(),
|
||||
self.get_disk_consistent_lsn(),
|
||||
walreceiver_status,
|
||||
)
|
||||
}))
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -650,7 +689,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Outermost timeline compaction operation; downloads needed layers.
|
||||
pub async fn compact(&self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
pub async fn compact(self: &Arc<Self>, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
const ROUNDS: usize = 2;
|
||||
|
||||
let last_record_lsn = self.get_last_record_lsn();
|
||||
@@ -739,7 +778,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Compaction which might need to be retried after downloading remote layers.
|
||||
async fn compact_inner(&self, ctx: &RequestContext) -> Result<(), CompactionError> {
|
||||
async fn compact_inner(self: &Arc<Self>, ctx: &RequestContext) -> Result<(), CompactionError> {
|
||||
//
|
||||
// High level strategy for compaction / image creation:
|
||||
//
|
||||
@@ -774,10 +813,9 @@ impl Timeline {
|
||||
// Below are functions compact_level0() and create_image_layers()
|
||||
// but they are a bit ad hoc and don't quite work like it's explained
|
||||
// above. Rewrite it.
|
||||
let layer_removal_cs = self.layer_removal_cs.lock().await;
|
||||
let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await);
|
||||
// Is the timeline being deleted?
|
||||
let state = *self.state.borrow();
|
||||
if state == TimelineState::Stopping {
|
||||
if self.is_stopping() {
|
||||
return Err(anyhow::anyhow!("timeline is Stopping").into());
|
||||
}
|
||||
|
||||
@@ -808,7 +846,7 @@ impl Timeline {
|
||||
|
||||
// 3. Compact
|
||||
let timer = self.metrics.compact_time_histo.start_timer();
|
||||
self.compact_level0(&layer_removal_cs, target_file_size, ctx)
|
||||
self.compact_level0(layer_removal_cs.clone(), target_file_size, ctx)
|
||||
.await?;
|
||||
timer.stop_and_record();
|
||||
}
|
||||
@@ -897,18 +935,15 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn activate(self: &Arc<Self>, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
if is_broker_client_initialized() {
|
||||
self.launch_wal_receiver(ctx, get_broker_client().clone())?;
|
||||
} else if cfg!(test) {
|
||||
info!("not launching WAL receiver because broker client hasn't been initialized");
|
||||
} else {
|
||||
anyhow::bail!("broker client not initialized");
|
||||
}
|
||||
|
||||
pub fn activate(
|
||||
self: &Arc<Self>,
|
||||
broker_client: BrokerClientChannel,
|
||||
background_jobs_can_start: Option<&completion::Barrier>,
|
||||
ctx: &RequestContext,
|
||||
) {
|
||||
self.launch_wal_receiver(ctx, broker_client);
|
||||
self.set_state(TimelineState::Active);
|
||||
self.launch_eviction_task();
|
||||
Ok(())
|
||||
self.launch_eviction_task(background_jobs_can_start);
|
||||
}
|
||||
|
||||
pub fn set_state(&self, new_state: TimelineState) {
|
||||
@@ -919,26 +954,54 @@ impl Timeline {
|
||||
(st, TimelineState::Loading) => {
|
||||
error!("ignoring transition from {st:?} into Loading state");
|
||||
}
|
||||
(TimelineState::Broken, _) => {
|
||||
error!("Ignoring state update {new_state:?} for broken tenant");
|
||||
(TimelineState::Broken { .. }, new_state) => {
|
||||
error!("Ignoring state update {new_state:?} for broken timeline");
|
||||
}
|
||||
(TimelineState::Stopping, TimelineState::Active) => {
|
||||
error!("Not activating a Stopping timeline");
|
||||
}
|
||||
(_, new_state) => {
|
||||
if matches!(
|
||||
new_state,
|
||||
TimelineState::Stopping | TimelineState::Broken { .. }
|
||||
) {
|
||||
// drop the completion guard, if any; it might be holding off the completion
|
||||
// forever needlessly
|
||||
self.initial_logical_size_attempt
|
||||
.lock()
|
||||
.unwrap_or_else(|e| e.into_inner())
|
||||
.take();
|
||||
}
|
||||
self.state.send_replace(new_state);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set_broken(&self, reason: String) {
|
||||
let backtrace_str: String = format!("{}", std::backtrace::Backtrace::force_capture());
|
||||
let broken_state = TimelineState::Broken {
|
||||
reason,
|
||||
backtrace: backtrace_str,
|
||||
};
|
||||
self.set_state(broken_state)
|
||||
}
|
||||
|
||||
pub fn current_state(&self) -> TimelineState {
|
||||
*self.state.borrow()
|
||||
self.state.borrow().clone()
|
||||
}
|
||||
|
||||
pub fn is_broken(&self) -> bool {
|
||||
matches!(&*self.state.borrow(), TimelineState::Broken { .. })
|
||||
}
|
||||
|
||||
pub fn is_active(&self) -> bool {
|
||||
self.current_state() == TimelineState::Active
|
||||
}
|
||||
|
||||
pub fn is_stopping(&self) -> bool {
|
||||
self.current_state() == TimelineState::Stopping
|
||||
}
|
||||
|
||||
pub fn subscribe_for_state_updates(&self) -> watch::Receiver<TimelineState> {
|
||||
self.state.subscribe()
|
||||
}
|
||||
@@ -949,7 +1012,7 @@ impl Timeline {
|
||||
) -> Result<(), TimelineState> {
|
||||
let mut receiver = self.state.subscribe();
|
||||
loop {
|
||||
let current_state = *receiver.borrow_and_update();
|
||||
let current_state = receiver.borrow().clone();
|
||||
match current_state {
|
||||
TimelineState::Loading => {
|
||||
receiver
|
||||
@@ -1167,7 +1230,12 @@ impl Timeline {
|
||||
),
|
||||
});
|
||||
|
||||
let replaced = match batch_updates.replace_historic(local_layer, new_remote_layer)? {
|
||||
let replaced = match batch_updates.replace_historic(
|
||||
local_layer.layer_desc().clone(),
|
||||
local_layer,
|
||||
new_remote_layer.layer_desc().clone(),
|
||||
new_remote_layer,
|
||||
)? {
|
||||
Replacement::Replaced { .. } => {
|
||||
if let Err(e) = local_layer.delete_resident_layer_file() {
|
||||
error!("failed to remove layer file on evict after replacement: {e:#?}");
|
||||
@@ -1275,6 +1343,13 @@ impl Timeline {
|
||||
.unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold)
|
||||
}
|
||||
|
||||
fn get_gc_feedback(&self) -> bool {
|
||||
let tenant_conf = self.tenant_conf.read().unwrap();
|
||||
tenant_conf
|
||||
.gc_feedback
|
||||
.unwrap_or(self.conf.default_tenant_conf.gc_feedback)
|
||||
}
|
||||
|
||||
pub(super) fn tenant_conf_updated(&self) {
|
||||
// NB: Most tenant conf options are read by background loops, so,
|
||||
// changes will automatically be picked up.
|
||||
@@ -1309,6 +1384,8 @@ impl Timeline {
|
||||
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
|
||||
remote_client: Option<RemoteTimelineClient>,
|
||||
pg_version: u32,
|
||||
initial_logical_size_can_start: Option<completion::Barrier>,
|
||||
initial_logical_size_attempt: Option<completion::Completion>,
|
||||
) -> Arc<Self> {
|
||||
let disk_consistent_lsn = metadata.disk_consistent_lsn();
|
||||
let (state, _) = watch::channel(TimelineState::Loading);
|
||||
@@ -1317,15 +1394,7 @@ impl Timeline {
|
||||
let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
|
||||
|
||||
let tenant_conf_guard = tenant_conf.read().unwrap();
|
||||
let wal_connect_timeout = tenant_conf_guard
|
||||
.walreceiver_connect_timeout
|
||||
.unwrap_or(conf.default_tenant_conf.walreceiver_connect_timeout);
|
||||
let lagging_wal_timeout = tenant_conf_guard
|
||||
.lagging_wal_timeout
|
||||
.unwrap_or(conf.default_tenant_conf.lagging_wal_timeout);
|
||||
let max_lsn_wal_lag = tenant_conf_guard
|
||||
.max_lsn_wal_lag
|
||||
.unwrap_or(conf.default_tenant_conf.max_lsn_wal_lag);
|
||||
|
||||
let evictions_low_residence_duration_metric_threshold =
|
||||
Self::get_evictions_low_residence_duration_metric_threshold(
|
||||
&tenant_conf_guard,
|
||||
@@ -1334,18 +1403,6 @@ impl Timeline {
|
||||
drop(tenant_conf_guard);
|
||||
|
||||
Arc::new_cyclic(|myself| {
|
||||
let walreceiver = WalReceiver::new(
|
||||
TenantTimelineId::new(tenant_id, timeline_id),
|
||||
Weak::clone(myself),
|
||||
WalReceiverConf {
|
||||
wal_connect_timeout,
|
||||
lagging_wal_timeout,
|
||||
max_lsn_wal_lag,
|
||||
auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(),
|
||||
availability_zone: conf.availability_zone.clone(),
|
||||
},
|
||||
);
|
||||
|
||||
let mut result = Timeline {
|
||||
conf,
|
||||
tenant_conf,
|
||||
@@ -1354,9 +1411,10 @@ impl Timeline {
|
||||
tenant_id,
|
||||
pg_version,
|
||||
layers: RwLock::new(LayerMap::default()),
|
||||
wanted_image_layers: Mutex::new(None),
|
||||
|
||||
walredo_mgr,
|
||||
walreceiver,
|
||||
walreceiver: Mutex::new(None),
|
||||
|
||||
remote_client: remote_client.map(Arc::new),
|
||||
|
||||
@@ -1421,6 +1479,10 @@ impl Timeline {
|
||||
eviction_task_timeline_state: tokio::sync::Mutex::new(
|
||||
EvictionTaskTimelineState::default(),
|
||||
),
|
||||
delete_lock: Arc::new(tokio::sync::Mutex::new(false)),
|
||||
|
||||
initial_logical_size_can_start,
|
||||
initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt),
|
||||
};
|
||||
result.repartition_threshold = result.get_checkpoint_distance() / 10;
|
||||
result
|
||||
@@ -1476,17 +1538,49 @@ impl Timeline {
|
||||
*flush_loop_state = FlushLoopState::Running;
|
||||
}
|
||||
|
||||
pub(super) fn launch_wal_receiver(
|
||||
&self,
|
||||
/// Creates and starts the wal receiver.
|
||||
///
|
||||
/// This function is expected to be called at most once per Timeline's lifecycle
|
||||
/// when the timeline is activated.
|
||||
fn launch_wal_receiver(
|
||||
self: &Arc<Self>,
|
||||
ctx: &RequestContext,
|
||||
broker_client: BrokerClientChannel,
|
||||
) -> anyhow::Result<()> {
|
||||
) {
|
||||
info!(
|
||||
"launching WAL receiver for timeline {} of tenant {}",
|
||||
self.timeline_id, self.tenant_id
|
||||
);
|
||||
self.walreceiver.start(ctx, broker_client)?;
|
||||
Ok(())
|
||||
|
||||
let tenant_conf_guard = self.tenant_conf.read().unwrap();
|
||||
let wal_connect_timeout = tenant_conf_guard
|
||||
.walreceiver_connect_timeout
|
||||
.unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout);
|
||||
let lagging_wal_timeout = tenant_conf_guard
|
||||
.lagging_wal_timeout
|
||||
.unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout);
|
||||
let max_lsn_wal_lag = tenant_conf_guard
|
||||
.max_lsn_wal_lag
|
||||
.unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag);
|
||||
drop(tenant_conf_guard);
|
||||
|
||||
let mut guard = self.walreceiver.lock().unwrap();
|
||||
assert!(
|
||||
guard.is_none(),
|
||||
"multiple launches / re-launches of WAL receiver are not supported"
|
||||
);
|
||||
*guard = Some(WalReceiver::start(
|
||||
Arc::clone(self),
|
||||
WalReceiverConf {
|
||||
wal_connect_timeout,
|
||||
lagging_wal_timeout,
|
||||
max_lsn_wal_lag,
|
||||
auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(),
|
||||
availability_zone: self.conf.availability_zone.clone(),
|
||||
},
|
||||
broker_client,
|
||||
ctx,
|
||||
));
|
||||
}
|
||||
|
||||
///
|
||||
@@ -1537,7 +1631,7 @@ impl Timeline {
|
||||
|
||||
trace!("found layer {}", layer.path().display());
|
||||
total_physical_size += file_size;
|
||||
updates.insert_historic(Arc::new(layer));
|
||||
updates.insert_historic(layer.layer_desc().clone(), Arc::new(layer));
|
||||
num_layers += 1;
|
||||
} else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) {
|
||||
// Create a DeltaLayer struct for each delta file.
|
||||
@@ -1569,7 +1663,7 @@ impl Timeline {
|
||||
|
||||
trace!("found layer {}", layer.path().display());
|
||||
total_physical_size += file_size;
|
||||
updates.insert_historic(Arc::new(layer));
|
||||
updates.insert_historic(layer.layer_desc().clone(), Arc::new(layer));
|
||||
num_layers += 1;
|
||||
} else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
|
||||
// ignore these
|
||||
@@ -1668,7 +1762,7 @@ impl Timeline {
|
||||
anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}");
|
||||
} else {
|
||||
self.metrics.resident_physical_size_gauge.sub(local_size);
|
||||
updates.remove_historic(local_layer);
|
||||
updates.remove_historic(local_layer.layer_desc().clone(), local_layer);
|
||||
// fall-through to adding the remote layer
|
||||
}
|
||||
} else {
|
||||
@@ -1707,7 +1801,7 @@ impl Timeline {
|
||||
);
|
||||
let remote_layer = Arc::new(remote_layer);
|
||||
|
||||
updates.insert_historic(remote_layer);
|
||||
updates.insert_historic(remote_layer.layer_desc().clone(), remote_layer);
|
||||
}
|
||||
LayerFileName::Delta(deltafilename) => {
|
||||
// Create a RemoteLayer for the delta file.
|
||||
@@ -1734,7 +1828,7 @@ impl Timeline {
|
||||
),
|
||||
);
|
||||
let remote_layer = Arc::new(remote_layer);
|
||||
updates.insert_historic(remote_layer);
|
||||
updates.insert_historic(remote_layer.layer_desc().clone(), remote_layer);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1877,9 +1971,30 @@ impl Timeline {
|
||||
false,
|
||||
// NB: don't log errors here, task_mgr will do that.
|
||||
async move {
|
||||
// no cancellation here, because nothing really waits for this to complete compared
|
||||
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
|
||||
// in case we were created during pageserver initialization, wait for
|
||||
// initialization to complete before proceeding. startup time init runs on the same
|
||||
// runtime.
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return Ok(()); },
|
||||
_ = completion::Barrier::maybe_wait(self_clone.initial_logical_size_can_start.clone()) => {}
|
||||
};
|
||||
|
||||
// hold off background tasks from starting until all timelines get to try at least
|
||||
// once initial logical size calculation; though retry will rarely be useful.
|
||||
// holding off is done because heavier tasks execute blockingly on the same
|
||||
// runtime.
|
||||
//
|
||||
// dropping this at every outcome is probably better than trying to cling on to it,
|
||||
// delay will be terminated by a timeout regardless.
|
||||
let _completion = { self_clone.initial_logical_size_attempt.lock().expect("unexpected initial_logical_size_attempt poisoned").take() };
|
||||
|
||||
// no extra cancellation here, because nothing really waits for this to complete compared
|
||||
// to spawn_ondemand_logical_size_calculation.
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let calculated_size = match self_clone
|
||||
.logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx, cancel)
|
||||
.await
|
||||
@@ -2005,11 +2120,11 @@ impl Timeline {
|
||||
loop {
|
||||
match timeline_state_updates.changed().await {
|
||||
Ok(()) => {
|
||||
let new_state = *timeline_state_updates.borrow();
|
||||
let new_state = timeline_state_updates.borrow().clone();
|
||||
match new_state {
|
||||
// we're running this job for active timelines only
|
||||
TimelineState::Active => continue,
|
||||
TimelineState::Broken
|
||||
TimelineState::Broken { .. }
|
||||
| TimelineState::Stopping
|
||||
| TimelineState::Loading => {
|
||||
break format!("aborted because timeline became inactive (new state: {new_state:?})")
|
||||
@@ -2144,7 +2259,7 @@ impl Timeline {
|
||||
fn delete_historic_layer(
|
||||
&self,
|
||||
// we cannot remove layers otherwise, since gc and compaction will race
|
||||
_layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
|
||||
_layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
layer: Arc<dyn PersistentLayer>,
|
||||
updates: &mut BatchedUpdates<'_, dyn PersistentLayer>,
|
||||
) -> anyhow::Result<()> {
|
||||
@@ -2161,7 +2276,7 @@ impl Timeline {
|
||||
// won't be needed for page reconstruction for this timeline,
|
||||
// and mark what we can't delete yet as deleted from the layer
|
||||
// map index without actually rebuilding the index.
|
||||
updates.remove_historic(layer);
|
||||
updates.remove_historic(layer.layer_desc().clone(), layer);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -2223,6 +2338,9 @@ impl Timeline {
|
||||
let mut timeline_owned;
|
||||
let mut timeline = self;
|
||||
|
||||
let mut read_count =
|
||||
scopeguard::guard(0, |cnt| self.metrics.read_num_fs_layers.observe(cnt as f64));
|
||||
|
||||
// For debugging purposes, collect the path of layers that we traversed
|
||||
// through. It's included in the error message if we fail to find the key.
|
||||
let mut traversal_path = Vec::<TraversalPathItem>::new();
|
||||
@@ -2357,6 +2475,7 @@ impl Timeline {
|
||||
Err(e) => return Err(PageReconstructError::from(e)),
|
||||
};
|
||||
cont_lsn = lsn_floor;
|
||||
// metrics: open_layer does not count as fs access, so we are not updating `read_count`
|
||||
traversal_path.push((
|
||||
result,
|
||||
cont_lsn,
|
||||
@@ -2383,6 +2502,7 @@ impl Timeline {
|
||||
Err(e) => return Err(PageReconstructError::from(e)),
|
||||
};
|
||||
cont_lsn = lsn_floor;
|
||||
// metrics: open_layer does not count as fs access, so we are not updating `read_count`
|
||||
traversal_path.push((
|
||||
result,
|
||||
cont_lsn,
|
||||
@@ -2417,6 +2537,7 @@ impl Timeline {
|
||||
Err(e) => return Err(PageReconstructError::from(e)),
|
||||
};
|
||||
cont_lsn = lsn_floor;
|
||||
*read_count += 1;
|
||||
traversal_path.push((
|
||||
result,
|
||||
cont_lsn,
|
||||
@@ -2482,7 +2603,7 @@ impl Timeline {
|
||||
(DownloadBehavior::Error, false) => {
|
||||
return Err(PageReconstructError::NeedsDownload(
|
||||
TenantTimelineId::new(self.tenant_id, self.timeline_id),
|
||||
remote_layer.file_name.clone(),
|
||||
remote_layer.filename(),
|
||||
))
|
||||
}
|
||||
}
|
||||
@@ -2608,7 +2729,7 @@ impl Timeline {
|
||||
|
||||
/// Layer flusher task's main loop.
|
||||
async fn flush_loop(
|
||||
&self,
|
||||
self: &Arc<Self>,
|
||||
mut layer_flush_start_rx: tokio::sync::watch::Receiver<u64>,
|
||||
ctx: &RequestContext,
|
||||
) {
|
||||
@@ -2697,9 +2818,9 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Flush one frozen in-memory layer to disk, as a new delta layer.
|
||||
#[instrument(skip(self, frozen_layer, ctx), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))]
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))]
|
||||
async fn flush_frozen_layer(
|
||||
&self,
|
||||
self: &Arc<Self>,
|
||||
frozen_layer: Arc<InMemoryLayer>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
@@ -2719,7 +2840,16 @@ impl Timeline {
|
||||
.await?
|
||||
} else {
|
||||
// normal case, write out a L0 delta layer file.
|
||||
let (delta_path, metadata) = self.create_delta_layer(&frozen_layer)?;
|
||||
let this = self.clone();
|
||||
let frozen_layer = frozen_layer.clone();
|
||||
let span = tracing::info_span!("blocking");
|
||||
let (delta_path, metadata) = tokio::task::spawn_blocking(move || {
|
||||
let _g = span.entered();
|
||||
this.create_delta_layer(&frozen_layer)
|
||||
})
|
||||
.await
|
||||
.context("create_delta_layer spawn_blocking")
|
||||
.and_then(|res| res)?;
|
||||
HashMap::from([(delta_path, metadata)])
|
||||
};
|
||||
|
||||
@@ -2823,7 +2953,7 @@ impl Timeline {
|
||||
|
||||
// Write out the given frozen in-memory layer as a new L0 delta file
|
||||
fn create_delta_layer(
|
||||
&self,
|
||||
self: &Arc<Self>,
|
||||
frozen_layer: &InMemoryLayer,
|
||||
) -> anyhow::Result<(LayerFileName, LayerFileMetadata)> {
|
||||
// Write it out
|
||||
@@ -2839,10 +2969,13 @@ impl Timeline {
|
||||
// TODO: If we're running inside 'flush_frozen_layers' and there are multiple
|
||||
// files to flush, it might be better to first write them all, and then fsync
|
||||
// them all in parallel.
|
||||
par_fsync::par_fsync(&[
|
||||
new_delta_path.clone(),
|
||||
self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
|
||||
])?;
|
||||
|
||||
// First sync the delta layer. We still use par_fsync here to keep everything consistent. Feel free to replace
|
||||
// this with a single fsync in future refactors.
|
||||
par_fsync::par_fsync(&[new_delta_path.clone()]).context("fsync of delta layer")?;
|
||||
// Then sync the parent directory.
|
||||
par_fsync::par_fsync(&[self.conf.timeline_path(&self.timeline_id, &self.tenant_id)])
|
||||
.context("fsync of timeline dir")?;
|
||||
|
||||
// Add it to the layer map
|
||||
let l = Arc::new(new_delta);
|
||||
@@ -2853,7 +2986,7 @@ impl Timeline {
|
||||
LayerResidenceStatus::Resident,
|
||||
LayerResidenceEventReason::LayerCreate,
|
||||
);
|
||||
batch_updates.insert_historic(l);
|
||||
batch_updates.insert_historic(l.layer_desc().clone(), l);
|
||||
batch_updates.flush();
|
||||
|
||||
// update the timeline's physical size
|
||||
@@ -2904,6 +3037,30 @@ impl Timeline {
|
||||
let layers = self.layers.read().unwrap();
|
||||
|
||||
let mut max_deltas = 0;
|
||||
{
|
||||
let wanted_image_layers = self.wanted_image_layers.lock().unwrap();
|
||||
if let Some((cutoff_lsn, wanted)) = &*wanted_image_layers {
|
||||
let img_range =
|
||||
partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end;
|
||||
if wanted.overlaps(&img_range) {
|
||||
//
|
||||
// gc_timeline only pays attention to image layers that are older than the GC cutoff,
|
||||
// but create_image_layers creates image layers at last-record-lsn.
|
||||
// So it's possible that gc_timeline wants a new image layer to be created for a key range,
|
||||
// but the range is already covered by image layers at more recent LSNs. Before we
|
||||
// create a new image layer, check if the range is already covered at more recent LSNs.
|
||||
if !layers
|
||||
.image_layer_exists(&img_range, &(Lsn::min(lsn, *cutoff_lsn)..lsn + 1))?
|
||||
{
|
||||
debug!(
|
||||
"Force generation of layer {}-{} wanted by GC, cutoff={}, lsn={})",
|
||||
img_range.start, img_range.end, cutoff_lsn, lsn
|
||||
);
|
||||
return Ok(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for part_range in &partition.ranges {
|
||||
let image_coverage = layers.image_coverage(part_range, lsn)?;
|
||||
@@ -2979,6 +3136,7 @@ impl Timeline {
|
||||
self.tenant_id,
|
||||
&img_range,
|
||||
lsn,
|
||||
false, // image layer always covers the full range
|
||||
)?;
|
||||
|
||||
fail_point!("image-layer-writer-fail-before-finish", |_| {
|
||||
@@ -3023,6 +3181,12 @@ impl Timeline {
|
||||
image_layers.push(image_layer);
|
||||
}
|
||||
}
|
||||
// All layers that the GC wanted us to create have now been created.
|
||||
//
|
||||
// It's possible that another GC cycle happened while we were compacting, and added
|
||||
// something new to wanted_image_layers, and we now clear that before processing it.
|
||||
// That's OK, because the next GC iteration will put it back in.
|
||||
*self.wanted_image_layers.lock().unwrap() = None;
|
||||
|
||||
// Sync the new layer to disk before adding it to the layer map, to make sure
|
||||
// we don't garbage collect something based on the new layer, before it has
|
||||
@@ -3036,17 +3200,22 @@ impl Timeline {
|
||||
let all_paths = image_layers
|
||||
.iter()
|
||||
.map(|layer| layer.path())
|
||||
.chain(std::iter::once(
|
||||
self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
|
||||
))
|
||||
.collect::<Vec<_>>();
|
||||
par_fsync::par_fsync(&all_paths).context("fsync of newly created layer files")?;
|
||||
|
||||
par_fsync::par_fsync_async(&all_paths)
|
||||
.await
|
||||
.context("fsync of newly created layer files")?;
|
||||
|
||||
par_fsync::par_fsync_async(&[self.conf.timeline_path(&self.timeline_id, &self.tenant_id)])
|
||||
.await
|
||||
.context("fsync of timeline dir")?;
|
||||
|
||||
let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len());
|
||||
|
||||
let mut layers = self.layers.write().unwrap();
|
||||
let mut updates = layers.batch_update();
|
||||
let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
|
||||
|
||||
for l in image_layers {
|
||||
let path = l.filename();
|
||||
let metadata = timeline_path
|
||||
@@ -3065,7 +3234,7 @@ impl Timeline {
|
||||
LayerResidenceStatus::Resident,
|
||||
LayerResidenceEventReason::LayerCreate,
|
||||
);
|
||||
updates.insert_historic(l);
|
||||
updates.insert_historic(l.layer_desc().clone(), l);
|
||||
}
|
||||
updates.flush();
|
||||
drop(layers);
|
||||
@@ -3105,9 +3274,9 @@ impl Timeline {
|
||||
/// This method takes the `_layer_removal_cs` guard to highlight that required downloads are
|
||||
/// returned as an error. If the `layer_removal_cs` boundary is changed not to be taken in the
|
||||
/// start of level0 files compaction, the on-demand download should be revisited as well.
|
||||
async fn compact_level0_phase1(
|
||||
fn compact_level0_phase1(
|
||||
&self,
|
||||
_layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
|
||||
_layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
target_file_size: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<CompactLevel0Phase1Result, CompactionError> {
|
||||
@@ -3420,13 +3589,13 @@ impl Timeline {
|
||||
if !new_layers.is_empty() {
|
||||
let mut layer_paths: Vec<PathBuf> = new_layers.iter().map(|l| l.path()).collect();
|
||||
|
||||
// also sync the directory
|
||||
layer_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id));
|
||||
|
||||
// Fsync all the layer files and directory using multiple threads to
|
||||
// minimize latency.
|
||||
par_fsync::par_fsync(&layer_paths).context("fsync all new layers")?;
|
||||
|
||||
par_fsync::par_fsync(&[self.conf.timeline_path(&self.timeline_id, &self.tenant_id)])
|
||||
.context("fsync of timeline dir")?;
|
||||
|
||||
layer_paths.pop().unwrap();
|
||||
}
|
||||
|
||||
@@ -3443,17 +3612,26 @@ impl Timeline {
|
||||
/// as Level 1 files.
|
||||
///
|
||||
async fn compact_level0(
|
||||
&self,
|
||||
layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
|
||||
self: &Arc<Self>,
|
||||
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
target_file_size: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), CompactionError> {
|
||||
let this = self.clone();
|
||||
let ctx_inner = ctx.clone();
|
||||
let layer_removal_cs_inner = layer_removal_cs.clone();
|
||||
let span = tracing::info_span!("blocking");
|
||||
let CompactLevel0Phase1Result {
|
||||
new_layers,
|
||||
deltas_to_compact,
|
||||
} = self
|
||||
.compact_level0_phase1(layer_removal_cs, target_file_size, ctx)
|
||||
.await?;
|
||||
} = tokio::task::spawn_blocking(move || {
|
||||
let _g = span.entered();
|
||||
this.compact_level0_phase1(layer_removal_cs_inner, target_file_size, &ctx_inner)
|
||||
})
|
||||
.await
|
||||
.context("compact_level0_phase1 spawn_blocking")
|
||||
.map_err(CompactionError::Other)
|
||||
.and_then(|res| res)?;
|
||||
|
||||
if new_layers.is_empty() && deltas_to_compact.is_empty() {
|
||||
// nothing to do
|
||||
@@ -3503,7 +3681,7 @@ impl Timeline {
|
||||
LayerResidenceStatus::Resident,
|
||||
LayerResidenceEventReason::LayerCreate,
|
||||
);
|
||||
updates.insert_historic(x);
|
||||
updates.insert_historic(x.layer_desc().clone(), x);
|
||||
}
|
||||
|
||||
// Now that we have reshuffled the data to set of new delta layers, we can
|
||||
@@ -3511,7 +3689,7 @@ impl Timeline {
|
||||
let mut layer_names_to_delete = Vec::with_capacity(deltas_to_compact.len());
|
||||
for l in deltas_to_compact {
|
||||
layer_names_to_delete.push(l.filename());
|
||||
self.delete_historic_layer(layer_removal_cs, l, &mut updates)?;
|
||||
self.delete_historic_layer(layer_removal_cs.clone(), l, &mut updates)?;
|
||||
}
|
||||
updates.flush();
|
||||
drop(layers);
|
||||
@@ -3631,10 +3809,9 @@ impl Timeline {
|
||||
|
||||
fail_point!("before-timeline-gc");
|
||||
|
||||
let layer_removal_cs = self.layer_removal_cs.lock().await;
|
||||
let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await);
|
||||
// Is the timeline being deleted?
|
||||
let state = *self.state.borrow();
|
||||
if state == TimelineState::Stopping {
|
||||
if self.is_stopping() {
|
||||
anyhow::bail!("timeline is Stopping");
|
||||
}
|
||||
|
||||
@@ -3651,7 +3828,7 @@ impl Timeline {
|
||||
|
||||
let res = self
|
||||
.gc_timeline(
|
||||
&layer_removal_cs,
|
||||
layer_removal_cs.clone(),
|
||||
horizon_cutoff,
|
||||
pitr_cutoff,
|
||||
retain_lsns,
|
||||
@@ -3670,7 +3847,7 @@ impl Timeline {
|
||||
|
||||
async fn gc_timeline(
|
||||
&self,
|
||||
layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
|
||||
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
horizon_cutoff: Lsn,
|
||||
pitr_cutoff: Lsn,
|
||||
retain_lsns: Vec<Lsn>,
|
||||
@@ -3720,6 +3897,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
let mut layers_to_remove = Vec::new();
|
||||
let mut wanted_image_layers = KeySpaceRandomAccum::default();
|
||||
|
||||
// Scan all layers in the timeline (remote or on-disk).
|
||||
//
|
||||
@@ -3803,6 +3981,15 @@ impl Timeline {
|
||||
"keeping {} because it is the latest layer",
|
||||
l.filename().file_name()
|
||||
);
|
||||
// Collect delta key ranges that need image layers to allow garbage
// collecting the layers.
// It is not so obvious whether we need to propagate information only about
// delta layers. Image layers can form "stairs" preventing old images from being deleted.
// But image layers are in any case less sparse than delta layers. Also we need some
// protection from replacing recent image layers with new ones after each GC iteration.
if self.get_gc_feedback() && l.is_incremental() && !LayerMap::is_l0(&*l) {
|
||||
wanted_image_layers.add_range(l.get_key_range());
|
||||
}
|
||||
result.layers_not_updated += 1;
|
||||
continue 'outer;
|
||||
}
|
||||
@@ -3815,6 +4002,10 @@ impl Timeline {
|
||||
);
|
||||
layers_to_remove.push(Arc::clone(&l));
|
||||
}
|
||||
self.wanted_image_layers
|
||||
.lock()
|
||||
.unwrap()
|
||||
.replace((new_gc_cutoff, wanted_image_layers.to_keyspace()));
|
||||
|
||||
let mut updates = layers.batch_update();
|
||||
if !layers_to_remove.is_empty() {
|
||||
@@ -3829,7 +4020,11 @@ impl Timeline {
|
||||
{
|
||||
for doomed_layer in layers_to_remove {
|
||||
layer_names_to_delete.push(doomed_layer.filename());
|
||||
self.delete_historic_layer(layer_removal_cs, doomed_layer, &mut updates)?; // FIXME: schedule succeeded deletions before returning?
|
||||
self.delete_historic_layer(
|
||||
layer_removal_cs.clone(),
|
||||
doomed_layer,
|
||||
&mut updates,
|
||||
)?; // FIXME: schedule succeeded deletions before returning?
|
||||
result.layers_removed += 1;
|
||||
}
|
||||
}
|
||||
@@ -4001,7 +4196,7 @@ impl Timeline {
|
||||
// Does retries + exponential back-off internally.
|
||||
// When this fails, don't layer further retry attempts here.
|
||||
let result = remote_client
|
||||
.download_layer_file(&remote_layer.file_name, &remote_layer.layer_metadata)
|
||||
.download_layer_file(&remote_layer.filename(), &remote_layer.layer_metadata)
|
||||
.await;
|
||||
|
||||
if let Ok(size) = &result {
|
||||
@@ -4019,7 +4214,7 @@ impl Timeline {
|
||||
{
|
||||
use crate::tenant::layer_map::Replacement;
|
||||
let l: Arc<dyn PersistentLayer> = remote_layer.clone();
|
||||
let failure = match updates.replace_historic(&l, new_layer) {
|
||||
let failure = match updates.replace_historic(l.layer_desc().clone(), &l, new_layer.layer_desc().clone(), new_layer) {
|
||||
Ok(Replacement::Replaced { .. }) => false,
|
||||
Ok(Replacement::NotFound) => {
|
||||
// TODO: the downloaded file should probably be removed, otherwise
|
||||
|
||||
@@ -34,6 +34,8 @@ use crate::{
|
||||
},
|
||||
};
|
||||
|
||||
use utils::completion;
|
||||
|
||||
use super::Timeline;
|
||||
|
||||
#[derive(Default)]
|
||||
@@ -47,8 +49,12 @@ pub struct EvictionTaskTenantState {
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
pub(super) fn launch_eviction_task(self: &Arc<Self>) {
|
||||
pub(super) fn launch_eviction_task(
|
||||
self: &Arc<Self>,
|
||||
background_tasks_can_start: Option<&completion::Barrier>,
|
||||
) {
|
||||
let self_clone = Arc::clone(self);
|
||||
let background_tasks_can_start = background_tasks_can_start.cloned();
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::Eviction,
|
||||
@@ -57,7 +63,13 @@ impl Timeline {
|
||||
&format!("layer eviction for {}/{}", self.tenant_id, self.timeline_id),
|
||||
false,
|
||||
async move {
|
||||
self_clone.eviction_task(task_mgr::shutdown_token()).await;
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return Ok(()); }
|
||||
_ = completion::Barrier::maybe_wait(background_tasks_can_start) => {}
|
||||
};
|
||||
|
||||
self_clone.eviction_task(cancel).await;
|
||||
info!("eviction task finishing");
|
||||
Ok(())
|
||||
},
|
||||
|
||||
@@ -25,20 +25,19 @@ mod walreceiver_connection;
|
||||
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::task_mgr::{self, TaskKind, WALRECEIVER_RUNTIME};
|
||||
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::timeline::walreceiver::connection_manager::{
|
||||
connection_manager_loop_step, ConnectionManagerState,
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use std::future::Future;
|
||||
use std::num::NonZeroU64;
|
||||
use std::ops::ControlFlow;
|
||||
use std::sync::atomic::{self, AtomicBool};
|
||||
use std::sync::{Arc, Weak};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use tokio::select;
|
||||
use tokio::sync::{watch, RwLock};
|
||||
use tokio::sync::watch;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
|
||||
@@ -62,46 +61,23 @@ pub struct WalReceiverConf {
|
||||
|
||||
pub struct WalReceiver {
|
||||
timeline: TenantTimelineId,
|
||||
timeline_ref: Weak<Timeline>,
|
||||
conf: WalReceiverConf,
|
||||
started: AtomicBool,
|
||||
manager_status: Arc<RwLock<Option<ConnectionManagerStatus>>>,
|
||||
manager_status: Arc<std::sync::RwLock<Option<ConnectionManagerStatus>>>,
|
||||
}
|
||||
|
||||
impl WalReceiver {
|
||||
pub fn new(
|
||||
timeline: TenantTimelineId,
|
||||
timeline_ref: Weak<Timeline>,
|
||||
conf: WalReceiverConf,
|
||||
) -> Self {
|
||||
Self {
|
||||
timeline,
|
||||
timeline_ref,
|
||||
conf,
|
||||
started: AtomicBool::new(false),
|
||||
manager_status: Arc::new(RwLock::new(None)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
timeline: Arc<Timeline>,
|
||||
conf: WalReceiverConf,
|
||||
mut broker_client: BrokerClientChannel,
|
||||
) -> anyhow::Result<()> {
|
||||
if self.started.load(atomic::Ordering::Acquire) {
|
||||
anyhow::bail!("Wal receiver is already started");
|
||||
}
|
||||
|
||||
let timeline = self.timeline_ref.upgrade().with_context(|| {
|
||||
format!("walreceiver start on a dropped timeline {}", self.timeline)
|
||||
})?;
|
||||
|
||||
ctx: &RequestContext,
|
||||
) -> Self {
|
||||
let tenant_id = timeline.tenant_id;
|
||||
let timeline_id = timeline.timeline_id;
|
||||
let walreceiver_ctx =
|
||||
ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error);
|
||||
let wal_receiver_conf = self.conf.clone();
|
||||
let loop_status = Arc::clone(&self.manager_status);
|
||||
|
||||
let loop_status = Arc::new(std::sync::RwLock::new(None));
|
||||
let manager_status = Arc::clone(&loop_status);
|
||||
task_mgr::spawn(
|
||||
WALRECEIVER_RUNTIME.handle(),
|
||||
TaskKind::WalReceiverManager,
|
||||
@@ -110,15 +86,16 @@ impl WalReceiver {
|
||||
&format!("walreceiver for timeline {tenant_id}/{timeline_id}"),
|
||||
false,
|
||||
async move {
|
||||
info!("WAL receiver manager started, connecting to broker");
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
debug!("WAL receiver manager started, connecting to broker");
|
||||
let mut connection_manager_state = ConnectionManagerState::new(
|
||||
timeline,
|
||||
wal_receiver_conf,
|
||||
conf,
|
||||
);
|
||||
loop {
|
||||
select! {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
info!("WAL receiver shutdown requested, shutting down");
|
||||
trace!("WAL receiver shutdown requested, shutting down");
|
||||
break;
|
||||
},
|
||||
loop_step_result = connection_manager_loop_step(
|
||||
@@ -129,7 +106,7 @@ impl WalReceiver {
|
||||
) => match loop_step_result {
|
||||
ControlFlow::Continue(()) => continue,
|
||||
ControlFlow::Break(()) => {
|
||||
info!("Connection manager loop ended, shutting down");
|
||||
trace!("Connection manager loop ended, shutting down");
|
||||
break;
|
||||
}
|
||||
},
|
||||
@@ -137,29 +114,29 @@ impl WalReceiver {
|
||||
}
|
||||
|
||||
connection_manager_state.shutdown().await;
|
||||
*loop_status.write().await = None;
|
||||
*loop_status.write().unwrap() = None;
|
||||
Ok(())
|
||||
}
|
||||
.instrument(info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id))
|
||||
.instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_id, timeline_id = %timeline_id))
|
||||
);
|
||||
|
||||
self.started.store(true, atomic::Ordering::Release);
|
||||
|
||||
Ok(())
|
||||
Self {
|
||||
timeline: TenantTimelineId::new(tenant_id, timeline_id),
|
||||
manager_status,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn stop(&self) {
|
||||
pub async fn stop(self) {
|
||||
task_mgr::shutdown_tasks(
|
||||
Some(TaskKind::WalReceiverManager),
|
||||
Some(self.timeline.tenant_id),
|
||||
Some(self.timeline.timeline_id),
|
||||
)
|
||||
.await;
|
||||
self.started.store(false, atomic::Ordering::Release);
|
||||
}
|
||||
|
||||
pub(super) async fn status(&self) -> Option<ConnectionManagerStatus> {
|
||||
self.manager_status.read().await.clone()
|
||||
pub(super) fn status(&self) -> Option<ConnectionManagerStatus> {
|
||||
self.manager_status.read().unwrap().clone()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -223,29 +200,19 @@ impl<E: Clone> TaskHandle<E> {
|
||||
TaskEvent::End(match self.join_handle.as_mut() {
|
||||
Some(jh) => {
|
||||
if !jh.is_finished() {
|
||||
// Barring any implementation errors in this module, we can
|
||||
// only arrive here while the task that executes the future
|
||||
// passed to `Self::spawn()` is still executing. Cf. the comment
|
||||
// in Self::spawn().
|
||||
//
|
||||
// This was logging at warning level in earlier versions, presumably
|
||||
// to leave some breadcrumbs in case we had an implementation
|
||||
// error that would make us get stuck in `jh.await`.
|
||||
//
|
||||
// There hasn't been such a bug so far.
|
||||
// But in a busy system, e.g., during pageserver restart,
|
||||
// we arrive here often enough that the warning-level logs
|
||||
// became a distraction.
|
||||
// So, tone them down to info-level.
|
||||
//
|
||||
// XXX: rewrite this module to eliminate the race condition.
|
||||
info!("sender is dropped while join handle is still alive");
|
||||
// See: https://github.com/neondatabase/neon/issues/2885
|
||||
trace!("sender is dropped while join handle is still alive");
|
||||
}
|
||||
|
||||
let res = jh
|
||||
.await
|
||||
.map_err(|e| anyhow::anyhow!("Failed to join task: {e}"))
|
||||
.and_then(|x| x);
|
||||
let res = match jh.await {
|
||||
Ok(res) => res,
|
||||
Err(je) if je.is_cancelled() => unreachable!("not used"),
|
||||
Err(je) if je.is_panic() => {
|
||||
// already logged
|
||||
Ok(())
|
||||
}
|
||||
Err(je) => Err(anyhow::Error::new(je).context("join walreceiver task")),
|
||||
};
|
||||
|
||||
// For cancellation-safety, drop join_handle only after successful .await.
|
||||
self.join_handle = None;
|
||||
@@ -268,12 +235,12 @@ impl<E: Clone> TaskHandle<E> {
|
||||
match jh.await {
|
||||
Ok(Ok(())) => debug!("Shutdown success"),
|
||||
Ok(Err(e)) => error!("Shutdown task error: {e:?}"),
|
||||
Err(join_error) => {
|
||||
if join_error.is_cancelled() {
|
||||
error!("Shutdown task was cancelled");
|
||||
} else {
|
||||
error!("Shutdown task join error: {join_error}")
|
||||
}
|
||||
Err(je) if je.is_cancelled() => unreachable!("not used"),
|
||||
Err(je) if je.is_panic() => {
|
||||
// already logged
|
||||
}
|
||||
Err(je) => {
|
||||
error!("Shutdown task join error: {je}")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,7 +18,7 @@ use crate::metrics::{
|
||||
WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES,
|
||||
};
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline};
|
||||
use anyhow::Context;
|
||||
use chrono::{NaiveDateTime, Utc};
|
||||
use pageserver_api::models::TimelineState;
|
||||
@@ -29,7 +29,6 @@ use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use storage_broker::Streaming;
|
||||
use tokio::select;
|
||||
use tokio::sync::RwLock;
|
||||
use tracing::*;
|
||||
|
||||
use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
|
||||
@@ -48,7 +47,7 @@ pub(super) async fn connection_manager_loop_step(
|
||||
broker_client: &mut BrokerClientChannel,
|
||||
connection_manager_state: &mut ConnectionManagerState,
|
||||
ctx: &RequestContext,
|
||||
manager_status: &RwLock<Option<ConnectionManagerStatus>>,
|
||||
manager_status: &std::sync::RwLock<Option<ConnectionManagerStatus>>,
|
||||
) -> ControlFlow<(), ()> {
|
||||
match connection_manager_state
|
||||
.timeline
|
||||
@@ -56,8 +55,11 @@ pub(super) async fn connection_manager_loop_step(
|
||||
.await
|
||||
{
|
||||
Ok(()) => {}
|
||||
Err(_) => {
|
||||
info!("Timeline dropped state updates sender before becoming active, stopping wal connection manager loop");
|
||||
Err(new_state) => {
|
||||
debug!(
|
||||
?new_state,
|
||||
"state changed, stopping wal connection manager loop"
|
||||
);
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
}
|
||||
@@ -80,7 +82,7 @@ pub(super) async fn connection_manager_loop_step(
|
||||
// with other streams on this client (other connection managers). When
|
||||
// the object goes out of scope, the stream finishes in drop() automatically.
|
||||
let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id).await;
|
||||
info!("Subscribed for broker timeline updates");
|
||||
debug!("Subscribed for broker timeline updates");
|
||||
|
||||
loop {
|
||||
let time_until_next_retry = connection_manager_state.time_until_next_retry();
|
||||
@@ -151,13 +153,13 @@ pub(super) async fn connection_manager_loop_step(
|
||||
match new_state {
|
||||
// we're already active as walreceiver, no need to reactivate
|
||||
TimelineState::Active => continue,
|
||||
TimelineState::Broken | TimelineState::Stopping => {
|
||||
info!("timeline entered terminal state {new_state:?}, stopping wal connection manager loop");
|
||||
TimelineState::Broken { .. } | TimelineState::Stopping => {
|
||||
debug!("timeline entered terminal state {new_state:?}, stopping wal connection manager loop");
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
TimelineState::Loading => {
|
||||
warn!("timeline transitioned back to Loading state, that should not happen");
|
||||
return ControlFlow::Continue(new_state);
|
||||
return ControlFlow::Continue(());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -165,12 +167,11 @@ pub(super) async fn connection_manager_loop_step(
|
||||
}
|
||||
}
|
||||
} => match new_event {
|
||||
ControlFlow::Continue(new_state) => {
|
||||
info!("observed timeline state change, new state is {new_state:?}");
|
||||
ControlFlow::Continue(()) => {
|
||||
return ControlFlow::Continue(());
|
||||
}
|
||||
ControlFlow::Break(()) => {
|
||||
info!("Timeline dropped state updates sender, stopping wal connection manager loop");
|
||||
debug!("Timeline is no longer active, stopping wal connection manager loop");
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
},
|
||||
@@ -195,7 +196,7 @@ pub(super) async fn connection_manager_loop_step(
|
||||
.change_connection(new_candidate, ctx)
|
||||
.await
|
||||
}
|
||||
*manager_status.write().await = Some(connection_manager_state.manager_status());
|
||||
*manager_status.write().unwrap() = Some(connection_manager_state.manager_status());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -391,7 +392,6 @@ impl ConnectionManagerState {
|
||||
|
||||
self.drop_old_connection(true).await;
|
||||
|
||||
let id = self.id;
|
||||
let node_id = new_sk.safekeeper_id;
|
||||
let connect_timeout = self.conf.wal_connect_timeout;
|
||||
let timeline = Arc::clone(&self.timeline);
|
||||
@@ -399,9 +399,13 @@ impl ConnectionManagerState {
|
||||
TaskKind::WalReceiverConnectionHandler,
|
||||
DownloadBehavior::Download,
|
||||
);
|
||||
|
||||
let span = info_span!("connection", %node_id);
|
||||
let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| {
|
||||
async move {
|
||||
super::walreceiver_connection::handle_walreceiver_connection(
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
let res = super::walreceiver_connection::handle_walreceiver_connection(
|
||||
timeline,
|
||||
new_sk.wal_source_connconf,
|
||||
events_sender,
|
||||
@@ -410,12 +414,23 @@ impl ConnectionManagerState {
|
||||
ctx,
|
||||
node_id,
|
||||
)
|
||||
.await
|
||||
.context("walreceiver connection handling failure")
|
||||
.await;
|
||||
|
||||
match res {
|
||||
Ok(()) => Ok(()),
|
||||
Err(e) => {
|
||||
use super::walreceiver_connection::ExpectedError;
|
||||
if e.is_expected() {
|
||||
info!("walreceiver connection handling ended: {e:#}");
|
||||
Ok(())
|
||||
} else {
|
||||
// give out an error so that task_mgr logs it really verbosely
|
||||
Err(e).context("walreceiver connection handling failure")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
.instrument(
|
||||
info_span!("walreceiver_connection", tenant_id = %id.tenant_id, timeline_id = %id.timeline_id, %node_id),
|
||||
)
|
||||
.instrument(span)
|
||||
});
|
||||
|
||||
let now = Utc::now().naive_utc();
|
||||
@@ -1309,9 +1324,8 @@ mod tests {
|
||||
async fn dummy_state(harness: &TenantHarness<'_>) -> ConnectionManagerState {
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let timeline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION, &ctx)
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION, &ctx)
|
||||
.expect("Failed to create an empty timeline for dummy wal connection manager");
|
||||
let timeline = timeline.initialize(&ctx).unwrap();
|
||||
|
||||
ConnectionManagerState {
|
||||
id: TenantTimelineId {
|
||||
|
||||
@@ -21,16 +21,16 @@ use postgres_types::PgLsn;
|
||||
use tokio::{select, sync::watch, time};
|
||||
use tokio_postgres::{replication::ReplicationStream, Client};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, error, info, trace, warn};
|
||||
use tracing::{debug, error, info, trace, warn, Instrument};
|
||||
|
||||
use super::TaskStateUpdate;
|
||||
use crate::metrics::LIVE_CONNECTIONS_COUNT;
|
||||
use crate::{context::RequestContext, metrics::WALRECEIVER_STARTED_CONNECTIONS};
|
||||
use crate::{
|
||||
context::RequestContext,
|
||||
metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS},
|
||||
task_mgr,
|
||||
task_mgr::TaskKind,
|
||||
task_mgr::WALRECEIVER_RUNTIME,
|
||||
tenant::{Timeline, WalReceiverInfo},
|
||||
tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
|
||||
walingest::WalIngest,
|
||||
walrecord::DecodedWALRecord,
|
||||
};
|
||||
@@ -81,13 +81,8 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
config.application_name("pageserver");
|
||||
config.replication_mode(tokio_postgres::config::ReplicationMode::Physical);
|
||||
match time::timeout(connect_timeout, config.connect(postgres::NoTls)).await {
|
||||
Ok(Ok(client_and_conn)) => client_and_conn,
|
||||
Ok(Err(conn_err)) => {
|
||||
let expected_error = ignore_expected_errors(conn_err)?;
|
||||
info!("DB connection stream finished: {expected_error}");
|
||||
return Ok(());
|
||||
}
|
||||
Err(_) => {
|
||||
Ok(client_and_conn) => client_and_conn?,
|
||||
Err(_elapsed) => {
|
||||
// Timing out while connecting to a safekeeper node can happen for a long time, due to
|
||||
// many reasons that pageserver cannot control.
|
||||
// Do not produce an error, but make it visible that timeouts happen, by logging the event.
|
||||
@@ -97,7 +92,7 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
}
|
||||
};
|
||||
|
||||
info!("connected!");
|
||||
debug!("connected!");
|
||||
let mut connection_status = WalConnectionStatus {
|
||||
is_connected: true,
|
||||
has_processed_wal: false,
|
||||
@@ -127,20 +122,25 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
"walreceiver connection",
|
||||
false,
|
||||
async move {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
select! {
|
||||
connection_result = connection => match connection_result {
|
||||
Ok(()) => info!("Walreceiver db connection closed"),
|
||||
Ok(()) => debug!("Walreceiver db connection closed"),
|
||||
Err(connection_error) => {
|
||||
if let Err(e) = ignore_expected_errors(connection_error) {
|
||||
warn!("Connection aborted: {e:#}")
|
||||
if connection_error.is_expected() {
|
||||
// silence, because most likely we've already exited the outer call
|
||||
// with a similar error.
|
||||
} else {
|
||||
warn!("Connection aborted: {connection_error:#}")
|
||||
}
|
||||
}
|
||||
},
|
||||
// Future: replace connection_cancellation with connection_ctx cancellation
|
||||
_ = connection_cancellation.cancelled() => info!("Connection cancelled"),
|
||||
_ = connection_cancellation.cancelled() => debug!("Connection cancelled"),
|
||||
}
|
||||
Ok(())
|
||||
},
|
||||
}
|
||||
.instrument(tracing::info_span!("poller")),
|
||||
);
|
||||
|
||||
// Immediately increment the gauge, then create a job to decrement it on task exit.
|
||||
@@ -203,20 +203,13 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
while let Some(replication_message) = {
|
||||
select! {
|
||||
_ = cancellation.cancelled() => {
|
||||
info!("walreceiver interrupted");
|
||||
debug!("walreceiver interrupted");
|
||||
None
|
||||
}
|
||||
replication_message = physical_stream.next() => replication_message,
|
||||
}
|
||||
} {
|
||||
let replication_message = match replication_message {
|
||||
Ok(message) => message,
|
||||
Err(replication_error) => {
|
||||
let expected_error = ignore_expected_errors(replication_error)?;
|
||||
info!("Replication stream finished: {expected_error}");
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
let replication_message = replication_message?;
|
||||
|
||||
let now = Utc::now().naive_utc();
|
||||
let last_rec_lsn_before_msg = last_rec_lsn;
|
||||
@@ -261,8 +254,6 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
let mut decoded = DecodedWALRecord::default();
|
||||
let mut modification = timeline.begin_modification(endlsn);
|
||||
while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
// let _enter = info_span!("processing record", lsn = %lsn).entered();
|
||||
|
||||
// It is important to deal with the aligned records as lsn in getPage@LSN is
|
||||
// aligned and can be several bytes bigger. Without this alignment we are
|
||||
// at risk of hitting a deadlock.
|
||||
@@ -421,31 +412,50 @@ async fn identify_system(client: &mut Client) -> anyhow::Result<IdentifySystem>
|
||||
}
|
||||
}
|
||||
|
||||
/// We don't want to report connectivity problems as real errors towards connection manager because
|
||||
/// 1. they happen frequently enough to make server logs hard to read and
|
||||
/// 2. the connection manager can retry other safekeeper.
|
||||
///
|
||||
/// If this function returns `Ok(pg_error)`, it's such an error.
|
||||
/// The caller should log it at info level and then report to connection manager that we're done handling this connection.
|
||||
/// Connection manager will then handle reconnections.
|
||||
///
|
||||
/// If this function returns an `Err()`, the caller can bubble it up using `?`.
|
||||
/// The connection manager will log the error at ERROR level.
|
||||
fn ignore_expected_errors(pg_error: postgres::Error) -> anyhow::Result<postgres::Error> {
|
||||
if pg_error.is_closed()
|
||||
|| pg_error
|
||||
.source()
|
||||
.and_then(|source| source.downcast_ref::<std::io::Error>())
|
||||
.map(is_expected_io_error)
|
||||
.unwrap_or(false)
|
||||
{
|
||||
return Ok(pg_error);
|
||||
} else if let Some(db_error) = pg_error.as_db_error() {
|
||||
if db_error.code() == &SqlState::SUCCESSFUL_COMPLETION
|
||||
&& db_error.message().contains("ending streaming")
|
||||
{
|
||||
return Ok(pg_error);
|
||||
}
|
||||
}
|
||||
Err(pg_error).context("connection error")
|
||||
/// Trait for avoiding reports of walreceiver-specific expected ("normal", "ok") errors.
|
||||
pub(super) trait ExpectedError {
|
||||
/// Test if this error is an ok error.
|
||||
///
|
||||
/// We don't want to report connectivity problems as real errors towards connection manager because
|
||||
/// 1. they happen frequently enough to make server logs hard to read and
|
||||
/// 2. the connection manager can retry another safekeeper.
|
||||
///
|
||||
/// If this function returns `true`, it's such an error.
|
||||
/// The caller should log it at info level and then report to connection manager that we're done handling this connection.
|
||||
/// Connection manager will then handle reconnections.
|
||||
///
|
||||
/// If this function returns `false`, the error should be propagated and the connection manager
|
||||
/// will log the error at ERROR level.
|
||||
fn is_expected(&self) -> bool;
|
||||
}
|
||||
|
||||
impl ExpectedError for postgres::Error {
|
||||
fn is_expected(&self) -> bool {
|
||||
self.is_closed()
|
||||
|| self
|
||||
.source()
|
||||
.and_then(|source| source.downcast_ref::<std::io::Error>())
|
||||
.map(is_expected_io_error)
|
||||
.unwrap_or(false)
|
||||
|| self
|
||||
.as_db_error()
|
||||
.filter(|db_error| {
|
||||
db_error.code() == &SqlState::SUCCESSFUL_COMPLETION
|
||||
&& db_error.message().contains("ending streaming")
|
||||
})
|
||||
.is_some()
|
||||
}
|
||||
}
|
||||
|
||||
impl ExpectedError for anyhow::Error {
|
||||
fn is_expected(&self) -> bool {
|
||||
let head = self.downcast_ref::<postgres::Error>();
|
||||
|
||||
let tail = self
|
||||
.chain()
|
||||
.filter_map(|e| e.downcast_ref::<postgres::Error>());
|
||||
|
||||
// check if self or any of the chained/sourced errors are expected
|
||||
head.into_iter().chain(tail).any(|e| e.is_expected())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -76,6 +76,12 @@ pub(crate) struct UploadQueueInitialized {
|
||||
pub(crate) queued_operations: VecDeque<UploadOp>,
|
||||
}
|
||||
|
||||
impl UploadQueueInitialized {
|
||||
pub(super) fn no_pending_work(&self) -> bool {
|
||||
self.inprogress_tasks.is_empty() && self.queued_operations.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
pub(super) enum SetDeletedFlagProgress {
|
||||
NotRunning,
|
||||
@@ -84,9 +90,7 @@ pub(super) enum SetDeletedFlagProgress {
|
||||
}
|
||||
|
||||
pub(super) struct UploadQueueStopped {
|
||||
pub(super) latest_files: HashMap<LayerFileName, LayerFileMetadata>,
|
||||
pub(super) last_uploaded_consistent_lsn: Lsn,
|
||||
pub(super) latest_metadata: TimelineMetadata,
|
||||
pub(super) upload_queue_for_deletion: UploadQueueInitialized,
|
||||
pub(super) deleted_at: SetDeletedFlagProgress,
|
||||
}
|
||||
|
||||
@@ -187,6 +191,15 @@ impl UploadQueue {
|
||||
UploadQueue::Initialized(x) => Ok(x),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn stopped_mut(&mut self) -> anyhow::Result<&mut UploadQueueStopped> {
|
||||
match self {
|
||||
UploadQueue::Initialized(_) | UploadQueue::Uninitialized => {
|
||||
anyhow::bail!("queue is in state {}", self.as_str())
|
||||
}
|
||||
UploadQueue::Stopped(stopped) => Ok(stopped),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// An in-progress upload or delete task.
|
||||
@@ -199,6 +212,13 @@ pub(crate) struct UploadTask {
|
||||
pub(crate) op: UploadOp,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct Delete {
|
||||
pub(crate) file_kind: RemoteOpFileKind,
|
||||
pub(crate) layer_file_name: LayerFileName,
|
||||
pub(crate) scheduled_from_timeline_delete: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) enum UploadOp {
|
||||
/// Upload a layer file
|
||||
@@ -207,8 +227,8 @@ pub(crate) enum UploadOp {
|
||||
/// Upload the metadata file
|
||||
UploadMetadata(IndexPart, Lsn),
|
||||
|
||||
/// Delete a file.
|
||||
Delete(RemoteOpFileKind, LayerFileName),
|
||||
/// Delete a layer file
|
||||
Delete(Delete),
|
||||
|
||||
/// Barrier. When the barrier operation is reached,
|
||||
Barrier(tokio::sync::watch::Sender<()>),
|
||||
@@ -226,7 +246,12 @@ impl std::fmt::Display for UploadOp {
|
||||
)
|
||||
}
|
||||
UploadOp::UploadMetadata(_, lsn) => write!(f, "UploadMetadata(lsn: {})", lsn),
|
||||
UploadOp::Delete(_, path) => write!(f, "Delete({})", path.file_name()),
|
||||
UploadOp::Delete(delete) => write!(
|
||||
f,
|
||||
"Delete(path: {}, scheduled_from_timeline_delete: {})",
|
||||
delete.layer_file_name.file_name(),
|
||||
delete.scheduled_from_timeline_delete
|
||||
),
|
||||
UploadOp::Barrier(_) => write!(f, "Barrier"),
|
||||
}
|
||||
}
|
||||
|
||||
26
pgxn/hnsw/Makefile
Normal file
@@ -0,0 +1,26 @@
|
||||
EXTENSION = hnsw
|
||||
EXTVERSION = 0.1.0
|
||||
|
||||
MODULE_big = hnsw
|
||||
DATA = $(wildcard *--*.sql)
|
||||
OBJS = hnsw.o hnswalg.o
|
||||
|
||||
TESTS = $(wildcard test/sql/*.sql)
|
||||
REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS))
|
||||
REGRESS_OPTS = --inputdir=test --load-extension=hnsw
|
||||
|
||||
# For auto-vectorization:
|
||||
# - GCC (needs -ftree-vectorize OR -O3) - https://gcc.gnu.org/projects/tree-ssa/vectorization.html
|
||||
PG_CFLAGS += -O3
|
||||
PG_CXXFLAGS += -O3 -std=c++11
|
||||
PG_LDFLAGS += -lstdc++
|
||||
|
||||
all: $(EXTENSION)--$(EXTVERSION).sql
|
||||
|
||||
PG_CONFIG ?= pg_config
|
||||
PGXS := $(shell $(PG_CONFIG) --pgxs)
|
||||
include $(PGXS)
|
||||
|
||||
dist:
|
||||
mkdir -p dist
|
||||
git archive --format zip --prefix=$(EXTENSION)-$(EXTVERSION)/ --output dist/$(EXTENSION)-$(EXTVERSION).zip master
|
||||
25
pgxn/hnsw/README.md
Normal file
@@ -0,0 +1,25 @@
|
||||
# Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors
|
||||
|
||||
This ANN extension of Postgres is based
|
||||
on the [ivf-hnsw](https://github.com/dbaranchuk/ivf-hnsw.git) implementation of [HNSW](https://www.pinecone.io/learn/hnsw),
|
||||
the code for the current state-of-the-art billion-scale nearest neighbor search system presented in the paper:
|
||||
|
||||
[Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors](http://openaccess.thecvf.com/content_ECCV_2018/html/Dmitry_Baranchuk_Revisiting_the_Inverted_ECCV_2018_paper.html),
|
||||
<br>
|
||||
Dmitry Baranchuk, Artem Babenko, Yury Malkov
|
||||
|
||||
# Postgres extension
|
||||
|
||||
The HNSW index is held in memory (built on demand) and its maximal size is limited
by the `maxelements` index parameter. Another required parameter is the number of dimensions (if it is not specified in the column type).
The optional parameter `ef` specifies the number of neighbors considered during index construction and search (corresponding to the `efConstruction` and `efSearch` parameters
described in the article).
|
||||
|
||||
# Example of usage:
|
||||
|
||||
```
|
||||
create extension hnsw;
|
||||
create table embeddings(id integer primary key, payload real[]);
|
||||
create index on embeddings using hnsw(payload) with (maxelements=1000000, dims=100, m=32);
|
||||
select id from embeddings order by payload <-> array[1.0, 2.0,...] limit 100;
|
||||
```
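
Besides `maxelements` and `dims`, the `m`, `efconstruction` and `efsearch` reloptions registered in `_PG_init` can also be set at index creation time. A minimal sketch, reusing the `embeddings` table above (the values are illustrative only, not tuned recommendations):

```
-- defaults from _PG_init: m = 100, efconstruction = 16, efsearch = 64;
-- maxelements and dims have no usable defaults and must be provided
create index on embeddings using hnsw(payload)
    with (maxelements=1000000, dims=100, m=32, efconstruction=64, efsearch=128);
```

Since `hnsw_gettuple` reads `efsearch` from the index reloptions at scan time, it should be adjustable later via `ALTER INDEX ... SET (efsearch = ...)` without a rebuild; that is an assumption drawn from the code rather than a documented guarantee.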
|
||||
29
pgxn/hnsw/hnsw--0.1.0.sql
Normal file
@@ -0,0 +1,29 @@
|
||||
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
|
||||
\echo Use "CREATE EXTENSION hnsw" to load this file. \quit
|
||||
|
||||
-- functions
|
||||
|
||||
CREATE FUNCTION l2_distance(real[], real[]) RETURNS real
|
||||
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
|
||||
|
||||
-- operators
|
||||
|
||||
CREATE OPERATOR <-> (
|
||||
LEFTARG = real[], RIGHTARG = real[], PROCEDURE = l2_distance,
|
||||
COMMUTATOR = '<->'
|
||||
);
|
||||
|
||||
-- access method
|
||||
|
||||
CREATE FUNCTION hnsw_handler(internal) RETURNS index_am_handler
|
||||
AS 'MODULE_PATHNAME' LANGUAGE C;
|
||||
|
||||
CREATE ACCESS METHOD hnsw TYPE INDEX HANDLER hnsw_handler;
|
||||
|
||||
COMMENT ON ACCESS METHOD hnsw IS 'hnsw index access method';
|
||||
|
||||
-- opclasses
|
||||
|
||||
CREATE OPERATOR CLASS knn_ops
|
||||
DEFAULT FOR TYPE real[] USING hnsw AS
|
||||
OPERATOR 1 <-> (real[], real[]) FOR ORDER BY float_ops;
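
To make the wiring above concrete, here is a small usage sketch (it assumes the `t(val real[])` table from the regression test further below; the expected distance is computed by hand):

```
-- <-> is just l2_distance(), so it can be called directly as a sanity check:
select l2_distance(array[0,0,0]::real[], array[1,2,3]::real[]);  -- sqrt(14) ~= 3.742
-- the opclass registers <-> only FOR ORDER BY, so the hnsw index is used for
-- nearest-neighbour ordering rather than for WHERE-clause filtering:
select val from t order by val <-> array[3,3,3] limit 2;
```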
|
||||
551
pgxn/hnsw/hnsw.c
Normal file
@@ -0,0 +1,551 @@
|
||||
#include "postgres.h"
|
||||
|
||||
#include "access/amapi.h"
|
||||
#include "access/generic_xlog.h"
|
||||
#include "access/relation.h"
|
||||
#include "access/reloptions.h"
|
||||
#include "access/tableam.h"
|
||||
#include "catalog/index.h"
|
||||
#include "commands/vacuum.h"
|
||||
#include "nodes/execnodes.h"
|
||||
#include "storage/bufmgr.h"
|
||||
#include "utils/guc.h"
|
||||
#include "utils/selfuncs.h"
|
||||
|
||||
#include <math.h>
|
||||
#include <float.h>
|
||||
|
||||
#include "hnsw.h"
|
||||
|
||||
PG_MODULE_MAGIC;
|
||||
|
||||
typedef struct {
|
||||
int32 vl_len_; /* varlena header (do not touch directly!) */
|
||||
int dims;
|
||||
int maxelements;
|
||||
int efConstruction;
|
||||
int efSearch;
|
||||
int M;
|
||||
} HnswOptions;
|
||||
|
||||
static relopt_kind hnsw_relopt_kind;
|
||||
|
||||
typedef struct {
|
||||
HierarchicalNSW* hnsw;
|
||||
size_t curr;
|
||||
size_t n_results;
|
||||
ItemPointer results;
|
||||
} HnswScanOpaqueData;
|
||||
|
||||
typedef HnswScanOpaqueData* HnswScanOpaque;
|
||||
|
||||
typedef struct {
|
||||
Oid relid;
|
||||
uint32 status;
|
||||
HierarchicalNSW* hnsw;
|
||||
} HnswHashEntry;
|
||||
|
||||
|
||||
#define SH_PREFIX hnsw_index
|
||||
#define SH_ELEMENT_TYPE HnswHashEntry
|
||||
#define SH_KEY_TYPE Oid
|
||||
#define SH_KEY relid
|
||||
#define SH_STORE_HASH
|
||||
#define SH_GET_HASH(tb, a) ((a)->relid)
|
||||
#define SH_HASH_KEY(tb, key) (key)
|
||||
#define SH_EQUAL(tb, a, b) ((a) == (b))
|
||||
#define SH_SCOPE static inline
|
||||
#define SH_DEFINE
|
||||
#define SH_DECLARE
|
||||
#include "lib/simplehash.h"
|
||||
|
||||
#define INDEX_HASH_SIZE 11
|
||||
|
||||
#define DEFAULT_EF_SEARCH 64
|
||||
|
||||
PGDLLEXPORT void _PG_init(void);
|
||||
|
||||
static hnsw_index_hash *hnsw_indexes;
|
||||
|
||||
/*
|
||||
* Initialize index options and variables
|
||||
*/
|
||||
void
|
||||
_PG_init(void)
|
||||
{
|
||||
hnsw_relopt_kind = add_reloption_kind();
|
||||
add_int_reloption(hnsw_relopt_kind, "dims", "Number of dimensions",
|
||||
0, 0, INT_MAX, AccessExclusiveLock);
|
||||
add_int_reloption(hnsw_relopt_kind, "maxelements", "Maximal number of elements",
|
||||
0, 0, INT_MAX, AccessExclusiveLock);
|
||||
add_int_reloption(hnsw_relopt_kind, "m", "Number of neighbors of each vertex",
|
||||
100, 0, INT_MAX, AccessExclusiveLock);
|
||||
add_int_reloption(hnsw_relopt_kind, "efconstruction", "Number of inspected neighbors during index construction",
|
||||
16, 1, INT_MAX, AccessExclusiveLock);
|
||||
add_int_reloption(hnsw_relopt_kind, "efsearch", "Number of inspected neighbors during index search",
|
||||
64, 1, INT_MAX, AccessExclusiveLock);
|
||||
hnsw_indexes = hnsw_index_create(TopMemoryContext, INDEX_HASH_SIZE, NULL);
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
hnsw_build_callback(Relation index, ItemPointer tid, Datum *values,
|
||||
bool *isnull, bool tupleIsAlive, void *state)
|
||||
{
|
||||
HierarchicalNSW* hnsw = (HierarchicalNSW*) state;
|
||||
ArrayType* array;
|
||||
int n_items;
|
||||
label_t label = 0;
|
||||
|
||||
/* Skip nulls */
|
||||
if (isnull[0])
|
||||
return;
|
||||
|
||||
array = DatumGetArrayTypeP(values[0]);
|
||||
n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
|
||||
if (n_items != hnsw_dimensions(hnsw))
|
||||
{
|
||||
elog(ERROR, "Wrong number of dimensions: %d instead of %d expected",
|
||||
n_items, hnsw_dimensions(hnsw));
|
||||
}
|
||||
|
||||
memcpy(&label, tid, sizeof(*tid));
|
||||
hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label);
|
||||
}
|
||||
|
||||
static void
|
||||
hnsw_populate(HierarchicalNSW* hnsw, Relation indexRel, Relation heapRel)
|
||||
{
|
||||
IndexInfo* indexInfo = BuildIndexInfo(indexRel);
|
||||
Assert(indexInfo->ii_NumIndexAttrs == 1);
|
||||
table_index_build_scan(heapRel, indexRel, indexInfo,
|
||||
true, true, hnsw_build_callback, (void *) hnsw, NULL);
|
||||
}
|
||||
|
||||
static HierarchicalNSW*
|
||||
hnsw_get_index(Relation indexRel, Relation heapRel)
|
||||
{
|
||||
HierarchicalNSW* hnsw;
|
||||
Oid indexoid = RelationGetRelid(indexRel);
|
||||
HnswHashEntry* entry = hnsw_index_lookup(hnsw_indexes, indexoid);
|
||||
if (entry == NULL)
|
||||
{
|
||||
size_t dims, maxelements;
|
||||
size_t M;
|
||||
size_t maxM;
|
||||
size_t size_links_level0;
|
||||
size_t size_data_per_element;
|
||||
size_t data_size;
|
||||
dsm_handle handle = indexoid << 1; /* make it even */
|
||||
void* impl_private = NULL;
|
||||
void* mapped_address = NULL;
|
||||
Size mapped_size = 0;
|
||||
Size shmem_size;
|
||||
bool exists = true;
|
||||
bool found;
|
||||
HnswOptions *opts = (HnswOptions *) indexRel->rd_options;
|
||||
if (opts == NULL || opts->maxelements == 0 || opts->dims == 0) {
|
||||
elog(ERROR, "HNSW index requires 'maxelements' and 'dims' to be specified");
|
||||
}
|
||||
dims = opts->dims;
|
||||
maxelements = opts->maxelements;
|
||||
M = opts->M;
|
||||
maxM = M * 2;
|
||||
data_size = dims * sizeof(coord_t);
|
||||
size_links_level0 = (maxM + 1) * sizeof(idx_t);
|
||||
size_data_per_element = size_links_level0 + data_size + sizeof(label_t);
|
||||
shmem_size = hnsw_sizeof() + maxelements * size_data_per_element;
|
||||
|
||||
/* first try to attach to an existing index */
|
||||
if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private,
|
||||
&mapped_address, &mapped_size, DEBUG1))
|
||||
{
|
||||
/* index doesn't exist: try to create it */
|
||||
if (!dsm_impl_op(DSM_OP_CREATE, handle, shmem_size, &impl_private,
|
||||
&mapped_address, &mapped_size, DEBUG1))
|
||||
{
|
||||
/* We can do it under shared lock, so some other backend may
|
||||
* try to initialize the index. If creation failed because the index was already
|
||||
* created by somebody else, then try to attach to it once again
|
||||
*/
|
||||
if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private,
|
||||
&mapped_address, &mapped_size, ERROR))
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
exists = false;
|
||||
}
|
||||
}
|
||||
Assert(mapped_size == shmem_size);
|
||||
hnsw = (HierarchicalNSW*)mapped_address;
|
||||
|
||||
if (!exists)
|
||||
{
|
||||
hnsw_init(hnsw, dims, maxelements, M, maxM, opts->efConstruction);
|
||||
hnsw_populate(hnsw, indexRel, heapRel);
|
||||
}
|
||||
entry = hnsw_index_insert(hnsw_indexes, indexoid, &found);
|
||||
Assert(!found);
|
||||
entry->hnsw = hnsw;
|
||||
}
|
||||
else
|
||||
{
|
||||
hnsw = entry->hnsw;
|
||||
}
|
||||
return hnsw;
|
||||
}
|
||||
|
||||
/*
|
||||
* Start or restart an index scan
|
||||
*/
|
||||
static IndexScanDesc
|
||||
hnsw_beginscan(Relation index, int nkeys, int norderbys)
|
||||
{
|
||||
IndexScanDesc scan = RelationGetIndexScan(index, nkeys, norderbys);
|
||||
HnswScanOpaque so = (HnswScanOpaque) palloc(sizeof(HnswScanOpaqueData));
|
||||
Relation heap = relation_open(index->rd_index->indrelid, NoLock);
|
||||
so->hnsw = hnsw_get_index(index, heap);
|
||||
relation_close(heap, NoLock);
|
||||
so->curr = 0;
|
||||
so->n_results = 0;
|
||||
so->results = NULL;
|
||||
scan->opaque = so;
|
||||
return scan;
|
||||
}
|
||||
|
||||
/*
|
||||
* Start or restart an index scan
|
||||
*/
|
||||
static void
|
||||
hnsw_rescan(IndexScanDesc scan, ScanKey keys, int nkeys, ScanKey orderbys, int norderbys)
|
||||
{
|
||||
HnswScanOpaque so = (HnswScanOpaque) scan->opaque;
|
||||
if (so->results)
|
||||
{
|
||||
pfree(so->results);
|
||||
so->results = NULL;
|
||||
}
|
||||
so->curr = 0;
|
||||
if (orderbys && scan->numberOfOrderBys > 0)
|
||||
memmove(scan->orderByData, orderbys, scan->numberOfOrderBys * sizeof(ScanKeyData));
|
||||
}
|
||||
|
||||
/*
|
||||
* Fetch the next tuple in the given scan
|
||||
*/
|
||||
static bool
|
||||
hnsw_gettuple(IndexScanDesc scan, ScanDirection dir)
|
||||
{
|
||||
HnswScanOpaque so = (HnswScanOpaque) scan->opaque;
|
||||
|
||||
/*
|
||||
* Index can be used to scan backward, but Postgres doesn't support
|
||||
* backward scan on operators
|
||||
*/
|
||||
Assert(ScanDirectionIsForward(dir));
|
||||
|
||||
if (so->curr == 0)
|
||||
{
|
||||
Datum value;
|
||||
ArrayType* array;
|
||||
int n_items;
|
||||
size_t n_results;
|
||||
label_t* results;
|
||||
HnswOptions *opts = (HnswOptions *) scan->indexRelation->rd_options;
|
||||
size_t efSearch = opts ? opts->efSearch : DEFAULT_EF_SEARCH;
|
||||
|
||||
/* Safety check */
|
||||
if (scan->orderByData == NULL)
|
||||
elog(ERROR, "cannot scan HNSW index without order");
|
||||
|
||||
/* No items will match if null */
|
||||
if (scan->orderByData->sk_flags & SK_ISNULL)
|
||||
return false;
|
||||
|
||||
value = scan->orderByData->sk_argument;
|
||||
array = DatumGetArrayTypeP(value);
|
||||
n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
|
||||
if (n_items != hnsw_dimensions(so->hnsw))
|
||||
{
|
||||
elog(ERROR, "Wrong number of dimensions: %d instead of %d expected",
|
||||
n_items, hnsw_dimensions(so->hnsw));
|
||||
}
|
||||
|
||||
if (!hnsw_search(so->hnsw, (coord_t*)ARR_DATA_PTR(array), efSearch, &n_results, &results))
|
||||
elog(ERROR, "HNSW index search failed");
|
||||
so->results = (ItemPointer)palloc(n_results*sizeof(ItemPointerData));
|
||||
so->n_results = n_results;
|
||||
for (size_t i = 0; i < n_results; i++)
|
||||
{
|
||||
memcpy(&so->results[i], &results[i], sizeof(so->results[i]));
|
||||
}
|
||||
free(results);
|
||||
}
|
||||
if (so->curr >= so->n_results)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
else
|
||||
{
|
||||
scan->xs_heaptid = so->results[so->curr++];
|
||||
scan->xs_recheckorderby = false;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* End a scan and release resources
|
||||
*/
|
||||
static void
|
||||
hnsw_endscan(IndexScanDesc scan)
|
||||
{
|
||||
HnswScanOpaque so = (HnswScanOpaque) scan->opaque;
|
||||
if (so->results)
|
||||
pfree(so->results);
|
||||
pfree(so);
|
||||
scan->opaque = NULL;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Estimate the cost of an index scan
|
||||
*/
|
||||
static void
|
||||
hnsw_costestimate(PlannerInfo *root, IndexPath *path, double loop_count,
|
||||
Cost *indexStartupCost, Cost *indexTotalCost,
|
||||
Selectivity *indexSelectivity, double *indexCorrelation
|
||||
,double *indexPages
|
||||
)
|
||||
{
|
||||
GenericCosts costs;
|
||||
|
||||
/* Never use index without order */
|
||||
if (path->indexorderbys == NULL)
|
||||
{
|
||||
*indexStartupCost = DBL_MAX;
|
||||
*indexTotalCost = DBL_MAX;
|
||||
*indexSelectivity = 0;
|
||||
*indexCorrelation = 0;
|
||||
*indexPages = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
MemSet(&costs, 0, sizeof(costs));
|
||||
|
||||
genericcostestimate(root, path, loop_count, &costs);
|
||||
|
||||
/* Startup cost and total cost are same */
|
||||
*indexStartupCost = costs.indexTotalCost;
|
||||
*indexTotalCost = costs.indexTotalCost;
|
||||
*indexSelectivity = costs.indexSelectivity;
|
||||
*indexCorrelation = costs.indexCorrelation;
|
||||
*indexPages = costs.numIndexPages;
|
||||
}
|
||||
|
||||
/*
|
||||
* Parse and validate the reloptions
|
||||
*/
|
||||
static bytea *
|
||||
hnsw_options(Datum reloptions, bool validate)
|
||||
{
|
||||
static const relopt_parse_elt tab[] = {
|
||||
{"dims", RELOPT_TYPE_INT, offsetof(HnswOptions, dims)},
|
||||
{"maxelements", RELOPT_TYPE_INT, offsetof(HnswOptions, maxelements)},
|
||||
{"efconstruction", RELOPT_TYPE_INT, offsetof(HnswOptions, efConstruction)},
|
||||
{"efsearch", RELOPT_TYPE_INT, offsetof(HnswOptions, efSearch)},
|
||||
{"m", RELOPT_TYPE_INT, offsetof(HnswOptions, M)}
|
||||
};
|
||||
|
||||
return (bytea *) build_reloptions(reloptions, validate,
|
||||
hnsw_relopt_kind,
|
||||
sizeof(HnswOptions),
|
||||
tab, lengthof(tab));
|
||||
}
|
||||
|
||||
/*
|
||||
* Validate catalog entries for the specified operator class
|
||||
*/
|
||||
static bool
|
||||
hnsw_validate(Oid opclassoid)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Build the index for a logged table
|
||||
*/
|
||||
static IndexBuildResult *
|
||||
hnsw_build(Relation heap, Relation index, IndexInfo *indexInfo)
|
||||
{
|
||||
HierarchicalNSW* hnsw = hnsw_get_index(index, heap);
|
||||
IndexBuildResult* result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
|
||||
result->heap_tuples = result->index_tuples = hnsw_count(hnsw);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
|
||||
* Insert a tuple into the index
|
||||
*/
|
||||
static bool
|
||||
hnsw_insert(Relation index, Datum *values, bool *isnull, ItemPointer heap_tid,
|
||||
Relation heap, IndexUniqueCheck checkUnique,
|
||||
bool indexUnchanged,
|
||||
IndexInfo *indexInfo)
|
||||
{
|
||||
HierarchicalNSW* hnsw = hnsw_get_index(index, heap);
|
||||
Datum value;
|
||||
ArrayType* array;
|
||||
int n_items;
|
||||
label_t label = 0;
|
||||
|
||||
/* Skip nulls */
|
||||
if (isnull[0])
|
||||
return false;
|
||||
|
||||
/* Detoast value */
|
||||
value = PointerGetDatum(PG_DETOAST_DATUM(values[0]));
|
||||
array = DatumGetArrayTypeP(value);
|
||||
n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
|
||||
if (n_items != hnsw_dimensions(hnsw))
|
||||
{
|
||||
elog(ERROR, "Wrong number of dimensions: %d instead of %d expected",
|
||||
n_items, hnsw_dimensions(hnsw));
|
||||
}
|
||||
memcpy(&label, heap_tid, sizeof(*heap_tid));
|
||||
if (!hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label))
|
||||
elog(ERROR, "HNSW index insert failed");
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Build the index for an unlogged table
|
||||
*/
|
||||
static void
|
||||
hnsw_buildempty(Relation index)
|
||||
{
|
||||
/* index will be constructed on demand when accessed */
|
||||
}
|
||||
|
||||
/*
|
||||
* Clean up after a VACUUM operation
|
||||
*/
|
||||
static IndexBulkDeleteResult *
|
||||
hnsw_vacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
|
||||
{
|
||||
Relation rel = info->index;
|
||||
|
||||
if (stats == NULL)
|
||||
return NULL;
|
||||
|
||||
stats->num_pages = RelationGetNumberOfBlocks(rel);
|
||||
|
||||
return stats;
|
||||
}
|
||||
|
||||
/*
|
||||
* Bulk delete tuples from the index
|
||||
*/
|
||||
static IndexBulkDeleteResult *
|
||||
hnsw_bulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
|
||||
IndexBulkDeleteCallback callback, void *callback_state)
|
||||
{
|
||||
if (stats == NULL)
|
||||
stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
|
||||
return stats;
|
||||
}
|
||||
|
||||
/*
|
||||
* Define index handler
|
||||
*
|
||||
* See https://www.postgresql.org/docs/current/index-api.html
|
||||
*/
|
||||
PGDLLEXPORT PG_FUNCTION_INFO_V1(hnsw_handler);
|
||||
Datum
|
||||
hnsw_handler(PG_FUNCTION_ARGS)
|
||||
{
|
||||
IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
|
||||
|
||||
amroutine->amstrategies = 0;
|
||||
amroutine->amsupport = 0;
|
||||
amroutine->amoptsprocnum = 0;
|
||||
amroutine->amcanorder = false;
|
||||
amroutine->amcanorderbyop = true;
|
||||
amroutine->amcanbackward = false; /* can change direction mid-scan */
|
||||
amroutine->amcanunique = false;
|
||||
amroutine->amcanmulticol = false;
|
||||
amroutine->amoptionalkey = true;
|
||||
amroutine->amsearcharray = false;
|
||||
amroutine->amsearchnulls = false;
|
||||
amroutine->amstorage = false;
|
||||
amroutine->amclusterable = false;
|
||||
amroutine->ampredlocks = false;
|
||||
amroutine->amcanparallel = false;
|
||||
amroutine->amcaninclude = false;
|
||||
amroutine->amusemaintenanceworkmem = false; /* not used during VACUUM */
|
||||
amroutine->amparallelvacuumoptions = VACUUM_OPTION_PARALLEL_BULKDEL;
|
||||
amroutine->amkeytype = InvalidOid;
|
||||
|
||||
/* Interface functions */
|
||||
amroutine->ambuild = hnsw_build;
|
||||
amroutine->ambuildempty = hnsw_buildempty;
|
||||
amroutine->aminsert = hnsw_insert;
|
||||
amroutine->ambulkdelete = hnsw_bulkdelete;
|
||||
amroutine->amvacuumcleanup = hnsw_vacuumcleanup;
|
||||
amroutine->amcanreturn = NULL; /* tuple not included in heapsort */
|
||||
amroutine->amcostestimate = hnsw_costestimate;
|
||||
amroutine->amoptions = hnsw_options;
|
||||
amroutine->amproperty = NULL; /* TODO AMPROP_DISTANCE_ORDERABLE */
|
||||
amroutine->ambuildphasename = NULL;
|
||||
amroutine->amvalidate = hnsw_validate;
|
||||
amroutine->amadjustmembers = NULL;
|
||||
amroutine->ambeginscan = hnsw_beginscan;
|
||||
amroutine->amrescan = hnsw_rescan;
|
||||
amroutine->amgettuple = hnsw_gettuple;
|
||||
amroutine->amgetbitmap = NULL;
|
||||
amroutine->amendscan = hnsw_endscan;
|
||||
amroutine->ammarkpos = NULL;
|
||||
amroutine->amrestrpos = NULL;
|
||||
|
||||
/* Interface functions to support parallel index scans */
|
||||
amroutine->amestimateparallelscan = NULL;
|
||||
amroutine->aminitparallelscan = NULL;
|
||||
amroutine->amparallelrescan = NULL;
|
||||
|
||||
PG_RETURN_POINTER(amroutine);
|
||||
}
|
||||
|
||||
/*
|
||||
* Get the L2 distance between vectors
|
||||
*/
|
||||
PGDLLEXPORT PG_FUNCTION_INFO_V1(l2_distance);
|
||||
Datum
|
||||
l2_distance(PG_FUNCTION_ARGS)
|
||||
{
|
||||
ArrayType *a = PG_GETARG_ARRAYTYPE_P(0);
|
||||
ArrayType *b = PG_GETARG_ARRAYTYPE_P(1);
|
||||
int a_dim = ArrayGetNItems(ARR_NDIM(a), ARR_DIMS(a));
|
||||
int b_dim = ArrayGetNItems(ARR_NDIM(b), ARR_DIMS(b));
|
||||
dist_t distance = 0.0;
|
||||
dist_t diff;
|
||||
coord_t *ax = (coord_t*)ARR_DATA_PTR(a);
|
||||
coord_t *bx = (coord_t*)ARR_DATA_PTR(b);
|
||||
|
||||
if (a_dim != b_dim)
|
||||
{
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_DATA_EXCEPTION),
|
||||
errmsg("different array dimensions %d and %d", a_dim, b_dim)));
|
||||
}
|
||||
|
||||
for (int i = 0; i < a_dim; i++)
|
||||
{
|
||||
diff = ax[i] - bx[i];
|
||||
distance += diff * diff;
|
||||
}
|
||||
|
||||
PG_RETURN_FLOAT4((dist_t)sqrt(distance));
|
||||
}
|
||||
5
pgxn/hnsw/hnsw.control
Normal file
@@ -0,0 +1,5 @@
|
||||
comment = 'HNSW index'
|
||||
default_version = '0.1.0'
|
||||
module_pathname = '$libdir/hnsw'
|
||||
relocatable = true
|
||||
trusted = true
|
||||
15
pgxn/hnsw/hnsw.h
Normal file
@@ -0,0 +1,15 @@
|
||||
#pragma once
|
||||
|
||||
typedef float coord_t;
|
||||
typedef float dist_t;
|
||||
typedef uint32_t idx_t;
|
||||
typedef uint64_t label_t;
|
||||
|
||||
typedef struct HierarchicalNSW HierarchicalNSW;
|
||||
|
||||
bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results);
|
||||
bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label);
|
||||
void hnsw_init(HierarchicalNSW* hnsw, size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction);
|
||||
int hnsw_dimensions(HierarchicalNSW* hnsw);
|
||||
size_t hnsw_count(HierarchicalNSW* hnsw);
|
||||
size_t hnsw_sizeof(void);
|
||||
379
pgxn/hnsw/hnswalg.cpp
Normal file
@@ -0,0 +1,379 @@
|
||||
#include "hnswalg.h"
|
||||
|
||||
#if defined(__GNUC__)
|
||||
#define PORTABLE_ALIGN32 __attribute__((aligned(32)))
|
||||
#define PREFETCH(addr,hint) __builtin_prefetch(addr, 0, hint)
|
||||
#else
|
||||
#define PORTABLE_ALIGN32 __declspec(align(32))
|
||||
#define PREFETCH(addr,hint)
|
||||
#endif
|
||||
|
||||
HierarchicalNSW::HierarchicalNSW(size_t dim_, size_t maxelements_, size_t M_, size_t maxM_, size_t efConstruction_)
|
||||
{
|
||||
dim = dim_;
|
||||
data_size = dim * sizeof(coord_t);
|
||||
|
||||
efConstruction = efConstruction_;
|
||||
|
||||
maxelements = maxelements_;
|
||||
M = M_;
|
||||
maxM = maxM_;
|
||||
size_links_level0 = (maxM + 1) * sizeof(idx_t);
|
||||
size_data_per_element = size_links_level0 + data_size + sizeof(label_t);
|
||||
offset_data = size_links_level0;
|
||||
offset_label = offset_data + data_size;
|
||||
|
||||
enterpoint_node = 0;
|
||||
cur_element_count = 0;
|
||||
#ifdef __x86_64__
|
||||
use_avx2 = __builtin_cpu_supports("avx2");
|
||||
#endif
|
||||
}
|
||||
|
||||
std::priority_queue<std::pair<dist_t, idx_t>> HierarchicalNSW::searchBaseLayer(const coord_t *point, size_t ef)
|
||||
{
|
||||
std::vector<uint32_t> visited;
|
||||
visited.resize((cur_element_count + 31) >> 5);
|
||||
|
||||
std::priority_queue<std::pair<dist_t, idx_t >> topResults;
|
||||
std::priority_queue<std::pair<dist_t, idx_t >> candidateSet;
|
||||
|
||||
dist_t dist = fstdistfunc(point, getDataByInternalId(enterpoint_node));
|
||||
|
||||
topResults.emplace(dist, enterpoint_node);
|
||||
candidateSet.emplace(-dist, enterpoint_node);
|
||||
visited[enterpoint_node >> 5] = 1 << (enterpoint_node & 31);
|
||||
dist_t lowerBound = dist;
|
||||
|
||||
while (!candidateSet.empty())
|
||||
{
|
||||
std::pair<dist_t, idx_t> curr_el_pair = candidateSet.top();
|
||||
if (-curr_el_pair.first > lowerBound)
|
||||
break;
|
||||
|
||||
candidateSet.pop();
|
||||
idx_t curNodeNum = curr_el_pair.second;
|
||||
|
||||
idx_t* data = get_linklist0(curNodeNum);
|
||||
size_t size = *data++;
|
||||
|
||||
PREFETCH(getDataByInternalId(*data), 0);
|
||||
|
||||
for (size_t j = 0; j < size; ++j) {
|
||||
size_t tnum = *(data + j);
|
||||
|
||||
PREFETCH(getDataByInternalId(*(data + j + 1)), 0);
|
||||
|
||||
if (!(visited[tnum >> 5] & (1 << (tnum & 31)))) {
|
||||
visited[tnum >> 5] |= 1 << (tnum & 31);
|
||||
|
||||
dist = fstdistfunc(point, getDataByInternalId(tnum));
|
||||
|
||||
if (topResults.top().first > dist || topResults.size() < ef) {
|
||||
candidateSet.emplace(-dist, tnum);
|
||||
|
||||
PREFETCH(get_linklist0(candidateSet.top().second), 0);
|
||||
topResults.emplace(dist, tnum);
|
||||
|
||||
if (topResults.size() > ef)
|
||||
topResults.pop();
|
||||
|
||||
lowerBound = topResults.top().first;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return topResults;
|
||||
}
|
||||
|
||||
|
||||
void HierarchicalNSW::getNeighborsByHeuristic(std::priority_queue<std::pair<dist_t, idx_t>> &topResults, size_t NN)
|
||||
{
|
||||
if (topResults.size() < NN)
|
||||
return;
|
||||
|
||||
std::priority_queue<std::pair<dist_t, idx_t>> resultSet;
|
||||
std::vector<std::pair<dist_t, idx_t>> returnlist;
|
||||
|
||||
while (topResults.size() > 0) {
|
||||
resultSet.emplace(-topResults.top().first, topResults.top().second);
|
||||
topResults.pop();
|
||||
}
|
||||
|
||||
while (resultSet.size()) {
|
||||
if (returnlist.size() >= NN)
|
||||
break;
|
||||
std::pair<dist_t, idx_t> curen = resultSet.top();
|
||||
dist_t dist_to_query = -curen.first;
|
||||
resultSet.pop();
|
||||
bool good = true;
|
||||
for (std::pair<dist_t, idx_t> curen2 : returnlist) {
|
||||
dist_t curdist = fstdistfunc(getDataByInternalId(curen2.second),
|
||||
getDataByInternalId(curen.second));
|
||||
if (curdist < dist_to_query) {
|
||||
good = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (good) returnlist.push_back(curen);
|
||||
}
|
||||
for (std::pair<dist_t, idx_t> elem : returnlist)
|
||||
topResults.emplace(-elem.first, elem.second);
|
||||
}
|
||||
|
||||
void HierarchicalNSW::mutuallyConnectNewElement(const coord_t *point, idx_t cur_c,
|
||||
std::priority_queue<std::pair<dist_t, idx_t>> topResults)
|
||||
{
|
||||
getNeighborsByHeuristic(topResults, M);
|
||||
|
||||
std::vector<idx_t> res;
|
||||
res.reserve(M);
|
||||
while (topResults.size() > 0) {
|
||||
res.push_back(topResults.top().second);
|
||||
topResults.pop();
|
||||
}
|
||||
{
|
||||
idx_t* data = get_linklist0(cur_c);
|
||||
if (*data)
|
||||
throw std::runtime_error("Should be blank");
|
||||
|
||||
*data++ = res.size();
|
||||
|
||||
for (size_t idx = 0; idx < res.size(); idx++) {
|
||||
if (data[idx])
|
||||
throw std::runtime_error("Should be blank");
|
||||
data[idx] = res[idx];
|
||||
}
|
||||
}
|
||||
for (size_t idx = 0; idx < res.size(); idx++) {
|
||||
if (res[idx] == cur_c)
|
||||
throw std::runtime_error("Connection to the same element");
|
||||
|
||||
size_t resMmax = maxM;
|
||||
idx_t *ll_other = get_linklist0(res[idx]);
|
||||
idx_t sz_link_list_other = *ll_other;
|
||||
|
||||
if (sz_link_list_other > resMmax || sz_link_list_other < 0)
|
||||
throw std::runtime_error("Bad sz_link_list_other");
|
||||
|
||||
if (sz_link_list_other < resMmax) {
|
||||
idx_t *data = ll_other + 1;
|
||||
data[sz_link_list_other] = cur_c;
|
||||
*ll_other = sz_link_list_other + 1;
|
||||
} else {
|
||||
// finding the "weakest" element to replace it with the new one
|
||||
idx_t *data = ll_other + 1;
|
||||
dist_t d_max = fstdistfunc(getDataByInternalId(cur_c), getDataByInternalId(res[idx]));
|
||||
// Heuristic:
|
||||
std::priority_queue<std::pair<dist_t, idx_t>> candidates;
|
||||
candidates.emplace(d_max, cur_c);
|
||||
|
||||
for (size_t j = 0; j < sz_link_list_other; j++)
|
||||
candidates.emplace(fstdistfunc(getDataByInternalId(data[j]), getDataByInternalId(res[idx])), data[j]);
|
||||
|
||||
getNeighborsByHeuristic(candidates, resMmax);
|
||||
|
||||
size_t indx = 0;
|
||||
while (!candidates.empty()) {
|
||||
data[indx] = candidates.top().second;
|
||||
candidates.pop();
|
||||
indx++;
|
||||
}
|
||||
*ll_other = indx;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void HierarchicalNSW::addPoint(const coord_t *point, label_t label)
|
||||
{
|
||||
if (cur_element_count >= maxelements) {
|
||||
throw std::runtime_error("The number of elements exceeds the specified limit");
|
||||
}
|
||||
idx_t cur_c = cur_element_count++;
|
||||
memset((char *) get_linklist0(cur_c), 0, size_data_per_element);
|
||||
memcpy(getDataByInternalId(cur_c), point, data_size);
|
||||
memcpy(getExternalLabel(cur_c), &label, sizeof label);
|
||||
|
||||
// Do nothing for the first element
|
||||
if (cur_c != 0) {
|
||||
std::priority_queue <std::pair<dist_t, idx_t>> topResults = searchBaseLayer(point, efConstruction);
|
||||
mutuallyConnectNewElement(point, cur_c, topResults);
|
||||
}
|
||||
};
|
||||
|
||||
std::priority_queue<std::pair<dist_t, label_t>> HierarchicalNSW::searchKnn(const coord_t *query, size_t k)
|
||||
{
|
||||
std::priority_queue<std::pair<dist_t, label_t>> topResults;
|
||||
auto topCandidates = searchBaseLayer(query, k);
|
||||
while (topCandidates.size() > k) {
|
||||
topCandidates.pop();
|
||||
}
|
||||
while (!topCandidates.empty()) {
|
||||
std::pair<dist_t, idx_t> rez = topCandidates.top();
|
||||
label_t label;
|
||||
memcpy(&label, getExternalLabel(rez.second), sizeof(label));
|
||||
topResults.push(std::pair<dist_t, label_t>(rez.first, label));
|
||||
topCandidates.pop();
|
||||
}
|
||||
|
||||
return topResults;
|
||||
};
|
||||
|
||||
dist_t fstdistfunc_scalar(const coord_t *x, const coord_t *y, size_t n)
|
||||
{
|
||||
dist_t distance = 0.0;
|
||||
|
||||
for (size_t i = 0; i < n; i++)
|
||||
{
|
||||
dist_t diff = x[i] - y[i];
|
||||
distance += diff * diff;
|
||||
}
|
||||
return distance;
|
||||
|
||||
}
|
||||
|
||||
#ifdef __x86_64__
|
||||
#include <immintrin.h>
|
||||
|
||||
__attribute__((target("avx2")))
|
||||
dist_t fstdistfunc_avx2(const coord_t *x, const coord_t *y, size_t n)
|
||||
{
|
||||
const size_t TmpResSz = sizeof(__m256) / sizeof(float);
|
||||
float PORTABLE_ALIGN32 TmpRes[TmpResSz];
|
||||
size_t qty16 = n / 16;
|
||||
const float *pEnd1 = x + (qty16 * 16);
|
||||
__m256 diff, v1, v2;
|
||||
__m256 sum = _mm256_set1_ps(0);
|
||||
|
||||
while (x < pEnd1) {
|
||||
v1 = _mm256_loadu_ps(x);
|
||||
x += 8;
|
||||
v2 = _mm256_loadu_ps(y);
|
||||
y += 8;
|
||||
diff = _mm256_sub_ps(v1, v2);
|
||||
sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
|
||||
|
||||
v1 = _mm256_loadu_ps(x);
|
||||
x += 8;
|
||||
v2 = _mm256_loadu_ps(y);
|
||||
y += 8;
|
||||
diff = _mm256_sub_ps(v1, v2);
|
||||
sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
|
||||
}
|
||||
_mm256_store_ps(TmpRes, sum);
|
||||
float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];
|
||||
return (res);
|
||||
}
|
||||
|
||||
dist_t fstdistfunc_sse(const coord_t *x, const coord_t *y, size_t n)
|
||||
{
|
||||
const size_t TmpResSz = sizeof(__m128) / sizeof(float);
|
||||
float PORTABLE_ALIGN32 TmpRes[TmpResSz];
|
||||
size_t qty16 = n / 16;
|
||||
const float *pEnd1 = x + (qty16 * 16);
|
||||
|
||||
__m128 diff, v1, v2;
|
||||
__m128 sum = _mm_set1_ps(0);
|
||||
|
||||
while (x < pEnd1) {
|
||||
v1 = _mm_loadu_ps(x);
|
||||
x += 4;
|
||||
v2 = _mm_loadu_ps(y);
|
||||
y += 4;
|
||||
diff = _mm_sub_ps(v1, v2);
|
||||
sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
|
||||
|
||||
v1 = _mm_loadu_ps(x);
|
||||
x += 4;
|
||||
v2 = _mm_loadu_ps(y);
|
||||
y += 4;
|
||||
diff = _mm_sub_ps(v1, v2);
|
||||
sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
|
||||
|
||||
v1 = _mm_loadu_ps(x);
|
||||
x += 4;
|
||||
v2 = _mm_loadu_ps(y);
|
||||
y += 4;
|
||||
diff = _mm_sub_ps(v1, v2);
|
||||
sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
|
||||
|
||||
v1 = _mm_loadu_ps(x);
|
||||
x += 4;
|
||||
v2 = _mm_loadu_ps(y);
|
||||
y += 4;
|
||||
diff = _mm_sub_ps(v1, v2);
|
||||
sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
|
||||
}
|
||||
_mm_store_ps(TmpRes, sum);
|
||||
float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
|
||||
return res;
|
||||
}
|
||||
#endif
|
||||
|
||||
dist_t HierarchicalNSW::fstdistfunc(const coord_t *x, const coord_t *y)
|
||||
{
|
||||
#ifndef __x86_64__
|
||||
return fstdistfunc_scalar(x, y, dim);
|
||||
#else
|
||||
if(use_avx2)
|
||||
return fstdistfunc_avx2(x, y, dim);
|
||||
|
||||
return fstdistfunc_sse(x, y, dim);
|
||||
#endif
|
||||
}
|
||||
|
||||
bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results)
|
||||
{
|
||||
try
|
||||
{
|
||||
auto result = hnsw->searchKnn(point, efSearch);
|
||||
size_t nResults = result.size();
|
||||
*results = (label_t*)malloc(nResults*sizeof(label_t));
|
||||
for (size_t i = nResults; i-- != 0;)
|
||||
{
|
||||
(*results)[i] = result.top().second;
|
||||
result.pop();
|
||||
}
|
||||
*n_results = nResults;
|
||||
return true;
|
||||
}
|
||||
catch (std::exception& x)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label)
|
||||
{
|
||||
try
|
||||
{
|
||||
hnsw->addPoint(point, label);
|
||||
return true;
|
||||
}
|
||||
catch (std::exception& x)
|
||||
{
|
||||
fprintf(stderr, "Catch %s\n", x.what());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
void hnsw_init(HierarchicalNSW* hnsw, size_t dims, size_t maxelements, size_t M, size_t maxM, size_t efConstruction)
|
||||
{
|
||||
new ((void*)hnsw) HierarchicalNSW(dims, maxelements, M, maxM, efConstruction);
|
||||
}
|
||||
|
||||
|
||||
int hnsw_dimensions(HierarchicalNSW* hnsw)
|
||||
{
|
||||
return (int)hnsw->dim;
|
||||
}
|
||||
|
||||
size_t hnsw_count(HierarchicalNSW* hnsw)
|
||||
{
|
||||
return hnsw->cur_element_count;
|
||||
}
|
||||
|
||||
size_t hnsw_sizeof(void)
|
||||
{
|
||||
return sizeof(HierarchicalNSW);
|
||||
}
|
||||
69
pgxn/hnsw/hnswalg.h
Normal file
@@ -0,0 +1,69 @@
|
||||
#pragma once
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <map>
|
||||
#include <cmath>
|
||||
#include <queue>
|
||||
#include <stdexcept>
|
||||
|
||||
extern "C" {
|
||||
#include "hnsw.h"
|
||||
}
|
||||
|
||||
struct HierarchicalNSW
|
||||
{
|
||||
size_t maxelements;
|
||||
size_t cur_element_count;
|
||||
|
||||
idx_t enterpoint_node;
|
||||
|
||||
size_t dim;
|
||||
size_t data_size;
|
||||
size_t offset_data;
|
||||
size_t offset_label;
|
||||
size_t size_data_per_element;
|
||||
size_t M;
|
||||
size_t maxM;
|
||||
size_t size_links_level0;
|
||||
size_t efConstruction;
|
||||
|
||||
#ifdef __x86_64__
|
||||
bool use_avx2;
|
||||
#endif
|
||||
|
||||
char data_level0_memory[0]; // varying size
|
||||
|
||||
public:
|
||||
HierarchicalNSW(size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction);
|
||||
~HierarchicalNSW();
|
||||
|
||||
|
||||
inline coord_t *getDataByInternalId(idx_t internal_id) const {
|
||||
return (coord_t *)&data_level0_memory[internal_id * size_data_per_element + offset_data];
|
||||
}
|
||||
|
||||
inline idx_t *get_linklist0(idx_t internal_id) const {
|
||||
return (idx_t*)&data_level0_memory[internal_id * size_data_per_element];
|
||||
}
|
||||
|
||||
inline label_t *getExternalLabel(idx_t internal_id) const {
|
||||
return (label_t *)&data_level0_memory[internal_id * size_data_per_element + offset_label];
|
||||
}
|
||||
|
||||
std::priority_queue<std::pair<dist_t, idx_t>> searchBaseLayer(const coord_t *x, size_t ef);
|
||||
|
||||
void getNeighborsByHeuristic(std::priority_queue<std::pair<dist_t, idx_t>> &topResults, size_t NN);
|
||||
|
||||
void mutuallyConnectNewElement(const coord_t *x, idx_t id, std::priority_queue<std::pair<dist_t, idx_t>> topResults);
|
||||
|
||||
void addPoint(const coord_t *point, label_t label);
|
||||
|
||||
std::priority_queue<std::pair<dist_t, label_t>> searchKnn(const coord_t *query_data, size_t k);
|
||||
|
||||
dist_t fstdistfunc(const coord_t *x, const coord_t *y);
|
||||
};
|
||||
28
pgxn/hnsw/test/expected/knn.out
Normal file
@@ -0,0 +1,28 @@
|
||||
SET enable_seqscan = off;
|
||||
CREATE TABLE t (val real[]);
|
||||
INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL);
|
||||
CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3);
|
||||
INSERT INTO t (val) VALUES (array[1,2,4]);
|
||||
explain SELECT * FROM t ORDER BY val <-> array[3,3,3];
|
||||
QUERY PLAN
|
||||
--------------------------------------------------------------------
|
||||
Index Scan using t_val_idx on t (cost=4.02..8.06 rows=3 width=36)
|
||||
Order By: (val <-> '{3,3,3}'::real[])
|
||||
(2 rows)
|
||||
|
||||
SELECT * FROM t ORDER BY val <-> array[3,3,3];
|
||||
val
|
||||
---------
|
||||
{1,2,3}
|
||||
{1,2,4}
|
||||
{1,1,1}
|
||||
{0,0,0}
|
||||
(4 rows)
|
||||
|
||||
SELECT COUNT(*) FROM t;
|
||||
count
|
||||
-------
|
||||
5
|
||||
(1 row)
|
||||
|
||||
DROP TABLE t;
|
||||
13
pgxn/hnsw/test/sql/knn.sql
Normal file
@@ -0,0 +1,13 @@
|
||||
SET enable_seqscan = off;
|
||||
|
||||
CREATE TABLE t (val real[]);
|
||||
INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL);
|
||||
CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3);
|
||||
|
||||
INSERT INTO t (val) VALUES (array[1,2,4]);
|
||||
|
||||
explain SELECT * FROM t ORDER BY val <-> array[3,3,3];
|
||||
SELECT * FROM t ORDER BY val <-> array[3,3,3];
|
||||
SELECT COUNT(*) FROM t;
|
||||
|
||||
DROP TABLE t;
|
||||