mirror of https://github.com/neondatabase/neon.git
synced 2026-01-29 16:20:37 +00:00

Compare commits: release-69 ... remove-pos
59 Commits
| Author | SHA1 | Date |
|---|---|---|
|  | dc8ca6aaa1 |  |
|  | af50fd76b7 |  |
|  | da16233f64 |  |
|  | 80466bdca2 |  |
|  | 248558dee8 |  |
|  | 3bad52543f |  |
|  | 3d64a7ddcd |  |
|  | 25f1e5cfeb |  |
|  | 8dd555d396 |  |
|  | 01b6843e12 |  |
|  | 93987b5a4a |  |
|  | 33baca07b6 |  |
|  | 923974d4da |  |
|  | e7277885b3 |  |
|  | 80262e724f |  |
|  | 85b954f449 |  |
|  | 76328ada05 |  |
|  | b54b632c6a |  |
|  | 9909551f47 |  |
|  | 700b102b0f |  |
|  | dbadb0f9bb |  |
|  | 8297f7a181 |  |
|  | 2090e928d1 |  |
|  | 05eff3a67e |  |
|  | c6cf5e7c0f |  |
|  | e0c7f1ce15 |  |
|  | 6f5c262684 |  |
|  | 9768f09f6b |  |
|  | db900ae9d0 |  |
|  | 4d9036bf1f |  |
|  | b3bedda6fd |  |
|  | b782b11b33 |  |
|  | 5069123b6d |  |
|  | fb0406e9d2 |  |
|  | b8a311131e |  |
|  | d589498c6f |  |
|  | 6f34f97573 |  |
|  | b86432c29e |  |
|  | ac1205c14c |  |
|  | e3ff87ce3b |  |
|  | 0595320c87 |  |
|  | 92d5e0e87a |  |
|  | 3a3bd34a28 |  |
|  | 64949a37a9 |  |
|  | 6f8fcdf9ea |  |
|  | fcb55a2aa2 |  |
|  | f36cf3f885 |  |
|  | 8dca188974 |  |
|  | b7fa93f6b7 |  |
|  | 1e8e04bb2c |  |
|  | 94369af782 |  |
|  | 34b6bd416a |  |
|  | 49d5e56c08 |  |
|  | aca81f5fa4 |  |
|  | 2dcac94194 |  |
|  | ababa50cce |  |
|  | 163beaf9ad |  |
|  | 5b37485c99 |  |
|  | ed958da38a |  |
@@ -27,7 +27,7 @@ jobs:
     runs-on: [ self-hosted, us-east-2, x64 ]
     container:
-      image: neondatabase/build-tools:pinned
+      image: neondatabase/build-tools:pinned-bookworm
       credentials:
         username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
         password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
22 changes: .github/workflows/_build-and-test-locally.yml (vendored)
@@ -53,20 +53,6 @@ jobs:
       BUILD_TAG: ${{ inputs.build-tag }}

     steps:
-      - name: Fix git ownership
-        run: |
-          # Workaround for `fatal: detected dubious ownership in repository at ...`
-          #
-          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
-          # Ref https://github.com/actions/checkout/issues/785
-          #
-          git config --global --add safe.directory ${{ github.workspace }}
-          git config --global --add safe.directory ${GITHUB_WORKSPACE}
-          for r in 14 15 16 17; do
-            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
-            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
-          done
-
      - uses: actions/checkout@v4
        with:
          submodules: true
@@ -124,28 +110,28 @@ jobs:
         uses: actions/cache@v4
         with:
           path: pg_install/v14
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}

       - name: Cache postgres v15 build
         id: cache_pg_15
         uses: actions/cache@v4
         with:
           path: pg_install/v15
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}

       - name: Cache postgres v16 build
         id: cache_pg_16
         uses: actions/cache@v4
         with:
           path: pg_install/v16
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}

       - name: Cache postgres v17 build
         id: cache_pg_17
         uses: actions/cache@v4
         with:
           path: pg_install/v17
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}

       - name: Build postgres v14
         if: steps.cache_pg_14.outputs.cache-hit != 'true'
18 changes: .github/workflows/benchmarking.yml (vendored)
@@ -83,7 +83,7 @@ jobs:
     runs-on: ${{ matrix.RUNNER }}
     container:
-      image: neondatabase/build-tools:pinned
+      image: neondatabase/build-tools:pinned-bookworm
       credentials:
         username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
         password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -178,7 +178,7 @@ jobs:
     runs-on: [ self-hosted, us-east-2, x64 ]
     container:
-      image: neondatabase/build-tools:pinned
+      image: neondatabase/build-tools:pinned-bookworm
       credentials:
         username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
         password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -280,7 +280,7 @@ jobs:
           region_id_default=${{ env.DEFAULT_REGION_ID }}
           runner_default='["self-hosted", "us-east-2", "x64"]'
           runner_azure='["self-hosted", "eastus2", "x64"]'
-          image_default="neondatabase/build-tools:pinned"
+          image_default="neondatabase/build-tools:pinned-bookworm"
           matrix='{
             "pg_version" : [
               16
@@ -299,9 +299,9 @@ jobs:
             "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" },
                         { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
                         { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
-                        { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" },
-                        { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" },
-                        { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" },
+                        { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" },
+                        { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" },
+                        { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" },
                         { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]
           }'
@@ -665,7 +665,7 @@ jobs:
     runs-on: [ self-hosted, us-east-2, x64 ]
     container:
-      image: neondatabase/build-tools:pinned
+      image: neondatabase/build-tools:pinned-bookworm
       credentials:
         username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
         password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -772,7 +772,7 @@ jobs:
     runs-on: [ self-hosted, us-east-2, x64 ]
     container:
-      image: neondatabase/build-tools:pinned
+      image: neondatabase/build-tools:pinned-bookworm
       credentials:
         username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
         password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -877,7 +877,7 @@ jobs:
     runs-on: [ self-hosted, us-east-2, x64 ]
     container:
-      image: neondatabase/build-tools:pinned
+      image: neondatabase/build-tools:pinned-bookworm
       credentials:
         username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
         password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -82,7 +82,7 @@ jobs:

       - uses: docker/build-push-action@v6
         with:
-          file: Dockerfile.build-tools
+          file: build-tools.Dockerfile
           context: .
           provenance: false
           push: true
26 changes: .github/workflows/build_and_test.yml (vendored)
@@ -683,7 +683,7 @@ jobs:
           provenance: false
           push: true
           pull: true
-          file: compute/Dockerfile.compute-node
+          file: compute/compute-node.Dockerfile
           cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
           cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }}
           tags: |
@@ -703,7 +703,7 @@ jobs:
           provenance: false
           push: true
           pull: true
-          file: compute/Dockerfile.compute-node
+          file: compute/compute-node.Dockerfile
           target: neon-pg-ext-test
           cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
           cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }}
@@ -728,7 +728,7 @@ jobs:
           provenance: false
           push: true
           pull: true
-          file: compute/Dockerfile.compute-node
+          file: compute/compute-node.Dockerfile
           cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
           cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-tools-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }}
           tags: |
@@ -839,6 +839,7 @@ jobs:
       - name: Build vm image
         run: |
           ./vm-builder \
             -size=2G \
+            -spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \
             -src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
             -dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}
@@ -1078,20 +1079,6 @@ jobs:
     runs-on: [ self-hosted, small ]
     container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
     steps:
-      - name: Fix git ownership
-        run: |
-          # Workaround for `fatal: detected dubious ownership in repository at ...`
-          #
-          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
-          # Ref https://github.com/actions/checkout/issues/785
-          #
-          git config --global --add safe.directory ${{ github.workspace }}
-          git config --global --add safe.directory ${GITHUB_WORKSPACE}
-          for r in 14 15 16 17; do
-            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
-            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
-          done
-
      - uses: actions/checkout@v4

      - name: Trigger deploy workflow
@@ -1130,7 +1117,10 @@ jobs:
             gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \
               -f deployPgSniRouter=true \
               -f deployProxy=true \
+              -f deployProxyLink=true \
+              -f deployPrivatelinkProxy=true \
               -f deployProxyScram=true \
+              -f deployProxyAuthBroker=true \
               -f branch=main \
               -f dockerTag=${{needs.tag.outputs.build-tag}}
           else
@@ -31,7 +31,7 @@ jobs:
         id: get-build-tools-tag
         env:
           IMAGE_TAG: |
-            ${{ hashFiles('Dockerfile.build-tools',
+            ${{ hashFiles('build-tools.Dockerfile',
            '.github/workflows/check-build-tools-image.yml',
            '.github/workflows/build-build-tools-image.yml') }}
        run: |
2 changes: .github/workflows/cloud-regress.yml (vendored)
@@ -31,7 +31,7 @@ jobs:
     runs-on: us-east-2
     container:
-      image: neondatabase/build-tools:pinned
+      image: neondatabase/build-tools:pinned-bookworm
       options: --init

     steps:
2 changes: .github/workflows/trigger-e2e-tests.yml (vendored)
@@ -112,7 +112,7 @@ jobs:
             # This isn't exhaustive, just the paths that are most directly compute-related.
             # For example, compute_ctl also depends on libs/utils, but we don't trigger
             # an e2e run on that.
-            vendor/*|pgxn/*|compute_tools/*|libs/vm_monitor/*|compute/Dockerfile.compute-node)
+            vendor/*|pgxn/*|compute_tools/*|libs/vm_monitor/*|compute/compute-node.Dockerfile)
              platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
              ;;
            *)
2 changes: .gitignore (vendored)
@@ -6,6 +6,8 @@ __pycache__/
 test_output/
 .vscode
 .idea
 *.swp
+tags
 neon.iml
 /.neon
+/integration_tests/.neon
4 changes: Cargo.lock (generated)
@@ -6272,7 +6272,7 @@ dependencies = [
 [[package]]
 name = "tokio-epoll-uring"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#cb2dcea2058034bc209e7917b01c5097712a3168"
 dependencies = [
  "futures",
  "nix 0.26.4",
@@ -6788,7 +6788,7 @@ dependencies = [
 [[package]]
 name = "uring-common"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#cb2dcea2058034bc209e7917b01c5097712a3168"
 dependencies = [
  "bytes",
  "io-uring",
4 changes: Makefile
@@ -297,7 +297,7 @@ clean: postgres-clean neon-pg-clean-ext
 # This removes everything
 .PHONY: distclean
 distclean:
-	rm -rf $(POSTGRES_INSTALL_DIR)
+	$(RM) -r $(POSTGRES_INSTALL_DIR)
	$(CARGO_CMD_PREFIX) cargo clean

.PHONY: fmt
@@ -329,7 +329,7 @@ postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list
	$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/pgindent --typedefs postgres-$*-typedefs-full.list \
		$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/ \
		--excludes $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/exclude_file_patterns
-	rm -f pg*.BAK
+	$(RM) pg*.BAK

# Indent pxgn/neon.
.PHONY: neon-pgindent
@@ -142,7 +142,7 @@ RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/sourc
 # Use the same version of libicu as the compute nodes so that
 # clusters created using inidb on pageserver can be used by computes.
 #
-# TODO: at this time, Dockerfile.compute-node uses the debian bullseye libicu
+# TODO: at this time, compute-node.Dockerfile uses the debian bullseye libicu
 # package, which is 67.1. We're duplicating that knowledge here, and also, technically,
 # Debian has a few patches on top of 67.1 that we're not adding here.
 ENV ICU_VERSION=67.1
@@ -34,7 +34,7 @@ sql_exporter_autoscaling.yml: $(jsonnet_files)

.PHONY: clean
clean:
-	rm --force \
+	$(RM) \
		etc/neon_collector.yml \
		etc/neon_collector_autoscaling.yml \
		etc/sql_exporter.yml \
@@ -1,7 +1,7 @@
 This directory contains files that are needed to build the compute
 images, or included in the compute images.

-Dockerfile.compute-node
+compute-node.Dockerfile
     To build the compute image

 vm-image-spec.yaml
@@ -14,8 +14,8 @@ etc/
 patches/
     Some extensions need to be patched to work with Neon. This
     directory contains such patches. They are applied to the extension
-    sources in Dockerfile.compute-node
+    sources in compute-node.Dockerfile

 In addition to these, postgres itself, the neon postgres extension,
 and compute_ctl are built and copied into the compute image by
-Dockerfile.compute-node.
+compute-node.Dockerfile.
@@ -666,7 +666,7 @@ RUN apt-get update && \
 #
 # Use new version only for v17
 # because Release_2024_09_1 has some backward incompatible changes
-# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1
+# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1
 ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
 RUN case "${PG_VERSION}" in \
     "v17") \
@@ -860,13 +860,14 @@ ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
 USER nonroot
 WORKDIR /home/nonroot

-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 is not supported yet by pgrx. Quit" && exit 0;; \
-    esac && \
-    curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
+RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
     chmod +x rustup-init && \
     ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
     rm rustup-init && \
+    case "${PG_VERSION}" in \
+    'v17') \
+        echo 'v17 is not supported yet by pgrx. Quit' && exit 0;; \
+    esac && \
     cargo install --locked --version 0.11.3 cargo-pgrx && \
     /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
@@ -1041,6 +1042,31 @@ RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz
     make -j $(getconf _NPROCESSORS_ONLN) install && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control

+#########################################################################################
+#
+# Layer "pg_mooncake"
+# compile pg_mooncake extension
+#
+#########################################################################################
+FROM rust-extensions-build AS pg-mooncake-build
+ARG PG_VERSION
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+ENV PG_MOONCAKE_VERSION=0a7de4c0b5c7b1a5e2175e1c5f4625b97b7346f1
+ENV PATH="/usr/local/pgsql/bin/:$PATH"
+
+RUN case "${PG_VERSION}" in \
+    'v14') \
+        echo "pg_mooncake is not supported on Postgres ${PG_VERSION}" && exit 0;; \
+    esac && \
+    git clone --depth 1 --branch neon https://github.com/Mooncake-Labs/pg_mooncake.git pg_mooncake-src && \
+    cd pg_mooncake-src && \
+    git checkout "${PG_MOONCAKE_VERSION}" && \
+    git submodule update --init --depth 1 --recursive && \
+    make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) && \
+    make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) install && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control
+
 #########################################################################################
 #
 # Layer "neon-pg-ext-build"
@@ -1084,6 +1110,7 @@ COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
 COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pg-mooncake-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY pgxn/ pgxn/

 RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -1,7 +1,7 @@
 local neon = import 'neon.libsonnet';

-local pg_stat_bgwriter = importstr 'sql_exporter/checkpoints_req.sql';
-local pg_stat_checkpointer = importstr 'sql_exporter/checkpoints_req.17.sql';
+local pg_stat_bgwriter = importstr 'sql_exporter/checkpoints_timed.sql';
+local pg_stat_checkpointer = importstr 'sql_exporter/checkpoints_timed.17.sql';

 {
   metric_name: 'checkpoints_timed',
@@ -1,5 +1,10 @@
 SELECT
   slot_name,
-  pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal
+  pg_wal_lsn_diff(
+    CASE
+      WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn()
+      ELSE pg_current_wal_lsn()
+    END,
+    restart_lsn)::FLOAT8 AS retained_wal
 FROM pg_replication_slots
 WHERE active = false;
@@ -18,7 +18,7 @@ commands:
   - name: pgbouncer
     user: postgres
     sysvInitAction: respawn
-    shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini'
+    shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini 2>&1 > /dev/virtio-ports/tech.neon.log.0'
   - name: local_proxy
     user: postgres
     sysvInitAction: respawn
@@ -18,7 +18,7 @@ commands:
   - name: pgbouncer
     user: postgres
     sysvInitAction: respawn
-    shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini'
+    shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini 2>&1 > /dev/virtio-ports/tech.neon.log.0'
   - name: local_proxy
     user: postgres
     sysvInitAction: respawn
@@ -1073,10 +1073,10 @@ async fn handle_tenant(subcmd: &TenantCmd, env: &mut local_env::LocalEnv) -> any
                 tenant_id,
                 TimelineCreateRequest {
                     new_timeline_id,
-                    ancestor_timeline_id: None,
-                    ancestor_start_lsn: None,
-                    existing_initdb_timeline_id: None,
-                    pg_version: Some(args.pg_version),
+                    mode: pageserver_api::models::TimelineCreateRequestMode::Bootstrap {
+                        existing_initdb_timeline_id: None,
+                        pg_version: Some(args.pg_version),
+                    },
                 },
             )
             .await?;
@@ -1133,10 +1133,10 @@ async fn handle_timeline(cmd: &TimelineCmd, env: &mut local_env::LocalEnv) -> Re
            let storage_controller = StorageController::from_env(env);
            let create_req = TimelineCreateRequest {
                new_timeline_id,
-                ancestor_timeline_id: None,
-                existing_initdb_timeline_id: None,
-                ancestor_start_lsn: None,
-                pg_version: Some(args.pg_version),
+                mode: pageserver_api::models::TimelineCreateRequestMode::Bootstrap {
+                    existing_initdb_timeline_id: None,
+                    pg_version: Some(args.pg_version),
+                },
            };
            let timeline_info = storage_controller
                .tenant_timeline_create(tenant_id, create_req)
@@ -1189,10 +1189,11 @@ async fn handle_timeline(cmd: &TimelineCmd, env: &mut local_env::LocalEnv) -> Re
            let storage_controller = StorageController::from_env(env);
            let create_req = TimelineCreateRequest {
                new_timeline_id,
-                ancestor_timeline_id: Some(ancestor_timeline_id),
-                existing_initdb_timeline_id: None,
-                ancestor_start_lsn: start_lsn,
-                pg_version: None,
+                mode: pageserver_api::models::TimelineCreateRequestMode::Branch {
+                    ancestor_timeline_id,
+                    ancestor_start_lsn: start_lsn,
+                    pg_version: None,
+                },
            };
            let timeline_info = storage_controller
                .tenant_timeline_create(tenant_id, create_req)
@@ -529,28 +529,6 @@ impl PageServerNode {
         Ok(self.http_client.list_timelines(*tenant_shard_id).await?)
     }

-    pub async fn timeline_create(
-        &self,
-        tenant_shard_id: TenantShardId,
-        new_timeline_id: TimelineId,
-        ancestor_start_lsn: Option<Lsn>,
-        ancestor_timeline_id: Option<TimelineId>,
-        pg_version: Option<u32>,
-        existing_initdb_timeline_id: Option<TimelineId>,
-    ) -> anyhow::Result<TimelineInfo> {
-        let req = models::TimelineCreateRequest {
-            new_timeline_id,
-            ancestor_start_lsn,
-            ancestor_timeline_id,
-            pg_version,
-            existing_initdb_timeline_id,
-        };
-        Ok(self
-            .http_client
-            .timeline_create(tenant_shard_id, &req)
-            .await?)
-    }
-
     /// Import a basebackup prepared using either:
     /// a) `pg_basebackup -F tar`, or
     /// b) The `fullbackup` pageserver endpoint
@@ -20,7 +20,16 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
 use reqwest::Method;
 use serde::{de::DeserializeOwned, Deserialize, Serialize};
-use std::{fs, net::SocketAddr, path::PathBuf, str::FromStr, sync::OnceLock};
+use std::{
+    ffi::OsStr,
+    fs,
+    net::SocketAddr,
+    path::PathBuf,
+    process::ExitStatus,
+    str::FromStr,
+    sync::OnceLock,
+    time::{Duration, Instant},
+};
 use tokio::process::Command;
 use tracing::instrument;
 use url::Url;
@@ -168,16 +177,6 @@ impl StorageController {
         .expect("non-Unicode path")
     }

-    /// PIDFile for the postgres instance used to store storage controller state
-    fn postgres_pid_file(&self) -> Utf8PathBuf {
-        Utf8PathBuf::from_path_buf(
-            self.env
-                .base_data_dir
-                .join("storage_controller_postgres.pid"),
-        )
-        .expect("non-Unicode path")
-    }
-
     /// Find the directory containing postgres subdirectories, such `bin` and `lib`
     ///
     /// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back
@@ -296,6 +295,31 @@ impl StorageController {
             .map_err(anyhow::Error::new)
     }

+    /// Wrapper for the pg_ctl binary, which we spawn as a short-lived subprocess when starting and stopping postgres
+    async fn pg_ctl<I, S>(&self, args: I) -> ExitStatus
+    where
+        I: IntoIterator<Item = S>,
+        S: AsRef<OsStr>,
+    {
+        let pg_bin_dir = self.get_pg_bin_dir().await.unwrap();
+        let bin_path = pg_bin_dir.join("pg_ctl");
+
+        let pg_lib_dir = self.get_pg_lib_dir().await.unwrap();
+        let envs = [
+            ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+            ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+        ];
+
+        Command::new(bin_path)
+            .args(args)
+            .envs(envs)
+            .spawn()
+            .expect("Failed to spawn pg_ctl, binary_missing?")
+            .wait()
+            .await
+            .expect("Failed to wait for pg_ctl termination")
+    }
+
     pub async fn start(&self, start_args: NeonStorageControllerStartArgs) -> anyhow::Result<()> {
         let instance_dir = self.storage_controller_instance_dir(start_args.instance_id);
         if let Err(err) = tokio::fs::create_dir(&instance_dir).await {
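The new `pg_ctl` helper above is a thin wrapper around `tokio::process::Command`. A self-contained sketch of the same shape (assuming tokio as a dependency; `run_tool`, its parameters, and the `echo` stand-in are illustrative, not part of the diff):

```rust
use std::ffi::OsStr;
use std::process::ExitStatus;
use tokio::process::Command;

// Generic wrapper with the same shape as the diff's pg_ctl helper;
// `bin` and `lib_dir` are hypothetical parameters for the sketch.
async fn run_tool<I, S>(bin: &str, args: I, lib_dir: &str) -> ExitStatus
where
    I: IntoIterator<Item = S>,
    S: AsRef<OsStr>,
{
    Command::new(bin)
        .args(args)
        // Same env pair the diff sets so the postgres binaries can find their libraries.
        .env("LD_LIBRARY_PATH", lib_dir)
        .env("DYLD_LIBRARY_PATH", lib_dir)
        .spawn()
        .expect("failed to spawn subprocess")
        .wait()
        .await
        .expect("failed to wait for subprocess")
}

#[tokio::main]
async fn main() {
    // `echo` stands in for `pg_ctl -D <datadir> status` here.
    let status = run_tool("echo", ["status"], "/usr/local/pgsql/lib").await;
    assert!(status.success());
}
```

Returning the raw `ExitStatus` lets each call site decide how to treat a non-zero exit, which the start, stop, and status paths below rely on.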
@@ -404,20 +428,34 @@ impl StorageController {
                 db_start_args
             );

-            background_process::start_process(
-                "storage_controller_db",
-                &self.env.base_data_dir,
-                pg_bin_dir.join("pg_ctl").as_std_path(),
-                db_start_args,
-                vec![
-                    ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-                    ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-                ],
-                background_process::InitialPidFile::Create(self.postgres_pid_file()),
-                &start_args.start_timeout,
-                || self.pg_isready(&pg_bin_dir, postgres_port),
-            )
-            .await?;
+            let db_start_status = self.pg_ctl(db_start_args).await;
+            let start_timeout: Duration = start_args.start_timeout.into();
+            let db_start_deadline = Instant::now() + start_timeout;
+            if !db_start_status.success() {
+                return Err(anyhow::anyhow!(
+                    "Failed to start postgres {}",
+                    db_start_status.code().unwrap()
+                ));
+            }
+
+            loop {
+                if Instant::now() > db_start_deadline {
+                    return Err(anyhow::anyhow!("Timed out waiting for postgres to start"));
+                }
+
+                match self.pg_isready(&pg_bin_dir, postgres_port).await {
+                    Ok(true) => {
+                        tracing::info!("storage controller postgres is now ready");
+                        break;
+                    }
+                    Ok(false) => {
+                        tokio::time::sleep(Duration::from_millis(100)).await;
+                    }
+                    Err(e) => {
+                        tracing::warn!("Failed to check postgres status: {e}")
+                    }
+                }
+            }

             self.setup_database(postgres_port).await?;
         }
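The replacement start path above polls `pg_isready` itself instead of delegating to `background_process::start_process`. A minimal standalone sketch of that deadline-based polling pattern (assuming tokio and anyhow as dependencies; `is_ready` is a hypothetical stand-in for the readiness probe):

```rust
use std::time::Duration;
use tokio::time::Instant;

// Hypothetical stand-in for the `pg_isready` probe in the diff.
async fn is_ready() -> anyhow::Result<bool> {
    Ok(true)
}

// Poll until the probe succeeds or the deadline passes, mirroring the
// loop the diff adds after `pg_ctl ... start` returns.
async fn wait_until_ready(timeout: Duration) -> anyhow::Result<()> {
    let deadline = Instant::now() + timeout;
    loop {
        if Instant::now() > deadline {
            anyhow::bail!("timed out waiting for postgres to start");
        }
        match is_ready().await {
            Ok(true) => return Ok(()),
            // Not ready yet: back off briefly before the next probe.
            Ok(false) => tokio::time::sleep(Duration::from_millis(100)).await,
            // The diff logs probe errors and keeps polling; do the same here.
            Err(e) => {
                eprintln!("failed to check status: {e}");
                tokio::time::sleep(Duration::from_millis(100)).await;
            }
        }
    }
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    wait_until_ready(Duration::from_secs(5)).await
}
```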
@@ -583,15 +621,10 @@ impl StorageController {
         }

         let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
-        let pg_bin_dir = self.get_pg_bin_dir().await?;

         println!("Stopping storage controller database...");
         let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"];
-        let stop_status = Command::new(pg_bin_dir.join("pg_ctl"))
-            .args(pg_stop_args)
-            .spawn()?
-            .wait()
-            .await?;
+        let stop_status = self.pg_ctl(pg_stop_args).await;
         if !stop_status.success() {
             match self.is_postgres_running().await {
                 Ok(false) => {
@@ -612,14 +645,9 @@ impl StorageController {

     async fn is_postgres_running(&self) -> anyhow::Result<bool> {
         let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
-        let pg_bin_dir = self.get_pg_bin_dir().await?;

         let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
-        let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
-            .args(pg_status_args)
-            .spawn()?
-            .wait()
-            .await?;
+        let status_exitcode = self.pg_ctl(pg_status_args).await;

         // pg_ctl status returns this exit code if postgres is not running: in this case it is
         // fine that stop failed. Otherwise it is an error that stop failed.
@@ -111,6 +111,11 @@ enum Command {
         #[arg(long)]
         node: NodeId,
     },
+    /// Cancel any ongoing reconciliation for this shard
+    TenantShardCancelReconcile {
+        #[arg(long)]
+        tenant_shard_id: TenantShardId,
+    },
     /// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
     /// that is passed through to pageservers, and does not affect storage controller behavior.
     TenantConfig {
@@ -535,6 +540,15 @@ async fn main() -> anyhow::Result<()> {
                 )
                 .await?;
         }
+        Command::TenantShardCancelReconcile { tenant_shard_id } => {
+            storcon_client
+                .dispatch::<(), ()>(
+                    Method::PUT,
+                    format!("control/v1/tenant/{tenant_shard_id}/cancel_reconcile"),
+                    None,
+                )
+                .await?;
+        }
         Command::TenantConfig { tenant_id, config } => {
             let tenant_conf = serde_json::from_str(&config)?;
@@ -5,7 +5,7 @@
 Currently we build two main images:

 - [neondatabase/neon](https://hub.docker.com/repository/docker/neondatabase/neon) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
-- [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14. Built from [/compute-node/Dockerfile](/compute/Dockerfile.compute-node).
+- [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14. Built from [/compute-node/Dockerfile](/compute/compute-node.Dockerfile).

 And additional intermediate image:
@@ -56,7 +56,7 @@ CREATE TABLE
 postgres=# insert into t values(1, 1);
 INSERT 0 1
 postgres=# select * from t;
- key | value
+ key | value
 -----+-------
   1 |     1
 (1 row)
@@ -84,4 +84,4 @@ Access http://localhost:9001 and sign in.
 - Username: `minio`
 - Password: `password`

-You can see durable pages and WAL data in `neon` bucket.
+You can see durable pages and WAL data in `neon` bucket.
@@ -19,6 +19,7 @@ use once_cell::sync::Lazy;
 use prometheus::core::{
     Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec,
 };
+pub use prometheus::local::LocalHistogram;
 pub use prometheus::opts;
 pub use prometheus::register;
 pub use prometheus::Error;
@@ -211,13 +211,30 @@ pub enum TimelineState {
 #[derive(Serialize, Deserialize, Clone)]
 pub struct TimelineCreateRequest {
     pub new_timeline_id: TimelineId,
-    #[serde(default)]
-    pub ancestor_timeline_id: Option<TimelineId>,
-    #[serde(default)]
-    pub existing_initdb_timeline_id: Option<TimelineId>,
-    #[serde(default)]
-    pub ancestor_start_lsn: Option<Lsn>,
-    pub pg_version: Option<u32>,
+    #[serde(flatten)]
+    pub mode: TimelineCreateRequestMode,
 }

+#[derive(Serialize, Deserialize, Clone)]
+#[serde(untagged)]
+pub enum TimelineCreateRequestMode {
+    Branch {
+        ancestor_timeline_id: TimelineId,
+        #[serde(default)]
+        ancestor_start_lsn: Option<Lsn>,
+        // TODO: cplane sets this, but, the branching code always
+        // inherits the ancestor's pg_version. Earlier code wasn't
+        // using a flattened enum, so, it was an accepted field, and
+        // we continue to accept it by having it here.
+        pg_version: Option<u32>,
+    },
+    // NB: Bootstrap is all-optional, and thus the serde(untagged) will cause serde to stop at Bootstrap.
+    // (serde picks the first matching enum variant, in declaration order).
+    Bootstrap {
+        #[serde(default)]
+        existing_initdb_timeline_id: Option<TimelineId>,
+        pg_version: Option<u32>,
+    },
+}
+
 #[derive(Serialize, Deserialize, Clone)]
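For reference, a self-contained sketch of how the flattened `#[serde(untagged)]` enum above resolves (assuming serde and serde_json as dependencies; the types are simplified stand-ins, with `String` in place of the id and LSN types): serde tries variants in declaration order, so a body carrying `ancestor_timeline_id` matches `Branch`, while an all-optional body falls through to `Bootstrap`, exactly as the NB comment describes.

```rust
use serde::{Deserialize, Serialize};

// Simplified stand-ins for the diff's types (hypothetical: String replaces
// TimelineId/Lsn so the example is self-contained).
#[derive(Serialize, Deserialize, Debug)]
struct CreateRequest {
    new_timeline_id: String,
    #[serde(flatten)]
    mode: Mode,
}

#[derive(Serialize, Deserialize, Debug)]
#[serde(untagged)]
enum Mode {
    // Tried first: only matches if ancestor_timeline_id is present.
    Branch {
        ancestor_timeline_id: String,
        #[serde(default)]
        ancestor_start_lsn: Option<String>,
    },
    // All fields optional, so this matches anything Branch rejected.
    Bootstrap {
        #[serde(default)]
        existing_initdb_timeline_id: Option<String>,
        pg_version: Option<u32>,
    },
}

fn main() {
    // Has ancestor_timeline_id, so the Branch variant matches.
    let branch: CreateRequest =
        serde_json::from_str(r#"{"new_timeline_id":"t1","ancestor_timeline_id":"t0"}"#).unwrap();
    // No ancestor_timeline_id: Branch fails, Bootstrap accepts it.
    let bootstrap: CreateRequest =
        serde_json::from_str(r#"{"new_timeline_id":"t2","pg_version":16}"#).unwrap();
    println!("{branch:?}");
    println!("{bootstrap:?}");
}
```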
@@ -684,6 +701,25 @@ pub struct TimelineArchivalConfigRequest {
     pub state: TimelineArchivalState,
 }

+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct TimelinesInfoAndOffloaded {
+    pub timelines: Vec<TimelineInfo>,
+    pub offloaded: Vec<OffloadedTimelineInfo>,
+}
+
+/// Analog of [`TimelineInfo`] for offloaded timelines.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct OffloadedTimelineInfo {
+    pub tenant_id: TenantShardId,
+    pub timeline_id: TimelineId,
+    /// Whether the timeline has a parent it has been branched off from or not
+    pub ancestor_timeline_id: Option<TimelineId>,
+    /// Whether to retain the branch lsn at the ancestor or not
+    pub ancestor_retain_lsn: Option<Lsn>,
+    /// The time point when the timeline was archived
+    pub archived_at: chrono::DateTime<chrono::Utc>,
+}
+
 /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct TimelineInfo {
@@ -1032,6 +1068,12 @@ pub mod virtual_file {
     }
 }

+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ScanDisposableKeysResponse {
+    pub disposable_count: usize,
+    pub not_disposable_count: usize,
+}
+
 // Wrapped in libpq CopyData
 #[derive(PartialEq, Eq, Debug)]
 pub enum PagestreamFeMessage {
@@ -357,22 +357,20 @@ impl RemoteStorage for LocalFs {
             .list_recursive(prefix)
             .await
             .map_err(DownloadError::Other)?;
-        let objects = keys
-            .into_iter()
-            .filter_map(|k| {
-                let path = k.with_base(&self.storage_root);
-                if path.is_dir() {
-                    None
-                } else {
-                    Some(ListingObject {
-                        key: k.clone(),
-                        // LocalFs is just for testing, so just specify a dummy time
-                        last_modified: SystemTime::now(),
-                        size: 0,
-                    })
-                }
-            })
-            .collect();
+        let mut objects = Vec::with_capacity(keys.len());
+        for key in keys {
+            let path = key.with_base(&self.storage_root);
+            let metadata = file_metadata(&path).await?;
+            if metadata.is_dir() {
+                continue;
+            }
+            objects.push(ListingObject {
+                key: key.clone(),
+                last_modified: metadata.modified()?,
+                size: metadata.len(),
+            });
+        }
+        let objects = objects;

         if let ListingMode::NoDelimiter = mode {
             result.keys = objects;
@@ -410,9 +408,8 @@ impl RemoteStorage for LocalFs {
             } else {
                 result.keys.push(ListingObject {
                     key: RemotePath::from_string(&relative_key).unwrap(),
-                    // LocalFs is just for testing
-                    last_modified: SystemTime::now(),
-                    size: 0,
+                    last_modified: object.last_modified,
+                    size: object.size,
                 });
             }
         }
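Both LocalFs hunks above replace dummy values with real per-file metadata. A standalone sketch of that per-entry lookup (assuming tokio as a dependency; `Cargo.toml` is only an example path, and `tokio::fs::metadata` stands in for the diff's `file_metadata` helper):

```rust
use std::time::SystemTime;

#[tokio::main]
async fn main() -> std::io::Result<()> {
    // Real metadata per entry instead of SystemTime::now() and size 0,
    // as the diff's listing loop now does.
    let metadata = tokio::fs::metadata("Cargo.toml").await?;
    let last_modified: SystemTime = metadata.modified()?;
    let size: u64 = metadata.len();
    println!("{size} bytes, modified {last_modified:?}");
    Ok(())
}
```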
@@ -164,7 +164,11 @@ fn criterion_benchmark(c: &mut Criterion) {
     let conf: &'static PageServerConf = Box::leak(Box::new(
         pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()),
     ));
-    virtual_file::init(16384, virtual_file::io_engine_for_bench());
+    virtual_file::init(
+        16384,
+        virtual_file::io_engine_for_bench(),
+        conf.virtual_file_io_mode,
+    );
     page_cache::init(conf.page_cache_size);

     {
@@ -11,7 +11,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
     match cmd {
         IndexPartCmd::Dump { path } => {
             let bytes = tokio::fs::read(path).await.context("read file")?;
-            let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
+            let des: IndexPart = IndexPart::from_json_bytes(&bytes).context("deserialize")?;
             let output = serde_json::to_string_pretty(&des).context("serialize output")?;
             println!("{output}");
             Ok(())
@@ -7,6 +7,7 @@ use camino::{Utf8Path, Utf8PathBuf};
 use pageserver::context::{DownloadBehavior, RequestContext};
 use pageserver::task_mgr::TaskKind;
 use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
+use pageserver::virtual_file::api::IoMode;
 use std::cmp::Ordering;
 use std::collections::BinaryHeap;
 use std::ops::Range;
@@ -152,7 +153,11 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
     let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);

     // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
-    pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
+    pageserver::virtual_file::init(
+        10,
+        virtual_file::api::IoEngineKind::StdFs,
+        IoMode::preferred(),
+    );
     pageserver::page_cache::init(100);

     let mut total_delta_layers = 0usize;
@@ -11,6 +11,7 @@ use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary};
 use pageserver::tenant::storage_layer::{delta_layer, image_layer};
 use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer};
 use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
+use pageserver::virtual_file::api::IoMode;
 use pageserver::{page_cache, virtual_file};
 use pageserver::{
     repository::{Key, KEY_SIZE},
@@ -59,7 +60,11 @@ pub(crate) enum LayerCmd {

 async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
     let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
-    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
+    virtual_file::init(
+        10,
+        virtual_file::api::IoEngineKind::StdFs,
+        IoMode::preferred(),
+    );
     page_cache::init(100);
     let file = VirtualFile::open(path, ctx).await?;
     let file_id = page_cache::next_file_id();
@@ -190,7 +195,11 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
             new_tenant_id,
             new_timeline_id,
         } => {
-            pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
+            pageserver::virtual_file::init(
+                10,
+                virtual_file::api::IoEngineKind::StdFs,
+                IoMode::preferred(),
+            );
             pageserver::page_cache::init(100);

             let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
@@ -24,7 +24,7 @@ use pageserver::{
     page_cache,
     task_mgr::TaskKind,
     tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
-    virtual_file,
+    virtual_file::{self, api::IoMode},
 };
 use pageserver_api::shard::TenantShardId;
 use postgres_ffi::ControlFileData;
@@ -205,7 +205,11 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {

 async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
     // Basic initialization of things that don't change after startup
-    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
+    virtual_file::init(
+        10,
+        virtual_file::api::IoEngineKind::StdFs,
+        IoMode::preferred(),
+    );
     page_cache::init(100);
     let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
     dump_layerfile_from_path(path, true, &ctx).await
@@ -167,7 +167,11 @@ fn main() -> anyhow::Result<()> {
     let scenario = failpoint_support::init();

     // Basic initialization of things that don't change after startup
-    virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine);
+    virtual_file::init(
+        conf.max_file_descriptors,
+        conf.virtual_file_io_engine,
+        conf.virtual_file_io_mode,
+    );
     page_cache::init(conf.page_cache_size);

     start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;
@@ -597,6 +597,10 @@ paths:
         Create a timeline. Returns new timeline id on success.
         Recreating the same timeline will succeed if the parameters match the existing timeline.
         If no pg_version is specified, assume DEFAULT_PG_VERSION hardcoded in the pageserver.
+
+        To ensure durability, the caller must retry the creation until success.
+        Just because the timeline is visible via other endpoints does not mean it is durable.
+        Future versions may stop showing timelines that are not yet durable.
       requestBody:
         content:
           application/json:
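Since the spec text above now states that visibility does not imply durability, the client-side contract is retry-until-success. A hedged sketch of that contract (assuming tokio and anyhow as dependencies; the `create_timeline` stub is hypothetical, standing in for the POST endpoint documented above):

```rust
use std::time::Duration;

// Hypothetical stub; the real operation is the POST /timeline call above.
async fn create_timeline() -> anyhow::Result<()> {
    Ok(())
}

// Retry until the creation call reports success, as the description requires.
async fn create_until_durable(max_attempts: u32) -> anyhow::Result<()> {
    let mut last_err = None;
    for _ in 0..max_attempts {
        match create_timeline().await {
            // Recreating with identical parameters succeeds, so a blind retry is safe.
            Ok(()) => return Ok(()),
            Err(e) => {
                last_err = Some(e);
                tokio::time::sleep(Duration::from_millis(500)).await;
            }
        }
    }
    Err(last_err.unwrap_or_else(|| anyhow::anyhow!("zero attempts requested")))
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    create_until_durable(10).await
}
```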
@@ -26,6 +26,7 @@ use pageserver_api::models::LocationConfigListResponse;
 use pageserver_api::models::LocationConfigMode;
 use pageserver_api::models::LsnLease;
 use pageserver_api::models::LsnLeaseRequest;
+use pageserver_api::models::OffloadedTimelineInfo;
 use pageserver_api::models::ShardParameters;
 use pageserver_api::models::TenantDetails;
 use pageserver_api::models::TenantLocationConfigRequest;
@@ -37,6 +38,8 @@ use pageserver_api::models::TenantShardSplitRequest;
 use pageserver_api::models::TenantShardSplitResponse;
 use pageserver_api::models::TenantSorting;
 use pageserver_api::models::TimelineArchivalConfigRequest;
+use pageserver_api::models::TimelineCreateRequestMode;
+use pageserver_api::models::TimelinesInfoAndOffloaded;
 use pageserver_api::models::TopTenantShardItem;
 use pageserver_api::models::TopTenantShardsRequest;
 use pageserver_api::models::TopTenantShardsResponse;
@@ -81,7 +84,9 @@ use crate::tenant::timeline::CompactFlags;
 use crate::tenant::timeline::CompactionError;
 use crate::tenant::timeline::Timeline;
 use crate::tenant::GetTimelineError;
+use crate::tenant::OffloadedTimeline;
 use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
+use crate::DEFAULT_PG_VERSION;
 use crate::{disk_usage_eviction_task, tenant};
 use pageserver_api::models::{
     StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest,
@@ -477,6 +482,24 @@ async fn build_timeline_info_common(
     Ok(info)
 }

+fn build_timeline_offloaded_info(offloaded: &Arc<OffloadedTimeline>) -> OffloadedTimelineInfo {
+    let &OffloadedTimeline {
+        tenant_shard_id,
+        timeline_id,
+        ancestor_retain_lsn,
+        ancestor_timeline_id,
+        archived_at,
+        ..
+    } = offloaded.as_ref();
+    OffloadedTimelineInfo {
+        tenant_id: tenant_shard_id,
+        timeline_id,
+        ancestor_retain_lsn,
+        ancestor_timeline_id,
+        archived_at: archived_at.and_utc(),
+    }
+}
+
 // healthcheck handler
 async fn status_handler(
@@ -526,6 +549,26 @@ async fn timeline_create_handler(
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;

     let new_timeline_id = request_data.new_timeline_id;
+    // fill in the default pg_version if not provided & convert request into domain model
+    let params: tenant::CreateTimelineParams = match request_data.mode {
+        TimelineCreateRequestMode::Bootstrap {
+            existing_initdb_timeline_id,
+            pg_version,
+        } => tenant::CreateTimelineParams::Bootstrap(tenant::CreateTimelineParamsBootstrap {
+            new_timeline_id,
+            existing_initdb_timeline_id,
+            pg_version: pg_version.unwrap_or(DEFAULT_PG_VERSION),
+        }),
+        TimelineCreateRequestMode::Branch {
+            ancestor_timeline_id,
+            ancestor_start_lsn,
+            pg_version: _,
+        } => tenant::CreateTimelineParams::Branch(tenant::CreateTimelineParamsBranch {
+            new_timeline_id,
+            ancestor_timeline_id,
+            ancestor_start_lsn,
+        }),
+    };

     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error);
@@ -538,22 +581,12 @@ async fn timeline_create_handler(

     tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

-    if let Some(ancestor_id) = request_data.ancestor_timeline_id.as_ref() {
-        tracing::info!(%ancestor_id, "starting to branch");
-    } else {
-        tracing::info!("bootstrapping");
-    }
+    // earlier versions of the code had pg_version and ancestor_lsn in the span
+    // => continue to provide that information, but, through a log message that doesn't require us to destructure
+    tracing::info!(?params, "creating timeline");

     match tenant
-        .create_timeline(
-            new_timeline_id,
-            request_data.ancestor_timeline_id,
-            request_data.ancestor_start_lsn,
-            request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
-            request_data.existing_initdb_timeline_id,
-            state.broker_client.clone(),
-            &ctx,
-        )
+        .create_timeline(params, state.broker_client.clone(), &ctx)
         .await
     {
         Ok(new_timeline) => {
@@ -604,8 +637,6 @@ async fn timeline_create_handler(
             tenant_id = %tenant_shard_id.tenant_id,
             shard_id = %tenant_shard_id.shard_slug(),
             timeline_id = %new_timeline_id,
-            lsn=?request_data.ancestor_start_lsn,
-            pg_version=?request_data.pg_version
         ))
         .await
 }
@@ -643,7 +674,7 @@
             )
             .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
             .await
-            .context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}")
+            .context("Failed to build timeline info")
             .map_err(ApiError::InternalServerError)?;

         response_data.push(timeline_info);
@@ -658,6 +689,62 @@ async fn timeline_list_handler(
     json_response(StatusCode::OK, response_data)
 }

+async fn timeline_and_offloaded_list_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let include_non_incremental_logical_size: Option<bool> =
+        parse_query_param(&request, "include-non-incremental-logical-size")?;
+    let force_await_initial_logical_size: Option<bool> =
+        parse_query_param(&request, "force-await-initial-logical-size")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    let state = get_state(&request);
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+
+    let response_data = async {
+        let tenant = state
+            .tenant_manager
+            .get_attached_tenant_shard(tenant_shard_id)?;
+
+        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+
+        let (timelines, offloadeds) = tenant.list_timelines_and_offloaded();
+
+        let mut timeline_infos = Vec::with_capacity(timelines.len());
+        for timeline in timelines {
+            let timeline_info = build_timeline_info(
+                &timeline,
+                include_non_incremental_logical_size.unwrap_or(false),
+                force_await_initial_logical_size.unwrap_or(false),
+                &ctx,
+            )
+            .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
+            .await
+            .context("Failed to build timeline info")
+            .map_err(ApiError::InternalServerError)?;
+
+            timeline_infos.push(timeline_info);
+        }
+        let offloaded_infos = offloadeds
+            .into_iter()
+            .map(|offloaded| build_timeline_offloaded_info(&offloaded))
+            .collect::<Vec<_>>();
+        let res = TimelinesInfoAndOffloaded {
+            timelines: timeline_infos,
+            offloaded: offloaded_infos,
+        };
+        Ok::<TimelinesInfoAndOffloaded, ApiError>(res)
+    }
+    .instrument(info_span!("timeline_and_offloaded_list",
+                tenant_id = %tenant_shard_id.tenant_id,
+                shard_id = %tenant_shard_id.shard_slug()))
+    .await?;
+
+    json_response(StatusCode::OK, response_data)
+}
+
 async fn timeline_preserve_initdb_handler(
     request: Request<Body>,
     _cancel: CancellationToken,
@@ -1206,6 +1293,99 @@ async fn layer_map_info_handler(
     json_response(StatusCode::OK, layer_map_info)
 }

+#[instrument(skip_all, fields(tenant_id, shard_id, timeline_id, layer_name))]
+async fn timeline_layer_scan_disposable_keys(
+    request: Request<Body>,
+    cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    let layer_name: LayerName = parse_request_param(&request, "layer_name")?;
+
+    tracing::Span::current().record(
+        "tenant_id",
+        tracing::field::display(&tenant_shard_id.tenant_id),
+    );
+    tracing::Span::current().record(
+        "shard_id",
+        tracing::field::display(tenant_shard_id.shard_slug()),
+    );
+    tracing::Span::current().record("timeline_id", tracing::field::display(&timeline_id));
+    tracing::Span::current().record("layer_name", tracing::field::display(&layer_name));
+
+    let state = get_state(&request);
+
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    // technically the timeline need not be active for this scan to complete
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
+
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+
+    let guard = timeline.layers.read().await;
+    let Some(layer) = guard.try_get_from_key(&layer_name.clone().into()) else {
+        return Err(ApiError::NotFound(
+            anyhow::anyhow!("Layer {tenant_shard_id}/{timeline_id}/{layer_name} not found").into(),
+        ));
+    };
+
+    let resident_layer = layer
+        .download_and_keep_resident()
+        .await
+        .map_err(|err| match err {
+            tenant::storage_layer::layer::DownloadError::TimelineShutdown
+            | tenant::storage_layer::layer::DownloadError::DownloadCancelled => {
+                ApiError::ShuttingDown
+            }
+            tenant::storage_layer::layer::DownloadError::ContextAndConfigReallyDeniesDownloads
+            | tenant::storage_layer::layer::DownloadError::DownloadRequired
+            | tenant::storage_layer::layer::DownloadError::NotFile(_)
+            | tenant::storage_layer::layer::DownloadError::DownloadFailed
+            | tenant::storage_layer::layer::DownloadError::PreStatFailed(_) => {
+                ApiError::InternalServerError(err.into())
+            }
+            #[cfg(test)]
+            tenant::storage_layer::layer::DownloadError::Failpoint(_) => {
+                ApiError::InternalServerError(err.into())
+            }
+        })?;
+
+    let keys = resident_layer
+        .load_keys(&ctx)
+        .await
+        .map_err(ApiError::InternalServerError)?;
+
+    let shard_identity = timeline.get_shard_identity();
+
+    let mut disposable_count = 0;
+    let mut not_disposable_count = 0;
+    let cancel = cancel.clone();
+    for (i, key) in keys.into_iter().enumerate() {
+        if shard_identity.is_key_disposable(&key) {
+            disposable_count += 1;
+            tracing::debug!(key = %key, key.dbg=?key, "disposable key");
+        } else {
+            not_disposable_count += 1;
+        }
+        #[allow(clippy::collapsible_if)]
+        if i % 10000 == 0 {
+            if cancel.is_cancelled() || timeline.cancel.is_cancelled() || timeline.is_stopping() {
+                return Err(ApiError::ShuttingDown);
+            }
+        }
+    }
+
+    json_response(
+        StatusCode::OK,
+        pageserver_api::models::ScanDisposableKeysResponse {
+            disposable_count,
+            not_disposable_count,
+        },
+    )
+}
+
 async fn layer_download_handler(
     request: Request<Body>,
     _cancel: CancellationToken,
@@ -2993,6 +3173,9 @@ pub fn make_router(
         .get("/v1/tenant/:tenant_shard_id/timeline", |r| {
             api_handler(r, timeline_list_handler)
         })
+        .get("/v1/tenant/:tenant_shard_id/timeline_and_offloaded", |r| {
+            api_handler(r, timeline_and_offloaded_list_handler)
+        })
         .post("/v1/tenant/:tenant_shard_id/timeline", |r| {
             api_handler(r, timeline_create_handler)
         })
@@ -3065,6 +3248,10 @@ pub fn make_router(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
             |r| api_handler(r, evict_timeline_layer_handler),
         )
+        .post(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_name/scan_disposable_keys",
+            |r| testing_api_handler("timeline_layer_scan_disposable_keys", r, timeline_layer_scan_disposable_keys),
+        )
         .post(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/block_gc",
             |r| api_handler(r, timeline_gc_blocking_handler),
@@ -2092,6 +2092,7 @@ pub(crate) struct WalIngestMetrics {
     pub(crate) records_received: IntCounter,
     pub(crate) records_committed: IntCounter,
     pub(crate) records_filtered: IntCounter,
+    pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter,
 }

 pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
@@ -2115,6 +2116,11 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMet
|
||||
"Number of WAL records filtered out due to sharding"
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
gap_blocks_zeroed_on_rel_extend: register_int_counter!(
|
||||
"pageserver_gap_blocks_zeroed_on_rel_extend",
|
||||
"Total number of zero gap blocks written on relation extends"
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
});
|
||||
|
||||
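The counter is registered once inside a Lazy static and then only incremented from the ingest path. A minimal sketch of that register-once/increment pattern with the metrics macros used here; the metric name below is made up:

use metrics::{register_int_counter, IntCounter};
use once_cell::sync::Lazy;

static EXAMPLE_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "example_gap_blocks_zeroed_total", // hypothetical name
        "Illustrative counter following the WAL_INGEST pattern"
    )
    .expect("failed to define a metric")
});

fn on_rel_extend(gap_blocks: u64) {
    // Incremented from the hot path; registration happened once above.
    EXAMPLE_COUNTER.inc_by(gap_blocks);
}
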
pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
@@ -3034,13 +3040,111 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
}

pub mod tokio_epoll_uring {
    use metrics::{register_int_counter, UIntGauge};
    use std::{
        collections::HashMap,
        sync::{Arc, Mutex},
    };

    use metrics::{register_histogram, register_int_counter, Histogram, LocalHistogram, UIntGauge};
    use once_cell::sync::Lazy;

    /// Shared storage for tokio-epoll-uring thread local metrics.
    pub(crate) static THREAD_LOCAL_METRICS_STORAGE: Lazy<ThreadLocalMetricsStorage> =
        Lazy::new(|| {
            let slots_submission_queue_depth = register_histogram!(
                "pageserver_tokio_epoll_uring_slots_submission_queue_depth",
                "The slots waiters queue depth of each tokio_epoll_uring system",
                vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
            )
            .expect("failed to define a metric");
            ThreadLocalMetricsStorage {
                observers: Mutex::new(HashMap::new()),
                slots_submission_queue_depth,
            }
        });

    pub struct ThreadLocalMetricsStorage {
        /// List of thread local metrics observers.
        observers: Mutex<HashMap<u64, Arc<ThreadLocalMetrics>>>,
        /// A histogram shared between all thread local systems
        /// for collecting slots submission queue depth.
        slots_submission_queue_depth: Histogram,
    }

    /// Each thread-local [`tokio_epoll_uring::System`] gets one of these as its
    /// [`tokio_epoll_uring::metrics::PerSystemMetrics`] generic.
    ///
    /// The System makes observations into [`Self`] and periodically, the collector
    /// comes along and flushes [`Self`] into the shared storage [`THREAD_LOCAL_METRICS_STORAGE`].
    ///
    /// [`LocalHistogram`] is `!Send`, so, we need to put it behind a [`Mutex`].
    /// But except for the periodic flush, the lock is uncontended so there's no waiting
    /// for cache coherence protocol to get an exclusive cache line.
    pub struct ThreadLocalMetrics {
        /// Local observer of thread local tokio-epoll-uring system's slots waiters queue depth.
        slots_submission_queue_depth: Mutex<LocalHistogram>,
    }

    impl ThreadLocalMetricsStorage {
        /// Registers a new thread local system. Returns a thread local metrics observer.
        pub fn register_system(&self, id: u64) -> Arc<ThreadLocalMetrics> {
            let per_system_metrics = Arc::new(ThreadLocalMetrics::new(
                self.slots_submission_queue_depth.local(),
            ));
            let mut g = self.observers.lock().unwrap();
            g.insert(id, Arc::clone(&per_system_metrics));
            per_system_metrics
        }

        /// Removes metrics observer for a thread local system.
        /// This should be called before dropping a thread local system.
        pub fn remove_system(&self, id: u64) {
            let mut g = self.observers.lock().unwrap();
            g.remove(&id);
        }

        /// Flush all thread local metrics to the shared storage.
        pub fn flush_thread_local_metrics(&self) {
            let g = self.observers.lock().unwrap();
            g.values().for_each(|local| {
                local.flush();
            });
        }
    }

    impl ThreadLocalMetrics {
        pub fn new(slots_submission_queue_depth: LocalHistogram) -> Self {
            ThreadLocalMetrics {
                slots_submission_queue_depth: Mutex::new(slots_submission_queue_depth),
            }
        }

        /// Flushes the thread local metrics to shared aggregator.
        pub fn flush(&self) {
            let Self {
                slots_submission_queue_depth,
            } = self;
            slots_submission_queue_depth.lock().unwrap().flush();
        }
    }

    impl tokio_epoll_uring::metrics::PerSystemMetrics for ThreadLocalMetrics {
        fn observe_slots_submission_queue_depth(&self, queue_depth: u64) {
            let Self {
                slots_submission_queue_depth,
            } = self;
            slots_submission_queue_depth
                .lock()
                .unwrap()
                .observe(queue_depth as f64);
        }
    }

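The LocalHistogram indirection above is what keeps the hot path cheap: each system observes into its own local buffer, and only the periodic flush touches the shared histogram. A minimal standalone sketch of that flush pattern, assuming the prometheus types that the metrics crate re-exports:

use prometheus::{Histogram, HistogramOpts};
use std::sync::Mutex;

fn main() {
    let shared = Histogram::with_opts(HistogramOpts::new(
        "example_queue_depth", // hypothetical metric
        "Sketch of the LocalHistogram flush pattern",
    ))
    .unwrap();

    // Per-thread observer; LocalHistogram is !Send, hence the Mutex in the real code.
    let local = Mutex::new(shared.local());

    // Hot path: cheap, uncontended observations into the local buffer.
    for depth in [1.0, 4.0, 2.0] {
        local.lock().unwrap().observe(depth);
    }

    // Periodic flush publishes the buffered samples to the shared histogram.
    local.lock().unwrap().flush();
    assert_eq!(shared.get_sample_count(), 3);
}
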
    pub struct Collector {
        descs: Vec<metrics::core::Desc>,
        systems_created: UIntGauge,
        systems_destroyed: UIntGauge,
        thread_local_metrics_storage: &'static ThreadLocalMetricsStorage,
    }

    impl metrics::core::Collector for Collector {
@@ -3050,7 +3154,7 @@ pub mod tokio_epoll_uring {

        fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
            let mut mfs = Vec::with_capacity(Self::NMETRICS);
            let tokio_epoll_uring::metrics::Metrics {
            let tokio_epoll_uring::metrics::GlobalMetrics {
                systems_created,
                systems_destroyed,
            } = tokio_epoll_uring::metrics::global();
@@ -3058,12 +3162,21 @@ pub mod tokio_epoll_uring {
            mfs.extend(self.systems_created.collect());
            self.systems_destroyed.set(systems_destroyed);
            mfs.extend(self.systems_destroyed.collect());

            self.thread_local_metrics_storage
                .flush_thread_local_metrics();

            mfs.extend(
                self.thread_local_metrics_storage
                    .slots_submission_queue_depth
                    .collect(),
            );
            mfs
        }
    }

    impl Collector {
        const NMETRICS: usize = 2;
        const NMETRICS: usize = 3;

        #[allow(clippy::new_without_default)]
        pub fn new() -> Self {
@@ -3095,6 +3208,7 @@ pub mod tokio_epoll_uring {
            descs,
            systems_created,
            systems_destroyed,
            thread_local_metrics_storage: &THREAD_LOCAL_METRICS_STORAGE,
        }
    }
}
@@ -3454,6 +3568,7 @@ pub fn preinitialize_metrics() {
    Lazy::force(&RECONSTRUCT_TIME);
    Lazy::force(&BASEBACKUP_QUERY_TIME);
    Lazy::force(&COMPUTE_COMMANDS_COUNTERS);
    Lazy::force(&tokio_epoll_uring::THREAD_LOCAL_METRICS_STORAGE);

    tenant_throttling::preinitialize_global_metrics();
}

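Hooking the flush into collect() means the thread-local buffers are drained exactly when Prometheus scrapes, so no separate flusher task is needed. A hedged sketch of a custom prometheus collector with the same shape; the names are illustrative:

use prometheus::core::{Collector, Desc};
use prometheus::proto::MetricFamily;
use prometheus::Histogram;

struct FlushOnScrape {
    descs: Vec<Desc>,
    histogram: Histogram,
}

impl FlushOnScrape {
    fn new(histogram: Histogram) -> Self {
        let descs = histogram.desc().into_iter().cloned().collect();
        Self { descs, histogram }
    }
}

impl Collector for FlushOnScrape {
    fn desc(&self) -> Vec<&Desc> {
        self.descs.iter().collect()
    }

    fn collect(&self) -> Vec<MetricFamily> {
        // In the real code this is the point where thread-local metrics get
        // flushed into `histogram` before it is collected.
        self.histogram.collect()
    }
}
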
@@ -82,6 +82,7 @@ use once_cell::sync::OnceCell;
use crate::{
    context::RequestContext,
    metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics},
    virtual_file::{IoBufferMut, IoPageSlice},
};

static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
@@ -144,7 +145,7 @@ struct SlotInner {
    key: Option<CacheKey>,
    // for `coalesce_readers_permit`
    permit: std::sync::Mutex<Weak<PinnedSlotsPermit>>,
    buf: &'static mut [u8; PAGE_SZ],
    buf: IoPageSlice<'static>,
}

impl Slot {
@@ -234,13 +235,13 @@ impl std::ops::Deref for PageReadGuard<'_> {
    type Target = [u8; PAGE_SZ];

    fn deref(&self) -> &Self::Target {
        self.slot_guard.buf
        self.slot_guard.buf.deref()
    }
}

impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
    fn as_ref(&self) -> &[u8; PAGE_SZ] {
        self.slot_guard.buf
        self.slot_guard.buf.as_ref()
    }
}

@@ -266,7 +267,7 @@ enum PageWriteGuardState<'i> {
impl std::ops::DerefMut for PageWriteGuard<'_> {
    fn deref_mut(&mut self) -> &mut Self::Target {
        match &mut self.state {
            PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
            PageWriteGuardState::Invalid { inner, _permit } => inner.buf.deref_mut(),
            PageWriteGuardState::Downgraded => unreachable!(),
        }
    }
@@ -277,7 +278,7 @@ impl std::ops::Deref for PageWriteGuard<'_> {

    fn deref(&self) -> &Self::Target {
        match &self.state {
            PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
            PageWriteGuardState::Invalid { inner, _permit } => inner.buf.deref(),
            PageWriteGuardState::Downgraded => unreachable!(),
        }
    }
@@ -643,7 +644,7 @@ impl PageCache {
        // We could use Vec::leak here, but that potentially also leaks
        // uninitialized reserved capacity. With into_boxed_slice and Box::leak
        // this is avoided.
        let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());
        let page_buffer = IoBufferMut::with_capacity_zeroed(num_pages * PAGE_SZ).leak();

        let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
        size_metrics.max_bytes.set_page_sz(num_pages);
@@ -652,7 +653,8 @@ impl PageCache {
        let slots = page_buffer
            .chunks_exact_mut(PAGE_SZ)
            .map(|chunk| {
                let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();
                // SAFETY: Each chunk has `PAGE_SZ` (8192) bytes, greater than 512, still aligned.
                let buf = unsafe { IoPageSlice::new_unchecked(chunk.try_into().unwrap()) };

                Slot {
                    inner: tokio::sync::RwLock::new(SlotInner {

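The cache backs every slot with a single leaked, zero-initialized allocation and hands each slot a fixed-size 'static view into it. A standalone sketch of that carve-up, following the pre-change Box::leak variant shown above (the PAGE_SZ value is assumed for illustration):

const PAGE_SZ: usize = 8192;

fn main() {
    let num_pages = 4;
    // One allocation for the whole cache; leaking yields 'static slices
    // without reference counting on the hot path.
    let page_buffer: &'static mut [u8] =
        Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());

    let slots: Vec<&'static mut [u8; PAGE_SZ]> = page_buffer
        .chunks_exact_mut(PAGE_SZ)
        .map(|chunk| <&mut [u8; PAGE_SZ]>::try_from(chunk).unwrap())
        .collect();

    assert_eq!(slots.len(), num_pages);
}
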
@@ -1506,35 +1506,42 @@ impl<'a> DatadirModification<'a> {
        Ok(())
    }

    /// Drop a relation.
    pub async fn put_rel_drop(&mut self, rel: RelTag, ctx: &RequestContext) -> anyhow::Result<()> {
        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
    /// Drop some relations
    pub(crate) async fn put_rel_drops(
        &mut self,
        drop_relations: HashMap<(u32, u32), Vec<RelTag>>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        for ((spc_node, db_node), rel_tags) in drop_relations {
            let dir_key = rel_dir_to_key(spc_node, db_node);
            let buf = self.get(dir_key, ctx).await?;
            let mut dir = RelDirectory::des(&buf)?;

        // Remove it from the directory entry
        let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
        let buf = self.get(dir_key, ctx).await?;
        let mut dir = RelDirectory::des(&buf)?;
            let mut dirty = false;
            for rel_tag in rel_tags {
                if dir.rels.remove(&(rel_tag.relnode, rel_tag.forknum)) {
                    dirty = true;

        self.pending_directory_entries
            .push((DirectoryKind::Rel, dir.rels.len()));
                    // update logical size
                    let size_key = rel_size_to_key(rel_tag);
                    let old_size = self.get(size_key, ctx).await?.get_u32_le();
                    self.pending_nblocks -= old_size as i64;

        if dir.rels.remove(&(rel.relnode, rel.forknum)) {
            self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?)));
        } else {
            warn!("dropped rel {} did not exist in rel directory", rel);
                    // Remove entry from relation size cache
                    self.tline.remove_cached_rel_size(&rel_tag);

                    // Delete size entry, as well as all blocks
                    self.delete(rel_key_range(rel_tag));
                }
            }

            if dirty {
                self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?)));
                self.pending_directory_entries
                    .push((DirectoryKind::Rel, dir.rels.len()));
            }
        }

        // update logical size
        let size_key = rel_size_to_key(rel);
        let old_size = self.get(size_key, ctx).await?.get_u32_le();
        self.pending_nblocks -= old_size as i64;

        // Remove entry from relation size cache
        self.tline.remove_cached_rel_size(&rel);

        // Delete size entry, as well as all blocks
        self.delete(rel_key_range(rel));

        Ok(())
    }

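put_rel_drops takes relations pre-grouped by (spcnode, dbnode), so each rel directory is loaded, mutated, and rewritten once per database rather than once per relation. A sketch of building that map; the RelTag shape is simplified:

use std::collections::HashMap;

// Simplified stand-in for pageserver's RelTag.
#[derive(Clone, Copy)]
struct RelTag {
    spcnode: u32,
    dbnode: u32,
    relnode: u32,
    forknum: u8,
}

fn group_rel_drops(rels: Vec<RelTag>) -> HashMap<(u32, u32), Vec<RelTag>> {
    let mut drop_relations: HashMap<(u32, u32), Vec<RelTag>> = HashMap::new();
    for rel in rels {
        // One directory rewrite per (tablespace, database) pair.
        drop_relations
            .entry((rel.spcnode, rel.dbnode))
            .or_default()
            .push(rel);
    }
    drop_relations
}
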
(File diff suppressed because it is too large.)
@@ -5,6 +5,8 @@
use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
use crate::context::RequestContext;
use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ};
#[cfg(test)]
use crate::virtual_file::IoBufferMut;
use crate::virtual_file::VirtualFile;
use bytes::Bytes;
use std::ops::Deref;
@@ -40,7 +42,7 @@ pub enum BlockLease<'a> {
    #[cfg(test)]
    Arc(std::sync::Arc<[u8; PAGE_SZ]>),
    #[cfg(test)]
    Vec(Vec<u8>),
    IoBufferMut(IoBufferMut),
}

impl From<PageReadGuard<'static>> for BlockLease<'static> {
@@ -67,7 +69,7 @@ impl Deref for BlockLease<'_> {
            #[cfg(test)]
            BlockLease::Arc(v) => v.deref(),
            #[cfg(test)]
            BlockLease::Vec(v) => {
            BlockLease::IoBufferMut(v) => {
                TryFrom::try_from(&v[..]).expect("caller must ensure that v has PAGE_SZ")
            }
        }

@@ -6,10 +6,11 @@ use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::page_cache;
use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File;
use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut;
use crate::virtual_file::owned_buffers_io::slice::SliceMutExt;
use crate::virtual_file::owned_buffers_io::util::size_tracking_writer;
use crate::virtual_file::owned_buffers_io::write::Buffer;
use crate::virtual_file::{self, owned_buffers_io, VirtualFile};
use crate::virtual_file::{self, owned_buffers_io, IoBufferMut, VirtualFile};
use bytes::BytesMut;
use camino::Utf8PathBuf;
use num_traits::Num;
@@ -107,15 +108,18 @@ impl EphemeralFile {
        self.page_cache_file_id
    }

    pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result<Vec<u8>, io::Error> {
    pub(crate) async fn load_to_io_buf(
        &self,
        ctx: &RequestContext,
    ) -> Result<IoBufferMut, io::Error> {
        let size = self.len().into_usize();
        let vec = Vec::with_capacity(size);
        let (slice, nread) = self.read_exact_at_eof_ok(0, vec.slice_full(), ctx).await?;
        let buf = IoBufferMut::with_capacity(size);
        let (slice, nread) = self.read_exact_at_eof_ok(0, buf.slice_full(), ctx).await?;
        assert_eq!(nread, size);
        let vec = slice.into_inner();
        assert_eq!(vec.len(), nread);
        assert_eq!(vec.capacity(), size, "we shouldn't be reallocating");
        Ok(vec)
        let buf = slice.into_inner();
        assert_eq!(buf.len(), nread);
        assert_eq!(buf.capacity(), size, "we shouldn't be reallocating");
        Ok(buf)
    }

    /// Returns the offset at which the first byte of the input was written, for use
@@ -158,7 +162,7 @@ impl EphemeralFile {
    }

impl super::storage_layer::inmemory_layer::vectored_dio_read::File for EphemeralFile {
    async fn read_exact_at_eof_ok<'a, 'b, B: tokio_epoll_uring::IoBufMut + Send>(
    async fn read_exact_at_eof_ok<'a, 'b, B: IoBufAlignedMut + Send>(
        &'b self,
        start: u64,
        dst: tokio_epoll_uring::Slice<B>,
@@ -345,7 +349,7 @@ mod tests {
        assert!(file.len() as usize == write_nbytes);
        for i in 0..write_nbytes {
            assert_eq!(value_offsets[i], i.into_u64());
            let buf = Vec::with_capacity(1);
            let buf = IoBufferMut::with_capacity(1);
            let (buf_slice, nread) = file
                .read_exact_at_eof_ok(i.into_u64(), buf.slice_full(), &ctx)
                .await
@@ -385,7 +389,7 @@ mod tests {

        // assert the state is as this test expects it to be
        assert_eq!(
            &file.load_to_vec(&ctx).await.unwrap(),
            &file.load_to_io_buf(&ctx).await.unwrap(),
            &content[0..cap + cap / 2]
        );
        let md = file
@@ -440,7 +444,7 @@ mod tests {
        let (buf, nread) = file
            .read_exact_at_eof_ok(
                start.into_u64(),
                Vec::with_capacity(len).slice_full(),
                IoBufferMut::with_capacity(len).slice_full(),
                ctx,
            )
            .await

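load_to_io_buf sizes the buffer up front and asserts afterwards that the capacity never changed, which makes an accidental reallocation in the IO path fail loudly. The same invariant expressed with plain std, as a sketch:

use std::fs::File;
use std::io::{self, Read};

fn load_exact(path: &str) -> io::Result<Vec<u8>> {
    let mut file = File::open(path)?;
    let size = file.metadata()?.len() as usize;
    let mut buf = vec![0u8; size];
    file.read_exact(&mut buf)?;
    // Mirrors the "we shouldn't be reallocating" assertion above.
    assert_eq!(buf.capacity(), size);
    Ok(buf)
}
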
@@ -180,6 +180,7 @@

pub(crate) mod download;
pub mod index;
pub mod manifest;
pub(crate) mod upload;

use anyhow::Context;
@@ -191,7 +192,6 @@ use pageserver_api::models::TimelineArchivalState;
use pageserver_api::shard::{ShardIndex, TenantShardId};
use scopeguard::ScopeGuard;
use tokio_util::sync::CancellationToken;
pub(crate) use upload::upload_initdb_dir;
use utils::backoff::{
    self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
};
@@ -245,9 +245,11 @@ use super::upload_queue::{NotInitialized, SetDeletedFlagProgress};
use super::Generation;

pub(crate) use download::{
    download_index_part, is_temp_download_file, list_remote_tenant_shards, list_remote_timelines,
    do_download_tenant_manifest, download_index_part, is_temp_download_file,
    list_remote_tenant_shards, list_remote_timelines,
};
pub(crate) use index::LayerFileMetadata;
pub(crate) use upload::{upload_initdb_dir, upload_tenant_manifest};

// Occasional network issues and such can cause remote operations to fail, and
// that's expected. If a download fails, we log it at info-level, and retry.
@@ -272,6 +274,12 @@ pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
/// which we warn and skip.
const DELETION_QUEUE_FLUSH_TIMEOUT: Duration = Duration::from_secs(10);

/// Hardcode a generation for the tenant manifest for now so that we don't
/// need to deal with generation-less manifests in the future.
///
/// TODO: add proper generation support to all the places that use this.
pub(crate) const TENANT_MANIFEST_GENERATION: Generation = Generation::new(1);

pub enum MaybeDeletedIndexPart {
    IndexPart(IndexPart),
    Deleted(IndexPart),
@@ -295,6 +303,10 @@ pub enum WaitCompletionError {
    UploadQueueShutDownOrStopped,
}

#[derive(Debug, thiserror::Error)]
#[error("Upload queue either in unexpected state or hasn't downloaded manifest yet")]
pub struct UploadQueueNotReadyError;

/// A client for accessing a timeline's data in remote storage.
///
/// This takes care of managing the number of connections, and balancing them
@@ -468,6 +480,20 @@ impl RemoteTimelineClient {
            .ok()
    }

    /// Returns `Ok(Some(timestamp))` if the timeline has been archived, `Ok(None)` if the timeline hasn't been archived.
    ///
    /// Return Err(_) if the remote index_part hasn't been downloaded yet, or the timeline hasn't been stopped yet.
    pub(crate) fn archived_at_stopped_queue(
        &self,
    ) -> Result<Option<NaiveDateTime>, UploadQueueNotReadyError> {
        self.upload_queue
            .lock()
            .unwrap()
            .stopped_mut()
            .map(|q| q.upload_queue_for_deletion.clean.0.archived_at)
            .map_err(|_| UploadQueueNotReadyError)
    }

    fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) {
        let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part {
            current_remote_index_part
@@ -1252,10 +1278,14 @@ impl RemoteTimelineClient {
        let fut = {
            let mut guard = self.upload_queue.lock().unwrap();
            let upload_queue = match &mut *guard {
                UploadQueue::Stopped(_) => return,
                UploadQueue::Stopped(_) => {
                    scopeguard::ScopeGuard::into_inner(sg);
                    return;
                }
                UploadQueue::Uninitialized => {
                    // transition into Stopped state
                    self.stop_impl(&mut guard);
                    scopeguard::ScopeGuard::into_inner(sg);
                    return;
                }
                UploadQueue::Initialized(ref mut init) => init,
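The early-return arms now defuse the scope guard with ScopeGuard::into_inner, so the guard's cleanup closure fires only if the function is left some other way. A minimal sketch of that defuse-on-success pattern:

fn main() {
    let sg = scopeguard::guard((), |()| {
        // Runs only if we leave the scope without defusing the guard.
        eprintln!("cleanup ran");
    });

    let success = true;
    if success {
        // Defuse: take the inner value back, skipping the cleanup closure.
        scopeguard::ScopeGuard::into_inner(sg);
        return;
    }
    // If we fall off here instead, `sg` drops and the cleanup runs.
}
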
@@ -2198,6 +2228,17 @@ pub fn remote_tenant_path(tenant_shard_id: &TenantShardId) -> RemotePath {
    RemotePath::from_string(&path).expect("Failed to construct path")
}

pub fn remote_tenant_manifest_path(
    tenant_shard_id: &TenantShardId,
    generation: Generation,
) -> RemotePath {
    let path = format!(
        "tenants/{tenant_shard_id}/tenant-manifest{}.json",
        generation.get_suffix()
    );
    RemotePath::from_string(&path).expect("Failed to construct path")
}

pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath {
    let path = format!("tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}");
    RemotePath::from_string(&path).expect("Failed to construct path")

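Embedding the generation in the object name lets a newer generation upload its manifest without clobbering the previous one. A sketch of the key construction; the suffix format is an assumption for illustration, the real one comes from Generation::get_suffix:

fn manifest_key(tenant_shard_id: &str, generation: Option<u32>) -> String {
    // Hypothetical suffix scheme, for illustration only.
    let suffix = match generation {
        Some(g) => format!("-{g:08x}"),
        None => String::new(),
    };
    format!("tenants/{tenant_shard_id}/tenant-manifest{suffix}.json")
}

fn main() {
    assert_eq!(
        manifest_key("t1", Some(1)),
        "tenants/t1/tenant-manifest-00000001.json"
    );
}
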
@@ -34,10 +34,11 @@ use utils::id::{TenantId, TimelineId};
use utils::pausable_failpoint;

use super::index::{IndexPart, LayerFileMetadata};
use super::manifest::TenantManifest;
use super::{
    parse_remote_index_path, remote_index_path, remote_initdb_archive_path,
    remote_initdb_preserved_archive_path, remote_tenant_path, FAILED_DOWNLOAD_WARN_THRESHOLD,
    FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
    remote_initdb_preserved_archive_path, remote_tenant_manifest_path, remote_tenant_path,
    FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
};

///
@@ -338,19 +339,15 @@ pub async fn list_remote_timelines(
    list_identifiers::<TimelineId>(storage, remote_path, cancel).await
}

async fn do_download_index_part(
async fn do_download_remote_path_retry_forever(
    storage: &GenericRemoteStorage,
    tenant_shard_id: &TenantShardId,
    timeline_id: &TimelineId,
    index_generation: Generation,
    remote_path: &RemotePath,
    cancel: &CancellationToken,
) -> Result<(IndexPart, Generation, SystemTime), DownloadError> {
    let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);

    let (index_part_bytes, index_part_mtime) = download_retry_forever(
) -> Result<(Vec<u8>, SystemTime), DownloadError> {
    download_retry_forever(
        || async {
            let download = storage
                .download(&remote_path, &DownloadOpts::default(), cancel)
                .download(remote_path, &DownloadOpts::default(), cancel)
                .await?;

            let mut bytes = Vec::new();
@@ -365,7 +362,39 @@ async fn do_download_index_part(
        &format!("download {remote_path:?}"),
        cancel,
    )
    .await?;
    .await
}

pub async fn do_download_tenant_manifest(
    storage: &GenericRemoteStorage,
    tenant_shard_id: &TenantShardId,
    cancel: &CancellationToken,
) -> Result<(TenantManifest, Generation), DownloadError> {
    // TODO: generation support
    let generation = super::TENANT_MANIFEST_GENERATION;
    let remote_path = remote_tenant_manifest_path(tenant_shard_id, generation);

    let (manifest_bytes, _manifest_bytes_mtime) =
        do_download_remote_path_retry_forever(storage, &remote_path, cancel).await?;

    let tenant_manifest = TenantManifest::from_json_bytes(&manifest_bytes)
        .with_context(|| format!("deserialize tenant manifest file at {remote_path:?}"))
        .map_err(DownloadError::Other)?;

    Ok((tenant_manifest, generation))
}

async fn do_download_index_part(
    storage: &GenericRemoteStorage,
    tenant_shard_id: &TenantShardId,
    timeline_id: &TimelineId,
    index_generation: Generation,
    cancel: &CancellationToken,
) -> Result<(IndexPart, Generation, SystemTime), DownloadError> {
    let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);

    let (index_part_bytes, index_part_mtime) =
        do_download_remote_path_retry_forever(storage, &remote_path, cancel).await?;

    let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
        .with_context(|| format!("deserialize index part file at {remote_path:?}"))

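Both the index-part and the manifest downloads now go through one shared retry-forever helper. A minimal sketch of the overall shape of such a helper, assuming tokio: retry a fallible async operation with capped backoff until it succeeds or the token is cancelled (the real code additionally distinguishes permanent errors):

use std::time::Duration;
use tokio_util::sync::CancellationToken;

async fn retry_forever<T, E, F, Fut>(mut op: F, cancel: &CancellationToken) -> Option<T>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = Result<T, E>>,
{
    let mut attempt = 0u32;
    loop {
        match op().await {
            Ok(v) => return Some(v),
            Err(_) if cancel.is_cancelled() => return None,
            Err(_) => {
                // Capped exponential backoff between attempts.
                let delay = Duration::from_millis(100 * 2u64.pow(attempt.min(6)));
                tokio::time::sleep(delay).await;
                attempt += 1;
            }
        }
    }
}
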
@@ -121,11 +121,11 @@ impl IndexPart {
        self.disk_consistent_lsn
    }

    pub fn from_s3_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
    pub fn from_json_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
        serde_json::from_slice::<IndexPart>(bytes)
    }

    pub fn to_s3_bytes(&self) -> serde_json::Result<Vec<u8>> {
    pub fn to_json_bytes(&self) -> serde_json::Result<Vec<u8>> {
        serde_json::to_vec(self)
    }

@@ -383,7 +383,7 @@ mod tests {
            last_aux_file_policy: None,
        };

        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
        assert_eq!(part, expected);
    }

@@ -427,7 +427,7 @@ mod tests {
            last_aux_file_policy: None,
        };

        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
        assert_eq!(part, expected);
    }

@@ -472,7 +472,7 @@ mod tests {
            last_aux_file_policy: None,
        };

        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
        assert_eq!(part, expected);
    }

@@ -520,7 +520,7 @@ mod tests {
            last_aux_file_policy: None,
        };

        let empty_layers_parsed = IndexPart::from_s3_bytes(empty_layers_json.as_bytes()).unwrap();
        let empty_layers_parsed = IndexPart::from_json_bytes(empty_layers_json.as_bytes()).unwrap();

        assert_eq!(empty_layers_parsed, expected);
    }
@@ -563,7 +563,7 @@ mod tests {
            last_aux_file_policy: None,
        };

        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
        assert_eq!(part, expected);
    }

@@ -609,7 +609,7 @@ mod tests {
            last_aux_file_policy: None,
        };

        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
        assert_eq!(part, expected);
    }

@@ -660,7 +660,7 @@ mod tests {
            last_aux_file_policy: Some(AuxFilePolicy::V2),
        };

        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
        assert_eq!(part, expected);
    }

@@ -716,7 +716,7 @@ mod tests {
            last_aux_file_policy: Default::default(),
        };

        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
        assert_eq!(part, expected);
    }

@@ -773,7 +773,7 @@ mod tests {
            last_aux_file_policy: Default::default(),
        };

        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
        assert_eq!(part, expected);
    }

@@ -835,7 +835,7 @@ mod tests {
            archived_at: None,
        };

        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
        assert_eq!(part, expected);
    }

pageserver/src/tenant/remote_timeline_client/manifest.rs (new file, 53 lines)
@@ -0,0 +1,53 @@
use chrono::NaiveDateTime;
use serde::{Deserialize, Serialize};
use utils::{id::TimelineId, lsn::Lsn};

/// Tenant-shard scoped manifest
#[derive(Clone, Serialize, Deserialize)]
pub struct TenantManifest {
    /// Debugging aid describing the version of this manifest.
    /// Can also be used for distinguishing breaking changes later on.
    pub version: usize,

    /// The list of offloaded timelines together with enough information
    /// to not have to actually load them.
    ///
    /// Note: the timelines mentioned in this list might be deleted, i.e.
    /// we don't hold an invariant that the references aren't dangling.
    /// Existence of index-part.json is the actual indicator of timeline existence.
    pub offloaded_timelines: Vec<OffloadedTimelineManifest>,
}

/// The remote level representation of an offloaded timeline.
///
/// Very similar to [`pageserver_api::models::OffloadedTimelineInfo`],
/// but the two datastructures serve different needs, this is for a persistent disk format
/// that must be backwards compatible, while the other is only for informative purposes.
#[derive(Clone, Serialize, Deserialize, Copy)]
pub struct OffloadedTimelineManifest {
    pub timeline_id: TimelineId,
    /// Whether the timeline has a parent it has been branched off from or not
    pub ancestor_timeline_id: Option<TimelineId>,
    /// Whether to retain the branch lsn at the ancestor or not
    pub ancestor_retain_lsn: Option<Lsn>,
    /// The time point when the timeline was archived
    pub archived_at: NaiveDateTime,
}

pub const LATEST_TENANT_MANIFEST_VERSION: usize = 1;

impl TenantManifest {
    pub(crate) fn empty() -> Self {
        Self {
            version: LATEST_TENANT_MANIFEST_VERSION,
            offloaded_timelines: vec![],
        }
    }
    pub(crate) fn from_json_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
        serde_json::from_slice::<Self>(bytes)
    }

    pub(crate) fn to_json_bytes(&self) -> serde_json::Result<Vec<u8>> {
        serde_json::to_vec(self)
    }
}
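The manifest is a plain serde JSON document with an explicit version field so breaking format changes can be detected later. A sketch of the roundtrip with a simplified struct:

use serde::{Deserialize, Serialize};

#[derive(Debug, PartialEq, Serialize, Deserialize)]
struct Manifest {
    version: usize,
    offloaded_timelines: Vec<String>, // simplified: real entries carry IDs and LSNs
}

fn main() -> serde_json::Result<()> {
    let m = Manifest {
        version: 1,
        offloaded_timelines: vec![],
    };
    let bytes = serde_json::to_vec(&m)?;
    let back: Manifest = serde_json::from_slice(&bytes)?;
    assert_eq!(m, back);
    Ok(())
}
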
@@ -13,9 +13,11 @@ use tokio_util::sync::CancellationToken;
use utils::{backoff, pausable_failpoint};

use super::index::IndexPart;
use super::manifest::TenantManifest;
use super::Generation;
use crate::tenant::remote_timeline_client::{
    remote_index_path, remote_initdb_archive_path, remote_initdb_preserved_archive_path,
    remote_tenant_manifest_path,
};
use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError};
use utils::id::{TenantId, TimelineId};
@@ -39,7 +41,7 @@ pub(crate) async fn upload_index_part<'a>(
    pausable_failpoint!("before-upload-index-pausable");

    // FIXME: this error comes too late
    let serialized = index_part.to_s3_bytes()?;
    let serialized = index_part.to_json_bytes()?;
    let serialized = Bytes::from(serialized);

    let index_part_size = serialized.len();
@@ -55,6 +57,37 @@ pub(crate) async fn upload_index_part<'a>(
    .await
    .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
}
/// Serializes and uploads the given tenant manifest data to the remote storage.
pub(crate) async fn upload_tenant_manifest(
    storage: &GenericRemoteStorage,
    tenant_shard_id: &TenantShardId,
    generation: Generation,
    tenant_manifest: &TenantManifest,
    cancel: &CancellationToken,
) -> anyhow::Result<()> {
    tracing::trace!("uploading new tenant manifest");

    fail_point!("before-upload-manifest", |_| {
        bail!("failpoint before-upload-manifest")
    });
    pausable_failpoint!("before-upload-manifest-pausable");

    let serialized = tenant_manifest.to_json_bytes()?;
    let serialized = Bytes::from(serialized);

    let tenant_manifest_size = serialized.len();

    let remote_path = remote_tenant_manifest_path(tenant_shard_id, generation);
    storage
        .upload_storage_object(
            futures::stream::once(futures::future::ready(Ok(serialized))),
            tenant_manifest_size,
            &remote_path,
            cancel,
        )
        .await
        .with_context(|| format!("upload tenant manifest for '{tenant_shard_id}'"))
}

/// Attempts to upload given layer files.
/// No extra checks for overlapping files is made and any files that are already present remotely will be overwritten, if submitted during the upload.

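The failpoints guard the upload so tests can inject failures or pauses deterministically without touching the code path. A minimal sketch with the fail crate (requires its failpoints feature; the failpoint name below is illustrative):

fn flaky_step() -> Result<(), String> {
    // Evaluates to the closure's value when the failpoint is configured.
    fail::fail_point!("before-example-step", |_| Err("injected".to_string()));
    Ok(())
}

fn main() {
    assert!(flaky_step().is_ok());
    // Tests flip the failpoint on by name at runtime.
    fail::cfg("before-example-step", "return").unwrap();
    assert!(flaky_step().is_err());
}
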
@@ -187,6 +187,8 @@ pub(super) async fn gather_inputs(
    // but it is unlikely to cause any issues. In the worst case,
    // the calculation will error out.
    timelines.retain(|t| t.is_active());
    // Also filter out archived timelines.
    timelines.retain(|t| t.is_archived() != Some(true));

    // Build a map of branch points.
    let mut branchpoints: HashMap<TimelineId, HashSet<Lsn>> = HashMap::new();

@@ -1,5 +1,6 @@
//! Common traits and structs for layers

pub mod batch_split_writer;
pub mod delta_layer;
pub mod filter_iterator;
pub mod image_layer;
@@ -8,7 +9,6 @@ pub(crate) mod layer;
mod layer_desc;
mod layer_name;
pub mod merge_iterator;
pub mod split_writer;

use crate::context::{AccessStatsBehavior, RequestContext};
use crate::repository::Value;

@@ -12,41 +12,154 @@ use super::{
    DeltaLayerWriter, ImageLayerWriter, PersistentLayerDesc, PersistentLayerKey, ResidentLayer,
};

pub(crate) enum SplitWriterResult {
pub(crate) enum BatchWriterResult {
    Produced(ResidentLayer),
    Discarded(PersistentLayerKey),
}

#[cfg(test)]
impl SplitWriterResult {
impl BatchWriterResult {
    fn into_resident_layer(self) -> ResidentLayer {
        match self {
            SplitWriterResult::Produced(layer) => layer,
            SplitWriterResult::Discarded(_) => panic!("unexpected discarded layer"),
            BatchWriterResult::Produced(layer) => layer,
            BatchWriterResult::Discarded(_) => panic!("unexpected discarded layer"),
        }
    }

    fn into_discarded_layer(self) -> PersistentLayerKey {
        match self {
            SplitWriterResult::Produced(_) => panic!("unexpected produced layer"),
            SplitWriterResult::Discarded(layer) => layer,
            BatchWriterResult::Produced(_) => panic!("unexpected produced layer"),
            BatchWriterResult::Discarded(layer) => layer,
        }
    }
}

enum LayerWriterWrapper {
    Image(ImageLayerWriter),
    Delta(DeltaLayerWriter),
}

/// A layer writer that takes unfinished layers and finishes them atomically.
#[must_use]
pub struct BatchLayerWriter {
    generated_layer_writers: Vec<(LayerWriterWrapper, PersistentLayerKey)>,
    conf: &'static PageServerConf,
}

impl BatchLayerWriter {
    pub async fn new(conf: &'static PageServerConf) -> anyhow::Result<Self> {
        Ok(Self {
            generated_layer_writers: Vec::new(),
            conf,
        })
    }

    pub fn add_unfinished_image_writer(
        &mut self,
        writer: ImageLayerWriter,
        key_range: Range<Key>,
        lsn: Lsn,
    ) {
        self.generated_layer_writers.push((
            LayerWriterWrapper::Image(writer),
            PersistentLayerKey {
                key_range,
                lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn),
                is_delta: false,
            },
        ));
    }

    pub fn add_unfinished_delta_writer(
        &mut self,
        writer: DeltaLayerWriter,
        key_range: Range<Key>,
        lsn_range: Range<Lsn>,
    ) {
        self.generated_layer_writers.push((
            LayerWriterWrapper::Delta(writer),
            PersistentLayerKey {
                key_range,
                lsn_range,
                is_delta: true,
            },
        ));
    }

    pub(crate) async fn finish_with_discard_fn<D, F>(
        self,
        tline: &Arc<Timeline>,
        ctx: &RequestContext,
        discard_fn: D,
    ) -> anyhow::Result<Vec<BatchWriterResult>>
    where
        D: Fn(&PersistentLayerKey) -> F,
        F: Future<Output = bool>,
    {
        let Self {
            generated_layer_writers,
            ..
        } = self;
        let clean_up_layers = |generated_layers: Vec<BatchWriterResult>| {
            for produced_layer in generated_layers {
                if let BatchWriterResult::Produced(resident_layer) = produced_layer {
                    let layer: Layer = resident_layer.into();
                    layer.delete_on_drop();
                }
            }
        };
        // BEGIN: catch every error and do the recovery in the below section
        let mut generated_layers: Vec<BatchWriterResult> = Vec::new();
        for (inner, layer_key) in generated_layer_writers {
            if discard_fn(&layer_key).await {
                generated_layers.push(BatchWriterResult::Discarded(layer_key));
            } else {
                let res = match inner {
                    LayerWriterWrapper::Delta(writer) => {
                        writer.finish(layer_key.key_range.end, ctx).await
                    }
                    LayerWriterWrapper::Image(writer) => {
                        writer
                            .finish_with_end_key(layer_key.key_range.end, ctx)
                            .await
                    }
                };
                let layer = match res {
                    Ok((desc, path)) => {
                        match Layer::finish_creating(self.conf, tline, desc, &path) {
                            Ok(layer) => layer,
                            Err(e) => {
                                tokio::fs::remove_file(&path).await.ok();
                                clean_up_layers(generated_layers);
                                return Err(e);
                            }
                        }
                    }
                    Err(e) => {
                        // Image/DeltaLayerWriter::finish will clean up the temporary layer if anything goes wrong,
                        // so we don't need to remove the layer we just failed to create by ourselves.
                        clean_up_layers(generated_layers);
                        return Err(e);
                    }
                };
                generated_layers.push(BatchWriterResult::Produced(layer));
            }
        }
        // END: catch every error and do the recovery in the above section
        Ok(generated_layers)
    }
}

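finish_with_discard_fn either produces every layer or, on the first failure, marks everything produced so far for deletion, which is what makes the batch atomic. A simplified, self-contained analog of that finish-or-roll-back loop (the types are stand-ins, not the real writer API):

enum BatchResult {
    Produced(String),
    Discarded(String),
}

/// Finish a batch of pending names: discard those the predicate rejects,
/// and roll back everything produced so far if any finish step fails.
fn finish_batch(
    pending: Vec<String>,
    discard: impl Fn(&str) -> bool,
    finish: impl Fn(&str) -> Result<String, String>,
) -> Result<Vec<BatchResult>, String> {
    let mut produced = Vec::new();
    for key in pending {
        if discard(&key) {
            produced.push(BatchResult::Discarded(key));
        } else {
            match finish(&key) {
                Ok(layer) => produced.push(BatchResult::Produced(layer)),
                Err(e) => {
                    // Roll back: in the real writer, dropping schedules file deletion.
                    drop(produced);
                    return Err(e);
                }
            }
        }
    }
    Ok(produced)
}
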
/// An image writer that takes images and produces multiple image layers.
///
/// The interface does not guarantee atomicity (i.e., if the image layer generation
/// fails, there might be leftover files to be cleaned up)
#[must_use]
pub struct SplitImageLayerWriter {
    inner: ImageLayerWriter,
    target_layer_size: u64,
    generated_layers: Vec<SplitWriterResult>,
    lsn: Lsn,
    conf: &'static PageServerConf,
    timeline_id: TimelineId,
    tenant_shard_id: TenantShardId,
    lsn: Lsn,
    batches: BatchLayerWriter,
    start_key: Key,
}

@@ -71,27 +184,21 @@ impl SplitImageLayerWriter {
                ctx,
            )
            .await?,
            generated_layers: Vec::new(),
            conf,
            timeline_id,
            tenant_shard_id,
            batches: BatchLayerWriter::new(conf).await?,
            lsn,
            start_key,
        })
    }

    pub async fn put_image_with_discard_fn<D, F>(
    pub async fn put_image(
        &mut self,
        key: Key,
        img: Bytes,
        tline: &Arc<Timeline>,
        ctx: &RequestContext,
        discard: D,
    ) -> anyhow::Result<()>
    where
        D: FnOnce(&PersistentLayerKey) -> F,
        F: Future<Output = bool>,
    {
    ) -> anyhow::Result<()> {
        // The current estimation is an upper bound of the space that the key/image could take
        // because we did not consider compression in this estimation. The resulting image layer
        // could be smaller than the target size.
@@ -109,72 +216,34 @@ impl SplitImageLayerWriter {
            )
            .await?;
            let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer);
            let layer_key = PersistentLayerKey {
                key_range: self.start_key..key,
                lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn),
                is_delta: false,
            };
            self.batches.add_unfinished_image_writer(
                prev_image_writer,
                self.start_key..key,
                self.lsn,
            );
            self.start_key = key;

            if discard(&layer_key).await {
                drop(prev_image_writer);
                self.generated_layers
                    .push(SplitWriterResult::Discarded(layer_key));
            } else {
                let (desc, path) = prev_image_writer.finish_with_end_key(key, ctx).await?;

                let layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
                self.generated_layers
                    .push(SplitWriterResult::Produced(layer));
            }
        }
        self.inner.put_image(key, img, ctx).await
    }

    #[cfg(test)]
    pub async fn put_image(
        &mut self,
        key: Key,
        img: Bytes,
        tline: &Arc<Timeline>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        self.put_image_with_discard_fn(key, img, tline, ctx, |_| async { false })
            .await
    }

    pub(crate) async fn finish_with_discard_fn<D, F>(
        self,
        tline: &Arc<Timeline>,
        ctx: &RequestContext,
        end_key: Key,
        discard: D,
    ) -> anyhow::Result<Vec<SplitWriterResult>>
        discard_fn: D,
    ) -> anyhow::Result<Vec<BatchWriterResult>>
    where
        D: FnOnce(&PersistentLayerKey) -> F,
        D: Fn(&PersistentLayerKey) -> F,
        F: Future<Output = bool>,
    {
        let Self {
            mut generated_layers,
            inner,
            ..
            mut batches, inner, ..
        } = self;
        if inner.num_keys() == 0 {
            return Ok(generated_layers);
        if inner.num_keys() != 0 {
            batches.add_unfinished_image_writer(inner, self.start_key..end_key, self.lsn);
        }
        let layer_key = PersistentLayerKey {
            key_range: self.start_key..end_key,
            lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn),
            is_delta: false,
        };
        if discard(&layer_key).await {
            generated_layers.push(SplitWriterResult::Discarded(layer_key));
        } else {
            let (desc, path) = inner.finish_with_end_key(end_key, ctx).await?;
            let layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
            generated_layers.push(SplitWriterResult::Produced(layer));
        }
        Ok(generated_layers)
        batches.finish_with_discard_fn(tline, ctx, discard_fn).await
    }

    #[cfg(test)]
@@ -183,22 +252,14 @@ impl SplitImageLayerWriter {
        tline: &Arc<Timeline>,
        ctx: &RequestContext,
        end_key: Key,
    ) -> anyhow::Result<Vec<SplitWriterResult>> {
    ) -> anyhow::Result<Vec<BatchWriterResult>> {
        self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false })
            .await
    }

    /// This function will be deprecated with #8841.
    pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, ImageLayerWriter)> {
        Ok((self.generated_layers, self.inner))
    }
}

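The split writers roll over to a fresh inner writer whenever the size estimate would cross the target, then park the full writer in the batch. The rollover rule in isolation, as a sketch:

/// Accumulates items into chunks, starting a new chunk whenever the
/// estimated size would cross `target_size`: the same rollover rule
/// the split writers apply per key/image.
fn split_by_size(items: &[(u64, &str)], target_size: u64) -> Vec<Vec<String>> {
    let mut chunks = vec![Vec::new()];
    let mut current_size = 0u64;
    for (size, name) in items {
        if current_size + size > target_size && !chunks.last().unwrap().is_empty() {
            chunks.push(Vec::new());
            current_size = 0;
        }
        chunks.last_mut().unwrap().push(name.to_string());
        current_size += size;
    }
    chunks
}

fn main() {
    let chunks = split_by_size(&[(4, "a"), (5, "b"), (3, "c")], 8);
    assert_eq!(chunks.len(), 2); // "a"+"b" would exceed 8, so "b" starts a new chunk
}
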
/// A delta writer that takes key-lsn-values and produces multiple delta layers.
///
/// The interface does not guarantee atomicity (i.e., if the delta layer generation fails,
/// there might be leftover files to be cleaned up).
///
/// Note that if updates of a single key exceed the target size limit, all of the updates will be batched
/// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm
/// will split them into multiple files based on size.
@@ -206,12 +267,12 @@ impl SplitImageLayerWriter {
pub struct SplitDeltaLayerWriter {
    inner: Option<(Key, DeltaLayerWriter)>,
    target_layer_size: u64,
    generated_layers: Vec<SplitWriterResult>,
    conf: &'static PageServerConf,
    timeline_id: TimelineId,
    tenant_shard_id: TenantShardId,
    lsn_range: Range<Lsn>,
    last_key_written: Key,
    batches: BatchLayerWriter,
}

impl SplitDeltaLayerWriter {
@@ -225,29 +286,22 @@ impl SplitDeltaLayerWriter {
        Ok(Self {
            target_layer_size,
            inner: None,
            generated_layers: Vec::new(),
            conf,
            timeline_id,
            tenant_shard_id,
            lsn_range,
            last_key_written: Key::MIN,
            batches: BatchLayerWriter::new(conf).await?,
        })
    }

    /// Put value into the layer writer. In the case the writer decides to produce a layer, and the discard fn returns true, no layer will be written in the end.
    pub async fn put_value_with_discard_fn<D, F>(
    pub async fn put_value(
        &mut self,
        key: Key,
        lsn: Lsn,
        val: Value,
        tline: &Arc<Timeline>,
        ctx: &RequestContext,
        discard: D,
    ) -> anyhow::Result<()>
    where
        D: FnOnce(&PersistentLayerKey) -> F,
        F: Future<Output = bool>,
    {
    ) -> anyhow::Result<()> {
        // The current estimation is key size plus LSN size plus value size estimation. This is not an accurate
        // number, and therefore the final layer size could be a little bit larger or smaller than the target.
        //
@@ -286,21 +340,11 @@ impl SplitDeltaLayerWriter {
            .await?;
            let (start_key, prev_delta_writer) =
                std::mem::replace(&mut self.inner, Some((key, next_delta_writer))).unwrap();
            let layer_key = PersistentLayerKey {
                key_range: start_key..key,
                lsn_range: self.lsn_range.clone(),
                is_delta: true,
            };
            if discard(&layer_key).await {
                drop(prev_delta_writer);
                self.generated_layers
                    .push(SplitWriterResult::Discarded(layer_key));
            } else {
                let (desc, path) = prev_delta_writer.finish(key, ctx).await?;
                let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
                self.generated_layers
                    .push(SplitWriterResult::Produced(delta_layer));
            }
            self.batches.add_unfinished_delta_writer(
                prev_delta_writer,
                start_key..key,
                self.lsn_range.clone(),
            );
        } else if inner.estimated_size() >= S3_UPLOAD_LIMIT {
            // We have to produce a very large file b/c a key is updated too often.
            anyhow::bail!(
@@ -315,53 +359,30 @@ impl SplitDeltaLayerWriter {
        inner.put_value(key, lsn, val, ctx).await
    }

    pub async fn put_value(
        &mut self,
        key: Key,
        lsn: Lsn,
        val: Value,
        tline: &Arc<Timeline>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        self.put_value_with_discard_fn(key, lsn, val, tline, ctx, |_| async { false })
            .await
    }

    pub(crate) async fn finish_with_discard_fn<D, F>(
        self,
        tline: &Arc<Timeline>,
        ctx: &RequestContext,
        discard: D,
    ) -> anyhow::Result<Vec<SplitWriterResult>>
        discard_fn: D,
    ) -> anyhow::Result<Vec<BatchWriterResult>>
    where
        D: FnOnce(&PersistentLayerKey) -> F,
        D: Fn(&PersistentLayerKey) -> F,
        F: Future<Output = bool>,
    {
        let Self {
            mut generated_layers,
            inner,
            ..
            mut batches, inner, ..
        } = self;
        let Some((start_key, inner)) = inner else {
            return Ok(generated_layers);
        };
        if inner.num_keys() == 0 {
            return Ok(generated_layers);
        if let Some((start_key, writer)) = inner {
            if writer.num_keys() != 0 {
                let end_key = self.last_key_written.next();
                batches.add_unfinished_delta_writer(
                    writer,
                    start_key..end_key,
                    self.lsn_range.clone(),
                );
            }
        }
        let end_key = self.last_key_written.next();
        let layer_key = PersistentLayerKey {
            key_range: start_key..end_key,
            lsn_range: self.lsn_range.clone(),
            is_delta: true,
        };
        if discard(&layer_key).await {
            generated_layers.push(SplitWriterResult::Discarded(layer_key));
        } else {
            let (desc, path) = inner.finish(end_key, ctx).await?;
            let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
            generated_layers.push(SplitWriterResult::Produced(delta_layer));
        }
        Ok(generated_layers)
        batches.finish_with_discard_fn(tline, ctx, discard_fn).await
    }

    #[cfg(test)]
@@ -369,15 +390,10 @@ impl SplitDeltaLayerWriter {
        self,
        tline: &Arc<Timeline>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Vec<SplitWriterResult>> {
    ) -> anyhow::Result<Vec<BatchWriterResult>> {
        self.finish_with_discard_fn(tline, ctx, |_| async { false })
            .await
    }

    /// This function will be deprecated with #8841.
    pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, Option<DeltaLayerWriter>)> {
        Ok((self.generated_layers, self.inner.map(|x| x.1)))
    }
}

#[cfg(test)]
@@ -447,7 +463,7 @@ mod tests {
        .unwrap();

        image_writer
            .put_image(get_key(0), get_img(0), &tline, &ctx)
            .put_image(get_key(0), get_img(0), &ctx)
            .await
            .unwrap();
        let layers = image_writer
@@ -457,13 +473,7 @@ mod tests {
        assert_eq!(layers.len(), 1);

        delta_writer
            .put_value(
                get_key(0),
                Lsn(0x18),
                Value::Image(get_img(0)),
                &tline,
                &ctx,
            )
            .put_value(get_key(0), Lsn(0x18), Value::Image(get_img(0)), &ctx)
            .await
            .unwrap();
        let layers = delta_writer.finish(&tline, &ctx).await.unwrap();
@@ -486,14 +496,18 @@ mod tests {

    #[tokio::test]
    async fn write_split() {
        // Test the split writer with retaining all the layers we have produced (discard=false)
        write_split_helper("split_writer_write_split", false).await;
    }

    #[tokio::test]
    async fn write_split_discard() {
        write_split_helper("split_writer_write_split_discard", false).await;
        // Test the split writer with discarding all the layers we have produced (discard=true)
        write_split_helper("split_writer_write_split_discard", true).await;
    }

    /// Test the image+delta writer by writing a large number of images and deltas. If discard is
    /// set to true, all layers will be discarded.
    async fn write_split_helper(harness_name: &'static str, discard: bool) {
        let harness = TenantHarness::create(harness_name).await.unwrap();
        let (tenant, ctx) = harness.load().await;
@@ -527,69 +541,63 @@ mod tests {
        for i in 0..N {
            let i = i as u32;
            image_writer
                .put_image_with_discard_fn(get_key(i), get_large_img(), &tline, &ctx, |_| async {
                    discard
                })
                .put_image(get_key(i), get_large_img(), &ctx)
                .await
                .unwrap();
            delta_writer
                .put_value_with_discard_fn(
                    get_key(i),
                    Lsn(0x20),
                    Value::Image(get_large_img()),
                    &tline,
                    &ctx,
                    |_| async { discard },
                )
                .put_value(get_key(i), Lsn(0x20), Value::Image(get_large_img()), &ctx)
                .await
                .unwrap();
        }
        let image_layers = image_writer
            .finish(&tline, &ctx, get_key(N as u32))
            .finish_with_discard_fn(&tline, &ctx, get_key(N as u32), |_| async { discard })
            .await
            .unwrap();
        let delta_layers = delta_writer.finish(&tline, &ctx).await.unwrap();
        if discard {
            for layer in image_layers {
                layer.into_discarded_layer();
            }
            for layer in delta_layers {
                layer.into_discarded_layer();
            }
        } else {
            let image_layers = image_layers
                .into_iter()
                .map(|x| x.into_resident_layer())
                .collect_vec();
            let delta_layers = delta_layers
                .into_iter()
                .map(|x| x.into_resident_layer())
                .collect_vec();
            assert_eq!(image_layers.len(), N / 512 + 1);
            assert_eq!(delta_layers.len(), N / 512 + 1);
            assert_eq!(
                delta_layers.first().unwrap().layer_desc().key_range.start,
                get_key(0)
            );
            assert_eq!(
                delta_layers.last().unwrap().layer_desc().key_range.end,
                get_key(N as u32)
            );
            for idx in 0..image_layers.len() {
                assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN);
                assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX);
                assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN);
                assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX);
                if idx > 0 {
                    assert_eq!(
                        image_layers[idx - 1].layer_desc().key_range.end,
                        image_layers[idx].layer_desc().key_range.start
                    );
                    assert_eq!(
                        delta_layers[idx - 1].layer_desc().key_range.end,
                        delta_layers[idx].layer_desc().key_range.start
                    );
        let delta_layers = delta_writer
            .finish_with_discard_fn(&tline, &ctx, |_| async { discard })
            .await
            .unwrap();
        let image_layers = image_layers
            .into_iter()
            .map(|x| {
                if discard {
                    x.into_discarded_layer()
                } else {
                    x.into_resident_layer().layer_desc().key()
                }
            })
            .collect_vec();
        let delta_layers = delta_layers
            .into_iter()
            .map(|x| {
                if discard {
                    x.into_discarded_layer()
                } else {
                    x.into_resident_layer().layer_desc().key()
                }
            })
            .collect_vec();
        assert_eq!(image_layers.len(), N / 512 + 1);
        assert_eq!(delta_layers.len(), N / 512 + 1);
        assert_eq!(delta_layers.first().unwrap().key_range.start, get_key(0));
        assert_eq!(
            delta_layers.last().unwrap().key_range.end,
            get_key(N as u32)
        );
        for idx in 0..image_layers.len() {
            assert_ne!(image_layers[idx].key_range.start, Key::MIN);
            assert_ne!(image_layers[idx].key_range.end, Key::MAX);
            assert_ne!(delta_layers[idx].key_range.start, Key::MIN);
            assert_ne!(delta_layers[idx].key_range.end, Key::MAX);
            if idx > 0 {
                assert_eq!(
                    image_layers[idx - 1].key_range.end,
                    image_layers[idx].key_range.start
                );
                assert_eq!(
                    delta_layers[idx - 1].key_range.end,
                    delta_layers[idx].key_range.start
                );
            }
        }
    }
@@ -629,11 +637,11 @@ mod tests {
        .unwrap();

        image_writer
            .put_image(get_key(0), get_img(0), &tline, &ctx)
            .put_image(get_key(0), get_img(0), &ctx)
            .await
            .unwrap();
        image_writer
            .put_image(get_key(1), get_large_img(), &tline, &ctx)
            .put_image(get_key(1), get_large_img(), &ctx)
            .await
            .unwrap();
        let layers = image_writer
@@ -643,23 +651,11 @@ mod tests {
        assert_eq!(layers.len(), 2);

        delta_writer
            .put_value(
                get_key(0),
                Lsn(0x18),
                Value::Image(get_img(0)),
                &tline,
                &ctx,
            )
            .put_value(get_key(0), Lsn(0x18), Value::Image(get_img(0)), &ctx)
            .await
            .unwrap();
        delta_writer
            .put_value(
                get_key(1),
                Lsn(0x1A),
                Value::Image(get_large_img()),
                &tline,
                &ctx,
            )
            .put_value(get_key(1), Lsn(0x1A), Value::Image(get_large_img()), &ctx)
            .await
            .unwrap();
        let layers = delta_writer.finish(&tline, &ctx).await.unwrap();
@@ -723,7 +719,6 @@ mod tests {
                get_key(0),
                Lsn(i as u64 * 16 + 0x10),
                Value::Image(get_large_img()),
                &tline,
                &ctx,
            )
            .await
@@ -44,11 +44,11 @@ use crate::tenant::vectored_blob_io::{
|
||||
};
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
|
||||
use crate::virtual_file::IoBufferMut;
|
||||
use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
|
||||
use crate::{walrecord, TEMP_FILE_SUFFIX};
|
||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use bytes::BytesMut;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use futures::StreamExt;
|
||||
use itertools::Itertools;
|
||||
@@ -515,8 +515,8 @@ impl DeltaLayerWriterInner {
|
||||
) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
|
||||
let temp_path = self.path.clone();
|
||||
let result = self.finish0(key_end, ctx).await;
|
||||
if result.is_err() {
|
||||
tracing::info!(%temp_path, "cleaning up temporary file after error during writing");
|
||||
if let Err(ref e) = result {
|
||||
tracing::info!(%temp_path, "cleaning up temporary file after error during writing: {e}");
|
||||
if let Err(e) = std::fs::remove_file(&temp_path) {
|
||||
tracing::warn!(error=%e, %temp_path, "error cleaning up temporary layer file after error during writing");
|
||||
}
|
||||
@@ -1002,7 +1002,7 @@ impl DeltaLayerInner {
            .0
            .into();
        let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes);
        let mut buf = Some(BytesMut::with_capacity(buf_size));
        let mut buf = Some(IoBufferMut::with_capacity(buf_size));

        // Note that reads are processed in reverse order (from highest key+lsn).
        // This is the order that `ReconstructState` requires such that it can
@@ -1029,7 +1029,7 @@ impl DeltaLayerInner {

                    // We have "lost" the buffer since the lower level IO api
                    // doesn't return the buffer on error. Allocate a new one.
                    buf = Some(BytesMut::with_capacity(buf_size));
                    buf = Some(IoBufferMut::with_capacity(buf_size));

                    continue;
                }
@@ -1084,7 +1084,7 @@ impl DeltaLayerInner {
        }
    }

    pub(super) async fn load_keys<'a>(
    pub(crate) async fn index_entries<'a>(
        &'a self,
        ctx: &RequestContext,
    ) -> Result<Vec<DeltaEntry<'a>>> {
@@ -1203,7 +1203,7 @@ impl DeltaLayerInner {
            .map(|x| x.0.get())
            .unwrap_or(8192);

        let mut buffer = Some(BytesMut::with_capacity(max_read_size));
        let mut buffer = Some(IoBufferMut::with_capacity(max_read_size));

        // FIXME: buffering of DeltaLayerWriter
        let mut per_blob_copy = Vec::new();
@@ -1346,7 +1346,7 @@ impl DeltaLayerInner {

        tree_reader.dump().await?;

        let keys = self.load_keys(ctx).await?;
        let keys = self.index_entries(ctx).await?;

        async fn dump_blob(val: &ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result<String> {
            let buf = val.load_raw(ctx).await?;
@@ -1453,6 +1453,16 @@ impl DeltaLayerInner {
            ),
        }
    }

    /// NB: not super efficient, but not terrible either. Should prob be an iterator.
    //
    // We're reusing the index traversal logic in plan_reads; would be nice to
    // factor that out.
    pub(crate) async fn load_keys(&self, ctx: &RequestContext) -> anyhow::Result<Vec<Key>> {
        self.index_entries(ctx)
            .await
            .map(|entries| entries.into_iter().map(|entry| entry.key).collect())
    }
}

/// A set of data associated with a delta layer key and its value
@@ -1561,12 +1571,11 @@ impl<'a> DeltaLayerIterator<'a> {
        let vectored_blob_reader = VectoredBlobReader::new(&self.delta_layer.file);
        let mut next_batch = std::collections::VecDeque::new();
        let buf_size = plan.size();
        let buf = BytesMut::with_capacity(buf_size);
        let buf = IoBufferMut::with_capacity(buf_size);
        let blobs_buf = vectored_blob_reader
            .read_blobs(&plan, buf, self.ctx)
            .await?;
        let frozen_buf = blobs_buf.buf.freeze();
        let view = BufView::new_bytes(frozen_buf);
        let view = BufView::new_slice(&blobs_buf.buf);
        for meta in blobs_buf.blobs.iter() {
            let blob_read = meta.read(&view).await?;
            let value = Value::des(&blob_read)?;
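The `BufView::new_slice` lines above swap a freeze-into-`Bytes` step for borrowing the read buffer directly. A minimal sketch of the two shapes using only the `bytes` crate (`decode` stands in for the blob-parsing code):

use bytes::BytesMut;

fn decode(view: &[u8]) -> usize {
    view.len()
}

// Old shape: freeze the buffer into a shared `Bytes` before decoding.
fn via_freeze(buf: BytesMut) -> usize {
    let frozen = buf.freeze(); // consumes the buffer
    decode(&frozen)
}

// New shape: borrow the buffer as a slice; no ownership transfer needed,
// and the buffer stays reusable afterwards.
fn via_borrow(buf: &BytesMut) -> usize {
    decode(&buf[..])
}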
@@ -1941,7 +1950,7 @@ pub(crate) mod test {
            &vectored_reads,
            constants::MAX_VECTORED_READ_BYTES,
        );
        let mut buf = Some(BytesMut::with_capacity(buf_size));
        let mut buf = Some(IoBufferMut::with_capacity(buf_size));

        for read in vectored_reads {
            let blobs_buf = vectored_blob_reader

@@ -41,10 +41,11 @@ use crate::tenant::vectored_blob_io::{
};
use crate::tenant::PageReconstructError;
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
use crate::virtual_file::IoBufferMut;
use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
use anyhow::{anyhow, bail, ensure, Context, Result};
use bytes::{Bytes, BytesMut};
use bytes::Bytes;
use camino::{Utf8Path, Utf8PathBuf};
use hex;
use itertools::Itertools;
@@ -547,10 +548,10 @@ impl ImageLayerInner {
        for read in plan.into_iter() {
            let buf_size = read.size();

            let buf = BytesMut::with_capacity(buf_size);
            let buf = IoBufferMut::with_capacity(buf_size);
            let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?;
            let frozen_buf = blobs_buf.buf.freeze();
            let view = BufView::new_bytes(frozen_buf);

            let view = BufView::new_slice(&blobs_buf.buf);

            for meta in blobs_buf.blobs.iter() {
                let img_buf = meta.read(&view).await?;
@@ -609,13 +610,12 @@ impl ImageLayerInner {
            }
        }

        let buf = BytesMut::with_capacity(buf_size);
        let buf = IoBufferMut::with_capacity(buf_size);
        let res = vectored_blob_reader.read_blobs(&read, buf, ctx).await;

        match res {
            Ok(blobs_buf) => {
                let frozen_buf = blobs_buf.buf.freeze();
                let view = BufView::new_bytes(frozen_buf);
                let view = BufView::new_slice(&blobs_buf.buf);
                for meta in blobs_buf.blobs.iter() {
                    let img_buf = meta.read(&view).await;

@@ -673,6 +673,21 @@ impl ImageLayerInner {
            ),
        }
    }

    /// NB: not super efficient, but not terrible either. Should prob be an iterator.
    //
    // We're reusing the index traversal logic in plan_reads; would be nice to
    // factor that out.
    pub(crate) async fn load_keys(&self, ctx: &RequestContext) -> anyhow::Result<Vec<Key>> {
        let plan = self
            .plan_reads(KeySpace::single(self.key_range.clone()), None, ctx)
            .await?;
        Ok(plan
            .into_iter()
            .flat_map(|read| read.blobs_at)
            .map(|(_, blob_meta)| blob_meta.key)
            .collect())
    }
}

/// A builder object for constructing a new image layer.
@@ -827,6 +842,25 @@ impl ImageLayerWriterInner {
        self,
        ctx: &RequestContext,
        end_key: Option<Key>,
    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
        let temp_path = self.path.clone();
        let result = self.finish0(ctx, end_key).await;
        if let Err(ref e) = result {
            tracing::info!(%temp_path, "cleaning up temporary file after error during writing: {e}");
            if let Err(e) = std::fs::remove_file(&temp_path) {
                tracing::warn!(error=%e, %temp_path, "error cleaning up temporary layer file after error during writing");
            }
        }
        result
    }

    ///
    /// Finish writing the image layer.
    ///
    async fn finish0(
        self,
        ctx: &RequestContext,
        end_key: Option<Key>,
    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
        let index_start_blk = self.blob_writer.size().div_ceil(PAGE_SZ as u64) as u32;

@@ -990,7 +1024,7 @@ impl ImageLayerWriter {
        self.inner.take().unwrap().finish(ctx, None).await
    }

    /// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
    /// Finish writing the image layer with an end key, used in [`super::batch_split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
    pub(super) async fn finish_with_end_key(
        mut self,
        end_key: Key,
@@ -1050,12 +1084,11 @@ impl<'a> ImageLayerIterator<'a> {
        let vectored_blob_reader = VectoredBlobReader::new(&self.image_layer.file);
        let mut next_batch = std::collections::VecDeque::new();
        let buf_size = plan.size();
        let buf = BytesMut::with_capacity(buf_size);
        let buf = IoBufferMut::with_capacity(buf_size);
        let blobs_buf = vectored_blob_reader
            .read_blobs(&plan, buf, self.ctx)
            .await?;
        let frozen_buf = blobs_buf.buf.freeze();
        let view = BufView::new_bytes(frozen_buf);
        let view = BufView::new_slice(&blobs_buf.buf);
        for meta in blobs_buf.blobs.iter() {
            let img_buf = meta.read(&view).await?;
            next_batch.push_back((

@@ -14,7 +14,6 @@ use crate::tenant::PageReconstructError;
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
use crate::{l0_flush, page_cache};
use anyhow::{anyhow, Context, Result};
use bytes::Bytes;
use camino::Utf8PathBuf;
use pageserver_api::key::CompactKey;
use pageserver_api::keyspace::KeySpace;
@@ -809,9 +808,8 @@ impl InMemoryLayer {

        match l0_flush_global_state {
            l0_flush::Inner::Direct { .. } => {
                let file_contents: Vec<u8> = inner.file.load_to_vec(ctx).await?;

                let file_contents = Bytes::from(file_contents);
                let file_contents = inner.file.load_to_io_buf(ctx).await?;
                let file_contents = file_contents.freeze();

                for (key, vec_map) in inner.index.iter() {
                    // Write all page versions
@@ -825,7 +823,7 @@ impl InMemoryLayer {
                        len,
                        will_init,
                    } = entry;
                    let buf = Bytes::slice(&file_contents, pos as usize..(pos + len) as usize);
                    let buf = file_contents.slice(pos as usize..(pos + len) as usize);
                    let (_buf, res) = delta_layer_writer
                        .put_value_bytes(
                            Key::from_compact(*key),

@@ -9,6 +9,7 @@ use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice};
use crate::{
    assert_u64_eq_usize::{U64IsUsize, UsizeIsU64},
    context::RequestContext,
    virtual_file::{owned_buffers_io::io_buf_aligned::IoBufAlignedMut, IoBufferMut},
};

/// The file interface we require. At runtime, this is a [`crate::tenant::ephemeral_file::EphemeralFile`].
@@ -24,7 +25,7 @@ pub trait File: Send {
    /// [`std::io::ErrorKind::UnexpectedEof`] error if the file is shorter than `start+dst.len()`.
    ///
    /// No guarantees are made about the remaining bytes in `dst` in case of a short read.
    async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>(
    async fn read_exact_at_eof_ok<'a, 'b, B: IoBufAlignedMut + Send>(
        &'b self,
        start: u64,
        dst: Slice<B>,
@@ -227,7 +228,7 @@ where

    // Execute physical reads and fill the logical read buffers
    // TODO: pipelined reads; prefetch;
    let get_io_buffer = |nchunks| Vec::with_capacity(nchunks * DIO_CHUNK_SIZE);
    let get_io_buffer = |nchunks| IoBufferMut::with_capacity(nchunks * DIO_CHUNK_SIZE);
    for PhysicalRead {
        start_chunk_no,
        nchunks,
@@ -459,7 +460,7 @@ mod tests {
        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
        let file = InMemoryFile::new_random(10);
        let test_read = |pos, len| {
            let buf = vec![0; len];
            let buf = IoBufferMut::with_capacity_zeroed(len);
            let fut = file.read_exact_at_eof_ok(pos, buf.slice_full(), &ctx);
            use futures::FutureExt;
            let (slice, nread) = fut
@@ -470,9 +471,9 @@ mod tests {
            buf.truncate(nread);
            buf
        };
        assert_eq!(test_read(0, 1), &file.content[0..1]);
        assert_eq!(test_read(1, 2), &file.content[1..3]);
        assert_eq!(test_read(9, 2), &file.content[9..]);
        assert_eq!(&test_read(0, 1), &file.content[0..1]);
        assert_eq!(&test_read(1, 2), &file.content[1..3]);
        assert_eq!(&test_read(9, 2), &file.content[9..]);
        assert!(test_read(10, 2).is_empty());
        assert!(test_read(11, 2).is_empty());
    }
@@ -609,7 +610,7 @@ mod tests {
    }

    impl<'x> File for RecorderFile<'x> {
        async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>(
        async fn read_exact_at_eof_ok<'a, 'b, B: IoBufAlignedMut + Send>(
            &'b self,
            start: u64,
            dst: Slice<B>,
@@ -782,7 +783,7 @@ mod tests {
            2048, 1024 => Err("foo".to_owned()),
        };

        let buf = Vec::with_capacity(512);
        let buf = IoBufferMut::with_capacity(512);
        let (buf, nread) = mock_file
            .read_exact_at_eof_ok(0, buf.slice_full(), &ctx)
            .await
@@ -790,7 +791,7 @@ mod tests {
        assert_eq!(nread, 512);
        assert_eq!(&buf.into_inner()[..nread], &[0; 512]);

        let buf = Vec::with_capacity(512);
        let buf = IoBufferMut::with_capacity(512);
        let (buf, nread) = mock_file
            .read_exact_at_eof_ok(512, buf.slice_full(), &ctx)
            .await
@@ -798,7 +799,7 @@ mod tests {
        assert_eq!(nread, 512);
        assert_eq!(&buf.into_inner()[..nread], &[1; 512]);

        let buf = Vec::with_capacity(512);
        let buf = IoBufferMut::with_capacity(512);
        let (buf, nread) = mock_file
            .read_exact_at_eof_ok(1024, buf.slice_full(), &ctx)
            .await
@@ -806,7 +807,7 @@ mod tests {
        assert_eq!(nread, 10);
        assert_eq!(&buf.into_inner()[..nread], &[2; 10]);

        let buf = Vec::with_capacity(1024);
        let buf = IoBufferMut::with_capacity(1024);
        let err = mock_file
            .read_exact_at_eof_ok(2048, buf.slice_full(), &ctx)
            .await

@@ -19,7 +19,7 @@ use crate::task_mgr::TaskKind;
use crate::tenant::timeline::{CompactionError, GetVectoredError};
use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};

use super::delta_layer::{self, DeltaEntry};
use super::delta_layer::{self};
use super::image_layer::{self};
use super::{
    AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
@@ -1841,23 +1841,22 @@ impl ResidentLayer {
    pub(crate) async fn load_keys<'a>(
        &'a self,
        ctx: &RequestContext,
    ) -> anyhow::Result<Vec<DeltaEntry<'a>>> {
    ) -> anyhow::Result<Vec<pageserver_api::key::Key>> {
        use LayerKind::*;

        let owner = &self.owner.0;
        match self.downloaded.get(owner, ctx).await? {
            Delta(ref d) => {
                // this is valid because the DownloadedLayer::kind is a OnceCell, not a
                // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
                // while it's being held.
                self.owner.record_access(ctx);
        let inner = self.downloaded.get(owner, ctx).await?;

                delta_layer::DeltaLayerInner::load_keys(d, ctx)
                    .await
                    .with_context(|| format!("Layer index is corrupted for {self}"))
            }
            Image(_) => anyhow::bail!(format!("cannot load_keys on an image layer {self}")),
        }
        // this is valid because the DownloadedLayer::kind is a OnceCell, not a
        // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
        // while it's being held.
        self.owner.record_access(ctx);

        let res = match inner {
            Delta(ref d) => delta_layer::DeltaLayerInner::load_keys(d, ctx).await,
            Image(ref i) => image_layer::ImageLayerInner::load_keys(i, ctx).await,
        };
        res.with_context(|| format!("Layer index is corrupted for {self}"))
    }

    /// Read all the keys in this layer which match the ShardIdentity, and write them all to

@@ -57,6 +57,34 @@ impl std::fmt::Display for PersistentLayerKey {
    }
}

impl From<ImageLayerName> for PersistentLayerKey {
    fn from(image_layer_name: ImageLayerName) -> Self {
        Self {
            key_range: image_layer_name.key_range,
            lsn_range: PersistentLayerDesc::image_layer_lsn_range(image_layer_name.lsn),
            is_delta: false,
        }
    }
}

impl From<DeltaLayerName> for PersistentLayerKey {
    fn from(delta_layer_name: DeltaLayerName) -> Self {
        Self {
            key_range: delta_layer_name.key_range,
            lsn_range: delta_layer_name.lsn_range,
            is_delta: true,
        }
    }
}

impl From<LayerName> for PersistentLayerKey {
    fn from(layer_name: LayerName) -> Self {
        match layer_name {
            LayerName::Image(i) => i.into(),
            LayerName::Delta(d) => d.into(),
        }
    }
}
impl PersistentLayerDesc {
    pub fn key(&self) -> PersistentLayerKey {
        PersistentLayerKey {

@@ -371,7 +371,7 @@ pub struct Timeline {

    /// Prevent two tasks from deleting the timeline at the same time. If held, the
    /// timeline is being deleted. If 'true', the timeline has already been deleted.
    pub delete_progress: Arc<tokio::sync::Mutex<DeleteTimelineFlow>>,
    pub delete_progress: TimelineDeleteProgress,

    eviction_task_timeline_state: tokio::sync::Mutex<EvictionTaskTimelineState>,

@@ -424,8 +424,13 @@ pub struct Timeline {
    pub(crate) handles: handle::PerTimelineState<crate::page_service::TenantManagerTypes>,

    pub(crate) attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,

    /// Cf. [`crate::tenant::CreateTimelineIdempotency`].
    pub(crate) create_idempotency: crate::tenant::CreateTimelineIdempotency,
}

pub type TimelineDeleteProgress = Arc<tokio::sync::Mutex<DeleteTimelineFlow>>;

pub struct WalReceiverInfo {
    pub wal_source_connconf: PgConnectionConfig,
    pub last_received_msg_lsn: Lsn,
@@ -2134,6 +2139,7 @@ impl Timeline {
        pg_version: u32,
        state: TimelineState,
        attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
        create_idempotency: crate::tenant::CreateTimelineIdempotency,
        cancel: CancellationToken,
    ) -> Arc<Self> {
        let disk_consistent_lsn = metadata.disk_consistent_lsn();
@@ -2250,7 +2256,7 @@ impl Timeline {
            eviction_task_timeline_state: tokio::sync::Mutex::new(
                EvictionTaskTimelineState::default(),
            ),
            delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTimelineFlow::default())),
            delete_progress: TimelineDeleteProgress::default(),

            cancel,
            gate: Gate::default(),
@@ -2272,6 +2278,8 @@ impl Timeline {
            handles: Default::default(),

            attach_wal_lag_cooldown,

            create_idempotency,
        };

        result.repartition_threshold =
@@ -2402,7 +2410,7 @@ impl Timeline {
    pub(super) async fn load_layer_map(
        &self,
        disk_consistent_lsn: Lsn,
        index_part: Option<IndexPart>,
        index_part: IndexPart,
    ) -> anyhow::Result<()> {
        use init::{Decision::*, Discovered, DismissedLayer};
        use LayerName::*;
@@ -2466,8 +2474,7 @@ impl Timeline {
            );
        }

        let decided =
            init::reconcile(discovered_layers, index_part.as_ref(), disk_consistent_lsn);
        let decided = init::reconcile(discovered_layers, &index_part, disk_consistent_lsn);

        let mut loaded_layers = Vec::new();
        let mut needs_cleanup = Vec::new();

@@ -32,11 +32,11 @@ use crate::page_cache;
use crate::statvfs::Statvfs;
use crate::tenant::checks::check_valid_layermap;
use crate::tenant::remote_timeline_client::WaitCompletionError;
use crate::tenant::storage_layer::batch_split_writer::{
    BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter,
};
use crate::tenant::storage_layer::filter_iterator::FilterIterator;
use crate::tenant::storage_layer::merge_iterator::MergeIterator;
use crate::tenant::storage_layer::split_writer::{
    SplitDeltaLayerWriter, SplitImageLayerWriter, SplitWriterResult,
};
use crate::tenant::storage_layer::{
    AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState,
};
@@ -121,18 +121,12 @@ impl KeyHistoryRetention {
    async fn pipe_to(
        self,
        key: Key,
        tline: &Arc<Timeline>,
        delta_writer: &mut SplitDeltaLayerWriter,
        mut image_writer: Option<&mut SplitImageLayerWriter>,
        stat: &mut CompactionStatistics,
        dry_run: bool,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        let mut first_batch = true;
        let discard = |key: &PersistentLayerKey| {
            let key = key.clone();
            async move { Self::discard_key(&key, tline, dry_run).await }
        };
        for (cutoff_lsn, KeyLogAtLsn(logs)) in self.below_horizon {
            if first_batch {
                if logs.len() == 1 && logs[0].1.is_image() {
@@ -141,45 +135,30 @@ impl KeyHistoryRetention {
                    };
                    stat.produce_image_key(img);
                    if let Some(image_writer) = image_writer.as_mut() {
                        image_writer
                            .put_image_with_discard_fn(key, img.clone(), tline, ctx, discard)
                            .await?;
                        image_writer.put_image(key, img.clone(), ctx).await?;
                    } else {
                        delta_writer
                            .put_value_with_discard_fn(
                                key,
                                cutoff_lsn,
                                Value::Image(img.clone()),
                                tline,
                                ctx,
                                discard,
                            )
                            .put_value(key, cutoff_lsn, Value::Image(img.clone()), ctx)
                            .await?;
                    }
                } else {
                    for (lsn, val) in logs {
                        stat.produce_key(&val);
                        delta_writer
                            .put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
                            .await?;
                        delta_writer.put_value(key, lsn, val, ctx).await?;
                    }
                }
                first_batch = false;
            } else {
                for (lsn, val) in logs {
                    stat.produce_key(&val);
                    delta_writer
                        .put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
                        .await?;
                    delta_writer.put_value(key, lsn, val, ctx).await?;
                }
            }
        }
        let KeyLogAtLsn(above_horizon_logs) = self.above_horizon;
        for (lsn, val) in above_horizon_logs {
            stat.produce_key(&val);
            delta_writer
                .put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
                .await?;
            delta_writer.put_value(key, lsn, val, ctx).await?;
        }
        Ok(())
    }
@@ -855,7 +834,12 @@ impl Timeline {
            if self.cancel.is_cancelled() {
                return Err(CompactionError::ShuttingDown);
            }
            all_keys.extend(l.load_keys(ctx).await.map_err(CompactionError::Other)?);
            let delta = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?;
            let keys = delta
                .index_entries(ctx)
                .await
                .map_err(CompactionError::Other)?;
            all_keys.extend(keys);
        }
        // The current stdlib sorting implementation is designed in a way where it is
        // particularly fast where the slice is made up of sorted sub-ranges.
@@ -1990,11 +1974,9 @@ impl Timeline {
            retention
                .pipe_to(
                    *last_key,
                    self,
                    &mut delta_layer_writer,
                    image_layer_writer.as_mut(),
                    &mut stat,
                    dry_run,
                    ctx,
                )
                .await?;
@@ -2021,11 +2003,9 @@ impl Timeline {
            retention
                .pipe_to(
                    last_key,
                    self,
                    &mut delta_layer_writer,
                    image_layer_writer.as_mut(),
                    &mut stat,
                    dry_run,
                    ctx,
                )
                .await?;
@@ -2041,8 +2021,7 @@ impl Timeline {
                .finish_with_discard_fn(self, ctx, Key::MAX, discard)
                .await?
            } else {
                let (layers, _) = writer.take()?;
                assert!(layers.is_empty(), "image layers produced in dry run mode?");
                drop(writer);
                Vec::new()
            }
        } else {
@@ -2054,8 +2033,7 @@ impl Timeline {
                .finish_with_discard_fn(self, ctx, discard)
                .await?
        } else {
            let (layers, _) = delta_layer_writer.take()?;
            assert!(layers.is_empty(), "delta layers produced in dry run mode?");
            drop(delta_layer_writer);
            Vec::new()
        };

@@ -2065,11 +2043,11 @@ impl Timeline {
        let produced_image_layers_len = produced_image_layers.len();
        for action in produced_delta_layers {
            match action {
                SplitWriterResult::Produced(layer) => {
                BatchWriterResult::Produced(layer) => {
                    stat.produce_delta_layer(layer.layer_desc().file_size());
                    compact_to.push(layer);
                }
                SplitWriterResult::Discarded(l) => {
                BatchWriterResult::Discarded(l) => {
                    keep_layers.insert(l);
                    stat.discard_delta_layer();
                }
@@ -2077,11 +2055,11 @@ impl Timeline {
            }
        }
        for action in produced_image_layers {
            match action {
                SplitWriterResult::Produced(layer) => {
                BatchWriterResult::Produced(layer) => {
                    stat.produce_image_layer(layer.layer_desc().file_size());
                    compact_to.push(layer);
                }
                SplitWriterResult::Discarded(l) => {
                BatchWriterResult::Discarded(l) => {
                    keep_layers.insert(l);
                    stat.discard_image_layer();
                }
@@ -2465,7 +2443,7 @@ impl CompactionDeltaLayer<TimelineAdaptor> for ResidentDeltaLayer {
    type DeltaEntry<'a> = DeltaEntry<'a>;

    async fn load_keys<'a>(&self, ctx: &RequestContext) -> anyhow::Result<Vec<DeltaEntry<'_>>> {
        self.0.load_keys(ctx).await
        self.0.get_as_delta(ctx).await?.index_entries(ctx).await
    }
}

@@ -14,7 +14,9 @@ use crate::{
    task_mgr::{self, TaskKind},
    tenant::{
        metadata::TimelineMetadata,
        remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient},
        remote_timeline_client::{
            self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
        },
        CreateTimelineCause, DeleteTimelineError, Tenant, TimelineOrOffloaded,
    },
};
@@ -25,12 +27,9 @@ use super::{Timeline, TimelineResources};
/// during attach or pageserver restart.
/// See comment in persist_index_part_with_deleted_flag.
async fn set_deleted_in_remote_index(
    timeline: &TimelineOrOffloaded,
    remote_client: &Arc<RemoteTimelineClient>,
) -> Result<(), DeleteTimelineError> {
    let res = timeline
        .remote_client()
        .persist_index_part_with_deleted_flag()
        .await;
    let res = remote_client.persist_index_part_with_deleted_flag().await;
    match res {
        // If we (now, or already) marked it successfully as deleted, we can proceed
        Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
@@ -129,12 +128,10 @@ pub(super) async fn delete_local_timeline_directory(
}

/// Removes remote layers and an index file after them.
async fn delete_remote_layers_and_index(timeline: &TimelineOrOffloaded) -> anyhow::Result<()> {
    timeline
        .remote_client()
        .delete_all()
        .await
        .context("delete_all")
async fn delete_remote_layers_and_index(
    remote_client: &Arc<RemoteTimelineClient>,
) -> anyhow::Result<()> {
    remote_client.delete_all().await.context("delete_all")
}

/// It is important that this gets called when DeletionGuard is being held.
@@ -179,6 +176,32 @@ async fn remove_maybe_offloaded_timeline_from_tenant(
    Ok(())
}

/// It is important that this gets called when DeletionGuard is being held.
/// For more context see comments in [`DeleteTimelineFlow::prepare`]
async fn upload_new_tenant_manifest(
    tenant: &Tenant,
    _: &DeletionGuard, // using it as a witness
) -> anyhow::Result<()> {
    // This is susceptible to race conditions, i.e. we won't continue deletions if there is a crash
    // between the deletion of the index-part.json and reaching this code.
    // So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted.
    // However, we handle this case in tenant loading code so the next time we attach, the issue is
    // resolved.
    let manifest = tenant.tenant_manifest();
    // TODO: generation support
    let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
    remote_timeline_client::upload_tenant_manifest(
        &tenant.remote_storage,
        &tenant.tenant_shard_id,
        generation,
        &manifest,
        &tenant.cancel,
    )
    .await?;

    Ok(())
}

/// Orchestrates the shutdown of all timeline tasks, removes the timeline's in-memory structures,
/// and deletes its data from both disk and s3.
/// The sequence of steps:
@@ -235,7 +258,8 @@ impl DeleteTimelineFlow {
            ))?
        });

        set_deleted_in_remote_index(&timeline).await?;
        let remote_client = timeline.remote_client_maybe_construct(tenant);
        set_deleted_in_remote_index(&remote_client).await?;

        fail::fail_point!("timeline-delete-before-schedule", |_| {
            Err(anyhow::anyhow!(
@@ -243,7 +267,13 @@ impl DeleteTimelineFlow {
            ))?
        });

        Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
        Self::schedule_background(
            guard,
            tenant.conf,
            Arc::clone(tenant),
            timeline,
            remote_client,
        );

        Ok(())
    }
@@ -283,6 +313,7 @@ impl DeleteTimelineFlow {
            // Important. We don't pass ancestor above because it can be missing.
            // Thus we need to skip the validation here.
            CreateTimelineCause::Delete,
            crate::tenant::CreateTimelineIdempotency::FailWithConflict, // doesn't matter what we put here
        )
        .context("create_timeline_struct")?;

@@ -301,8 +332,9 @@ impl DeleteTimelineFlow {

        guard.mark_in_progress()?;

        let remote_client = timeline.remote_client.clone();
        let timeline = TimelineOrOffloaded::Timeline(timeline);
        Self::schedule_background(guard, tenant.conf, tenant, timeline);
        Self::schedule_background(guard, tenant.conf, tenant, timeline, remote_client);

        Ok(())
    }
@@ -380,6 +412,7 @@ impl DeleteTimelineFlow {
        conf: &'static PageServerConf,
        tenant: Arc<Tenant>,
        timeline: TimelineOrOffloaded,
        remote_client: Arc<RemoteTimelineClient>,
    ) {
        let tenant_shard_id = timeline.tenant_shard_id();
        let timeline_id = timeline.timeline_id();
@@ -391,7 +424,7 @@ impl DeleteTimelineFlow {
            Some(timeline_id),
            "timeline_delete",
            async move {
                if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await {
                if let Err(err) = Self::background(guard, conf, &tenant, &timeline, remote_client).await {
                    error!("Error: {err:#}");
                    if let TimelineOrOffloaded::Timeline(timeline) = timeline {
                        timeline.set_broken(format!("{err:#}"))
@@ -408,6 +441,7 @@ impl DeleteTimelineFlow {
        conf: &PageServerConf,
        tenant: &Tenant,
        timeline: &TimelineOrOffloaded,
        remote_client: Arc<RemoteTimelineClient>,
    ) -> Result<(), DeleteTimelineError> {
        // Offloaded timelines have no local state
        // TODO: once we persist offloaded information, delete the timeline from there, too
@@ -415,12 +449,14 @@ impl DeleteTimelineFlow {
            delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await?;
        }

        delete_remote_layers_and_index(timeline).await?;
        delete_remote_layers_and_index(&remote_client).await?;

        pausable_failpoint!("in_progress_delete");

        remove_maybe_offloaded_timeline_from_tenant(tenant, timeline, &guard).await?;

        upload_new_tenant_manifest(tenant, &guard).await?;

        *guard = Self::Finished;

        Ok(())

@@ -125,19 +125,9 @@ pub(super) enum DismissedLayer {
/// Merges local discoveries and remote [`IndexPart`] to a collection of decisions.
pub(super) fn reconcile(
    local_layers: Vec<(LayerName, LocalLayerFileMetadata)>,
    index_part: Option<&IndexPart>,
    index_part: &IndexPart,
    disk_consistent_lsn: Lsn,
) -> Vec<(LayerName, Result<Decision, DismissedLayer>)> {
    let Some(index_part) = index_part else {
        // If we have no remote metadata, no local layer files are considered valid to load
        return local_layers
            .into_iter()
            .map(|(layer_name, local_metadata)| {
                (layer_name, Err(DismissedLayer::LocalOnly(local_metadata)))
            })
            .collect();
    };

    let mut result = Vec::new();

    let mut remote_layers = HashMap::new();

@@ -45,13 +45,16 @@ impl LayerManager {
    pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer {
        // The assumption for the `expect()` is that all code maintains the following invariant:
        // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
        self.layers()
            .get(key)
        self.try_get_from_key(key)
            .with_context(|| format!("get layer from key: {key}"))
            .expect("not found")
            .clone()
    }

    pub(crate) fn try_get_from_key(&self, key: &PersistentLayerKey) -> Option<&Layer> {
        self.layers().get(key)
    }

    pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
        self.get_from_key(&desc.key())
    }

@@ -1,17 +1,17 @@
use std::sync::Arc;

use crate::tenant::{OffloadedTimeline, Tenant, TimelineOrOffloaded};

use super::{
    delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard},
    Timeline,
};
use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard};
use super::Timeline;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::{remote_timeline_client, OffloadedTimeline, Tenant, TimelineOrOffloaded};

pub(crate) async fn offload_timeline(
    tenant: &Tenant,
    timeline: &Arc<Timeline>,
) -> anyhow::Result<()> {
    debug_assert_current_span_has_tenant_and_timeline_id();
    tracing::info!("offloading archived timeline");

    let (timeline, guard) = DeleteTimelineFlow::prepare(tenant, timeline.timeline_id)?;

    let TimelineOrOffloaded::Timeline(timeline) = timeline else {
@@ -19,14 +19,28 @@ pub(crate) async fn offload_timeline(
        return Ok(());
    };

    let is_archived = timeline.is_archived();
    match is_archived {
        Some(true) => (),
        Some(false) => {
            tracing::warn!(?is_archived, "tried offloading a non-archived timeline");
            anyhow::bail!("timeline isn't archived");
        }
        None => {
            tracing::warn!(
                ?is_archived,
                "tried offloading a timeline where manifest is not yet available"
            );
            anyhow::bail!("timeline manifest hasn't been loaded yet");
        }
    }

    // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
    timeline.shutdown(super::ShutdownMode::Hard).await;

    // TODO extend guard mechanism above with method
    // to make deletions possible while offloading is in progress

    // TODO mark timeline as offloaded in S3

    let conf = &tenant.conf;
    delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline).await?;

@@ -36,10 +50,31 @@ pub(crate) async fn offload_timeline(
        let mut offloaded_timelines = tenant.timelines_offloaded.lock().unwrap();
        offloaded_timelines.insert(
            timeline.timeline_id,
            Arc::new(OffloadedTimeline::from_timeline(&timeline)),
            Arc::new(
                OffloadedTimeline::from_timeline(&timeline)
                    .expect("we checked above that timeline was ready"),
            ),
        );
    }

    // Last step: mark timeline as offloaded in S3
    // TODO: maybe move this step above, right above deletion of the local timeline directory,
    // then there is no potential race condition where we partially offload a timeline, and
    // at the next restart attach it again.
    // For that to happen, we'd need to make the manifest reflect our *intended* state,
    // not our actual state of offloaded timelines.
    let manifest = tenant.tenant_manifest();
    // TODO: generation support
    let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
    remote_timeline_client::upload_tenant_manifest(
        &tenant.remote_storage,
        &tenant.tenant_shard_id,
        generation,
        &manifest,
        &tenant.cancel,
    )
    .await?;

    Ok(())
}

@@ -5,7 +5,11 @@ use camino::Utf8PathBuf;
use tracing::{error, info, info_span};
use utils::{fs_ext, id::TimelineId, lsn::Lsn};

use crate::{context::RequestContext, import_datadir, tenant::Tenant};
use crate::{
    context::RequestContext,
    import_datadir,
    tenant::{CreateTimelineIdempotency, Tenant, TimelineOrOffloaded},
};

use super::Timeline;

@@ -165,13 +169,17 @@ pub(crate) struct TimelineCreateGuard<'t> {
    owning_tenant: &'t Tenant,
    timeline_id: TimelineId,
    pub(crate) timeline_path: Utf8PathBuf,
    pub(crate) idempotency: CreateTimelineIdempotency,
}

/// Errors when acquiring exclusive access to a timeline ID for creation
#[derive(thiserror::Error, Debug)]
pub(crate) enum TimelineExclusionError {
    #[error("Already exists")]
    AlreadyExists(Arc<Timeline>),
    AlreadyExists {
        existing: TimelineOrOffloaded,
        arg: CreateTimelineIdempotency,
    },
    #[error("Already creating")]
    AlreadyCreating,

@@ -185,27 +193,42 @@ impl<'t> TimelineCreateGuard<'t> {
        owning_tenant: &'t Tenant,
        timeline_id: TimelineId,
        timeline_path: Utf8PathBuf,
        idempotency: CreateTimelineIdempotency,
        allow_offloaded: bool,
    ) -> Result<Self, TimelineExclusionError> {
        // Lock order: this is the only place we take both locks. During drop() we only
        // lock creating_timelines
        let timelines = owning_tenant.timelines.lock().unwrap();
        let timelines_offloaded = owning_tenant.timelines_offloaded.lock().unwrap();
        let mut creating_timelines: std::sync::MutexGuard<
            '_,
            std::collections::HashSet<TimelineId>,
        > = owning_tenant.timelines_creating.lock().unwrap();

        if let Some(existing) = timelines.get(&timeline_id) {
            Err(TimelineExclusionError::AlreadyExists(existing.clone()))
        } else if creating_timelines.contains(&timeline_id) {
            Err(TimelineExclusionError::AlreadyCreating)
        } else {
            creating_timelines.insert(timeline_id);
            Ok(Self {
                owning_tenant,
                timeline_id,
                timeline_path,
            })
            return Err(TimelineExclusionError::AlreadyExists {
                existing: TimelineOrOffloaded::Timeline(existing.clone()),
                arg: idempotency,
            });
        }
        if !allow_offloaded {
            if let Some(existing) = timelines_offloaded.get(&timeline_id) {
                return Err(TimelineExclusionError::AlreadyExists {
                    existing: TimelineOrOffloaded::Offloaded(existing.clone()),
                    arg: idempotency,
                });
            }
        }
        if creating_timelines.contains(&timeline_id) {
            return Err(TimelineExclusionError::AlreadyCreating);
        }
        creating_timelines.insert(timeline_id);
        Ok(Self {
            owning_tenant,
            timeline_id,
            timeline_path,
            idempotency,
        })
    }
}

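The rewritten guard above reserves the timeline ID in `timelines_creating` and releases it on drop. A self-contained model of that reserve-then-release-on-drop idea, with std types standing in for the tenant state (names are illustrative, not the real API):

use std::collections::HashSet;
use std::sync::Mutex;

struct CreateGuard<'a> {
    creating: &'a Mutex<HashSet<u64>>,
    id: u64,
}

impl<'a> CreateGuard<'a> {
    fn new(creating: &'a Mutex<HashSet<u64>>, id: u64) -> Option<Self> {
        let mut set = creating.lock().unwrap();
        if !set.insert(id) {
            return None; // someone is already creating this id
        }
        Some(CreateGuard { creating, id })
    }
}

impl<'a> Drop for CreateGuard<'a> {
    fn drop(&mut self) {
        // Release the reservation even on error/unwind paths.
        self.creating.lock().unwrap().remove(&self.id);
    }
}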
@@ -18,7 +18,7 @@
use std::collections::BTreeMap;
use std::ops::Deref;

use bytes::{Bytes, BytesMut};
use bytes::Bytes;
use pageserver_api::key::Key;
use tokio::io::AsyncWriteExt;
use tokio_epoll_uring::BoundedBuf;
@@ -27,6 +27,7 @@ use utils::vec_map::VecMap;

use crate::context::RequestContext;
use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK};
use crate::virtual_file::IoBufferMut;
use crate::virtual_file::{self, VirtualFile};

/// Metadata bundled with the start and end offset of a blob.
@@ -158,7 +159,7 @@ impl std::fmt::Display for VectoredBlob {
/// Return type of [`VectoredBlobReader::read_blobs`]
pub struct VectoredBlobsBuf {
    /// Buffer for all blobs in this read
    pub buf: BytesMut,
    pub buf: IoBufferMut,
    /// Offsets into the buffer and metadata for all blobs in this read
    pub blobs: Vec<VectoredBlob>,
}
@@ -441,7 +442,7 @@ impl<'a> VectoredBlobReader<'a> {
    pub async fn read_blobs(
        &self,
        read: &VectoredRead,
        buf: BytesMut,
        buf: IoBufferMut,
        ctx: &RequestContext,
    ) -> Result<VectoredBlobsBuf, std::io::Error> {
        assert!(read.size() > 0);
@@ -916,7 +917,7 @@ mod tests {

        // Multiply by two (compressed data might need more space), and add a few bytes for the header
        let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16;
        let mut buf = BytesMut::with_capacity(reserved_bytes);
        let mut buf = IoBufferMut::with_capacity(reserved_bytes);

        let vectored_blob_reader = VectoredBlobReader::new(&file);
        let meta = BlobMeta {

@@ -18,6 +18,9 @@ use crate::page_cache::{PageWriteGuard, PAGE_SZ};
use crate::tenant::TENANTS_SEGMENT_NAME;
use camino::{Utf8Path, Utf8PathBuf};
use once_cell::sync::OnceCell;
use owned_buffers_io::aligned_buffer::buffer::AlignedBuffer;
use owned_buffers_io::aligned_buffer::{AlignedBufferMut, AlignedSlice, ConstAlign};
use owned_buffers_io::io_buf_aligned::IoBufAlignedMut;
use owned_buffers_io::io_buf_ext::FullSlice;
use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
use pageserver_api::shard::TenantShardId;
@@ -55,6 +58,8 @@ pub(crate) mod owned_buffers_io {
    //! but for the time being we're proving out the primitives in the neon.git repo
    //! for faster iteration.

    pub(crate) mod aligned_buffer;
    pub(crate) mod io_buf_aligned;
    pub(crate) mod io_buf_ext;
    pub(crate) mod slice;
    pub(crate) mod write;
@@ -196,7 +201,7 @@ impl VirtualFile {
        ctx: &RequestContext,
    ) -> Result<Slice<Buf>, Error>
    where
        Buf: IoBufMut + Send,
        Buf: IoBufAlignedMut + Send,
    {
        self.inner.read_exact_at(slice, offset, ctx).await
    }
@@ -771,7 +776,7 @@ impl VirtualFileInner {
        ctx: &RequestContext,
    ) -> Result<Slice<Buf>, Error>
    where
        Buf: IoBufMut + Send,
        Buf: IoBufAlignedMut + Send,
    {
        let assert_we_return_original_bounds = if cfg!(debug_assertions) {
            Some((slice.stable_ptr() as usize, slice.bytes_total()))
@@ -1222,12 +1227,14 @@ impl VirtualFileInner {
        ctx: &RequestContext,
    ) -> Result<crate::tenant::block_io::BlockLease<'_>, std::io::Error> {
        use crate::page_cache::PAGE_SZ;
        let slice = Vec::with_capacity(PAGE_SZ).slice_full();
        let slice = IoBufferMut::with_capacity(PAGE_SZ).slice_full();
        assert_eq!(slice.bytes_total(), PAGE_SZ);
        let slice = self
            .read_exact_at(slice, blknum as u64 * (PAGE_SZ as u64), ctx)
            .await?;
        Ok(crate::tenant::block_io::BlockLease::Vec(slice.into_inner()))
        Ok(crate::tenant::block_io::BlockLease::IoBufferMut(
            slice.into_inner(),
        ))
    }

    async fn read_to_end(&mut self, buf: &mut Vec<u8>, ctx: &RequestContext) -> Result<(), Error> {
@@ -1325,10 +1332,11 @@ impl OpenFiles {
/// server startup.
///
#[cfg(not(test))]
pub fn init(num_slots: usize, engine: IoEngineKind) {
pub fn init(num_slots: usize, engine: IoEngineKind, mode: IoMode) {
    if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() {
        panic!("virtual_file::init called twice");
    }
    set_io_mode(mode);
    io_engine::init(engine);
    crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64);
}
@@ -1357,6 +1365,11 @@ pub(crate) const fn get_io_buffer_alignment() -> usize {
    DEFAULT_IO_BUFFER_ALIGNMENT
}

pub(crate) type IoBufferMut = AlignedBufferMut<ConstAlign<{ get_io_buffer_alignment() }>>;
pub(crate) type IoBuffer = AlignedBuffer<ConstAlign<{ get_io_buffer_alignment() }>>;
pub(crate) type IoPageSlice<'a> =
    AlignedSlice<'a, PAGE_SZ, ConstAlign<{ get_io_buffer_alignment() }>>;

static IO_MODE: AtomicU8 = AtomicU8::new(IoMode::preferred() as u8);

pub(crate) fn set_io_mode(mode: IoMode) {
@@ -1395,10 +1408,10 @@ mod tests {
    impl MaybeVirtualFile {
        async fn read_exact_at(
            &self,
            mut slice: tokio_epoll_uring::Slice<Vec<u8>>,
            mut slice: tokio_epoll_uring::Slice<IoBufferMut>,
            offset: u64,
            ctx: &RequestContext,
        ) -> Result<tokio_epoll_uring::Slice<Vec<u8>>, Error> {
        ) -> Result<tokio_epoll_uring::Slice<IoBufferMut>, Error> {
            match self {
                MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(slice, offset, ctx).await,
                MaybeVirtualFile::File(file) => {
@@ -1466,12 +1479,13 @@ mod tests {
            len: usize,
            ctx: &RequestContext,
        ) -> Result<String, Error> {
            let slice = Vec::with_capacity(len).slice_full();
            let slice = IoBufferMut::with_capacity(len).slice_full();
            assert_eq!(slice.bytes_total(), len);
            let slice = self.read_exact_at(slice, pos, ctx).await?;
            let vec = slice.into_inner();
            assert_eq!(vec.len(), len);
            Ok(String::from_utf8(vec).unwrap())
            let buf = slice.into_inner();
            assert_eq!(buf.len(), len);

            Ok(String::from_utf8(buf.to_vec()).unwrap())
        }
    }

@@ -1695,7 +1709,7 @@ mod tests {
            let files = files.clone();
            let ctx = ctx.detached_child(TaskKind::UnitTest, DownloadBehavior::Error);
            let hdl = rt.spawn(async move {
                let mut buf = vec![0u8; SIZE];
                let mut buf = IoBufferMut::with_capacity_zeroed(SIZE);
                let mut rng = rand::rngs::OsRng;
                for _ in 1..1000 {
                    let f = &files[rng.gen_range(0..files.len())];
@@ -1704,7 +1718,7 @@ mod tests {
                        .await
                        .unwrap()
                        .into_inner();
                    assert!(buf == SAMPLE);
                    assert!(buf[..] == SAMPLE);
                }
            });
            hdls.push(hdl);

@@ -16,18 +16,24 @@ use tokio_epoll_uring::{System, SystemHandle};

use crate::virtual_file::on_fatal_io_error;

use crate::metrics::tokio_epoll_uring as metrics;
use crate::metrics::tokio_epoll_uring::{self as metrics, THREAD_LOCAL_METRICS_STORAGE};

#[derive(Clone)]
struct ThreadLocalState(Arc<ThreadLocalStateInner>);

struct ThreadLocalStateInner {
    cell: tokio::sync::OnceCell<SystemHandle>,
    cell: tokio::sync::OnceCell<SystemHandle<metrics::ThreadLocalMetrics>>,
    launch_attempts: AtomicU32,
    /// populated through fetch_add from [`THREAD_LOCAL_STATE_ID`]
    thread_local_state_id: u64,
}

impl Drop for ThreadLocalStateInner {
    fn drop(&mut self) {
        THREAD_LOCAL_METRICS_STORAGE.remove_system(self.thread_local_state_id);
    }
}

impl ThreadLocalState {
    pub fn new() -> Self {
        Self(Arc::new(ThreadLocalStateInner {
@@ -71,7 +77,8 @@ pub async fn thread_local_system() -> Handle {
                &fake_cancel,
            )
            .await;
            let res = System::launch()
            let per_system_metrics = metrics::THREAD_LOCAL_METRICS_STORAGE.register_system(inner.thread_local_state_id);
            let res = System::launch_with_metrics(per_system_metrics)
            // this might move us to another executor thread => loop outside the get_or_try_init, not inside it
            .await;
            match res {
@@ -86,6 +93,7 @@ pub async fn thread_local_system() -> Handle {
                        emit_launch_failure_process_stats();
                    });
                    metrics::THREAD_LOCAL_LAUNCH_FAILURES.inc();
                    metrics::THREAD_LOCAL_METRICS_STORAGE.remove_system(inner.thread_local_state_id);
                    Err(())
                }
                // abort the process instead of panicking because pageserver usually becomes half-broken if we panic somewhere.
@@ -115,7 +123,7 @@ fn emit_launch_failure_process_stats() {
    // number of threads
    // rss / system memory usage generally

    let tokio_epoll_uring::metrics::Metrics {
    let tokio_epoll_uring::metrics::GlobalMetrics {
        systems_created,
        systems_destroyed,
    } = tokio_epoll_uring::metrics::global();
@@ -182,7 +190,7 @@ fn emit_launch_failure_process_stats() {
pub struct Handle(ThreadLocalState);

impl std::ops::Deref for Handle {
    type Target = SystemHandle;
    type Target = SystemHandle<metrics::ThreadLocalMetrics>;

    fn deref(&self) -> &Self::Target {
        self.0

@@ -0,0 +1,9 @@
pub mod alignment;
pub mod buffer;
pub mod buffer_mut;
pub mod raw;
pub mod slice;

pub use alignment::*;
pub use buffer_mut::AlignedBufferMut;
pub use slice::AlignedSlice;
@@ -0,0 +1,26 @@
pub trait Alignment: std::marker::Unpin + 'static {
    /// Returns the required alignment.
    fn align(&self) -> usize;
}

/// Alignment at compile time.
#[derive(Debug)]
pub struct ConstAlign<const A: usize>;

impl<const A: usize> Alignment for ConstAlign<A> {
    fn align(&self) -> usize {
        A
    }
}

/// Alignment at run time.
#[derive(Debug)]
pub struct RuntimeAlign {
    align: usize,
}

impl Alignment for RuntimeAlign {
    fn align(&self) -> usize {
        self.align
    }
}
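A usage sketch for the trait above: the const and runtime flavors answer the same `align()` query, so callers can stay generic. The 512-byte direct-I/O block size below is an assumption for illustration only:

#[test]
fn const_align_is_direct_io_compatible() {
    // Generic over either ConstAlign<N> or RuntimeAlign.
    fn is_direct_io_compatible<A: Alignment>(a: &A) -> bool {
        a.align() % 512 == 0 // 512: classic O_DIRECT block size, illustrative
    }
    // ConstAlign is a unit struct with a const parameter, so a value is
    // spelled with a turbofish.
    assert!(is_direct_io_compatible(&ConstAlign::<4096>));
}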
@@ -0,0 +1,124 @@
use std::{
    ops::{Deref, Range, RangeBounds},
    sync::Arc,
};

use super::{alignment::Alignment, raw::RawAlignedBuffer};

/// A shared, immutable aligned buffer type.
pub struct AlignedBuffer<A: Alignment> {
    /// Shared raw buffer.
    raw: Arc<RawAlignedBuffer<A>>,
    /// Range that specifies the current slice.
    range: Range<usize>,
}

impl<A: Alignment> AlignedBuffer<A> {
    /// Creates an immutable `IoBuffer` from the raw buffer
    pub(super) fn from_raw(raw: RawAlignedBuffer<A>, range: Range<usize>) -> Self {
        AlignedBuffer {
            raw: Arc::new(raw),
            range,
        }
    }

    /// Returns the number of bytes in the buffer, also referred to as its 'length'.
    #[inline]
    pub fn len(&self) -> usize {
        self.range.len()
    }

    /// Returns the alignment of the buffer.
    #[inline]
    pub fn align(&self) -> usize {
        self.raw.align()
    }

    #[inline]
    fn as_ptr(&self) -> *const u8 {
        // SAFETY: `self.range.start` is guaranteed to be within [0, self.len()).
        unsafe { self.raw.as_ptr().add(self.range.start) }
    }

    /// Extracts a slice containing the entire buffer.
    ///
    /// Equivalent to `&s[..]`.
    #[inline]
    fn as_slice(&self) -> &[u8] {
        &self.raw.as_slice()[self.range.start..self.range.end]
    }

    /// Returns a slice of self for the index range `[begin..end)`.
    pub fn slice(&self, range: impl RangeBounds<usize>) -> Self {
        use core::ops::Bound;
        let len = self.len();

        let begin = match range.start_bound() {
            Bound::Included(&n) => n,
            Bound::Excluded(&n) => n.checked_add(1).expect("out of range"),
            Bound::Unbounded => 0,
        };

        let end = match range.end_bound() {
            Bound::Included(&n) => n.checked_add(1).expect("out of range"),
            Bound::Excluded(&n) => n,
            Bound::Unbounded => len,
        };

        assert!(
            begin <= end,
            "range start must not be greater than end: {:?} <= {:?}",
            begin,
            end,
        );
        assert!(
            end <= len,
            "range end out of bounds: {:?} <= {:?}",
            end,
            len,
        );

        let begin = self.range.start + begin;
        let end = self.range.start + end;

        AlignedBuffer {
            raw: Arc::clone(&self.raw),
            range: begin..end,
        }
    }
}

impl<A: Alignment> Deref for AlignedBuffer<A> {
    type Target = [u8];

    fn deref(&self) -> &Self::Target {
        self.as_slice()
    }
}

impl<A: Alignment> AsRef<[u8]> for AlignedBuffer<A> {
    fn as_ref(&self) -> &[u8] {
        self.as_slice()
    }
}

impl<A: Alignment> PartialEq<[u8]> for AlignedBuffer<A> {
    fn eq(&self, other: &[u8]) -> bool {
        self.as_slice().eq(other)
    }
}

/// SAFETY: the underlying buffer references a stable memory region.
unsafe impl<A: Alignment> tokio_epoll_uring::IoBuf for AlignedBuffer<A> {
    fn stable_ptr(&self) -> *const u8 {
        self.as_ptr()
    }

    fn bytes_init(&self) -> usize {
        self.len()
    }

    fn bytes_total(&self) -> usize {
        self.len()
    }
}
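A usage sketch for `slice` above, assuming `AlignedBufferMut::with_capacity_zeroed` and `freeze` from the companion `buffer_mut` module later in this diff: nested slices re-base onto the same shared allocation.

#[test]
fn nested_slices_share_the_allocation() {
    // 8 zeroed bytes, frozen into an immutable AlignedBuffer.
    let buf = AlignedBufferMut::<ConstAlign<512>>::with_capacity_zeroed(8).freeze();
    let outer = buf.slice(2..6);
    assert_eq!(outer.len(), 4);
    // Slicing the slice offsets into the original raw buffer: outer[1..3]
    // covers bytes 3..5 of `buf`.
    let inner = outer.slice(1..3);
    assert_eq!(inner.len(), 2);
}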
@@ -0,0 +1,347 @@
use std::ops::{Deref, DerefMut};

use super::{
    alignment::{Alignment, ConstAlign},
    buffer::AlignedBuffer,
    raw::RawAlignedBuffer,
};

/// A mutable aligned buffer type.
#[derive(Debug)]
pub struct AlignedBufferMut<A: Alignment> {
    raw: RawAlignedBuffer<A>,
}

impl<const A: usize> AlignedBufferMut<ConstAlign<A>> {
    /// Constructs a new, empty `IoBufferMut` with at least the specified capacity and alignment.
    ///
    /// The buffer will be able to hold at most `capacity` elements and will never resize.
    ///
    /// # Panics
    ///
    /// Panics if the new capacity exceeds `isize::MAX` _bytes_, or if the following alignment requirement is not met:
    /// * `align` must not be zero,
    ///
    /// * `align` must be a power of two,
    ///
    /// * `capacity`, when rounded up to the nearest multiple of `align`,
    ///   must not overflow isize (i.e., the rounded value must be
    ///   less than or equal to `isize::MAX`).
    pub fn with_capacity(capacity: usize) -> Self {
        AlignedBufferMut {
            raw: RawAlignedBuffer::with_capacity(capacity),
        }
    }

    /// Constructs a new `IoBufferMut` with at least the specified capacity and alignment, filled with zeros.
    pub fn with_capacity_zeroed(capacity: usize) -> Self {
        use bytes::BufMut;
        let mut buf = Self::with_capacity(capacity);
        buf.put_bytes(0, capacity);
        // SAFETY: `put_bytes` filled the entire buffer.
        unsafe { buf.set_len(capacity) };
        buf
    }
}

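A usage sketch for the two constructors above: `with_capacity` yields an empty buffer (length 0), while `with_capacity_zeroed` initializes and exposes all `capacity` bytes.

#[test]
fn zeroed_buffer_is_fully_initialized() {
    let empty = AlignedBufferMut::<ConstAlign<512>>::with_capacity(16);
    assert_eq!(empty.len(), 0);

    let zeroed = AlignedBufferMut::<ConstAlign<512>>::with_capacity_zeroed(16);
    assert_eq!(zeroed.len(), 16);
    assert!(zeroed.iter().all(|&b| b == 0)); // via Deref<Target = [u8]>
}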
impl<A: Alignment> AlignedBufferMut<A> {
|
||||
/// Returns the total number of bytes the buffer can hold.
|
||||
#[inline]
|
||||
pub fn capacity(&self) -> usize {
|
||||
self.raw.capacity()
|
||||
}
|
||||
|
||||
/// Returns the alignment of the buffer.
|
||||
#[inline]
|
||||
pub fn align(&self) -> usize {
|
||||
self.raw.align()
|
||||
}
|
||||
|
||||
/// Returns the number of bytes in the buffer, also referred to as its 'length'.
|
||||
#[inline]
|
||||
pub fn len(&self) -> usize {
|
||||
self.raw.len()
|
||||
}
|
||||
|
||||
/// Force the length of the buffer to `new_len`.
|
||||
#[inline]
|
||||
unsafe fn set_len(&mut self, new_len: usize) {
|
||||
self.raw.set_len(new_len)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn as_ptr(&self) -> *const u8 {
|
||||
self.raw.as_ptr()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn as_mut_ptr(&mut self) -> *mut u8 {
|
||||
self.raw.as_mut_ptr()
|
||||
}
|
||||
|
||||
/// Extracts a slice containing the entire buffer.
|
||||
///
|
||||
/// Equivalent to `&s[..]`.
|
||||
#[inline]
|
||||
fn as_slice(&self) -> &[u8] {
|
||||
self.raw.as_slice()
|
||||
}
|
||||
|
||||
/// Extracts a mutable slice of the entire buffer.
|
||||
///
|
||||
/// Equivalent to `&mut s[..]`.
|
||||
fn as_mut_slice(&mut self) -> &mut [u8] {
|
||||
self.raw.as_mut_slice()
|
||||
}
|
||||
|
||||
/// Drops the all the contents of the buffer, setting its length to `0`.
|
||||
#[inline]
|
||||
pub fn clear(&mut self) {
|
||||
self.raw.clear()
|
||||
}
|
||||
|
||||
/// Reserves capacity for at least `additional` more bytes to be inserted
|
||||
/// in the given `IoBufferMut`. The collection may reserve more space to
|
||||
/// speculatively avoid frequent reallocations. After calling `reserve`,
|
||||
/// capacity will be greater than or equal to `self.len() + additional`.
|
||||
/// Does nothing if capacity is already sufficient.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// Panics if the new capacity exceeds `isize::MAX` _bytes_.
|
||||
pub fn reserve(&mut self, additional: usize) {
|
||||
self.raw.reserve(additional);
|
||||
}
|
||||
|
||||
/// Shortens the buffer, keeping the first len bytes.
|
||||
pub fn truncate(&mut self, len: usize) {
|
||||
self.raw.truncate(len);
|
||||
}
|
||||
|
||||
/// Consumes and leaks the `IoBufferMut`, returning a mutable reference to the contents, &'a mut [u8].
|
||||
pub fn leak<'a>(self) -> &'a mut [u8] {
|
||||
self.raw.leak()
|
||||
}
|
||||
|
||||
pub fn freeze(self) -> AlignedBuffer<A> {
|
||||
let len = self.len();
|
||||
AlignedBuffer::from_raw(self.raw, 0..len)
|
||||
}
|
||||
}
|
||||
|
||||
impl<A: Alignment> Deref for AlignedBufferMut<A> {
    type Target = [u8];

    fn deref(&self) -> &Self::Target {
        self.as_slice()
    }
}

impl<A: Alignment> DerefMut for AlignedBufferMut<A> {
    fn deref_mut(&mut self) -> &mut Self::Target {
        self.as_mut_slice()
    }
}

impl<A: Alignment> AsRef<[u8]> for AlignedBufferMut<A> {
    fn as_ref(&self) -> &[u8] {
        self.as_slice()
    }
}

impl<A: Alignment> AsMut<[u8]> for AlignedBufferMut<A> {
    fn as_mut(&mut self) -> &mut [u8] {
        self.as_mut_slice()
    }
}

impl<A: Alignment> PartialEq<[u8]> for AlignedBufferMut<A> {
    fn eq(&self, other: &[u8]) -> bool {
        self.as_slice().eq(other)
    }
}
/// SAFETY: When advancing the internal cursor, the caller needs to make sure the bytes advanced past have been initialized.
unsafe impl<A: Alignment> bytes::BufMut for AlignedBufferMut<A> {
    #[inline]
    fn remaining_mut(&self) -> usize {
        // Although a `Vec` can have at most `isize::MAX` bytes, we never want to grow `IoBufferMut`.
        // Thus, it can have at most `self.capacity` bytes.
        self.capacity() - self.len()
    }

    // SAFETY: The caller needs to make sure the bytes being advanced past have been initialized.
    #[inline]
    unsafe fn advance_mut(&mut self, cnt: usize) {
        let len = self.len();
        let remaining = self.remaining_mut();

        if remaining < cnt {
            panic_advance(cnt, remaining);
        }

        // The addition will not overflow since the sum is at most the capacity.
        self.set_len(len + cnt);
    }

    #[inline]
    fn chunk_mut(&mut self) -> &mut bytes::buf::UninitSlice {
        let cap = self.capacity();
        let len = self.len();

        // SAFETY: Since `self.ptr` is valid for `cap` bytes, `self.ptr.add(len)` must be
        // valid for `cap - len` bytes. The subtraction will not underflow since
        // `len <= cap`.
        unsafe {
            bytes::buf::UninitSlice::from_raw_parts_mut(self.as_mut_ptr().add(len), cap - len)
        }
    }
}
/// Panic with a nice error message.
#[cold]
fn panic_advance(idx: usize, len: usize) -> ! {
    panic!(
        "advance out of bounds: the len is {} but advancing by {}",
        len, idx
    );
}

/// Safety: [`AlignedBufferMut`] has exclusive ownership of the io buffer,
/// and the underlying pointer remains stable while io-uring is owning the buffer.
/// The tokio-epoll-uring crate itself will not resize the buffer and will respect
/// [`tokio_epoll_uring::IoBuf::bytes_total`].
unsafe impl<A: Alignment> tokio_epoll_uring::IoBuf for AlignedBufferMut<A> {
    fn stable_ptr(&self) -> *const u8 {
        self.as_ptr()
    }

    fn bytes_init(&self) -> usize {
        self.len()
    }

    fn bytes_total(&self) -> usize {
        self.capacity()
    }
}

// SAFETY: See above.
unsafe impl<A: Alignment> tokio_epoll_uring::IoBufMut for AlignedBufferMut<A> {
    fn stable_mut_ptr(&mut self) -> *mut u8 {
        self.as_mut_ptr()
    }

    unsafe fn set_init(&mut self, init_len: usize) {
        if self.len() < init_len {
            self.set_len(init_len);
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    const ALIGN: usize = 4 * 1024;
    type TestIoBufferMut = AlignedBufferMut<ConstAlign<ALIGN>>;

    #[test]
    fn test_with_capacity() {
        let v = TestIoBufferMut::with_capacity(ALIGN * 4);
        assert_eq!(v.len(), 0);
        assert_eq!(v.capacity(), ALIGN * 4);
        assert_eq!(v.align(), ALIGN);
        assert_eq!(v.as_ptr().align_offset(ALIGN), 0);

        let v = TestIoBufferMut::with_capacity(ALIGN / 2);
        assert_eq!(v.len(), 0);
        assert_eq!(v.capacity(), ALIGN / 2);
        assert_eq!(v.align(), ALIGN);
        assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
    }

    #[test]
    fn test_with_capacity_zeroed() {
        let v = TestIoBufferMut::with_capacity_zeroed(ALIGN);
        assert_eq!(v.len(), ALIGN);
        assert_eq!(v.capacity(), ALIGN);
        assert_eq!(v.align(), ALIGN);
        assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
        assert_eq!(&v[..], &[0; ALIGN])
    }

    #[test]
    fn test_reserve() {
        use bytes::BufMut;
        let mut v = TestIoBufferMut::with_capacity(ALIGN);
        let capacity = v.capacity();
        v.reserve(capacity);
        assert_eq!(v.capacity(), capacity);
        let data = [b'a'; ALIGN];
        v.put(&data[..]);
        v.reserve(capacity);
        assert!(v.capacity() >= capacity * 2);
        assert_eq!(&v[..], &data[..]);
        let capacity = v.capacity();
        v.clear();
        v.reserve(capacity);
        assert_eq!(capacity, v.capacity());
    }

    #[test]
    fn test_bytes_put() {
        use bytes::BufMut;
        let mut v = TestIoBufferMut::with_capacity(ALIGN * 4);
        let x = [b'a'; ALIGN];

        for _ in 0..2 {
            for _ in 0..4 {
                v.put(&x[..]);
            }
            assert_eq!(v.len(), ALIGN * 4);
            assert_eq!(v.capacity(), ALIGN * 4);
            assert_eq!(v.align(), ALIGN);
            assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
            v.clear()
        }
        assert_eq!(v.len(), 0);
        assert_eq!(v.capacity(), ALIGN * 4);
        assert_eq!(v.align(), ALIGN);
        assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
    }

    #[test]
    #[should_panic]
    fn test_bytes_put_panic() {
        use bytes::BufMut;
        const ALIGN: usize = 4 * 1024;
        let mut v = TestIoBufferMut::with_capacity(ALIGN * 4);
        let x = [b'a'; ALIGN];
        for _ in 0..5 {
            v.put_slice(&x[..]);
        }
    }

    #[test]
    fn test_io_buf_put_slice() {
        use tokio_epoll_uring::BoundedBufMut;
        const ALIGN: usize = 4 * 1024;
        let mut v = TestIoBufferMut::with_capacity(ALIGN);
        let x = [b'a'; ALIGN];

        for _ in 0..2 {
            v.put_slice(&x[..]);
            assert_eq!(v.len(), ALIGN);
            assert_eq!(v.capacity(), ALIGN);
            assert_eq!(v.align(), ALIGN);
            assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
            v.clear()
        }
        assert_eq!(v.len(), 0);
        assert_eq!(v.capacity(), ALIGN);
        assert_eq!(v.align(), ALIGN);
        assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
    }
}
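A minimal usage sketch of the buffer type added above (module-internal, since `as_ptr` is private; only APIs defined in this file are used): allocate an 8 KiB buffer aligned to 4 KiB, fill one page through the `bytes::BufMut` impl, then freeze it into the immutable `AlignedBuffer` form.

use bytes::BufMut;

fn demo_aligned_buffer() {
    // 8 KiB buffer aligned to 4 KiB, as needed for O_DIRECT-style I/O.
    let mut buf: AlignedBufferMut<ConstAlign<4096>> = AlignedBufferMut::with_capacity(8192);
    assert_eq!(buf.align(), 4096);
    assert_eq!(buf.as_ptr().align_offset(4096), 0);

    buf.put_slice(&[0xAB; 4096]); // advances len; stays within the fixed capacity
    assert_eq!(buf.len(), 4096);

    let _frozen = buf.freeze(); // read-only view over the initialized 0..4096 range
}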
@@ -0,0 +1,216 @@
use core::slice;
use std::{
    alloc::{self, Layout},
    cmp,
    mem::ManuallyDrop,
};

use super::alignment::{Alignment, ConstAlign};

#[derive(Debug)]
struct AlignedBufferPtr(*mut u8);

// SAFETY: We guarantee no one besides `AlignedBufferPtr` itself has the raw pointer.
unsafe impl Send for AlignedBufferPtr {}

// SAFETY: We guarantee no one besides `AlignedBufferPtr` itself has the raw pointer.
unsafe impl Sync for AlignedBufferPtr {}

/// An aligned buffer type.
#[derive(Debug)]
pub struct RawAlignedBuffer<A: Alignment> {
    ptr: AlignedBufferPtr,
    capacity: usize,
    len: usize,
    align: A,
}

impl<const A: usize> RawAlignedBuffer<ConstAlign<A>> {
    /// Constructs a new, empty `IoBufferMut` with at least the specified capacity and alignment.
    ///
    /// The buffer will be able to hold at most `capacity` elements and will never resize.
    ///
    /// # Panics
    ///
    /// Panics if the new capacity exceeds `isize::MAX` _bytes_, or if the following alignment requirement is not met:
    /// * `align` must not be zero,
    /// * `align` must be a power of two,
    /// * `capacity`, when rounded up to the nearest multiple of `align`,
    ///   must not overflow `isize` (i.e., the rounded value must be
    ///   less than or equal to `isize::MAX`).
    pub fn with_capacity(capacity: usize) -> Self {
        let align = ConstAlign::<A>;
        let layout = Layout::from_size_align(capacity, align.align()).expect("Invalid layout");

        // SAFETY: Making an allocation with a sized and aligned layout. The memory is manually freed with the same layout.
        let ptr = unsafe {
            let ptr = alloc::alloc(layout);
            if ptr.is_null() {
                alloc::handle_alloc_error(layout);
            }
            AlignedBufferPtr(ptr)
        };

        RawAlignedBuffer {
            ptr,
            capacity,
            len: 0,
            align,
        }
    }
}

impl<A: Alignment> RawAlignedBuffer<A> {
    /// Returns the total number of bytes the buffer can hold.
    #[inline]
    pub fn capacity(&self) -> usize {
        self.capacity
    }

    /// Returns the alignment of the buffer.
    #[inline]
    pub fn align(&self) -> usize {
        self.align.align()
    }

    /// Returns the number of bytes in the buffer, also referred to as its 'length'.
    #[inline]
    pub fn len(&self) -> usize {
        self.len
    }

    /// Force the length of the buffer to `new_len`.
    #[inline]
    pub unsafe fn set_len(&mut self, new_len: usize) {
        debug_assert!(new_len <= self.capacity());
        self.len = new_len;
    }

    #[inline]
    pub fn as_ptr(&self) -> *const u8 {
        self.ptr.0
    }

    #[inline]
    pub fn as_mut_ptr(&mut self) -> *mut u8 {
        self.ptr.0
    }

    /// Extracts a slice containing the entire buffer.
    ///
    /// Equivalent to `&s[..]`.
    #[inline]
    pub fn as_slice(&self) -> &[u8] {
        // SAFETY: The pointer is valid and `len` bytes are initialized.
        unsafe { slice::from_raw_parts(self.as_ptr(), self.len) }
    }

    /// Extracts a mutable slice of the entire buffer.
    ///
    /// Equivalent to `&mut s[..]`.
    pub fn as_mut_slice(&mut self) -> &mut [u8] {
        // SAFETY: The pointer is valid and `len` bytes are initialized.
        unsafe { slice::from_raw_parts_mut(self.as_mut_ptr(), self.len) }
    }

    /// Drops all the contents of the buffer, setting its length to `0`.
    #[inline]
    pub fn clear(&mut self) {
        self.len = 0;
    }

    /// Reserves capacity for at least `additional` more bytes to be inserted
    /// in the given `IoBufferMut`. The collection may reserve more space to
    /// speculatively avoid frequent reallocations. After calling `reserve`,
    /// capacity will be greater than or equal to `self.len() + additional`.
    /// Does nothing if capacity is already sufficient.
    ///
    /// # Panics
    ///
    /// Panics if the new capacity exceeds `isize::MAX` _bytes_.
    pub fn reserve(&mut self, additional: usize) {
        if additional > self.capacity() - self.len() {
            self.reserve_inner(additional);
        }
    }

    fn reserve_inner(&mut self, additional: usize) {
        let Some(required_cap) = self.len().checked_add(additional) else {
            capacity_overflow()
        };

        let old_capacity = self.capacity();
        let align = self.align();
        // This guarantees exponential growth. The doubling cannot overflow
        // because `cap <= isize::MAX` and the type of `cap` is `usize`.
        let cap = cmp::max(old_capacity * 2, required_cap);

        if !is_valid_alloc(cap) {
            capacity_overflow()
        }
        let new_layout = Layout::from_size_align(cap, self.align()).expect("Invalid layout");

        let old_ptr = self.as_mut_ptr();

        // SAFETY: the old allocation was allocated with `std::alloc::alloc` with the same layout,
        // and we panic on a null pointer.
        let (ptr, cap) = unsafe {
            let old_layout = Layout::from_size_align_unchecked(old_capacity, align);
            let ptr = alloc::realloc(old_ptr, old_layout, new_layout.size());
            if ptr.is_null() {
                alloc::handle_alloc_error(new_layout);
            }
            (AlignedBufferPtr(ptr), cap)
        };

        self.ptr = ptr;
        self.capacity = cap;
    }

    /// Shortens the buffer, keeping the first `len` bytes.
    pub fn truncate(&mut self, len: usize) {
        if len > self.len {
            return;
        }
        self.len = len;
    }

    /// Consumes and leaks the `IoBufferMut`, returning a mutable reference to the contents, `&'a mut [u8]`.
    pub fn leak<'a>(self) -> &'a mut [u8] {
        let mut buf = ManuallyDrop::new(self);
        // SAFETY: leaking the buffer as intended.
        unsafe { slice::from_raw_parts_mut(buf.as_mut_ptr(), buf.len) }
    }
}

fn capacity_overflow() -> ! {
    panic!("capacity overflow")
}

// We need to guarantee the following:
// * We don't ever allocate `> isize::MAX` byte-size objects.
// * We don't overflow `usize::MAX` and actually allocate too little.
//
// On 64-bit we just need to check for overflow since trying to allocate
// `> isize::MAX` bytes will surely fail. On 32-bit and 16-bit we need to add
// an extra guard for this in case we're running on a platform which can use
// all 4GB in user-space, e.g., PAE or x32.
#[inline]
fn is_valid_alloc(alloc_size: usize) -> bool {
    !(usize::BITS < 64 && alloc_size > isize::MAX as usize)
}

impl<A: Alignment> Drop for RawAlignedBuffer<A> {
    fn drop(&mut self) {
        // SAFETY: memory was allocated with `std::alloc::alloc` with the same layout.
        unsafe {
            alloc::dealloc(
                self.as_mut_ptr(),
                Layout::from_size_align_unchecked(self.capacity, self.align.align()),
            )
        }
    }
}
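The `reserve_inner` growth rule above is the same amortized-doubling policy `Vec` uses. A standalone sketch of just that arithmetic (hypothetical helper, not part of the patch):

// new_cap = max(old_cap * 2, len + additional), then validated against isize::MAX.
fn grown_capacity(old_cap: usize, len: usize, additional: usize) -> Option<usize> {
    let required = len.checked_add(additional)?;
    let cap = std::cmp::max(old_cap * 2, required);
    (cap <= isize::MAX as usize).then_some(cap)
}

// grown_capacity(4096, 4096, 1) == Some(8192): doubling wins over the bare requirement.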
@@ -0,0 +1,40 @@
use std::ops::{Deref, DerefMut};

use super::alignment::{Alignment, ConstAlign};

/// Newtype for an aligned slice.
pub struct AlignedSlice<'a, const N: usize, A: Alignment> {
    /// underlying byte slice
    buf: &'a mut [u8; N],
    /// alignment marker
    _align: A,
}

impl<'a, const N: usize, const A: usize> AlignedSlice<'a, N, ConstAlign<A>> {
    /// Create a new aligned slice from a mutable byte slice. The input must already satisfy the alignment.
    pub unsafe fn new_unchecked(buf: &'a mut [u8; N]) -> Self {
        let _align = ConstAlign::<A>;
        assert_eq!(buf.as_ptr().align_offset(_align.align()), 0);
        AlignedSlice { buf, _align }
    }
}

impl<'a, const N: usize, A: Alignment> Deref for AlignedSlice<'a, N, A> {
    type Target = [u8; N];

    fn deref(&self) -> &Self::Target {
        self.buf
    }
}

impl<'a, const N: usize, A: Alignment> DerefMut for AlignedSlice<'a, N, A> {
    fn deref_mut(&mut self) -> &mut Self::Target {
        self.buf
    }
}

impl<'a, const N: usize, A: Alignment> AsRef<[u8; N]> for AlignedSlice<'a, N, A> {
    fn as_ref(&self) -> &[u8; N] {
        self.buf
    }
}
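A sketch of the intended call pattern for `new_unchecked` (the `Page` wrapper is hypothetical): the caller supplies the alignment via `#[repr(align)]`, and the `assert_eq!` inside re-checks it at runtime.

#[repr(align(4096))]
struct Page([u8; 4096]); // hypothetical 4 KiB-aligned backing storage

fn page_as_aligned_slice(page: &mut Page) -> AlignedSlice<'_, 4096, ConstAlign<4096>> {
    // SAFETY: `Page` is 4096-byte aligned by its `repr(align)` attribute,
    // so the array inside satisfies the alignment `new_unchecked` requires.
    unsafe { AlignedSlice::new_unchecked(&mut page.0) }
}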
@@ -0,0 +1,9 @@
use tokio_epoll_uring::IoBufMut;

use crate::virtual_file::{IoBufferMut, PageWriteGuardBuf};

pub trait IoBufAlignedMut: IoBufMut {}

impl IoBufAlignedMut for IoBufferMut {}

impl IoBufAlignedMut for PageWriteGuardBuf {}
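The marker trait exists so direct-I/O code paths can demand an aligned buffer at compile time. A sketch of such a bound (function name hypothetical):

// Accepts IoBufferMut or PageWriteGuardBuf, but rejects plain Vec<u8>/BytesMut.
fn require_aligned_buf<B: IoBufAlignedMut>(buf: B) -> B {
    buf
}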
@@ -1,5 +1,6 @@
//! See [`FullSlice`].

use crate::virtual_file::{IoBuffer, IoBufferMut};
use bytes::{Bytes, BytesMut};
use std::ops::{Deref, Range};
use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
@@ -76,3 +77,5 @@ macro_rules! impl_io_buf_ext {
impl_io_buf_ext!(Bytes);
impl_io_buf_ext!(BytesMut);
impl_io_buf_ext!(Vec<u8>);
impl_io_buf_ext!(IoBufferMut);
impl_io_buf_ext!(IoBuffer);
File diff suppressed because it is too large.
@@ -8,6 +8,7 @@ OBJS = \
	file_cache.o \
	hll.o \
	libpagestore.o \
	logical_replication_monitor.o \
	neon.o \
	neon_pgversioncompat.o \
	neon_perf_counters.o \
@@ -54,7 +55,7 @@ walproposer-lib: libwalproposer.a;

.PHONY: libwalproposer.a
libwalproposer.a: $(WALPROP_OBJS)
	rm -f $@
	$(RM) $@
	$(AR) $(AROPT) $@ $^

# needs vars:
@@ -767,7 +767,7 @@ HandleDropRole(DropRoleStmt *stmt)
		entry->type = Op_Delete;
		entry->password = NULL;
		if (!found)
			memset(entry->old_name, 0, sizeof(entry));
			memset(entry->old_name, 0, sizeof(entry->old_name));
	}
}
253 pgxn/neon/logical_replication_monitor.c Normal file
@@ -0,0 +1,253 @@
#include <limits.h>
#include <string.h>
#include <dirent.h>
#include <signal.h>

#include "postgres.h"

#include "miscadmin.h"
#include "postmaster/bgworker.h"
#include "postmaster/interrupt.h"
#include "replication/slot.h"
#include "storage/fd.h"
#include "storage/procsignal.h"
#include "tcop/tcopprot.h"
#include "utils/guc.h"
#include "utils/wait_event.h"

#include "logical_replication_monitor.h"

#define LS_MONITOR_CHECK_INTERVAL 10000	/* ms */

static int logical_replication_max_snap_files = 300;

PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg);

static int
LsnDescComparator(const void *a, const void *b)
{
    XLogRecPtr lsn1 = *((const XLogRecPtr *) a);
    XLogRecPtr lsn2 = *((const XLogRecPtr *) b);

    if (lsn1 < lsn2)
        return 1;
    else if (lsn1 == lsn2)
        return 0;
    else
        return -1;
}

/*
 * Look at .snap files and calculate the minimum allowed restart_lsn of a slot so that
 * the next gc would leave no more than logical_replication_max_snap_files; all
 * slots having a lower restart_lsn should be dropped.
 */
static XLogRecPtr
get_num_snap_files_lsn_threshold(void)
{
    DIR        *dirdesc;
    struct dirent *de;
    char       *snap_path = "pg_logical/snapshots/";
    int         lsns_allocated = 1024;
    int         lsns_num = 0;
    XLogRecPtr *lsns;
    XLogRecPtr  cutoff;

    if (logical_replication_max_snap_files < 0)
        return 0;

    lsns = palloc(sizeof(XLogRecPtr) * lsns_allocated);

    /* find all .snap files and get their lsns */
    dirdesc = AllocateDir(snap_path);
    while ((de = ReadDir(dirdesc, snap_path)) != NULL)
    {
        XLogRecPtr  lsn;
        uint32      hi;
        uint32      lo;

        if (strcmp(de->d_name, ".") == 0 ||
            strcmp(de->d_name, "..") == 0)
            continue;

        if (sscanf(de->d_name, "%X-%X.snap", &hi, &lo) != 2)
        {
            ereport(LOG,
                    (errmsg("could not parse file name as .snap file \"%s\"", de->d_name)));
            continue;
        }

        lsn = ((uint64) hi) << 32 | lo;
        elog(DEBUG5, "found snap file %X/%X", LSN_FORMAT_ARGS(lsn));
        if (lsns_allocated == lsns_num)
        {
            lsns_allocated *= 2;
            lsns = repalloc(lsns, sizeof(XLogRecPtr) * lsns_allocated);
        }
        lsns[lsns_num++] = lsn;
    }
    /* sort by lsn desc */
    qsort(lsns, lsns_num, sizeof(XLogRecPtr), LsnDescComparator);
    /* and take cutoff at logical_replication_max_snap_files */
    if (logical_replication_max_snap_files > lsns_num)
        cutoff = 0;             /* have fewer files than the cutoff */
    else
    {
        cutoff = lsns[logical_replication_max_snap_files - 1];
        elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %d .snap files, limit is %d",
             LSN_FORMAT_ARGS(cutoff), lsns_num, logical_replication_max_snap_files);
    }
    pfree(lsns);
    FreeDir(dirdesc);
    return cutoff;
}

void
InitLogicalReplicationMonitor(void)
{
    BackgroundWorker bgw;

    DefineCustomIntVariable(
                            "neon.logical_replication_max_snap_files",
                            "Maximum allowed logical replication .snap files. When exceeded, slots are dropped until the limit is met. -1 disables the limit.",
                            NULL,
                            &logical_replication_max_snap_files,
                            300, -1, INT_MAX,
                            PGC_SIGHUP,
                            0,
                            NULL, NULL, NULL);

    memset(&bgw, 0, sizeof(bgw));
    bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
    bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
    snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
    snprintf(bgw.bgw_function_name, BGW_MAXLEN, "LogicalSlotsMonitorMain");
    snprintf(bgw.bgw_name, BGW_MAXLEN, "Logical replication monitor");
    snprintf(bgw.bgw_type, BGW_MAXLEN, "Logical replication monitor");
    bgw.bgw_restart_time = 5;
    bgw.bgw_notify_pid = 0;
    bgw.bgw_main_arg = (Datum) 0;

    RegisterBackgroundWorker(&bgw);
}

/*
 * Unused logical replication slots pin WAL and prevent deletion of snapshots.
 * WAL bloat is guarded by max_slot_wal_keep_size; this bgw removes slots which
 * need too many .snap files.
 */
void
LogicalSlotsMonitorMain(Datum main_arg)
{
    /* Establish signal handlers. */
    pqsignal(SIGUSR1, procsignal_sigusr1_handler);
    pqsignal(SIGHUP, SignalHandlerForConfigReload);
    pqsignal(SIGTERM, die);

    BackgroundWorkerUnblockSignals();

    for (;;)
    {
        XLogRecPtr  cutoff_lsn;

        /* In case of a SIGHUP, just reload the configuration. */
        if (ConfigReloadPending)
        {
            ConfigReloadPending = false;
            ProcessConfigFile(PGC_SIGHUP);
        }

        /*
         * If there are too many .snap files, just drop all logical slots to
         * prevent aux files bloat.
         */
        cutoff_lsn = get_num_snap_files_lsn_threshold();
        if (cutoff_lsn > 0)
        {
            for (int i = 0; i < max_replication_slots; i++)
            {
                char        slot_name[NAMEDATALEN];
                ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
                XLogRecPtr  restart_lsn;

                /* find the name */
                LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
                /* Consider only logical replication slots */
                if (!s->in_use || !SlotIsLogical(s))
                {
                    LWLockRelease(ReplicationSlotControlLock);
                    continue;
                }

                /* do we need to drop it? */
                SpinLockAcquire(&s->mutex);
                restart_lsn = s->data.restart_lsn;
                SpinLockRelease(&s->mutex);
                if (restart_lsn >= cutoff_lsn)
                {
                    LWLockRelease(ReplicationSlotControlLock);
                    continue;
                }

                strlcpy(slot_name, s->data.name.data, NAMEDATALEN);
                elog(LOG, "ls_monitor: dropping slot %s with restart_lsn %X/%X below horizon %X/%X",
                     slot_name, LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(cutoff_lsn));
                LWLockRelease(ReplicationSlotControlLock);

                /* now try to drop it, killing the owner first, if any */
                for (;;)
                {
                    pid_t       active_pid;

                    SpinLockAcquire(&s->mutex);
                    active_pid = s->active_pid;
                    SpinLockRelease(&s->mutex);

                    if (active_pid == 0)
                    {
                        /*
                         * Slot is released, try to drop it. Though of course
                         * it could have been reacquired, so drop can ERROR
                         * out. Similarly it could have been dropped in the
                         * meanwhile.
                         *
                         * In principle we could remove pg_try/pg_catch, that
                         * would restart the whole bgworker.
                         */
                        ConditionVariableCancelSleep();
                        PG_TRY();
                        {
                            ReplicationSlotDrop(slot_name, true);
                            elog(LOG, "ls_monitor: slot %s dropped", slot_name);
                        }
                        PG_CATCH();
                        {
                            /* log ERROR and reset elog stack */
                            EmitErrorReport();
                            FlushErrorState();
                            elog(LOG, "ls_monitor: failed to drop slot %s", slot_name);
                        }
                        PG_END_TRY();
                        break;
                    }
                    else
                    {
                        /* kill the owner and wait for release */
                        elog(LOG, "ls_monitor: killing slot %s owner %d", slot_name, active_pid);
                        (void) kill(active_pid, SIGTERM);
                        /* We shouldn't get stuck, but to be safe add timeout. */
                        ConditionVariableTimedSleep(&s->active_cv, 1000, WAIT_EVENT_REPLICATION_SLOT_DROP);
                    }
                }
            }
        }

        (void) WaitLatch(MyLatch,
                         WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT,
                         LS_MONITOR_CHECK_INTERVAL,
                         PG_WAIT_EXTENSION);
        ResetLatch(MyLatch);
        CHECK_FOR_INTERRUPTS();
    }
}
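For clarity, the cutoff rule in get_num_snap_files_lsn_threshold condensed into a Rust sketch (hypothetical helper mirroring the C logic above; assumes max_snap_files >= 1, since the GUC's -1 "disabled" case is handled by the caller):

// Sort LSNs descending; with at least `max_snap_files` of them, the threshold is
// the LSN of the max_snap_files-th newest .snap file. Slots whose restart_lsn
// falls below it get dropped; 0 means "drop nothing".
fn snap_files_lsn_threshold(mut lsns: Vec<u64>, max_snap_files: usize) -> u64 {
    if max_snap_files > lsns.len() {
        return 0; // fewer files than the limit: keep everything
    }
    lsns.sort_unstable_by(|a, b| b.cmp(a)); // descending, like LsnDescComparator
    lsns[max_snap_files - 1]
}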
6 pgxn/neon/logical_replication_monitor.h Normal file
@@ -0,0 +1,6 @@
#ifndef __NEON_LOGICAL_REPLICATION_MONITOR_H__
#define __NEON_LOGICAL_REPLICATION_MONITOR_H__

void InitLogicalReplicationMonitor(void);

#endif
245 pgxn/neon/neon.c
@@ -14,32 +14,22 @@
#include "miscadmin.h"
#include "access/subtrans.h"
#include "access/twophase.h"
#include "access/xact.h"
#include "access/xlog.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "catalog/pg_type.h"
#include "postmaster/bgworker.h"
#include "postmaster/interrupt.h"
#include "replication/logical.h"
#include "replication/slot.h"
#include "replication/walsender.h"
#include "storage/proc.h"
#include "storage/procsignal.h"
#include "tcop/tcopprot.h"
#include "funcapi.h"
#include "access/htup_details.h"
#include "utils/builtins.h"
#include "utils/pg_lsn.h"
#include "utils/guc.h"
#include "utils/guc_tables.h"
#include "utils/wait_event.h"

#include "extension_server.h"
#include "neon.h"
#include "walproposer.h"
#include "pagestore_client.h"
#include "control_plane_connector.h"
#include "logical_replication_monitor.h"
#include "walsender_hooks.h"
#if PG_MAJORVERSION_NUM >= 16
#include "storage/ipc.h"
@@ -48,7 +38,6 @@
PG_MODULE_MAGIC;
void _PG_init(void);

static int logical_replication_max_snap_files = 300;

static int running_xacts_overflow_policy;

@@ -82,237 +71,6 @@ static const struct config_enum_entry running_xacts_overflow_policies[] = {
    {NULL, 0, false}
};
static void
InitLogicalReplicationMonitor(void)
{
    BackgroundWorker bgw;

    DefineCustomIntVariable(
                            "neon.logical_replication_max_snap_files",
                            "Maximum allowed logical replication .snap files. When exceeded, slots are dropped until the limit is met. -1 disables the limit.",
                            NULL,
                            &logical_replication_max_snap_files,
                            300, -1, INT_MAX,
                            PGC_SIGHUP,
                            0,
                            NULL, NULL, NULL);

    memset(&bgw, 0, sizeof(bgw));
    bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
    bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
    snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
    snprintf(bgw.bgw_function_name, BGW_MAXLEN, "LogicalSlotsMonitorMain");
    snprintf(bgw.bgw_name, BGW_MAXLEN, "Logical replication monitor");
    snprintf(bgw.bgw_type, BGW_MAXLEN, "Logical replication monitor");
    bgw.bgw_restart_time = 5;
    bgw.bgw_notify_pid = 0;
    bgw.bgw_main_arg = (Datum) 0;

    RegisterBackgroundWorker(&bgw);
}

static int
LsnDescComparator(const void *a, const void *b)
{
    XLogRecPtr lsn1 = *((const XLogRecPtr *) a);
    XLogRecPtr lsn2 = *((const XLogRecPtr *) b);

    if (lsn1 < lsn2)
        return 1;
    else if (lsn1 == lsn2)
        return 0;
    else
        return -1;
}

/*
 * Look at .snap files and calculate the minimum allowed restart_lsn of a slot so that
 * the next gc would leave no more than logical_replication_max_snap_files; all
 * slots having a lower restart_lsn should be dropped.
 */
static XLogRecPtr
get_num_snap_files_lsn_threshold(void)
{
    DIR        *dirdesc;
    struct dirent *de;
    char       *snap_path = "pg_logical/snapshots/";
    int         lsns_allocated = 1024;
    int         lsns_num = 0;
    XLogRecPtr *lsns;
    XLogRecPtr  cutoff;

    if (logical_replication_max_snap_files < 0)
        return 0;

    lsns = palloc(sizeof(XLogRecPtr) * lsns_allocated);

    /* find all .snap files and get their lsns */
    dirdesc = AllocateDir(snap_path);
    while ((de = ReadDir(dirdesc, snap_path)) != NULL)
    {
        XLogRecPtr  lsn;
        uint32      hi;
        uint32      lo;

        if (strcmp(de->d_name, ".") == 0 ||
            strcmp(de->d_name, "..") == 0)
            continue;

        if (sscanf(de->d_name, "%X-%X.snap", &hi, &lo) != 2)
        {
            ereport(LOG,
                    (errmsg("could not parse file name as .snap file \"%s\"", de->d_name)));
            continue;
        }

        lsn = ((uint64) hi) << 32 | lo;
        elog(DEBUG5, "found snap file %X/%X", LSN_FORMAT_ARGS(lsn));
        if (lsns_allocated == lsns_num)
        {
            lsns_allocated *= 2;
            lsns = repalloc(lsns, sizeof(XLogRecPtr) * lsns_allocated);
        }
        lsns[lsns_num++] = lsn;
    }
    /* sort by lsn desc */
    qsort(lsns, lsns_num, sizeof(XLogRecPtr), LsnDescComparator);
    /* and take cutoff at logical_replication_max_snap_files */
    if (logical_replication_max_snap_files > lsns_num)
        cutoff = 0;             /* have fewer files than the cutoff */
    else
    {
        cutoff = lsns[logical_replication_max_snap_files - 1];
        elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %d .snap files, limit is %d",
             LSN_FORMAT_ARGS(cutoff), lsns_num, logical_replication_max_snap_files);
    }
    pfree(lsns);
    FreeDir(dirdesc);
    return cutoff;
}

#define LS_MONITOR_CHECK_INTERVAL 10000	/* ms */

/*
 * Unused logical replication slots pin WAL and prevent deletion of snapshots.
 * WAL bloat is guarded by max_slot_wal_keep_size; this bgw removes slots which
 * need too many .snap files.
 */
PGDLLEXPORT void
LogicalSlotsMonitorMain(Datum main_arg)
{
    /* Establish signal handlers. */
    pqsignal(SIGUSR1, procsignal_sigusr1_handler);
    pqsignal(SIGHUP, SignalHandlerForConfigReload);
    pqsignal(SIGTERM, die);

    BackgroundWorkerUnblockSignals();

    for (;;)
    {
        XLogRecPtr  cutoff_lsn;

        /* In case of a SIGHUP, just reload the configuration. */
        if (ConfigReloadPending)
        {
            ConfigReloadPending = false;
            ProcessConfigFile(PGC_SIGHUP);
        }

        /*
         * If there are too many .snap files, just drop all logical slots to
         * prevent aux files bloat.
         */
        cutoff_lsn = get_num_snap_files_lsn_threshold();
        if (cutoff_lsn > 0)
        {
            for (int i = 0; i < max_replication_slots; i++)
            {
                char        slot_name[NAMEDATALEN];
                ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
                XLogRecPtr  restart_lsn;

                /* find the name */
                LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
                /* Consider only logical replication slots */
                if (!s->in_use || !SlotIsLogical(s))
                {
                    LWLockRelease(ReplicationSlotControlLock);
                    continue;
                }

                /* do we need to drop it? */
                SpinLockAcquire(&s->mutex);
                restart_lsn = s->data.restart_lsn;
                SpinLockRelease(&s->mutex);
                if (restart_lsn >= cutoff_lsn)
                {
                    LWLockRelease(ReplicationSlotControlLock);
                    continue;
                }

                strlcpy(slot_name, s->data.name.data, NAMEDATALEN);
                elog(LOG, "ls_monitor: dropping slot %s with restart_lsn %X/%X below horizon %X/%X",
                     slot_name, LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(cutoff_lsn));
                LWLockRelease(ReplicationSlotControlLock);

                /* now try to drop it, killing the owner first, if any */
                for (;;)
                {
                    pid_t       active_pid;

                    SpinLockAcquire(&s->mutex);
                    active_pid = s->active_pid;
                    SpinLockRelease(&s->mutex);

                    if (active_pid == 0)
                    {
                        /*
                         * Slot is released, try to drop it. Though of course
                         * it could have been reacquired, so drop can ERROR
                         * out. Similarly it could have been dropped in the
                         * meanwhile.
                         *
                         * In principle we could remove pg_try/pg_catch, that
                         * would restart the whole bgworker.
                         */
                        ConditionVariableCancelSleep();
                        PG_TRY();
                        {
                            ReplicationSlotDrop(slot_name, true);
                            elog(LOG, "ls_monitor: slot %s dropped", slot_name);
                        }
                        PG_CATCH();
                        {
                            /* log ERROR and reset elog stack */
                            EmitErrorReport();
                            FlushErrorState();
                            elog(LOG, "ls_monitor: failed to drop slot %s", slot_name);
                        }
                        PG_END_TRY();
                        break;
                    }
                    else
                    {
                        /* kill the owner and wait for release */
                        elog(LOG, "ls_monitor: killing slot %s owner %d", slot_name, active_pid);
                        (void) kill(active_pid, SIGTERM);
                        /* We shouldn't get stuck, but to be safe add timeout. */
                        ConditionVariableTimedSleep(&s->active_cv, 1000, WAIT_EVENT_REPLICATION_SLOT_DROP);
                    }
                }
            }
        }

        (void) WaitLatch(MyLatch,
                         WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT,
                         LS_MONITOR_CHECK_INTERVAL,
                         PG_WAIT_EXTENSION);
        ResetLatch(MyLatch);
        CHECK_FOR_INTERRUPTS();
    }
}
/*
 * XXX: These are private to procarray.c, but we need them here.
 */
@@ -667,7 +425,6 @@ _PG_init(void)
    SlotFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;

    InitLogicalReplicationMonitor();

    InitControlPlaneConnector();

    pg_init_extension_server();
10 poetry.lock generated
@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.

[[package]]
name = "aiohappyeyeballs"
@@ -3118,13 +3118,13 @@ files = [

[[package]]
name = "werkzeug"
version = "3.0.3"
version = "3.0.6"
description = "The comprehensive WSGI web application library."
optional = false
python-versions = ">=3.8"
files = [
    {file = "werkzeug-3.0.3-py3-none-any.whl", hash = "sha256:fc9645dc43e03e4d630d23143a04a7f947a9a3b5727cd535fdfe155a17cc48c8"},
    {file = "werkzeug-3.0.3.tar.gz", hash = "sha256:097e5bfda9f0aba8da6b8545146def481d06aa7d3266e7448e2cccf67dd8bd18"},
    {file = "werkzeug-3.0.6-py3-none-any.whl", hash = "sha256:1bc0c2310d2fbb07b1dd1105eba2f7af72f322e1e455f2f93c993bee8c8a5f17"},
    {file = "werkzeug-3.0.6.tar.gz", hash = "sha256:a8dd59d4de28ca70471a34cba79bed5f7ef2e036a76b3ab0835474246eb41f8d"},
]

[package.dependencies]
@@ -3406,4 +3406,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "f52632571e34b0e51b059c280c35d6ff6f69f6a8c9586caca78282baf635be91"
content-hash = "0f4804119f417edf8e1fbd6d715d2e8d70ad731334fa9570304a2203f83339cf"
@@ -1,5 +1,5 @@
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{info, warn};
use tracing::{debug, info};

use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint};
use crate::auth::{self, AuthFlow};
@@ -21,7 +21,7 @@ pub(crate) async fn authenticate_cleartext(
    secret: AuthSecret,
    config: &'static AuthenticationConfig,
) -> auth::Result<ComputeCredentials> {
    warn!("cleartext auth flow override is enabled, proceeding");
    debug!("cleartext auth flow override is enabled, proceeding");
    ctx.set_auth_method(crate::context::AuthMethod::Cleartext);

    // pause the timer while we communicate with the client
@@ -61,7 +61,7 @@ pub(crate) async fn password_hack_no_authentication(
    info: ComputeUserInfoNoEndpoint,
    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
) -> auth::Result<(ComputeUserInfo, Vec<u8>)> {
    warn!("project not specified, resorting to the password hack auth flow");
    debug!("project not specified, resorting to the password hack auth flow");
    ctx.set_auth_method(crate::context::AuthMethod::Cleartext);

    // pause the timer while we communicate with the client
@@ -5,6 +5,7 @@ use std::time::{Duration, SystemTime};
use arc_swap::ArcSwapOption;
use dashmap::DashMap;
use jose_jwk::crypto::KeyInfo;
use reqwest::{redirect, Client};
use serde::de::Visitor;
use serde::{Deserialize, Deserializer};
use signature::Verifier;
@@ -16,7 +17,7 @@ use crate::context::RequestMonitoring;
use crate::control_plane::errors::GetEndpointJwksError;
use crate::http::parse_json_body_with_limit;
use crate::intern::RoleNameInt;
use crate::{EndpointId, RoleName};
use crate::types::{EndpointId, RoleName};

// TODO(conrad): make these configurable.
const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30);
@@ -24,6 +25,7 @@ const MIN_RENEW: Duration = Duration::from_secs(30);
const AUTO_RENEW: Duration = Duration::from_secs(300);
const MAX_RENEW: Duration = Duration::from_secs(3600);
const MAX_JWK_BODY_SIZE: usize = 64 * 1024;
const JWKS_USER_AGENT: &str = "neon-proxy";

/// How to get the JWT auth rules
pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static {
@@ -50,7 +52,6 @@ pub(crate) struct AuthRule {
    pub(crate) role_names: Vec<RoleNameInt>,
}

#[derive(Default)]
pub struct JwkCache {
    client: reqwest::Client,

@@ -357,6 +358,20 @@ impl JwkCache {
    }
}

impl Default for JwkCache {
    fn default() -> Self {
        let client = Client::builder()
            .user_agent(JWKS_USER_AGENT)
            .redirect(redirect::Policy::none())
            .build()
            .expect("using &str and standard redirect::Policy");
        JwkCache {
            client,
            map: DashMap::default(),
        }
    }
}

fn verify_ec_signature(data: &[u8], sig: &[u8], key: &jose_jwk::Ec) -> Result<(), JwtError> {
    use ecdsa::Signature;
    use signature::Verifier;
@@ -669,7 +684,7 @@ mod tests {
    use tokio::net::TcpListener;

    use super::*;
    use crate::RoleName;
    use crate::types::RoleName;

    fn new_ec_jwk(kid: String) -> (p256::SecretKey, jose_jwk::Jwk) {
        let sk = p256::SecretKey::random(&mut OsRng);
@@ -10,9 +10,10 @@ use crate::compute_ctl::ComputeCtlApi;
use crate::context::RequestMonitoring;
use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo};
use crate::control_plane::NodeInfo;
use crate::http;
use crate::intern::{BranchIdTag, EndpointIdTag, InternId, ProjectIdTag};
use crate::types::EndpointId;
use crate::url::ApiUrl;
use crate::{http, EndpointId};

pub struct LocalBackend {
    pub(crate) initialize: Semaphore,
@@ -32,7 +32,8 @@ use crate::proxy::connect_compute::ComputeConnectBackend;
use crate::proxy::NeonOptions;
use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter, RateBucketInfo};
use crate::stream::Stream;
use crate::{scram, stream, EndpointCacheKey, EndpointId, RoleName};
use crate::types::{EndpointCacheKey, EndpointId, RoleName};
use crate::{scram, stream};

/// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality
pub enum MaybeOwned<'a, T> {
@@ -551,7 +552,7 @@ mod tests {
    async fn get_endpoint_jwks(
        &self,
        _ctx: &RequestMonitoring,
        _endpoint: crate::EndpointId,
        _endpoint: crate::types::EndpointId,
    ) -> Result<Vec<super::jwt::AuthRule>, control_plane::errors::GetEndpointJwksError>
    {
        unimplemented!()
@@ -15,7 +15,7 @@ use crate::error::{ReportableError, UserFacingError};
use crate::metrics::{Metrics, SniKind};
use crate::proxy::NeonOptions;
use crate::serverless::SERVERLESS_DRIVER_SNI;
use crate::{EndpointId, RoleName};
use crate::types::{EndpointId, RoleName};

#[derive(Debug, Error, PartialEq, Eq, Clone)]
pub(crate) enum ComputeUserInfoParseError {
@@ -5,7 +5,7 @@

use bstr::ByteSlice;

use crate::EndpointId;
use crate::types::EndpointId;

pub(crate) struct PasswordHackPayload {
    pub(crate) endpoint: EndpointId,
@@ -25,8 +25,8 @@ use proxy::rate_limiter::{
use proxy::scram::threadpool::ThreadPool;
use proxy::serverless::cancel_set::CancelSet;
use proxy::serverless::{self, GlobalConnPoolOptions};
use proxy::types::RoleName;
use proxy::url::ApiUrl;
use proxy::RoleName;

project_git_version!(GIT_VERSION);
project_build_tag!(BUILD_TAG);
@@ -177,7 +177,7 @@ async fn main() -> anyhow::Result<()> {
    let mut maintenance_tasks = JoinSet::new();

    let refresh_config_notify = Arc::new(Notify::new());
    maintenance_tasks.spawn(proxy::handle_signals(shutdown.clone(), {
    maintenance_tasks.spawn(proxy::signals::handle(shutdown.clone(), {
        let refresh_config_notify = Arc::clone(&refresh_config_notify);
        move || {
            refresh_config_notify.notify_one();
@@ -216,7 +216,7 @@ async fn main() -> anyhow::Result<()> {

    match futures::future::select(pin!(maintenance_tasks.join_next()), pin!(task)).await {
        // exit immediately on maintenance task completion
        Either::Left((Some(res), _)) => match proxy::flatten_err(res)? {},
        Either::Left((Some(res), _)) => match proxy::error::flatten_err(res)? {},
        // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above)
        Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"),
        // exit immediately on client task error

@@ -133,14 +133,14 @@ async fn main() -> anyhow::Result<()> {
        proxy_listener,
        cancellation_token.clone(),
    ));
    let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token, || {}));
    let signals_task = tokio::spawn(proxy::signals::handle(cancellation_token, || {}));

    // the signal task can't ever succeed.
    // the main task can error, or can succeed on cancellation.
    // we want to immediately exit on either of these cases
    let signal = match futures::future::select(signals_task, main).await {
        Either::Left((res, _)) => proxy::flatten_err(res)?,
        Either::Right((res, _)) => return proxy::flatten_err(res),
        Either::Left((res, _)) => proxy::error::flatten_err(res)?,
        Either::Right((res, _)) => return proxy::error::flatten_err(res),
    };

    // maintenance tasks return `Infallible` success values, this is an impossible value

@@ -137,9 +137,6 @@ struct ProxyCliArgs {
    /// size of the threadpool for password hashing
    #[clap(long, default_value_t = 4)]
    scram_thread_pool_size: u8,
    /// Disable dynamic rate limiter and store the metrics to ensure its production behaviour.
    #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    disable_dynamic_rate_limiter: bool,
    /// Endpoint rate limiter max number of requests per second.
    ///
    /// Provided in the form `<Requests Per Second>@<Bucket Duration Size>`.
@@ -495,7 +492,7 @@ async fn main() -> anyhow::Result<()> {

    // maintenance tasks. these never return unless there's an error
    let mut maintenance_tasks = JoinSet::new();
    maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone(), || {}));
    maintenance_tasks.spawn(proxy::signals::handle(cancellation_token.clone(), || {}));
    maintenance_tasks.spawn(http::health_server::task_main(
        http_listener,
        AppMetrics {
@@ -561,11 +558,11 @@ async fn main() -> anyhow::Result<()> {
        .await
    {
        // exit immediately on maintenance task completion
        Either::Left((Some(res), _)) => break proxy::flatten_err(res)?,
        Either::Left((Some(res), _)) => break proxy::error::flatten_err(res)?,
        // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above)
        Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"),
        // exit immediately on client task error
        Either::Right((Some(res), _)) => proxy::flatten_err(res)?,
        Either::Right((Some(res), _)) => proxy::error::flatten_err(res)?,
        // exit if all our client tasks have shutdown gracefully
        Either::Right((None, _)) => return Ok(()),
    }
@@ -615,9 +612,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
            and metric-collection-interval must be specified"
        ),
    };
    if !args.disable_dynamic_rate_limiter {
        bail!("dynamic rate limiter should be disabled");
    }

    let config::ConcurrencyLockOptions {
        shards,
2 proxy/src/cache/endpoints.rs vendored
@@ -17,7 +17,7 @@ use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt};
use crate::metrics::{Metrics, RedisErrors, RedisEventsCount};
use crate::rate_limiter::GlobalRateLimiter;
use crate::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
use crate::EndpointId;
use crate::types::EndpointId;

#[derive(Deserialize, Debug, Clone)]
pub(crate) struct ControlPlaneEventKey {
4 proxy/src/cache/project_info.rs vendored
@@ -17,7 +17,7 @@ use crate::auth::IpPattern;
use crate::config::ProjectInfoCacheOptions;
use crate::control_plane::AuthSecret;
use crate::intern::{EndpointIdInt, ProjectIdInt, RoleNameInt};
use crate::{EndpointId, RoleName};
use crate::types::{EndpointId, RoleName};

#[async_trait]
pub(crate) trait ProjectInfoCache {
@@ -368,7 +368,7 @@ impl Cache for ProjectInfoCacheImpl {
mod tests {
    use super::*;
    use crate::scram::ServerSecret;
    use crate::ProjectId;
    use crate::types::ProjectId;

    #[tokio::test]
    async fn test_project_info_cache_settings() {
@@ -25,7 +25,7 @@ use crate::control_plane::provider::ApiLockError;
use crate::error::{ReportableError, UserFacingError};
use crate::metrics::{Metrics, NumDbConnectionsGuard};
use crate::proxy::neon_option;
use crate::Host;
use crate::types::Host;

pub const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node";

@@ -4,8 +4,9 @@ use serde::de::DeserializeOwned;
use serde::{Deserialize, Serialize};
use thiserror::Error;

use crate::http;
use crate::types::{DbName, RoleName};
use crate::url::ApiUrl;
use crate::{http, DbName, RoleName};

pub struct ComputeCtlApi {
    pub(crate) api: http::Endpoint,

@@ -20,7 +20,7 @@ use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig}
use crate::scram::threadpool::ThreadPool;
use crate::serverless::cancel_set::CancelSet;
use crate::serverless::GlobalConnPoolOptions;
use crate::Host;
use crate::types::Host;

pub struct ProxyConfig {
    pub tls_config: Option<TlsConfig>,

@@ -19,7 +19,7 @@ use crate::intern::{BranchIdInt, ProjectIdInt};
use crate::metrics::{
    ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting,
};
use crate::{DbName, EndpointId, RoleName};
use crate::types::{DbName, EndpointId, RoleName};

pub mod parquet;
@@ -161,6 +161,9 @@ pub(crate) enum Reason {
    /// LockAlreadyTaken indicates that we attempted to take a lock that was already taken.
    #[serde(rename = "LOCK_ALREADY_TAKEN")]
    LockAlreadyTaken,
    /// ActiveEndpointsLimitExceeded indicates that the limit of concurrently active endpoints was exceeded.
    #[serde(rename = "ACTIVE_ENDPOINTS_LIMIT_EXCEEDED")]
    ActiveEndpointsLimitExceeded,
    #[default]
    #[serde(other)]
    Unknown,
@@ -194,7 +197,8 @@ impl Reason {
            | Reason::ComputeTimeQuotaExceeded
            | Reason::WrittenDataQuotaExceeded
            | Reason::DataTransferQuotaExceeded
            | Reason::LogicalSizeQuotaExceeded => false,
            | Reason::LogicalSizeQuotaExceeded
            | Reason::ActiveEndpointsLimitExceeded => false,
            // transient error. control plane is currently busy
            // but might be ready soon
            Reason::RunningOperations
@@ -21,8 +21,9 @@ use crate::control_plane::messages::MetricsAuxInfo;
use crate::control_plane::provider::{CachedAllowedIps, CachedRoleSecret};
use crate::error::io_error;
use crate::intern::RoleNameInt;
use crate::types::{BranchId, EndpointId, ProjectId, RoleName};
use crate::url::ApiUrl;
use crate::{compute, scram, BranchId, EndpointId, ProjectId, RoleName};
use crate::{compute, scram};

#[derive(Debug, Error)]
enum MockApiError {
@@ -23,7 +23,8 @@ use crate::error::ReportableError;
use crate::intern::ProjectIdInt;
use crate::metrics::ApiLockMetrics;
use crate::rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token};
use crate::{compute, scram, EndpointCacheKey, EndpointId};
use crate::types::{EndpointCacheKey, EndpointId};
use crate::{compute, scram};

pub(crate) mod errors {
    use thiserror::Error;
@@ -87,36 +88,8 @@
            Reason::ConcurrencyLimitReached => ErrorKind::ControlPlane,
            Reason::LockAlreadyTaken => ErrorKind::ControlPlane,
            Reason::RunningOperations => ErrorKind::ControlPlane,
            Reason::Unknown => match &**e {
                ControlPlaneError {
                    http_status_code:
                        http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE,
                    ..
                } => crate::error::ErrorKind::User,
                ControlPlaneError {
                    http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY,
                    error,
                    ..
                } if error
                    .contains("compute time quota of non-primary branches is exceeded") =>
                {
                    crate::error::ErrorKind::Quota
                }
                ControlPlaneError {
                    http_status_code: http::StatusCode::LOCKED,
                    error,
                    ..
                } if error.contains("quota exceeded")
                    || error.contains("the limit for current plan reached") =>
                {
                    crate::error::ErrorKind::Quota
                }
                ControlPlaneError {
                    http_status_code: http::StatusCode::TOO_MANY_REQUESTS,
                    ..
                } => crate::error::ErrorKind::ServiceRateLimit,
                ControlPlaneError { .. } => crate::error::ErrorKind::ControlPlane,
            },
            Reason::ActiveEndpointsLimitExceeded => ErrorKind::ControlPlane,
            Reason::Unknown => ErrorKind::ControlPlane,
        },
        ApiError::Transport(_) => crate::error::ErrorKind::ControlPlane,
    }

@@ -24,7 +24,8 @@ use crate::control_plane::errors::GetEndpointJwksError;
use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason};
use crate::metrics::{CacheOutcome, Metrics};
use crate::rate_limiter::WakeComputeRateLimiter;
use crate::{compute, http, scram, EndpointCacheKey, EndpointId};
use crate::types::{EndpointCacheKey, EndpointId};
use crate::{compute, http, scram};

const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id");
@@ -1,7 +1,9 @@
use std::error::Error as StdError;
use std::{fmt, io};

use anyhow::Context;
use measured::FixedCardinalityLabel;
use tokio::task::JoinError;

/// Upcast (almost) any error into an opaque [`io::Error`].
pub(crate) fn io_error(e: impl Into<Box<dyn StdError + Send + Sync>>) -> io::Error {
@@ -97,3 +99,8 @@ impl ReportableError for tokio_postgres::error::Error {
        }
    }
}

/// Flattens `Result<Result<T>>` into `Result<T>`.
pub fn flatten_err<T>(r: Result<anyhow::Result<T>, JoinError>) -> anyhow::Result<T> {
    r.context("join error").and_then(|x| x)
}
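A sketch of how the relocated `flatten_err` is used by the binaries above: it collapses the `JoinError` layer of a spawned task into the task's own `anyhow::Result` (the `join_flattened` wrapper is hypothetical).

async fn join_flattened() -> anyhow::Result<()> {
    let task = tokio::spawn(async { anyhow::Ok(()) });
    // Err(JoinError) becomes Err("join error"); Ok(inner) yields the task's result.
    flatten_err(task.await)
}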
Some files were not shown because too many files have changed in this diff.