Merge branch 'problame/hung-shutdown/demo-hypothesis' into problame/hung-shutdown/fix

Christian Schwarz
2025-01-14 23:51:53 +01:00
89 changed files with 4873 additions and 2437 deletions

View File

@@ -728,30 +728,6 @@ jobs:
tags: |
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }}
- name: Build compute-tools image
# compute-tools are Postgres independent, so build it only once
# We pick 16, because that builds on debian 11 with older glibc (and is
# thus compatible with newer glibc), rather than 17 on Debian 12, as
# that isn't guaranteed to be compatible with Debian 11
if: matrix.version.pg == 'v16'
uses: docker/build-push-action@v6
with:
target: compute-tools-image
context: .
build-args: |
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }}
DEBIAN_VERSION=${{ matrix.version.debian }}
provenance: false
push: true
pull: true
file: compute/compute-node.Dockerfile
cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-tools-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }}
tags: |
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }}
compute-node-image:
needs: [ compute-node-image-arch, tag ]
permissions:
@@ -794,14 +770,6 @@ jobs:
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64
- name: Create multi-arch compute-tools image
if: matrix.version.pg == 'v16'
run: |
docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \
-t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
@@ -817,12 +785,6 @@ jobs:
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}
- name: Push multi-arch compute-tools image to ECR
if: matrix.version.pg == 'v16'
run: |
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
vm-compute-node-image:
needs: [ check-permissions, tag, compute-node-image ]
runs-on: [ self-hosted, large ]
@@ -1001,9 +963,6 @@ jobs:
docker buildx imagetools create -t $repo/neon:latest \
$repo/neon:${{ needs.tag.outputs.build-tag }}
docker buildx imagetools create -t $repo/compute-tools:latest \
$repo/compute-tools:${{ needs.tag.outputs.build-tag }}
for version in ${VERSIONS}; do
docker buildx imagetools create -t $repo/compute-node-${version}:latest \
$repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }}
@@ -1032,7 +991,7 @@ jobs:
- name: Copy all images to prod ECR
if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
run: |
for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16,v17}; do
for image in neon {vm-,}compute-node-{v14,v15,v16,v17}; do
docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \
369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }}
done
@@ -1044,7 +1003,7 @@ jobs:
with:
client_id: ${{ vars.AZURE_DEV_CLIENT_ID }}
image_tag: ${{ needs.tag.outputs.build-tag }}
images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
tenant_id: ${{ vars.AZURE_TENANT_ID }}
@@ -1056,7 +1015,7 @@ jobs:
with:
client_id: ${{ vars.AZURE_PROD_CLIENT_ID }}
image_tag: ${{ needs.tag.outputs.build-tag }}
images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
tenant_id: ${{ vars.AZURE_TENANT_ID }}

Cargo.lock generated
View File

@@ -1605,6 +1605,32 @@ dependencies = [
"typenum",
]
[[package]]
name = "curve25519-dalek"
version = "4.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97fb8b7c4503de7d6ae7b42ab72a5a59857b4c937ec27a3d4539dba95b5ab2be"
dependencies = [
"cfg-if",
"cpufeatures",
"curve25519-dalek-derive",
"digest",
"fiat-crypto",
"rustc_version",
"subtle",
]
[[package]]
name = "curve25519-dalek-derive"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.90",
]
[[package]]
name = "darling"
version = "0.20.1"
@@ -1875,6 +1901,28 @@ dependencies = [
"spki 0.7.3",
]
[[package]]
name = "ed25519"
version = "2.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "115531babc129696a58c64a4fef0a8bf9e9698629fb97e9e40767d235cfbcd53"
dependencies = [
"signature 2.2.0",
]
[[package]]
name = "ed25519-dalek"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a3daa8e81a3963a60642bcc1f90a670680bd4a77535faa384e9d1c79d620871"
dependencies = [
"curve25519-dalek",
"ed25519",
"rand_core 0.6.4",
"sha2",
"subtle",
]
[[package]]
name = "either"
version = "1.8.1"
@@ -2113,6 +2161,12 @@ dependencies = [
"subtle",
]
[[package]]
name = "fiat-crypto"
version = "0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d"
[[package]]
name = "filetime"
version = "0.2.22"
@@ -3990,6 +4044,7 @@ dependencies = [
"postgres_connection",
"postgres_ffi",
"postgres_initdb",
"pprof",
"pq_proto",
"procfs",
"rand 0.8.5",
@@ -4745,6 +4800,7 @@ dependencies = [
"consumption_metrics",
"dashmap 5.5.0",
"ecdsa 0.16.9",
"ed25519-dalek",
"env_logger 0.10.2",
"fallible-iterator",
"flate2",

View File

@@ -71,6 +71,7 @@ RUN set -e \
ca-certificates \
# System postgres for use with client libraries (e.g. in storage controller)
postgresql-15 \
openssl \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
&& useradd -d /data neon \
&& chown -R neon:neon /data

View File

@@ -3,7 +3,6 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
# Where to install Postgres, default is ./pg_install, maybe useful for package managers
POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/
OPENSSL_PREFIX_DIR := /usr/local/openssl
ICU_PREFIX_DIR := /usr/local/icu
#
@@ -26,11 +25,9 @@ endif
ifeq ($(shell test -e /home/nonroot/.docker_build && echo -n yes),yes)
# Exclude static build openssl, icu for local build (MacOS, Linux)
# Only keep for build type release and debug
PG_CFLAGS += -I$(OPENSSL_PREFIX_DIR)/include
PG_CONFIGURE_OPTS += --with-icu
PG_CONFIGURE_OPTS += ICU_CFLAGS='-I/$(ICU_PREFIX_DIR)/include -DU_STATIC_IMPLEMENTATION'
PG_CONFIGURE_OPTS += ICU_LIBS='-L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -licui18n -licuuc -licudata -lstdc++ -Wl,-Bdynamic -lm'
PG_CONFIGURE_OPTS += LDFLAGS='-L$(OPENSSL_PREFIX_DIR)/lib -L$(OPENSSL_PREFIX_DIR)/lib64 -L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -Wl,-Bstatic -lssl -lcrypto -Wl,-Bdynamic -lrt -lm -ldl -lpthread'
endif
UNAME_S := $(shell uname -s)

View File

@@ -115,7 +115,7 @@ RUN set -e \
# Keep the version the same as in compute/compute-node.Dockerfile and
# test_runner/regress/test_compute_metrics.py.
ENV SQL_EXPORTER_VERSION=0.16.0
ENV SQL_EXPORTER_VERSION=0.17.0
RUN curl -fsSL \
"https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \
--output sql_exporter.tar.gz \
@@ -190,21 +190,6 @@ RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JS
&& make install \
&& rm -rf ../lcov.tar.gz
# Compile and install the static OpenSSL library
ENV OPENSSL_VERSION=1.1.1w
ENV OPENSSL_PREFIX=/usr/local/openssl
RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz && \
echo "cf3098950cb4d853ad95c0841f1f9c6d3dc102dccfcacd521d93925208b76ac8 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \
cd /tmp && \
tar xzvf /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \
rm /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \
cd /tmp/openssl-${OPENSSL_VERSION} && \
./config --prefix=${OPENSSL_PREFIX} -static --static no-shared -fPIC && \
make -j "$(nproc)" && \
make install && \
cd /tmp && \
rm -rf /tmp/openssl-${OPENSSL_VERSION}
# Use the same version of libicu as the compute nodes so that
# clusters created using initdb on pageserver can be used by computes.
#

View File

@@ -104,16 +104,18 @@ RUN cd postgres && \
esac; \
done;
# Set PATH for all the subsequent build steps
ENV PATH="/usr/local/pgsql/bin:$PATH"
#########################################################################################
#
# Layer "postgis-build"
# Build PostGIS from the upstream PostGIS mirror.
#
#########################################################################################
FROM build-deps AS postgis-build
FROM pg-build AS postgis-build
ARG DEBIAN_VERSION
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install --no-install-recommends --no-install-suggests -y \
gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \
@@ -151,8 +153,6 @@ RUN case "${DEBIAN_VERSION}" in \
DESTDIR=/sfcgal ninja install -j $(getconf _NPROCESSORS_ONLN) && \
ninja clean && cp -R /sfcgal/* /
ENV PATH="/usr/local/pgsql/bin:$PATH"
# Postgis 3.5.0 supports v17
RUN case "${PG_VERSION}" in \
"v17") \
@@ -170,7 +170,6 @@ RUN case "${PG_VERSION}" in \
wget https://download.osgeo.org/postgis/source/postgis-${POSTGIS_VERSION}.tar.gz -O postgis.tar.gz && \
echo "${POSTGIS_CHECKSUM} postgis.tar.gz" | sha256sum --check && \
mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
./autogen.sh && \
./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -220,11 +219,7 @@ RUN case "${PG_VERSION}" in \
cmake -GNinja -DCMAKE_BUILD_TYPE=Release .. && \
ninja -j $(getconf _NPROCESSORS_ONLN) && \
ninja -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
cp /usr/local/pgsql/share/extension/pgrouting.control /extensions/postgis && \
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/postgis.tar.zst -T -
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control
#########################################################################################
#
@@ -232,9 +227,8 @@ RUN case "${PG_VERSION}" in \
# Build plv8
#
#########################################################################################
FROM build-deps AS plv8-build
FROM pg-build AS plv8-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY compute/patches/plv8-3.1.10.patch /plv8-3.1.10.patch
@@ -269,7 +263,6 @@ RUN case "${PG_VERSION}" in \
# generate and copy upgrade scripts
mkdir -p upgrade && ./generate_upgrade.sh ${PLV8_TAG#v} && \
cp upgrade/* /usr/local/pgsql/share/extension/ && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
rm -rf /plv8-* && \
find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \
@@ -296,9 +289,8 @@ RUN case "${PG_VERSION}" in \
# Build h3_pg
#
#########################################################################################
FROM build-deps AS h3-pg-build
FROM pg-build AS h3-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# not version-specific
# last release v4.1.0 - Jan 18, 2023
@@ -319,7 +311,6 @@ RUN mkdir -p /h3/usr/ && \
RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \
echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \
mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \
@@ -331,17 +322,16 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3
# compile unit extension
#
#########################################################################################
FROM build-deps AS unit-pg-build
FROM pg-build AS unit-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# not version-specific
# last release 7.9 - Sep 15, 2024
RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz -O postgresql-unit.tar.gz && \
echo "e46de6245dcc8b2c2ecf29873dbd43b2b346773f31dd5ce4b8315895a052b456 postgresql-unit.tar.gz" | sha256sum --check && \
mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
# unit extension's "create extension" script relies on absolute install path to fill some reference tables.
# We move the extension from '/usr/local/pgsql/' to '/usr/local/' after it is built. So we need to adjust the path.
# This one-liner removes pgsql/ part of the path.
@@ -355,9 +345,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz -
# compile pgvector extension
#
#########################################################################################
FROM build-deps AS vector-pg-build
FROM pg-build AS vector-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY compute/patches/pgvector.patch /pgvector.patch
@@ -371,8 +360,8 @@ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O
echo "867a2c328d4928a5a9d6f052cd3bc78c7d60228a9b914ad32aa3db88e9de27b0 pgvector.tar.gz" | sha256sum --check && \
mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \
patch -p1 < /pgvector.patch && \
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" && \
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
#########################################################################################
@@ -381,16 +370,15 @@ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O
# compile pgjwt extension
#
#########################################################################################
FROM build-deps AS pgjwt-pg-build
FROM pg-build AS pgjwt-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# not version-specific
# doesn't use releases, last commit f3d82fd - Mar 2, 2023
RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71c20689ce3d.tar.gz -O pgjwt.tar.gz && \
echo "dae8ed99eebb7593b43013f6532d772b12dfecd55548d2673f2dfd0163f6d2b9 pgjwt.tar.gz" | sha256sum --check && \
mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control
#########################################################################################
@@ -399,17 +387,16 @@ RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71
# compile hypopg extension
#
#########################################################################################
FROM build-deps AS hypopg-pg-build
FROM pg-build AS hypopg-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# HypoPG 1.4.1 supports v17
# last release 1.4.1 - Apr 28, 2024
RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypopg.tar.gz && \
echo "9afe6357fd389d8d33fad81703038ce520b09275ec00153c6c89282bcdedd6bc hypopg.tar.gz" | sha256sum --check && \
mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control
#########################################################################################
@@ -418,17 +405,16 @@ RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypo
# compile pg_hashids extension
#
#########################################################################################
FROM build-deps AS pg-hashids-pg-build
FROM pg-build AS pg-hashids-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# not version-specific
# last release v1.2.1 - Jan 12, 2018
RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \
echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \
mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \
make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control
#########################################################################################
@@ -437,9 +423,8 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz
# compile rum extension
#
#########################################################################################
FROM build-deps AS rum-pg-build
FROM pg-build AS rum-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY compute/patches/rum.patch /rum.patch
@@ -450,8 +435,8 @@ RUN wget https://github.com/postgrespro/rum/archive/cb1edffc57736cd2a4455f8d0fea
echo "65e0a752e99f4c3226400c9b899f997049e93503db8bf5c8072efa136d32fd83 rum.tar.gz" | sha256sum --check && \
mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \
patch -p1 < /rum.patch && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \
make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control
#########################################################################################
@@ -460,17 +445,16 @@ RUN wget https://github.com/postgrespro/rum/archive/cb1edffc57736cd2a4455f8d0fea
# compile pgTAP extension
#
#########################################################################################
FROM build-deps AS pgtap-pg-build
FROM pg-build AS pgtap-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# pgtap 1.3.3 supports v17
# last release v1.3.3 - Apr 8, 2024
RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgtap.tar.gz && \
echo "325ea79d0d2515bce96bce43f6823dcd3effbd6c54cb2a4d6c2384fffa3a14c7 pgtap.tar.gz" | sha256sum --check && \
mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control
#########################################################################################
@@ -479,17 +463,16 @@ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgta
# compile ip4r extension
#
#########################################################################################
FROM build-deps AS ip4r-pg-build
FROM pg-build AS ip4r-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# not version-specific
# last release v2.4.2 - Jul 29, 2023
RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \
echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \
mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/ip4r.control
#########################################################################################
@@ -498,17 +481,16 @@ RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O i
# compile Prefix extension
#
#########################################################################################
FROM build-deps AS prefix-pg-build
FROM pg-build AS prefix-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# not version-specific
# last release v1.2.10 - Jul 5, 2023
RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \
echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \
mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control
#########################################################################################
@@ -517,17 +499,16 @@ RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O p
# compile hll extension
#
#########################################################################################
FROM build-deps AS hll-pg-build
FROM pg-build AS hll-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# not version-specific
# last release v2.18 - Aug 29, 2023
RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \
echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \
mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control
#########################################################################################
@@ -536,17 +517,16 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar
# compile plpgsql_check extension
#
#########################################################################################
FROM build-deps AS plpgsql-check-pg-build
FROM pg-build AS plpgsql-check-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# plpgsql_check v2.7.11 supports v17
# last release v2.7.11 - Sep 16, 2024
RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz -O plpgsql_check.tar.gz && \
echo "208933f8dbe8e0d2628eb3851e9f52e6892b8e280c63700c0f1ce7883625d172 plpgsql_check.tar.gz" | sha256sum --check && \
mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \
make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control
#########################################################################################
@@ -555,11 +535,8 @@ RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz
# compile timescaledb extension
#
#########################################################################################
FROM build-deps AS timescaledb-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
FROM pg-build AS timescaledb-pg-build
ARG PG_VERSION
ENV PATH="/usr/local/pgsql/bin:$PATH"
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
@@ -590,11 +567,8 @@ RUN case "${PG_VERSION}" in \
# compile pg_hint_plan extension
#
#########################################################################################
FROM build-deps AS pg-hint-plan-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
FROM pg-build AS pg-hint-plan-pg-build
ARG PG_VERSION
ENV PATH="/usr/local/pgsql/bin:$PATH"
# version-specific, has separate releases for each version
RUN case "${PG_VERSION}" in \
@@ -632,14 +606,12 @@ RUN case "${PG_VERSION}" in \
# compile pg_cron extension
#
#########################################################################################
FROM build-deps AS pg-cron-pg-build
FROM pg-build AS pg-cron-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# This is an experimental extension that we do not support on prod yet.
# !Do not remove!
# We set it in shared_preload_libraries and computes will fail to start if library is not found.
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O pg_cron.tar.gz && \
echo "52d1850ee7beb85a4cb7185731ef4e5a90d1de216709d8988324b0d02e76af61 pg_cron.tar.gz" | sha256sum --check && \
mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \
@@ -653,9 +625,8 @@ RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O
# compile rdkit extension
#
#########################################################################################
FROM build-deps AS rdkit-pg-build
FROM pg-build AS rdkit-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install --no-install-recommends --no-install-suggests -y \
@@ -673,7 +644,13 @@ RUN apt update && \
# Use new version only for v17
# because Release_2024_09_1 has some backward incompatible changes
# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1
ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
# XXX: /usr/local/pgsql/bin is already in PATH, and that should be enough to find
# pg_config. For some reason the rdkit cmake script doesn't work with just that,
# however. By also adding /usr/local/pgsql, it works, which is weird because there
# are no executables in that directory.
ENV PATH="/usr/local/pgsql:$PATH"
RUN case "${PG_VERSION}" in \
"v17") \
export RDKIT_VERSION=Release_2024_09_1 \
@@ -726,13 +703,11 @@ RUN case "${PG_VERSION}" in \
# compile pg_uuidv7 extension
#
#########################################################################################
FROM build-deps AS pg-uuidv7-pg-build
FROM pg-build AS pg-uuidv7-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# not version-specific
# last release v1.6.0 - Oct 9, 2024
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz -O pg_uuidv7.tar.gz && \
echo "0fa6c710929d003f6ce276a7de7a864e9d1667b2d78be3dc2c07f2409eb55867 pg_uuidv7.tar.gz" | sha256sum --check && \
mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
@@ -746,13 +721,11 @@ RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz
# compile pg_roaringbitmap extension
#
#########################################################################################
FROM build-deps AS pg-roaringbitmap-pg-build
FROM pg-build AS pg-roaringbitmap-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# not version-specific
# last release v0.5.4 - Jun 28, 2022
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
@@ -766,16 +739,14 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4
# compile pg_semver extension
#
#########################################################################################
FROM build-deps AS pg-semver-pg-build
FROM pg-build AS pg-semver-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# Release 0.40.0 breaks backward compatibility with previous versions
# see release note https://github.com/theory/pg-semver/releases/tag/v0.40.0
# Use new version only for v17
#
# last release v0.40.0 - Jul 22, 2024
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN case "${PG_VERSION}" in \
"v17") \
export SEMVER_VERSION=0.40.0 \
@@ -802,13 +773,11 @@ RUN case "${PG_VERSION}" in \
# compile pg_embedding extension
#
#########################################################################################
FROM build-deps AS pg-embedding-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
FROM pg-build AS pg-embedding-pg-build
# This is our extension, support stopped in favor of pgvector
# TODO: deprecate it
ARG PG_VERSION
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
export PG_EMBEDDING_VERSION=0.3.5 \
@@ -829,26 +798,19 @@ RUN case "${PG_VERSION}" in \
# compile anon extension
#
#########################################################################################
FROM build-deps AS pg-anon-pg-build
FROM pg-build AS pg-anon-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# This is an experimental extension, never got to real production.
# !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found.
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN case "${PG_VERSION}" in "v17") \
echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \
esac && \
wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \
mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
mkdir -p /extensions/anon && cp /usr/local/pgsql/share/extension/anon.control /extensions/anon && \
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/anon.tar.zst -T -
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control
#########################################################################################
#
@@ -856,9 +818,8 @@ RUN case "${PG_VERSION}" in "v17") \
# This layer is used to build `pgrx` deps
#
#########################################################################################
FROM build-deps AS rust-extensions-build
FROM pg-build AS rust-extensions-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \
@@ -866,7 +827,7 @@ RUN apt update && \
useradd -ms /bin/bash nonroot -b /home
ENV HOME=/home/nonroot
ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
ENV PATH="/home/nonroot/.cargo/bin:$PATH"
USER nonroot
WORKDIR /home/nonroot
@@ -893,9 +854,8 @@ USER root
# and eventually get merged with `rust-extensions-build`
#
#########################################################################################
FROM build-deps AS rust-extensions-build-pgrx12
FROM pg-build AS rust-extensions-build-pgrx12
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \
@@ -903,7 +863,7 @@ RUN apt update && \
useradd -ms /bin/bash nonroot -b /home
ENV HOME=/home/nonroot
ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
ENV PATH="/home/nonroot/.cargo/bin:$PATH"
USER nonroot
WORKDIR /home/nonroot
@@ -976,22 +936,9 @@ RUN apt update && apt install --no-install-recommends --no-install-suggests -y p
FROM rust-extensions-build-pgrx12 AS pg-jsonschema-pg-build
ARG PG_VERSION
# version 0.3.3 supports v17
# last release v0.3.3 - Oct 16, 2024
#
# there were no breaking changes
# so we can use the same version for all postgres versions
RUN case "${PG_VERSION}" in \
"v14" | "v15" | "v16" | "v17") \
export PG_JSONSCHEMA_VERSION=0.3.3 \
export PG_JSONSCHEMA_CHECKSUM=40c2cffab4187e0233cb8c3bde013be92218c282f95f4469c5282f6b30d64eac \
;; \
*) \
echo "unexpected PostgreSQL version" && exit 1 \
;; \
esac && \
wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v${PG_JSONSCHEMA_VERSION}.tar.gz -O pg_jsonschema.tar.gz && \
echo "${PG_JSONSCHEMA_CHECKSUM} pg_jsonschema.tar.gz" | sha256sum --check && \
RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.3.tar.gz -O pg_jsonschema.tar.gz && \
echo "40c2cffab4187e0233cb8c3bde013be92218c282f95f4469c5282f6b30d64eac pg_jsonschema.tar.gz" | sha256sum --check && \
mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
# see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8
# `unsafe-postgres` feature allows to build pgx extensions
@@ -1012,22 +959,9 @@ RUN case "${PG_VERSION}" in \
FROM rust-extensions-build-pgrx12 AS pg-graphql-pg-build
ARG PG_VERSION
# version 1.5.9 supports v17
# last release v1.5.9 - Oct 16, 2024
#
# there were no breaking changes
# so we can use the same version for all postgres versions
RUN case "${PG_VERSION}" in \
"v14" | "v15" | "v16" | "v17") \
export PG_GRAPHQL_VERSION=1.5.9 \
export PG_GRAPHQL_CHECKSUM=cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 \
;; \
*) \
echo "unexpected PostgreSQL version" && exit 1 \
;; \
esac && \
wget https://github.com/supabase/pg_graphql/archive/refs/tags/v${PG_GRAPHQL_VERSION}.tar.gz -O pg_graphql.tar.gz && \
echo "${PG_GRAPHQL_CHECKSUM} pg_graphql.tar.gz" | sha256sum --check && \
RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.9.tar.gz -O pg_graphql.tar.gz && \
echo "cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 pg_graphql.tar.gz" | sha256sum --check && \
mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgrx install --release && \
@@ -1091,8 +1025,8 @@ ARG PG_VERSION
# NOTE: local_proxy depends on the version of pg_session_jwt
# Do not update without approve from proxy team
# Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs
RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.1.2-v17.tar.gz -O pg_session_jwt.tar.gz && \
echo "c8ecbed9cb8c6441bce5134a176002b043018adf9d05a08e457dda233090a86e pg_session_jwt.tar.gz" | sha256sum --check && \
RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0.tar.gz -O pg_session_jwt.tar.gz && \
echo "5ace028e591f2e000ca10afa5b1ca62203ebff014c2907c0ec3b29c36f28a1bb pg_session_jwt.tar.gz" | sha256sum --check && \
mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "=0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgrx install --release
@@ -1104,13 +1038,11 @@ RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.1.2
#
#########################################################################################
FROM build-deps AS wal2json-pg-build
FROM pg-build AS wal2json-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# wal2json wal2json_2_6 supports v17
# last release wal2json_2_6 - Apr 25, 2024
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar.gz -O wal2json.tar.gz && \
echo "18b4bdec28c74a8fc98a11c72de38378a760327ef8e5e42e975b0029eb96ba0d wal2json.tar.gz" | sha256sum --check && \
mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json.tar.gz --strip-components=1 -C . && \
@@ -1123,13 +1055,11 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar.
# compile pg_ivm extension
#
#########################################################################################
FROM build-deps AS pg-ivm-build
FROM pg-build AS pg-ivm-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# pg_ivm v1.9 supports v17
# last release v1.9 - Jul 31
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_ivm.tar.gz && \
echo "59e15722939f274650abf637f315dd723c87073496ca77236b044cb205270d8b pg_ivm.tar.gz" | sha256sum --check && \
mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
@@ -1143,13 +1073,11 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_iv
# compile pg_partman extension
#
#########################################################################################
FROM build-deps AS pg-partman-build
FROM pg-build AS pg-partman-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# should support v17 https://github.com/pgpartman/pg_partman/discussions/693
# last release 5.1.0 Apr 2, 2024
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz -O pg_partman.tar.gz && \
echo "3e3a27d7ff827295d5c55ef72f07a49062d6204b3cb0b9a048645d6db9f3cb9f pg_partman.tar.gz" | sha256sum --check && \
mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \
@@ -1165,9 +1093,6 @@ RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz
#########################################################################################
FROM rust-extensions-build AS pg-mooncake-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.0/pg_mooncake-0.1.0.tar.gz -O pg_mooncake.tar.gz && \
echo "eafd059b77f541f11525eb8affcd66a176968cbd8fe7c0d436e733f2aa4da59f pg_mooncake.tar.gz" | sha256sum --check && \
@@ -1183,11 +1108,8 @@ RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.0/p
#
#########################################################################################
FROM build-deps AS pg-repack-build
FROM pg-build AS pg-repack-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/reorg/pg_repack/archive/refs/tags/ver_1.5.2.tar.gz -O pg_repack.tar.gz && \
echo '4516cad42251ed3ad53ff619733004db47d5755acac83f75924cd94d1c4fb681 pg_repack.tar.gz' | sha256sum --check && \
@@ -1258,20 +1180,6 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/neon_rmgr \
-s install && \
case "${PG_VERSION}" in \
"v14" | "v15") \
;; \
"v16" | "v17") \
echo "Skipping HNSW for PostgreSQL ${PG_VERSION}" && exit 0 \
;; \
*) \
echo "unexpected PostgreSQL version" && exit 1 \
;; \
esac && \
make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/hnsw \
-s install
#########################################################################################
@@ -1288,17 +1196,6 @@ USER nonroot
COPY --chown=nonroot . .
RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin compute_ctl --bin fast_import --bin local_proxy
#########################################################################################
#
# Final compute-tools image
#
#########################################################################################
FROM debian:$DEBIAN_FLAVOR AS compute-tools-image
COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/fast_import /usr/local/bin/fast_import
#########################################################################################
#
# Layer "pgbouncer"
@@ -1335,11 +1232,11 @@ RUN set -e \
#
#########################################################################################
FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter
FROM quay.io/prometheuscommunity/postgres-exporter:v0.16.0 AS postgres-exporter
# Keep the version the same as in build-tools.Dockerfile and
# test_runner/regress/test_compute_metrics.py.
FROM burningalchemist/sql_exporter:0.16.0 AS sql-exporter
FROM burningalchemist/sql_exporter:0.17.0 AS sql-exporter
#########################################################################################
#

View File

@@ -17,7 +17,7 @@
//!
//! # Local Testing
//!
//! - Comment out most of the pgxns in The Dockerfile.compute-tools to speed up the build.
//! - Comment out most of the pgxns in compute-node.Dockerfile to speed up the build.
//! - Build the image with the following command:
//!
//! ```bash

View File

@@ -483,7 +483,6 @@ impl LocalEnv {
.iter()
.find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id)
.map(|&(_, timeline_id)| timeline_id)
.map(TimelineId::from)
}
pub fn timeline_name_mappings(&self) -> HashMap<TenantTimelineId, String> {

View File

@@ -822,10 +822,7 @@ impl StorageController {
self.dispatch(
Method::PUT,
format!("control/v1/tenant/{tenant_shard_id}/migrate"),
Some(TenantShardMigrateRequest {
tenant_shard_id,
node_id,
}),
Some(TenantShardMigrateRequest { node_id }),
)
.await
}

View File

@@ -1,12 +1,16 @@
use futures::StreamExt;
use std::{str::FromStr, time::Duration};
use std::{
collections::{HashMap, HashSet},
str::FromStr,
time::Duration,
};
use clap::{Parser, Subcommand};
use pageserver_api::{
controller_api::{
AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse,
SafekeeperDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest,
TenantDescribeResponse, TenantPolicyRequest,
SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest,
TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
},
models::{
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
@@ -112,6 +116,13 @@ enum Command {
#[arg(long)]
node: NodeId,
},
/// Migrate the secondary location for a tenant shard to a specific pageserver.
TenantShardMigrateSecondary {
#[arg(long)]
tenant_shard_id: TenantShardId,
#[arg(long)]
node: NodeId,
},
/// Cancel any ongoing reconciliation for this shard
TenantShardCancelReconcile {
#[arg(long)]
@@ -146,6 +157,12 @@ enum Command {
#[arg(long)]
tenant_id: TenantId,
},
TenantSetPreferredAz {
#[arg(long)]
tenant_id: TenantId,
#[arg(long)]
preferred_az: Option<String>,
},
/// Uncleanly drop a tenant from the storage controller: this doesn't delete anything from pageservers. Appropriate
/// if you e.g. used `tenant-warmup` by mistake on a tenant ID that doesn't really exist, or is in some other region.
TenantDrop {
@@ -395,11 +412,12 @@ async fn main() -> anyhow::Result<()> {
resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr));
let mut table = comfy_table::Table::new();
table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
table.set_header(["Id", "Hostname", "AZ", "Scheduling", "Availability"]);
for node in resp {
table.add_row([
format!("{}", node.id),
node.listen_http_addr,
node.availability_zone_id,
format!("{:?}", node.scheduling),
format!("{:?}", node.availability),
]);
@@ -472,6 +490,7 @@ async fn main() -> anyhow::Result<()> {
let mut table = comfy_table::Table::new();
table.set_header([
"TenantId",
"Preferred AZ",
"ShardCount",
"StripeSize",
"Placement",
@@ -481,6 +500,11 @@ async fn main() -> anyhow::Result<()> {
let shard_zero = tenant.shards.into_iter().next().unwrap();
table.add_row([
format!("{}", tenant.tenant_id),
shard_zero
.preferred_az_id
.as_ref()
.cloned()
.unwrap_or("".to_string()),
format!("{}", shard_zero.tenant_shard_id.shard_count.literal()),
format!("{:?}", tenant.stripe_size),
format!("{:?}", tenant.policy),
@@ -540,10 +564,7 @@ async fn main() -> anyhow::Result<()> {
tenant_shard_id,
node,
} => {
let req = TenantShardMigrateRequest {
tenant_shard_id,
node_id: node,
};
let req = TenantShardMigrateRequest { node_id: node };
storcon_client
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
@@ -553,6 +574,20 @@ async fn main() -> anyhow::Result<()> {
)
.await?;
}
Command::TenantShardMigrateSecondary {
tenant_shard_id,
node,
} => {
let req = TenantShardMigrateRequest { node_id: node };
storcon_client
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
Method::PUT,
format!("control/v1/tenant/{tenant_shard_id}/migrate_secondary"),
Some(req),
)
.await?;
}
Command::TenantShardCancelReconcile { tenant_shard_id } => {
storcon_client
.dispatch::<(), ()>(
@@ -596,6 +631,19 @@ async fn main() -> anyhow::Result<()> {
None,
)
.await?;
let nodes = storcon_client
.dispatch::<(), Vec<NodeDescribeResponse>>(
Method::GET,
"control/v1/node".to_string(),
None,
)
.await?;
let nodes = nodes
.into_iter()
.map(|n| (n.id, n))
.collect::<HashMap<_, _>>();
println!("Tenant {tenant_id}");
let mut table = comfy_table::Table::new();
table.add_row(["Policy", &format!("{:?}", policy)]);
@@ -604,7 +652,14 @@ async fn main() -> anyhow::Result<()> {
println!("{table}");
println!("Shards:");
let mut table = comfy_table::Table::new();
table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]);
table.set_header([
"Shard",
"Attached",
"Attached AZ",
"Secondary",
"Last error",
"status",
]);
for shard in shards {
let secondary = shard
.node_secondary
@@ -627,11 +682,18 @@ async fn main() -> anyhow::Result<()> {
}
let status = status_parts.join(",");
let attached_node = shard
.node_attached
.as_ref()
.map(|id| nodes.get(id).expect("Shard references nonexistent node"));
table.add_row([
format!("{}", shard.tenant_shard_id),
shard
.node_attached
.map(|n| format!("{}", n))
attached_node
.map(|n| format!("{} ({})", n.listen_http_addr, n.id))
.unwrap_or(String::new()),
attached_node
.map(|n| n.availability_zone_id.clone())
.unwrap_or(String::new()),
secondary,
shard.last_error,
@@ -640,6 +702,66 @@ async fn main() -> anyhow::Result<()> {
}
println!("{table}");
}
Command::TenantSetPreferredAz {
tenant_id,
preferred_az,
} => {
// First learn about the tenant's shards
let describe_response = storcon_client
.dispatch::<(), TenantDescribeResponse>(
Method::GET,
format!("control/v1/tenant/{tenant_id}"),
None,
)
.await?;
// Learn about nodes to validate the AZ ID
let nodes = storcon_client
.dispatch::<(), Vec<NodeDescribeResponse>>(
Method::GET,
"control/v1/node".to_string(),
None,
)
.await?;
if let Some(preferred_az) = &preferred_az {
let azs = nodes
.into_iter()
.map(|n| (n.availability_zone_id))
.collect::<HashSet<_>>();
if !azs.contains(preferred_az) {
anyhow::bail!(
"AZ {} not found on any node: known AZs are: {:?}",
preferred_az,
azs
);
}
} else {
// Make it obvious to the user that since they've omitted an AZ, we're clearing it
eprintln!("Clearing preferred AZ for tenant {}", tenant_id);
}
// Construct a request that modifies all the tenant's shards
let req = ShardsPreferredAzsRequest {
preferred_az_ids: describe_response
.shards
.into_iter()
.map(|s| {
(
s.tenant_shard_id,
preferred_az.clone().map(AvailabilityZone),
)
})
.collect(),
};
storcon_client
.dispatch::<ShardsPreferredAzsRequest, ()>(
Method::PUT,
"control/v1/preferred_azs".to_string(),
Some(req),
)
.await?;
}
Command::TenantWarmup { tenant_id } => {
let describe_response = storcon_client
.dispatch::<(), TenantDescribeResponse>(
@@ -915,10 +1037,7 @@ async fn main() -> anyhow::Result<()> {
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
Method::PUT,
format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id),
Some(TenantShardMigrateRequest {
tenant_shard_id: mv.tenant_shard_id,
node_id: mv.to,
}),
Some(TenantShardMigrateRequest { node_id: mv.to }),
)
.await
.map_err(|e| (mv.tenant_shard_id, mv.from, mv.to, e))
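For orientation, here is roughly what the new TenantSetPreferredAz command above ends up sending. This is a sketch only, assuming the usual serde behavior for the flattened map in ShardsPreferredAzsRequest; the shard ids and AZ name are made up:

// PUT control/v1/preferred_azs
//
// When an AZ is passed, every shard of the tenant maps to that AZ:
//   { "1f359dd625e519a1a4e8d7509690f6fc-0004": "eu-central-1a",
//     "1f359dd625e519a1a4e8d7509690f6fc-0104": "eu-central-1a" }
//
// When the AZ is omitted, every shard maps to null, which clears the preference
// (this is what the Option<AvailabilityZone> in the request type encodes):
//   { "1f359dd625e519a1a4e8d7509690f6fc-0004": null,
//     "1f359dd625e519a1a4e8d7509690f6fc-0104": null }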

View File

@@ -7,15 +7,11 @@ Currently we build two main images:
- [neondatabase/neon](https://hub.docker.com/repository/docker/neondatabase/neon) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
- [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14. Built from [/compute/compute-node.Dockerfile](/compute/compute-node.Dockerfile).
And additional intermediate image:
- [neondatabase/compute-tools](https://hub.docker.com/repository/docker/neondatabase/compute-tools) — compute node configuration management tools.
## Build pipeline
We build all images after a successful `release` test run and push them automatically to Docker Hub with two parallel CI jobs
1. `neondatabase/compute-tools` and `neondatabase/compute-node-v16` (and -v15 and -v14)
1. `neondatabase/compute-node-v17` (and -v16, -v15, -v14)
2. `neondatabase/neon`

View File

@@ -87,7 +87,7 @@ impl Display for AvailabilityZone {
#[derive(Serialize, Deserialize)]
pub struct ShardsPreferredAzsRequest {
#[serde(flatten)]
pub preferred_az_ids: HashMap<TenantShardId, AvailabilityZone>,
pub preferred_az_ids: HashMap<TenantShardId, Option<AvailabilityZone>>,
}
#[derive(Serialize, Deserialize)]
@@ -144,6 +144,8 @@ pub struct NodeDescribeResponse {
pub availability: NodeAvailabilityWrapper,
pub scheduling: NodeSchedulingPolicy,
pub availability_zone_id: String,
pub listen_http_addr: String,
pub listen_http_port: u16,
@@ -179,7 +181,6 @@ pub struct TenantDescribeResponseShard {
/// specifies some constraints, e.g. asking it to get off particular node(s)
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateRequest {
pub tenant_shard_id: TenantShardId,
pub node_id: NodeId,
}
@@ -368,6 +369,16 @@ pub enum PlacementPolicy {
Detached,
}
impl PlacementPolicy {
pub fn want_secondaries(&self) -> usize {
match self {
PlacementPolicy::Attached(secondary_count) => *secondary_count,
PlacementPolicy::Secondary => 1,
PlacementPolicy::Detached => 0,
}
}
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateResponse {}
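Two details of this file are worth spelling out: the migrate request body no longer carries the shard id (it now comes from the URL path only), and the new want_secondaries() helper makes the secondary-location count per placement policy explicit. A minimal sketch, assuming NodeId is the usual u64 newtype (the value 42 is made up):

// Shard id lives in the path, so the body is just the target node:
//   PUT control/v1/tenant/<tenant_shard_id>/migrate   with body roughly {"node_id": 42}
let _req = TenantShardMigrateRequest { node_id: NodeId(42) };

// Secondary-location expectations per placement policy:
assert_eq!(PlacementPolicy::Attached(1).want_secondaries(), 1);
assert_eq!(PlacementPolicy::Secondary.want_secondaries(), 1);
assert_eq!(PlacementPolicy::Detached.want_secondaries(), 0);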

View File

@@ -706,7 +706,7 @@ pub fn repl_origin_key_range() -> Range<Key> {
/// Non inherited range for vectored get.
pub const NON_INHERITED_RANGE: Range<Key> = AUX_FILES_KEY..AUX_FILES_KEY.next();
/// Sparse keyspace range for vectored get. Missing key error will be ignored for this range.
pub const NON_INHERITED_SPARSE_RANGE: Range<Key> = Key::metadata_key_range();
pub const SPARSE_RANGE: Range<Key> = Key::metadata_key_range();
impl Key {
// AUX_FILES currently stores only data for logical replication (slots etc), and
@@ -714,7 +714,42 @@ impl Key {
// switch (and generally it likely should be optional), so ignore these.
#[inline(always)]
pub fn is_inherited_key(self) -> bool {
!NON_INHERITED_RANGE.contains(&self) && !NON_INHERITED_SPARSE_RANGE.contains(&self)
if self.is_sparse() {
self.is_inherited_sparse_key()
} else {
!NON_INHERITED_RANGE.contains(&self)
}
}
#[inline(always)]
pub fn is_sparse(self) -> bool {
self.field1 >= METADATA_KEY_BEGIN_PREFIX && self.field1 < METADATA_KEY_END_PREFIX
}
/// Check if the key belongs to the inherited keyspace.
fn is_inherited_sparse_key(self) -> bool {
debug_assert!(self.is_sparse());
self.field1 == RELATION_SIZE_PREFIX
}
pub fn sparse_non_inherited_keyspace() -> Range<Key> {
// The two keys are adjacent; if we ever have non-adjacent keys in the future, we should return a keyspace
debug_assert_eq!(AUX_KEY_PREFIX + 1, REPL_ORIGIN_KEY_PREFIX);
Key {
field1: AUX_KEY_PREFIX,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
}..Key {
field1: REPL_ORIGIN_KEY_PREFIX + 1,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
}
}
#[inline(always)]
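To summarize the new classification in one place, a small illustrative sketch (assuming the module-internal items referenced above, NON_INHERITED_RANGE and the prefix constants, are in scope; this is not part of the change itself): relation-size keys are now the only sparse keys treated as inherited, while the aux-file and replication-origin prefixes form sparse_non_inherited_keyspace().

fn describe(key: Key) -> &'static str {
    if !key.is_sparse() {
        // Dense keyspace: everything is inherited except the AUX_FILES range.
        if NON_INHERITED_RANGE.contains(&key) {
            "dense, not inherited"
        } else {
            "dense, inherited"
        }
    } else if key.is_inherited_key() {
        // Sparse keyspace: only RELATION_SIZE_PREFIX keys are inherited.
        "sparse, inherited (relation size)"
    } else {
        // AUX_KEY_PREFIX / REPL_ORIGIN_KEY_PREFIX, i.e. sparse_non_inherited_keyspace().
        "sparse, not inherited"
    }
}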

View File

@@ -272,6 +272,8 @@ pub struct CompactInfoResponse {
pub compact_key_range: Option<CompactKeyRange>,
pub compact_lsn_range: Option<CompactLsnRange>,
pub sub_compaction: bool,
pub running: bool,
pub job_id: usize,
}
#[derive(Serialize, Deserialize, Clone)]
@@ -1398,6 +1400,8 @@ pub enum PagestreamFeMessage {
GetPage(PagestreamGetPageRequest),
DbSize(PagestreamDbSizeRequest),
GetSlruSegment(PagestreamGetSlruSegmentRequest),
#[cfg(feature = "testing")]
Test(PagestreamTestRequest),
}
// Wrapped in libpq CopyData
@@ -1409,6 +1413,8 @@ pub enum PagestreamBeMessage {
Error(PagestreamErrorResponse),
DbSize(PagestreamDbSizeResponse),
GetSlruSegment(PagestreamGetSlruSegmentResponse),
#[cfg(feature = "testing")]
Test(PagestreamTestResponse),
}
// Keep in sync with `pagestore_client.h`
@@ -1420,6 +1426,9 @@ enum PagestreamBeMessageTag {
Error = 103,
DbSize = 104,
GetSlruSegment = 105,
/// Test message discriminant is unstable
#[cfg(feature = "testing")]
Test = 106,
}
impl TryFrom<u8> for PagestreamBeMessageTag {
type Error = u8;
@@ -1431,6 +1440,8 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
103 => Ok(PagestreamBeMessageTag::Error),
104 => Ok(PagestreamBeMessageTag::DbSize),
105 => Ok(PagestreamBeMessageTag::GetSlruSegment),
#[cfg(feature = "testing")]
106 => Ok(PagestreamBeMessageTag::Test),
_ => Err(value),
}
}
@@ -1548,6 +1559,20 @@ pub struct PagestreamDbSizeResponse {
pub db_size: i64,
}
#[cfg(feature = "testing")]
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct PagestreamTestRequest {
pub hdr: PagestreamRequest,
pub batch_key: u64,
pub message: String,
}
#[cfg(feature = "testing")]
#[derive(Debug)]
pub struct PagestreamTestResponse {
pub req: PagestreamTestRequest,
}
// This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields
// that require pageserver-internal types. It is sufficient to get the total size.
#[derive(Serialize, Deserialize, Debug)]
@@ -1616,6 +1641,17 @@ impl PagestreamFeMessage {
bytes.put_u8(req.kind);
bytes.put_u32(req.segno);
}
#[cfg(feature = "testing")]
Self::Test(req) => {
bytes.put_u8(5);
bytes.put_u64(req.hdr.reqid);
bytes.put_u64(req.hdr.request_lsn.0);
bytes.put_u64(req.hdr.not_modified_since.0);
bytes.put_u64(req.batch_key);
let message = req.message.as_bytes();
bytes.put_u64(message.len() as u64);
bytes.put_slice(message);
}
}
bytes.into()
@@ -1703,6 +1739,21 @@ impl PagestreamFeMessage {
segno: body.read_u32::<BigEndian>()?,
},
)),
#[cfg(feature = "testing")]
5 => Ok(PagestreamFeMessage::Test(PagestreamTestRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
batch_key: body.read_u64::<BigEndian>()?,
message: {
let len = body.read_u64::<BigEndian>()?;
let mut buf = vec![0; len as usize];
body.read_exact(&mut buf)?;
String::from_utf8(buf)?
},
})),
_ => bail!("unknown smgr message tag: {:?}", msg_tag),
}
}
@@ -1746,6 +1797,15 @@ impl PagestreamBeMessage {
bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
bytes.put(&resp.segment[..]);
}
#[cfg(feature = "testing")]
Self::Test(resp) => {
bytes.put_u8(Tag::Test as u8);
bytes.put_u64(resp.req.batch_key);
let message = resp.req.message.as_bytes();
bytes.put_u64(message.len() as u64);
bytes.put_slice(message);
}
}
}
PagestreamProtocolVersion::V3 => {
@@ -1814,6 +1874,18 @@ impl PagestreamBeMessage {
bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
bytes.put(&resp.segment[..]);
}
#[cfg(feature = "testing")]
Self::Test(resp) => {
bytes.put_u8(Tag::Test as u8);
bytes.put_u64(resp.req.hdr.reqid);
bytes.put_u64(resp.req.hdr.request_lsn.0);
bytes.put_u64(resp.req.hdr.not_modified_since.0);
bytes.put_u64(resp.req.batch_key);
let message = resp.req.message.as_bytes();
bytes.put_u64(message.len() as u64);
bytes.put_slice(message);
}
}
}
}
@@ -1956,6 +2028,28 @@ impl PagestreamBeMessage {
segment: segment.into(),
})
}
#[cfg(feature = "testing")]
Tag::Test => {
let reqid = buf.read_u64::<BigEndian>()?;
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
let batch_key = buf.read_u64::<BigEndian>()?;
let len = buf.read_u64::<BigEndian>()?;
let mut msg = vec![0; len as usize];
buf.read_exact(&mut msg)?;
let message = String::from_utf8(msg)?;
Self::Test(PagestreamTestResponse {
req: PagestreamTestRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
batch_key,
message,
},
})
}
};
let remaining = buf.into_inner();
if !remaining.is_empty() {
@@ -1975,6 +2069,8 @@ impl PagestreamBeMessage {
Self::Error(_) => "Error",
Self::DbSize(_) => "DbSize",
Self::GetSlruSegment(_) => "GetSlruSegment",
#[cfg(feature = "testing")]
Self::Test(_) => "Test",
}
}
}
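A hedged sketch of what the testing-only request looks like on the wire, following the put_* calls above (assumes the crate is built with the `testing` feature; not part of the patch):

// Frontend framing (tag byte 5), all integers big-endian:
//   u8  tag = 5
//   u64 reqid, u64 request_lsn, u64 not_modified_since   (PagestreamRequest header)
//   u64 batch_key
//   u64 message length, followed by that many UTF-8 bytes
// (the corresponding backend response uses PagestreamBeMessageTag::Test = 106)
let req = PagestreamFeMessage::Test(PagestreamTestRequest {
    hdr: PagestreamRequest {
        reqid: 0,
        request_lsn: Lsn(23),
        not_modified_since: Lsn(23),
    },
    batch_key: 42,
    message: "hello".to_string(),
});
let bytes = req.serialize();
assert_eq!(bytes[0], 5); // the Test discriminant on the frontend side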

View File

@@ -44,7 +44,7 @@ pub struct ProtocolVersion(u32);
impl ProtocolVersion {
pub const fn new(major: u16, minor: u16) -> Self {
Self((major as u32) << 16 | minor as u32)
Self(((major as u32) << 16) | minor as u32)
}
pub const fn minor(self) -> u16 {
self.0 as u16
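A tiny worked example of the packing (a sketch, not in the patch): the major version goes into the high 16 bits, so `new(3, 1)` stores (3 << 16) | 1 = 0x0003_0001 and `minor()` truncates back to the low half.

assert_eq!(ProtocolVersion::new(3, 1).minor(), 1);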

View File

@@ -43,6 +43,17 @@ impl RemoteStorageKind {
}
}
impl RemoteStorageConfig {
/// Helper to fetch the configured concurrency limit.
pub fn concurrency_limit(&self) -> Option<usize> {
match &self.storage {
RemoteStorageKind::LocalFs { .. } => None,
RemoteStorageKind::AwsS3(c) => Some(c.concurrency_limit.into()),
RemoteStorageKind::AzureContainer(c) => Some(c.concurrency_limit.into()),
}
}
}
fn default_timeout() -> Duration {
RemoteStorageConfig::DEFAULT_TIMEOUT
}
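A hedged usage sketch, mirroring how the pageserver consumes this helper later in this diff (`remote_storage_config` here is an assumed `Option<RemoteStorageConfig>`): local filesystem storage reports no limit, which callers map to 0.

let inprogress_limit = remote_storage_config
    .as_ref()
    .and_then(|cfg| cfg.concurrency_limit())
    .unwrap_or(0);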

View File

@@ -112,9 +112,9 @@ impl Serialize for Generation {
// We should never be asked to serialize a None. Structures
// that include an optional generation should convert None to an
// Option<Generation>::None
Err(serde::ser::Error::custom(
"Tried to serialize invalid generation ({self})",
))
Err(serde::ser::Error::custom(format!(
"Tried to serialize invalid generation ({self:?})"
)))
}
}
}

View File

@@ -15,7 +15,7 @@ use once_cell::sync::Lazy;
use regex::Regex;
use routerify::ext::RequestExt;
use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
use tokio::sync::{mpsc, Mutex};
use tokio::sync::{mpsc, Mutex, Notify};
use tokio_stream::wrappers::ReceiverStream;
use tokio_util::io::ReaderStream;
use tracing::{debug, info, info_span, warn, Instrument};
@@ -350,33 +350,53 @@ pub async fn profile_cpu_handler(req: Request<Body>) -> Result<Response<Body>, A
};
let seconds = match parse_query_param(&req, "seconds")? {
None => 5,
Some(seconds @ 1..=30) => seconds,
Some(_) => return Err(ApiError::BadRequest(anyhow!("duration must be 1-30 secs"))),
Some(seconds @ 1..=60) => seconds,
Some(_) => return Err(ApiError::BadRequest(anyhow!("duration must be 1-60 secs"))),
};
let frequency_hz = match parse_query_param(&req, "frequency")? {
None => 99,
Some(1001..) => return Err(ApiError::BadRequest(anyhow!("frequency must be <=1000 Hz"))),
Some(frequency) => frequency,
};
// Only allow one profiler at a time.
static PROFILE_LOCK: Lazy<Mutex<()>> = Lazy::new(|| Mutex::new(()));
let _lock = PROFILE_LOCK
.try_lock()
.map_err(|_| ApiError::Conflict("profiler already running".into()))?;
let force: bool = parse_query_param(&req, "force")?.unwrap_or_default();
// Take the profile.
let report = tokio::task::spawn_blocking(move || {
static PROFILE_LOCK: Lazy<Mutex<()>> = Lazy::new(|| Mutex::new(()));
static PROFILE_CANCEL: Lazy<Notify> = Lazy::new(Notify::new);
let report = {
// Only allow one profiler at a time. If force is true, cancel a running profile (e.g. a
// Grafana continuous profile). We use a try_lock() loop when cancelling instead of waiting
// for a lock(), to avoid races where the notify isn't currently awaited.
let _lock = loop {
match PROFILE_LOCK.try_lock() {
Ok(lock) => break lock,
Err(_) if force => PROFILE_CANCEL.notify_waiters(),
Err(_) => {
return Err(ApiError::Conflict(
"profiler already running (use ?force=true to cancel it)".into(),
))
}
}
tokio::time::sleep(Duration::from_millis(1)).await; // don't busy-wait
};
let guard = ProfilerGuardBuilder::default()
.frequency(frequency_hz)
.blocklist(&["libc", "libgcc", "pthread", "vdso"])
.build()?;
std::thread::sleep(Duration::from_secs(seconds));
guard.report().build()
})
.await
.map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
.map_err(|pprof_err| ApiError::InternalServerError(pprof_err.into()))?;
.build()
.map_err(|err| ApiError::InternalServerError(err.into()))?;
tokio::select! {
_ = tokio::time::sleep(Duration::from_secs(seconds)) => {},
_ = PROFILE_CANCEL.notified() => {},
};
guard
.report()
.build()
.map_err(|err| ApiError::InternalServerError(err.into()))?
};
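A standalone sketch of the force-cancel pattern used above, with assumed names (LOCK, CANCEL and run_exclusive are illustrative, not the real handler): a caller with force set wakes the current holder through the Notify, then keeps retrying try_lock until the mutex frees up.

use std::time::Duration;
use once_cell::sync::Lazy;
use tokio::sync::{Mutex, Notify};

static LOCK: Lazy<Mutex<()>> = Lazy::new(|| Mutex::new(()));
static CANCEL: Lazy<Notify> = Lazy::new(Notify::new);

async fn run_exclusive(force: bool) -> Result<(), &'static str> {
    let _guard = loop {
        match LOCK.try_lock() {
            Ok(guard) => break guard,
            Err(_) if force => CANCEL.notify_waiters(), // ask the current holder to stop
            Err(_) => return Err("already running"),
        }
        tokio::time::sleep(Duration::from_millis(1)).await; // don't busy-wait
    };
    tokio::select! {
        _ = tokio::time::sleep(Duration::from_secs(5)) => {} // the actual work
        _ = CANCEL.notified() => {}                          // cancelled by a forced caller
    }
    Ok(())
}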
// Return the report in the requested format.
match format {

View File

@@ -260,7 +260,7 @@ impl FromStr for Lsn {
{
let left_num = u32::from_str_radix(left, 16).map_err(|_| LsnParseError)?;
let right_num = u32::from_str_radix(right, 16).map_err(|_| LsnParseError)?;
Ok(Lsn((left_num as u64) << 32 | right_num as u64))
Ok(Lsn(((left_num as u64) << 32) | right_num as u64))
} else {
Err(LsnParseError)
}
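A quick worked example of the parse (a sketch, not in the patch): the part before the slash becomes the high 32 bits.

assert_eq!("1/2".parse::<Lsn>().unwrap(), Lsn(0x1_0000_0002));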

View File

@@ -8,7 +8,7 @@ license.workspace = true
default = []
# Enables test-only APIs, including failpoints. In particular, enables the `fail_point!` macro,
# which adds some runtime cost to run tests on outage conditions
testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing"]
testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing", "pageserver_client/testing"]
[dependencies]
anyhow.workspace = true
@@ -44,6 +44,7 @@ postgres_backend.workspace = true
postgres-protocol.workspace = true
postgres-types.workspace = true
postgres_initdb.workspace = true
pprof.workspace = true
rand.workspace = true
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
regex.workspace = true
@@ -108,3 +109,11 @@ harness = false
[[bench]]
name = "bench_ingest"
harness = false
[[bench]]
name = "upload_queue"
harness = false
[[bin]]
name = "test_helper_slow_client_reads"
required-features = [ "testing" ]

View File

@@ -0,0 +1,87 @@
//! Upload queue benchmarks.
use std::str::FromStr as _;
use std::sync::atomic::AtomicU32;
use std::sync::Arc;
use criterion::{criterion_group, criterion_main, Bencher, Criterion};
use pageserver::tenant::metadata::TimelineMetadata;
use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
use pageserver::tenant::storage_layer::LayerName;
use pageserver::tenant::upload_queue::{Delete, UploadOp, UploadQueue, UploadTask};
use pageserver::tenant::IndexPart;
use pprof::criterion::{Output, PProfProfiler};
use utils::generation::Generation;
use utils::shard::{ShardCount, ShardIndex, ShardNumber};
// Register benchmarks with Criterion.
criterion_group!(
name = benches;
config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
targets = bench_upload_queue_next_ready,
);
criterion_main!(benches);
/// Benchmarks the cost of UploadQueue::next_ready() with the given number of in-progress tasks
/// (which is equivalent to tasks ahead of it in the queue). This has linear cost, and the upload
/// queue as a whole is thus quadratic.
///
/// UploadOp::UploadLayer requires an entire tenant and timeline to construct, so we just test
/// Delete and UploadMetadata instead. This is incidentally the most expensive case.
fn bench_upload_queue_next_ready(c: &mut Criterion) {
let mut g = c.benchmark_group("upload_queue_next_ready");
for inprogress in [0, 1, 10, 100, 1_000, 10_000, 100_000, 1_000_000] {
g.bench_function(format!("inprogress={inprogress}"), |b| {
run_bench(b, inprogress).unwrap()
});
}
fn run_bench(b: &mut Bencher, inprogress: usize) -> anyhow::Result<()> {
// Construct two layers. layer0 is in the indexes, layer1 will be deleted.
let layer0 = LayerName::from_str("000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51").expect("invalid name");
let layer1 = LayerName::from_str("100000000000000000000000000000000001-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51").expect("invalid name");
let metadata = LayerFileMetadata {
shard: ShardIndex::new(ShardNumber(1), ShardCount(2)),
generation: Generation::Valid(1),
file_size: 0,
};
// Construct the (initial and uploaded) index with layer0.
let mut index = IndexPart::empty(TimelineMetadata::example());
index.layer_metadata.insert(layer0, metadata.clone());
// Construct the queue.
let mut queue = UploadQueue::Uninitialized;
let queue = queue.initialize_with_current_remote_index_part(&index, 0)?;
// Populate inprogress_tasks with a bunch of layer1 deletions.
let delete = UploadOp::Delete(Delete {
layers: vec![(layer1, metadata)],
});
for task_id in 0..(inprogress as u64) {
queue.inprogress_tasks.insert(
task_id,
Arc::new(UploadTask {
task_id,
retries: AtomicU32::new(0),
op: delete.clone(),
coalesced_ops: Vec::new(),
}),
);
}
// Benchmark index upload scheduling.
let index_upload = UploadOp::UploadMetadata {
uploaded: Box::new(index),
};
b.iter(|| {
queue.queued_operations.push_front(index_upload.clone());
assert!(queue.next_ready().is_some());
});
Ok(())
}
}

View File

@@ -4,6 +4,9 @@ version = "0.1.0"
edition.workspace = true
license.workspace = true
[features]
testing = [ "pageserver_api/testing" ]
[dependencies]
pageserver_api.workspace = true
thiserror.workspace = true

View File

@@ -1,6 +1,9 @@
use std::pin::Pin;
use std::sync::{Arc, Mutex};
use futures::SinkExt;
use futures::{
stream::{SplitSink, SplitStream},
SinkExt, StreamExt,
};
use pageserver_api::{
models::{
PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest,
@@ -10,7 +13,6 @@ use pageserver_api::{
};
use tokio::task::JoinHandle;
use tokio_postgres::CopyOutStream;
use tokio_stream::StreamExt;
use tokio_util::sync::CancellationToken;
use utils::{
id::{TenantId, TimelineId},
@@ -62,15 +64,28 @@ impl Client {
.client
.copy_both_simple(&format!("pagestream_v3 {tenant_id} {timeline_id}"))
.await?;
let (sink, stream) = copy_both.split(); // TODO: actually support splitting of the CopyBothDuplex so the lock inside this split adaptor goes away.
let Client {
cancel_on_client_drop,
conn_task,
client: _,
} = self;
let shared = Arc::new(Mutex::new(PagestreamShared::ConnTaskRunning(
ConnTaskRunning {
cancel_on_client_drop,
conn_task,
},
)));
Ok(PagestreamClient {
copy_both: Box::pin(copy_both),
conn_task,
cancel_on_client_drop,
sink: PagestreamSender {
shared: shared.clone(),
sink,
},
stream: PagestreamReceiver {
shared: shared.clone(),
stream,
},
shared,
})
}
@@ -97,7 +112,28 @@ impl Client {
/// Create using [`Client::pagestream`].
pub struct PagestreamClient {
copy_both: Pin<Box<tokio_postgres::CopyBothDuplex<bytes::Bytes>>>,
shared: Arc<Mutex<PagestreamShared>>,
sink: PagestreamSender,
stream: PagestreamReceiver,
}
pub struct PagestreamSender {
#[allow(dead_code)]
shared: Arc<Mutex<PagestreamShared>>,
sink: SplitSink<tokio_postgres::CopyBothDuplex<bytes::Bytes>, bytes::Bytes>,
}
pub struct PagestreamReceiver {
#[allow(dead_code)]
shared: Arc<Mutex<PagestreamShared>>,
stream: SplitStream<tokio_postgres::CopyBothDuplex<bytes::Bytes>>,
}
enum PagestreamShared {
ConnTaskRunning(ConnTaskRunning),
ConnTaskCancelledJoinHandleReturnedOrDropped,
}
struct ConnTaskRunning {
cancel_on_client_drop: Option<tokio_util::sync::DropGuard>,
conn_task: JoinHandle<()>,
}
@@ -110,11 +146,11 @@ pub struct RelTagBlockNo {
impl PagestreamClient {
pub async fn shutdown(self) {
let Self {
copy_both,
cancel_on_client_drop: cancel_conn_task,
conn_task,
} = self;
// The `copy_both` contains internal channel sender, the receiver of which is polled by `conn_task`.
shared,
sink,
stream,
} = { self };
// The `copy_both`, now split into `sink` and `stream`, contains an internal channel sender, the receiver of which is polled by `conn_task`.
// When `conn_task` observes the sender has been dropped, it sends a `FeMessage::CopyFail` into the connection.
// (see https://github.com/neondatabase/rust-postgres/blob/2005bf79573b8add5cf205b52a2b208e356cc8b0/tokio-postgres/src/copy_both.rs#L56).
//
@@ -131,27 +167,77 @@ impl PagestreamClient {
//
// NB: page_service doesn't have a use case to exit the `pagestream` mode currently.
// => https://github.com/neondatabase/neon/issues/6390
let _ = cancel_conn_task.unwrap();
let ConnTaskRunning {
cancel_on_client_drop,
conn_task,
} = {
let mut guard = shared.lock().unwrap();
match std::mem::replace(
&mut *guard,
PagestreamShared::ConnTaskCancelledJoinHandleReturnedOrDropped,
) {
PagestreamShared::ConnTaskRunning(conn_task_running) => conn_task_running,
PagestreamShared::ConnTaskCancelledJoinHandleReturnedOrDropped => unreachable!(),
}
};
let _ = cancel_on_client_drop.unwrap();
conn_task.await.unwrap();
drop(copy_both);
// Now drop the split copy_both.
drop(sink);
drop(stream);
}
pub fn split(self) -> (PagestreamSender, PagestreamReceiver) {
let Self {
shared: _,
sink,
stream,
} = self;
(sink, stream)
}
pub async fn getpage(
&mut self,
req: PagestreamGetPageRequest,
) -> anyhow::Result<PagestreamGetPageResponse> {
let req = PagestreamFeMessage::GetPage(req);
let req: bytes::Bytes = req.serialize();
// let mut req = tokio_util::io::ReaderStream::new(&req);
let mut req = tokio_stream::once(Ok(req));
self.getpage_send(req).await?;
self.getpage_recv().await
}
self.copy_both.send_all(&mut req).await?;
pub async fn getpage_send(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> {
self.sink.getpage_send(req).await
}
let next: Option<Result<bytes::Bytes, _>> = self.copy_both.next().await;
pub async fn getpage_recv(&mut self) -> anyhow::Result<PagestreamGetPageResponse> {
self.stream.getpage_recv().await
}
}
impl PagestreamSender {
// TODO: maybe make this impl Sink instead for better composability?
pub async fn send(&mut self, msg: PagestreamFeMessage) -> anyhow::Result<()> {
let msg = msg.serialize();
self.sink.send_all(&mut tokio_stream::once(Ok(msg))).await?;
Ok(())
}
pub async fn getpage_send(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> {
self.send(PagestreamFeMessage::GetPage(req)).await
}
}
impl PagestreamReceiver {
// TODO: maybe make this impl Stream instead for better composability?
pub async fn recv(&mut self) -> anyhow::Result<PagestreamBeMessage> {
let next: Option<Result<bytes::Bytes, _>> = self.stream.next().await;
let next: bytes::Bytes = next.unwrap()?;
PagestreamBeMessage::deserialize(next)
}
let msg = PagestreamBeMessage::deserialize(next)?;
match msg {
pub async fn getpage_recv(&mut self) -> anyhow::Result<PagestreamGetPageResponse> {
let next: PagestreamBeMessage = self.recv().await?;
match next {
PagestreamBeMessage::GetPage(p) => Ok(p),
PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e),
PagestreamBeMessage::Exists(_)
@@ -160,7 +246,14 @@ impl PagestreamClient {
| PagestreamBeMessage::GetSlruSegment(_) => {
anyhow::bail!(
"unexpected be message kind in response to getpage request: {}",
msg.kind()
next.kind()
)
}
#[cfg(feature = "testing")]
PagestreamBeMessage::Test(_) => {
anyhow::bail!(
"unexpected be message kind in response to getpage request: {}",
next.kind()
)
}
}

View File

@@ -53,10 +53,12 @@ project_build_tag!(BUILD_TAG);
#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20).
/// Configure jemalloc to profile heap allocations by sampling stack traces every 2 MB (1 << 21).
/// This adds roughly 3% overhead for allocations on average, which is acceptable considering
/// performance-sensitive code will avoid allocations as far as possible anyway.
#[allow(non_upper_case_globals)]
#[export_name = "malloc_conf"]
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0";
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0";
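For reference, lg_prof_sample is a power-of-two exponent, so the new value means roughly one sampled stack trace per 2 MiB of allocations on average (a sketch, not in the patch):

assert_eq!(1u64 << 21, 2 * 1024 * 1024);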
const PID_FILE_NAME: &str = "pageserver.pid";

View File

@@ -0,0 +1,65 @@
use std::{
io::{stdin, stdout, Read, Write},
time::Duration,
};
use clap::Parser;
use pageserver_api::models::{PagestreamRequest, PagestreamTestRequest};
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
};
#[derive(clap::Parser)]
struct Args {
connstr: String,
tenant_id: TenantId,
timeline_id: TimelineId,
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let Args {
connstr,
tenant_id,
timeline_id,
} = Args::parse();
let client = pageserver_client::page_service::Client::new(connstr).await?;
let client = client.pagestream(tenant_id, timeline_id).await?;
let (mut sender, _receiver) = client.split();
eprintln!("filling the pipe");
let mut msg = 0;
loop {
msg += 1;
let fut = sender.send(pageserver_api::models::PagestreamFeMessage::Test(
PagestreamTestRequest {
hdr: PagestreamRequest {
reqid: 0,
request_lsn: Lsn(23),
not_modified_since: Lsn(23),
},
batch_key: 42,
message: format!("message {}", msg),
},
));
let Ok(res) = tokio::time::timeout(Duration::from_secs(10), fut).await else {
eprintln!("pipe seems full");
break;
};
let _: () = res?;
}
let n = stdout().write(b"R")?;
assert_eq!(n, 1);
stdout().flush()?;
eprintln!("waiting for signal to tell us to exit");
let mut buf = [0u8; 1];
stdin().read_exact(&mut buf)?;
eprintln!("termination signal received, exiting");
anyhow::Ok(())
}
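A hedged sketch of how a test harness could drive this helper (the binary name and argument placeholders are assumptions; the real test is not part of this hunk): wait for the 'R' byte that signals the pipe is full, exercise the pageserver while the client stays stalled, then write any byte to stdin so the helper exits.

use std::io::{Read, Write};
use std::process::{Command, Stdio};

fn drive_helper() -> anyhow::Result<()> {
    let mut child = Command::new("test_helper_slow_client_reads")
        .args(["<connstr>", "<tenant_id>", "<timeline_id>"]) // placeholders
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .spawn()?;
    let mut ready = [0u8; 1];
    child.stdout.as_mut().unwrap().read_exact(&mut ready)?; // helper wrote "R": the pipe is full
    // ... do whatever needs the stalled pagestream connection here ...
    child.stdin.as_mut().unwrap().write_all(b"x")?; // any byte tells the helper to exit
    child.wait()?;
    Ok(())
}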

View File

@@ -97,8 +97,8 @@ use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
use crate::DEFAULT_PG_VERSION;
use crate::{disk_usage_eviction_task, tenant};
use pageserver_api::models::{
CompactInfoResponse, StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest,
TimelineGcRequest, TimelineInfo,
StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest,
TimelineInfo,
};
use utils::{
auth::SwappableJwtAuth,
@@ -2052,15 +2052,7 @@ async fn timeline_compact_info_handler(
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
let res = tenant.get_scheduled_compaction_tasks(timeline_id);
let mut resp = Vec::new();
for item in res {
resp.push(CompactInfoResponse {
compact_key_range: item.compact_key_range,
compact_lsn_range: item.compact_lsn_range,
sub_compaction: item.sub_compaction,
});
}
let resp = tenant.get_scheduled_compaction_tasks(timeline_id);
json_response(StatusCode::OK, resp)
}
.instrument(info_span!("timeline_compact_info", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))

View File

@@ -91,15 +91,6 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_layers_visited_per_read_global",
"Number of layers visited to reconstruct one key",
vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
)
.expect("failed to define a metric")
});
pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_layers_visited_per_vectored_read_global",
@@ -1238,6 +1229,16 @@ pub(crate) struct SmgrOpTimerInner {
timings: SmgrOpTimerState,
}
/// The stages of request processing are represented by the enum variants.
/// Used as part of [`SmgrOpTimerInner::timings`].
///
/// Request processing calls into the `SmgrOpTimer::observe_*` methods at the
/// transition points.
/// These methods bump relevant counters and then update [`SmgrOpTimerInner::timings`]
/// to the next state.
///
/// Each request goes through every stage, in all configurations.
///
#[derive(Debug)]
enum SmgrOpTimerState {
Received {
@@ -1247,13 +1248,13 @@ enum SmgrOpTimerState {
#[allow(dead_code)]
received_at: Instant,
},
ParsedRoutedNowThrottling {
Throttling {
throttle_started_at: Instant,
},
ParsedRoutedThrottledNowBatching {
Batching {
throttle_done_at: Instant,
},
BatchedNowExecuting {
Executing {
execution_started_at: Instant,
},
Flushing,
@@ -1262,6 +1263,7 @@ enum SmgrOpTimerState {
// NB: when adding observation points, remember to update the Drop impl.
impl SmgrOpTimer {
/// See [`SmgrOpTimerState`] for more context.
pub(crate) fn observe_throttle_start(&mut self, at: Instant) {
let Some(inner) = self.0.as_mut() else {
return;
@@ -1270,24 +1272,26 @@ impl SmgrOpTimer {
return;
};
inner.throttling.count_accounted_start.inc();
inner.timings = SmgrOpTimerState::ParsedRoutedNowThrottling {
inner.timings = SmgrOpTimerState::Throttling {
throttle_started_at: at,
};
}
/// See [`SmgrOpTimerState`] for more context.
pub(crate) fn observe_throttle_done(&mut self, throttle: ThrottleResult) {
let Some(inner) = self.0.as_mut() else {
return;
};
let SmgrOpTimerState::ParsedRoutedNowThrottling {
let SmgrOpTimerState::Throttling {
throttle_started_at,
} = &mut inner.timings
} = &inner.timings
else {
return;
};
inner.throttling.count_accounted_finish.inc();
match throttle {
ThrottleResult::NotThrottled { end } => {
inner.timings = SmgrOpTimerState::ParsedRoutedThrottledNowBatching {
inner.timings = SmgrOpTimerState::Batching {
throttle_done_at: end,
};
}
@@ -1299,20 +1303,19 @@ impl SmgrOpTimer {
.wait_time
.inc_by((end - *throttle_started_at).as_micros().try_into().unwrap());
// state transition
inner.timings = SmgrOpTimerState::ParsedRoutedThrottledNowBatching {
inner.timings = SmgrOpTimerState::Batching {
throttle_done_at: end,
};
}
}
}
/// See [`SmgrOpTimerState`] for more context.
pub(crate) fn observe_execution_start(&mut self, at: Instant) {
let Some(inner) = self.0.as_mut() else {
return;
};
let SmgrOpTimerState::ParsedRoutedThrottledNowBatching { throttle_done_at } =
&mut inner.timings
else {
let SmgrOpTimerState::Batching { throttle_done_at } = &inner.timings else {
return;
};
// update metrics
@@ -1322,13 +1325,15 @@ impl SmgrOpTimer {
.per_timeline_batch_wait_time
.observe(batch.as_secs_f64());
// state transition
inner.timings = SmgrOpTimerState::BatchedNowExecuting {
inner.timings = SmgrOpTimerState::Executing {
execution_started_at: at,
}
}
/// For all but the first caller, this is a no-op.
/// The first caller receives Some, subsequent ones None.
///
/// See [`SmgrOpTimerState`] for more context.
pub(crate) fn observe_execution_end_flush_start(
&mut self,
at: Instant,
@@ -1338,9 +1343,9 @@ impl SmgrOpTimer {
let Some(mut inner) = self.0.take() else {
return None;
};
let SmgrOpTimerState::BatchedNowExecuting {
let SmgrOpTimerState::Executing {
execution_started_at,
} = &mut inner.timings
} = &inner.timings
else {
return None;
};
@@ -1373,6 +1378,14 @@ impl SmgrOpTimer {
}
}
/// The last stage of request processing is serializing and flushing the request
/// into the TCP connection. We want to make slow flushes observable
/// _while they are occurring_, so this struct provides a wrapper method [`Self::measure`]
/// to periodically bump the metric.
///
/// If in the future we decide that we're not interested in live updates, we can
/// add another `observe_*` method to [`SmgrOpTimer`], follow the existing pattern there,
/// and remove this struct from the code base.
pub(crate) struct SmgrOpFlushInProgress {
flush_started_at: Instant,
global_micros: IntCounter,
@@ -1450,6 +1463,8 @@ pub enum SmgrQueryType {
GetPageAtLsn,
GetDbSize,
GetSlruSegment,
#[cfg(feature = "testing")]
Test,
}
pub(crate) struct SmgrQueryTimePerTimeline {
@@ -3863,7 +3878,6 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {
// histograms
[
&READ_NUM_LAYERS_VISITED,
&VEC_READ_NUM_LAYERS_VISITED,
&WAIT_LSN_TIME,
&WAL_REDO_TIME,

View File

@@ -554,6 +554,12 @@ struct BatchedGetPageRequest {
timer: SmgrOpTimer,
}
#[cfg(feature = "testing")]
struct BatchedTestRequest {
req: models::PagestreamTestRequest,
timer: SmgrOpTimer,
}
enum BatchedFeMessage {
Exists {
span: Span,
@@ -585,6 +591,12 @@ enum BatchedFeMessage {
shard: timeline::handle::WeakHandle<TenantManagerTypes>,
req: models::PagestreamGetSlruSegmentRequest,
},
#[cfg(feature = "testing")]
Test {
span: Span,
shard: timeline::handle::Handle<TenantManagerTypes>,
requests: Vec<BatchedTestRequest>,
},
RespondError {
span: Span,
error: BatchedPageStreamError,
@@ -605,6 +617,12 @@ impl BatchedFeMessage {
page.timer.observe_execution_start(at);
}
}
#[cfg(feature = "testing")]
BatchedFeMessage::Test { requests, .. } => {
for req in requests {
req.timer.observe_execution_start(at);
}
}
BatchedFeMessage::RespondError { .. } => {}
}
}
@@ -866,6 +884,22 @@ impl PageServerHandler {
pages: smallvec::smallvec![BatchedGetPageRequest { req, timer }],
}
}
#[cfg(feature = "testing")]
PagestreamFeMessage::Test(req) => {
let span = tracing::info_span!(parent: parent_span, "handle_test_request");
let shard = timeline_handles
.get(tenant_id, timeline_id, ShardSelector::Zero)
.instrument(span.clone()) // sets `shard_id` field
.await?;
let timer =
record_op_start_and_throttle(&shard, metrics::SmgrQueryType::Test, received_at)
.await?;
BatchedFeMessage::Test {
span,
shard,
requests: vec![BatchedTestRequest { req, timer }],
}
}
};
Ok(Some(batched_msg))
}
@@ -926,6 +960,46 @@ impl PageServerHandler {
accum_pages.extend(this_pages);
Ok(())
}
#[cfg(feature = "testing")]
(
Ok(BatchedFeMessage::Test {
shard: accum_shard,
requests: accum_requests,
..
}),
BatchedFeMessage::Test {
shard: this_shard,
requests: this_requests,
..
},
) if (|| {
assert!(this_requests.len() == 1);
if accum_requests.len() >= max_batch_size.get() {
trace!(%max_batch_size, "stopping batching because of batch size");
assert_eq!(accum_requests.len(), max_batch_size.get());
return false;
}
if (accum_shard.tenant_shard_id, accum_shard.timeline_id)
!= (this_shard.tenant_shard_id, this_shard.timeline_id)
{
trace!("stopping batching because timeline object mismatch");
// TODO: we _could_ batch & execute each shard separately (and in parallel).
// But the current logic for keeping responses in order does not support that.
return false;
}
let this_batch_key = this_requests[0].req.batch_key;
let accum_batch_key = accum_requests[0].req.batch_key;
if this_requests[0].req.batch_key != accum_requests[0].req.batch_key {
trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed");
return false;
}
true
})() =>
{
// ok to batch
accum_requests.extend(this_requests);
Ok(())
}
// something batched already but this message is unbatchable
(_, this_msg) => {
// by default, don't continue batching
@@ -1052,6 +1126,27 @@ impl PageServerHandler {
span,
)
}
#[cfg(feature = "testing")]
BatchedFeMessage::Test {
span,
shard,
requests,
} => {
fail::fail_point!("ps::handle-pagerequest-message::test");
(
{
let npages = requests.len();
trace!(npages, "handling getpage request");
let res = self
.handle_test_request_batch(&shard, requests, ctx)
.instrument(span.clone())
.await;
assert_eq!(res.len(), npages);
res
},
span,
)
}
BatchedFeMessage::RespondError { span, error } => {
// We've already decided to respond with an error, so we don't need to
// call the handler.
@@ -1775,6 +1870,51 @@ impl PageServerHandler {
))
}
// NB: this impl mimics what we do for batched getpage requests.
#[cfg(feature = "testing")]
#[instrument(skip_all, fields(shard_id))]
async fn handle_test_request_batch(
&mut self,
timeline: &Timeline,
requests: Vec<BatchedTestRequest>,
_ctx: &RequestContext,
) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>> {
// real requests would do something with the timeline
let mut results = Vec::with_capacity(requests.len());
for _req in requests.iter() {
tokio::task::yield_now().await;
results.push({
if timeline.cancel.is_cancelled() {
Err(PageReconstructError::Cancelled)
} else {
Ok(())
}
});
}
// TODO: avoid creating the new Vec here
Vec::from_iter(
requests
.into_iter()
.zip(results.into_iter())
.map(|(req, res)| {
res.map(|()| {
(
PagestreamBeMessage::Test(models::PagestreamTestResponse {
req: req.req.clone(),
}),
req.timer,
)
})
.map_err(|e| BatchedPageStreamError {
err: PageStreamError::from(e),
req: req.req.hdr,
})
}),
)
}
/// Note on "fullbackup":
/// Full basebackups should only be used for debugging purposes.
/// Originally, it was introduced to enable breaking storage format changes,

View File

@@ -21,6 +21,7 @@ use enumset::EnumSet;
use futures::stream::FuturesUnordered;
use futures::StreamExt;
use pageserver_api::models;
use pageserver_api::models::CompactInfoResponse;
use pageserver_api::models::LsnLease;
use pageserver_api::models::TimelineArchivalState;
use pageserver_api::models::TimelineState;
@@ -37,21 +38,17 @@ use remote_timeline_client::manifest::{
};
use remote_timeline_client::UploadQueueNotReadyError;
use std::collections::BTreeMap;
use std::collections::VecDeque;
use std::fmt;
use std::future::Future;
use std::sync::atomic::AtomicBool;
use std::sync::Weak;
use std::time::SystemTime;
use storage_broker::BrokerClientChannel;
use timeline::compaction::GcCompactJob;
use timeline::compaction::ScheduledCompactionTask;
use timeline::compaction::GcCompactionQueue;
use timeline::import_pgdata;
use timeline::offload::offload_timeline;
use timeline::offload::OffloadError;
use timeline::CompactFlags;
use timeline::CompactOptions;
use timeline::CompactionError;
use timeline::ShutdownMode;
use tokio::io::BufReader;
use tokio::sync::watch;
@@ -347,10 +344,8 @@ pub struct Tenant {
/// Overhead of mutex is acceptable because compaction is done with a multi-second period.
compaction_circuit_breaker: std::sync::Mutex<CircuitBreaker>,
/// Scheduled compaction tasks. Currently, this can only be populated by triggering
/// a manual gc-compaction from the manual compaction API.
scheduled_compaction_tasks:
std::sync::Mutex<HashMap<TimelineId, VecDeque<ScheduledCompactionTask>>>,
/// Scheduled gc-compaction tasks.
scheduled_compaction_tasks: std::sync::Mutex<HashMap<TimelineId, Arc<GcCompactionQueue>>>,
/// If the tenant is in Activating state, notify this to encourage it
/// to proceed to Active as soon as possible, rather than waiting for lazy
@@ -2999,104 +2994,18 @@ impl Tenant {
if has_pending_l0_compaction_task {
Some(true)
} else {
let mut has_pending_scheduled_compaction_task;
let next_scheduled_compaction_task = {
let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
if let Some(tline_pending_tasks) = guard.get_mut(timeline_id) {
if !tline_pending_tasks.is_empty() {
info!(
"{} tasks left in the compaction schedule queue",
tline_pending_tasks.len()
);
}
let next_task = tline_pending_tasks.pop_front();
has_pending_scheduled_compaction_task = !tline_pending_tasks.is_empty();
next_task
} else {
has_pending_scheduled_compaction_task = false;
None
}
let queue = {
let guard = self.scheduled_compaction_tasks.lock().unwrap();
guard.get(timeline_id).cloned()
};
if let Some(mut next_scheduled_compaction_task) = next_scheduled_compaction_task
{
if !next_scheduled_compaction_task
.options
.flags
.contains(CompactFlags::EnhancedGcBottomMostCompaction)
{
warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", next_scheduled_compaction_task.options);
} else if next_scheduled_compaction_task.options.sub_compaction {
info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs");
let jobs: Vec<GcCompactJob> = timeline
.gc_compaction_split_jobs(
GcCompactJob::from_compact_options(
next_scheduled_compaction_task.options.clone(),
),
next_scheduled_compaction_task
.options
.sub_compaction_max_job_size_mb,
)
.await
.map_err(CompactionError::Other)?;
if jobs.is_empty() {
info!("no jobs to run, skipping scheduled compaction task");
} else {
has_pending_scheduled_compaction_task = true;
let jobs_len = jobs.len();
let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
let tline_pending_tasks = guard.entry(*timeline_id).or_default();
for (idx, job) in jobs.into_iter().enumerate() {
// Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions`
// until we do further refactors to allow directly call `compact_with_gc`.
let mut flags: EnumSet<CompactFlags> = EnumSet::default();
flags |= CompactFlags::EnhancedGcBottomMostCompaction;
if job.dry_run {
flags |= CompactFlags::DryRun;
}
let options = CompactOptions {
flags,
sub_compaction: false,
compact_key_range: Some(job.compact_key_range.into()),
compact_lsn_range: Some(job.compact_lsn_range.into()),
sub_compaction_max_job_size_mb: None,
};
tline_pending_tasks.push_back(if idx == jobs_len - 1 {
ScheduledCompactionTask {
options,
// The last job in the queue sends the signal and releases the gc guard
result_tx: next_scheduled_compaction_task
.result_tx
.take(),
gc_block: next_scheduled_compaction_task
.gc_block
.take(),
}
} else {
ScheduledCompactionTask {
options,
result_tx: None,
gc_block: None,
}
});
}
info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len);
}
} else {
let _ = timeline
.compact_with_options(
cancel,
next_scheduled_compaction_task.options,
ctx,
)
.instrument(info_span!("scheduled_compact_timeline", %timeline_id))
.await?;
if let Some(tx) = next_scheduled_compaction_task.result_tx.take() {
// TODO: we can send compaction statistics in the future
tx.send(()).ok();
}
}
if let Some(queue) = queue {
let has_pending_tasks = queue
.iteration(cancel, ctx, &self.gc_block, timeline)
.await?;
Some(has_pending_tasks)
} else {
Some(false)
}
Some(has_pending_scheduled_compaction_task)
}
} else {
None
@@ -3126,34 +3035,32 @@ impl Tenant {
}
/// Cancel scheduled compaction tasks
pub(crate) fn cancel_scheduled_compaction(
&self,
timeline_id: TimelineId,
) -> Vec<ScheduledCompactionTask> {
pub(crate) fn cancel_scheduled_compaction(&self, timeline_id: TimelineId) {
let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
if let Some(tline_pending_tasks) = guard.get_mut(&timeline_id) {
let current_tline_pending_tasks = std::mem::take(tline_pending_tasks);
current_tline_pending_tasks.into_iter().collect()
} else {
Vec::new()
if let Some(q) = guard.get_mut(&timeline_id) {
q.cancel_scheduled();
}
}
pub(crate) fn get_scheduled_compaction_tasks(
&self,
timeline_id: TimelineId,
) -> Vec<CompactOptions> {
use itertools::Itertools;
let guard = self.scheduled_compaction_tasks.lock().unwrap();
guard
.get(&timeline_id)
.map(|tline_pending_tasks| {
tline_pending_tasks
.iter()
.map(|x| x.options.clone())
.collect_vec()
})
.unwrap_or_default()
) -> Vec<CompactInfoResponse> {
let res = {
let guard = self.scheduled_compaction_tasks.lock().unwrap();
guard.get(&timeline_id).map(|q| q.remaining_jobs())
};
let Some((running, remaining)) = res else {
return Vec::new();
};
let mut result = Vec::new();
if let Some((id, running)) = running {
result.extend(running.into_compact_info_resp(id, true));
}
for (id, job) in remaining {
result.extend(job.into_compact_info_resp(id, false));
}
result
}
/// Schedule a compaction task for a timeline.
@@ -3162,20 +3069,12 @@ impl Tenant {
timeline_id: TimelineId,
options: CompactOptions,
) -> anyhow::Result<tokio::sync::oneshot::Receiver<()>> {
let gc_guard = match self.gc_block.start().await {
Ok(guard) => guard,
Err(e) => {
bail!("cannot run gc-compaction because gc is blocked: {}", e);
}
};
let (tx, rx) = tokio::sync::oneshot::channel();
let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
let tline_pending_tasks = guard.entry(timeline_id).or_default();
tline_pending_tasks.push_back(ScheduledCompactionTask {
options,
result_tx: Some(tx),
gc_block: Some(gc_guard),
});
let q = guard
.entry(timeline_id)
.or_insert_with(|| Arc::new(GcCompactionQueue::new()));
q.schedule_manual_compaction(options, Some(tx));
Ok(rx)
}
@@ -5791,7 +5690,7 @@ mod tests {
use bytes::{Bytes, BytesMut};
use hex_literal::hex;
use itertools::Itertools;
use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX};
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
use pageserver_api::value::Value;
@@ -7850,7 +7749,18 @@ mod tests {
let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap();
let base_key_child = Key::from_hex("620000000033333333444444445500000001").unwrap();
let base_key_nonexist = Key::from_hex("620000000033333333444444445500000002").unwrap();
let base_key_overwrite = Key::from_hex("620000000033333333444444445500000003").unwrap();
let base_inherited_key = Key::from_hex("610000000033333333444444445500000000").unwrap();
let base_inherited_key_child =
Key::from_hex("610000000033333333444444445500000001").unwrap();
let base_inherited_key_nonexist =
Key::from_hex("610000000033333333444444445500000002").unwrap();
let base_inherited_key_overwrite =
Key::from_hex("610000000033333333444444445500000003").unwrap();
assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix...
assert_eq!(base_inherited_key.field1, RELATION_SIZE_PREFIX);
let tline = tenant
.create_test_timeline_with_layers(
@@ -7859,7 +7769,18 @@ mod tests {
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // delta layers
vec![(Lsn(0x20), vec![(base_key, test_img("metadata key 1"))])], // image layers
vec![(
Lsn(0x20),
vec![
(base_inherited_key, test_img("metadata inherited key 1")),
(
base_inherited_key_overwrite,
test_img("metadata key overwrite 1a"),
),
(base_key, test_img("metadata key 1")),
(base_key_overwrite, test_img("metadata key overwrite 1b")),
],
)], // image layers
Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN
)
.await?;
@@ -7873,7 +7794,18 @@ mod tests {
Vec::new(), // delta layers
vec![(
Lsn(0x30),
vec![(base_key_child, test_img("metadata key 2"))],
vec![
(
base_inherited_key_child,
test_img("metadata inherited key 2"),
),
(
base_inherited_key_overwrite,
test_img("metadata key overwrite 2a"),
),
(base_key_child, test_img("metadata key 2")),
(base_key_overwrite, test_img("metadata key overwrite 2b")),
],
)], // image layers
Lsn(0x30),
)
@@ -7895,6 +7827,26 @@ mod tests {
get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx).await?,
None
);
assert_eq!(
get_vectored_impl_wrapper(&tline, base_key_overwrite, lsn, &ctx).await?,
Some(test_img("metadata key overwrite 1b"))
);
assert_eq!(
get_vectored_impl_wrapper(&tline, base_inherited_key, lsn, &ctx).await?,
Some(test_img("metadata inherited key 1"))
);
assert_eq!(
get_vectored_impl_wrapper(&tline, base_inherited_key_child, lsn, &ctx).await?,
None
);
assert_eq!(
get_vectored_impl_wrapper(&tline, base_inherited_key_nonexist, lsn, &ctx).await?,
None
);
assert_eq!(
get_vectored_impl_wrapper(&tline, base_inherited_key_overwrite, lsn, &ctx).await?,
Some(test_img("metadata key overwrite 1a"))
);
// test vectored get on child timeline
assert_eq!(
@@ -7909,6 +7861,82 @@ mod tests {
get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx).await?,
None
);
assert_eq!(
get_vectored_impl_wrapper(&child, base_inherited_key, lsn, &ctx).await?,
Some(test_img("metadata inherited key 1"))
);
assert_eq!(
get_vectored_impl_wrapper(&child, base_inherited_key_child, lsn, &ctx).await?,
Some(test_img("metadata inherited key 2"))
);
assert_eq!(
get_vectored_impl_wrapper(&child, base_inherited_key_nonexist, lsn, &ctx).await?,
None
);
assert_eq!(
get_vectored_impl_wrapper(&child, base_key_overwrite, lsn, &ctx).await?,
Some(test_img("metadata key overwrite 2b"))
);
assert_eq!(
get_vectored_impl_wrapper(&child, base_inherited_key_overwrite, lsn, &ctx).await?,
Some(test_img("metadata key overwrite 2a"))
);
// test vectored scan on parent timeline
let mut reconstruct_state = ValuesReconstructState::new();
let res = tline
.get_vectored_impl(
KeySpace::single(Key::metadata_key_range()),
lsn,
&mut reconstruct_state,
&ctx,
)
.await?;
assert_eq!(
res.into_iter()
.map(|(k, v)| (k, v.unwrap()))
.collect::<Vec<_>>(),
vec![
(base_inherited_key, test_img("metadata inherited key 1")),
(
base_inherited_key_overwrite,
test_img("metadata key overwrite 1a")
),
(base_key, test_img("metadata key 1")),
(base_key_overwrite, test_img("metadata key overwrite 1b")),
]
);
// test vectored scan on child timeline
let mut reconstruct_state = ValuesReconstructState::new();
let res = child
.get_vectored_impl(
KeySpace::single(Key::metadata_key_range()),
lsn,
&mut reconstruct_state,
&ctx,
)
.await?;
assert_eq!(
res.into_iter()
.map(|(k, v)| (k, v.unwrap()))
.collect::<Vec<_>>(),
vec![
(base_inherited_key, test_img("metadata inherited key 1")),
(
base_inherited_key_child,
test_img("metadata inherited key 2")
),
(
base_inherited_key_overwrite,
test_img("metadata key overwrite 2a")
),
(base_key_child, test_img("metadata key 2")),
(base_key_overwrite, test_img("metadata key overwrite 2b")),
]
);
Ok(())
}

View File

@@ -11,7 +11,7 @@
pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf;
use pageserver_api::models::CompactionAlgorithmSettings;
use pageserver_api::models::EvictionPolicy;
use pageserver_api::models::{self, TenantConfigPatch, ThrottleConfig};
use pageserver_api::models::{self, TenantConfigPatch};
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
use serde::de::IntoDeserializer;
use serde::{Deserialize, Serialize};
@@ -597,7 +597,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
.map(humantime),
heatmap_period: value.heatmap_period.map(humantime),
lazy_slru_download: value.lazy_slru_download,
timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
timeline_get_throttle: value.timeline_get_throttle,
image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
lsn_lease_length: value.lsn_lease_length.map(humantime),
lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime),

View File

@@ -84,17 +84,17 @@ impl Value {
fn to_u64(self) -> u64 {
let b = &self.0;
(b[0] as u64) << 32
| (b[1] as u64) << 24
| (b[2] as u64) << 16
| (b[3] as u64) << 8
((b[0] as u64) << 32)
| ((b[1] as u64) << 24)
| ((b[2] as u64) << 16)
| ((b[3] as u64) << 8)
| b[4] as u64
}
fn to_blknum(self) -> u32 {
let b = &self.0;
assert!(b[0] == 0x80);
(b[1] as u32) << 24 | (b[2] as u32) << 16 | (b[3] as u32) << 8 | b[4] as u32
((b[1] as u32) << 24) | ((b[2] as u32) << 16) | ((b[3] as u32) << 8) | b[4] as u32
}
}

View File

@@ -320,7 +320,6 @@ impl TimelineMetadata {
// Checksums make it awkward to build a valid instance by hand. This helper
// provides a TimelineMetadata with a valid checksum in its header.
#[cfg(test)]
pub fn example() -> Self {
let instance = Self::new(
"0/16960E8".parse::<Lsn>().unwrap(),

View File

@@ -63,22 +63,18 @@
//! The contract between the client and its user is that the user is responsible for
//! scheduling operations in an order that keeps the remote consistent as
//! described above.
//!
//! From the user's perspective, the operations are executed sequentially.
//! Internally, the client knows which operations can be performed in parallel,
//! and which operations act like a "barrier" that require preceding operations
//! to finish. The calling code just needs to call the schedule-functions in the
//! correct order, and the client will parallelize the operations in a way that
//! is safe.
//!
//! The caller should be careful with deletion, though. They should not delete
//! local files that have been scheduled for upload but not yet finished uploading.
//! Otherwise the upload will fail. To wait for an upload to finish, use
//! the 'wait_completion' function (more on that later.)
//! is safe. For more details, see `UploadOp::can_bypass`.
//!
//! All of this relies on the following invariants:
//!
//! - We rely on read-after write consistency in the remote storage.
//! - Layer files are immutable
//! - Layer files are immutable.
//!
//! NB: Pageserver assumes that it has exclusive write access to the tenant in remote
//! storage. Different tenants can be attached to different pageservers, but if the
@@ -429,8 +425,16 @@ impl RemoteTimelineClient {
/// an index file upload, i.e., it's not empty.
/// The given `index_part` must be the one on the remote.
pub fn init_upload_queue(&self, index_part: &IndexPart) -> anyhow::Result<()> {
// Set the maximum number of inprogress tasks to the remote storage concurrency. There's
// certainly no point in starting more upload tasks than this.
let inprogress_limit = self
.conf
.remote_storage_config
.as_ref()
.and_then(|r| r.concurrency_limit())
.unwrap_or(0);
let mut upload_queue = self.upload_queue.lock().unwrap();
upload_queue.initialize_with_current_remote_index_part(index_part)?;
upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?;
self.update_remote_physical_size_gauge(Some(index_part));
info!(
"initialized upload queue from remote index with {} layer files",
@@ -445,8 +449,16 @@ impl RemoteTimelineClient {
&self,
local_metadata: &TimelineMetadata,
) -> anyhow::Result<()> {
// Set the maximum number of inprogress tasks to the remote storage concurrency. There's
// certainly no point in starting more upload tasks than this.
let inprogress_limit = self
.conf
.remote_storage_config
.as_ref()
.and_then(|r| r.concurrency_limit())
.unwrap_or(0);
let mut upload_queue = self.upload_queue.lock().unwrap();
upload_queue.initialize_empty_remote(local_metadata)?;
upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?;
self.update_remote_physical_size_gauge(None);
info!("initialized upload queue as empty");
Ok(())
@@ -462,9 +474,15 @@ impl RemoteTimelineClient {
let deleted_at = index_part.deleted_at.ok_or(anyhow::anyhow!(
"bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted"
))?;
let inprogress_limit = self
.conf
.remote_storage_config
.as_ref()
.and_then(|r| r.concurrency_limit())
.unwrap_or(0);
let mut upload_queue = self.upload_queue.lock().unwrap();
upload_queue.initialize_with_current_remote_index_part(index_part)?;
upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?;
self.update_remote_physical_size_gauge(Some(index_part));
self.stop_impl(&mut upload_queue);
@@ -1855,57 +1873,17 @@ impl RemoteTimelineClient {
Ok(())
}
///
/// Pick next tasks from the queue, and start as many of them as possible without violating
/// the ordering constraints.
///
/// The caller needs to already hold the `upload_queue` lock.
/// TODO: consider limiting the number of in-progress tasks, beyond what remote_storage does.
/// This can launch an unbounded number of queued tasks. `UploadQueue::next_ready()` also has
/// worst-case quadratic cost in the number of tasks, and may struggle beyond 10,000 tasks.
fn launch_queued_tasks(self: &Arc<Self>, upload_queue: &mut UploadQueueInitialized) {
while let Some(next_op) = upload_queue.queued_operations.front() {
// Can we run this task now?
let can_run_now = match next_op {
UploadOp::UploadLayer(..) => {
// Can always be scheduled.
true
}
UploadOp::UploadMetadata { .. } => {
// These can only be performed after all the preceding operations
// have finished.
upload_queue.inprogress_tasks.is_empty()
}
UploadOp::Delete(..) => {
// Wait for preceding uploads to finish. Concurrent deletions are OK, though.
upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len()
}
while let Some((mut next_op, coalesced_ops)) = upload_queue.next_ready() {
debug!("starting op: {next_op}");
UploadOp::Barrier(_) | UploadOp::Shutdown => {
upload_queue.inprogress_tasks.is_empty()
}
};
// If we cannot launch this task, don't look any further.
//
// In some cases, we could let some non-frontmost tasks to "jump the queue" and launch
// them now, but we don't try to do that currently. For example, if the frontmost task
// is an index-file upload that cannot proceed until preceding uploads have finished, we
// could still start layer uploads that were scheduled later.
if !can_run_now {
break;
}
if let UploadOp::Shutdown = next_op {
// leave the op in the queue but do not start more tasks; it will be dropped when
// the stop is called.
upload_queue.shutdown_ready.close();
break;
}
// We can launch this task. Remove it from the queue first.
let mut next_op = upload_queue.queued_operations.pop_front().unwrap();
debug!("starting op: {}", next_op);
// Update the counters and prepare
// Prepare upload.
match &mut next_op {
UploadOp::UploadLayer(layer, meta, mode) => {
if upload_queue
@@ -1916,18 +1894,14 @@ impl RemoteTimelineClient {
} else {
*mode = Some(OpType::MayReorder)
}
upload_queue.num_inprogress_layer_uploads += 1;
}
UploadOp::UploadMetadata { .. } => {
upload_queue.num_inprogress_metadata_uploads += 1;
}
UploadOp::UploadMetadata { .. } => {}
UploadOp::Delete(Delete { layers }) => {
for (name, meta) in layers {
upload_queue
.recently_deleted
.insert((name.clone(), meta.generation));
}
upload_queue.num_inprogress_deletions += 1;
}
UploadOp::Barrier(sender) => {
sender.send_replace(());
@@ -1944,6 +1918,7 @@ impl RemoteTimelineClient {
let task = Arc::new(UploadTask {
task_id: upload_task_id,
op: next_op,
coalesced_ops,
retries: AtomicU32::new(0),
});
upload_queue
@@ -2027,6 +2002,8 @@ impl RemoteTimelineClient {
let upload_result: anyhow::Result<()> = match &task.op {
UploadOp::UploadLayer(ref layer, ref layer_metadata, mode) => {
// TODO: check if this mechanism can be removed now that can_bypass() performs
// conflict checks during scheduling.
if let Some(OpType::FlushDeletion) = mode {
if self.config.read().unwrap().block_deletions {
// Of course, this is not efficient... but usually the queue should be empty.
@@ -2249,13 +2226,8 @@ impl RemoteTimelineClient {
upload_queue.inprogress_tasks.remove(&task.task_id);
let lsn_update = match task.op {
UploadOp::UploadLayer(_, _, _) => {
upload_queue.num_inprogress_layer_uploads -= 1;
None
}
UploadOp::UploadLayer(_, _, _) => None,
UploadOp::UploadMetadata { ref uploaded } => {
upload_queue.num_inprogress_metadata_uploads -= 1;
// the task id is reused as a monotonicity check for storing the "clean"
// IndexPart.
let last_updater = upload_queue.clean.1;
@@ -2289,10 +2261,7 @@ impl RemoteTimelineClient {
None
}
}
UploadOp::Delete(_) => {
upload_queue.num_inprogress_deletions -= 1;
None
}
UploadOp::Delete(_) => None,
UploadOp::Barrier(..) | UploadOp::Shutdown => unreachable!(),
};
@@ -2317,6 +2286,9 @@ impl RemoteTimelineClient {
}
self.metric_end(&task.op);
for coalesced_op in &task.coalesced_ops {
self.metric_end(coalesced_op);
}
}
fn metric_impl(
@@ -2409,6 +2381,7 @@ impl RemoteTimelineClient {
// but for this use case it doesn't really make sense to bring in unsafe code only for this usage point.
// Deletion is not really perf-sensitive, so there shouldn't be any problems with cloning a fraction of it.
let upload_queue_for_deletion = UploadQueueInitialized {
inprogress_limit: initialized.inprogress_limit,
task_counter: 0,
dirty: initialized.dirty.clone(),
clean: initialized.clean.clone(),
@@ -2416,9 +2389,6 @@ impl RemoteTimelineClient {
visible_remote_consistent_lsn: initialized
.visible_remote_consistent_lsn
.clone(),
num_inprogress_layer_uploads: 0,
num_inprogress_metadata_uploads: 0,
num_inprogress_deletions: 0,
inprogress_tasks: HashMap::default(),
queued_operations: VecDeque::default(),
#[cfg(feature = "testing")]
@@ -2445,14 +2415,6 @@ impl RemoteTimelineClient {
}
};
// consistency check
assert_eq!(
qi.num_inprogress_layer_uploads
+ qi.num_inprogress_metadata_uploads
+ qi.num_inprogress_deletions,
qi.inprogress_tasks.len()
);
// We don't need to do anything here for in-progress tasks. They will finish
// on their own, decrement the unfinished-task counter themselves, and observe
// that the queue is Stopped.
@@ -2899,8 +2861,8 @@ mod tests {
let mut guard = client.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut().unwrap();
assert!(upload_queue.queued_operations.is_empty());
assert!(upload_queue.inprogress_tasks.len() == 2);
assert!(upload_queue.num_inprogress_layer_uploads == 2);
assert_eq!(upload_queue.inprogress_tasks.len(), 2);
assert_eq!(upload_queue.num_inprogress_layer_uploads(), 2);
// also check that `latest_file_changes` was updated
assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 2);
@@ -2970,8 +2932,8 @@ mod tests {
// Deletion schedules upload of the index file, and the file deletion itself
assert_eq!(upload_queue.queued_operations.len(), 2);
assert_eq!(upload_queue.inprogress_tasks.len(), 1);
assert_eq!(upload_queue.num_inprogress_layer_uploads, 1);
assert_eq!(upload_queue.num_inprogress_deletions, 0);
assert_eq!(upload_queue.num_inprogress_layer_uploads(), 1);
assert_eq!(upload_queue.num_inprogress_deletions(), 0);
assert_eq!(
upload_queue.latest_files_changes_since_metadata_upload_scheduled,
0

View File

@@ -104,7 +104,7 @@ impl IndexPart {
pub const FILE_NAME: &'static str = "index_part.json";
pub(crate) fn empty(metadata: TimelineMetadata) -> Self {
pub fn empty(metadata: TimelineMetadata) -> Self {
IndexPart {
version: Self::LATEST_VERSION,
layer_metadata: Default::default(),

View File

@@ -12,7 +12,7 @@ pub mod merge_iterator;
use crate::context::{AccessStatsBehavior, RequestContext};
use bytes::Bytes;
use pageserver_api::key::{Key, NON_INHERITED_SPARSE_RANGE};
use pageserver_api::key::Key;
use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
use pageserver_api::record::NeonWalRecord;
use pageserver_api::value::Value;
@@ -209,7 +209,7 @@ impl ValuesReconstructState {
.keys
.entry(*key)
.or_insert(Ok(VectoredValueReconstructState::default()));
let is_sparse_key = NON_INHERITED_SPARSE_RANGE.contains(key);
let is_sparse_key = key.is_sparse();
if let Ok(state) = state {
let key_done = match state.situation {
ValueReconstructSituation::Complete => {

View File

@@ -112,8 +112,8 @@ const MAX_SUPPORTED_BLOB_LEN_BITS: usize = {
///
/// Layout:
/// - 1 bit: `will_init`
/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`]: `len`
/// - [`MAX_SUPPORTED_POS_BITS`]: `pos`
/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`][]: `len`
/// - [`MAX_SUPPORTED_POS_BITS`](IndexEntry::MAX_SUPPORTED_POS_BITS): `pos`
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct IndexEntry(u64);

View File

@@ -1812,7 +1812,7 @@ enum LayerKind {
/// Guard for forcing a layer be resident while it exists.
#[derive(Clone)]
pub(crate) struct ResidentLayer {
pub struct ResidentLayer {
owner: Layer,
downloaded: Arc<DownloadedLayer>,
}

View File

@@ -27,7 +27,7 @@ use pageserver_api::{
config::tenant_conf_defaults::DEFAULT_COMPACTION_THRESHOLD,
key::{
KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
NON_INHERITED_SPARSE_RANGE,
SPARSE_RANGE,
},
keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
models::{
@@ -3221,7 +3221,7 @@ impl Timeline {
// We don't return a blanket [`GetVectoredError::MissingKey`] to avoid
// stalling compaction.
keyspace.remove_overlapping_with(&KeySpace {
ranges: vec![NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE],
ranges: vec![NON_INHERITED_RANGE, Key::sparse_non_inherited_keyspace()],
});
// Keyspace is fully retrieved
@@ -3242,7 +3242,11 @@ impl Timeline {
// keys from `keyspace`, we expect there to be no overlap between it and the image covered key
// space. If that's not the case, we had at least one key encounter a gap in the image layer
// and stop the search as a result of that.
let removed = keyspace.remove_overlapping_with(&image_covered_keyspace);
let mut removed = keyspace.remove_overlapping_with(&image_covered_keyspace);
// Do not fire a missing-key error for sparse keys.
removed.remove_overlapping_with(&KeySpace {
ranges: vec![SPARSE_RANGE],
});
if !removed.is_empty() {
break Some(removed);
}
@@ -3257,6 +3261,21 @@ impl Timeline {
timeline = &*timeline_owned;
};
// Remove sparse keys from the keyspace so that it doesn't fire errors.
let missing_keyspace = if let Some(missing_keyspace) = missing_keyspace {
let mut missing_keyspace = missing_keyspace;
missing_keyspace.remove_overlapping_with(&KeySpace {
ranges: vec![SPARSE_RANGE],
});
if missing_keyspace.is_empty() {
None
} else {
Some(missing_keyspace)
}
} else {
None
};
if let Some(missing_keyspace) = missing_keyspace {
return Err(GetVectoredError::MissingKey(MissingKeyError {
key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */
@@ -3762,36 +3781,35 @@ impl Timeline {
return Err(FlushLayerError::Cancelled);
}
let mut layers_to_upload = Vec::new();
layers_to_upload.extend(
self.create_image_layers(
&rel_partition,
self.initdb_lsn,
ImageLayerCreationMode::Initial,
ctx,
)
.await?,
);
// Ensure that we make a single call to `create_image_layers` with a combined dense keyspace,
// so that the key ranges don't overlap.
let mut partitions = KeyPartitioning::default();
partitions.parts.extend(rel_partition.parts);
if !metadata_partition.parts.is_empty() {
assert_eq!(
metadata_partition.parts.len(),
1,
"currently sparse keyspace should only contain a single metadata keyspace"
);
layers_to_upload.extend(
self.create_image_layers(
// Safety: create_image_layers treats sparse keyspaces differently, in that it does not scan
// every single key within the keyspace; therefore, it is safe to force-convert it
// into a dense keyspace before calling this function.
&metadata_partition.into_dense(),
self.initdb_lsn,
ImageLayerCreationMode::Initial,
ctx,
)
.await?,
);
// Safety: create_image_layers treats sparse keyspaces differently, in that it does not scan
// every single key within the keyspace; therefore, it is safe to force-convert it
// into a dense keyspace before calling this function.
partitions
.parts
.extend(metadata_partition.into_dense().parts);
}
let mut layers_to_upload = Vec::new();
layers_to_upload.extend(
self.create_image_layers(
&partitions,
self.initdb_lsn,
ImageLayerCreationMode::Initial,
ctx,
)
.await?,
);
(layers_to_upload, None)
} else {
// Normal case, write out a L0 delta layer file.

View File

@@ -4,7 +4,7 @@
//!
//! The old legacy algorithm is implemented directly in `timeline.rs`.
use std::collections::{BinaryHeap, HashMap, HashSet};
use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque};
use std::ops::{Deref, Range};
use std::sync::Arc;
@@ -16,10 +16,12 @@ use super::{
use anyhow::{anyhow, bail, Context};
use bytes::Bytes;
use enumset::EnumSet;
use fail::fail_point;
use itertools::Itertools;
use pageserver_api::key::KEY_SIZE;
use pageserver_api::keyspace::ShardedRange;
use pageserver_api::models::CompactInfoResponse;
use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
use serde::Serialize;
use tokio_util::sync::CancellationToken;
@@ -30,6 +32,7 @@ use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}
use crate::page_cache;
use crate::statvfs::Statvfs;
use crate::tenant::checks::check_valid_layermap;
use crate::tenant::gc_block::GcBlock;
use crate::tenant::remote_timeline_client::WaitCompletionError;
use crate::tenant::storage_layer::batch_split_writer::{
BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter,
@@ -63,16 +66,284 @@ use super::CompactionError;
/// Maximum number of deltas before generating an image layer in bottom-most compaction.
const COMPACTION_DELTA_THRESHOLD: usize = 5;
/// A scheduled compaction task.
pub(crate) struct ScheduledCompactionTask {
/// It's unfortunate that we need to store a compact options struct here because the only outer
/// API we can call here is `compact_with_options` which does a few setup calls before starting the
/// actual compaction job... We should refactor this to store `GcCompactionJob` in the future.
pub options: CompactOptions,
/// The channel to send the compaction result. If this is a subcompaction, the last compaction job holds the sender.
pub result_tx: Option<tokio::sync::oneshot::Sender<()>>,
/// Hold the GC block. If this is a subcompaction, the last compaction job holds the gc block guard.
pub gc_block: Option<gc_block::Guard>,
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
pub struct GcCompactionJobId(pub usize);
impl std::fmt::Display for GcCompactionJobId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0)
}
}
#[derive(Debug, Clone)]
pub enum GcCompactionQueueItem {
Manual(CompactOptions),
SubCompactionJob(CompactOptions),
#[allow(dead_code)]
UpdateL2Lsn(Lsn),
Notify(GcCompactionJobId),
}
impl GcCompactionQueueItem {
pub fn into_compact_info_resp(
self,
id: GcCompactionJobId,
running: bool,
) -> Option<CompactInfoResponse> {
match self {
GcCompactionQueueItem::Manual(options) => Some(CompactInfoResponse {
compact_key_range: options.compact_key_range,
compact_lsn_range: options.compact_lsn_range,
sub_compaction: options.sub_compaction,
running,
job_id: id.0,
}),
GcCompactionQueueItem::SubCompactionJob(options) => Some(CompactInfoResponse {
compact_key_range: options.compact_key_range,
compact_lsn_range: options.compact_lsn_range,
sub_compaction: options.sub_compaction,
running,
job_id: id.0,
}),
GcCompactionQueueItem::UpdateL2Lsn(_) => None,
GcCompactionQueueItem::Notify(_) => None,
}
}
}
struct GcCompactionQueueInner {
running: Option<(GcCompactionJobId, GcCompactionQueueItem)>,
queued: VecDeque<(GcCompactionJobId, GcCompactionQueueItem)>,
notify: HashMap<GcCompactionJobId, tokio::sync::oneshot::Sender<()>>,
gc_guards: HashMap<GcCompactionJobId, gc_block::Guard>,
last_id: GcCompactionJobId,
}
impl GcCompactionQueueInner {
fn next_id(&mut self) -> GcCompactionJobId {
let id = self.last_id;
self.last_id = GcCompactionJobId(id.0 + 1);
id
}
}
/// A structure to store gc_compaction jobs.
pub struct GcCompactionQueue {
/// All items in the queue, and the currently-running job.
inner: std::sync::Mutex<GcCompactionQueueInner>,
/// Ensure only one thread is consuming the queue.
consumer_lock: tokio::sync::Mutex<()>,
}
impl GcCompactionQueue {
pub fn new() -> Self {
GcCompactionQueue {
inner: std::sync::Mutex::new(GcCompactionQueueInner {
running: None,
queued: VecDeque::new(),
notify: HashMap::new(),
gc_guards: HashMap::new(),
last_id: GcCompactionJobId(0),
}),
consumer_lock: tokio::sync::Mutex::new(()),
}
}
pub fn cancel_scheduled(&self) {
let mut guard = self.inner.lock().unwrap();
guard.queued.clear();
guard.notify.clear();
guard.gc_guards.clear();
}
/// Schedule a manual compaction job.
pub fn schedule_manual_compaction(
&self,
options: CompactOptions,
notify: Option<tokio::sync::oneshot::Sender<()>>,
) -> GcCompactionJobId {
let mut guard = self.inner.lock().unwrap();
let id = guard.next_id();
guard
.queued
.push_back((id, GcCompactionQueueItem::Manual(options)));
if let Some(notify) = notify {
guard.notify.insert(id, notify);
}
info!("scheduled compaction job id={}", id);
id
}
/// Trigger an auto compaction.
#[allow(dead_code)]
pub fn trigger_auto_compaction(&self, _: &Arc<Timeline>) {}
/// Notify the caller the job has finished and unblock GC.
fn notify_and_unblock(&self, id: GcCompactionJobId) {
info!("compaction job id={} finished", id);
let mut guard = self.inner.lock().unwrap();
if let Some(blocking) = guard.gc_guards.remove(&id) {
drop(blocking)
}
if let Some(tx) = guard.notify.remove(&id) {
let _ = tx.send(());
}
}
async fn handle_sub_compaction(
&self,
id: GcCompactionJobId,
options: CompactOptions,
timeline: &Arc<Timeline>,
gc_block: &GcBlock,
) -> Result<(), CompactionError> {
info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs");
let jobs: Vec<GcCompactJob> = timeline
.gc_compaction_split_jobs(
GcCompactJob::from_compact_options(options.clone()),
options.sub_compaction_max_job_size_mb,
)
.await
.map_err(CompactionError::Other)?;
if jobs.is_empty() {
info!("no jobs to run, skipping scheduled compaction task");
self.notify_and_unblock(id);
} else {
let gc_guard = match gc_block.start().await {
Ok(guard) => guard,
Err(e) => {
return Err(CompactionError::Other(anyhow!(
"cannot run gc-compaction because gc is blocked: {}",
e
)));
}
};
let jobs_len = jobs.len();
let mut pending_tasks = Vec::new();
for job in jobs {
// Unfortunately we need to convert the `GcCompactJob` back to `CompactOptions`
// until we do further refactors that allow calling `compact_with_gc` directly.
let mut flags: EnumSet<CompactFlags> = EnumSet::default();
flags |= CompactFlags::EnhancedGcBottomMostCompaction;
if job.dry_run {
flags |= CompactFlags::DryRun;
}
let options = CompactOptions {
flags,
sub_compaction: false,
compact_key_range: Some(job.compact_key_range.into()),
compact_lsn_range: Some(job.compact_lsn_range.into()),
sub_compaction_max_job_size_mb: None,
};
pending_tasks.push(GcCompactionQueueItem::SubCompactionJob(options));
}
pending_tasks.push(GcCompactionQueueItem::Notify(id));
{
let mut guard = self.inner.lock().unwrap();
guard.gc_guards.insert(id, gc_guard);
let mut tasks = Vec::new();
for task in pending_tasks {
let id = guard.next_id();
tasks.push((id, task));
}
tasks.reverse();
for item in tasks {
guard.queued.push_front(item);
}
}
info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len);
}
Ok(())
}
/// Take a job from the queue and process it. Returns whether there are still pending tasks.
pub async fn iteration(
&self,
cancel: &CancellationToken,
ctx: &RequestContext,
gc_block: &GcBlock,
timeline: &Arc<Timeline>,
) -> Result<bool, CompactionError> {
let _one_op_at_a_time_guard = self.consumer_lock.lock().await;
let has_pending_tasks;
let (id, item) = {
let mut guard = self.inner.lock().unwrap();
let Some((id, item)) = guard.queued.pop_front() else {
return Ok(false);
};
guard.running = Some((id, item.clone()));
has_pending_tasks = !guard.queued.is_empty();
(id, item)
};
match item {
GcCompactionQueueItem::Manual(options) => {
if !options
.flags
.contains(CompactFlags::EnhancedGcBottomMostCompaction)
{
warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", options);
} else if options.sub_compaction {
self.handle_sub_compaction(id, options, timeline, gc_block)
.await?;
} else {
let gc_guard = match gc_block.start().await {
Ok(guard) => guard,
Err(e) => {
return Err(CompactionError::Other(anyhow!(
"cannot run gc-compaction because gc is blocked: {}",
e
)));
}
};
{
let mut guard = self.inner.lock().unwrap();
guard.gc_guards.insert(id, gc_guard);
}
let _ = timeline
.compact_with_options(cancel, options, ctx)
.instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id))
.await?;
self.notify_and_unblock(id);
}
}
GcCompactionQueueItem::SubCompactionJob(options) => {
let _ = timeline
.compact_with_options(cancel, options, ctx)
.instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id))
.await?;
}
GcCompactionQueueItem::Notify(id) => {
self.notify_and_unblock(id);
}
GcCompactionQueueItem::UpdateL2Lsn(_) => {
unreachable!()
}
}
{
let mut guard = self.inner.lock().unwrap();
guard.running = None;
}
Ok(has_pending_tasks)
}
#[allow(clippy::type_complexity)]
pub fn remaining_jobs(
&self,
) -> (
Option<(GcCompactionJobId, GcCompactionQueueItem)>,
VecDeque<(GcCompactionJobId, GcCompactionQueueItem)>,
) {
let guard = self.inner.lock().unwrap();
(guard.running.clone(), guard.queued.clone())
}
#[allow(dead_code)]
pub fn remaining_jobs_num(&self) -> usize {
let guard = self.inner.lock().unwrap();
guard.queued.len() + if guard.running.is_some() { 1 } else { 0 }
}
}
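One non-obvious detail in the queue above is how `handle_sub_compaction` gets the split jobs to run next, ahead of anything scheduled later: it reverses the pending tasks and pushes each one to the front of the queue. A tiny self-contained sketch of that ordering trick (queue items simplified to strings):

```rust
use std::collections::VecDeque;

fn main() {
    // The queue already holds an unrelated job that was scheduled later.
    let mut queued: VecDeque<&str> = ["later-job"].into_iter().collect();

    // A scheduled job was split into sub-jobs plus a final Notify marker.
    let mut pending_tasks = vec!["sub-0", "sub-1", "sub-2", "notify"];

    // Same trick as in `handle_sub_compaction`: reverse, then push each item to
    // the front, so the sub-jobs end up at the head of the queue in their
    // original order, ahead of the previously queued job.
    pending_tasks.reverse();
    for task in pending_tasks {
        queued.push_front(task);
    }

    assert_eq!(
        queued.into_iter().collect::<Vec<_>>(),
        ["sub-0", "sub-1", "sub-2", "notify", "later-job"]
    );
}
```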
/// A job description for the gc-compaction job. This structure describes the rectangle range that the job will

View File

@@ -403,7 +403,7 @@ pub(super) async fn handle_walreceiver_connection(
// need to advance last record LSN on all shards. If we've not ingested the latest
// record, then set the LSN of the modification past it. This way all shards
// advance their last record LSN at the same time.
let needs_last_record_lsn_advance = match next_record_lsn.map(Lsn::from) {
let needs_last_record_lsn_advance = match next_record_lsn {
Some(lsn) if lsn > modification.get_lsn() => {
modification.set_lsn(lsn).unwrap();
true

File diff suppressed because it is too large

View File

@@ -308,7 +308,7 @@ impl WalIngest {
epoch -= 1;
}
Ok((epoch as u64) << 32 | xid as u64)
Ok(((epoch as u64) << 32) | xid as u64)
}
async fn ingest_clear_vm_bits(

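The only change in this hunk is the added parentheses: in Rust, `<<` already binds more tightly than `|`, so both forms build the same 64-bit value with the epoch in the high 32 bits and the xid in the low 32. A quick self-contained check:

```rust
fn main() {
    let epoch: u32 = 2;
    let xid: u32 = 0x0000_1234;

    let implicit = (epoch as u64) << 32 | xid as u64;
    let explicit = ((epoch as u64) << 32) | xid as u64;

    assert_eq!(implicit, explicit);
    assert_eq!(explicit, 0x0000_0002_0000_1234);
}
```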
View File

@@ -1,26 +0,0 @@
EXTENSION = hnsw
EXTVERSION = 0.1.0
MODULE_big = hnsw
DATA = $(wildcard *--*.sql)
OBJS = hnsw.o hnswalg.o
TESTS = $(wildcard test/sql/*.sql)
REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS))
REGRESS_OPTS = --inputdir=test --load-extension=hnsw
# For auto-vectorization:
# - GCC (needs -ftree-vectorize OR -O3) - https://gcc.gnu.org/projects/tree-ssa/vectorization.html
PG_CFLAGS += -O3
PG_CXXFLAGS += -O3 -std=c++11
PG_LDFLAGS += -lstdc++
all: $(EXTENSION)--$(EXTVERSION).sql
PG_CONFIG ?= pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)
dist:
mkdir -p dist
git archive --format zip --prefix=$(EXTENSION)-$(EXTVERSION)/ --output dist/$(EXTENSION)-$(EXTVERSION).zip master

View File

@@ -1,25 +0,0 @@
# Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors
This ANN extension of Postgres is based
on [ivf-hnsw](https://github.com/dbaranchuk/ivf-hnsw.git) implementation of [HNSW](https://www.pinecone.io/learn/hnsw),
the code for the current state-of-the-art billion-scale nearest neighbor search system presented in the paper:
[Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors](http://openaccess.thecvf.com/content_ECCV_2018/html/Dmitry_Baranchuk_Revisiting_the_Inverted_ECCV_2018_paper.html),
<br>
Dmitry Baranchuk, Artem Babenko, Yury Malkov
# Postgres extension
The HNSW index is held in memory (built on demand) and its maximal size is limited
by the `maxelements` index parameter. Another required parameter is the number of dimensions (if it is not specified in the column type).
The optional parameter `ef` specifies the number of neighbors considered during index construction and search (corresponding to the `efConstruction` and `efSearch` parameters
described in the article).
# Example of usage:
```
create extension hnsw;
create table embeddings(id integer primary key, payload real[]);
create index on embeddings using hnsw(payload) with (maxelements=1000000, dims=100, m=32);
select id from embeddings order by payload <-> array[1.0, 2.0,...] limit 100;
```

View File

@@ -1,29 +0,0 @@
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION hnsw" to load this file. \quit
-- functions
CREATE FUNCTION l2_distance(real[], real[]) RETURNS real
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
-- operators
CREATE OPERATOR <-> (
LEFTARG = real[], RIGHTARG = real[], PROCEDURE = l2_distance,
COMMUTATOR = '<->'
);
-- access method
CREATE FUNCTION hnsw_handler(internal) RETURNS index_am_handler
AS 'MODULE_PATHNAME' LANGUAGE C;
CREATE ACCESS METHOD hnsw TYPE INDEX HANDLER hnsw_handler;
COMMENT ON ACCESS METHOD hnsw IS 'hnsw index access method';
-- opclasses
CREATE OPERATOR CLASS knn_ops
DEFAULT FOR TYPE real[] USING hnsw AS
OPERATOR 1 <-> (real[], real[]) FOR ORDER BY float_ops;

View File

@@ -1,590 +0,0 @@
#include "postgres.h"
#include "access/amapi.h"
#include "access/generic_xlog.h"
#include "access/relation.h"
#include "access/reloptions.h"
#include "access/tableam.h"
#include "catalog/index.h"
#include "commands/vacuum.h"
#include "nodes/execnodes.h"
#include "storage/bufmgr.h"
#include "utils/guc.h"
#include "utils/selfuncs.h"
#include <math.h>
#include <float.h>
#include "hnsw.h"
PG_MODULE_MAGIC;
typedef struct {
int32 vl_len_; /* varlena header (do not touch directly!) */
int dims;
int maxelements;
int efConstruction;
int efSearch;
int M;
} HnswOptions;
static relopt_kind hnsw_relopt_kind;
typedef struct {
HierarchicalNSW* hnsw;
size_t curr;
size_t n_results;
ItemPointer results;
} HnswScanOpaqueData;
typedef HnswScanOpaqueData* HnswScanOpaque;
typedef struct {
Oid relid;
uint32 status;
HierarchicalNSW* hnsw;
} HnswHashEntry;
#define SH_PREFIX hnsw_index
#define SH_ELEMENT_TYPE HnswHashEntry
#define SH_KEY_TYPE Oid
#define SH_KEY relid
#define SH_STORE_HASH
#define SH_GET_HASH(tb, a) ((a)->relid)
#define SH_HASH_KEY(tb, key) (key)
#define SH_EQUAL(tb, a, b) ((a) == (b))
#define SH_SCOPE static inline
#define SH_DEFINE
#define SH_DECLARE
#include "lib/simplehash.h"
#define INDEX_HASH_SIZE 11
#define DEFAULT_EF_SEARCH 64
PGDLLEXPORT void _PG_init(void);
static hnsw_index_hash *hnsw_indexes;
/*
* Initialize index options and variables
*/
void
_PG_init(void)
{
hnsw_relopt_kind = add_reloption_kind();
add_int_reloption(hnsw_relopt_kind, "dims", "Number of dimensions",
0, 0, INT_MAX, AccessExclusiveLock);
add_int_reloption(hnsw_relopt_kind, "maxelements", "Maximal number of elements",
0, 0, INT_MAX, AccessExclusiveLock);
add_int_reloption(hnsw_relopt_kind, "m", "Number of neighbors of each vertex",
100, 0, INT_MAX, AccessExclusiveLock);
add_int_reloption(hnsw_relopt_kind, "efconstruction", "Number of inspected neighbors during index construction",
16, 1, INT_MAX, AccessExclusiveLock);
add_int_reloption(hnsw_relopt_kind, "efsearch", "Number of inspected neighbors during index search",
64, 1, INT_MAX, AccessExclusiveLock);
hnsw_indexes = hnsw_index_create(TopMemoryContext, INDEX_HASH_SIZE, NULL);
}
static void
hnsw_build_callback(Relation index, ItemPointer tid, Datum *values,
bool *isnull, bool tupleIsAlive, void *state)
{
HierarchicalNSW* hnsw = (HierarchicalNSW*) state;
ArrayType* array;
int n_items;
label_t label = 0;
/* Skip nulls */
if (isnull[0])
return;
array = DatumGetArrayTypeP(values[0]);
n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
if (n_items != hnsw_dimensions(hnsw))
{
elog(ERROR, "Wrong number of dimensions: %d instead of %d expected",
n_items, hnsw_dimensions(hnsw));
}
memcpy(&label, tid, sizeof(*tid));
hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label);
}
static void
hnsw_populate(HierarchicalNSW* hnsw, Relation indexRel, Relation heapRel)
{
IndexInfo* indexInfo = BuildIndexInfo(indexRel);
Assert(indexInfo->ii_NumIndexAttrs == 1);
table_index_build_scan(heapRel, indexRel, indexInfo,
true, true, hnsw_build_callback, (void *) hnsw, NULL);
}
#ifdef __APPLE__
#include <sys/types.h>
#include <sys/sysctl.h>
static void
hnsw_check_available_memory(Size requested)
{
size_t total;
if (sysctlbyname("hw.memsize", NULL, &total, NULL, 0) < 0)
elog(ERROR, "Failed to get amount of RAM: %m");
if ((Size)NBuffers*BLCKSZ + requested >= total)
elog(ERROR, "HNSW index requeries %ld bytes while only %ld are available",
requested, total - (Size)NBuffers*BLCKSZ);
}
#else
#include <sys/sysinfo.h>
static void
hnsw_check_available_memory(Size requested)
{
struct sysinfo si;
Size total;
if (sysinfo(&si) < 0)
elog(ERROR, "Failed to get amount of RAM: %m");
total = si.totalram*si.mem_unit;
if ((Size)NBuffers*BLCKSZ + requested >= total)
elog(ERROR, "HNSW index requeries %ld bytes while only %ld are available",
requested, total - (Size)NBuffers*BLCKSZ);
}
#endif
static HierarchicalNSW*
hnsw_get_index(Relation indexRel, Relation heapRel)
{
HierarchicalNSW* hnsw;
Oid indexoid = RelationGetRelid(indexRel);
HnswHashEntry* entry = hnsw_index_lookup(hnsw_indexes, indexoid);
if (entry == NULL)
{
size_t dims, maxelements;
size_t M;
size_t maxM;
size_t size_links_level0;
size_t size_data_per_element;
size_t data_size;
dsm_handle handle = indexoid << 1; /* make it even */
void* impl_private = NULL;
void* mapped_address = NULL;
Size mapped_size = 0;
Size shmem_size;
bool exists = true;
bool found;
HnswOptions *opts = (HnswOptions *) indexRel->rd_options;
if (opts == NULL || opts->maxelements == 0 || opts->dims == 0) {
elog(ERROR, "HNSW index requires 'maxelements' and 'dims' to be specified");
}
dims = opts->dims;
maxelements = opts->maxelements;
M = opts->M;
maxM = M * 2;
data_size = dims * sizeof(coord_t);
size_links_level0 = (maxM + 1) * sizeof(idx_t);
size_data_per_element = size_links_level0 + data_size + sizeof(label_t);
shmem_size = hnsw_sizeof() + maxelements * size_data_per_element;
hnsw_check_available_memory(shmem_size);
/* first try to attach to an existing index */
if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private,
&mapped_address, &mapped_size, DEBUG1))
{
/* index doesn't exist: try to create it */
if (!dsm_impl_op(DSM_OP_CREATE, handle, shmem_size, &impl_private,
&mapped_address, &mapped_size, DEBUG1))
{
/* We can do this under a shared lock, so some other backend may
* try to initialize the index concurrently. If creation failed because the index was
* already created by somebody else, then try to attach to it once again
*/
if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private,
&mapped_address, &mapped_size, ERROR))
{
return NULL;
}
}
else
{
exists = false;
}
}
Assert(mapped_size == shmem_size);
hnsw = (HierarchicalNSW*)mapped_address;
if (!exists)
{
hnsw_init(hnsw, dims, maxelements, M, maxM, opts->efConstruction);
hnsw_populate(hnsw, indexRel, heapRel);
}
entry = hnsw_index_insert(hnsw_indexes, indexoid, &found);
Assert(!found);
entry->hnsw = hnsw;
}
else
{
hnsw = entry->hnsw;
}
return hnsw;
}
/*
* Start or restart an index scan
*/
static IndexScanDesc
hnsw_beginscan(Relation index, int nkeys, int norderbys)
{
IndexScanDesc scan = RelationGetIndexScan(index, nkeys, norderbys);
HnswScanOpaque so = (HnswScanOpaque) palloc(sizeof(HnswScanOpaqueData));
Relation heap = relation_open(index->rd_index->indrelid, NoLock);
so->hnsw = hnsw_get_index(index, heap);
relation_close(heap, NoLock);
so->curr = 0;
so->n_results = 0;
so->results = NULL;
scan->opaque = so;
return scan;
}
/*
* Start or restart an index scan
*/
static void
hnsw_rescan(IndexScanDesc scan, ScanKey keys, int nkeys, ScanKey orderbys, int norderbys)
{
HnswScanOpaque so = (HnswScanOpaque) scan->opaque;
if (so->results)
{
pfree(so->results);
so->results = NULL;
}
so->curr = 0;
if (orderbys && scan->numberOfOrderBys > 0)
memmove(scan->orderByData, orderbys, scan->numberOfOrderBys * sizeof(ScanKeyData));
}
/*
* Fetch the next tuple in the given scan
*/
static bool
hnsw_gettuple(IndexScanDesc scan, ScanDirection dir)
{
HnswScanOpaque so = (HnswScanOpaque) scan->opaque;
/*
* Index can be used to scan backward, but Postgres doesn't support
* backward scan on operators
*/
Assert(ScanDirectionIsForward(dir));
if (so->curr == 0)
{
Datum value;
ArrayType* array;
int n_items;
size_t n_results;
label_t* results;
HnswOptions *opts = (HnswOptions *) scan->indexRelation->rd_options;
size_t efSearch = opts ? opts->efSearch : DEFAULT_EF_SEARCH;
/* Safety check */
if (scan->orderByData == NULL)
elog(ERROR, "cannot scan HNSW index without order");
/* No items will match if null */
if (scan->orderByData->sk_flags & SK_ISNULL)
return false;
value = scan->orderByData->sk_argument;
array = DatumGetArrayTypeP(value);
n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
if (n_items != hnsw_dimensions(so->hnsw))
{
elog(ERROR, "Wrong number of dimensions: %d instead of %d expected",
n_items, hnsw_dimensions(so->hnsw));
}
if (!hnsw_search(so->hnsw, (coord_t*)ARR_DATA_PTR(array), efSearch, &n_results, &results))
elog(ERROR, "HNSW index search failed");
so->results = (ItemPointer)palloc(n_results*sizeof(ItemPointerData));
so->n_results = n_results;
for (size_t i = 0; i < n_results; i++)
{
memcpy(&so->results[i], &results[i], sizeof(so->results[i]));
}
free(results);
}
if (so->curr >= so->n_results)
{
return false;
}
else
{
scan->xs_heaptid = so->results[so->curr++];
scan->xs_recheckorderby = false;
return true;
}
}
/*
* End a scan and release resources
*/
static void
hnsw_endscan(IndexScanDesc scan)
{
HnswScanOpaque so = (HnswScanOpaque) scan->opaque;
if (so->results)
pfree(so->results);
pfree(so);
scan->opaque = NULL;
}
/*
* Estimate the cost of an index scan
*/
static void
hnsw_costestimate(PlannerInfo *root, IndexPath *path, double loop_count,
Cost *indexStartupCost, Cost *indexTotalCost,
Selectivity *indexSelectivity, double *indexCorrelation
,double *indexPages
)
{
GenericCosts costs;
/* Never use index without order */
if (path->indexorderbys == NULL)
{
*indexStartupCost = DBL_MAX;
*indexTotalCost = DBL_MAX;
*indexSelectivity = 0;
*indexCorrelation = 0;
*indexPages = 0;
return;
}
MemSet(&costs, 0, sizeof(costs));
genericcostestimate(root, path, loop_count, &costs);
/* Startup cost and total cost are same */
*indexStartupCost = costs.indexTotalCost;
*indexTotalCost = costs.indexTotalCost;
*indexSelectivity = costs.indexSelectivity;
*indexCorrelation = costs.indexCorrelation;
*indexPages = costs.numIndexPages;
}
/*
* Parse and validate the reloptions
*/
static bytea *
hnsw_options(Datum reloptions, bool validate)
{
static const relopt_parse_elt tab[] = {
{"dims", RELOPT_TYPE_INT, offsetof(HnswOptions, dims)},
{"maxelements", RELOPT_TYPE_INT, offsetof(HnswOptions, maxelements)},
{"efconstruction", RELOPT_TYPE_INT, offsetof(HnswOptions, efConstruction)},
{"efsearch", RELOPT_TYPE_INT, offsetof(HnswOptions, efSearch)},
{"m", RELOPT_TYPE_INT, offsetof(HnswOptions, M)}
};
return (bytea *) build_reloptions(reloptions, validate,
hnsw_relopt_kind,
sizeof(HnswOptions),
tab, lengthof(tab));
}
/*
* Validate catalog entries for the specified operator class
*/
static bool
hnsw_validate(Oid opclassoid)
{
return true;
}
/*
* Build the index for a logged table
*/
static IndexBuildResult *
hnsw_build(Relation heap, Relation index, IndexInfo *indexInfo)
{
HierarchicalNSW* hnsw = hnsw_get_index(index, heap);
IndexBuildResult* result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
result->heap_tuples = result->index_tuples = hnsw_count(hnsw);
return result;
}
/*
* Insert a tuple into the index
*/
static bool
hnsw_insert(Relation index, Datum *values, bool *isnull, ItemPointer heap_tid,
Relation heap, IndexUniqueCheck checkUnique,
bool indexUnchanged,
IndexInfo *indexInfo)
{
HierarchicalNSW* hnsw = hnsw_get_index(index, heap);
Datum value;
ArrayType* array;
int n_items;
label_t label = 0;
/* Skip nulls */
if (isnull[0])
return false;
/* Detoast value */
value = PointerGetDatum(PG_DETOAST_DATUM(values[0]));
array = DatumGetArrayTypeP(value);
n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
if (n_items != hnsw_dimensions(hnsw))
{
elog(ERROR, "Wrong number of dimensions: %d instead of %d expected",
n_items, hnsw_dimensions(hnsw));
}
memcpy(&label, heap_tid, sizeof(*heap_tid));
if (!hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label))
elog(ERROR, "HNSW index insert failed");
return true;
}
/*
* Build the index for an unlogged table
*/
static void
hnsw_buildempty(Relation index)
{
/* index will be constructed on demand when accessed */
}
/*
* Clean up after a VACUUM operation
*/
static IndexBulkDeleteResult *
hnsw_vacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
Relation rel = info->index;
if (stats == NULL)
return NULL;
stats->num_pages = RelationGetNumberOfBlocks(rel);
return stats;
}
/*
* Bulk delete tuples from the index
*/
static IndexBulkDeleteResult *
hnsw_bulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
IndexBulkDeleteCallback callback, void *callback_state)
{
if (stats == NULL)
stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
return stats;
}
/*
* Define index handler
*
* See https://www.postgresql.org/docs/current/index-api.html
*/
PGDLLEXPORT PG_FUNCTION_INFO_V1(hnsw_handler);
Datum
hnsw_handler(PG_FUNCTION_ARGS)
{
IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
amroutine->amstrategies = 0;
amroutine->amsupport = 0;
amroutine->amoptsprocnum = 0;
amroutine->amcanorder = false;
amroutine->amcanorderbyop = true;
amroutine->amcanbackward = false; /* can change direction mid-scan */
amroutine->amcanunique = false;
amroutine->amcanmulticol = false;
amroutine->amoptionalkey = true;
amroutine->amsearcharray = false;
amroutine->amsearchnulls = false;
amroutine->amstorage = false;
amroutine->amclusterable = false;
amroutine->ampredlocks = false;
amroutine->amcanparallel = false;
amroutine->amcaninclude = false;
amroutine->amusemaintenanceworkmem = false; /* not used during VACUUM */
amroutine->amparallelvacuumoptions = VACUUM_OPTION_PARALLEL_BULKDEL;
amroutine->amkeytype = InvalidOid;
/* Interface functions */
amroutine->ambuild = hnsw_build;
amroutine->ambuildempty = hnsw_buildempty;
amroutine->aminsert = hnsw_insert;
amroutine->ambulkdelete = hnsw_bulkdelete;
amroutine->amvacuumcleanup = hnsw_vacuumcleanup;
amroutine->amcanreturn = NULL; /* tuple not included in heapsort */
amroutine->amcostestimate = hnsw_costestimate;
amroutine->amoptions = hnsw_options;
amroutine->amproperty = NULL; /* TODO AMPROP_DISTANCE_ORDERABLE */
amroutine->ambuildphasename = NULL;
amroutine->amvalidate = hnsw_validate;
amroutine->amadjustmembers = NULL;
amroutine->ambeginscan = hnsw_beginscan;
amroutine->amrescan = hnsw_rescan;
amroutine->amgettuple = hnsw_gettuple;
amroutine->amgetbitmap = NULL;
amroutine->amendscan = hnsw_endscan;
amroutine->ammarkpos = NULL;
amroutine->amrestrpos = NULL;
/* Interface functions to support parallel index scans */
amroutine->amestimateparallelscan = NULL;
amroutine->aminitparallelscan = NULL;
amroutine->amparallelrescan = NULL;
PG_RETURN_POINTER(amroutine);
}
/*
* Get the L2 distance between vectors
*/
PGDLLEXPORT PG_FUNCTION_INFO_V1(l2_distance);
Datum
l2_distance(PG_FUNCTION_ARGS)
{
ArrayType *a = PG_GETARG_ARRAYTYPE_P(0);
ArrayType *b = PG_GETARG_ARRAYTYPE_P(1);
int a_dim = ArrayGetNItems(ARR_NDIM(a), ARR_DIMS(a));
int b_dim = ArrayGetNItems(ARR_NDIM(b), ARR_DIMS(b));
dist_t distance = 0.0;
dist_t diff;
coord_t *ax = (coord_t*)ARR_DATA_PTR(a);
coord_t *bx = (coord_t*)ARR_DATA_PTR(b);
if (a_dim != b_dim)
{
ereport(ERROR,
(errcode(ERRCODE_DATA_EXCEPTION),
errmsg("different array dimensions %d and %d", a_dim, b_dim)));
}
for (int i = 0; i < a_dim; i++)
{
diff = ax[i] - bx[i];
distance += diff * diff;
}
PG_RETURN_FLOAT4((dist_t)sqrt(distance));
}

View File

@@ -1,4 +0,0 @@
comment = '** Deprecated ** Please use pg_embedding instead'
default_version = '0.1.0'
module_pathname = '$libdir/hnsw'
relocatable = true

View File

@@ -1,15 +0,0 @@
#pragma once
typedef float coord_t;
typedef float dist_t;
typedef uint32_t idx_t;
typedef uint64_t label_t;
typedef struct HierarchicalNSW HierarchicalNSW;
bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results);
bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label);
void hnsw_init(HierarchicalNSW* hnsw, size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction);
int hnsw_dimensions(HierarchicalNSW* hnsw);
size_t hnsw_count(HierarchicalNSW* hnsw);
size_t hnsw_sizeof(void);

View File

@@ -1,379 +0,0 @@
#include "hnswalg.h"
#if defined(__GNUC__)
#define PORTABLE_ALIGN32 __attribute__((aligned(32)))
#define PREFETCH(addr,hint) __builtin_prefetch(addr, 0, hint)
#else
#define PORTABLE_ALIGN32 __declspec(align(32))
#define PREFETCH(addr,hint)
#endif
HierarchicalNSW::HierarchicalNSW(size_t dim_, size_t maxelements_, size_t M_, size_t maxM_, size_t efConstruction_)
{
dim = dim_;
data_size = dim * sizeof(coord_t);
efConstruction = efConstruction_;
maxelements = maxelements_;
M = M_;
maxM = maxM_;
size_links_level0 = (maxM + 1) * sizeof(idx_t);
size_data_per_element = size_links_level0 + data_size + sizeof(label_t);
offset_data = size_links_level0;
offset_label = offset_data + data_size;
enterpoint_node = 0;
cur_element_count = 0;
#ifdef __x86_64__
use_avx2 = __builtin_cpu_supports("avx2");
#endif
}
std::priority_queue<std::pair<dist_t, idx_t>> HierarchicalNSW::searchBaseLayer(const coord_t *point, size_t ef)
{
std::vector<uint32_t> visited;
visited.resize((cur_element_count + 31) >> 5);
std::priority_queue<std::pair<dist_t, idx_t >> topResults;
std::priority_queue<std::pair<dist_t, idx_t >> candidateSet;
dist_t dist = fstdistfunc(point, getDataByInternalId(enterpoint_node));
topResults.emplace(dist, enterpoint_node);
candidateSet.emplace(-dist, enterpoint_node);
visited[enterpoint_node >> 5] = 1 << (enterpoint_node & 31);
dist_t lowerBound = dist;
while (!candidateSet.empty())
{
std::pair<dist_t, idx_t> curr_el_pair = candidateSet.top();
if (-curr_el_pair.first > lowerBound)
break;
candidateSet.pop();
idx_t curNodeNum = curr_el_pair.second;
idx_t* data = get_linklist0(curNodeNum);
size_t size = *data++;
PREFETCH(getDataByInternalId(*data), 0);
for (size_t j = 0; j < size; ++j) {
size_t tnum = *(data + j);
PREFETCH(getDataByInternalId(*(data + j + 1)), 0);
if (!(visited[tnum >> 5] & (1 << (tnum & 31)))) {
visited[tnum >> 5] |= 1 << (tnum & 31);
dist = fstdistfunc(point, getDataByInternalId(tnum));
if (topResults.top().first > dist || topResults.size() < ef) {
candidateSet.emplace(-dist, tnum);
PREFETCH(get_linklist0(candidateSet.top().second), 0);
topResults.emplace(dist, tnum);
if (topResults.size() > ef)
topResults.pop();
lowerBound = topResults.top().first;
}
}
}
}
return topResults;
}
void HierarchicalNSW::getNeighborsByHeuristic(std::priority_queue<std::pair<dist_t, idx_t>> &topResults, size_t NN)
{
if (topResults.size() < NN)
return;
std::priority_queue<std::pair<dist_t, idx_t>> resultSet;
std::vector<std::pair<dist_t, idx_t>> returnlist;
while (topResults.size() > 0) {
resultSet.emplace(-topResults.top().first, topResults.top().second);
topResults.pop();
}
while (resultSet.size()) {
if (returnlist.size() >= NN)
break;
std::pair<dist_t, idx_t> curen = resultSet.top();
dist_t dist_to_query = -curen.first;
resultSet.pop();
bool good = true;
for (std::pair<dist_t, idx_t> curen2 : returnlist) {
dist_t curdist = fstdistfunc(getDataByInternalId(curen2.second),
getDataByInternalId(curen.second));
if (curdist < dist_to_query) {
good = false;
break;
}
}
if (good) returnlist.push_back(curen);
}
for (std::pair<dist_t, idx_t> elem : returnlist)
topResults.emplace(-elem.first, elem.second);
}
void HierarchicalNSW::mutuallyConnectNewElement(const coord_t *point, idx_t cur_c,
std::priority_queue<std::pair<dist_t, idx_t>> topResults)
{
getNeighborsByHeuristic(topResults, M);
std::vector<idx_t> res;
res.reserve(M);
while (topResults.size() > 0) {
res.push_back(topResults.top().second);
topResults.pop();
}
{
idx_t* data = get_linklist0(cur_c);
if (*data)
throw std::runtime_error("Should be blank");
*data++ = res.size();
for (size_t idx = 0; idx < res.size(); idx++) {
if (data[idx])
throw std::runtime_error("Should be blank");
data[idx] = res[idx];
}
}
for (size_t idx = 0; idx < res.size(); idx++) {
if (res[idx] == cur_c)
throw std::runtime_error("Connection to the same element");
size_t resMmax = maxM;
idx_t *ll_other = get_linklist0(res[idx]);
idx_t sz_link_list_other = *ll_other;
if (sz_link_list_other > resMmax || sz_link_list_other < 0)
throw std::runtime_error("Bad sz_link_list_other");
if (sz_link_list_other < resMmax) {
idx_t *data = ll_other + 1;
data[sz_link_list_other] = cur_c;
*ll_other = sz_link_list_other + 1;
} else {
// finding the "weakest" element to replace it with the new one
idx_t *data = ll_other + 1;
dist_t d_max = fstdistfunc(getDataByInternalId(cur_c), getDataByInternalId(res[idx]));
// Heuristic:
std::priority_queue<std::pair<dist_t, idx_t>> candidates;
candidates.emplace(d_max, cur_c);
for (size_t j = 0; j < sz_link_list_other; j++)
candidates.emplace(fstdistfunc(getDataByInternalId(data[j]), getDataByInternalId(res[idx])), data[j]);
getNeighborsByHeuristic(candidates, resMmax);
size_t indx = 0;
while (!candidates.empty()) {
data[indx] = candidates.top().second;
candidates.pop();
indx++;
}
*ll_other = indx;
}
}
}
void HierarchicalNSW::addPoint(const coord_t *point, label_t label)
{
if (cur_element_count >= maxelements) {
throw std::runtime_error("The number of elements exceeds the specified limit");
}
idx_t cur_c = cur_element_count++;
memset((char *) get_linklist0(cur_c), 0, size_data_per_element);
memcpy(getDataByInternalId(cur_c), point, data_size);
memcpy(getExternalLabel(cur_c), &label, sizeof label);
// Do nothing for the first element
if (cur_c != 0) {
std::priority_queue <std::pair<dist_t, idx_t>> topResults = searchBaseLayer(point, efConstruction);
mutuallyConnectNewElement(point, cur_c, topResults);
}
};
std::priority_queue<std::pair<dist_t, label_t>> HierarchicalNSW::searchKnn(const coord_t *query, size_t k)
{
std::priority_queue<std::pair<dist_t, label_t>> topResults;
auto topCandidates = searchBaseLayer(query, k);
while (topCandidates.size() > k) {
topCandidates.pop();
}
while (!topCandidates.empty()) {
std::pair<dist_t, idx_t> rez = topCandidates.top();
label_t label;
memcpy(&label, getExternalLabel(rez.second), sizeof(label));
topResults.push(std::pair<dist_t, label_t>(rez.first, label));
topCandidates.pop();
}
return topResults;
};
dist_t fstdistfunc_scalar(const coord_t *x, const coord_t *y, size_t n)
{
dist_t distance = 0.0;
for (size_t i = 0; i < n; i++)
{
dist_t diff = x[i] - y[i];
distance += diff * diff;
}
return distance;
}
#ifdef __x86_64__
#include <immintrin.h>
__attribute__((target("avx2")))
dist_t fstdistfunc_avx2(const coord_t *x, const coord_t *y, size_t n)
{
const size_t TmpResSz = sizeof(__m256) / sizeof(float);
float PORTABLE_ALIGN32 TmpRes[TmpResSz];
size_t qty16 = n / 16;
const float *pEnd1 = x + (qty16 * 16);
__m256 diff, v1, v2;
__m256 sum = _mm256_set1_ps(0);
while (x < pEnd1) {
v1 = _mm256_loadu_ps(x);
x += 8;
v2 = _mm256_loadu_ps(y);
y += 8;
diff = _mm256_sub_ps(v1, v2);
sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
v1 = _mm256_loadu_ps(x);
x += 8;
v2 = _mm256_loadu_ps(y);
y += 8;
diff = _mm256_sub_ps(v1, v2);
sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
}
_mm256_store_ps(TmpRes, sum);
float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];
return (res);
}
dist_t fstdistfunc_sse(const coord_t *x, const coord_t *y, size_t n)
{
const size_t TmpResSz = sizeof(__m128) / sizeof(float);
float PORTABLE_ALIGN32 TmpRes[TmpResSz];
size_t qty16 = n / 16;
const float *pEnd1 = x + (qty16 * 16);
__m128 diff, v1, v2;
__m128 sum = _mm_set1_ps(0);
while (x < pEnd1) {
v1 = _mm_loadu_ps(x);
x += 4;
v2 = _mm_loadu_ps(y);
y += 4;
diff = _mm_sub_ps(v1, v2);
sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
v1 = _mm_loadu_ps(x);
x += 4;
v2 = _mm_loadu_ps(y);
y += 4;
diff = _mm_sub_ps(v1, v2);
sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
v1 = _mm_loadu_ps(x);
x += 4;
v2 = _mm_loadu_ps(y);
y += 4;
diff = _mm_sub_ps(v1, v2);
sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
v1 = _mm_loadu_ps(x);
x += 4;
v2 = _mm_loadu_ps(y);
y += 4;
diff = _mm_sub_ps(v1, v2);
sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
}
_mm_store_ps(TmpRes, sum);
float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
return res;
}
#endif
dist_t HierarchicalNSW::fstdistfunc(const coord_t *x, const coord_t *y)
{
#ifndef __x86_64__
return fstdistfunc_scalar(x, y, dim);
#else
if(use_avx2)
return fstdistfunc_avx2(x, y, dim);
return fstdistfunc_sse(x, y, dim);
#endif
}
bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results)
{
try
{
auto result = hnsw->searchKnn(point, efSearch);
size_t nResults = result.size();
*results = (label_t*)malloc(nResults*sizeof(label_t));
for (size_t i = nResults; i-- != 0;)
{
(*results)[i] = result.top().second;
result.pop();
}
*n_results = nResults;
return true;
}
catch (std::exception& x)
{
return false;
}
}
bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label)
{
try
{
hnsw->addPoint(point, label);
return true;
}
catch (std::exception& x)
{
fprintf(stderr, "Catch %s\n", x.what());
return false;
}
}
void hnsw_init(HierarchicalNSW* hnsw, size_t dims, size_t maxelements, size_t M, size_t maxM, size_t efConstruction)
{
new ((void*)hnsw) HierarchicalNSW(dims, maxelements, M, maxM, efConstruction);
}
int hnsw_dimensions(HierarchicalNSW* hnsw)
{
return (int)hnsw->dim;
}
size_t hnsw_count(HierarchicalNSW* hnsw)
{
return hnsw->cur_element_count;
}
size_t hnsw_sizeof(void)
{
return sizeof(HierarchicalNSW);
}

View File

@@ -1,69 +0,0 @@
#pragma once
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <unordered_map>
#include <unordered_set>
#include <map>
#include <cmath>
#include <queue>
#include <stdexcept>
extern "C" {
#include "hnsw.h"
}
struct HierarchicalNSW
{
size_t maxelements;
size_t cur_element_count;
idx_t enterpoint_node;
size_t dim;
size_t data_size;
size_t offset_data;
size_t offset_label;
size_t size_data_per_element;
size_t M;
size_t maxM;
size_t size_links_level0;
size_t efConstruction;
#ifdef __x86_64__
bool use_avx2;
#endif
char data_level0_memory[0]; // varying size
public:
HierarchicalNSW(size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction);
~HierarchicalNSW();
inline coord_t *getDataByInternalId(idx_t internal_id) const {
return (coord_t *)&data_level0_memory[internal_id * size_data_per_element + offset_data];
}
inline idx_t *get_linklist0(idx_t internal_id) const {
return (idx_t*)&data_level0_memory[internal_id * size_data_per_element];
}
inline label_t *getExternalLabel(idx_t internal_id) const {
return (label_t *)&data_level0_memory[internal_id * size_data_per_element + offset_label];
}
std::priority_queue<std::pair<dist_t, idx_t>> searchBaseLayer(const coord_t *x, size_t ef);
void getNeighborsByHeuristic(std::priority_queue<std::pair<dist_t, idx_t>> &topResults, size_t NN);
void mutuallyConnectNewElement(const coord_t *x, idx_t id, std::priority_queue<std::pair<dist_t, idx_t>> topResults);
void addPoint(const coord_t *point, label_t label);
std::priority_queue<std::pair<dist_t, label_t>> searchKnn(const coord_t *query_data, size_t k);
dist_t fstdistfunc(const coord_t *x, const coord_t *y);
};

View File

@@ -1,28 +0,0 @@
SET enable_seqscan = off;
CREATE TABLE t (val real[]);
INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL);
CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3);
INSERT INTO t (val) VALUES (array[1,2,4]);
explain SELECT * FROM t ORDER BY val <-> array[3,3,3];
QUERY PLAN
--------------------------------------------------------------------
Index Scan using t_val_idx on t (cost=4.02..8.06 rows=3 width=36)
Order By: (val <-> '{3,3,3}'::real[])
(2 rows)
SELECT * FROM t ORDER BY val <-> array[3,3,3];
val
---------
{1,2,3}
{1,2,4}
{1,1,1}
{0,0,0}
(4 rows)
SELECT COUNT(*) FROM t;
count
-------
5
(1 row)
DROP TABLE t;

View File

@@ -1,13 +0,0 @@
SET enable_seqscan = off;
CREATE TABLE t (val real[]);
INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL);
CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3);
INSERT INTO t (val) VALUES (array[1,2,4]);
explain SELECT * FROM t ORDER BY val <-> array[3,3,3];
SELECT * FROM t ORDER BY val <-> array[3,3,3];
SELECT COUNT(*) FROM t;
DROP TABLE t;

View File

@@ -106,6 +106,7 @@ jose-jwk = { version = "0.1.2", features = ["p256", "p384", "rsa"] }
signature = "2"
ecdsa = "0.16"
p256 = { version = "0.13", features = ["jwk"] }
ed25519-dalek = { version = "2", default-features = false, features = ["rand_core"] }
rsa = "0.9"
workspace_hack.workspace = true

View File

@@ -187,10 +187,6 @@ pub async fn worker(
let rx = futures::stream::poll_fn(move |cx| rx.poll_recv(cx));
let rx = rx.map(RequestData::from);
let storage = GenericRemoteStorage::from_config(&remote_storage_config)
.await
.context("remote storage init")?;
let properties = WriterProperties::builder()
.set_data_page_size_limit(config.parquet_upload_page_size)
.set_compression(config.parquet_upload_compression);
@@ -224,18 +220,18 @@ pub async fn worker(
let rx_disconnect = futures::stream::poll_fn(move |cx| rx_disconnect.poll_recv(cx));
let rx_disconnect = rx_disconnect.map(RequestData::from);
let storage_disconnect =
GenericRemoteStorage::from_config(&disconnect_events_storage_config)
.await
.context("remote storage for disconnect events init")?;
let parquet_config_disconnect = parquet_config.clone();
tokio::try_join!(
worker_inner(storage, rx, parquet_config),
worker_inner(storage_disconnect, rx_disconnect, parquet_config_disconnect)
worker_inner(remote_storage_config, rx, parquet_config),
worker_inner(
disconnect_events_storage_config,
rx_disconnect,
parquet_config_disconnect
)
)
.map(|_| ())
} else {
worker_inner(storage, rx, parquet_config).await
worker_inner(remote_storage_config, rx, parquet_config).await
}
}
@@ -251,18 +247,32 @@ struct ParquetConfig {
test_remote_failures: u64,
}
impl ParquetConfig {
async fn storage(
&self,
storage_config: &RemoteStorageConfig,
) -> anyhow::Result<GenericRemoteStorage> {
let storage = GenericRemoteStorage::from_config(storage_config)
.await
.context("remote storage init")?;
#[cfg(any(test, feature = "testing"))]
if self.test_remote_failures > 0 {
return Ok(GenericRemoteStorage::unreliable_wrapper(
storage,
self.test_remote_failures,
));
}
Ok(storage)
}
}
async fn worker_inner(
storage: GenericRemoteStorage,
storage_config: RemoteStorageConfig,
rx: impl Stream<Item = RequestData>,
config: ParquetConfig,
) -> anyhow::Result<()> {
#[cfg(any(test, feature = "testing"))]
let storage = if config.test_remote_failures > 0 {
GenericRemoteStorage::unreliable_wrapper(storage, config.test_remote_failures)
} else {
storage
};
let mut rx = std::pin::pin!(rx);
let mut rows = Vec::with_capacity(config.rows_per_group);
@@ -285,7 +295,7 @@ async fn worker_inner(
}
if len > config.file_size || force {
last_upload = time::Instant::now();
let file = upload_parquet(w, len, &storage).await?;
let file = upload_parquet(w, len, &storage_config, &config).await?;
w = SerializedFileWriter::new(file, schema.clone(), config.propeties.clone())?;
len = 0;
}
@@ -298,7 +308,7 @@ async fn worker_inner(
}
if !w.flushed_row_groups().is_empty() {
let _rtchk: Writer<BytesMut> = upload_parquet(w, len, &storage).await?;
let _rtchk: Writer<BytesMut> = upload_parquet(w, len, &storage_config, &config).await?;
}
Ok(())
@@ -340,7 +350,8 @@ where
async fn upload_parquet(
mut w: SerializedFileWriter<Writer<BytesMut>>,
len: i64,
storage: &GenericRemoteStorage,
storage_config: &RemoteStorageConfig,
config: &ParquetConfig,
) -> anyhow::Result<Writer<BytesMut>> {
let len_uncompressed = w
.flushed_row_groups()
@@ -377,6 +388,15 @@ async fn upload_parquet(
size, compression, "uploading request parquet file"
);
// A bug in azure-sdk means that the identity token file, which expires after
// 1 hour, is not refreshed. This identity token is used to fetch the actual azure storage
// tokens, which last for 24 hours. After this 24-hour period, azure-sdk tries to refresh
// the storage token, but the identity token has by then expired.
// <https://github.com/Azure/azure-sdk-for-rust/issues/1739>
//
// To work around this, we recreate the storage every time.
let storage = config.storage(storage_config).await?;
let year = now.year();
let month = now.month();
let day = now.day();
@@ -431,8 +451,8 @@ mod tests {
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use remote_storage::{
GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind, S3Config,
DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
RemoteStorageConfig, RemoteStorageKind, S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
};
use tokio::sync::mpsc;
use tokio::time;
@@ -559,12 +579,11 @@ mod tests {
timeout: std::time::Duration::from_secs(120),
small_timeout: std::time::Duration::from_secs(30),
};
let storage = GenericRemoteStorage::from_config(&remote_storage_config)
worker_inner(remote_storage_config, rx, config)
.await
.unwrap();
worker_inner(storage, rx, config).await.unwrap();
let mut files = WalkDir::new(tmpdir.as_std_path())
.into_iter()
.filter_map(|entry| entry.ok())

View File

@@ -3,9 +3,9 @@ use std::sync::Arc;
use std::time::Duration;
use async_trait::async_trait;
use ed25519_dalek::SigningKey;
use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer};
use p256::ecdsa::SigningKey;
use p256::elliptic_curve::JwkEcKey;
use jose_jwk::jose_b64;
use rand::rngs::OsRng;
use tokio::net::{lookup_host, TcpStream};
use tracing::field::display;
@@ -354,9 +354,15 @@ impl PoolingBackend {
}
}
fn create_random_jwk() -> (SigningKey, JwkEcKey) {
let key = SigningKey::random(&mut OsRng);
let jwk = p256::PublicKey::from(key.verifying_key()).to_jwk();
fn create_random_jwk() -> (SigningKey, jose_jwk::Key) {
let key = SigningKey::generate(&mut OsRng);
let jwk = jose_jwk::Key::Okp(jose_jwk::Okp {
crv: jose_jwk::OkpCurves::Ed25519,
x: jose_b64::serde::Bytes::from(key.verifying_key().to_bytes().to_vec()),
d: None,
});
(key, jwk)
}

View File

@@ -16,17 +16,16 @@ use std::sync::Arc;
use std::task::{ready, Poll};
use std::time::Duration;
use ed25519_dalek::{Signature, Signer, SigningKey};
use futures::future::poll_fn;
use futures::Future;
use indexmap::IndexMap;
use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding};
use p256::ecdsa::{Signature, SigningKey};
use parking_lot::RwLock;
use postgres_client::tls::NoTlsStream;
use postgres_client::types::ToSql;
use postgres_client::AsyncMessage;
use serde_json::value::RawValue;
use signature::Signer;
use tokio::net::TcpStream;
use tokio::time::Instant;
use tokio_util::sync::CancellationToken;
@@ -42,7 +41,7 @@ use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
use crate::metrics::Metrics;
pub(crate) const EXT_NAME: &str = "pg_session_jwt";
pub(crate) const EXT_VERSION: &str = "0.1.2";
pub(crate) const EXT_VERSION: &str = "0.2.0";
pub(crate) const EXT_SCHEMA: &str = "auth";
#[derive(Clone)]
@@ -339,8 +338,8 @@ fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String {
let cap = jwt.capacity();
// we only need an empty header with the alg specified.
// base64url(r#"{"alg":"ES256"}"#) == "eyJhbGciOiJFUzI1NiJ9"
jwt.push_str("eyJhbGciOiJFUzI1NiJ9.");
// base64url(r#"{"alg":"EdDSA"}"#) == "eyJhbGciOiJFZERTQSJ9"
jwt.push_str("eyJhbGciOiJFZERTQSJ9.");
// encode the jwt payload in-place
base64::encode_config_buf(payload, base64::URL_SAFE_NO_PAD, &mut jwt);
@@ -366,14 +365,14 @@ fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String {
#[cfg(test)]
#[expect(clippy::unwrap_used)]
mod tests {
use p256::ecdsa::SigningKey;
use ed25519_dalek::SigningKey;
use typed_json::json;
use super::resign_jwt;
#[test]
fn jwt_token_snapshot() {
let key = SigningKey::from_bytes(&[1; 32].into()).unwrap();
let key = SigningKey::from_bytes(&[1; 32]);
let data =
json!({"foo":"bar","jti":"foo\nbar","nested":{"jti":"tricky nesting"}}).to_string();
@@ -381,12 +380,17 @@ mod tests {
// To validate the JWT, copy the JWT string and paste it into https://jwt.io/.
// In the public-key box, paste the following jwk public key
// `{"kty":"EC","crv":"P-256","x":"b_A7lJJBzh2t1DUZ5pYOCoW0GmmgXDKBA6orzhWUyhY","y":"PE91OlW_AdxT9sCwx-7ni0DG_30lqW4igrmJzvccFEo"}`
// `{"kty":"OKP","crv":"Ed25519","x":"iojj3XQJ8ZX9UtstPLpdcspnCb8dlBIb83SIAbQPb1w"}`
// Note - jwt.io doesn't support EdDSA :(
// https://github.com/jsonwebtoken/jsonwebtoken.github.io/issues/509
// let pub_key = p256::ecdsa::VerifyingKey::from(&key);
// let pub_key = p256::PublicKey::from(pub_key);
// println!("{}", pub_key.to_jwk_string());
// let jwk = jose_jwk::Key::Okp(jose_jwk::Okp {
// crv: jose_jwk::OkpCurves::Ed25519,
// x: jose_jwk::jose_b64::serde::Bytes::from(key.verifying_key().to_bytes().to_vec()),
// d: None,
// });
// println!("{}", serde_json::to_string(&jwk).unwrap());
assert_eq!(jwt, "eyJhbGciOiJFUzI1NiJ9.eyJmb28iOiJiYXIiLCJqdGkiOjIsIm5lc3RlZCI6eyJqdGkiOiJ0cmlja3kgbmVzdGluZyJ9fQ.pYf0LxoJ8sDgpmsYOgrbNecOSipnPBEGwnZzB-JhW2cONrKlqRsgXwK8_cOsyolGy-hTTe8GXbWTl_UdpF5RyA");
assert_eq!(jwt, "eyJhbGciOiJFZERTQSJ9.eyJmb28iOiJiYXIiLCJqdGkiOjIsIm5lc3RlZCI6eyJqdGkiOiJ0cmlja3kgbmVzdGluZyJ9fQ.Cvyc2By33KI0f0obystwdy8PN111L3Sc9_Mr2CU3XshtSqSdxuRxNEZGbb_RvyJf2IzheC_s7aBZ-jLeQ9N0Bg");
}
}
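For reference, the new hard-coded header is just the unpadded base64url encoding of `{"alg":"EdDSA"}`. A small sketch that checks the constant, assuming the same pre-0.21-style `base64` API that the surrounding code uses:

```rust
fn main() {
    // base64url(r#"{"alg":"EdDSA"}"#) with no padding, as used in sign_jwt().
    let header = base64::encode_config(br#"{"alg":"EdDSA"}"#, base64::URL_SAFE_NO_PAD);
    assert_eq!(header, "eyJhbGciOiJFZERTQSJ9");
}
```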

View File

@@ -21,14 +21,13 @@ const KB: usize = 1024;
const MB: usize = 1024 * KB;
const GB: usize = 1024 * MB;
/// Use jemalloc, and configure it to sample allocations for profiles every 1 MB.
/// This mirrors the configuration in bin/safekeeper.rs.
/// Use jemalloc and enable profiling, to mirror bin/safekeeper.rs.
#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
#[allow(non_upper_case_globals)]
#[export_name = "malloc_conf"]
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0";
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0";
// Register benchmarks with Criterion.
criterion_group!(

View File

@@ -51,10 +51,12 @@ use utils::{
#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20).
/// Configure jemalloc to profile heap allocations by sampling stack traces every 2 MB (1 << 21).
/// This adds roughly 3% overhead for allocations on average, which is acceptable considering
/// performance-sensitive code will avoid allocations as far as possible anyway.
#[allow(non_upper_case_globals)]
#[export_name = "malloc_conf"]
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0";
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0";
const PID_FILE_NAME: &str = "safekeeper.pid";
const ID_FILE_NAME: &str = "safekeeper.id";
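`lg_prof_sample` is the base-2 logarithm of the average number of allocated bytes between heap-profile samples, so bumping it from 20 to 21 doubles the sampling interval from 1 MiB to 2 MiB. A trivial check of the arithmetic:

```rust
fn main() {
    let old_interval: u64 = 1 << 20; // lg_prof_sample:20
    let new_interval: u64 = 1 << 21; // lg_prof_sample:21

    assert_eq!(old_interval, 1024 * 1024); // 1 MiB
    assert_eq!(new_interval, 2 * 1024 * 1024); // 2 MiB
    assert_eq!(new_interval / old_interval, 2);
}
```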

View File

@@ -55,4 +55,4 @@ r2d2 = { version = "0.8.10" }
utils = { path = "../libs/utils/" }
metrics = { path = "../libs/metrics/" }
control_plane = { path = "../control_plane" }
workspace_hack = { version = "0.1", path = "../workspace_hack" }
workspace_hack = { version = "0.1", path = "../workspace_hack" }

View File

@@ -0,0 +1,4 @@
-- this sadly isn't a "true" revert of the migration, as the column is now at the end of the table.
-- But preserving order is not a trivial operation.
-- https://wiki.postgresql.org/wiki/Alter_column_position
ALTER TABLE safekeepers ADD active BOOLEAN NOT NULL DEFAULT false;

View File

@@ -0,0 +1 @@
ALTER TABLE safekeepers DROP active;

View File

@@ -124,7 +124,10 @@ impl ComputeHookTenant {
if let Some(shard_idx) = shard_idx {
sharded.shards.remove(shard_idx);
} else {
tracing::warn!("Shard not found while handling detach")
// This is a valid but niche case, where the tenant was previously attached
// as a Secondary location and then detached, so has no previously notified
// state.
tracing::info!("Shard not found while handling detach")
}
}
ComputeHookTenant::Unsharded(_) => {
@@ -761,7 +764,10 @@ impl ComputeHook {
let mut state_locked = self.state.lock().unwrap();
match state_locked.entry(tenant_shard_id.tenant_id) {
Entry::Vacant(_) => {
tracing::warn!("Compute hook tenant not found for detach");
// This is a valid but niche case, where the tenant was previously attached
// as a Secondary location and then detached, so has no previously notified
// state.
tracing::info!("Compute hook tenant not found for detach");
}
Entry::Occupied(mut e) => {
let sharded = e.get().is_sharded();

View File

@@ -112,7 +112,7 @@ impl TenantShardDrain {
}
}
match scheduler.node_preferred(tenant_shard.intent.get_secondary()) {
match tenant_shard.preferred_secondary(scheduler) {
Some(node) => Some(node),
None => {
tracing::warn!(

View File

@@ -690,7 +690,8 @@ async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError
};
let state = get_state(&req);
let nodes = state.service.node_list().await?;
let mut nodes = state.service.node_list().await?;
nodes.sort_by_key(|n| n.get_id());
let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::<Vec<_>>();
json_response(StatusCode::OK, api_nodes)
@@ -1005,6 +1006,29 @@ async fn handle_tenant_shard_migrate(
)
}
async fn handle_tenant_shard_migrate_secondary(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let mut req = match maybe_forward(req).await {
ForwardOutcome::Forwarded(res) => {
return res;
}
ForwardOutcome::NotForwarded(req) => req,
};
let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
json_response(
StatusCode::OK,
service
.tenant_shard_migrate_secondary(tenant_shard_id, migrate_req)
.await?,
)
}
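// Usage sketch (illustrative, not part of this diff): the route registered further down is
//
//   PUT /control/v1/tenant/:tenant_shard_id/migrate_secondary
//   body: a TenantShardMigrateRequest as JSON, e.g. {"node_id": 4}
//
// The handler above only reads `node_id` from the request; the exact JSON field set is
// whatever TenantShardMigrateRequest serialises to, so the example body is an assumption.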
async fn handle_tenant_shard_cancel_reconcile(
service: Arc<Service>,
req: Request<Body>,
@@ -1855,6 +1879,16 @@ pub fn make_router(
RequestName("control_v1_tenant_migrate"),
)
})
.put(
"/control/v1/tenant/:tenant_shard_id/migrate_secondary",
|r| {
tenant_service_handler(
r,
handle_tenant_shard_migrate_secondary,
RequestName("control_v1_tenant_migrate_secondary"),
)
},
)
.put(
"/control/v1/tenant/:tenant_shard_id/cancel_reconcile",
|r| {

View File

@@ -53,6 +53,16 @@ pub(crate) struct StorageControllerMetricGroup {
/// How many shards are not scheduled into their preferred AZ
pub(crate) storage_controller_schedule_az_violation: measured::Gauge,
/// How many shard locations (secondary or attached) on each node
pub(crate) storage_controller_node_shards: measured::GaugeVec<NodeLabelGroupSet>,
/// How many _attached_ shard locations on each node
pub(crate) storage_controller_node_attached_shards: measured::GaugeVec<NodeLabelGroupSet>,
/// How many _home_ shard locations on each node (i.e. the node's AZ matches the shard's
/// preferred AZ)
pub(crate) storage_controller_node_home_shards: measured::GaugeVec<NodeLabelGroupSet>,
/// How many shards would like to reconcile but were blocked by concurrency limits
pub(crate) storage_controller_pending_reconciles: measured::Gauge,
@@ -132,6 +142,15 @@ impl Default for StorageControllerMetrics {
}
}
#[derive(measured::LabelGroup, Clone)]
#[label(set = NodeLabelGroupSet)]
pub(crate) struct NodeLabelGroup<'a> {
#[label(dynamic_with = lasso::ThreadedRodeo, default)]
pub(crate) az: &'a str,
#[label(dynamic_with = lasso::ThreadedRodeo, default)]
pub(crate) node_id: &'a str,
}
#[derive(measured::LabelGroup)]
#[label(set = ReconcileCompleteLabelGroupSet)]
pub(crate) struct ReconcileCompleteLabelGroup {

View File

@@ -299,6 +299,7 @@ impl Node {
id: self.id,
availability: self.availability.clone().into(),
scheduling: self.scheduling,
availability_zone_id: self.availability_zone_id.0.clone(),
listen_http_addr: self.listen_http_addr.clone(),
listen_http_port: self.listen_http_port,
listen_pg_addr: self.listen_pg_addr.clone(),

View File

@@ -708,10 +708,11 @@ impl Persistence {
Ok(())
}
/// Note that passing None for a shard clears the preferred AZ (rather than leaving it unmodified)
pub(crate) async fn set_tenant_shard_preferred_azs(
&self,
preferred_azs: Vec<(TenantShardId, AvailabilityZone)>,
) -> DatabaseResult<Vec<(TenantShardId, AvailabilityZone)>> {
preferred_azs: Vec<(TenantShardId, Option<AvailabilityZone>)>,
) -> DatabaseResult<Vec<(TenantShardId, Option<AvailabilityZone>)>> {
use crate::schema::tenant_shards::dsl::*;
self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| {
@@ -722,7 +723,7 @@ impl Persistence {
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
.set(preferred_az_id.eq(preferred_az.0.clone()))
.set(preferred_az_id.eq(preferred_az.as_ref().map(|az| az.0.clone())))
.execute(conn)?;
if updated == 1 {
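// Usage sketch (hypothetical call site, not part of this diff): the Option lets a caller
// pin one shard's preferred AZ while explicitly clearing another's, which the old
// `Vec<(TenantShardId, AvailabilityZone)>` signature could not express.
async fn example_set_preferred_azs(
    persistence: &Persistence,
    shard_a: TenantShardId,
    shard_b: TenantShardId,
) -> DatabaseResult<()> {
    persistence
        .set_tenant_shard_preferred_azs(vec![
            (shard_a, Some(AvailabilityZone("az-a".to_string()))), // pin shard_a to az-a
            (shard_b, None),                                       // clear shard_b's preference
        ])
        .await?;
    Ok(())
}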
@@ -1258,7 +1259,6 @@ pub(crate) struct SafekeeperPersistence {
pub(crate) version: i64,
pub(crate) host: String,
pub(crate) port: i32,
pub(crate) active: bool,
pub(crate) http_port: i32,
pub(crate) availability_zone_id: String,
pub(crate) scheduling_policy: String,
@@ -1270,7 +1270,6 @@ impl SafekeeperPersistence {
SkSchedulingPolicy::from_str(&self.scheduling_policy).map_err(|e| {
DatabaseError::Logical(format!("can't construct SkSchedulingPolicy: {e:?}"))
})?;
// omit the `active` flag on purpose: it is deprecated.
Ok(SafekeeperDescribeResponse {
id: NodeId(self.id as u64),
region_id: self.region_id.clone(),
@@ -1295,7 +1294,8 @@ pub(crate) struct SafekeeperUpsert {
pub(crate) version: i64,
pub(crate) host: String,
pub(crate) port: i32,
pub(crate) active: bool,
/// The active flag will not be stored in the database and will be ignored.
pub(crate) active: Option<bool>,
pub(crate) http_port: i32,
pub(crate) availability_zone_id: String,
}
@@ -1311,7 +1311,6 @@ impl SafekeeperUpsert {
version: self.version,
host: &self.host,
port: self.port,
active: self.active,
http_port: self.http_port,
availability_zone_id: &self.availability_zone_id,
// None means a wish to not update this column. We expose abilities to update it via other means.
@@ -1328,7 +1327,6 @@ struct InsertUpdateSafekeeper<'a> {
version: i64,
host: &'a str,
port: i32,
active: bool,
http_port: i32,
availability_zone_id: &'a str,
scheduling_policy: Option<&'a str>,

View File

@@ -696,6 +696,11 @@ impl Reconciler {
/// First we apply special case handling (e.g. for live migrations), and then a
/// general case reconciliation where we walk through the intent by pageserver
/// and call out to the pageserver to apply the desired state.
///
/// An Ok(()) result indicates that we successfully attached the tenant, but _not_ that
/// all locations for the tenant are in the expected state. When nodes that are to be detached
/// or configured as secondary are unavailable, we may return Ok(()) but leave the shard in a
/// state where it still requires later reconciliation.
pub(crate) async fn reconcile(&mut self) -> Result<(), ReconcileError> {
// Prepare: if we have uncertain `observed` state for our would-be attachment location, then refresh it
self.maybe_refresh_observed().await?;
@@ -784,10 +789,18 @@ impl Reconciler {
tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.")
}
_ => {
// In all cases other than a matching observed configuration, we will
// reconcile this location.
tracing::info!(node_id=%node.get_id(), "Observed configuration requires update.");
changes.push((node.clone(), wanted_conf))
// Only try and configure secondary locations on nodes that are available. This
// allows the reconciler to "succeed" while some secondaries are offline (e.g. after
// a node failure, where the failed node will have a secondary intent)
if node.is_available() {
tracing::info!(node_id=%node.get_id(), "Observed configuration requires update.");
changes.push((node.clone(), wanted_conf))
} else {
tracing::info!(node_id=%node.get_id(), "Skipping configuration as secondary, node is unavailable");
self.observed
.locations
.insert(node.get_id(), ObservedStateLocation { conf: None });
}
}
}
}
@@ -813,7 +826,21 @@ impl Reconciler {
if self.cancel.is_cancelled() {
return Err(ReconcileError::Cancel);
}
self.location_config(&node, conf, None, false).await?;
// We only try to configure secondary locations if the node is available. This does
// not stop us succeeding with the reconcile, because our core goal is to make the
// shard _available_ (the attached location), and configuring secondary locations
// can be done lazily when the node becomes available (via background reconciliation).
if node.is_available() {
self.location_config(&node, conf, None, false).await?;
} else {
// If the node is unavailable, we skip and consider the reconciliation successful: this
// is a common case where a pageserver is marked unavailable: we demote a location on
// that unavailable pageserver to secondary.
tracing::info!("Skipping configuring secondary location {node}, it is unavailable");
self.observed
.locations
.insert(node.get_id(), ObservedStateLocation { conf: None });
}
}
// The condition below identifies a detach. We must have no attached intent and

View File

@@ -1,4 +1,4 @@
use crate::{node::Node, tenant_shard::TenantShard};
use crate::{metrics::NodeLabelGroup, node::Node, tenant_shard::TenantShard};
use itertools::Itertools;
use pageserver_api::{controller_api::AvailabilityZone, models::PageserverUtilization};
use serde::Serialize;
@@ -32,6 +32,9 @@ pub(crate) struct SchedulerNode {
shard_count: usize,
/// How many shards are currently attached on this node, via their [`crate::tenant_shard::IntentState`].
attached_shard_count: usize,
/// How many shards have a location on this node (via [`crate::tenant_shard::IntentState`]) _and_ this node
/// is in their preferred AZ (i.e. this is their 'home' location)
home_shard_count: usize,
/// Availability zone id in which the node resides
az: AvailabilityZone,
@@ -47,6 +50,12 @@ pub(crate) trait NodeSchedulingScore: Debug + Ord + Copy + Sized {
preferred_az: &Option<AvailabilityZone>,
context: &ScheduleContext,
) -> Option<Self>;
/// Return a score that drops any components based on node utilization: this is useful
/// when computing scores for scheduling optimisation, where we want to avoid rescheduling
/// shards due to e.g. disk usage, since that would cause flapping.
fn for_optimization(&self) -> Self;
fn is_overloaded(&self) -> bool;
fn node_id(&self) -> NodeId;
}
@@ -136,17 +145,13 @@ impl PartialOrd for SecondaryAzMatch {
/// Ordering is given by member declaration order (top to bottom).
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
pub(crate) struct NodeAttachmentSchedulingScore {
/// The number of shards belonging to the tenant currently being
/// scheduled that are attached to this node.
affinity_score: AffinityScore,
/// Flag indicating whether this node matches the preferred AZ
/// of the shard. For equal affinity scores, nodes in the matching AZ
/// are considered first.
az_match: AttachmentAzMatch,
/// Size of [`ScheduleContext::attached_nodes`] for the current node.
/// This normally tracks the number of attached shards belonging to the
/// tenant being scheduled that are already on this node.
attached_shards_in_context: usize,
/// The number of shards belonging to the tenant currently being
/// scheduled that are attached to this node.
affinity_score: AffinityScore,
/// Utilisation score that combines shard count and disk utilisation
utilization_score: u64,
/// Total number of shards attached to this node. When nodes have identical utilisation, this
@@ -177,13 +182,25 @@ impl NodeSchedulingScore for NodeAttachmentSchedulingScore {
.copied()
.unwrap_or(AffinityScore::FREE),
az_match: AttachmentAzMatch(AzMatch::new(&node.az, preferred_az.as_ref())),
attached_shards_in_context: context.attached_nodes.get(node_id).copied().unwrap_or(0),
utilization_score: utilization.cached_score(),
total_attached_shard_count: node.attached_shard_count,
node_id: *node_id,
})
}
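// Quick illustration (standalone sketch, not the real types) of why the field
// reordering above matters: the derived Ord compares struct fields lexicographically
// in declaration order, so listing `az_match` before `affinity_score` makes the
// preferred-AZ check dominate inter-shard affinity when picking the best node.
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug)]
struct MiniScore {
    az_mismatch: u8, // 0 = node is in the preferred AZ, 1 = it is not
    affinity: u8,    // how many of the tenant's shards already live on the node
}
fn _ordering_example() {
    let in_az_but_crowded = MiniScore { az_mismatch: 0, affinity: 3 };
    let out_of_az_but_free = MiniScore { az_mismatch: 1, affinity: 0 };
    // Lower score wins: the in-AZ node is preferred despite its worse affinity.
    assert!(in_az_but_crowded < out_of_az_but_free);
}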
/// For use in scheduling optimisation, where we only want to consider the aspects
/// of the score that can only be resolved by moving things (such as inter-shard affinity
/// and AZ affinity), and ignore aspects that reflect the total utilization of a node (which
/// can fluctuate for other reasons)
fn for_optimization(&self) -> Self {
Self {
utilization_score: 0,
total_attached_shard_count: 0,
node_id: NodeId(0),
..*self
}
}
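// Hypothetical helper (not part of this diff) showing the intent of for_optimization():
// two candidate nodes compare as equal for optimisation purposes once utilisation and
// tie-breaking fields are zeroed, so only AZ match and inter-shard affinity can drive
// a migration.
fn scores_tie_for_optimization(
    a: &NodeAttachmentSchedulingScore,
    b: &NodeAttachmentSchedulingScore,
) -> bool {
    // Equal once utilization_score, total_attached_shard_count and node_id are dropped.
    a.for_optimization() == b.for_optimization()
}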
fn is_overloaded(&self) -> bool {
PageserverUtilization::is_overloaded(self.utilization_score)
}
@@ -208,9 +225,9 @@ pub(crate) struct NodeSecondarySchedulingScore {
affinity_score: AffinityScore,
/// Utilisation score that combines shard count and disk utilisation
utilization_score: u64,
/// Total number of shards attached to this node. When nodes have identical utilisation, this
/// acts as an anti-affinity between attached shards.
total_attached_shard_count: usize,
/// Anti-affinity with other non-home locations: this gives the behavior that secondaries
/// will spread out across the nodes in an AZ.
total_non_home_shard_count: usize,
/// Convenience to make selection deterministic in tests and empty systems
node_id: NodeId,
}
@@ -237,11 +254,20 @@ impl NodeSchedulingScore for NodeSecondarySchedulingScore {
.copied()
.unwrap_or(AffinityScore::FREE),
utilization_score: utilization.cached_score(),
total_attached_shard_count: node.attached_shard_count,
total_non_home_shard_count: (node.shard_count - node.home_shard_count),
node_id: *node_id,
})
}
fn for_optimization(&self) -> Self {
Self {
utilization_score: 0,
total_non_home_shard_count: 0,
node_id: NodeId(0),
..*self
}
}
fn is_overloaded(&self) -> bool {
PageserverUtilization::is_overloaded(self.utilization_score)
}
@@ -293,6 +319,10 @@ impl AffinityScore {
pub(crate) fn inc(&mut self) {
self.0 += 1;
}
pub(crate) fn dec(&mut self) {
self.0 -= 1;
}
}
impl std::ops::Add for AffinityScore {
@@ -324,9 +354,6 @@ pub(crate) struct ScheduleContext {
/// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`]
pub(crate) nodes: HashMap<NodeId, AffinityScore>,
/// Specifically how many _attached_ locations are on each node
pub(crate) attached_nodes: HashMap<NodeId, usize>,
pub(crate) mode: ScheduleMode,
}
@@ -334,7 +361,6 @@ impl ScheduleContext {
pub(crate) fn new(mode: ScheduleMode) -> Self {
Self {
nodes: HashMap::new(),
attached_nodes: HashMap::new(),
mode,
}
}
@@ -348,25 +374,31 @@ impl ScheduleContext {
}
}
pub(crate) fn push_attached(&mut self, node_id: NodeId) {
let entry = self.attached_nodes.entry(node_id).or_default();
*entry += 1;
}
pub(crate) fn get_node_affinity(&self, node_id: NodeId) -> AffinityScore {
self.nodes
.get(&node_id)
.copied()
.unwrap_or(AffinityScore::FREE)
}
pub(crate) fn get_node_attachments(&self, node_id: NodeId) -> usize {
self.attached_nodes.get(&node_id).copied().unwrap_or(0)
/// Remove `shard`'s contributions to this context. This is useful when considering scheduling
/// this shard afresh, where we don't want it to e.g. experience anti-affinity to its current location.
pub(crate) fn project_detach(&self, shard: &TenantShard) -> Self {
let mut new_context = self.clone();
if let Some(attached) = shard.intent.get_attached() {
if let Some(score) = new_context.nodes.get_mut(attached) {
score.dec();
}
}
for secondary in shard.intent.get_secondary() {
if let Some(score) = new_context.nodes.get_mut(secondary) {
score.dec();
}
}
new_context
}
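// Sketch of how project_detach pairs with compute_node_score (both from this diff) when
// the optimizer evaluates a candidate move; the surrounding plumbing is assumed.
fn score_candidate_for_move(
    scheduler: &mut Scheduler,
    shard: &TenantShard,
    context: &ScheduleContext,
    candidate: NodeId,
) -> Option<NodeAttachmentSchedulingScore> {
    // Score the candidate as if the shard were not already placed anywhere, so its
    // current locations don't penalise the comparison via anti-affinity.
    let detached = context.project_detach(shard);
    scheduler.compute_node_score::<NodeAttachmentSchedulingScore>(
        candidate,
        &shard.preferred_az().cloned(),
        &detached,
    )
}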
/// For test, track the sum of AffinityScore values, which is effectively how many
/// attached or secondary locations have been registered with this context.
#[cfg(test)]
pub(crate) fn attach_count(&self) -> usize {
self.attached_nodes.values().sum()
pub(crate) fn location_count(&self) -> usize {
self.nodes.values().map(|i| i.0).sum()
}
}
@@ -388,6 +420,7 @@ impl Scheduler {
SchedulerNode {
shard_count: 0,
attached_shard_count: 0,
home_shard_count: 0,
may_schedule: node.may_schedule(),
az: node.get_availability_zone_id().clone(),
},
@@ -415,6 +448,7 @@ impl Scheduler {
SchedulerNode {
shard_count: 0,
attached_shard_count: 0,
home_shard_count: 0,
may_schedule: node.may_schedule(),
az: node.get_availability_zone_id().clone(),
},
@@ -427,6 +461,9 @@ impl Scheduler {
Some(node) => {
node.shard_count += 1;
node.attached_shard_count += 1;
if Some(&node.az) == shard.preferred_az() {
node.home_shard_count += 1;
}
}
None => anyhow::bail!(
"Tenant {} references nonexistent node {}",
@@ -438,7 +475,12 @@ impl Scheduler {
for node_id in shard.intent.get_secondary() {
match expect_nodes.get_mut(node_id) {
Some(node) => node.shard_count += 1,
Some(node) => {
node.shard_count += 1;
if Some(&node.az) == shard.preferred_az() {
node.home_shard_count += 1;
}
}
None => anyhow::bail!(
"Tenant {} references nonexistent node {}",
shard.tenant_shard_id,
@@ -482,13 +524,20 @@ impl Scheduler {
///
/// It is an error to call this for a node that is not known to the scheduler (i.e. passed into
/// [`Self::new`] or [`Self::node_upsert`])
pub(crate) fn update_node_ref_counts(&mut self, node_id: NodeId, update: RefCountUpdate) {
pub(crate) fn update_node_ref_counts(
&mut self,
node_id: NodeId,
preferred_az: Option<&AvailabilityZone>,
update: RefCountUpdate,
) {
let Some(node) = self.nodes.get_mut(&node_id) else {
debug_assert!(false);
tracing::error!("Scheduler missing node {node_id}");
return;
};
let is_home_az = Some(&node.az) == preferred_az;
match update {
RefCountUpdate::PromoteSecondary => {
node.attached_shard_count += 1;
@@ -496,19 +545,31 @@ impl Scheduler {
RefCountUpdate::Attach => {
node.shard_count += 1;
node.attached_shard_count += 1;
if is_home_az {
node.home_shard_count += 1;
}
}
RefCountUpdate::Detach => {
node.shard_count -= 1;
node.attached_shard_count -= 1;
if is_home_az {
node.home_shard_count -= 1;
}
}
RefCountUpdate::DemoteAttached => {
node.attached_shard_count -= 1;
}
RefCountUpdate::AddSecondary => {
node.shard_count += 1;
if is_home_az {
node.home_shard_count += 1;
}
}
RefCountUpdate::RemoveSecondary => {
node.shard_count -= 1;
if is_home_az {
node.home_shard_count -= 1;
}
}
}
@@ -594,6 +655,7 @@ impl Scheduler {
entry.insert(SchedulerNode {
shard_count: 0,
attached_shard_count: 0,
home_shard_count: 0,
may_schedule: node.may_schedule(),
az: node.get_availability_zone_id().clone(),
});
@@ -607,33 +669,20 @@ impl Scheduler {
}
}
/// Where we have several nodes to choose from, for example when picking a secondary location
/// to promote to an attached location, this method may be used to pick the best choice based
/// on the scheduler's knowledge of utilization and availability.
///
/// If the input is empty, or none of the nodes are eligible for scheduling, return None: the
/// caller can pick a node some other way.
pub(crate) fn node_preferred(&self, nodes: &[NodeId]) -> Option<NodeId> {
if nodes.is_empty() {
return None;
}
// TODO: When the utilization score returned by the pageserver becomes meaningful,
// schedule based on that instead of the shard count.
let node = nodes
.iter()
.map(|node_id| {
let may_schedule = self
.nodes
.get(node_id)
.map(|n| !matches!(n.may_schedule, MaySchedule::No))
.unwrap_or(false);
(*node_id, may_schedule)
})
.max_by_key(|(_n, may_schedule)| *may_schedule);
// If even the preferred node has may_schedule==false, return None
node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None })
/// Calculate a single node's score, used in optimizer logic to compare specific
/// nodes' scores.
pub(crate) fn compute_node_score<Score>(
&mut self,
node_id: NodeId,
preferred_az: &Option<AvailabilityZone>,
context: &ScheduleContext,
) -> Option<Score>
where
Score: NodeSchedulingScore,
{
self.nodes
.get_mut(&node_id)
.and_then(|node| Score::generate(&node_id, node, preferred_az, context))
}
/// Compute a scheduling score for each node that the scheduler knows of
@@ -727,7 +776,7 @@ impl Scheduler {
tracing::info!(
"scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})",
scores.iter().map(|i| i.node_id().0).collect::<Vec<_>>()
);
);
}
// Note that we do not update shard count here to reflect the scheduling: that
@@ -743,47 +792,74 @@ impl Scheduler {
}
/// For choosing which AZ to schedule a new shard into, use this. It will return the
/// AZ with the lowest median utilization.
/// AZ with the lowest number of shards currently scheduled in this AZ as their home
/// location.
///
/// We use an AZ-wide measure rather than simply selecting the AZ of the least-loaded
/// node, because while tenants start out single sharded, when they grow and undergo
/// shard-split, they will occupy space on many nodes within an AZ.
/// shard-split, they will occupy space on many nodes within an AZ. It is important
/// that we pick the AZ in a way that balances this _future_ load.
///
/// We use median rather than total free space or mean utilization, because
/// we wish to avoid preferring AZs that have low-load nodes resulting from
/// recent replacements.
///
/// The practical result is that we will pick an AZ based on its median node, and
/// then actually _schedule_ the new shard onto the lowest-loaded node in that AZ.
/// Once we've picked an AZ, subsequent scheduling within that AZ will be driven by
/// nodes' utilization scores.
pub(crate) fn get_az_for_new_tenant(&self) -> Option<AvailabilityZone> {
if self.nodes.is_empty() {
return None;
}
let mut scores_by_az = HashMap::new();
for (node_id, node) in &self.nodes {
let az_scores = scores_by_az.entry(&node.az).or_insert_with(Vec::new);
let score = match &node.may_schedule {
MaySchedule::Yes(utilization) => utilization.score(),
MaySchedule::No => PageserverUtilization::full().score(),
};
az_scores.push((node_id, node, score));
#[derive(Default)]
struct AzScore {
home_shard_count: usize,
scheduleable: bool,
}
// Sort by utilization. Also include the node ID to break ties.
for scores in scores_by_az.values_mut() {
scores.sort_by_key(|i| (i.2, i.0));
let mut azs: HashMap<&AvailabilityZone, AzScore> = HashMap::new();
for node in self.nodes.values() {
let az = azs.entry(&node.az).or_default();
az.home_shard_count += node.home_shard_count;
az.scheduleable |= matches!(node.may_schedule, MaySchedule::Yes(_));
}
let mut median_by_az = scores_by_az
// If any AZs are schedulable, then filter out the non-schedulable ones (i.e. AZs where
// all nodes are overloaded or otherwise unschedulable).
if azs.values().any(|i| i.scheduleable) {
azs.retain(|_, i| i.scheduleable);
}
// Find the AZ with the lowest number of shards currently allocated
Some(
azs.into_iter()
.min_by_key(|i| (i.1.home_shard_count, i.0))
.unwrap()
.0
.clone(),
)
}
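// Worked example (values hypothetical): if the summed home_shard_count per AZ is
// az-a = 10, az-b = 12, az-c = 12 and every AZ has at least one schedulable node,
// min_by_key picks az-a; a tie between az-b and az-c would be broken by the AZ id,
// keeping the choice deterministic.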
pub(crate) fn get_node_az(&self, node_id: &NodeId) -> Option<AvailabilityZone> {
self.nodes.get(node_id).map(|n| n.az.clone())
}
/// For use when choosing a preferred secondary location: filter out nodes that are not
/// available, and gather their AZs.
pub(crate) fn filter_usable_nodes(
&self,
nodes: &[NodeId],
) -> Vec<(NodeId, Option<AvailabilityZone>)> {
nodes
.iter()
.map(|(az, nodes)| (*az, nodes.get(nodes.len() / 2).unwrap().2))
.collect::<Vec<_>>();
// Sort by utilization. Also include the AZ to break ties.
median_by_az.sort_by_key(|i| (i.1, i.0));
// Return the AZ with the lowest median utilization
Some(median_by_az.first().unwrap().0.clone())
.filter_map(|node_id| {
let node = self
.nodes
.get(node_id)
.expect("Referenced nodes always exist");
if matches!(node.may_schedule, MaySchedule::Yes(_)) {
Some((*node_id, Some(node.az.clone())))
} else {
None
}
})
.collect()
}
/// Unit test access to internal state
@@ -796,6 +872,33 @@ impl Scheduler {
pub(crate) fn get_node_attached_shard_count(&self, node_id: NodeId) -> usize {
self.nodes.get(&node_id).unwrap().attached_shard_count
}
/// Some metrics that we only calculate periodically: this is simpler than
/// rigorously updating them on every change.
pub(crate) fn update_metrics(&self) {
for (node_id, node) in &self.nodes {
let node_id_str = format!("{}", node_id);
let label_group = NodeLabelGroup {
az: &node.az.0,
node_id: &node_id_str,
};
crate::metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_node_shards
.set(label_group.clone(), node.shard_count as i64);
crate::metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_node_attached_shards
.set(label_group.clone(), node.attached_shard_count as i64);
crate::metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_node_home_shards
.set(label_group.clone(), node.home_shard_count as i64);
}
}
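// For illustration, the gauges updated above surface per-node series roughly like
// the following (label values hypothetical):
//   storage_controller_node_shards{az="az-a",node_id="1"} 12
//   storage_controller_node_attached_shards{az="az-a",node_id="1"} 7
//   storage_controller_node_home_shards{az="az-a",node_id="1"} 6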
}
#[cfg(test)]
@@ -843,7 +946,14 @@ pub(crate) mod test_utils {
#[cfg(test)]
mod tests {
use pageserver_api::{controller_api::NodeAvailability, models::utilization::test_utilization};
use pageserver_api::{
controller_api::NodeAvailability, models::utilization::test_utilization,
shard::ShardIdentity,
};
use utils::{
id::TenantId,
shard::{ShardCount, ShardNumber, TenantShardId},
};
use super::*;
@@ -853,8 +963,8 @@ mod tests {
let nodes = test_utils::make_test_nodes(2, &[]);
let mut scheduler = Scheduler::new(nodes.values());
let mut t1_intent = IntentState::new();
let mut t2_intent = IntentState::new();
let mut t1_intent = IntentState::new(None);
let mut t2_intent = IntentState::new(None);
let context = ScheduleContext::default();
@@ -930,7 +1040,7 @@ mod tests {
let scheduled = scheduler
.schedule_shard::<AttachedShardTag>(&[], &None, context)
.unwrap();
let mut intent = IntentState::new();
let mut intent = IntentState::new(None);
intent.set_attached(scheduler, Some(scheduled));
scheduled_intents.push(intent);
assert_eq!(scheduled, expect_node);
@@ -1063,7 +1173,7 @@ mod tests {
let scheduled = scheduler
.schedule_shard::<Tag>(&[], &preferred_az, context)
.unwrap();
let mut intent = IntentState::new();
let mut intent = IntentState::new(preferred_az.clone());
intent.set_attached(scheduler, Some(scheduled));
scheduled_intents.push(intent);
assert_eq!(scheduled, expect_node);
@@ -1089,9 +1199,9 @@ mod tests {
&mut context,
);
// Node 2 is not in "az-a", but it has the lowest affinity so we prefer that.
// Node 1 and 3 (az-a) have same affinity score, so prefer the lowest node id.
assert_scheduler_chooses::<AttachedShardTag>(
NodeId(2),
NodeId(1),
Some(az_a_tag.clone()),
&mut scheduled_intents,
&mut scheduler,
@@ -1107,26 +1217,6 @@ mod tests {
&mut context,
);
// Avoid nodes in "az-b" for the secondary location.
// Nodes 1 and 3 are identically loaded, so prefer the lowest node id.
assert_scheduler_chooses::<SecondaryShardTag>(
NodeId(1),
Some(az_b_tag.clone()),
&mut scheduled_intents,
&mut scheduler,
&mut context,
);
// Avoid nodes in "az-b" for the secondary location.
// Node 3 has lower affinity score than 1, so prefer that.
assert_scheduler_chooses::<SecondaryShardTag>(
NodeId(3),
Some(az_b_tag.clone()),
&mut scheduled_intents,
&mut scheduler,
&mut context,
);
for mut intent in scheduled_intents {
intent.clear(&mut scheduler);
}
@@ -1150,34 +1240,292 @@ mod tests {
let mut scheduler = Scheduler::new(nodes.values());
/// Force the utilization of a node in Scheduler's state to a particular
/// number of bytes used.
fn set_utilization(scheduler: &mut Scheduler, node_id: NodeId, shard_count: u32) {
let mut node = Node::new(
node_id,
"".to_string(),
0,
"".to_string(),
0,
scheduler.nodes.get(&node_id).unwrap().az.clone(),
);
node.set_availability(NodeAvailability::Active(test_utilization::simple(
shard_count,
0,
)));
scheduler.node_upsert(&node);
/// Force the `home_shard_count` of a node directly: this is the metric used
/// by the scheduler when picking AZs.
fn set_shard_count(scheduler: &mut Scheduler, node_id: NodeId, shard_count: usize) {
let node = scheduler.nodes.get_mut(&node_id).unwrap();
node.home_shard_count = shard_count;
}
// Initial empty state. Scores are tied, scheduler prefers lower AZ ID.
assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone()));
// Put some utilization on one node in AZ A: this should change nothing, as the median hasn't changed
set_utilization(&mut scheduler, NodeId(1), 1000000);
assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone()));
// Put some utilization on a second node in AZ A: now the median has changed, so the scheduler
// should prefer the other AZ.
set_utilization(&mut scheduler, NodeId(2), 1000000);
// Home shard count is higher in AZ A, so AZ B will be preferred
set_shard_count(&mut scheduler, NodeId(1), 10);
assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_b_tag.clone()));
// Total home shard count is higher in AZ B, so we revert to preferring AZ A
set_shard_count(&mut scheduler, NodeId(4), 6);
set_shard_count(&mut scheduler, NodeId(5), 6);
assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone()));
}
/// Test that when selecting AZs for many new tenants, we get the expected balance across nodes
#[test]
fn az_selection_many() {
let az_a_tag = AvailabilityZone("az-a".to_string());
let az_b_tag = AvailabilityZone("az-b".to_string());
let az_c_tag = AvailabilityZone("az-c".to_string());
let nodes = test_utils::make_test_nodes(
6,
&[
az_a_tag.clone(),
az_b_tag.clone(),
az_c_tag.clone(),
az_a_tag.clone(),
az_b_tag.clone(),
az_c_tag.clone(),
],
);
let mut scheduler = Scheduler::new(nodes.values());
// We should get 1/6th of these on each node, give or take a few...
let total_tenants = 300;
// ...where the 'few' is the number of AZs, because the scheduling will sometimes overshoot
// on one AZ before correcting itself. This is because we select the 'home' AZ based on
// an AZ-wide metric, but we select the location for secondaries on a purely node-based
// metric (while excluding the home AZ).
let grace = 3;
let mut scheduled_shards = Vec::new();
for _i in 0..total_tenants {
let preferred_az = scheduler.get_az_for_new_tenant().unwrap();
let mut node_home_counts = scheduler
.nodes
.iter()
.map(|(node_id, node)| (node_id, node.home_shard_count))
.collect::<Vec<_>>();
node_home_counts.sort_by_key(|i| i.0);
eprintln!("Selected {}, vs nodes {:?}", preferred_az, node_home_counts);
let tenant_shard_id = TenantShardId {
tenant_id: TenantId::generate(),
shard_number: ShardNumber(0),
shard_count: ShardCount(1),
};
let shard_identity = ShardIdentity::new(
tenant_shard_id.shard_number,
tenant_shard_id.shard_count,
pageserver_api::shard::ShardStripeSize(1),
)
.unwrap();
let mut shard = TenantShard::new(
tenant_shard_id,
shard_identity,
pageserver_api::controller_api::PlacementPolicy::Attached(1),
Some(preferred_az),
);
let mut context = ScheduleContext::default();
shard.schedule(&mut scheduler, &mut context).unwrap();
eprintln!("Scheduled shard at {:?}", shard.intent);
scheduled_shards.push(shard);
}
for (node_id, node) in &scheduler.nodes {
eprintln!(
"Node {}: {} {} {}",
node_id, node.shard_count, node.attached_shard_count, node.home_shard_count
);
}
for node in scheduler.nodes.values() {
assert!((node.home_shard_count as i64 - total_tenants as i64 / 6).abs() < grace);
}
for mut shard in scheduled_shards {
shard.intent.clear(&mut scheduler);
}
}
#[test]
/// Make sure that when we have an odd number of nodes and an even number of shards, we still
/// get scheduling stability.
fn odd_nodes_stability() {
let az_a = AvailabilityZone("az-a".to_string());
let az_b = AvailabilityZone("az-b".to_string());
let nodes = test_utils::make_test_nodes(
10,
&[
az_a.clone(),
az_a.clone(),
az_a.clone(),
az_a.clone(),
az_a.clone(),
az_b.clone(),
az_b.clone(),
az_b.clone(),
az_b.clone(),
az_b.clone(),
],
);
let mut scheduler = Scheduler::new(nodes.values());
// Need to keep these alive because they contribute to shard counts via RAII
let mut scheduled_shards = Vec::new();
let mut context = ScheduleContext::default();
fn schedule_shard(
tenant_shard_id: TenantShardId,
expect_attached: NodeId,
expect_secondary: NodeId,
scheduled_shards: &mut Vec<TenantShard>,
scheduler: &mut Scheduler,
preferred_az: Option<AvailabilityZone>,
context: &mut ScheduleContext,
) {
let shard_identity = ShardIdentity::new(
tenant_shard_id.shard_number,
tenant_shard_id.shard_count,
pageserver_api::shard::ShardStripeSize(1),
)
.unwrap();
let mut shard = TenantShard::new(
tenant_shard_id,
shard_identity,
pageserver_api::controller_api::PlacementPolicy::Attached(1),
preferred_az,
);
shard.schedule(scheduler, context).unwrap();
assert_eq!(shard.intent.get_attached().unwrap(), expect_attached);
assert_eq!(
shard.intent.get_secondary().first().unwrap(),
&expect_secondary
);
scheduled_shards.push(shard);
}
let tenant_id = TenantId::generate();
schedule_shard(
TenantShardId {
tenant_id,
shard_number: ShardNumber(0),
shard_count: ShardCount(8),
},
NodeId(1),
NodeId(6),
&mut scheduled_shards,
&mut scheduler,
Some(az_a.clone()),
&mut context,
);
schedule_shard(
TenantShardId {
tenant_id,
shard_number: ShardNumber(1),
shard_count: ShardCount(8),
},
NodeId(2),
NodeId(7),
&mut scheduled_shards,
&mut scheduler,
Some(az_a.clone()),
&mut context,
);
schedule_shard(
TenantShardId {
tenant_id,
shard_number: ShardNumber(2),
shard_count: ShardCount(8),
},
NodeId(3),
NodeId(8),
&mut scheduled_shards,
&mut scheduler,
Some(az_a.clone()),
&mut context,
);
schedule_shard(
TenantShardId {
tenant_id,
shard_number: ShardNumber(3),
shard_count: ShardCount(8),
},
NodeId(4),
NodeId(9),
&mut scheduled_shards,
&mut scheduler,
Some(az_a.clone()),
&mut context,
);
schedule_shard(
TenantShardId {
tenant_id,
shard_number: ShardNumber(4),
shard_count: ShardCount(8),
},
NodeId(5),
NodeId(10),
&mut scheduled_shards,
&mut scheduler,
Some(az_a.clone()),
&mut context,
);
schedule_shard(
TenantShardId {
tenant_id,
shard_number: ShardNumber(5),
shard_count: ShardCount(8),
},
NodeId(1),
NodeId(6),
&mut scheduled_shards,
&mut scheduler,
Some(az_a.clone()),
&mut context,
);
schedule_shard(
TenantShardId {
tenant_id,
shard_number: ShardNumber(6),
shard_count: ShardCount(8),
},
NodeId(2),
NodeId(7),
&mut scheduled_shards,
&mut scheduler,
Some(az_a.clone()),
&mut context,
);
schedule_shard(
TenantShardId {
tenant_id,
shard_number: ShardNumber(7),
shard_count: ShardCount(8),
},
NodeId(3),
NodeId(8),
&mut scheduled_shards,
&mut scheduler,
Some(az_a.clone()),
&mut context,
);
// Assert that the optimizer suggests no changes, i.e. our initial scheduling was stable.
for shard in &scheduled_shards {
assert_eq!(shard.optimize_attachment(&mut scheduler, &context), None);
}
for mut shard in scheduled_shards {
shard.intent.clear(&mut scheduler);
}
}
}

View File

@@ -36,7 +36,6 @@ diesel::table! {
version -> Int8,
host -> Text,
port -> Int4,
active -> Bool,
http_port -> Int4,
availability_zone_id -> Text,
scheduling_policy -> Varchar,

View File

@@ -1404,7 +1404,11 @@ impl Service {
// We will populate intent properly later in [`Self::startup_reconcile`], initially populate
// it with what we can infer: the node for which a generation was most recently issued.
let mut intent = IntentState::new();
let mut intent = IntentState::new(
tsp.preferred_az_id
.as_ref()
.map(|az| AvailabilityZone(az.clone())),
);
if let Some(generation_pageserver) = tsp.generation_pageserver.map(|n| NodeId(n as u64))
{
if nodes.contains_key(&generation_pageserver) {
@@ -2474,18 +2478,29 @@ impl Service {
tenant_id: TenantId,
_guard: &TracingExclusiveGuard<TenantOperations>,
) -> Result<(), ApiError> {
let present_in_memory = {
// Check if the tenant is present in memory, and select an AZ to use when loading
// if we will load it.
let load_in_az = {
let locked = self.inner.read().unwrap();
locked
let existing = locked
.tenants
.range(TenantShardId::tenant_range(tenant_id))
.next()
.is_some()
};
.next();
if present_in_memory {
return Ok(());
}
// If the tenant is not present in memory, we expect to load it from database,
// so let's figure out what AZ to load it into while we have self.inner locked.
if existing.is_none() {
locked
.scheduler
.get_az_for_new_tenant()
.ok_or(ApiError::BadRequest(anyhow::anyhow!(
"No AZ with nodes found to load tenant"
)))?
} else {
// We already have this tenant in memory
return Ok(());
}
};
let tenant_shards = self.persistence.load_tenant(tenant_id).await?;
if tenant_shards.is_empty() {
@@ -2494,8 +2509,20 @@ impl Service {
));
}
// TODO: choose a fresh AZ to use for this tenant when un-detaching: there definitely isn't a running
// compute, so no benefit to making AZ sticky across detaches.
// Update the persistent shards with the AZ that we are about to apply to in-memory state
self.persistence
.set_tenant_shard_preferred_azs(
tenant_shards
.iter()
.map(|t| {
(
t.get_tenant_shard_id().expect("Corrupt shard in database"),
Some(load_in_az.clone()),
)
})
.collect(),
)
.await?;
let mut locked = self.inner.write().unwrap();
tracing::info!(
@@ -2505,7 +2532,7 @@ impl Service {
);
locked.tenants.extend(tenant_shards.into_iter().map(|p| {
let intent = IntentState::new();
let intent = IntentState::new(Some(load_in_az.clone()));
let shard =
TenantShard::from_persistent(p, intent).expect("Corrupt shard row in database");
@@ -4236,6 +4263,22 @@ impl Service {
}
tracing::info!("Restoring parent shard {tenant_shard_id}");
// Drop any intents that refer to unavailable nodes, to enable this abort to proceed even
// if the original attachment location is offline.
if let Some(node_id) = shard.intent.get_attached() {
if !nodes.get(node_id).unwrap().is_available() {
tracing::info!("Demoting attached intent for {tenant_shard_id} on unavailable node {node_id}");
shard.intent.demote_attached(scheduler, *node_id);
}
}
for node_id in shard.intent.get_secondary().clone() {
if !nodes.get(&node_id).unwrap().is_available() {
tracing::info!("Dropping secondary intent for {tenant_shard_id} on unavailable node {node_id}");
shard.intent.remove_secondary(scheduler, node_id);
}
}
shard.splitting = SplitState::Idle;
if let Err(e) = shard.schedule(scheduler, &mut ScheduleContext::default()) {
// If this shard can't be scheduled now (perhaps due to offline nodes or
@@ -4389,15 +4432,13 @@ impl Service {
let mut child_state =
TenantShard::new(child, child_shard, policy.clone(), preferred_az.clone());
child_state.intent = IntentState::single(scheduler, Some(pageserver));
child_state.intent =
IntentState::single(scheduler, Some(pageserver), preferred_az.clone());
child_state.observed = ObservedState {
locations: child_observed,
};
child_state.generation = Some(generation);
child_state.config = config.clone();
if let Some(preferred_az) = &preferred_az {
child_state.set_preferred_az(preferred_az.clone());
}
// The child's TenantShard::splitting is intentionally left at the default value of Idle,
// as at this point in the split process we have succeeded and this part is infallible:
@@ -5014,6 +5055,8 @@ impl Service {
// If our new attached node was a secondary, it no longer should be.
shard.intent.remove_secondary(scheduler, migrate_req.node_id);
shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
// If we were already attached to something, demote that to a secondary
if let Some(old_attached) = old_attached {
if n > 0 {
@@ -5025,8 +5068,6 @@ impl Service {
shard.intent.push_secondary(scheduler, old_attached);
}
}
shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
}
PlacementPolicy::Secondary => {
shard.intent.clear(scheduler);
@@ -5055,6 +5096,69 @@ impl Service {
Ok(TenantShardMigrateResponse {})
}
pub(crate) async fn tenant_shard_migrate_secondary(
&self,
tenant_shard_id: TenantShardId,
migrate_req: TenantShardMigrateRequest,
) -> Result<TenantShardMigrateResponse, ApiError> {
let waiter = {
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, scheduler) = locked.parts_mut();
let Some(node) = nodes.get(&migrate_req.node_id) else {
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Node {} not found",
migrate_req.node_id
)));
};
if !node.is_available() {
// Warn but proceed: the caller may intend to manually adjust the placement of
// a shard even if the node is down, e.g. if intervening during an incident.
tracing::warn!("Migrating to unavailable node {node}");
}
let Some(shard) = tenants.get_mut(&tenant_shard_id) else {
return Err(ApiError::NotFound(
anyhow::anyhow!("Tenant shard not found").into(),
));
};
if shard.intent.get_secondary().len() == 1
&& shard.intent.get_secondary()[0] == migrate_req.node_id
{
tracing::info!(
"Migrating secondary to {node}: intent is unchanged {:?}",
shard.intent
);
} else if shard.intent.get_attached() == &Some(migrate_req.node_id) {
tracing::info!("Migrating secondary to {node}: already attached where we were asked to create a secondary");
} else {
let old_secondaries = shard.intent.get_secondary().clone();
for secondary in old_secondaries {
shard.intent.remove_secondary(scheduler, secondary);
}
shard.intent.push_secondary(scheduler, migrate_req.node_id);
shard.sequence = shard.sequence.next();
tracing::info!(
"Migrating secondary to {node}: new intent {:?}",
shard.intent
);
}
self.maybe_reconcile_shard(shard, nodes)
};
if let Some(waiter) = waiter {
waiter.wait_timeout(RECONCILE_TIMEOUT).await?;
} else {
tracing::info!("Migration is a no-op");
}
Ok(TenantShardMigrateResponse {})
}
/// 'cancel' in this context means cancel any ongoing reconcile
pub(crate) async fn tenant_shard_cancel_reconcile(
&self,
@@ -5256,7 +5360,8 @@ impl Service {
expect_nodes.sort_by_key(|n| n.node_id);
nodes.sort_by_key(|n| n.node_id);
if nodes != expect_nodes {
// Errors relating to nodes are deferred so that we don't skip the shard checks below if we have a node error
let node_result = if nodes != expect_nodes {
tracing::error!("Consistency check failed on nodes.");
tracing::error!(
"Nodes in memory: {}",
@@ -5268,10 +5373,12 @@ impl Service {
serde_json::to_string(&nodes)
.map_err(|e| ApiError::InternalServerError(e.into()))?
);
return Err(ApiError::InternalServerError(anyhow::anyhow!(
Err(ApiError::InternalServerError(anyhow::anyhow!(
"Node consistency failure"
)));
}
)))
} else {
Ok(())
};
let mut persistent_shards = self.persistence.load_active_tenant_shards().await?;
persistent_shards
@@ -5281,6 +5388,7 @@ impl Service {
if persistent_shards != expect_shards {
tracing::error!("Consistency check failed on shards.");
tracing::error!(
"Shards in memory: {}",
serde_json::to_string(&expect_shards)
@@ -5291,12 +5399,57 @@ impl Service {
serde_json::to_string(&persistent_shards)
.map_err(|e| ApiError::InternalServerError(e.into()))?
);
// The full dump log lines above are useful in testing, but in the field Grafana will
// usually just drop them because they're so large. So we also do some explicit logging
// of just the diffs.
let persistent_shards = persistent_shards
.into_iter()
.map(|tsp| (tsp.get_tenant_shard_id().unwrap(), tsp))
.collect::<HashMap<_, _>>();
let expect_shards = expect_shards
.into_iter()
.map(|tsp| (tsp.get_tenant_shard_id().unwrap(), tsp))
.collect::<HashMap<_, _>>();
for (tenant_shard_id, persistent_tsp) in &persistent_shards {
match expect_shards.get(tenant_shard_id) {
None => {
tracing::error!(
"Shard {} found in database but not in memory",
tenant_shard_id
);
}
Some(expect_tsp) => {
if expect_tsp != persistent_tsp {
tracing::error!(
"Shard {} is inconsistent. In memory: {}, database has: {}",
tenant_shard_id,
serde_json::to_string(expect_tsp).unwrap(),
serde_json::to_string(&persistent_tsp).unwrap()
);
}
}
}
}
// Having already logged any differences, log any shards that simply aren't present in the database
for (tenant_shard_id, memory_tsp) in &expect_shards {
if !persistent_shards.contains_key(tenant_shard_id) {
tracing::error!(
"Shard {} found in memory but not in database: {}",
tenant_shard_id,
serde_json::to_string(memory_tsp)
.map_err(|e| ApiError::InternalServerError(e.into()))?
);
}
}
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"Shard consistency failure"
)));
}
Ok(())
node_result
}
/// For debug/support: a JSON dump of the [`Scheduler`]. Returns a response so that
@@ -5600,7 +5753,7 @@ impl Service {
register_req.listen_http_port,
register_req.listen_pg_addr,
register_req.listen_pg_port,
register_req.availability_zone_id,
register_req.availability_zone_id.clone(),
);
// TODO: idempotency if the node already exists in the database
@@ -5620,8 +5773,9 @@ impl Service {
.set(locked.nodes.len() as i64);
tracing::info!(
"Registered pageserver {}, now have {} pageservers",
"Registered pageserver {} ({}), now have {} pageservers",
register_req.node_id,
register_req.availability_zone_id,
locked.nodes.len()
);
Ok(())
@@ -6236,7 +6390,7 @@ impl Service {
/// available. A return value of 0 indicates that everything is fully reconciled already.
fn reconcile_all(&self) -> usize {
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, _scheduler) = locked.parts_mut();
let (nodes, tenants, scheduler) = locked.parts_mut();
let pageservers = nodes.clone();
// This function is an efficient place to update lazy statistics, since we are walking
@@ -6297,6 +6451,9 @@ impl Service {
}
}
// Some metrics are calculated from SchedulerNode state, update these periodically
scheduler.update_metrics();
// Process any deferred tenant drops
for (tenant_id, guard) in drop_detached_tenants {
self.maybe_drop_tenant(tenant_id, &mut locked, &guard);
@@ -6355,6 +6512,7 @@ impl Service {
// Shard was dropped between planning and execution;
continue;
};
tracing::info!(tenant_shard_id=%tenant_shard_id, "Applying optimization: {optimization:?}");
if shard.apply_optimization(scheduler, optimization) {
optimizations_applied += 1;
if self.maybe_reconcile_shard(shard, nodes).is_some() {
@@ -6385,7 +6543,13 @@ impl Service {
let mut work = Vec::new();
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, scheduler) = locked.parts_mut();
let (_nodes, tenants, scheduler) = locked.parts_mut();
// We are going to plan a bunch of optimisations before applying any of them, so the
// utilisation stats on nodes will be effectively stale for the >1st optimisation we
// generate. To avoid this causing unstable migrations/flapping, it's important that the
// code in TenantShard for finding optimisations uses [`NodeSchedulingScore::for_optimization`]
// to ignore the utilisation component of the score.
for (_tenant_id, schedule_context, shards) in
TenantShardContextIterator::new(tenants, ScheduleMode::Speculative)
@@ -6416,13 +6580,28 @@ impl Service {
continue;
}
// TODO: optimization calculations are relatively expensive: create some fast-path for
// the common idle case (avoiding the search on tenants that we have recently checked)
// Fast path: we may quickly identify shards that don't have any possible optimisations
if !shard.maybe_optimizable(scheduler, &schedule_context) {
if cfg!(feature = "testing") {
// Check that maybe_optimizable doesn't disagree with the actual optimization functions.
// Only do this in testing builds because it is not a correctness-critical check, so we shouldn't
// panic in prod if we hit this, or spend cycles on it in prod.
assert!(shard
.optimize_attachment(scheduler, &schedule_context)
.is_none());
assert!(shard
.optimize_secondary(scheduler, &schedule_context)
.is_none());
}
continue;
}
if let Some(optimization) =
// If idle, maybe ptimize attachments: if a shard has a secondary location that is preferable to
// If idle, maybe optimize attachments: if a shard has a secondary location that is preferable to
// its primary location based on soft constraints, cut it over.
shard.optimize_attachment(nodes, &schedule_context)
shard.optimize_attachment(scheduler, &schedule_context)
{
tracing::info!(tenant_shard_id=%shard.tenant_shard_id, "Identified optimization for attachment: {optimization:?}");
work.push((shard.tenant_shard_id, optimization));
break;
} else if let Some(optimization) =
@@ -6432,6 +6611,7 @@ impl Service {
// in the same tenant with secondary locations on the node where they originally split.
shard.optimize_secondary(scheduler, &schedule_context)
{
tracing::info!(tenant_shard_id=%shard.tenant_shard_id, "Identified optimization for secondary: {optimization:?}");
work.push((shard.tenant_shard_id, optimization));
break;
}
@@ -6480,8 +6660,10 @@ impl Service {
}
}
}
ScheduleOptimizationAction::ReplaceSecondary(_) => {
// No extra checks needed to replace a secondary: this does not interrupt client access
ScheduleOptimizationAction::ReplaceSecondary(_)
| ScheduleOptimizationAction::CreateSecondary(_)
| ScheduleOptimizationAction::RemoveSecondary(_) => {
// No extra checks needed to manage secondaries: this does not interrupt client access
validated_work.push((tenant_shard_id, optimization))
}
};
@@ -6553,26 +6735,35 @@ impl Service {
/// we have this helper to move things along faster.
#[cfg(feature = "testing")]
async fn kick_secondary_download(&self, tenant_shard_id: TenantShardId) {
let (attached_node, secondary_node) = {
let (attached_node, secondaries) = {
let locked = self.inner.read().unwrap();
let Some(shard) = locked.tenants.get(&tenant_shard_id) else {
tracing::warn!(
"Skipping kick of secondary download for {tenant_shard_id}: not found"
);
return;
};
let (Some(attached), Some(secondary)) = (
shard.intent.get_attached(),
shard.intent.get_secondary().first(),
) else {
let Some(attached) = shard.intent.get_attached() else {
tracing::warn!(
"Skipping kick of secondary download for {tenant_shard_id}: no attached"
);
return;
};
(
locked.nodes.get(attached).unwrap().clone(),
locked.nodes.get(secondary).unwrap().clone(),
)
let secondaries = shard
.intent
.get_secondary()
.iter()
.map(|n| locked.nodes.get(n).unwrap().clone())
.collect::<Vec<_>>();
(locked.nodes.get(attached).unwrap().clone(), secondaries)
};
// Make remote API calls to upload + download heatmaps: we ignore errors because this is just
// a 'kick' to let scheduling optimisation run more promptly.
attached_node
match attached_node
.with_client_retries(
|client| async move { client.tenant_heatmap_upload(tenant_shard_id).await },
&self.config.jwt_token,
@@ -6581,22 +6772,57 @@ impl Service {
SHORT_RECONCILE_TIMEOUT,
&self.cancel,
)
.await;
.await
{
Some(Err(e)) => {
tracing::info!(
"Failed to upload heatmap from {attached_node} for {tenant_shard_id}: {e}"
);
}
None => {
tracing::info!(
"Cancelled while uploading heatmap from {attached_node} for {tenant_shard_id}"
);
}
Some(Ok(_)) => {
tracing::info!(
"Successfully uploaded heatmap from {attached_node} for {tenant_shard_id}"
);
}
}
secondary_node
.with_client_retries(
|client| async move {
client
.tenant_secondary_download(tenant_shard_id, Some(Duration::from_secs(1)))
.await
},
&self.config.jwt_token,
3,
10,
SHORT_RECONCILE_TIMEOUT,
&self.cancel,
)
.await;
for secondary_node in secondaries {
match secondary_node
.with_client_retries(
|client| async move {
client
.tenant_secondary_download(
tenant_shard_id,
Some(Duration::from_secs(1)),
)
.await
},
&self.config.jwt_token,
3,
10,
SHORT_RECONCILE_TIMEOUT,
&self.cancel,
)
.await
{
Some(Err(e)) => {
tracing::info!(
"Failed to download heatmap from {secondary_node} for {tenant_shard_id}: {e}"
);
}
None => {
tracing::info!("Cancelled while downloading heatmap from {secondary_node} for {tenant_shard_id}");
}
Some(Ok(progress)) => {
tracing::info!("Successfully downloaded heatmap from {secondary_node} for {tenant_shard_id}: {progress:?}");
}
}
}
}
/// Look for shards which are oversized and in need of splitting
@@ -7032,9 +7258,15 @@ impl Service {
fn fill_node_plan(&self, node_id: NodeId) -> Vec<TenantShardId> {
let mut locked = self.inner.write().unwrap();
let fill_requirement = locked.scheduler.compute_fill_requirement(node_id);
let (nodes, tenants, _scheduler) = locked.parts_mut();
let mut tids_by_node = locked
.tenants
let node_az = nodes
.get(&node_id)
.expect("Node must exist")
.get_availability_zone_id()
.clone();
let mut tids_by_node = tenants
.iter_mut()
.filter_map(|(tid, tenant_shard)| {
if !matches!(
@@ -7047,6 +7279,25 @@ impl Service {
return None;
}
// AZ check: when filling nodes after a restart, our intent is to move _back_ the
// shards which belong on this node, not to promote shards whose scheduling preference
// would be on their currently attached node. So we avoid promoting shards whose
// home AZ doesn't match the AZ of the node we're filling.
match tenant_shard.preferred_az() {
None => {
// Shard doesn't have an AZ preference: it is eligible to be moved.
}
Some(az) if az == &node_az => {
// This shard's home AZ matches the AZ of the node we're filling: it is
// eligible to be moved: fall through.
}
Some(_) => {
// This shard's home AZ is somewhere other than the AZ of the node we're filling:
// do not include it in the fill plan.
return None;
}
}
if tenant_shard.intent.get_secondary().contains(&node_id) {
if let Some(primary) = tenant_shard.intent.get_attached() {
return Some((*primary, *tid));

View File

@@ -43,9 +43,6 @@ impl<'a> Iterator for TenantShardContextIterator<'a> {
// Accumulate the schedule context for all the shards in a tenant
schedule_context.avoid(&shard.intent.all_pageservers());
if let Some(attached) = shard.intent.get_attached() {
schedule_context.push_attached(*attached);
}
tenant_shards.push(shard);
if tenant_shard_id.shard_number.0 == tenant_shard_id.shard_count.count() - 1 {
@@ -115,7 +112,7 @@ mod tests {
assert_eq!(tenant_id, t1_id);
assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0));
assert_eq!(shards.len(), 1);
assert_eq!(context.attach_count(), 1);
assert_eq!(context.location_count(), 2);
let (tenant_id, context, shards) = iter.next().unwrap();
assert_eq!(tenant_id, t2_id);
@@ -124,13 +121,13 @@ mod tests {
assert_eq!(shards[2].tenant_shard_id.shard_number, ShardNumber(2));
assert_eq!(shards[3].tenant_shard_id.shard_number, ShardNumber(3));
assert_eq!(shards.len(), 4);
assert_eq!(context.attach_count(), 4);
assert_eq!(context.location_count(), 8);
let (tenant_id, context, shards) = iter.next().unwrap();
assert_eq!(tenant_id, t3_id);
assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0));
assert_eq!(shards.len(), 1);
assert_eq!(context.attach_count(), 1);
assert_eq!(context.location_count(), 2);
for shard in tenants.values_mut() {
shard.intent.clear(&mut scheduler);

File diff suppressed because it is too large

View File

@@ -131,7 +131,6 @@ PAGESERVER_GLOBAL_METRICS: tuple[str, ...] = (
"pageserver_getpage_reconstruct_seconds_sum",
*[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]],
*histogram("pageserver_smgr_query_seconds_global"),
*histogram("pageserver_layers_visited_per_read_global"),
*histogram("pageserver_getpage_get_reconstruct_data_seconds"),
*histogram("pageserver_wait_lsn_seconds"),
*histogram("pageserver_remote_operation_seconds"),

View File

@@ -2,6 +2,7 @@ from __future__ import annotations
import concurrent.futures
import re
import threading
from pathlib import Path
import pytest
@@ -188,7 +189,20 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
check_pgbench_output(out_path)
with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads:
stop_pump = threading.Event()
def pump_controller():
# Run a background loop to force the storage controller to run its
# background work faster than it otherwise would: this helps
# us:
# A) to create a test that runs in a shorter time
# B) to create a more intensive test, by making the shard migrations that follow
# splits happen more rapidly.
while not stop_pump.is_set():
env.storage_controller.reconcile_all()
stop_pump.wait(0.1)
with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count + 1) as pgbench_threads:
pgbench_futs = []
for tenant_state in tenants.values():
fut = pgbench_threads.submit(run_pgbench_init, tenant_state.endpoint)
@@ -198,6 +212,8 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
for fut in pgbench_futs:
fut.result()
pump_fut = pgbench_threads.submit(pump_controller)
pgbench_futs = []
for tenant_state in tenants.values():
fut = pgbench_threads.submit(run_pgbench_main, tenant_state.endpoint)
@@ -207,6 +223,9 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
for fut in pgbench_futs:
fut.result()
stop_pump.set()
pump_fut.result()
def assert_all_split():
for tenant_id in tenants.keys():
shards = tenant_get_shards(env, tenant_id)

View File

@@ -13,11 +13,13 @@ from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
NeonPageserver,
PageserverAvailability,
PageserverSchedulingPolicy,
)
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
from fixtures.pg_version import PgVersion
from fixtures.utils import wait_until
def get_consistent_node_shard_counts(env: NeonEnv, total_shards) -> defaultdict[str, int]:
@@ -85,8 +87,12 @@ def test_storage_controller_many_tenants(
)
AZS = ["alpha", "bravo", "charlie"]
def az_selector(node_id):
return f"az-{AZS[(node_id - 1) % len(AZS)]}"
neon_env_builder.pageserver_config_override = lambda ps_cfg: ps_cfg.update(
{"availability_zone": f"az-{AZS[ps_cfg['id'] % len(AZS)]}"}
{"availability_zone": az_selector(ps_cfg["id"])}
)
# A small sleep on each call into the notify hook, to simulate the latency of doing a database write
@@ -168,6 +174,31 @@ def test_storage_controller_many_tenants(
log.info(f"Resident memory: {rss} ({ rss / total_shards} per shard)")
assert rss < expect_memory_per_shard * total_shards
def assert_all_tenants_scheduled_in_home_az():
for tenant_id in tenant_ids:
desc = env.storage_controller.tenant_describe(tenant_id)
preferred_az = None
for shard in desc["shards"]:
# All shards in a tenant should have the same preferred AZ
if preferred_az is None:
preferred_az = shard["preferred_az_id"]
else:
assert preferred_az == shard["preferred_az_id"]
# Attachment should be in the preferred AZ
assert shard["preferred_az_id"] == az_selector(
shard["node_attached"]
), f"Shard {shard['tenant_shard_id']} not in {shard['preferred_az_id']}"
# Secondary locations should not be in the preferred AZ
for node_secondary in shard["node_secondary"]:
assert (
shard["preferred_az_id"] != az_selector(node_secondary)
), f"Shard {shard['tenant_shard_id']} secondary should be in {shard['preferred_az_id']}"
# There should only be one secondary location (i.e. no migrations in flight)
assert len(shard["node_secondary"]) == 1
# Issue more concurrent operations than the storage controller's reconciler concurrency semaphore
# permits, to ensure that we are actually stressing that limit.
api_concurrency = 135
@@ -242,6 +273,22 @@ def test_storage_controller_many_tenants(
f"Created {len(tenants_with_timelines)} timelines in {time.time() - t1}, {len(tenants_with_timelines) / (time.time() - t1)}/s"
)
# Check initial scheduling
assert_all_tenants_scheduled_in_home_az()
az_attached_counts: defaultdict[str, int] = defaultdict(int)
az_secondary_counts: defaultdict[str, int] = defaultdict(int)
node_attached_counts: defaultdict[str, int] = defaultdict(int)
for tenant_id in tenants.keys():
desc = env.storage_controller.tenant_describe(tenant_id)
for shard in desc["shards"]:
az_attached_counts[az_selector(shard["node_attached"])] += 1
node_attached_counts[shard["node_attached"]] += 1
for node_secondary in shard["node_secondary"]:
az_secondary_counts[az_selector(node_secondary)] += 1
log.info(f"Initial node attached counts: {node_attached_counts}")
log.info(f"Initial AZ shard counts: {az_attached_counts}, {az_secondary_counts}")
# Plan operations: ensure each tenant with a timeline gets at least
# one of each operation type. Then add other tenants to make up the
# numbers.
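# A rough sketch of what such a plan could look like (hypothetical names, for
# illustration only; the real planning code follows later in the test):
#   ops_plan = []
#   for op_kind in ("migrate", "restart", "delete"):
#       for tenant_id in tenants_with_timelines:
#           ops_plan.append((tenant_id, op_kind))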
@@ -450,11 +497,77 @@ def test_storage_controller_many_tenants(
env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120)
env.storage_controller.consistency_check()
# Since we did `reconcile_until_idle` during the above loop, the system should be left in
# an optimally scheduled state. Validate that this includes all the tenants being scheduled
# in their home AZ.
assert_all_tenants_scheduled_in_home_az()
# Consistency check is safe here: restarting pageservers should not have caused any Reconcilers to spawn,
# as they were not offline long enough to trigger any scheduling changes.
env.storage_controller.consistency_check()
check_memory()
# Simulate loss of an AZ
victim_az = "az-alpha"
killed_pageservers = []
for ps in env.pageservers:
if az_selector(ps.id) == victim_az:
ps.stop(immediate=True)
killed_pageservers.append(ps)
log.info(f"Killed pageserver {ps.id}")
assert killed_pageservers
# Wait for the controller to notice the pageservers are dead
def assert_pageservers_availability(
pageservers: list[NeonPageserver], expected_availability: PageserverAvailability
):
nodes = env.storage_controller.nodes()
checked_any = False
node_ids = [ps.id for ps in pageservers]
for node in nodes:
if node["id"] in node_ids:
checked_any = True
assert (
node["availability"] == expected_availability
), f"Node {node['id']} is not {expected_availability} yet: {node['availability']}"
assert checked_any
wait_until(
lambda: assert_pageservers_availability(killed_pageservers, PageserverAvailability.OFFLINE),
timeout=60,
)
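# (Assumed contract of the wait_until helper: it repeatedly invokes the callable,
# treating a raised AssertionError as "not ready yet", until the call succeeds or
# the timeout elapses, at which point the last error is raised.)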
# Let the controller finish all its rescheduling
env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120)
# Check that all the tenants are rescheduled to the remaining pageservers
for tenant_id in tenant_ids:
desc = env.storage_controller.tenant_describe(tenant_id)
for shard in desc["shards"]:
# Attachment should be outside the AZ where we killed the pageservers
assert (
az_selector(shard["node_attached"]) != victim_az
), f"Shard {shard['tenant_shard_id']} still in {victim_az} (node {shard['node_attached']})"
# Bring back the pageservers
for ps in killed_pageservers:
ps.start()
wait_until(
lambda: assert_pageservers_availability(killed_pageservers, PageserverAvailability.ACTIVE),
timeout=60,
)
# A very long timeout is required: we will be migrating all the tenants on all the pageservers
# in the region that we just restored. Allow several times as long as it took to fill
# a single node.
env.storage_controller.reconcile_until_idle(
max_interval=0.1, timeout_secs=DRAIN_FILL_TIMEOUT * 4
)
assert_all_tenants_scheduled_in_home_az()
# Stop the storage controller before tearing down fixtures, because it otherwise might log
# errors trying to call our `ComputeReconfigure`.
env.storage_controller.stop()

View File

@@ -84,9 +84,6 @@ page_cache_size=10
log.info("Checking layer access metrics ...")
layer_access_metric_names = [
"pageserver_layers_visited_per_read_global_sum",
"pageserver_layers_visited_per_read_global_count",
"pageserver_layers_visited_per_read_global_bucket",
"pageserver_layers_visited_per_vectored_read_global_sum",
"pageserver_layers_visited_per_vectored_read_global_count",
"pageserver_layers_visited_per_vectored_read_global_bucket",
@@ -97,12 +94,6 @@ page_cache_size=10
layer_access_metrics = metrics.query_all(name)
log.info(f"Got metrics: {layer_access_metrics}")
non_vectored_sum = metrics.query_one("pageserver_layers_visited_per_read_global_sum")
non_vectored_count = metrics.query_one("pageserver_layers_visited_per_read_global_count")
if non_vectored_count.value != 0:
non_vectored_average = non_vectored_sum.value / non_vectored_count.value
else:
non_vectored_average = 0
vectored_sum = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_sum")
vectored_count = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_count")
if vectored_count.value > 0:
@@ -113,11 +104,10 @@ page_cache_size=10
assert vectored_sum.value == 0
vectored_average = 0
log.info(f"{non_vectored_average=} {vectored_average=}")
log.info(f"{vectored_average=}")
# The upper bound for average number of layer visits below (8)
# was chosen empirically for this workload.
assert non_vectored_average < 8
assert vectored_average < 8

View File

@@ -219,7 +219,7 @@ if SQL_EXPORTER is None:
#
# The "host" network mode allows sql_exporter to talk to the
# endpoint which is running on the host.
super().__init__("docker.io/burningalchemist/sql_exporter:0.16.0", network_mode="host")
super().__init__("docker.io/burningalchemist/sql_exporter:0.17.0", network_mode="host")
self.__logs_dir = logs_dir
self.__port = port
@@ -252,7 +252,7 @@ if SQL_EXPORTER is None:
log.info("Waiting for sql_exporter to be ready")
wait_for_logs(
self,
rf'level=info msg="Listening on" address=\[::\]:{self.__port}',
rf'msg="Listening on" address=\[::\]:{self.__port}',
timeout=5,
)
@@ -344,10 +344,7 @@ else:
time.sleep(0.5)
continue
if (
f'level=info msg="Listening on" address=[::]:{self._sql_exporter_port}'
in line
):
if f'msg="Listening on" address=[::]:{self._sql_exporter_port}' in line:
break
@override

View File

@@ -0,0 +1,44 @@
# NB: there are benchmarks that double as tests inside the `performance` directory.
import subprocess
from pathlib import Path
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder
def test_slow_flush(neon_env_builder: NeonEnvBuilder, neon_binpath: Path):
def patch_pageserver_toml(config):
config["page_service_pipelining"] = {
"mode": "pipelined",
"max_batch_size": 32,
"execution": "concurrent-futures",
}
neon_env_builder.pageserver_config_override = patch_pageserver_toml
env = neon_env_builder.init_start()
log.info("make flush appear slow")
log.info("filling pipe")
child = subprocess.Popen(
[
neon_binpath / "test_helper_slow_client_reads",
env.pageserver.connstr(),
str(env.initial_tenant),
str(env.initial_timeline),
],
bufsize=0, # unbuffered
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
)
assert child.stdout is not None
buf = child.stdout.read(1)
if len(buf) != 1:
raise Exception("unexpected EOF")
if buf != b"R":
raise Exception(f"unexpected data: {buf!r}")
log.info("helper reports pipe filled")
log.info("try to shut down the tenant")
env.pageserver.tenant_detach(env.initial_tenant)
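# (Assumed intent of the test: the detach above should complete promptly even though
# a slow client has filled the pipe and stalled the flush; if shutdown hung on the
# stalled flush, this call would block and the test would time out.)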

View File

@@ -187,7 +187,7 @@ def test_physical_replication_config_mismatch_too_many_known_xids(neon_simple_en
origin=primary,
endpoint_id="secondary",
config_lines=[
"max_connections=2",
"max_connections=5",
"autovacuum_max_workers=1",
"max_worker_processes=5",
"max_wal_senders=1",

View File

@@ -1,10 +1,15 @@
from __future__ import annotations
import asyncio
import ssl
import asyncpg
import pytest
import websocket_tunnel
import websockets
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonProxy
from fixtures.port_distributor import PortDistributor
@pytest.mark.asyncio
@@ -196,3 +201,53 @@ async def test_websockets_pipelined(static_proxy: NeonProxy):
# close
await websocket.send(b"X\x00\x00\x00\x04")
await websocket.wait_closed()
@pytest.mark.asyncio
async def test_websockets_tunneled(static_proxy: NeonProxy, port_distributor: PortDistributor):
static_proxy.safe_psql("create user ws_auth with password 'ws' superuser")
user = "ws_auth"
password = "ws"
ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
ssl_context.load_verify_locations(str(static_proxy.test_output_dir / "proxy.crt"))
# Launch a tunnel service so that we can speak the websockets protocol to
# the proxy
tunnel_port = port_distributor.get_port()
tunnel_server = await websocket_tunnel.start_server(
"127.0.0.1",
tunnel_port,
f"wss://{static_proxy.domain}:{static_proxy.external_http_port}/sql",
ssl_context,
)
log.info(f"websockets tunnel listening for connections on port {tunnel_port}")
async with tunnel_server:
async def run_tunnel():
try:
async with tunnel_server:
await tunnel_server.serve_forever()
except Exception as e:
log.error(f"Error in tunnel task: {e}")
tunnel_task = asyncio.create_task(run_tunnel())
# Ok, the tunnel is now running. Check that we can connect to the proxy's
# websocket interface, through the tunnel
tunnel_connstring = f"postgres://{user}:{password}@127.0.0.1:{tunnel_port}/postgres"
log.info(f"connecting to {tunnel_connstring}")
conn = await asyncpg.connect(tunnel_connstring)
res = await conn.fetchval("SELECT 123")
assert res == 123
await conn.close()
log.info("Ran a query successfully through the tunnel")
tunnel_server.close()
try:
await tunnel_task
except asyncio.CancelledError:
pass

View File

@@ -520,14 +520,18 @@ def test_sharding_split_smoke(
shard_count = 2
# Shard count we split into
split_shard_count = 4
# We will have 2 shards per pageserver once done (including secondaries)
neon_env_builder.num_pageservers = split_shard_count
# In preferred AZ & other AZ we will end up with one shard per pageserver
neon_env_builder.num_pageservers = split_shard_count * 2
# Two AZs
def assign_az(ps_cfg):
az = f"az-{(ps_cfg['id'] - 1) % 2}"
ps_cfg["availability_zone"] = az
# We will run more pageservers than tests usually do, so give them tiny page caches
# in case we're on a test node under memory pressure.
ps_cfg["page_cache_size"] = 128
neon_env_builder.pageserver_config_override = assign_az
# 1MiB stripes: enable getting some meaningful data distribution without
@@ -679,8 +683,8 @@ def test_sharding_split_smoke(
# - shard_count reconciles for the original setup of the tenant
# - shard_count reconciles for detaching the original secondary locations during split
# - split_shard_count reconciles during shard splitting, for setting up secondaries.
# - split_shard_count/2 of the child shards will need to fail over to their secondaries (since we have 8 shards and 4 pageservers, only 4 will move)
expect_reconciles = shard_count * 2 + split_shard_count + split_shard_count / 2
# - split_shard_count/2 reconciles to migrate shards to their temporary secondaries
expect_reconciles = shard_count * 2 + split_shard_count + 3 * (split_shard_count / 2)
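# Worked example with the constants above (shard_count=2, split_shard_count=4):
# expect_reconciles = 2 * 2 + 4 + 3 * (4 / 2) = 14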
reconcile_ok = env.storage_controller.get_metric_value(
"storage_controller_reconcile_complete_total", filter={"status": "ok"}
@@ -745,10 +749,14 @@ def test_sharding_split_smoke(
# dominated by shard count.
log.info(f"total: {total}")
assert total == {
1: 2,
2: 2,
3: 2,
4: 2,
1: 1,
2: 1,
3: 1,
4: 1,
5: 1,
6: 1,
7: 1,
8: 1,
}
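# Worked example: split_shard_count=4 child shards, each with one attached and one
# secondary location, gives 8 locations spread across the 8 pageservers, i.e. one
# location per pageserver, as asserted above.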
# The controller is not required to lay out the attached locations in any particular way, but
@@ -1387,13 +1395,7 @@ def test_sharding_split_failures(
else:
attached_count += 1
if exclude_ps_id is not None:
# For a node failure case, we expect there to be a secondary location
# scheduled on the offline node, so expect one fewer secondary in total
assert secondary_count == initial_shard_count - 1
else:
assert secondary_count == initial_shard_count
assert secondary_count == initial_shard_count
assert attached_count == initial_shard_count
def assert_split_done(exclude_ps_id: int | None = None) -> None:

View File

@@ -822,6 +822,122 @@ def test_storage_controller_stuck_compute_hook(
env.storage_controller.consistency_check()
@run_only_on_default_postgres("postgres behavior is not relevant")
def test_storage_controller_compute_hook_retry(
httpserver: HTTPServer,
neon_env_builder: NeonEnvBuilder,
httpserver_listen_address: ListenAddress,
):
"""
Test that when a reconciler can't do its compute hook notification, it will keep
trying until it succeeds.
Reproducer for https://github.com/neondatabase/cloud/issues/22612
"""
neon_env_builder.num_pageservers = 2
(host, port) = httpserver_listen_address
neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify"
handle_params = {"status": 200}
notifications = []
def handler(request: Request):
status = handle_params["status"]
log.info(f"Notify request[{status}]: {request}")
notifications.append(request.json)
return Response(status=status)
httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler)
# Start running
env = neon_env_builder.init_configs()
env.start()
tenant_id = TenantId.generate()
env.create_tenant(tenant_id, placement_policy='{"Attached": 1}')
# Initial notification from tenant creation
assert len(notifications) == 1
expect: dict[str, list[dict[str, int]] | str | None | int] = {
"tenant_id": str(tenant_id),
"stripe_size": None,
"shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}],
"preferred_az": DEFAULT_AZ_ID,
}
assert notifications[0] == expect
# Block notifications, and fail a node
handle_params["status"] = 423
env.pageservers[0].stop()
env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG)
env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS)
# Avoid waiting for heartbeats
env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"})
# Make reconciler run and fail: it should leave itself in a state where the shard will retry notification later,
# and we will check that that happens
notifications = []
try:
assert env.storage_controller.reconcile_all() == 1
except StorageControllerApiException as e:
assert "Control plane tenant busy" in str(e)
assert len(notifications) == 1
assert (
env.storage_controller.tenant_describe(tenant_id)["shards"][0][
"is_pending_compute_notification"
]
is True
)
# Try reconciling again, it should try notifying again
notifications = []
try:
assert env.storage_controller.reconcile_all() == 1
except StorageControllerApiException as e:
assert "Control plane tenant busy" in str(e)
assert len(notifications) == 1
assert (
env.storage_controller.tenant_describe(tenant_id)["shards"][0][
"is_pending_compute_notification"
]
is True
)
# The describe API should indicate that a notification is pending
assert (
env.storage_controller.tenant_describe(tenant_id)["shards"][0][
"is_pending_compute_notification"
]
is True
)
# Unblock notifications: reconcile should work now
handle_params["status"] = 200
notifications = []
assert env.storage_controller.reconcile_all() == 1
assert len(notifications) == 1
assert (
env.storage_controller.tenant_describe(tenant_id)["shards"][0][
"is_pending_compute_notification"
]
is False
)
# Reconciler should be idle now that it succeeded in its compute notification
notifications = []
assert env.storage_controller.reconcile_all() == 0
assert len(notifications) == 0
assert (
env.storage_controller.tenant_describe(tenant_id)["shards"][0][
"is_pending_compute_notification"
]
is False
)
@run_only_on_default_postgres("this test doesn't start an endpoint")
def test_storage_controller_compute_hook_revert(
httpserver: HTTPServer,
@@ -936,7 +1052,7 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder):
that just hits the endpoints to check that they don't bitrot.
"""
neon_env_builder.num_pageservers = 2
neon_env_builder.num_pageservers = 3
env = neon_env_builder.init_start()
tenant_id = TenantId.generate()
@@ -961,7 +1077,7 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder):
"GET", f"{env.storage_controller_api}/debug/v1/scheduler"
)
# Three nodes, in a dict of node_id->node
assert len(response.json()["nodes"]) == 2
assert len(response.json()["nodes"]) == 3
assert sum(v["shard_count"] for v in response.json()["nodes"].values()) == 3
assert all(v["may_schedule"] for v in response.json()["nodes"].values())
@@ -972,13 +1088,25 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder):
headers=env.storage_controller.headers(TokenScope.ADMIN),
)
# Secondary migration API: superficial check that it migrates
secondary_dest = env.pageservers[2].id
env.storage_controller.request(
"PUT",
f"{env.storage_controller_api}/control/v1/tenant/{tenant_id}-0002/migrate_secondary",
headers=env.storage_controller.headers(TokenScope.ADMIN),
json={"tenant_shard_id": f"{tenant_id}-0002", "node_id": secondary_dest},
)
assert env.storage_controller.tenant_describe(tenant_id)["shards"][0]["node_secondary"] == [
secondary_dest
]
# Node unclean drop API
response = env.storage_controller.request(
"POST",
f"{env.storage_controller_api}/debug/v1/node/{env.pageservers[1].id}/drop",
headers=env.storage_controller.headers(TokenScope.ADMIN),
)
assert len(env.storage_controller.node_list()) == 1
assert len(env.storage_controller.node_list()) == 2
# Tenant unclean drop API
response = env.storage_controller.request(
@@ -1696,7 +1824,13 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder):
"""
output_dir = neon_env_builder.test_output_dir
shard_count = 4
env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
neon_env_builder.num_pageservers = 2
env = neon_env_builder.init_configs()
env.start()
tenant_id = TenantId.generate()
env.create_tenant(tenant_id, placement_policy='{"Attached":1}', shard_count=shard_count)
base_args = [env.neon_binpath / "storcon_cli", "--api", env.storage_controller_api]
def storcon_cli(args):
@@ -1725,7 +1859,7 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder):
# List nodes
node_lines = storcon_cli(["nodes"])
# Table header, footer, and one line of data
assert len(node_lines) == 5
assert len(node_lines) == 7
assert "localhost" in node_lines[3]
# Pause scheduling onto a node
@@ -1743,10 +1877,21 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder):
storcon_cli(["node-configure", "--node-id", "1", "--availability", "offline"])
assert "Offline" in storcon_cli(["nodes"])[3]
# Restore node, verify status changes in CLI output
env.pageservers[0].start()
def is_online():
assert "Offline" not in storcon_cli(["nodes"])
wait_until(is_online)
# Let everything stabilize after node failure to avoid interfering with subsequent steps
env.storage_controller.reconcile_until_idle(timeout_secs=10)
# List tenants
tenant_lines = storcon_cli(["tenants"])
assert len(tenant_lines) == 5
assert str(env.initial_tenant) in tenant_lines[3]
assert str(tenant_id) in tenant_lines[3]
# Setting scheduling policies intentionally results in warnings; they're for rare use.
env.storage_controller.allowed_errors.extend(
@@ -1754,23 +1899,58 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder):
)
# Describe a tenant
tenant_lines = storcon_cli(["tenant-describe", "--tenant-id", str(env.initial_tenant)])
tenant_lines = storcon_cli(["tenant-describe", "--tenant-id", str(tenant_id)])
assert len(tenant_lines) >= 3 + shard_count * 2
assert str(env.initial_tenant) in tenant_lines[0]
assert str(tenant_id) in tenant_lines[0]
# Migrate an attached location
def other_ps_id(current_ps_id):
return (
env.pageservers[0].id
if current_ps_id == env.pageservers[1].id
else env.pageservers[1].id
)
storcon_cli(
[
"tenant-shard-migrate",
"--tenant-shard-id",
f"{tenant_id}-0004",
"--node",
str(
other_ps_id(
env.storage_controller.tenant_describe(tenant_id)["shards"][0]["node_attached"]
)
),
]
)
# Migrate a secondary location
storcon_cli(
[
"tenant-shard-migrate-secondary",
"--tenant-shard-id",
f"{tenant_id}-0004",
"--node",
str(
other_ps_id(
env.storage_controller.tenant_describe(tenant_id)["shards"][0][
"node_secondary"
][0]
)
),
]
)
# Pause changes on a tenant
storcon_cli(["tenant-policy", "--tenant-id", str(env.initial_tenant), "--scheduling", "stop"])
storcon_cli(["tenant-policy", "--tenant-id", str(tenant_id), "--scheduling", "stop"])
assert "Stop" in storcon_cli(["tenants"])[3]
# Cancel ongoing reconcile on a tenant
storcon_cli(
["tenant-shard-cancel-reconcile", "--tenant-shard-id", f"{env.initial_tenant}-0104"]
)
storcon_cli(["tenant-shard-cancel-reconcile", "--tenant-shard-id", f"{tenant_id}-0104"])
# Change a tenant's placement
storcon_cli(
["tenant-policy", "--tenant-id", str(env.initial_tenant), "--placement", "secondary"]
)
storcon_cli(["tenant-policy", "--tenant-id", str(tenant_id), "--placement", "secondary"])
assert "Secondary" in storcon_cli(["tenants"])[3]
# Modify a tenant's config
@@ -1778,7 +1958,7 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder):
[
"patch-tenant-config",
"--tenant-id",
str(env.initial_tenant),
str(tenant_id),
"--config",
json.dumps({"pitr_interval": "1m"}),
]
@@ -3033,11 +3213,12 @@ def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool:
@run_only_on_default_postgres("this is like a 'unit test' against storcon db")
def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder):
def assign_az(ps_cfg):
az = f"az-{ps_cfg['id']}"
az = f"az-{ps_cfg['id'] % 2}"
log.info("Assigned AZ {az}")
ps_cfg["availability_zone"] = az
neon_env_builder.pageserver_config_override = assign_az
neon_env_builder.num_pageservers = 2
neon_env_builder.num_pageservers = 4
env = neon_env_builder.init_configs()
env.start()
@@ -3052,8 +3233,14 @@ def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder):
assert shards[0]["preferred_az_id"] == expected_az
# When all other schedule scoring parameters are equal, tenants should round-robin on AZs
assert env.storage_controller.tenant_describe(tids[0])["shards"][0]["preferred_az_id"] == "az-0"
assert env.storage_controller.tenant_describe(tids[1])["shards"][0]["preferred_az_id"] == "az-1"
assert env.storage_controller.tenant_describe(tids[2])["shards"][0]["preferred_az_id"] == "az-0"
# Try modifying preferred AZ
updated = env.storage_controller.set_preferred_azs(
{TenantShardId(tid, 0, 0): "foo" for tid in tids}
{TenantShardId(tid, 0, 0): "az-0" for tid in tids}
)
assert set(updated) == set([TenantShardId(tid, 0, 0) for tid in tids])
@@ -3061,29 +3248,24 @@ def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder):
for tid in tids:
shards = env.storage_controller.tenant_describe(tid)["shards"]
assert len(shards) == 1
assert shards[0]["preferred_az_id"] == "foo"
assert shards[0]["preferred_az_id"] == "az-0"
# Generate a layer so that shard split handling on the pageserver doesn't trip
# over a debug assert.
timeline_id = TimelineId.generate()
env.create_timeline("bar", tids[0], timeline_id)
workload = Workload(env, tids[0], timeline_id, branch_name="bar")
workload.init()
workload.write_rows(256)
workload.validate()
# Having modified preferred AZ, we should get moved there
env.storage_controller.reconcile_until_idle(max_interval=0.1)
for tid in tids:
shard = env.storage_controller.tenant_describe(tid)["shards"][0]
attached_to = shard["node_attached"]
attached_in_az = env.get_pageserver(attached_to).az_id
assert shard["preferred_az_id"] == attached_in_az == "az-0"
env.storage_controller.tenant_shard_split(tids[0], shard_count=2)
env.storage_controller.reconcile_until_idle(max_interval=0.1)
shards = env.storage_controller.tenant_describe(tids[0])["shards"]
assert len(shards) == 2
for shard in shards:
attached_to = shard["node_attached"]
expected_az = env.get_pageserver(attached_to).az_id
# The scheduling optimization logic is not yet AZ-aware, so doesn't succeed
# in putting the tenant shards in the preferred AZ.
# To be fixed in https://github.com/neondatabase/neon/pull/9916
# assert shard["preferred_az_id"] == expected_az
attached_in_az = env.get_pageserver(attached_to).az_id
assert shard["preferred_az_id"] == attached_in_az == "az-0"
@run_only_on_default_postgres("Postgres version makes no difference here")

test_runner/websocket_tunnel.py Executable file
View File

@@ -0,0 +1,154 @@
#!/usr/bin/env python3
#
# This program helps to test the WebSocket tunneling in the proxy. It listens for a TCP
# connection on a port, and when you connect to it, it opens a WebSocket connection
# to the proxy and forwards all the traffic over it, wrapped in WebSocket binary frames.
#
# This is used in the test_proxy::test_websockets test, but it is handy for manual testing too.
#
# Usage for manual testing:
#
# ## Launch Postgres on port 3000:
# postgres -D data -p3000
#
# ## Launch proxy with WSS enabled:
# openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj '/CN=*.neon.localtest.me'
# ./target/debug/proxy --wss 127.0.0.1:40433 --http 127.0.0.1:28080 --mgmt 127.0.0.1:9099 --proxy 127.0.0.1:4433 --tls-key server.key --tls-cert server.crt --auth-backend postgres
#
# ## Launch the tunnel:
#
# poetry run ./test_runner/websocket_tunnel.py --ws-port 40433 --ws-url "wss://ep-test.neon.localtest.me"
#
# ## Now you can connect with psql:
# psql "postgresql://heikki@localhost:40433/postgres"
#
import argparse
import asyncio
import logging
import ssl
from ssl import Purpose
import websockets
from fixtures.log_helper import log
# Enable verbose logging of all the traffic
def enable_verbose_logging():
logger = logging.getLogger("websockets")
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler())
async def start_server(tcp_listen_host, tcp_listen_port, ws_url, ctx):
server = await asyncio.start_server(
lambda r, w: handle_client(r, w, ws_url, ctx), tcp_listen_host, tcp_listen_port
)
return server
async def handle_tcp_to_websocket(tcp_reader, ws):
try:
while not tcp_reader.at_eof():
data = await tcp_reader.read(1024)
await ws.send(data)
except websockets.exceptions.ConnectionClosedError as e:
log.debug(f"connection closed: {e}")
except websockets.exceptions.ConnectionClosedOK:
log.debug("connection closed")
except Exception as e:
log.error(e)
async def handle_websocket_to_tcp(ws, tcp_writer):
try:
async for message in ws:
tcp_writer.write(message)
await tcp_writer.drain()
except websockets.exceptions.ConnectionClosedError as e:
log.debug(f"connection closed: {e}")
except websockets.exceptions.ConnectionClosedOK:
log.debug("connection closed")
except Exception as e:
log.error(e)
async def handle_client(tcp_reader, tcp_writer, ws_url: str, ctx: ssl.SSLContext):
try:
log.info("Received TCP connection. Connecting to websockets proxy.")
async with websockets.connect(ws_url, ssl=ctx) as ws:
try:
log.info("Connected to websockets proxy")
async with asyncio.TaskGroup() as tg:
task1 = tg.create_task(handle_tcp_to_websocket(tcp_reader, ws))
task2 = tg.create_task(handle_websocket_to_tcp(ws, tcp_writer))
done, pending = await asyncio.wait(
[task1, task2], return_when=asyncio.FIRST_COMPLETED
)
tcp_writer.close()
await ws.close()
except* Exception as ex:
log.error(ex.exceptions)
except Exception as e:
log.error(e)
async def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--tcp-listen-addr",
default="localhost",
help="TCP addr to listen on",
)
parser.add_argument(
"--tcp-listen-port",
default="40444",
help="TCP port to listen on",
)
parser.add_argument(
"--ws-url",
default="wss://localhost/",
help="websocket URL to connect to. This determines the Host header sent to the server",
)
parser.add_argument(
"--ws-host",
default="127.0.0.1",
help="websockets host to connect to",
)
parser.add_argument(
"--ws-port",
type=int,
default=443,
help="websockets port to connect to",
)
parser.add_argument(
"--verbose",
action="store_true",
help="enable verbose logging",
)
args = parser.parse_args()
if args.verbose:
enable_verbose_logging()
ctx = ssl.create_default_context(Purpose.SERVER_AUTH)
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
server = await start_server(args.tcp_listen_addr, args.tcp_listen_port, args.ws_url, ctx)
print(
f"Listening for connections at {args.tcp_listen_addr}:{args.tcp_listen_port}, forwarding them to {args.ws_host}:{args.ws_port}"
)
async with server:
await server.serve_forever()
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,18 +1,18 @@
{
"v17": [
"17.2",
"7e3f3974bc8895938308f94d0e96879ffae638cd"
"0f8da73ed08d4fc4ee58cccea008c75bfb20baa8"
],
"v16": [
"16.6",
"97f9fde349c6de6d573f5ce96db07eca60ce6185"
"f63b141cfb0c813725a6b2574049565bff643018"
],
"v15": [
"15.10",
"f262d631ad477a1819e84a183e5a7ef561830085"
"d3141e17a7155e3d07c8deba4a10c748a29ba1e6"
],
"v14": [
"14.15",
"c2f65b3201591e02ce45b66731392f98d3388e73"
"210a0ba3afd8134ea910b203f274b165bd4f05d7"
]
}