mirror of
https://github.com/neondatabase/neon.git
synced 2026-02-23 12:30:38 +00:00
Compare commits
50 Commits
problame/b
...
sk-members
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c811ae0b91 | ||
|
|
777afbafe5 | ||
|
|
0d0cd16ea2 | ||
|
|
217309c7ef | ||
|
|
976afcee26 | ||
|
|
20e974ecdd | ||
|
|
eb43f65055 | ||
|
|
96d67abd50 | ||
|
|
f7485c4459 | ||
|
|
4ea7b22537 | ||
|
|
10a7878230 | ||
|
|
c19a8b69f2 | ||
|
|
8be17724d8 | ||
|
|
234c3a29df | ||
|
|
db5513076a | ||
|
|
70d4e077a6 | ||
|
|
ae9db8975a | ||
|
|
05a71c7d6a | ||
|
|
b9464865b6 | ||
|
|
1577430408 | ||
|
|
05d17a10ae | ||
|
|
2d0ea08524 | ||
|
|
c98cbbeac1 | ||
|
|
47c1640acc | ||
|
|
6debb49b87 | ||
|
|
e58e29e639 | ||
|
|
d36112d20f | ||
|
|
ffaa52ff5d | ||
|
|
aa7323a384 | ||
|
|
2466a2f977 | ||
|
|
9bdb14c1c0 | ||
|
|
df4abd8b14 | ||
|
|
a039f8381f | ||
|
|
430b556b34 | ||
|
|
1783501eaa | ||
|
|
fd1368d31e | ||
|
|
e9ed53b14f | ||
|
|
a338aee132 | ||
|
|
96243af651 | ||
|
|
ef8bfacd6b | ||
|
|
ceacc29609 | ||
|
|
b31ed0acd1 | ||
|
|
b2d0e1a519 | ||
|
|
d1bc36f536 | ||
|
|
0b9032065e | ||
|
|
09fe3b025c | ||
|
|
12053cf832 | ||
|
|
de199d71e1 | ||
|
|
22a6460010 | ||
|
|
cd982a82ec |
1
.github/actionlint.yml
vendored
1
.github/actionlint.yml
vendored
@@ -25,3 +25,4 @@ config-variables:
|
||||
- PGREGRESS_PG17_PROJECT_ID
|
||||
- SLACK_ON_CALL_QA_STAGING_STREAM
|
||||
- DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN
|
||||
- SLACK_ON_CALL_STORAGE_STAGING_STREAM
|
||||
|
||||
25
.github/workflows/build_and_test.yml
vendored
25
.github/workflows/build_and_test.yml
vendored
@@ -346,25 +346,22 @@ jobs:
|
||||
# XXX: no coverage data handling here, since benchmarks are run on release builds,
|
||||
# while coverage is currently collected for the debug ones
|
||||
|
||||
report-benchmarks-failures:
|
||||
report-benchmarks-results-to-slack:
|
||||
needs: [ benchmarks, create-test-report ]
|
||||
if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure'
|
||||
permissions:
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
statuses: write
|
||||
contents: write
|
||||
pull-requests: write
|
||||
if: github.ref_name == 'main' && !cancelled() && contains(fromJSON('["success", "failure"]'), needs.benchmarks.result)
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- uses: slackapi/slack-github-action@v1
|
||||
- uses: slackapi/slack-github-action@v2
|
||||
with:
|
||||
channel-id: C060CNA47S9 # on-call-staging-storage-stream
|
||||
slack-message: |
|
||||
Benchmarks failed on main <${{ github.event.head_commit.url }}|${{ github.sha }}>
|
||||
<${{ needs.create-test-report.outputs.report-url }}|Allure report>
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
method: chat.postMessage
|
||||
token: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
payload: |
|
||||
channel: "${{ vars.SLACK_ON_CALL_STORAGE_STAGING_STREAM }}"
|
||||
text: |
|
||||
Benchmarks on main: *${{ needs.benchmarks.result }}*
|
||||
- <${{ needs.create-test-report.outputs.report-url }}|Allure report>
|
||||
- <${{ github.event.head_commit.url }}|${{ github.sha }}>
|
||||
|
||||
create-test-report:
|
||||
needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image, benchmarks ]
|
||||
|
||||
67
Cargo.lock
generated
67
Cargo.lock
generated
@@ -1605,6 +1605,32 @@ dependencies = [
|
||||
"typenum",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "curve25519-dalek"
|
||||
version = "4.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "97fb8b7c4503de7d6ae7b42ab72a5a59857b4c937ec27a3d4539dba95b5ab2be"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"cpufeatures",
|
||||
"curve25519-dalek-derive",
|
||||
"digest",
|
||||
"fiat-crypto",
|
||||
"rustc_version",
|
||||
"subtle",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "curve25519-dalek-derive"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "darling"
|
||||
version = "0.20.1"
|
||||
@@ -1875,6 +1901,28 @@ dependencies = [
|
||||
"spki 0.7.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ed25519"
|
||||
version = "2.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "115531babc129696a58c64a4fef0a8bf9e9698629fb97e9e40767d235cfbcd53"
|
||||
dependencies = [
|
||||
"signature 2.2.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ed25519-dalek"
|
||||
version = "2.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4a3daa8e81a3963a60642bcc1f90a670680bd4a77535faa384e9d1c79d620871"
|
||||
dependencies = [
|
||||
"curve25519-dalek",
|
||||
"ed25519",
|
||||
"rand_core 0.6.4",
|
||||
"sha2",
|
||||
"subtle",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "either"
|
||||
version = "1.8.1"
|
||||
@@ -2113,6 +2161,12 @@ dependencies = [
|
||||
"subtle",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fiat-crypto"
|
||||
version = "0.2.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d"
|
||||
|
||||
[[package]]
|
||||
name = "filetime"
|
||||
version = "0.2.22"
|
||||
@@ -3990,6 +4044,7 @@ dependencies = [
|
||||
"postgres_connection",
|
||||
"postgres_ffi",
|
||||
"postgres_initdb",
|
||||
"pprof",
|
||||
"pq_proto",
|
||||
"procfs",
|
||||
"rand 0.8.5",
|
||||
@@ -4745,6 +4800,7 @@ dependencies = [
|
||||
"consumption_metrics",
|
||||
"dashmap 5.5.0",
|
||||
"ecdsa 0.16.9",
|
||||
"ed25519-dalek",
|
||||
"env_logger 0.10.2",
|
||||
"fallible-iterator",
|
||||
"flate2",
|
||||
@@ -5651,10 +5707,12 @@ dependencies = [
|
||||
name = "safekeeper_api"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"const_format",
|
||||
"postgres_ffi",
|
||||
"pq_proto",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tokio",
|
||||
"utils",
|
||||
]
|
||||
@@ -7502,12 +7560,21 @@ dependencies = [
|
||||
"anyhow",
|
||||
"async-compression",
|
||||
"bytes",
|
||||
"camino",
|
||||
"camino-tempfile",
|
||||
"criterion",
|
||||
"futures",
|
||||
"pageserver_api",
|
||||
"postgres_ffi",
|
||||
"pprof",
|
||||
"prost",
|
||||
"remote_storage",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"thiserror",
|
||||
"tikv-jemallocator",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"tonic",
|
||||
"tonic-build",
|
||||
"tracing",
|
||||
|
||||
@@ -71,6 +71,7 @@ RUN set -e \
|
||||
ca-certificates \
|
||||
# System postgres for use with client libraries (e.g. in storage controller)
|
||||
postgresql-15 \
|
||||
openssl \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
|
||||
&& useradd -d /data neon \
|
||||
&& chown -R neon:neon /data
|
||||
|
||||
3
Makefile
3
Makefile
@@ -3,7 +3,6 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
|
||||
# Where to install Postgres, default is ./pg_install, maybe useful for package managers
|
||||
POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/
|
||||
|
||||
OPENSSL_PREFIX_DIR := /usr/local/openssl
|
||||
ICU_PREFIX_DIR := /usr/local/icu
|
||||
|
||||
#
|
||||
@@ -26,11 +25,9 @@ endif
|
||||
ifeq ($(shell test -e /home/nonroot/.docker_build && echo -n yes),yes)
|
||||
# Exclude static build openssl, icu for local build (MacOS, Linux)
|
||||
# Only keep for build type release and debug
|
||||
PG_CFLAGS += -I$(OPENSSL_PREFIX_DIR)/include
|
||||
PG_CONFIGURE_OPTS += --with-icu
|
||||
PG_CONFIGURE_OPTS += ICU_CFLAGS='-I/$(ICU_PREFIX_DIR)/include -DU_STATIC_IMPLEMENTATION'
|
||||
PG_CONFIGURE_OPTS += ICU_LIBS='-L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -licui18n -licuuc -licudata -lstdc++ -Wl,-Bdynamic -lm'
|
||||
PG_CONFIGURE_OPTS += LDFLAGS='-L$(OPENSSL_PREFIX_DIR)/lib -L$(OPENSSL_PREFIX_DIR)/lib64 -L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -Wl,-Bstatic -lssl -lcrypto -Wl,-Bdynamic -lrt -lm -ldl -lpthread'
|
||||
endif
|
||||
|
||||
UNAME_S := $(shell uname -s)
|
||||
|
||||
@@ -115,7 +115,7 @@ RUN set -e \
|
||||
|
||||
# Keep the version the same as in compute/compute-node.Dockerfile and
|
||||
# test_runner/regress/test_compute_metrics.py.
|
||||
ENV SQL_EXPORTER_VERSION=0.16.0
|
||||
ENV SQL_EXPORTER_VERSION=0.17.0
|
||||
RUN curl -fsSL \
|
||||
"https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \
|
||||
--output sql_exporter.tar.gz \
|
||||
@@ -190,21 +190,6 @@ RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JS
|
||||
&& make install \
|
||||
&& rm -rf ../lcov.tar.gz
|
||||
|
||||
# Compile and install the static OpenSSL library
|
||||
ENV OPENSSL_VERSION=1.1.1w
|
||||
ENV OPENSSL_PREFIX=/usr/local/openssl
|
||||
RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz && \
|
||||
echo "cf3098950cb4d853ad95c0841f1f9c6d3dc102dccfcacd521d93925208b76ac8 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \
|
||||
cd /tmp && \
|
||||
tar xzvf /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \
|
||||
rm /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \
|
||||
cd /tmp/openssl-${OPENSSL_VERSION} && \
|
||||
./config --prefix=${OPENSSL_PREFIX} -static --static no-shared -fPIC && \
|
||||
make -j "$(nproc)" && \
|
||||
make install && \
|
||||
cd /tmp && \
|
||||
rm -rf /tmp/openssl-${OPENSSL_VERSION}
|
||||
|
||||
# Use the same version of libicu as the compute nodes so that
|
||||
# clusters created using inidb on pageserver can be used by computes.
|
||||
#
|
||||
|
||||
@@ -104,16 +104,18 @@ RUN cd postgres && \
|
||||
esac; \
|
||||
done;
|
||||
|
||||
# Set PATH for all the subsequent build steps
|
||||
ENV PATH="/usr/local/pgsql/bin:$PATH"
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "postgis-build"
|
||||
# Build PostGIS from the upstream PostGIS mirror.
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS postgis-build
|
||||
FROM pg-build AS postgis-build
|
||||
ARG DEBIAN_VERSION
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
RUN apt update && \
|
||||
apt install --no-install-recommends --no-install-suggests -y \
|
||||
gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \
|
||||
@@ -151,8 +153,6 @@ RUN case "${DEBIAN_VERSION}" in \
|
||||
DESTDIR=/sfcgal ninja install -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
ninja clean && cp -R /sfcgal/* /
|
||||
|
||||
ENV PATH="/usr/local/pgsql/bin:$PATH"
|
||||
|
||||
# Postgis 3.5.0 supports v17
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v17") \
|
||||
@@ -170,7 +170,6 @@ RUN case "${PG_VERSION}" in \
|
||||
wget https://download.osgeo.org/postgis/source/postgis-${POSTGIS_VERSION}.tar.gz -O postgis.tar.gz && \
|
||||
echo "${POSTGIS_CHECKSUM} postgis.tar.gz" | sha256sum --check && \
|
||||
mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
|
||||
./autogen.sh && \
|
||||
./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
@@ -220,11 +219,7 @@ RUN case "${PG_VERSION}" in \
|
||||
cmake -GNinja -DCMAKE_BUILD_TYPE=Release .. && \
|
||||
ninja -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
ninja -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
|
||||
cp /usr/local/pgsql/share/extension/pgrouting.control /extensions/postgis && \
|
||||
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
|
||||
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/postgis.tar.zst -T -
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -232,9 +227,8 @@ RUN case "${PG_VERSION}" in \
|
||||
# Build plv8
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS plv8-build
|
||||
FROM pg-build AS plv8-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
COPY compute/patches/plv8-3.1.10.patch /plv8-3.1.10.patch
|
||||
|
||||
@@ -269,7 +263,6 @@ RUN case "${PG_VERSION}" in \
|
||||
# generate and copy upgrade scripts
|
||||
mkdir -p upgrade && ./generate_upgrade.sh ${PLV8_TAG#v} && \
|
||||
cp upgrade/* /usr/local/pgsql/share/extension/ && \
|
||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||
make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
rm -rf /plv8-* && \
|
||||
find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \
|
||||
@@ -296,9 +289,8 @@ RUN case "${PG_VERSION}" in \
|
||||
# Build h3_pg
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS h3-pg-build
|
||||
FROM pg-build AS h3-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# not version-specific
|
||||
# last release v4.1.0 - Jan 18, 2023
|
||||
@@ -319,7 +311,6 @@ RUN mkdir -p /h3/usr/ && \
|
||||
RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \
|
||||
echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \
|
||||
mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . && \
|
||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \
|
||||
@@ -331,17 +322,16 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3
|
||||
# compile unit extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS unit-pg-build
|
||||
FROM pg-build AS unit-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# not version-specific
|
||||
# last release 7.9 - Sep 15, 2024
|
||||
RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz -O postgresql-unit.tar.gz && \
|
||||
echo "e46de6245dcc8b2c2ecf29873dbd43b2b346773f31dd5ce4b8315895a052b456 postgresql-unit.tar.gz" | sha256sum --check && \
|
||||
mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
# unit extension's "create extension" script relies on absolute install path to fill some reference tables.
|
||||
# We move the extension from '/usr/local/pgsql/' to '/usr/local/' after it is build. So we need to adjust the path.
|
||||
# This one-liner removes pgsql/ part of the path.
|
||||
@@ -355,9 +345,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz -
|
||||
# compile pgvector extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS vector-pg-build
|
||||
FROM pg-build AS vector-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
COPY compute/patches/pgvector.patch /pgvector.patch
|
||||
|
||||
@@ -371,8 +360,8 @@ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O
|
||||
echo "867a2c328d4928a5a9d6f052cd3bc78c7d60228a9b914ad32aa3db88e9de27b0 pgvector.tar.gz" | sha256sum --check && \
|
||||
mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \
|
||||
patch -p1 < /pgvector.patch && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
|
||||
|
||||
#########################################################################################
|
||||
@@ -381,16 +370,15 @@ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O
|
||||
# compile pgjwt extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pgjwt-pg-build
|
||||
FROM pg-build AS pgjwt-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# not version-specific
|
||||
# doesn't use releases, last commit f3d82fd - Mar 2, 2023
|
||||
RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71c20689ce3d.tar.gz -O pgjwt.tar.gz && \
|
||||
echo "dae8ed99eebb7593b43013f6532d772b12dfecd55548d2673f2dfd0163f6d2b9 pgjwt.tar.gz" | sha256sum --check && \
|
||||
mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control
|
||||
|
||||
#########################################################################################
|
||||
@@ -399,17 +387,16 @@ RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71
|
||||
# compile hypopg extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS hypopg-pg-build
|
||||
FROM pg-build AS hypopg-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# HypoPG 1.4.1 supports v17
|
||||
# last release 1.4.1 - Apr 28, 2024
|
||||
RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypopg.tar.gz && \
|
||||
echo "9afe6357fd389d8d33fad81703038ce520b09275ec00153c6c89282bcdedd6bc hypopg.tar.gz" | sha256sum --check && \
|
||||
mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control
|
||||
|
||||
#########################################################################################
|
||||
@@ -418,17 +405,16 @@ RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypo
|
||||
# compile pg_hashids extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-hashids-pg-build
|
||||
FROM pg-build AS pg-hashids-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# not version-specific
|
||||
# last release v1.2.1 -Jan 12, 2018
|
||||
RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \
|
||||
echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control
|
||||
|
||||
#########################################################################################
|
||||
@@ -437,9 +423,8 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz
|
||||
# compile rum extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS rum-pg-build
|
||||
FROM pg-build AS rum-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
COPY compute/patches/rum.patch /rum.patch
|
||||
|
||||
@@ -450,8 +435,8 @@ RUN wget https://github.com/postgrespro/rum/archive/cb1edffc57736cd2a4455f8d0fea
|
||||
echo "65e0a752e99f4c3226400c9b899f997049e93503db8bf5c8072efa136d32fd83 rum.tar.gz" | sha256sum --check && \
|
||||
mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \
|
||||
patch -p1 < /rum.patch && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control
|
||||
|
||||
#########################################################################################
|
||||
@@ -460,17 +445,16 @@ RUN wget https://github.com/postgrespro/rum/archive/cb1edffc57736cd2a4455f8d0fea
|
||||
# compile pgTAP extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pgtap-pg-build
|
||||
FROM pg-build AS pgtap-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# pgtap 1.3.3 supports v17
|
||||
# last release v1.3.3 - Apr 8, 2024
|
||||
RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgtap.tar.gz && \
|
||||
echo "325ea79d0d2515bce96bce43f6823dcd3effbd6c54cb2a4d6c2384fffa3a14c7 pgtap.tar.gz" | sha256sum --check && \
|
||||
mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control
|
||||
|
||||
#########################################################################################
|
||||
@@ -479,17 +463,16 @@ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgta
|
||||
# compile ip4r extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS ip4r-pg-build
|
||||
FROM pg-build AS ip4r-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# not version-specific
|
||||
# last release v2.4.2 - Jul 29, 2023
|
||||
RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \
|
||||
echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \
|
||||
mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/ip4r.control
|
||||
|
||||
#########################################################################################
|
||||
@@ -498,17 +481,16 @@ RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O i
|
||||
# compile Prefix extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS prefix-pg-build
|
||||
FROM pg-build AS prefix-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# not version-specific
|
||||
# last release v1.2.10 - Jul 5, 2023
|
||||
RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \
|
||||
echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \
|
||||
mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control
|
||||
|
||||
#########################################################################################
|
||||
@@ -517,17 +499,16 @@ RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O p
|
||||
# compile hll extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS hll-pg-build
|
||||
FROM pg-build AS hll-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# not version-specific
|
||||
# last release v2.18 - Aug 29, 2023
|
||||
RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \
|
||||
echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \
|
||||
mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control
|
||||
|
||||
#########################################################################################
|
||||
@@ -536,17 +517,16 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar
|
||||
# compile plpgsql_check extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS plpgsql-check-pg-build
|
||||
FROM pg-build AS plpgsql-check-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# plpgsql_check v2.7.11 supports v17
|
||||
# last release v2.7.11 - Sep 16, 2024
|
||||
RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz -O plpgsql_check.tar.gz && \
|
||||
echo "208933f8dbe8e0d2628eb3851e9f52e6892b8e280c63700c0f1ce7883625d172 plpgsql_check.tar.gz" | sha256sum --check && \
|
||||
mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control
|
||||
|
||||
#########################################################################################
|
||||
@@ -555,11 +535,8 @@ RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz
|
||||
# compile timescaledb extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS timescaledb-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
FROM pg-build AS timescaledb-pg-build
|
||||
ARG PG_VERSION
|
||||
ENV PATH="/usr/local/pgsql/bin:$PATH"
|
||||
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v14" | "v15") \
|
||||
@@ -590,11 +567,8 @@ RUN case "${PG_VERSION}" in \
|
||||
# compile pg_hint_plan extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-hint-plan-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
FROM pg-build AS pg-hint-plan-pg-build
|
||||
ARG PG_VERSION
|
||||
ENV PATH="/usr/local/pgsql/bin:$PATH"
|
||||
|
||||
# version-specific, has separate releases for each version
|
||||
RUN case "${PG_VERSION}" in \
|
||||
@@ -632,14 +606,12 @@ RUN case "${PG_VERSION}" in \
|
||||
# compile pg_cron extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-cron-pg-build
|
||||
FROM pg-build AS pg-cron-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# This is an experimental extension that we do not support on prod yet.
|
||||
# !Do not remove!
|
||||
# We set it in shared_preload_libraries and computes will fail to start if library is not found.
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O pg_cron.tar.gz && \
|
||||
echo "52d1850ee7beb85a4cb7185731ef4e5a90d1de216709d8988324b0d02e76af61 pg_cron.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \
|
||||
@@ -653,9 +625,8 @@ RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O
|
||||
# compile rdkit extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS rdkit-pg-build
|
||||
FROM pg-build AS rdkit-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN apt update && \
|
||||
apt install --no-install-recommends --no-install-suggests -y \
|
||||
@@ -673,7 +644,13 @@ RUN apt update && \
|
||||
# Use new version only for v17
|
||||
# because Release_2024_09_1 has some backward incompatible changes
|
||||
# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1
|
||||
ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
|
||||
|
||||
# XXX: /usr/local/pgsql/bin is already in PATH, and that should be enough to find
|
||||
# pg_config. For some reason the rdkit cmake script doesn't work with just that,
|
||||
# however. By also adding /usr/local/pgsql, it works, which is weird because there
|
||||
# are no executables in that directory.
|
||||
ENV PATH="/usr/local/pgsql:$PATH"
|
||||
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v17") \
|
||||
export RDKIT_VERSION=Release_2024_09_1 \
|
||||
@@ -726,13 +703,11 @@ RUN case "${PG_VERSION}" in \
|
||||
# compile pg_uuidv7 extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-uuidv7-pg-build
|
||||
FROM pg-build AS pg-uuidv7-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# not version-specific
|
||||
# last release v1.6.0 - Oct 9, 2024
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz -O pg_uuidv7.tar.gz && \
|
||||
echo "0fa6c710929d003f6ce276a7de7a864e9d1667b2d78be3dc2c07f2409eb55867 pg_uuidv7.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
|
||||
@@ -746,13 +721,11 @@ RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz
|
||||
# compile pg_roaringbitmap extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-roaringbitmap-pg-build
|
||||
FROM pg-build AS pg-roaringbitmap-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# not version-specific
|
||||
# last release v0.5.4 - Jun 28, 2022
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
|
||||
echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
|
||||
@@ -766,16 +739,14 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4
|
||||
# compile pg_semver extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-semver-pg-build
|
||||
FROM pg-build AS pg-semver-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# Release 0.40.0 breaks backward compatibility with previous versions
|
||||
# see release note https://github.com/theory/pg-semver/releases/tag/v0.40.0
|
||||
# Use new version only for v17
|
||||
#
|
||||
# last release v0.40.0 - Jul 22, 2024
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v17") \
|
||||
export SEMVER_VERSION=0.40.0 \
|
||||
@@ -802,13 +773,11 @@ RUN case "${PG_VERSION}" in \
|
||||
# compile pg_embedding extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-embedding-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
FROM pg-build AS pg-embedding-pg-build
|
||||
|
||||
# This is our extension, support stopped in favor of pgvector
|
||||
# TODO: deprecate it
|
||||
ARG PG_VERSION
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v14" | "v15") \
|
||||
export PG_EMBEDDING_VERSION=0.3.5 \
|
||||
@@ -829,26 +798,19 @@ RUN case "${PG_VERSION}" in \
|
||||
# compile anon extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-anon-pg-build
|
||||
FROM pg-build AS pg-anon-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# This is an experimental extension, never got to real production.
|
||||
# !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found.
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
RUN case "${PG_VERSION}" in "v17") \
|
||||
echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \
|
||||
esac && \
|
||||
wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
|
||||
echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
|
||||
mkdir -p /extensions/anon && cp /usr/local/pgsql/share/extension/anon.control /extensions/anon && \
|
||||
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
|
||||
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/anon.tar.zst -T -
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -856,9 +818,8 @@ RUN case "${PG_VERSION}" in "v17") \
|
||||
# This layer is used to build `pgrx` deps
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS rust-extensions-build
|
||||
FROM pg-build AS rust-extensions-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN apt update && \
|
||||
apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \
|
||||
@@ -866,7 +827,7 @@ RUN apt update && \
|
||||
useradd -ms /bin/bash nonroot -b /home
|
||||
|
||||
ENV HOME=/home/nonroot
|
||||
ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
|
||||
ENV PATH="/home/nonroot/.cargo/bin:$PATH"
|
||||
USER nonroot
|
||||
WORKDIR /home/nonroot
|
||||
|
||||
@@ -893,9 +854,8 @@ USER root
|
||||
# and eventually get merged with `rust-extensions-build`
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS rust-extensions-build-pgrx12
|
||||
FROM pg-build AS rust-extensions-build-pgrx12
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN apt update && \
|
||||
apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \
|
||||
@@ -903,7 +863,7 @@ RUN apt update && \
|
||||
useradd -ms /bin/bash nonroot -b /home
|
||||
|
||||
ENV HOME=/home/nonroot
|
||||
ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
|
||||
ENV PATH="/home/nonroot/.cargo/bin:$PATH"
|
||||
USER nonroot
|
||||
WORKDIR /home/nonroot
|
||||
|
||||
@@ -1065,8 +1025,8 @@ ARG PG_VERSION
|
||||
# NOTE: local_proxy depends on the version of pg_session_jwt
|
||||
# Do not update without approve from proxy team
|
||||
# Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs
|
||||
RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.1.2-v17.tar.gz -O pg_session_jwt.tar.gz && \
|
||||
echo "c8ecbed9cb8c6441bce5134a176002b043018adf9d05a08e457dda233090a86e pg_session_jwt.tar.gz" | sha256sum --check && \
|
||||
RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0.tar.gz -O pg_session_jwt.tar.gz && \
|
||||
echo "5ace028e591f2e000ca10afa5b1ca62203ebff014c2907c0ec3b29c36f28a1bb pg_session_jwt.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "=0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
cargo pgrx install --release
|
||||
@@ -1078,13 +1038,11 @@ RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.1.2
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM build-deps AS wal2json-pg-build
|
||||
FROM pg-build AS wal2json-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# wal2json wal2json_2_6 supports v17
|
||||
# last release wal2json_2_6 - Apr 25, 2024
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar.gz -O wal2json.tar.gz && \
|
||||
echo "18b4bdec28c74a8fc98a11c72de38378a760327ef8e5e42e975b0029eb96ba0d wal2json.tar.gz" | sha256sum --check && \
|
||||
mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json.tar.gz --strip-components=1 -C . && \
|
||||
@@ -1097,13 +1055,11 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar.
|
||||
# compile pg_ivm extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-ivm-build
|
||||
FROM pg-build AS pg-ivm-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# pg_ivm v1.9 supports v17
|
||||
# last release v1.9 - Jul 31
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_ivm.tar.gz && \
|
||||
echo "59e15722939f274650abf637f315dd723c87073496ca77236b044cb205270d8b pg_ivm.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
|
||||
@@ -1117,13 +1073,11 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_iv
|
||||
# compile pg_partman extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-partman-build
|
||||
FROM pg-build AS pg-partman-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# should support v17 https://github.com/pgpartman/pg_partman/discussions/693
|
||||
# last release 5.1.0 Apr 2, 2024
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz -O pg_partman.tar.gz && \
|
||||
echo "3e3a27d7ff827295d5c55ef72f07a49062d6204b3cb0b9a048645d6db9f3cb9f pg_partman.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \
|
||||
@@ -1139,9 +1093,6 @@ RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz
|
||||
#########################################################################################
|
||||
FROM rust-extensions-build AS pg-mooncake-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
|
||||
RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.0/pg_mooncake-0.1.0.tar.gz -O pg_mooncake.tar.gz && \
|
||||
echo "eafd059b77f541f11525eb8affcd66a176968cbd8fe7c0d436e733f2aa4da59f pg_mooncake.tar.gz" | sha256sum --check && \
|
||||
@@ -1157,11 +1108,8 @@ RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.0/p
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM build-deps AS pg-repack-build
|
||||
FROM pg-build AS pg-repack-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
|
||||
RUN wget https://github.com/reorg/pg_repack/archive/refs/tags/ver_1.5.2.tar.gz -O pg_repack.tar.gz && \
|
||||
echo '4516cad42251ed3ad53ff619733004db47d5755acac83f75924cd94d1c4fb681 pg_repack.tar.gz' | sha256sum --check && \
|
||||
@@ -1284,11 +1232,11 @@ RUN set -e \
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter
|
||||
FROM quay.io/prometheuscommunity/postgres-exporter:v0.16.0 AS postgres-exporter
|
||||
|
||||
# Keep the version the same as in build-tools.Dockerfile and
|
||||
# test_runner/regress/test_compute_metrics.py.
|
||||
FROM burningalchemist/sql_exporter:0.16.0 AS sql-exporter
|
||||
FROM burningalchemist/sql_exporter:0.17.0 AS sql-exporter
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
|
||||
@@ -483,7 +483,6 @@ impl LocalEnv {
|
||||
.iter()
|
||||
.find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id)
|
||||
.map(|&(_, timeline_id)| timeline_id)
|
||||
.map(TimelineId::from)
|
||||
}
|
||||
|
||||
pub fn timeline_name_mappings(&self) -> HashMap<TenantTimelineId, String> {
|
||||
|
||||
@@ -822,10 +822,7 @@ impl StorageController {
|
||||
self.dispatch(
|
||||
Method::PUT,
|
||||
format!("control/v1/tenant/{tenant_shard_id}/migrate"),
|
||||
Some(TenantShardMigrateRequest {
|
||||
tenant_shard_id,
|
||||
node_id,
|
||||
}),
|
||||
Some(TenantShardMigrateRequest { node_id }),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -1,12 +1,16 @@
|
||||
use futures::StreamExt;
|
||||
use std::{str::FromStr, time::Duration};
|
||||
use std::{
|
||||
collections::{HashMap, HashSet},
|
||||
str::FromStr,
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use clap::{Parser, Subcommand};
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse,
|
||||
SafekeeperDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest,
|
||||
TenantDescribeResponse, TenantPolicyRequest,
|
||||
SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest,
|
||||
TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
|
||||
},
|
||||
models::{
|
||||
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
|
||||
@@ -112,6 +116,13 @@ enum Command {
|
||||
#[arg(long)]
|
||||
node: NodeId,
|
||||
},
|
||||
/// Migrate the secondary location for a tenant shard to a specific pageserver.
|
||||
TenantShardMigrateSecondary {
|
||||
#[arg(long)]
|
||||
tenant_shard_id: TenantShardId,
|
||||
#[arg(long)]
|
||||
node: NodeId,
|
||||
},
|
||||
/// Cancel any ongoing reconciliation for this shard
|
||||
TenantShardCancelReconcile {
|
||||
#[arg(long)]
|
||||
@@ -146,6 +157,12 @@ enum Command {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
},
|
||||
TenantSetPreferredAz {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
#[arg(long)]
|
||||
preferred_az: Option<String>,
|
||||
},
|
||||
/// Uncleanly drop a tenant from the storage controller: this doesn't delete anything from pageservers. Appropriate
|
||||
/// if you e.g. used `tenant-warmup` by mistake on a tenant ID that doesn't really exist, or is in some other region.
|
||||
TenantDrop {
|
||||
@@ -395,11 +412,12 @@ async fn main() -> anyhow::Result<()> {
|
||||
resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr));
|
||||
|
||||
let mut table = comfy_table::Table::new();
|
||||
table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
|
||||
table.set_header(["Id", "Hostname", "AZ", "Scheduling", "Availability"]);
|
||||
for node in resp {
|
||||
table.add_row([
|
||||
format!("{}", node.id),
|
||||
node.listen_http_addr,
|
||||
node.availability_zone_id,
|
||||
format!("{:?}", node.scheduling),
|
||||
format!("{:?}", node.availability),
|
||||
]);
|
||||
@@ -459,33 +477,65 @@ async fn main() -> anyhow::Result<()> {
|
||||
println!("{table}");
|
||||
}
|
||||
Command::Tenants { node_id: None } => {
|
||||
let mut resp = storcon_client
|
||||
.dispatch::<(), Vec<TenantDescribeResponse>>(
|
||||
Method::GET,
|
||||
"control/v1/tenant".to_string(),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
resp.sort_by(|a, b| a.tenant_id.cmp(&b.tenant_id));
|
||||
|
||||
// Set up output formatting
|
||||
let mut table = comfy_table::Table::new();
|
||||
table.set_header([
|
||||
"TenantId",
|
||||
"Preferred AZ",
|
||||
"ShardCount",
|
||||
"StripeSize",
|
||||
"Placement",
|
||||
"Scheduling",
|
||||
]);
|
||||
for tenant in resp {
|
||||
let shard_zero = tenant.shards.into_iter().next().unwrap();
|
||||
table.add_row([
|
||||
format!("{}", tenant.tenant_id),
|
||||
format!("{}", shard_zero.tenant_shard_id.shard_count.literal()),
|
||||
format!("{:?}", tenant.stripe_size),
|
||||
format!("{:?}", tenant.policy),
|
||||
format!("{:?}", shard_zero.scheduling_policy),
|
||||
]);
|
||||
|
||||
// Pagination loop over listing API
|
||||
let mut start_after = None;
|
||||
const LIMIT: usize = 1000;
|
||||
loop {
|
||||
let path = match start_after {
|
||||
None => format!("control/v1/tenant?limit={LIMIT}"),
|
||||
Some(start_after) => {
|
||||
format!("control/v1/tenant?limit={LIMIT}&start_after={start_after}")
|
||||
}
|
||||
};
|
||||
|
||||
let resp = storcon_client
|
||||
.dispatch::<(), Vec<TenantDescribeResponse>>(Method::GET, path, None)
|
||||
.await?;
|
||||
|
||||
if resp.is_empty() {
|
||||
// End of data reached
|
||||
break;
|
||||
}
|
||||
|
||||
// Give some visual feedback while we're building up the table (comfy_table doesn't have
|
||||
// streaming output)
|
||||
if resp.len() >= LIMIT {
|
||||
eprint!(".");
|
||||
}
|
||||
|
||||
start_after = Some(resp.last().unwrap().tenant_id);
|
||||
|
||||
for tenant in resp {
|
||||
let shard_zero = tenant.shards.into_iter().next().unwrap();
|
||||
table.add_row([
|
||||
format!("{}", tenant.tenant_id),
|
||||
shard_zero
|
||||
.preferred_az_id
|
||||
.as_ref()
|
||||
.cloned()
|
||||
.unwrap_or("".to_string()),
|
||||
format!("{}", shard_zero.tenant_shard_id.shard_count.literal()),
|
||||
format!("{:?}", tenant.stripe_size),
|
||||
format!("{:?}", tenant.policy),
|
||||
format!("{:?}", shard_zero.scheduling_policy),
|
||||
]);
|
||||
}
|
||||
}
|
||||
|
||||
// Terminate progress dots
|
||||
if table.row_count() > LIMIT {
|
||||
eprint!("");
|
||||
}
|
||||
|
||||
println!("{table}");
|
||||
@@ -540,10 +590,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
tenant_shard_id,
|
||||
node,
|
||||
} => {
|
||||
let req = TenantShardMigrateRequest {
|
||||
tenant_shard_id,
|
||||
node_id: node,
|
||||
};
|
||||
let req = TenantShardMigrateRequest { node_id: node };
|
||||
|
||||
storcon_client
|
||||
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
|
||||
@@ -553,6 +600,20 @@ async fn main() -> anyhow::Result<()> {
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
Command::TenantShardMigrateSecondary {
|
||||
tenant_shard_id,
|
||||
node,
|
||||
} => {
|
||||
let req = TenantShardMigrateRequest { node_id: node };
|
||||
|
||||
storcon_client
|
||||
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
|
||||
Method::PUT,
|
||||
format!("control/v1/tenant/{tenant_shard_id}/migrate_secondary"),
|
||||
Some(req),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
Command::TenantShardCancelReconcile { tenant_shard_id } => {
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(
|
||||
@@ -596,6 +657,19 @@ async fn main() -> anyhow::Result<()> {
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let nodes = storcon_client
|
||||
.dispatch::<(), Vec<NodeDescribeResponse>>(
|
||||
Method::GET,
|
||||
"control/v1/node".to_string(),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
let nodes = nodes
|
||||
.into_iter()
|
||||
.map(|n| (n.id, n))
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
println!("Tenant {tenant_id}");
|
||||
let mut table = comfy_table::Table::new();
|
||||
table.add_row(["Policy", &format!("{:?}", policy)]);
|
||||
@@ -604,7 +678,14 @@ async fn main() -> anyhow::Result<()> {
|
||||
println!("{table}");
|
||||
println!("Shards:");
|
||||
let mut table = comfy_table::Table::new();
|
||||
table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]);
|
||||
table.set_header([
|
||||
"Shard",
|
||||
"Attached",
|
||||
"Attached AZ",
|
||||
"Secondary",
|
||||
"Last error",
|
||||
"status",
|
||||
]);
|
||||
for shard in shards {
|
||||
let secondary = shard
|
||||
.node_secondary
|
||||
@@ -627,11 +708,18 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
let status = status_parts.join(",");
|
||||
|
||||
let attached_node = shard
|
||||
.node_attached
|
||||
.as_ref()
|
||||
.map(|id| nodes.get(id).expect("Shard references nonexistent node"));
|
||||
|
||||
table.add_row([
|
||||
format!("{}", shard.tenant_shard_id),
|
||||
shard
|
||||
.node_attached
|
||||
.map(|n| format!("{}", n))
|
||||
attached_node
|
||||
.map(|n| format!("{} ({})", n.listen_http_addr, n.id))
|
||||
.unwrap_or(String::new()),
|
||||
attached_node
|
||||
.map(|n| n.availability_zone_id.clone())
|
||||
.unwrap_or(String::new()),
|
||||
secondary,
|
||||
shard.last_error,
|
||||
@@ -640,6 +728,66 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
println!("{table}");
|
||||
}
|
||||
Command::TenantSetPreferredAz {
|
||||
tenant_id,
|
||||
preferred_az,
|
||||
} => {
|
||||
// First learn about the tenant's shards
|
||||
let describe_response = storcon_client
|
||||
.dispatch::<(), TenantDescribeResponse>(
|
||||
Method::GET,
|
||||
format!("control/v1/tenant/{tenant_id}"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Learn about nodes to validate the AZ ID
|
||||
let nodes = storcon_client
|
||||
.dispatch::<(), Vec<NodeDescribeResponse>>(
|
||||
Method::GET,
|
||||
"control/v1/node".to_string(),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
if let Some(preferred_az) = &preferred_az {
|
||||
let azs = nodes
|
||||
.into_iter()
|
||||
.map(|n| (n.availability_zone_id))
|
||||
.collect::<HashSet<_>>();
|
||||
if !azs.contains(preferred_az) {
|
||||
anyhow::bail!(
|
||||
"AZ {} not found on any node: known AZs are: {:?}",
|
||||
preferred_az,
|
||||
azs
|
||||
);
|
||||
}
|
||||
} else {
|
||||
// Make it obvious to the user that since they've omitted an AZ, we're clearing it
|
||||
eprintln!("Clearing preferred AZ for tenant {}", tenant_id);
|
||||
}
|
||||
|
||||
// Construct a request that modifies all the tenant's shards
|
||||
let req = ShardsPreferredAzsRequest {
|
||||
preferred_az_ids: describe_response
|
||||
.shards
|
||||
.into_iter()
|
||||
.map(|s| {
|
||||
(
|
||||
s.tenant_shard_id,
|
||||
preferred_az.clone().map(AvailabilityZone),
|
||||
)
|
||||
})
|
||||
.collect(),
|
||||
};
|
||||
storcon_client
|
||||
.dispatch::<ShardsPreferredAzsRequest, ()>(
|
||||
Method::PUT,
|
||||
"control/v1/preferred_azs".to_string(),
|
||||
Some(req),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
Command::TenantWarmup { tenant_id } => {
|
||||
let describe_response = storcon_client
|
||||
.dispatch::<(), TenantDescribeResponse>(
|
||||
@@ -915,10 +1063,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
|
||||
Method::PUT,
|
||||
format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id),
|
||||
Some(TenantShardMigrateRequest {
|
||||
tenant_shard_id: mv.tenant_shard_id,
|
||||
node_id: mv.to,
|
||||
}),
|
||||
Some(TenantShardMigrateRequest { node_id: mv.to }),
|
||||
)
|
||||
.await
|
||||
.map_err(|e| (mv.tenant_shard_id, mv.from, mv.to, e))
|
||||
|
||||
@@ -81,7 +81,7 @@ configuration generation in them is less than its current one. Namely, it
|
||||
refuses to vote, to truncate WAL in `handle_elected` and to accept WAL. In
|
||||
response it sends its current configuration generation to let walproposer know.
|
||||
|
||||
Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration`
|
||||
Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration`
|
||||
accepting `Configuration`. Safekeeper switches to the given conf if it is higher than its
|
||||
current one and ignores it otherwise. In any case it replies with
|
||||
```
|
||||
@@ -103,7 +103,7 @@ currently and tries to communicate with all of them. However, the list does not
|
||||
define consensus members. Instead, on start walproposer tracks highest
|
||||
configuration it receives from `AcceptorGreeting`s. Once it assembles greetings
|
||||
from majority of `sk_set` and majority of `new_sk_set` (if it is present), it
|
||||
establishes this configuration as its own and moves to voting.
|
||||
establishes this configuration as its own and moves to voting.
|
||||
|
||||
It should stop talking to safekeepers not listed in the configuration at this
|
||||
point, though it is not unsafe to continue doing so.
|
||||
@@ -119,7 +119,7 @@ refusal to accept due to configuration change) it simply restarts.
|
||||
The following algorithm can be executed anywhere having access to configuration
|
||||
storage and safekeepers. It is safe to interrupt / restart it and run multiple
|
||||
instances of it concurrently, though likely one of them won't make
|
||||
progress then. It accepts `desired_set: Vec<NodeId>` as input.
|
||||
progress then. It accepts `desired_set: Vec<NodeId>` as input.
|
||||
|
||||
Algorithm will refuse to make the change if it encounters previous interrupted
|
||||
change attempt, but in this case it will try to finish it.
|
||||
@@ -140,7 +140,7 @@ storage are reachable.
|
||||
safe. Failed CAS aborts the procedure.
|
||||
4) Call `PUT` `configuration` on safekeepers from the current set,
|
||||
delivering them `joint_conf`. Collecting responses from majority is required
|
||||
to proceed. If any response returned generation higher than
|
||||
to proceed. If any response returned generation higher than
|
||||
`joint_conf.generation`, abort (another switch raced us). Otherwise, choose
|
||||
max `<last_log_term, flush_lsn>` among responses and establish it as
|
||||
(in memory) `sync_position`. Also choose max `term` and establish it as (in
|
||||
@@ -149,49 +149,49 @@ storage are reachable.
|
||||
without ack from the new set. Similarly, we'll bump term on new majority
|
||||
to `sync_term` so that two computes with the same term are never elected.
|
||||
4) Initialize timeline on safekeeper(s) from `new_sk_set` where it
|
||||
doesn't exist yet by doing `pull_timeline` from the majority of the
|
||||
doesn't exist yet by doing `pull_timeline` from the majority of the
|
||||
current set. Doing that on majority of `new_sk_set` is enough to
|
||||
proceed, but it is reasonable to ensure that all `new_sk_set` members
|
||||
are initialized -- if some of them are down why are we migrating there?
|
||||
5) Call `POST` `bump_term(sync_term)` on safekeepers from the new set.
|
||||
5) Call `POST` `bump_term(sync_term)` on safekeepers from the new set.
|
||||
Success on majority is enough.
|
||||
6) Repeatedly call `PUT` `configuration` on safekeepers from the new set,
|
||||
delivering them `joint_conf` and collecting their positions. This will
|
||||
switch them to the `joint_conf` which generally won't be needed
|
||||
switch them to the `joint_conf` which generally won't be needed
|
||||
because `pull_timeline` already includes it and plus additionally would be
|
||||
broadcast by compute. More importantly, we may proceed to the next step
|
||||
only when `<last_log_term, flush_lsn>` on the majority of the new set reached
|
||||
`sync_position`. Similarly, on the happy path no waiting is needed because
|
||||
only when `<last_log_term, flush_lsn>` on the majority of the new set reached
|
||||
`sync_position`. Similarly, on the happy path no waiting is needed because
|
||||
`pull_timeline` already includes it. However, we should double
|
||||
check to be safe. For example, timeline could have been created earlier e.g.
|
||||
manually or after try-to-migrate, abort, try-to-migrate-again sequence.
|
||||
7) Create `new_conf: Configuration` incrementing `joint_conf` generation and having new
|
||||
safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration
|
||||
manually or after try-to-migrate, abort, try-to-migrate-again sequence.
|
||||
7) Create `new_conf: Configuration` incrementing `joint_conf` generation and having new
|
||||
safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration
|
||||
storage under one more CAS.
|
||||
8) Call `PUT` `configuration` on safekeepers from the new set,
|
||||
delivering them `new_conf`. It is enough to deliver it to the majority
|
||||
delivering them `new_conf`. It is enough to deliver it to the majority
|
||||
of the new set; the rest can be updated by compute.
|
||||
|
||||
I haven't put huge effort to make the description above very precise, because it
|
||||
is natural language prone to interpretations anyway. Instead I'd like to make TLA+
|
||||
spec of it.
|
||||
|
||||
Description above focuses on safety. To make the flow practical and live, here are a few more
|
||||
Description above focuses on safety. To make the flow practical and live, here are a few more
|
||||
considerations.
|
||||
1) It makes sense to ping new set to ensure we are migrating to live node(s) before
|
||||
1) It makes sense to ping new set to ensure we are migrating to live node(s) before
|
||||
step 3.
|
||||
2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed
|
||||
2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed
|
||||
it is safe to rollback to the old conf with one more CAS.
|
||||
3) On step 4 timeline might be already created on members of the new set for various reasons;
|
||||
3) On step 4 timeline might be already created on members of the new set for various reasons;
|
||||
the simplest is the procedure restart. There are more complicated scenarios like mentioned
|
||||
in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving
|
||||
generations, so seems simpler to treat existing timeline as success. However, this also
|
||||
in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving
|
||||
generations, so seems simpler to treat existing timeline as success. However, this also
|
||||
has a disadvantage: you might imagine a surpassingly unlikely schedule where condition in
|
||||
the step 5 is never reached until compute is (re)awakened to synchronize new member(s).
|
||||
I don't think we'll observe this in practice, but can add waking up compute if needed.
|
||||
4) In the end timeline should be locally deleted on the safekeeper(s) which are
|
||||
in the old set but not in the new one, unless they are unreachable. To be
|
||||
safe this also should be done under generation number (deletion proceeds only if
|
||||
safe this also should be done under generation number (deletion proceeds only if
|
||||
current configuration is <= the one in the request and the safekeeper is not a member of it).
|
||||
5) If current conf fetched on step 1 is already not joint and members equal to `desired_set`,
|
||||
jump to step 7, using it as `new_conf`.
|
||||
@@ -202,47 +202,87 @@ The procedure ought to be driven from somewhere. Obvious candidates are control
|
||||
plane and storage_controller; and as each of them already has db we don't want
|
||||
yet another storage. I propose to manage safekeepers in storage_controller
|
||||
because 1) since it is in rust it simplifies simulation testing (more on this
|
||||
below) 2) it already manages pageservers.
|
||||
below) 2) it already manages pageservers.
|
||||
|
||||
This assumes that migration will be fully usable only after we migrate all
|
||||
tenants/timelines to storage_controller. It is discussible whether we want also
|
||||
to manage pageserver attachments for all of these, but likely we do.
|
||||
|
||||
This requires us to define storcon <-> cplane interface.
|
||||
This requires us to define storcon <-> cplane interface and changes.
|
||||
|
||||
### storage_controller <-> control plane interface
|
||||
### storage_controller <-> control plane interface and changes
|
||||
|
||||
First of all, control plane should
|
||||
[change](https://neondb.slack.com/archives/C03438W3FLZ/p1719226543199829)
|
||||
storing safekeepers per timeline instead of per tenant because we can't migrate
|
||||
tenants atomically.
|
||||
tenants atomically.
|
||||
|
||||
The important question is how updated configuration is delivered from
|
||||
storage_controller to control plane to provide it to computes. As always, there
|
||||
are two options, pull and push. Let's do it the same push as with pageserver
|
||||
`/notify-attach` because 1) it keeps storage_controller out of critical compute
|
||||
start path 2) provides easier upgrade: there won't be such a thing as 'timeline
|
||||
managed by control plane / storcon', cplane just takes the value out of its db
|
||||
when needed 3) uniformity. It makes storage_controller responsible for retrying notifying
|
||||
control plane until it succeeds.
|
||||
start path 2) uniformity. It makes storage_controller responsible for retrying
|
||||
notifying control plane until it succeeds.
|
||||
|
||||
So, cplane `/notify-safekeepers` for the timeline accepts `Configuration` and
|
||||
updates it in the db if the provided conf generation is higher (the cplane db
|
||||
should also store generations for this). Similarly to [`/notify-attach`](https://www.notion.so/neondatabase/Storage-Controller-Control-Plane-interface-6de56dd310a043bfa5c2f5564fa98365), it
|
||||
should update db which makes the call successful, and then try to schedule
|
||||
`apply_config` if possible, it is ok if not. storage_controller
|
||||
should rate limit calling the endpoint, but likely this won't be needed, as migration
|
||||
It is not needed for the control plane to fully know the `Configuration`. It is
|
||||
enough for it only to be aware of the list of safekeepers in the latest
|
||||
configuration to supply it to compute, plus associated generation number to
|
||||
protect from stale update requests and to also pass it to compute.
|
||||
|
||||
So, cplane `/notify-safekeepers` for the timeline can accept JSON like
|
||||
```
|
||||
{
|
||||
tenant_id: String,
|
||||
timeline_id: String,
|
||||
generation: u32,
|
||||
safekeepers: Vec<SafekeeperId>,
|
||||
}
|
||||
```
|
||||
where `SafekeeperId` is
|
||||
```
|
||||
{
|
||||
node_id: u64,
|
||||
host: String
|
||||
}
|
||||
```
|
||||
In principle `host` is redundant, but may be useful for observability.
|
||||
|
||||
The request updates list of safekeepers in the db if the provided conf
|
||||
generation is higher (the cplane db should also store generations for this).
|
||||
Similarly to
|
||||
[`/notify-attach`](https://www.notion.so/neondatabase/Storage-Controller-Control-Plane-interface-6de56dd310a043bfa5c2f5564fa98365),
|
||||
it should update db which makes the call successful, and then try to schedule
|
||||
`apply_config` if possible, it is ok if not. storage_controller should rate
|
||||
limit calling the endpoint, but likely this won't be needed, as migration
|
||||
throughput is limited by `pull_timeline`.
|
||||
|
||||
Timeline (branch) creation in cplane should call storage_controller POST
|
||||
`tenant/:tenant_id/timeline` like it currently does for sharded tenants.
|
||||
Response should be augmented with `safekeeper_conf: Configuration`. The call
|
||||
should be retried until succeeds.
|
||||
Response should be augmented with `safekeepers_generation` and `safekeepers`
|
||||
fields like described in `/notify-safekeepers` above. Initially (currently)
|
||||
these fields may be absent; in this case cplane chooses safekeepers on its own
|
||||
like it currently does. The call should be retried until succeeds.
|
||||
|
||||
Timeline deletion and tenant deletion in cplane should call appropriate
|
||||
storage_controller endpoints like it currently does for sharded tenants. The
|
||||
calls should be retried until they succeed.
|
||||
|
||||
When compute receives safekeepers list from control plane it needs to know the
|
||||
generation to check whether it should be updated (note that compute may get
|
||||
safekeeper list from either cplane or safekeepers). Currently `neon.safekeepers`
|
||||
GUC is just a comma-separated list of `host:port`. Let's prefix it with
|
||||
`g#<generation>:` to this end, so it will look like
|
||||
```
|
||||
g#42:safekeeper-0.eu-central-1.aws.neon.tech:6401,safekeeper-2.eu-central-1.aws.neon.tech:6401,safekeeper-1.eu-central-1.aws.neon.tech:6401
|
||||
```
|
||||
|
||||
To summarize, list of cplane changes:
|
||||
- per tenant -> per timeline safekeepers management and addition of int `safekeepers_generation` field.
|
||||
- `/notify-safekeepers` endpoint.
|
||||
- Branch creation call may return list of safekeepers and when it is
|
||||
present cplane should adopt it instead of choosing on its own like it does currently.
|
||||
- `neon.safekeepers` GUC should be prefixed with `g#<generation>:`.
|
||||
|
||||
### storage_controller implementation
|
||||
|
||||
Current 'load everything on startup and keep in memory' easy design is fine.
|
||||
@@ -360,10 +400,10 @@ source safekeeper might fail, which is not a problem if we are going to
|
||||
decommission the node but leaves garbage otherwise. I'd propose in the first version
|
||||
1) Don't attempt deletion at all if node status is `offline`.
|
||||
2) If it failed, just issue warning.
|
||||
And add PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would find and
|
||||
remove garbage timelines for manual use. It will 1) list all timelines on the
|
||||
safekeeper 2) compare each one against configuration storage: if timeline
|
||||
doesn't exist at all (had been deleted), it can be deleted. Otherwise, it can
|
||||
And add PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would find and
|
||||
remove garbage timelines for manual use. It will 1) list all timelines on the
|
||||
safekeeper 2) compare each one against configuration storage: if timeline
|
||||
doesn't exist at all (had been deleted), it can be deleted. Otherwise, it can
|
||||
be deleted under generation number if node is not member of current generation.
|
||||
|
||||
Automating this is non-trivial; we'd need to register all potential missing
|
||||
@@ -412,8 +452,8 @@ There should be following layers of tests:
|
||||
3) Since simulation testing injects at relatively high level points (not
|
||||
syscalls), it omits some code, in particular `pull_timeline`. Thus it is
|
||||
better to have basic tests covering whole system as well. Extended version of
|
||||
`test_restarts_under_load` would do: start background load and do migration
|
||||
under it, then restart endpoint and check that no reported commits
|
||||
`test_restarts_under_load` would do: start background load and do migration
|
||||
under it, then restart endpoint and check that no reported commits
|
||||
had been lost. I'd also add one more creating classic network split scenario, with
|
||||
one compute talking to AC and another to BD while migration from nodes ABC to ABD
|
||||
happens.
|
||||
@@ -422,35 +462,51 @@ There should be following layers of tests:
|
||||
|
||||
## Order of implementation and rollout
|
||||
|
||||
Note that
|
||||
Note that
|
||||
- Control plane parts and integration with it is fully independent from everything else
|
||||
(tests would use simulation and neon_local).
|
||||
- It is reasonable to make compute <-> safekeepers protocol change
|
||||
independent of enabling generations.
|
||||
- There is a lot of infra work making storage_controller aware of timelines and safekeepers
|
||||
and its impl/rollout should be separate from migration itself.
|
||||
- Initially walproposer can just stop working while it observers joint configuration.
|
||||
- Initially walproposer can just stop working while it observes joint configuration.
|
||||
Such window would be typically very short anyway.
|
||||
- Obviously we want to test the whole thing thoroughly on staging and only then
|
||||
gradually enable in prod.
|
||||
|
||||
To rollout smoothly, both walproposer and safekeeper should have flag
|
||||
`configurations_enabled`; when set to false, they would work as currently, i.e.
|
||||
walproposer is able to commit on whatever safekeeper set it is provided. Until
|
||||
all timelines are managed by storcon we'd need to use current script to migrate
|
||||
and update/drop entries in the storage_controller database if it has any.
|
||||
Let's have the following implementation bits for gradual rollout:
|
||||
- compute gets `neon.safekeepers_proto_version` flag.
|
||||
Initially both compute and safekeepers will be able to talk both
|
||||
versions so that we can delay force restart of them and for
|
||||
simplicity of rollback in case it is needed.
|
||||
- storcon gets `--set-safekeepers` config option disabled by
|
||||
default. Timeline creation request chooses safekeepers
|
||||
(and returns them in response to cplane) only when it is set to
|
||||
true.
|
||||
- control_plane [see above](#storage_controller---control-plane-interface-and-changes)
|
||||
prefixes `neon.safekeepers` GUC with generation number. When it is 0
|
||||
(or prefix not present at all), walproposer behaves as currently, committing on
|
||||
the provided safekeeper list -- generations are disabled.
|
||||
If it is non-zero, it follows the rules of this RFC.
|
||||
- We provide a script for manual migration to storage controller.
|
||||
It selects timeline(s) from control plane (specified or all of them) db
|
||||
and calls special import endpoint on storage controller which is very
|
||||
similar to timeline creation: it inserts into the db, sets
|
||||
configuration to initial on the safekeepers, calls cplane
|
||||
`notify-safekeepers`.
|
||||
|
||||
Safekeepers would need to be able to talk both current and new protocol version
|
||||
with compute to reduce number of computes restarted in prod once v2 protocol is
|
||||
deployed (though before completely switching we'd need to force this).
|
||||
|
||||
Let's have the following rollout order:
|
||||
- storage_controller becomes aware of safekeepers;
|
||||
- storage_controller gets timeline creation for new timelines and deletion requests, but
|
||||
doesn't manage all timelines yet. Migration can be tested on these new timelines.
|
||||
To keep control plane and storage_controller databases in sync while control
|
||||
plane still chooses the safekeepers initially (until all timelines are imported
|
||||
it can choose better), `TimelineCreateRequest` can get optional safekeepers
|
||||
field with safekeepers chosen by cplane.
|
||||
- Then we can import all existing timelines from control plane to
|
||||
storage_controller and gradually enable configurations region by region.
|
||||
Then the rollout for a region would be:
|
||||
- Current situation: safekeepers are chosen by control_plane.
|
||||
- We manually migrate some timelines, test moving them around.
|
||||
- Then we enable `--set-safekeepers` so that all new timelines
|
||||
are on storage controller.
|
||||
- Finally migrate all existing timelines using the script (no
|
||||
compute should be speaking old proto version at this point).
|
||||
|
||||
Until all timelines are managed by storcon we'd need to use current ad hoc
|
||||
script to migrate if needed. To keep state clean, all storage controller managed
|
||||
timelines must be migrated before that, or controller db and configurations
|
||||
state of safekeepers dropped manually.
|
||||
|
||||
Very rough implementation order:
|
||||
- Add concept of configurations to safekeepers (including control file),
|
||||
@@ -458,10 +514,10 @@ Very rough implementation order:
|
||||
- Implement walproposer changes, including protocol.
|
||||
- Implement storcon part. Use it in neon_local (and pytest).
|
||||
- Make cplane store safekeepers per timeline instead of per tenant.
|
||||
- Implement cplane/storcon integration. Route branch creation/deletion
|
||||
- Implement cplane/storcon integration. Route branch creation/deletion
|
||||
through storcon. Then we can test migration of new branches.
|
||||
- Finally import existing branches. Then we can drop cplane
|
||||
safekeeper selection code. Gradually enable configurations at
|
||||
- Finally import existing branches. Then we can drop cplane
|
||||
safekeeper selection code. Gradually enable configurations at
|
||||
computes and safekeepers. Before that, all computes must talk only
|
||||
v3 protocol version.
|
||||
|
||||
|
||||
247
docs/rfcs/040-profiling.md
Normal file
247
docs/rfcs/040-profiling.md
Normal file
@@ -0,0 +1,247 @@
|
||||
# CPU and Memory Profiling
|
||||
|
||||
Created 2025-01-12 by Erik Grinaker.
|
||||
|
||||
See also [internal user guide](https://www.notion.so/neondatabase/Storage-CPU-Memory-Profiling-14bf189e004780228ec7d04442742324?pvs=4).
|
||||
|
||||
## Summary
|
||||
|
||||
This document proposes a standard cross-team pattern for CPU and memory profiling across
|
||||
applications and languages, using the [pprof](https://github.com/google/pprof) profile format.
|
||||
|
||||
It enables both ad hoc profiles via HTTP endpoints, and continuous profiling across the fleet via
|
||||
[Grafana Cloud Profiles](https://grafana.com/docs/grafana-cloud/monitor-applications/profiles/).
|
||||
Continuous profiling incurs an overhead of about 0.1% CPU usage and 3% slower heap allocations.
|
||||
|
||||
## Motivation
|
||||
|
||||
CPU and memory profiles are crucial observability tools for understanding performance issues,
|
||||
resource exhaustion, and resource costs. They allow answering questions like:
|
||||
|
||||
* Why is this process using 100% CPU?
|
||||
* How do I make this go faster?
|
||||
* Why did this process run out of memory?
|
||||
* Why are we paying for all these CPU cores and memory chips?
|
||||
|
||||
Go has [first-class support](https://pkg.go.dev/net/http/pprof) for profiling included in its
|
||||
standard library, using the [pprof](https://github.com/google/pprof) profile format and associated
|
||||
tooling.
|
||||
|
||||
This is not the case for Rust and C, where obtaining profiles can be rather cumbersome. It requires
|
||||
installing and running additional tools like `perf` as root on production nodes, with analysis tools
|
||||
that can be hard to use and often don't give good results. This is not only annoying, but can also
|
||||
significantly affect the resolution time of production incidents.
|
||||
|
||||
This proposal will:
|
||||
|
||||
* Provide CPU and heap profiles in pprof format via HTTP API.
|
||||
* Record continuous profiles in Grafana for aggregate historical analysis.
|
||||
* Make it easy for anyone to see a flamegraph in less than one minute.
|
||||
* Be reasonably consistent across teams and services (Rust, Go, C).
|
||||
|
||||
## Non Goals (For Now)
|
||||
|
||||
* [Additional profile types](https://grafana.com/docs/pyroscope/next/configure-client/profile-types/)
|
||||
like mutexes, locks, goroutines, etc.
|
||||
* [Runtime trace integration](https://grafana.com/docs/pyroscope/next/configure-client/trace-span-profiles/).
|
||||
* [Profile-guided optimization](https://en.wikipedia.org/wiki/Profile-guided_optimization).
|
||||
|
||||
## Using Profiles
|
||||
|
||||
Ready-to-use profiles can be obtained using e.g. `curl`. For Rust services:
|
||||
|
||||
```
|
||||
$ curl localhost:9898/profile/cpu >profile.pb.gz
|
||||
```
|
||||
|
||||
pprof profiles can be explored using the [`pprof`](https://github.com/google/pprof) web UI, which
|
||||
provides flamegraphs, call graphs, plain text listings, and more:
|
||||
|
||||
```
|
||||
$ pprof -http :6060 <profile>
|
||||
```
|
||||
|
||||
Some endpoints (e.g. Rust-based ones) can also generate flamegraph SVGs directly:
|
||||
|
||||
```
|
||||
$ curl localhost:9898/profile/cpu?format=svg >profile.svg
|
||||
$ open profile.svg
|
||||
```
|
||||
|
||||
Continuous profiles are available in Grafana under Explore → Profiles → Explore Profiles
|
||||
(currently only in [staging](https://neonstaging.grafana.net/a/grafana-pyroscope-app/profiles-explorer)).
|
||||
|
||||
## API Requirements
|
||||
|
||||
* HTTP endpoints that return a profile in pprof format (with symbols).
|
||||
* CPU: records a profile over the request time interval (`seconds` query parameter).
|
||||
* Memory: returns the current in-use heap allocations.
|
||||
* Unauthenticated, as it should not expose user data or pose a denial-of-service risk.
|
||||
* Default sample frequency should not impact service (maximum 5% CPU overhead).
|
||||
* Linux-compatibility.
|
||||
|
||||
Nice to have:
|
||||
|
||||
* Return flamegraph SVG directly from the HTTP endpoint if requested.
|
||||
* Configurable sample frequency for CPU profiles.
|
||||
* Historical heap allocations, by count and bytes.
|
||||
* macOS-compatibility.
|
||||
|
||||
## Rust Profiling
|
||||
|
||||
[`libs/utils/src/http/endpoint.rs`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs)
|
||||
contains ready-to-use HTTP endpoints for CPU and memory profiling:
|
||||
[`profile_cpu_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L338) and [`profile_heap_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L416).
|
||||
|
||||
### CPU
|
||||
|
||||
CPU profiles are provided by [pprof-rs](https://github.com/tikv/pprof-rs) via
|
||||
[`profile_cpu_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L338).
|
||||
Expose it unauthenticated at `/profile/cpu`.
|
||||
|
||||
Parameters:
|
||||
|
||||
* `format`: profile output format (`pprof` or `svg`; default `pprof`).
|
||||
* `seconds`: duration to collect profile over, in seconds (default `5`).
|
||||
* `frequency`: how often to sample thread stacks, in Hz (default `99`).
|
||||
* `force`: if `true`, cancel a running profile and start a new one (default `false`).
|
||||
|
||||
Works on Linux and macOS.
|
||||
|
||||
### Memory
|
||||
|
||||
Use the jemalloc allocator via [`tikv-jemallocator`](https://github.com/tikv/jemallocator),
|
||||
and enable profiling with samples every 2 MB allocated:
|
||||
|
||||
```rust
|
||||
#[global_allocator]
|
||||
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
|
||||
|
||||
#[allow(non_upper_case_globals)]
|
||||
#[export_name = "malloc_conf"]
|
||||
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0";
|
||||
```
|
||||
|
||||
pprof profiles are generated by
|
||||
[`jemalloc-pprof`](https://github.com/polarsignals/rust-jemalloc-pprof) via
|
||||
[`profile_heap_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L416).
|
||||
Expose it unauthenticated at `/profile/heap`.
|
||||
|
||||
Parameters:
|
||||
|
||||
* `format`: profile output format (`pprof`, `svg`, or `jemalloc`; default `pprof`).
|
||||
|
||||
Works on Linux only, due to [jemalloc limitations](https://github.com/jemalloc/jemalloc/issues/26).
|
||||
|
||||
## Go Profiling
|
||||
|
||||
The Go standard library includes pprof profiling via HTTP API in
|
||||
[`net/http/pprof`](https://pkg.go.dev/net/http/pprof). Expose it unauthenticated at
|
||||
`/debug/pprof`.
|
||||
|
||||
Works on Linux and macOS.
|
||||
|
||||
### CPU
|
||||
|
||||
Via `/debug/pprof/profile`. Parameters:
|
||||
|
||||
* `debug`: profile output format (`0` is pprof, `1` or above is plaintext; default `0`).
|
||||
* `seconds`: duration to collect profile over, in seconds (default `30`).
|
||||
|
||||
Does not support a frequency parameter (see [#57488](https://github.com/golang/go/issues/57488)),
|
||||
and defaults to 100 Hz. A lower frequency can be hardcoded via `SetCPUProfileRate`, but the default
|
||||
is likely ok (estimated 1% overhead).
|
||||
|
||||
### Memory
|
||||
|
||||
Via `/debug/pprof/heap`. Parameters:
|
||||
|
||||
* `seconds`: take a delta profile over the given duration, in seconds (default `0`).
|
||||
* `gc`: if `1`, garbage collect before taking profile.
|
||||
|
||||
## C Profiling
|
||||
|
||||
[gperftools](https://github.com/gperftools/gperftools) provides in-process CPU and heap profiling
|
||||
with pprof output.
|
||||
|
||||
However, continuous profiling of PostgreSQL is expensive (many computes), and has limited value
|
||||
since we don't own the internals anyway.
|
||||
|
||||
Ad hoc profiling might still be useful, but the compute team considers existing tooling sufficient,
|
||||
so this is not a priority at the moment.
|
||||
|
||||
## Grafana Continuous Profiling
|
||||
|
||||
[Grafana Alloy](https://grafana.com/docs/alloy/latest/) continually scrapes CPU and memory profiles
|
||||
across the fleet, and archives them as time series. This can be used to analyze resource usage over
|
||||
time, either in aggregate or zoomed in to specific events and nodes.
|
||||
|
||||
Profiles are retained for 30 days. Profile ingestion volume for CPU+heap at 60-second intervals
|
||||
is about 0.5 GB/node/day, or about $0.25/node/day = $7.5/node/month ($0.50/GB).
|
||||
|
||||
It is currently enabled in [staging](https://neonstaging.grafana.net/a/grafana-pyroscope-app/profiles-explorer)
|
||||
for Pageserver and Safekeeper.
|
||||
|
||||
### Scraping
|
||||
|
||||
* CPU profiling: 59 seconds at 19 Hz every 60 seconds.
|
||||
* Heap profiling: heap snapshot with 2 MB frequency every 60 seconds.
|
||||
|
||||
There are two main approaches that can be taken for CPU profiles:
|
||||
|
||||
* Continuous low-frequency profiles (e.g. 19 Hz for 60 seconds every 60 seconds).
|
||||
* Occasional high-frequency profiles (e.g. 99 Hz for 5 seconds every 60 seconds).
|
||||
|
||||
We choose continuous low-frequency profiles where possible. This has a fixed low overhead, instead
|
||||
of a spiky high overhead. It likely also gives a more representative view of resource usage.
|
||||
However, a 19 Hz rate gives a minimum resolution of 52.6 ms per sample, which may be larger than the
|
||||
actual runtime of small functions. Note that Go does not support a frequency parameter, so we must
|
||||
use a fixed frequency for all profiles via `SetCPUProfileRate()` (default 100 Hz).
|
||||
|
||||
Only one CPU profile can be taken at a time. With continuous profiling, one will always be running.
|
||||
To allow also taking an ad hoc CPU profile, the Rust endpoint supports a `force` query parameter to
|
||||
cancel a running profile and start a new one.
|
||||
|
||||
### Overhead
|
||||
|
||||
With Rust:
|
||||
|
||||
* CPU profiles at 19 Hz frequency: 0.1% overhead.
|
||||
* Heap profiles at 2 MB frequency: 3% allocation overhead.
|
||||
* Profile call/encoding/symbolization: 20 ms every 60 seconds, or 0.03% of 1 CPU (for Pageserver).
|
||||
* Profile symbolization caches: 125 MB memory, or 0.4% of 32 GB (for Pageserver).
|
||||
|
||||
Benchmarks with pprof-rs showed that the CPU time for taking a stack trace of a 40-frame stack was
|
||||
11 µs using the `frame-pointer` feature, and 1.4 µs using `libunwind` with DWARF. `libunwind` saw
|
||||
frequent seg faults, so we use `frame-pointer` and build binaries with frame pointers (negligible
|
||||
overhead).
|
||||
|
||||
CPU profiles work by installing an `ITIMER_PROF` for the process, which triggers a `SIGPROF` signal
|
||||
after a given amount of cumulative CPU time across all CPUs. The signal handler will run for one
|
||||
of the currently executing threads and take a stack trace. Thus, a 19 Hz profile will take 1 stack
|
||||
trace every 52.6 ms CPU time -- assuming 11 µs for a stack trace, this is 0.02% overhead, but
|
||||
likely 0.1% in practice (given e.g. context switches).
|
||||
|
||||
Heap profiles work by probabilistically taking a stack trace on allocations, adjusted for the
|
||||
allocation size. A 1 MB allocation takes about 15 µs in benchmarks, and a stack trace about 1 µs,
|
||||
so we can estimate that a 2 MB sampling frequency has about 3% allocation overhead -- this is
|
||||
consistent with benchmarks. This is significantly larger than CPU profiles, but mitigated by the
|
||||
fact that performance-sensitive code will avoid allocations as far as possible.
|
||||
|
||||
Profile symbolization uses in-memory caches for symbol lookups. These take about 125 MB for
|
||||
Pageserver.
|
||||
|
||||
## Alternatives Considered
|
||||
|
||||
* eBPF profiles.
|
||||
* Don't require instrumenting the binary.
|
||||
* Use fewer resources.
|
||||
* Can profile in kernel space too.
|
||||
* Supported by Grafana.
|
||||
* Less information about stack frames and spans.
|
||||
* Limited tooling for local analysis.
|
||||
* Does not support heap profiles.
|
||||
* Does not work on macOS.
|
||||
|
||||
* [Polar Signals](https://www.polarsignals.com) instead of Grafana.
|
||||
* We already use Grafana for everything else. Appears good enough.
|
||||
@@ -1,6 +1,6 @@
|
||||
# Storage broker
|
||||
|
||||
Storage broker targets two issues
|
||||
Storage broker targets two issues:
|
||||
- Allowing safekeepers and pageservers to learn which nodes also hold their
|
||||
timelines, and timeline statuses there.
|
||||
- Avoiding O(n^2) connections between storage nodes while doing so.
|
||||
@@ -19,7 +19,7 @@ Currently, the only message is `SafekeeperTimelineInfo`. Each safekeeper, for
|
||||
each active timeline, once in a while pushes timeline status to the broker.
|
||||
Other nodes subscribe and receive this info, using it per above.
|
||||
|
||||
Broker serves /metrics on the same port as grpc service.
|
||||
Broker serves /metrics on the same port as grpc service.
|
||||
|
||||
grpcurl can be used to check which values are currently being pushed:
|
||||
```
|
||||
|
||||
@@ -87,7 +87,7 @@ impl Display for AvailabilityZone {
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ShardsPreferredAzsRequest {
|
||||
#[serde(flatten)]
|
||||
pub preferred_az_ids: HashMap<TenantShardId, AvailabilityZone>,
|
||||
pub preferred_az_ids: HashMap<TenantShardId, Option<AvailabilityZone>>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
@@ -144,6 +144,8 @@ pub struct NodeDescribeResponse {
|
||||
pub availability: NodeAvailabilityWrapper,
|
||||
pub scheduling: NodeSchedulingPolicy,
|
||||
|
||||
pub availability_zone_id: String,
|
||||
|
||||
pub listen_http_addr: String,
|
||||
pub listen_http_port: u16,
|
||||
|
||||
@@ -179,7 +181,6 @@ pub struct TenantDescribeResponseShard {
|
||||
/// specifies some constraints, e.g. asking it to get off particular node(s)
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantShardMigrateRequest {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
pub node_id: NodeId,
|
||||
}
|
||||
|
||||
@@ -368,6 +369,16 @@ pub enum PlacementPolicy {
|
||||
Detached,
|
||||
}
|
||||
|
||||
impl PlacementPolicy {
|
||||
/// Returns the number of secondaries this policy wants: the count carried by
/// `Attached`, 1 for `Secondary`, and 0 for `Detached`.
pub fn want_secondaries(&self) -> usize {
|
||||
match self {
|
||||
PlacementPolicy::Attached(secondary_count) => *secondary_count,
|
||||
PlacementPolicy::Secondary => 1,
|
||||
PlacementPolicy::Detached => 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantShardMigrateResponse {}
|
||||
|
||||
|
||||
@@ -706,7 +706,7 @@ pub fn repl_origin_key_range() -> Range<Key> {
|
||||
/// Non inherited range for vectored get.
|
||||
pub const NON_INHERITED_RANGE: Range<Key> = AUX_FILES_KEY..AUX_FILES_KEY.next();
|
||||
/// Sparse keyspace range for vectored get. Missing key error will be ignored for this range.
|
||||
pub const NON_INHERITED_SPARSE_RANGE: Range<Key> = Key::metadata_key_range();
|
||||
pub const SPARSE_RANGE: Range<Key> = Key::metadata_key_range();
|
||||
|
||||
impl Key {
|
||||
// AUX_FILES currently stores only data for logical replication (slots etc), and
|
||||
@@ -714,7 +714,42 @@ impl Key {
|
||||
// switch (and generally it likely should be optional), so ignore these.
|
||||
#[inline(always)]
|
||||
pub fn is_inherited_key(self) -> bool {
|
||||
!NON_INHERITED_RANGE.contains(&self) && !NON_INHERITED_SPARSE_RANGE.contains(&self)
|
||||
if self.is_sparse() {
|
||||
self.is_inherited_sparse_key()
|
||||
} else {
|
||||
!NON_INHERITED_RANGE.contains(&self)
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true when `field1` falls in the half-open prefix range
/// `[METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX)`.
#[inline(always)]
|
||||
pub fn is_sparse(self) -> bool {
|
||||
self.field1 >= METADATA_KEY_BEGIN_PREFIX && self.field1 < METADATA_KEY_END_PREFIX
|
||||
}
|
||||
|
||||
/// Check if the key belongs to the inherited keyspace.
///
/// Must only be called on sparse keys (checked with `debug_assert!`). A sparse key
/// counts as inherited only when its `field1` equals `RELATION_SIZE_PREFIX`.
|
||||
fn is_inherited_sparse_key(self) -> bool {
|
||||
debug_assert!(self.is_sparse());
|
||||
self.field1 == RELATION_SIZE_PREFIX
|
||||
}
|
||||
|
||||
/// Returns the half-open key range starting at `field1 == AUX_KEY_PREFIX` and ending
/// just before `field1 == REPL_ORIGIN_KEY_PREFIX + 1`, with all other fields zero
/// at both bounds.
pub fn sparse_non_inherited_keyspace() -> Range<Key> {
|
||||
// The two prefixes are adjacent, so a single range covers both; if non-adjacent
// prefixes are added in the future, this should return a keyspace instead.
|
||||
debug_assert_eq!(AUX_KEY_PREFIX + 1, REPL_ORIGIN_KEY_PREFIX);
|
||||
Key {
|
||||
field1: AUX_KEY_PREFIX,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 0,
|
||||
}..Key {
|
||||
field1: REPL_ORIGIN_KEY_PREFIX + 1,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 0,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
|
||||
@@ -31,6 +31,8 @@
|
||||
//! - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
|
||||
//! and their slugs are 0004, 0104, 0204, and 0304.
|
||||
|
||||
use std::hash::{Hash, Hasher};
|
||||
|
||||
use crate::{key::Key, models::ShardParameters};
|
||||
use postgres_ffi::relfile_utils::INIT_FORKNUM;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -48,6 +50,23 @@ pub struct ShardIdentity {
|
||||
layout: ShardLayout,
|
||||
}
|
||||
|
||||
/// Hash implementation
|
||||
///
|
||||
/// The stripe size cannot change dynamically, so it can be ignored for efficiency reasons.
/// Only the shard `number` and `count` are fed to the hasher; `layout` is skipped as well.
|
||||
impl Hash for ShardIdentity {
|
||||
fn hash<H: Hasher>(&self, state: &mut H) {
|
||||
// Destructure every field by name; the ones bound to `_` are deliberately
// left out of the hash.
let ShardIdentity {
|
||||
number,
|
||||
count,
|
||||
stripe_size: _,
|
||||
layout: _,
|
||||
} = self;
|
||||
|
||||
number.0.hash(state);
|
||||
count.0.hash(state);
|
||||
}
|
||||
}
|
||||
|
||||
/// Stripe size in number of pages
|
||||
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
|
||||
pub struct ShardStripeSize(pub u32);
|
||||
@@ -59,7 +78,7 @@ impl Default for ShardStripeSize {
|
||||
}
|
||||
|
||||
/// Layout version: for future upgrades where we might change how the key->shard mapping works
|
||||
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
|
||||
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Hash, Debug)]
|
||||
pub struct ShardLayout(u8);
|
||||
|
||||
const LAYOUT_V1: ShardLayout = ShardLayout(1);
|
||||
|
||||
@@ -16,7 +16,7 @@ use utils::bin_ser::DeserializeError;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct XlMultiXactCreate {
|
||||
pub mid: MultiXactId,
|
||||
/* new MultiXact's ID */
|
||||
@@ -46,7 +46,7 @@ impl XlMultiXactCreate {
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct XlMultiXactTruncate {
|
||||
pub oldest_multi_db: Oid,
|
||||
/* to-be-truncated range of multixact offsets */
|
||||
@@ -72,7 +72,7 @@ impl XlMultiXactTruncate {
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct XlRelmapUpdate {
|
||||
pub dbid: Oid, /* database ID, or 0 for shared map */
|
||||
pub tsid: Oid, /* database's tablespace, or pg_global */
|
||||
@@ -90,7 +90,7 @@ impl XlRelmapUpdate {
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct XlReploriginDrop {
|
||||
pub node_id: RepOriginId,
|
||||
}
|
||||
@@ -104,7 +104,7 @@ impl XlReploriginDrop {
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct XlReploriginSet {
|
||||
pub remote_lsn: Lsn,
|
||||
pub node_id: RepOriginId,
|
||||
@@ -911,7 +911,7 @@ impl XlSmgrCreate {
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct XlSmgrTruncate {
|
||||
pub blkno: BlockNumber,
|
||||
pub rnode: RelFileNode,
|
||||
@@ -984,7 +984,7 @@ impl XlDropDatabase {
|
||||
/// xl_xact_parsed_abort structs in PostgreSQL, but we use the same
|
||||
/// struct for commits and aborts.
|
||||
///
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct XlXactParsedRecord {
|
||||
pub xid: TransactionId,
|
||||
pub info: u8,
|
||||
|
||||
@@ -44,7 +44,7 @@ pub struct ProtocolVersion(u32);
|
||||
|
||||
impl ProtocolVersion {
|
||||
pub const fn new(major: u16, minor: u16) -> Self {
|
||||
Self((major as u32) << 16 | minor as u32)
|
||||
Self(((major as u32) << 16) | minor as u32)
|
||||
}
|
||||
pub const fn minor(self) -> u16 {
|
||||
self.0 as u16
|
||||
|
||||
@@ -43,6 +43,17 @@ impl RemoteStorageKind {
|
||||
}
|
||||
}
|
||||
|
||||
impl RemoteStorageConfig {
|
||||
/// Helper to fetch the configured concurrency limit.
|
||||
pub fn concurrency_limit(&self) -> Option<usize> {
|
||||
match &self.storage {
|
||||
RemoteStorageKind::LocalFs { .. } => None,
|
||||
RemoteStorageKind::AwsS3(c) => Some(c.concurrency_limit.into()),
|
||||
RemoteStorageKind::AzureContainer(c) => Some(c.concurrency_limit.into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn default_timeout() -> Duration {
|
||||
RemoteStorageConfig::DEFAULT_TIMEOUT
|
||||
}
|
||||
|
||||
@@ -5,8 +5,10 @@ edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
const_format.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
pq_proto.workspace = true
|
||||
tokio.workspace = true
|
||||
|
||||
@@ -4,12 +4,15 @@ use const_format::formatcp;
|
||||
use pq_proto::SystemId;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
pub mod membership;
|
||||
/// Public API types
|
||||
pub mod models;
|
||||
|
||||
/// Consensus logical timestamp. Note: it is a part of sk control file.
|
||||
pub type Term = u64;
|
||||
pub const INVALID_TERM: Term = 0;
|
||||
/// With this term timeline is created initially. It
|
||||
/// is a normal term except wp is never elected with it.
|
||||
pub const INITIAL_TERM: Term = 0;
|
||||
|
||||
/// Information about Postgres. Safekeeper gets it once and then verifies all
|
||||
/// further connections from computes match. Note: it is a part of sk control
|
||||
|
||||
160
libs/safekeeper_api/src/membership.rs
Normal file
160
libs/safekeeper_api/src/membership.rs
Normal file
@@ -0,0 +1,160 @@
|
||||
//! Types defining safekeeper membership, see
|
||||
//! rfcs/035-safekeeper-dynamic-membership-change.md
|
||||
//! for details.
|
||||
|
||||
use std::{collections::HashSet, fmt::Display};
|
||||
|
||||
use anyhow;
|
||||
use anyhow::bail;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::NodeId;
|
||||
|
||||
/// Number uniquely identifying safekeeper configuration.
|
||||
/// Note: it is a part of sk control file.
|
||||
pub type Generation = u32;
|
||||
/// 1 is the first valid generation, 0 is used as
|
||||
/// a placeholder before we fully migrate to generations.
|
||||
pub const INVALID_GENERATION: Generation = 0;
|
||||
pub const INITIAL_GENERATION: Generation = 1;
|
||||
|
||||
/// Membership is defined by ids so e.g. walproposer uses them to figure out
|
||||
/// quorums, but we also carry host and port to give wp idea where to connect.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct SafekeeperId {
|
||||
pub id: NodeId,
|
||||
pub host: String,
|
||||
/// We include here only port for computes -- that is, pg protocol tenant
|
||||
/// only port, or wide pg protocol port if the former is not configured.
|
||||
pub pg_port: u16,
|
||||
}
|
||||
|
||||
impl Display for SafekeeperId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "[id={}, ep={}:{}]", self.id, self.host, self.pg_port)
|
||||
}
|
||||
}
|
||||
|
||||
/// Set of safekeepers.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
#[serde(transparent)]
|
||||
pub struct MemberSet {
|
||||
pub m: Vec<SafekeeperId>,
|
||||
}
|
||||
|
||||
impl MemberSet {
|
||||
pub fn empty() -> Self {
|
||||
MemberSet { m: Vec::new() }
|
||||
}
|
||||
|
||||
pub fn new(members: Vec<SafekeeperId>) -> anyhow::Result<Self> {
|
||||
let hs: HashSet<NodeId> = HashSet::from_iter(members.iter().map(|sk| sk.id));
|
||||
if hs.len() != members.len() {
|
||||
bail!("duplicate safekeeper id in the set {:?}", members);
|
||||
}
|
||||
Ok(MemberSet { m: members })
|
||||
}
|
||||
|
||||
pub fn contains(&self, sk: &SafekeeperId) -> bool {
|
||||
self.m.iter().any(|m| m.id == sk.id)
|
||||
}
|
||||
|
||||
pub fn add(&mut self, sk: SafekeeperId) -> anyhow::Result<()> {
|
||||
if self.contains(&sk) {
|
||||
bail!(format!(
|
||||
"sk {} is already member of the set {}",
|
||||
sk.id, self
|
||||
));
|
||||
}
|
||||
self.m.push(sk);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for MemberSet {
|
||||
/// Display as a comma separated list of members.
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let sks_str = self.m.iter().map(|sk| sk.to_string()).collect::<Vec<_>>();
|
||||
write!(f, "({})", sks_str.join(", "))
|
||||
}
|
||||
}
|
||||
|
||||
/// Safekeeper membership configuration.
|
||||
/// Note: it is a part of both control file and http API.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct Configuration {
|
||||
/// Unique id.
|
||||
pub generation: Generation,
|
||||
/// Current members of the configuration.
|
||||
pub members: MemberSet,
|
||||
/// Some means it is a joint conf.
|
||||
pub new_members: Option<MemberSet>,
|
||||
}
|
||||
|
||||
impl Configuration {
|
||||
/// Used for pre-generations timelines, will be removed eventually.
|
||||
pub fn empty() -> Self {
|
||||
Configuration {
|
||||
generation: INVALID_GENERATION,
|
||||
members: MemberSet::empty(),
|
||||
new_members: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for Configuration {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"gen={}, members={}, new_members={}",
|
||||
self.generation,
|
||||
self.members,
|
||||
self.new_members
|
||||
.as_ref()
|
||||
.map(ToString::to_string)
|
||||
.unwrap_or(String::from("none"))
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::{MemberSet, SafekeeperId};
|
||||
use utils::id::NodeId;
|
||||
|
||||
#[test]
|
||||
fn test_member_set() {
|
||||
let mut members = MemberSet::empty();
|
||||
members
|
||||
.add(SafekeeperId {
|
||||
id: NodeId(42),
|
||||
host: String::from("lala.org"),
|
||||
pg_port: 5432,
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
members
|
||||
.add(SafekeeperId {
|
||||
id: NodeId(42),
|
||||
host: String::from("lala.org"),
|
||||
pg_port: 5432,
|
||||
})
|
||||
.expect_err("duplicate must not be allowed");
|
||||
|
||||
members
|
||||
.add(SafekeeperId {
|
||||
id: NodeId(43),
|
||||
host: String::from("bubu.org"),
|
||||
pg_port: 5432,
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
println!("members: {}", members);
|
||||
|
||||
let j = serde_json::to_string(&members).expect("failed to serialize");
|
||||
println!("members json: {}", j);
|
||||
assert_eq!(
|
||||
j,
|
||||
r#"[{"id":42,"host":"lala.org","pg_port":5432},{"id":43,"host":"bubu.org","pg_port":5432}]"#
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -11,7 +11,7 @@ use utils::{
|
||||
pageserver_feedback::PageserverFeedback,
|
||||
};
|
||||
|
||||
use crate::{ServerInfo, Term};
|
||||
use crate::{membership::Configuration, ServerInfo, Term};
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct SafekeeperStatus {
|
||||
@@ -22,13 +22,16 @@ pub struct SafekeeperStatus {
|
||||
pub struct TimelineCreateRequest {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub peer_ids: Option<Vec<NodeId>>,
|
||||
pub mconf: Configuration,
|
||||
pub pg_version: u32,
|
||||
pub system_id: Option<u64>,
|
||||
// By default WAL_SEGMENT_SIZE
|
||||
pub wal_seg_size: Option<u32>,
|
||||
pub commit_lsn: Lsn,
|
||||
// If not passed, it is assigned to the beginning of commit_lsn segment.
|
||||
pub local_start_lsn: Option<Lsn>,
|
||||
pub start_lsn: Lsn,
|
||||
// Normal creation should omit this field (start_lsn initializes all LSNs).
|
||||
// However, we allow specifying custom value higher than start_lsn for
|
||||
// manual recovery case, see test_s3_wal_replay.
|
||||
pub commit_lsn: Option<Lsn>,
|
||||
}
|
||||
|
||||
/// Same as TermLsn, but serializes LSN using display serializer
|
||||
@@ -172,6 +175,7 @@ pub enum WalReceiverStatus {
|
||||
pub struct TimelineStatus {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub mconf: Configuration,
|
||||
pub acceptor_state: AcceptorStateStatus,
|
||||
pub pg_info: ServerInfo,
|
||||
pub flush_lsn: Lsn,
|
||||
@@ -186,6 +190,20 @@ pub struct TimelineStatus {
|
||||
pub walreceivers: Vec<WalReceiverState>,
|
||||
}
|
||||
|
||||
/// Request to switch membership configuration.
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[serde(transparent)]
|
||||
pub struct TimelineMembershipSwitchRequest {
|
||||
pub mconf: Configuration,
|
||||
}
|
||||
|
||||
/// In response both previous and current configuration are sent.
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TimelineMembershipSwitchResponse {
|
||||
pub previous_conf: Configuration,
|
||||
pub current_conf: Configuration,
|
||||
}
|
||||
|
||||
fn lsn_invalid() -> Lsn {
|
||||
Lsn::INVALID
|
||||
}
|
||||
|
||||
@@ -112,9 +112,9 @@ impl Serialize for Generation {
|
||||
// We should never be asked to serialize a None. Structures
|
||||
// that include an optional generation should convert None to an
|
||||
// Option<Generation>::None
|
||||
Err(serde::ser::Error::custom(
|
||||
"Tried to serialize invalid generation ({self})",
|
||||
))
|
||||
Err(serde::ser::Error::custom(format!(
|
||||
"Tried to serialize invalid generation ({self:?})"
|
||||
)))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,7 +15,7 @@ use once_cell::sync::Lazy;
|
||||
use regex::Regex;
|
||||
use routerify::ext::RequestExt;
|
||||
use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
|
||||
use tokio::sync::{mpsc, Mutex};
|
||||
use tokio::sync::{mpsc, Mutex, Notify};
|
||||
use tokio_stream::wrappers::ReceiverStream;
|
||||
use tokio_util::io::ReaderStream;
|
||||
use tracing::{debug, info, info_span, warn, Instrument};
|
||||
@@ -350,33 +350,53 @@ pub async fn profile_cpu_handler(req: Request<Body>) -> Result<Response<Body>, A
|
||||
};
|
||||
let seconds = match parse_query_param(&req, "seconds")? {
|
||||
None => 5,
|
||||
Some(seconds @ 1..=30) => seconds,
|
||||
Some(_) => return Err(ApiError::BadRequest(anyhow!("duration must be 1-30 secs"))),
|
||||
Some(seconds @ 1..=60) => seconds,
|
||||
Some(_) => return Err(ApiError::BadRequest(anyhow!("duration must be 1-60 secs"))),
|
||||
};
|
||||
let frequency_hz = match parse_query_param(&req, "frequency")? {
|
||||
None => 99,
|
||||
Some(1001..) => return Err(ApiError::BadRequest(anyhow!("frequency must be <=1000 Hz"))),
|
||||
Some(frequency) => frequency,
|
||||
};
|
||||
|
||||
// Only allow one profiler at a time.
|
||||
static PROFILE_LOCK: Lazy<Mutex<()>> = Lazy::new(|| Mutex::new(()));
|
||||
let _lock = PROFILE_LOCK
|
||||
.try_lock()
|
||||
.map_err(|_| ApiError::Conflict("profiler already running".into()))?;
|
||||
let force: bool = parse_query_param(&req, "force")?.unwrap_or_default();
|
||||
|
||||
// Take the profile.
|
||||
let report = tokio::task::spawn_blocking(move || {
|
||||
static PROFILE_LOCK: Lazy<Mutex<()>> = Lazy::new(|| Mutex::new(()));
|
||||
static PROFILE_CANCEL: Lazy<Notify> = Lazy::new(Notify::new);
|
||||
|
||||
let report = {
|
||||
// Only allow one profiler at a time. If force is true, cancel a running profile (e.g. a
|
||||
// Grafana continuous profile). We use a try_lock() loop when cancelling instead of waiting
|
||||
// for a lock(), to avoid races where the notify isn't currently awaited.
|
||||
let _lock = loop {
|
||||
match PROFILE_LOCK.try_lock() {
|
||||
Ok(lock) => break lock,
|
||||
Err(_) if force => PROFILE_CANCEL.notify_waiters(),
|
||||
Err(_) => {
|
||||
return Err(ApiError::Conflict(
|
||||
"profiler already running (use ?force=true to cancel it)".into(),
|
||||
))
|
||||
}
|
||||
}
|
||||
tokio::time::sleep(Duration::from_millis(1)).await; // don't busy-wait
|
||||
};
|
||||
|
||||
let guard = ProfilerGuardBuilder::default()
|
||||
.frequency(frequency_hz)
|
||||
.blocklist(&["libc", "libgcc", "pthread", "vdso"])
|
||||
.build()?;
|
||||
std::thread::sleep(Duration::from_secs(seconds));
|
||||
guard.report().build()
|
||||
})
|
||||
.await
|
||||
.map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
|
||||
.map_err(|pprof_err| ApiError::InternalServerError(pprof_err.into()))?;
|
||||
.build()
|
||||
.map_err(|err| ApiError::InternalServerError(err.into()))?;
|
||||
|
||||
tokio::select! {
|
||||
_ = tokio::time::sleep(Duration::from_secs(seconds)) => {},
|
||||
_ = PROFILE_CANCEL.notified() => {},
|
||||
};
|
||||
|
||||
guard
|
||||
.report()
|
||||
.build()
|
||||
.map_err(|err| ApiError::InternalServerError(err.into()))?
|
||||
};
|
||||
|
||||
// Return the report in the requested format.
|
||||
match format {
|
||||
|
||||
@@ -260,7 +260,7 @@ impl FromStr for Lsn {
|
||||
{
|
||||
let left_num = u32::from_str_radix(left, 16).map_err(|_| LsnParseError)?;
|
||||
let right_num = u32::from_str_radix(right, 16).map_err(|_| LsnParseError)?;
|
||||
Ok(Lsn((left_num as u64) << 32 | right_num as u64))
|
||||
Ok(Lsn(((left_num as u64) << 32) | right_num as u64))
|
||||
} else {
|
||||
Err(LsnParseError)
|
||||
}
|
||||
|
||||
@@ -24,3 +24,18 @@ workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
[build-dependencies]
|
||||
tonic-build.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
criterion.workspace = true
|
||||
camino.workspace = true
|
||||
camino-tempfile.workspace = true
|
||||
remote_storage.workspace = true
|
||||
tokio-util.workspace = true
|
||||
serde_json.workspace = true
|
||||
futures.workspace = true
|
||||
tikv-jemallocator.workspace = true
|
||||
pprof.workspace = true
|
||||
|
||||
[[bench]]
|
||||
name = "bench_interpret_wal"
|
||||
harness = false
|
||||
|
||||
34
libs/wal_decoder/benches/README.md
Normal file
34
libs/wal_decoder/benches/README.md
Normal file
@@ -0,0 +1,34 @@
|
||||
## WAL Decoding and Interpretation Benchmarks
|
||||
|
||||
Note that these benchmarks pull WAL from a public bucket in S3
|
||||
as a preparation step. Hence, you need a way to auth with AWS.
|
||||
You can achieve this by copying the `~/.aws/config` file from
|
||||
the AWS SSO Notion page and exporting `AWS_PROFILE=dev` when invoking
|
||||
the benchmarks.
|
||||
|
||||
To run benchmarks:
|
||||
|
||||
```sh
|
||||
aws sso login --profile dev
|
||||
|
||||
# All benchmarks.
|
||||
AWS_PROFILE=dev cargo bench --package wal_decoder
|
||||
|
||||
# Specific file.
|
||||
AWS_PROFILE=dev cargo bench --package wal_decoder --bench bench_interpret_wal
|
||||
|
||||
# Specific benchmark.
|
||||
AWS_PROFILE=dev cargo bench --package wal_decoder --bench bench_interpret_wal unsharded
|
||||
|
||||
# List available benchmarks.
|
||||
cargo bench --package wal_decoder --benches -- --list
|
||||
|
||||
# Generate flamegraph profiles using pprof-rs, profiling for 10 seconds.
|
||||
# Output in target/criterion/*/profile/flamegraph.svg.
|
||||
AWS_PROFILE=dev cargo bench --package wal_decoder --bench bench_interpret_wal unsharded -- --profile-time 10
|
||||
```
|
||||
|
||||
Additional charts and statistics are available in `target/criterion/report/index.html`.
|
||||
|
||||
Benchmarks are automatically compared against the previous run. To compare against other runs, see
|
||||
`--baseline` and `--save-baseline`.
|
||||
250
libs/wal_decoder/benches/bench_interpret_wal.rs
Normal file
250
libs/wal_decoder/benches/bench_interpret_wal.rs
Normal file
@@ -0,0 +1,250 @@
|
||||
use anyhow::Context;
|
||||
use criterion::{criterion_group, criterion_main, Criterion};
|
||||
use futures::{stream::FuturesUnordered, StreamExt};
|
||||
use pageserver_api::shard::{ShardIdentity, ShardStripeSize};
|
||||
use postgres_ffi::{waldecoder::WalStreamDecoder, MAX_SEND_SIZE, WAL_SEGMENT_SIZE};
|
||||
use pprof::criterion::{Output, PProfProfiler};
|
||||
use serde::Deserialize;
|
||||
use std::{env, num::NonZeroUsize, sync::Arc};
|
||||
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use camino_tempfile::Utf8TempDir;
|
||||
use remote_storage::{
|
||||
DownloadOpts, GenericRemoteStorage, ListingMode, RemoteStorageConfig, RemoteStorageKind,
|
||||
S3Config,
|
||||
};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::{
|
||||
lsn::Lsn,
|
||||
shard::{ShardCount, ShardNumber},
|
||||
};
|
||||
use wal_decoder::models::InterpretedWalRecord;
|
||||
|
||||
const S3_BUCKET: &str = "neon-github-public-dev";
|
||||
const S3_REGION: &str = "eu-central-1";
|
||||
const BUCKET_PREFIX: &str = "wal-snapshots/bulk-insert/";
|
||||
const METADATA_FILENAME: &str = "metadata.json";
|
||||
|
||||
/// Use jemalloc, and configure it to sample allocations for profiles every 1 MB.
|
||||
/// This mirrors the configuration in bin/safekeeper.rs.
|
||||
#[global_allocator]
|
||||
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
|
||||
|
||||
#[allow(non_upper_case_globals)]
|
||||
#[export_name = "malloc_conf"]
|
||||
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0";
|
||||
|
||||
async fn create_s3_client() -> anyhow::Result<Arc<GenericRemoteStorage>> {
|
||||
let remote_storage_config = RemoteStorageConfig {
|
||||
storage: RemoteStorageKind::AwsS3(S3Config {
|
||||
bucket_name: S3_BUCKET.to_string(),
|
||||
bucket_region: S3_REGION.to_string(),
|
||||
prefix_in_bucket: Some(BUCKET_PREFIX.to_string()),
|
||||
endpoint: None,
|
||||
concurrency_limit: NonZeroUsize::new(100).unwrap(),
|
||||
max_keys_per_list_response: None,
|
||||
upload_storage_class: None,
|
||||
}),
|
||||
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
|
||||
small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT,
|
||||
};
|
||||
Ok(Arc::new(
|
||||
GenericRemoteStorage::from_config(&remote_storage_config)
|
||||
.await
|
||||
.context("remote storage init")?,
|
||||
))
|
||||
}
|
||||
|
||||
async fn download_bench_data(
|
||||
client: Arc<GenericRemoteStorage>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<Utf8TempDir> {
|
||||
let temp_dir_parent: Utf8PathBuf = env::current_dir().unwrap().try_into()?;
|
||||
let temp_dir = camino_tempfile::tempdir_in(temp_dir_parent)?;
|
||||
|
||||
eprintln!("Downloading benchmark data to {:?}", temp_dir);
|
||||
|
||||
let listing = client
|
||||
.list(None, ListingMode::NoDelimiter, None, cancel)
|
||||
.await?;
|
||||
|
||||
let mut downloads = listing
|
||||
.keys
|
||||
.into_iter()
|
||||
.map(|obj| {
|
||||
let client = client.clone();
|
||||
let temp_dir_path = temp_dir.path().to_owned();
|
||||
|
||||
async move {
|
||||
let remote_path = obj.key;
|
||||
let download = client
|
||||
.download(&remote_path, &DownloadOpts::default(), cancel)
|
||||
.await?;
|
||||
let mut body = tokio_util::io::StreamReader::new(download.download_stream);
|
||||
|
||||
let file_name = remote_path.object_name().unwrap();
|
||||
let file_path = temp_dir_path.join(file_name);
|
||||
let file = tokio::fs::OpenOptions::new()
|
||||
.create(true)
|
||||
.truncate(true)
|
||||
.write(true)
|
||||
.open(&file_path)
|
||||
.await?;
|
||||
|
||||
let mut writer = tokio::io::BufWriter::new(file);
|
||||
tokio::io::copy_buf(&mut body, &mut writer).await?;
|
||||
|
||||
Ok::<(), anyhow::Error>(())
|
||||
}
|
||||
})
|
||||
.collect::<FuturesUnordered<_>>();
|
||||
|
||||
while let Some(download) = downloads.next().await {
|
||||
download?;
|
||||
}
|
||||
|
||||
Ok(temp_dir)
|
||||
}
|
||||
|
||||
struct BenchmarkData {
|
||||
wal: Vec<u8>,
|
||||
meta: BenchmarkMetadata,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct BenchmarkMetadata {
|
||||
pg_version: u32,
|
||||
start_lsn: Lsn,
|
||||
}
|
||||
|
||||
async fn load_bench_data(path: &Utf8Path, input_size: usize) -> anyhow::Result<BenchmarkData> {
|
||||
eprintln!("Loading benchmark data from {:?}", path);
|
||||
|
||||
let mut entries = tokio::fs::read_dir(path).await?;
|
||||
let mut ordered_segment_paths = Vec::new();
|
||||
let mut metadata = None;
|
||||
|
||||
while let Some(entry) = entries.next_entry().await? {
|
||||
if entry.file_name() == METADATA_FILENAME {
|
||||
let bytes = tokio::fs::read(entry.path()).await?;
|
||||
metadata = Some(
|
||||
serde_json::from_slice::<BenchmarkMetadata>(&bytes)
|
||||
.context("failed to deserialize metadata.json")?,
|
||||
);
|
||||
} else {
|
||||
ordered_segment_paths.push(entry.path());
|
||||
}
|
||||
}
|
||||
|
||||
ordered_segment_paths.sort();
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
for path in ordered_segment_paths {
|
||||
if buffer.len() >= input_size {
|
||||
break;
|
||||
}
|
||||
|
||||
use async_compression::tokio::bufread::ZstdDecoder;
|
||||
let file = tokio::fs::File::open(path).await?;
|
||||
let reader = tokio::io::BufReader::new(file);
|
||||
let decoder = ZstdDecoder::new(reader);
|
||||
let mut reader = tokio::io::BufReader::new(decoder);
|
||||
tokio::io::copy_buf(&mut reader, &mut buffer).await?;
|
||||
}
|
||||
|
||||
buffer.truncate(input_size);
|
||||
|
||||
Ok(BenchmarkData {
|
||||
wal: buffer,
|
||||
meta: metadata.unwrap(),
|
||||
})
|
||||
}
|
||||
|
||||
fn criterion_benchmark(c: &mut Criterion) {
|
||||
const INPUT_SIZE: usize = 128 * 1024 * 1024;
|
||||
|
||||
let setup_runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let (_temp_dir, bench_data) = setup_runtime.block_on(async move {
|
||||
let cancel = CancellationToken::new();
|
||||
let client = create_s3_client().await.unwrap();
|
||||
let temp_dir = download_bench_data(client, &cancel).await.unwrap();
|
||||
let bench_data = load_bench_data(temp_dir.path(), INPUT_SIZE).await.unwrap();
|
||||
|
||||
(temp_dir, bench_data)
|
||||
});
|
||||
|
||||
eprintln!(
|
||||
"Benchmarking against {} MiB of WAL",
|
||||
INPUT_SIZE / 1024 / 1024
|
||||
);
|
||||
|
||||
let mut group = c.benchmark_group("decode-interpret-wal");
|
||||
group.throughput(criterion::Throughput::Bytes(bench_data.wal.len() as u64));
|
||||
group.sample_size(10);
|
||||
|
||||
group.bench_function("unsharded", |b| {
|
||||
b.iter(|| decode_interpret_main(&bench_data, &[ShardIdentity::unsharded()]))
|
||||
});
|
||||
|
||||
let eight_shards = (0..8)
|
||||
.map(|i| ShardIdentity::new(ShardNumber(i), ShardCount(8), ShardStripeSize(8)).unwrap())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
group.bench_function("8/8-shards", |b| {
|
||||
b.iter(|| decode_interpret_main(&bench_data, &eight_shards))
|
||||
});
|
||||
|
||||
let four_shards = eight_shards
|
||||
.into_iter()
|
||||
.filter(|s| s.number.0 % 2 == 0)
|
||||
.collect::<Vec<_>>();
|
||||
group.bench_function("4/8-shards", |b| {
|
||||
b.iter(|| decode_interpret_main(&bench_data, &four_shards))
|
||||
});
|
||||
|
||||
let two_shards = four_shards
|
||||
.into_iter()
|
||||
.filter(|s| s.number.0 % 4 == 0)
|
||||
.collect::<Vec<_>>();
|
||||
group.bench_function("2/8-shards", |b| {
|
||||
b.iter(|| decode_interpret_main(&bench_data, &two_shards))
|
||||
});
|
||||
}
|
||||
|
||||
fn decode_interpret_main(bench: &BenchmarkData, shards: &[ShardIdentity]) {
|
||||
let r = decode_interpret(bench, shards);
|
||||
if let Err(e) = r {
|
||||
panic!("{e:?}");
|
||||
}
|
||||
}
|
||||
|
||||
fn decode_interpret(bench: &BenchmarkData, shard: &[ShardIdentity]) -> anyhow::Result<()> {
|
||||
let mut decoder = WalStreamDecoder::new(bench.meta.start_lsn, bench.meta.pg_version);
|
||||
let xlogoff: usize = bench.meta.start_lsn.segment_offset(WAL_SEGMENT_SIZE);
|
||||
|
||||
for chunk in bench.wal[xlogoff..].chunks(MAX_SEND_SIZE) {
|
||||
decoder.feed_bytes(chunk);
|
||||
while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() {
|
||||
assert!(lsn.is_aligned());
|
||||
let _ = InterpretedWalRecord::from_bytes_filtered(
|
||||
recdata,
|
||||
shard,
|
||||
lsn,
|
||||
bench.meta.pg_version,
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
criterion_group!(
|
||||
name=benches;
|
||||
config=Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
|
||||
targets=criterion_benchmark
|
||||
);
|
||||
criterion_main!(benches);
|
||||
@@ -1,6 +1,8 @@
|
||||
//! This module contains logic for decoding and interpreting
|
||||
//! raw bytes which represent a raw Postgres WAL record.
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
use crate::models::*;
|
||||
use crate::serialized_batch::SerializedValueBatch;
|
||||
use bytes::{Buf, Bytes};
|
||||
@@ -14,15 +16,15 @@ use utils::lsn::Lsn;
|
||||
|
||||
impl InterpretedWalRecord {
|
||||
/// Decode and interpret raw bytes which represent one Postgres WAL record.
|
||||
/// Data blocks which do not match the provided shard identity are filtered out.
|
||||
/// Data blocks which do not match any of the provided shard identities are filtered out.
|
||||
/// Shard 0 is a special case since it tracks all relation sizes. We only give it
|
||||
/// the keys that are being written as that is enough for updating relation sizes.
|
||||
pub fn from_bytes_filtered(
|
||||
buf: Bytes,
|
||||
shard: &ShardIdentity,
|
||||
shards: &[ShardIdentity],
|
||||
next_record_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<InterpretedWalRecord> {
|
||||
) -> anyhow::Result<HashMap<ShardIdentity, InterpretedWalRecord>> {
|
||||
let mut decoded = DecodedWALRecord::default();
|
||||
decode_wal_record(buf, &mut decoded, pg_version)?;
|
||||
let xid = decoded.xl_xid;
|
||||
@@ -33,43 +35,57 @@ impl InterpretedWalRecord {
|
||||
FlushUncommittedRecords::No
|
||||
};
|
||||
|
||||
let metadata_record =
|
||||
MetadataRecord::from_decoded_filtered(&decoded, shard, next_record_lsn, pg_version)?;
|
||||
let batch = SerializedValueBatch::from_decoded_filtered(
|
||||
let mut shard_records: HashMap<ShardIdentity, InterpretedWalRecord> =
|
||||
HashMap::with_capacity(shards.len());
|
||||
for shard in shards {
|
||||
shard_records.insert(
|
||||
*shard,
|
||||
InterpretedWalRecord {
|
||||
metadata_record: None,
|
||||
batch: SerializedValueBatch::default(),
|
||||
next_record_lsn,
|
||||
flush_uncommitted,
|
||||
xid,
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
MetadataRecord::from_decoded_filtered(
|
||||
&decoded,
|
||||
&mut shard_records,
|
||||
next_record_lsn,
|
||||
pg_version,
|
||||
)?;
|
||||
SerializedValueBatch::from_decoded_filtered(
|
||||
decoded,
|
||||
shard,
|
||||
&mut shard_records,
|
||||
next_record_lsn,
|
||||
pg_version,
|
||||
)?;
|
||||
|
||||
Ok(InterpretedWalRecord {
|
||||
metadata_record,
|
||||
batch,
|
||||
next_record_lsn,
|
||||
flush_uncommitted,
|
||||
xid,
|
||||
})
|
||||
Ok(shard_records)
|
||||
}
|
||||
}
|
||||
|
||||
impl MetadataRecord {
|
||||
/// Builds a metadata record for this WAL record, if any.
|
||||
/// Populates the given `shard_records` with metadata records from this WAL record, if any,
|
||||
/// discarding those belonging to other shards.
|
||||
///
|
||||
/// Only metadata records relevant for the given shard are emitted. Currently, most metadata
|
||||
/// Only metadata records relevant for the given shards are emitted. Currently, most metadata
|
||||
/// records are broadcast to all shards for simplicity, but this should be improved.
|
||||
fn from_decoded_filtered(
|
||||
decoded: &DecodedWALRecord,
|
||||
shard: &ShardIdentity,
|
||||
shard_records: &mut HashMap<ShardIdentity, InterpretedWalRecord>,
|
||||
next_record_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Option<MetadataRecord>> {
|
||||
) -> anyhow::Result<()> {
|
||||
// Note: this doesn't actually copy the bytes since
|
||||
// the [`Bytes`] type implements it via a level of indirection.
|
||||
let mut buf = decoded.record.clone();
|
||||
buf.advance(decoded.main_data_offset);
|
||||
|
||||
// First, generate metadata records from the decoded WAL record.
|
||||
let mut metadata_record = match decoded.xl_rmid {
|
||||
let metadata_record = match decoded.xl_rmid {
|
||||
pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => {
|
||||
Self::decode_heapam_record(&mut buf, decoded, pg_version)?
|
||||
}
|
||||
@@ -112,41 +128,65 @@ impl MetadataRecord {
|
||||
};
|
||||
|
||||
// Next, filter the metadata record by shard.
|
||||
match metadata_record {
|
||||
Some(
|
||||
MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref mut clear_vm_bits))
|
||||
| MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref mut clear_vm_bits)),
|
||||
) => {
|
||||
// Route VM page updates to the shards that own them. VM pages are stored in the VM fork
|
||||
// of the main relation. These are sharded and managed just like regular relation pages.
|
||||
// See: https://github.com/neondatabase/neon/issues/9855
|
||||
let is_local_vm_page = |heap_blk| {
|
||||
let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk);
|
||||
shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk))
|
||||
};
|
||||
// Send the old and new VM page updates to their respective shards.
|
||||
clear_vm_bits.old_heap_blkno = clear_vm_bits
|
||||
.old_heap_blkno
|
||||
.filter(|&blkno| is_local_vm_page(blkno));
|
||||
clear_vm_bits.new_heap_blkno = clear_vm_bits
|
||||
.new_heap_blkno
|
||||
.filter(|&blkno| is_local_vm_page(blkno));
|
||||
// If neither VM page belongs to this shard, discard the record.
|
||||
if clear_vm_bits.old_heap_blkno.is_none() && clear_vm_bits.new_heap_blkno.is_none()
|
||||
{
|
||||
metadata_record = None
|
||||
for (shard, record) in shard_records.iter_mut() {
|
||||
match metadata_record {
|
||||
Some(
|
||||
MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref clear_vm_bits))
|
||||
| MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref clear_vm_bits)),
|
||||
) => {
|
||||
// Route VM page updates to the shards that own them. VM pages are stored in the VM fork
|
||||
// of the main relation. These are sharded and managed just like regular relation pages.
|
||||
// See: https://github.com/neondatabase/neon/issues/9855
|
||||
let is_local_vm_page = |heap_blk| {
|
||||
let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk);
|
||||
shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk))
|
||||
};
|
||||
// Send the old and new VM page updates to their respective shards.
|
||||
let updated_old_heap_blkno = clear_vm_bits
|
||||
.old_heap_blkno
|
||||
.filter(|&blkno| is_local_vm_page(blkno));
|
||||
let updated_new_heap_blkno = clear_vm_bits
|
||||
.new_heap_blkno
|
||||
.filter(|&blkno| is_local_vm_page(blkno));
|
||||
// If neither VM page belongs to this shard, discard the record.
|
||||
if updated_old_heap_blkno.is_some() || updated_new_heap_blkno.is_some() {
|
||||
// Clone the record and update it for the current shard.
|
||||
let mut for_shard = metadata_record.clone();
|
||||
match for_shard {
|
||||
Some(
|
||||
MetadataRecord::Heapam(HeapamRecord::ClearVmBits(
|
||||
ref mut clear_vm_bits,
|
||||
))
|
||||
| MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(
|
||||
ref mut clear_vm_bits,
|
||||
)),
|
||||
) => {
|
||||
clear_vm_bits.old_heap_blkno = updated_old_heap_blkno;
|
||||
clear_vm_bits.new_heap_blkno = updated_new_heap_blkno;
|
||||
record.metadata_record = for_shard;
|
||||
}
|
||||
_ => {
|
||||
unreachable!("for_shard is a clone of what we checked above")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Some(MetadataRecord::LogicalMessage(LogicalMessageRecord::Put(_))) => {
|
||||
// Filter LogicalMessage records (AUX files) to only be stored on shard zero
|
||||
if shard.is_shard_zero() {
|
||||
record.metadata_record = metadata_record;
|
||||
// No other shards should receive this record, so we stop traversing shards early.
|
||||
break;
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
// All other metadata records are sent to all shards.
|
||||
record.metadata_record = metadata_record.clone();
|
||||
}
|
||||
}
|
||||
Some(MetadataRecord::LogicalMessage(LogicalMessageRecord::Put(_))) => {
|
||||
// Filter LogicalMessage records (AUX files) to only be stored on shard zero
|
||||
if !shard.is_shard_zero() {
|
||||
metadata_record = None;
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
Ok(metadata_record)
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn decode_heapam_record(
|
||||
|
||||
@@ -48,7 +48,7 @@ pub mod proto {
|
||||
tonic::include_proto!("interpreted_wal");
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Copy, Clone, Serialize, Deserialize)]
|
||||
pub enum FlushUncommittedRecords {
|
||||
Yes,
|
||||
No,
|
||||
@@ -107,7 +107,7 @@ impl InterpretedWalRecord {
|
||||
|
||||
/// The interpreted part of the Postgres WAL record which requires metadata
|
||||
/// writes to the underlying storage engine.
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub enum MetadataRecord {
|
||||
Heapam(HeapamRecord),
|
||||
Neonrmgr(NeonrmgrRecord),
|
||||
@@ -123,12 +123,12 @@ pub enum MetadataRecord {
|
||||
Replorigin(ReploriginRecord),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub enum HeapamRecord {
|
||||
ClearVmBits(ClearVmBits),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct ClearVmBits {
|
||||
pub new_heap_blkno: Option<u32>,
|
||||
pub old_heap_blkno: Option<u32>,
|
||||
@@ -136,29 +136,29 @@ pub struct ClearVmBits {
|
||||
pub flags: u8,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub enum NeonrmgrRecord {
|
||||
ClearVmBits(ClearVmBits),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub enum SmgrRecord {
|
||||
Create(SmgrCreate),
|
||||
Truncate(XlSmgrTruncate),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct SmgrCreate {
|
||||
pub rel: RelTag,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub enum DbaseRecord {
|
||||
Create(DbaseCreate),
|
||||
Drop(DbaseDrop),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct DbaseCreate {
|
||||
pub db_id: Oid,
|
||||
pub tablespace_id: Oid,
|
||||
@@ -166,32 +166,32 @@ pub struct DbaseCreate {
|
||||
pub src_tablespace_id: Oid,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct DbaseDrop {
|
||||
pub db_id: Oid,
|
||||
pub tablespace_ids: Vec<Oid>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub enum ClogRecord {
|
||||
ZeroPage(ClogZeroPage),
|
||||
Truncate(ClogTruncate),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct ClogZeroPage {
|
||||
pub segno: u32,
|
||||
pub rpageno: u32,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct ClogTruncate {
|
||||
pub pageno: u32,
|
||||
pub oldest_xid: TransactionId,
|
||||
pub oldest_xid_db: Oid,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub enum XactRecord {
|
||||
Commit(XactCommon),
|
||||
Abort(XactCommon),
|
||||
@@ -200,7 +200,7 @@ pub enum XactRecord {
|
||||
Prepare(XactPrepare),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct XactCommon {
|
||||
pub parsed: XlXactParsedRecord,
|
||||
pub origin_id: u16,
|
||||
@@ -209,73 +209,73 @@ pub struct XactCommon {
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct XactPrepare {
|
||||
pub xl_xid: TransactionId,
|
||||
pub data: Bytes,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub enum MultiXactRecord {
|
||||
ZeroPage(MultiXactZeroPage),
|
||||
Create(XlMultiXactCreate),
|
||||
Truncate(XlMultiXactTruncate),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct MultiXactZeroPage {
|
||||
pub slru_kind: SlruKind,
|
||||
pub segno: u32,
|
||||
pub rpageno: u32,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub enum RelmapRecord {
|
||||
Update(RelmapUpdate),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct RelmapUpdate {
|
||||
pub update: XlRelmapUpdate,
|
||||
pub buf: Bytes,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub enum XlogRecord {
|
||||
Raw(RawXlogRecord),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct RawXlogRecord {
|
||||
pub info: u8,
|
||||
pub lsn: Lsn,
|
||||
pub buf: Bytes,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub enum LogicalMessageRecord {
|
||||
Put(PutLogicalMessage),
|
||||
#[cfg(feature = "testing")]
|
||||
Failpoint,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct PutLogicalMessage {
|
||||
pub path: String,
|
||||
pub buf: Bytes,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub enum StandbyRecord {
|
||||
RunningXacts(StandbyRunningXacts),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct StandbyRunningXacts {
|
||||
pub oldest_running_xid: TransactionId,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub enum ReploriginRecord {
|
||||
Set(XlReploriginSet),
|
||||
Drop(XlReploriginDrop),
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
//! Such batches are created from decoded PG wal records and ingested
|
||||
//! by the pageserver by writing directly to the ephemeral file.
|
||||
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::{BTreeSet, HashMap};
|
||||
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use pageserver_api::key::rel_block_to_key;
|
||||
@@ -22,6 +22,8 @@ use utils::lsn::Lsn;
|
||||
|
||||
use pageserver_api::key::Key;
|
||||
|
||||
use crate::models::InterpretedWalRecord;
|
||||
|
||||
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
|
||||
|
||||
/// Accompanying metadata for the batch
|
||||
@@ -128,7 +130,8 @@ impl Default for SerializedValueBatch {
|
||||
}
|
||||
|
||||
impl SerializedValueBatch {
|
||||
/// Build a batch of serialized values from a decoded PG WAL record
|
||||
/// Populates the given `shard_records` with value batches from this WAL record, if any,
|
||||
/// discarding those belonging to other shards.
|
||||
///
|
||||
/// The batch will only contain values for keys targeting the specified
|
||||
/// shard. Shard 0 is a special case, where any keys that don't belong to
|
||||
@@ -136,21 +139,20 @@ impl SerializedValueBatch {
|
||||
/// but absent from the raw buffer [`SerializedValueBatch::raw`]).
|
||||
pub(crate) fn from_decoded_filtered(
|
||||
decoded: DecodedWALRecord,
|
||||
shard: &ShardIdentity,
|
||||
shard_records: &mut HashMap<ShardIdentity, InterpretedWalRecord>,
|
||||
next_record_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<SerializedValueBatch> {
|
||||
// First determine how big the buffer needs to be and allocate it up-front.
|
||||
) -> anyhow::Result<()> {
|
||||
// First determine how big the buffers need to be and allocate them up-front.
|
||||
// This duplicates some of the work below, but it's empirically much faster.
|
||||
let estimated_buffer_size = Self::estimate_buffer_size(&decoded, shard, pg_version);
|
||||
let mut buf = Vec::<u8>::with_capacity(estimated_buffer_size);
|
||||
for (shard, record) in shard_records.iter_mut() {
|
||||
assert!(record.batch.is_empty());
|
||||
|
||||
let estimate = Self::estimate_buffer_size(&decoded, shard, pg_version);
|
||||
record.batch.raw = Vec::with_capacity(estimate);
|
||||
}
|
||||
|
||||
let mut metadata: Vec<ValueMeta> = Vec::with_capacity(decoded.blocks.len());
|
||||
let mut max_lsn: Lsn = Lsn(0);
|
||||
let mut len: usize = 0;
|
||||
for blk in decoded.blocks.iter() {
|
||||
let relative_off = buf.len() as u64;
|
||||
|
||||
let rel = RelTag {
|
||||
spcnode: blk.rnode_spcnode,
|
||||
dbnode: blk.rnode_dbnode,
|
||||
@@ -168,99 +170,98 @@ impl SerializedValueBatch {
|
||||
);
|
||||
}
|
||||
|
||||
let key_is_local = shard.is_key_local(&key);
|
||||
for (shard, record) in shard_records.iter_mut() {
|
||||
let key_is_local = shard.is_key_local(&key);
|
||||
|
||||
tracing::debug!(
|
||||
lsn=%next_record_lsn,
|
||||
key=%key,
|
||||
"ingest: shard decision {}",
|
||||
if !key_is_local { "drop" } else { "keep" },
|
||||
);
|
||||
tracing::debug!(
|
||||
lsn=%next_record_lsn,
|
||||
key=%key,
|
||||
"ingest: shard decision {}",
|
||||
if !key_is_local { "drop" } else { "keep" },
|
||||
);
|
||||
|
||||
if !key_is_local {
|
||||
if shard.is_shard_zero() {
|
||||
// Shard 0 tracks relation sizes. Although we will not store this block, we will observe
|
||||
// its blkno in case it implicitly extends a relation.
|
||||
metadata.push(ValueMeta::Observed(ObservedValueMeta {
|
||||
if !key_is_local {
|
||||
if shard.is_shard_zero() {
|
||||
// Shard 0 tracks relation sizes. Although we will not store this block, we will observe
|
||||
// its blkno in case it implicitly extends a relation.
|
||||
record
|
||||
.batch
|
||||
.metadata
|
||||
.push(ValueMeta::Observed(ObservedValueMeta {
|
||||
key: key.to_compact(),
|
||||
lsn: next_record_lsn,
|
||||
}))
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Instead of storing full-page-image WAL record,
|
||||
// it is better to store extracted image: we can skip wal-redo
|
||||
// in this case. Also some FPI records may contain multiple (up to 32) pages,
|
||||
// so they have to be copied multiple times.
|
||||
//
|
||||
let val = if Self::block_is_image(&decoded, blk, pg_version) {
|
||||
// Extract page image from FPI record
|
||||
let img_len = blk.bimg_len as usize;
|
||||
let img_offs = blk.bimg_offset as usize;
|
||||
let mut image = BytesMut::with_capacity(BLCKSZ as usize);
|
||||
// TODO(vlad): skip the copy
|
||||
image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]);
|
||||
|
||||
if blk.hole_length != 0 {
|
||||
let tail = image.split_off(blk.hole_offset as usize);
|
||||
image.resize(image.len() + blk.hole_length as usize, 0u8);
|
||||
image.unsplit(tail);
|
||||
}
|
||||
//
|
||||
// Match the logic of XLogReadBufferForRedoExtended:
|
||||
// The page may be uninitialized. If so, we can't set the LSN because
|
||||
// that would corrupt the page.
|
||||
//
|
||||
if !page_is_new(&image) {
|
||||
page_set_lsn(&mut image, next_record_lsn)
|
||||
}
|
||||
assert_eq!(image.len(), BLCKSZ as usize);
|
||||
|
||||
Value::Image(image.freeze())
|
||||
} else {
|
||||
Value::WalRecord(NeonWalRecord::Postgres {
|
||||
will_init: blk.will_init || blk.apply_image,
|
||||
rec: decoded.record.clone(),
|
||||
})
|
||||
};
|
||||
|
||||
let relative_off = record.batch.raw.len() as u64;
|
||||
|
||||
val.ser_into(&mut record.batch.raw)
|
||||
.expect("Writing into in-memory buffer is infallible");
|
||||
|
||||
let val_ser_size = record.batch.raw.len() - relative_off as usize;
|
||||
|
||||
record
|
||||
.batch
|
||||
.metadata
|
||||
.push(ValueMeta::Serialized(SerializedValueMeta {
|
||||
key: key.to_compact(),
|
||||
lsn: next_record_lsn,
|
||||
}))
|
||||
}
|
||||
|
||||
continue;
|
||||
batch_offset: relative_off,
|
||||
len: val_ser_size,
|
||||
will_init: val.will_init(),
|
||||
}));
|
||||
record.batch.max_lsn = std::cmp::max(record.batch.max_lsn, next_record_lsn);
|
||||
record.batch.len += 1;
|
||||
}
|
||||
|
||||
// Instead of storing full-page-image WAL record,
|
||||
// it is better to store extracted image: we can skip wal-redo
|
||||
// in this case. Also some FPI records may contain multiple (up to 32) pages,
|
||||
// so they have to be copied multiple times.
|
||||
//
|
||||
let val = if Self::block_is_image(&decoded, blk, pg_version) {
|
||||
// Extract page image from FPI record
|
||||
let img_len = blk.bimg_len as usize;
|
||||
let img_offs = blk.bimg_offset as usize;
|
||||
let mut image = BytesMut::with_capacity(BLCKSZ as usize);
|
||||
// TODO(vlad): skip the copy
|
||||
image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]);
|
||||
|
||||
if blk.hole_length != 0 {
|
||||
let tail = image.split_off(blk.hole_offset as usize);
|
||||
image.resize(image.len() + blk.hole_length as usize, 0u8);
|
||||
image.unsplit(tail);
|
||||
}
|
||||
//
|
||||
// Match the logic of XLogReadBufferForRedoExtended:
|
||||
// The page may be uninitialized. If so, we can't set the LSN because
|
||||
// that would corrupt the page.
|
||||
//
|
||||
if !page_is_new(&image) {
|
||||
page_set_lsn(&mut image, next_record_lsn)
|
||||
}
|
||||
assert_eq!(image.len(), BLCKSZ as usize);
|
||||
|
||||
Value::Image(image.freeze())
|
||||
} else {
|
||||
Value::WalRecord(NeonWalRecord::Postgres {
|
||||
will_init: blk.will_init || blk.apply_image,
|
||||
rec: decoded.record.clone(),
|
||||
})
|
||||
};
|
||||
|
||||
val.ser_into(&mut buf)
|
||||
.expect("Writing into in-memory buffer is infallible");
|
||||
|
||||
let val_ser_size = buf.len() - relative_off as usize;
|
||||
|
||||
metadata.push(ValueMeta::Serialized(SerializedValueMeta {
|
||||
key: key.to_compact(),
|
||||
lsn: next_record_lsn,
|
||||
batch_offset: relative_off,
|
||||
len: val_ser_size,
|
||||
will_init: val.will_init(),
|
||||
}));
|
||||
max_lsn = std::cmp::max(max_lsn, next_record_lsn);
|
||||
len += 1;
|
||||
}
|
||||
|
||||
if cfg!(any(debug_assertions, test)) {
|
||||
let batch = Self {
|
||||
raw: buf,
|
||||
metadata,
|
||||
max_lsn,
|
||||
len,
|
||||
};
|
||||
|
||||
batch.validate_lsn_order();
|
||||
|
||||
return Ok(batch);
|
||||
// Validate that the batches are correct
|
||||
for record in shard_records.values() {
|
||||
record.batch.validate_lsn_order();
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
raw: buf,
|
||||
metadata,
|
||||
max_lsn,
|
||||
len,
|
||||
})
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Look into the decoded PG WAL record and determine
|
||||
|
||||
@@ -215,6 +215,7 @@ impl Wrapper {
|
||||
syncSafekeepers: config.sync_safekeepers,
|
||||
systemId: 0,
|
||||
pgTimeline: 1,
|
||||
proto_version: 2,
|
||||
callback_data,
|
||||
};
|
||||
let c_config = Box::into_raw(Box::new(c_config));
|
||||
|
||||
@@ -44,6 +44,7 @@ postgres_backend.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
postgres-types.workspace = true
|
||||
postgres_initdb.workspace = true
|
||||
pprof.workspace = true
|
||||
rand.workspace = true
|
||||
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
|
||||
regex.workspace = true
|
||||
@@ -108,3 +109,7 @@ harness = false
|
||||
[[bench]]
|
||||
name = "bench_ingest"
|
||||
harness = false
|
||||
|
||||
[[bench]]
|
||||
name = "upload_queue"
|
||||
harness = false
|
||||
|
||||
87
pageserver/benches/upload_queue.rs
Normal file
87
pageserver/benches/upload_queue.rs
Normal file
@@ -0,0 +1,87 @@
|
||||
//! Upload queue benchmarks.
|
||||
|
||||
use std::str::FromStr as _;
|
||||
use std::sync::atomic::AtomicU32;
|
||||
use std::sync::Arc;
|
||||
|
||||
use criterion::{criterion_group, criterion_main, Bencher, Criterion};
|
||||
use pageserver::tenant::metadata::TimelineMetadata;
|
||||
use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use pageserver::tenant::storage_layer::LayerName;
|
||||
use pageserver::tenant::upload_queue::{Delete, UploadOp, UploadQueue, UploadTask};
|
||||
use pageserver::tenant::IndexPart;
|
||||
use pprof::criterion::{Output, PProfProfiler};
|
||||
use utils::generation::Generation;
|
||||
use utils::shard::{ShardCount, ShardIndex, ShardNumber};
|
||||
|
||||
// Register benchmarks with Criterion.
|
||||
criterion_group!(
|
||||
name = benches;
|
||||
config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
|
||||
targets = bench_upload_queue_next_ready,
|
||||
);
|
||||
criterion_main!(benches);
|
||||
|
||||
/// Benchmarks the cost of UploadQueue::next_ready() with the given number of in-progress tasks
|
||||
/// (which is equivalent to tasks ahead of it in the queue). This has linear cost, and the upload
|
||||
/// queue as a whole is thus quadratic.
|
||||
///
|
||||
/// UploadOp::UploadLayer requires an entire tenant and timeline to construct, so we just test
|
||||
/// Delete and UploadMetadata instead. This is incidentally the most expensive case.
|
||||
fn bench_upload_queue_next_ready(c: &mut Criterion) {
|
||||
let mut g = c.benchmark_group("upload_queue_next_ready");
|
||||
for inprogress in [0, 1, 10, 100, 1_000, 10_000, 100_000, 1_000_000] {
|
||||
g.bench_function(format!("inprogress={inprogress}"), |b| {
|
||||
run_bench(b, inprogress).unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
fn run_bench(b: &mut Bencher, inprogress: usize) -> anyhow::Result<()> {
|
||||
// Construct two layers. layer0 is in the indexes, layer1 will be deleted.
|
||||
let layer0 = LayerName::from_str("000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51").expect("invalid name");
|
||||
let layer1 = LayerName::from_str("100000000000000000000000000000000001-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51").expect("invalid name");
|
||||
|
||||
let metadata = LayerFileMetadata {
|
||||
shard: ShardIndex::new(ShardNumber(1), ShardCount(2)),
|
||||
generation: Generation::Valid(1),
|
||||
file_size: 0,
|
||||
};
|
||||
|
||||
// Construct the (initial and uploaded) index with layer0.
|
||||
let mut index = IndexPart::empty(TimelineMetadata::example());
|
||||
index.layer_metadata.insert(layer0, metadata.clone());
|
||||
|
||||
// Construct the queue.
|
||||
let mut queue = UploadQueue::Uninitialized;
|
||||
let queue = queue.initialize_with_current_remote_index_part(&index, 0)?;
|
||||
|
||||
// Populate inprogress_tasks with a bunch of layer1 deletions.
|
||||
let delete = UploadOp::Delete(Delete {
|
||||
layers: vec![(layer1, metadata)],
|
||||
});
|
||||
|
||||
for task_id in 0..(inprogress as u64) {
|
||||
queue.inprogress_tasks.insert(
|
||||
task_id,
|
||||
Arc::new(UploadTask {
|
||||
task_id,
|
||||
retries: AtomicU32::new(0),
|
||||
op: delete.clone(),
|
||||
coalesced_ops: Vec::new(),
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
// Benchmark index upload scheduling.
|
||||
let index_upload = UploadOp::UploadMetadata {
|
||||
uploaded: Box::new(index),
|
||||
};
|
||||
|
||||
b.iter(|| {
|
||||
queue.queued_operations.push_front(index_upload.clone());
|
||||
assert!(queue.next_ready().is_some());
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -53,10 +53,12 @@ project_build_tag!(BUILD_TAG);
|
||||
#[global_allocator]
|
||||
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
|
||||
|
||||
/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20).
|
||||
/// Configure jemalloc to profile heap allocations by sampling stack traces every 2 MB (1 << 21).
|
||||
/// This adds roughly 3% overhead for allocations on average, which is acceptable considering
|
||||
/// performance-sensitive code will avoid allocations as far as possible anyway.
|
||||
#[allow(non_upper_case_globals)]
|
||||
#[export_name = "malloc_conf"]
|
||||
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0";
|
||||
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0";
|
||||
|
||||
const PID_FILE_NAME: &str = "pageserver.pid";
|
||||
|
||||
|
||||
@@ -278,6 +278,8 @@ async fn import_wal(
|
||||
|
||||
let mut walingest = WalIngest::new(tline, startpoint, ctx).await?;
|
||||
|
||||
let shard = vec![*tline.get_shard_identity()];
|
||||
|
||||
while last_lsn <= endpoint {
|
||||
// FIXME: assume postgresql tli 1 for now
|
||||
let filename = XLogFileName(1, segno, WAL_SEGMENT_SIZE);
|
||||
@@ -314,10 +316,12 @@ async fn import_wal(
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
let interpreted = InterpretedWalRecord::from_bytes_filtered(
|
||||
recdata,
|
||||
tline.get_shard_identity(),
|
||||
&shard,
|
||||
lsn,
|
||||
tline.pg_version,
|
||||
)?;
|
||||
)?
|
||||
.remove(tline.get_shard_identity())
|
||||
.unwrap();
|
||||
|
||||
walingest
|
||||
.ingest_record(interpreted, &mut modification, ctx)
|
||||
@@ -411,6 +415,7 @@ pub async fn import_wal_from_tar(
|
||||
let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE);
|
||||
let mut last_lsn = start_lsn;
|
||||
let mut walingest = WalIngest::new(tline, start_lsn, ctx).await?;
|
||||
let shard = vec![*tline.get_shard_identity()];
|
||||
|
||||
// Ingest wal until end_lsn
|
||||
info!("importing wal until {}", end_lsn);
|
||||
@@ -459,10 +464,12 @@ pub async fn import_wal_from_tar(
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
let interpreted = InterpretedWalRecord::from_bytes_filtered(
|
||||
recdata,
|
||||
tline.get_shard_identity(),
|
||||
&shard,
|
||||
lsn,
|
||||
tline.pg_version,
|
||||
)?;
|
||||
)?
|
||||
.remove(tline.get_shard_identity())
|
||||
.unwrap();
|
||||
|
||||
walingest
|
||||
.ingest_record(interpreted, &mut modification, ctx)
|
||||
|
||||
@@ -1224,117 +1224,189 @@ pub(crate) struct SmgrOpTimerInner {
|
||||
global_flush_in_progress_micros: IntCounter,
|
||||
per_timeline_flush_in_progress_micros: IntCounter,
|
||||
|
||||
throttling: Arc<tenant_throttling::Pagestream>,
|
||||
|
||||
timings: SmgrOpTimerState,
|
||||
}
|
||||
|
||||
/// The stages of request processing are represented by the enum variants.
|
||||
/// Used as part of [`SmgrOpTimerInner::timings`].
|
||||
///
|
||||
/// Request processing calls into the `SmgrOpTimer::observe_*` methods at the
|
||||
/// transition points.
|
||||
/// These methods bump relevant counters and then update [`SmgrOpTimerInner::timings`]
|
||||
/// to the next state.
|
||||
///
|
||||
/// Each request goes through every stage, in all configurations.
|
||||
///
|
||||
#[derive(Debug)]
|
||||
enum SmgrOpTimerState {
|
||||
Received {
|
||||
// In the future, we may want to track the full time the request spent
|
||||
// inside pageserver process (time spent in kernel buffers can't be tracked).
|
||||
// `received_at` would be used for that.
|
||||
#[allow(dead_code)]
|
||||
received_at: Instant,
|
||||
},
|
||||
ThrottleDoneExecutionStarting {
|
||||
received_at: Instant,
|
||||
Throttling {
|
||||
throttle_started_at: Instant,
|
||||
started_execution_at: Instant,
|
||||
},
|
||||
Batching {
|
||||
throttle_done_at: Instant,
|
||||
},
|
||||
Executing {
|
||||
execution_started_at: Instant,
|
||||
},
|
||||
Flushing,
|
||||
// NB: when adding observation points, remember to update the Drop impl.
|
||||
}
|
||||
|
||||
// NB: when adding observation points, remember to update the Drop impl.
|
||||
impl SmgrOpTimer {
|
||||
/// See [`SmgrOpTimerState`] for more context.
|
||||
pub(crate) fn observe_throttle_start(&mut self, at: Instant) {
|
||||
let Some(inner) = self.0.as_mut() else {
|
||||
return;
|
||||
};
|
||||
let SmgrOpTimerState::Received { received_at: _ } = &mut inner.timings else {
|
||||
return;
|
||||
};
|
||||
inner.throttling.count_accounted_start.inc();
|
||||
inner.timings = SmgrOpTimerState::Throttling {
|
||||
throttle_started_at: at,
|
||||
};
|
||||
}
|
||||
|
||||
/// See [`SmgrOpTimerState`] for more context.
|
||||
pub(crate) fn observe_throttle_done(&mut self, throttle: ThrottleResult) {
|
||||
let Some(inner) = self.0.as_mut() else {
|
||||
return;
|
||||
};
|
||||
let SmgrOpTimerState::Throttling {
|
||||
throttle_started_at,
|
||||
} = &inner.timings
|
||||
else {
|
||||
return;
|
||||
};
|
||||
inner.throttling.count_accounted_finish.inc();
|
||||
match throttle {
|
||||
ThrottleResult::NotThrottled { end } => {
|
||||
inner.timings = SmgrOpTimerState::Batching {
|
||||
throttle_done_at: end,
|
||||
};
|
||||
}
|
||||
ThrottleResult::Throttled { end } => {
|
||||
// update metrics
|
||||
inner.throttling.count_throttled.inc();
|
||||
inner
|
||||
.throttling
|
||||
.wait_time
|
||||
.inc_by((end - *throttle_started_at).as_micros().try_into().unwrap());
|
||||
// state transition
|
||||
inner.timings = SmgrOpTimerState::Batching {
|
||||
throttle_done_at: end,
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// See [`SmgrOpTimerState`] for more context.
|
||||
pub(crate) fn observe_execution_start(&mut self, at: Instant) {
|
||||
let Some(inner) = self.0.as_mut() else {
|
||||
return;
|
||||
};
|
||||
let SmgrOpTimerState::Batching { throttle_done_at } = &inner.timings else {
|
||||
return;
|
||||
};
|
||||
// update metrics
|
||||
let batch = at - *throttle_done_at;
|
||||
inner.global_batch_wait_time.observe(batch.as_secs_f64());
|
||||
inner
|
||||
.per_timeline_batch_wait_time
|
||||
.observe(batch.as_secs_f64());
|
||||
// state transition
|
||||
inner.timings = SmgrOpTimerState::Executing {
|
||||
execution_started_at: at,
|
||||
}
|
||||
}
|
||||
|
||||
/// For all but the first caller, this is a no-op.
|
||||
/// The first caller receives Some, subsequent ones None.
|
||||
///
|
||||
/// See [`SmgrOpTimerState`] for more context.
|
||||
pub(crate) fn observe_execution_end_flush_start(
|
||||
&mut self,
|
||||
at: Instant,
|
||||
) -> Option<SmgrOpFlushInProgress> {
|
||||
// NB: unlike the other observe_* methods, this one take()s.
|
||||
#[allow(clippy::question_mark)] // maintain similar code pattern.
|
||||
let Some(mut inner) = self.0.take() else {
|
||||
return None;
|
||||
};
|
||||
let SmgrOpTimerState::Executing {
|
||||
execution_started_at,
|
||||
} = &inner.timings
|
||||
else {
|
||||
return None;
|
||||
};
|
||||
// update metrics
|
||||
let execution = at - *execution_started_at;
|
||||
inner
|
||||
.global_execution_latency_histo
|
||||
.observe(execution.as_secs_f64());
|
||||
if let Some(per_timeline_execution_latency_histo) =
|
||||
&inner.per_timeline_execution_latency_histo
|
||||
{
|
||||
per_timeline_execution_latency_histo.observe(execution.as_secs_f64());
|
||||
}
|
||||
|
||||
// state transition
|
||||
inner.timings = SmgrOpTimerState::Flushing;
|
||||
|
||||
// return the flush in progress object which
|
||||
// will do the remaining metrics updates
|
||||
let SmgrOpTimerInner {
|
||||
global_flush_in_progress_micros,
|
||||
per_timeline_flush_in_progress_micros,
|
||||
..
|
||||
} = inner;
|
||||
Some(SmgrOpFlushInProgress {
|
||||
flush_started_at: at,
|
||||
global_micros: global_flush_in_progress_micros,
|
||||
per_timeline_micros: per_timeline_flush_in_progress_micros,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// The last stage of request processing is serializing and flushing the request
|
||||
/// into the TCP connection. We want to make slow flushes observable
|
||||
/// _while they are occurring_, so this struct provides a wrapper method [`Self::measure`]
|
||||
/// to periodically bump the metric.
|
||||
///
|
||||
/// If in the future we decide that we're not interested in live updates, we can
|
||||
/// add another `observe_*` method to [`SmgrOpTimer`], follow the existing pattern there,
|
||||
/// and remove this struct from the code base.
|
||||
pub(crate) struct SmgrOpFlushInProgress {
|
||||
flush_started_at: Instant,
|
||||
global_micros: IntCounter,
|
||||
per_timeline_micros: IntCounter,
|
||||
}
|
||||
|
||||
impl SmgrOpTimer {
|
||||
pub(crate) fn observe_throttle_done_execution_starting(&mut self, throttle: &ThrottleResult) {
|
||||
let inner = self.0.as_mut().expect("other public methods consume self");
|
||||
match (&mut inner.timings, throttle) {
|
||||
(SmgrOpTimerState::Received { received_at }, throttle) => match throttle {
|
||||
ThrottleResult::NotThrottled { start } => {
|
||||
inner.timings = SmgrOpTimerState::ThrottleDoneExecutionStarting {
|
||||
received_at: *received_at,
|
||||
throttle_started_at: *start,
|
||||
started_execution_at: *start,
|
||||
};
|
||||
}
|
||||
ThrottleResult::Throttled { start, end } => {
|
||||
inner.timings = SmgrOpTimerState::ThrottleDoneExecutionStarting {
|
||||
received_at: *start,
|
||||
throttle_started_at: *start,
|
||||
started_execution_at: *end,
|
||||
};
|
||||
}
|
||||
},
|
||||
(x, _) => panic!("called in unexpected state: {x:?}"),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn observe_smgr_op_completion_and_start_flushing(mut self) -> SmgrOpFlushInProgress {
|
||||
let (flush_start, inner) = self
|
||||
.smgr_op_end()
|
||||
.expect("this method consume self, and the only other caller is drop handler");
|
||||
let SmgrOpTimerInner {
|
||||
global_flush_in_progress_micros,
|
||||
per_timeline_flush_in_progress_micros,
|
||||
..
|
||||
} = inner;
|
||||
SmgrOpFlushInProgress {
|
||||
flush_started_at: flush_start,
|
||||
global_micros: global_flush_in_progress_micros,
|
||||
per_timeline_micros: per_timeline_flush_in_progress_micros,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `None` if this method has already been called, `Some` otherwise.
|
||||
fn smgr_op_end(&mut self) -> Option<(Instant, SmgrOpTimerInner)> {
|
||||
let inner = self.0.take()?;
|
||||
|
||||
let now = Instant::now();
|
||||
|
||||
let batch;
|
||||
let execution;
|
||||
let throttle;
|
||||
match inner.timings {
|
||||
SmgrOpTimerState::Received { received_at } => {
|
||||
batch = (now - received_at).as_secs_f64();
|
||||
// TODO: use label for dropped requests.
|
||||
// This is quite rare in practice, only during tenant/pageservers shutdown.
|
||||
throttle = Duration::ZERO;
|
||||
execution = Duration::ZERO.as_secs_f64();
|
||||
}
|
||||
SmgrOpTimerState::ThrottleDoneExecutionStarting {
|
||||
received_at,
|
||||
throttle_started_at,
|
||||
started_execution_at,
|
||||
} => {
|
||||
batch = (throttle_started_at - received_at).as_secs_f64();
|
||||
throttle = started_execution_at - throttle_started_at;
|
||||
execution = (now - started_execution_at).as_secs_f64();
|
||||
}
|
||||
}
|
||||
|
||||
// update time spent in batching
|
||||
inner.global_batch_wait_time.observe(batch);
|
||||
inner.per_timeline_batch_wait_time.observe(batch);
|
||||
|
||||
// time spent in throttle metric is updated by throttle impl
|
||||
let _ = throttle;
|
||||
|
||||
// update metrics for execution latency
|
||||
inner.global_execution_latency_histo.observe(execution);
|
||||
if let Some(per_timeline_execution_latency_histo) =
|
||||
&inner.per_timeline_execution_latency_histo
|
||||
{
|
||||
per_timeline_execution_latency_histo.observe(execution);
|
||||
}
|
||||
|
||||
Some((now, inner))
|
||||
}
|
||||
}
|
||||
|
||||
/// Finalizes the timer's metrics when it is dropped.
impl Drop for SmgrOpTimer {
|
||||
fn drop(&mut self) {
|
||||
self.smgr_op_end();
|
||||
// In case of early drop, update any of the remaining metrics with
|
||||
// observations so that (started,finished) counter pairs balance out
|
||||
// and all counters on the latency path have the same number of
|
||||
// observations.
|
||||
// It's technically lying and it would be better if each metric had
|
||||
// a separate label or similar for cancelled requests.
|
||||
// But we don't have that right now and counter pairs balancing
|
||||
// out is useful when using the metrics in panels and whatnot.
|
||||
let now = Instant::now();
|
||||
self.observe_throttle_start(now);
|
||||
self.observe_throttle_done(ThrottleResult::NotThrottled { end: now });
|
||||
self.observe_execution_start(now);
|
||||
self.observe_execution_end_flush_start(now);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1345,12 +1417,12 @@ impl SmgrOpFlushInProgress {
|
||||
{
|
||||
let mut fut = std::pin::pin!(fut);
|
||||
|
||||
let now = Instant::now();
|
||||
// Whenever observe_guard gets called, or dropped,
|
||||
// it adds the time elapsed since its last call to metrics.
|
||||
// Last call is tracked in `now`.
|
||||
let mut observe_guard = scopeguard::guard(
|
||||
|| {
|
||||
let now = Instant::now();
|
||||
let elapsed = now - self.flush_started_at;
|
||||
self.global_micros
|
||||
.inc_by(u64::try_from(elapsed.as_micros()).unwrap());
|
||||
@@ -1393,7 +1465,6 @@ pub enum SmgrQueryType {
|
||||
GetSlruSegment,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct SmgrQueryTimePerTimeline {
|
||||
global_started: [IntCounter; SmgrQueryType::COUNT],
|
||||
global_latency: [Histogram; SmgrQueryType::COUNT],
|
||||
@@ -1405,6 +1476,7 @@ pub(crate) struct SmgrQueryTimePerTimeline {
|
||||
per_timeline_flush_in_progress_micros: IntCounter,
|
||||
global_batch_wait_time: Histogram,
|
||||
per_timeline_batch_wait_time: Histogram,
|
||||
throttling: Arc<tenant_throttling::Pagestream>,
|
||||
}
|
||||
|
||||
static SMGR_QUERY_STARTED_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
@@ -1610,7 +1682,11 @@ static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL: Lazy<Histogram> = Lazy::new(||
|
||||
});
|
||||
|
||||
impl SmgrQueryTimePerTimeline {
|
||||
pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
|
||||
pub(crate) fn new(
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
pagestream_throttle_metrics: Arc<tenant_throttling::Pagestream>,
|
||||
) -> Self {
|
||||
let tenant_id = tenant_shard_id.tenant_id.to_string();
|
||||
let shard_slug = format!("{}", tenant_shard_id.shard_slug());
|
||||
let timeline_id = timeline_id.to_string();
|
||||
@@ -1671,6 +1747,7 @@ impl SmgrQueryTimePerTimeline {
|
||||
per_timeline_flush_in_progress_micros,
|
||||
global_batch_wait_time,
|
||||
per_timeline_batch_wait_time,
|
||||
throttling: pagestream_throttle_metrics,
|
||||
}
|
||||
}
|
||||
pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, received_at: Instant) -> SmgrOpTimer {
|
||||
@@ -1686,88 +1763,24 @@ impl SmgrQueryTimePerTimeline {
|
||||
SmgrOpTimer(Some(SmgrOpTimerInner {
|
||||
global_execution_latency_histo: self.global_latency[op as usize].clone(),
|
||||
per_timeline_execution_latency_histo: per_timeline_latency_histo,
|
||||
timings: SmgrOpTimerState::Received { received_at },
|
||||
global_flush_in_progress_micros: self.global_flush_in_progress_micros.clone(),
|
||||
per_timeline_flush_in_progress_micros: self
|
||||
.per_timeline_flush_in_progress_micros
|
||||
.clone(),
|
||||
global_batch_wait_time: self.global_batch_wait_time.clone(),
|
||||
per_timeline_batch_wait_time: self.per_timeline_batch_wait_time.clone(),
|
||||
throttling: self.throttling.clone(),
|
||||
timings: SmgrOpTimerState::Received { received_at },
|
||||
}))
|
||||
}
|
||||
|
||||
/// Records `batch_size` in both the global and the per-timeline batch-size metric.
///
/// TODO: do something about this? seems odd, we have a similar call on SmgrOpTimer
|
||||
pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) {
|
||||
// The metrics take `f64` observations, hence the casts.
self.global_batch_size.observe(batch_size as f64);
|
||||
self.per_timeline_batch_size.observe(batch_size as f64);
|
||||
}
|
||||
}
|
||||
|
||||
// Unit tests for the smgr query metrics: op label names and sample counting on timer drop.
#[cfg(test)]
|
||||
mod smgr_query_time_tests {
|
||||
use std::time::Instant;
|
||||
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use strum::IntoEnumIterator;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
// Regression test, we used hard-coded string constants before using an enum.
|
||||
#[test]
|
||||
fn op_label_name() {
|
||||
use super::SmgrQueryType::*;
|
||||
// Expected string form of each query type.
let expect: [(super::SmgrQueryType, &'static str); 5] = [
|
||||
(GetRelExists, "get_rel_exists"),
|
||||
(GetRelSize, "get_rel_size"),
|
||||
(GetPageAtLsn, "get_page_at_lsn"),
|
||||
(GetDbSize, "get_db_size"),
|
||||
(GetSlruSegment, "get_slru_segment"),
|
||||
];
|
||||
for (op, expect) in expect {
|
||||
let actual: &'static str = op.into();
|
||||
assert_eq!(actual, expect);
|
||||
}
|
||||
}
|
||||
|
||||
// Starting and immediately dropping a timer must add a sample to the global latency
// histograms, and to the per-timeline getpage histogram only for GetPageAtLsn.
#[test]
|
||||
fn basic() {
|
||||
let ops: Vec<_> = super::SmgrQueryType::iter().collect();
|
||||
|
||||
for op in &ops {
|
||||
// Fresh tenant/timeline ids per op, so the per-timeline count is asserted to start at 0.
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::generate();
|
||||
let metrics = super::SmgrQueryTimePerTimeline::new(
|
||||
&TenantShardId::unsharded(tenant_id),
|
||||
&timeline_id,
|
||||
);
|
||||
|
||||
// Returns (sum of global sample counts over all ops, per-timeline getpage sample count).
let get_counts = || {
|
||||
let global: u64 = ops
|
||||
.iter()
|
||||
.map(|op| metrics.global_latency[*op as usize].get_sample_count())
|
||||
.sum();
|
||||
(
|
||||
global,
|
||||
metrics.per_timeline_getpage_latency.get_sample_count(),
|
||||
)
|
||||
};
|
||||
|
||||
let (pre_global, pre_per_tenant_timeline) = get_counts();
|
||||
assert_eq!(pre_per_tenant_timeline, 0);
|
||||
|
||||
// Dropping the timer is what records the observation.
let timer = metrics.start_smgr_op(*op, Instant::now());
|
||||
drop(timer);
|
||||
|
||||
let (post_global, post_per_tenant_timeline) = get_counts();
|
||||
if matches!(op, super::SmgrQueryType::GetPageAtLsn) {
|
||||
// getpage ops are tracked per-timeline, others aren't
|
||||
assert_eq!(post_per_tenant_timeline, 1);
|
||||
} else {
|
||||
assert_eq!(post_per_tenant_timeline, 0);
|
||||
}
|
||||
// Only a strict increase is asserted for the global count, not an exact delta.
assert!(post_global > pre_global);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// keep in sync with control plane Go code so that we can validate
|
||||
// compute's basebackup_ms metric with our perspective in the context of SLI/SLO.
|
||||
static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
|
||||
@@ -3563,9 +3576,7 @@ pub(crate) mod tenant_throttling {
|
||||
use once_cell::sync::Lazy;
|
||||
use utils::shard::TenantShardId;
|
||||
|
||||
use crate::tenant::{self};
|
||||
|
||||
struct GlobalAndPerTenantIntCounter {
|
||||
pub(crate) struct GlobalAndPerTenantIntCounter {
|
||||
global: IntCounter,
|
||||
per_tenant: IntCounter,
|
||||
}
|
||||
@@ -3583,10 +3594,10 @@ pub(crate) mod tenant_throttling {
|
||||
}
|
||||
|
||||
pub(crate) struct Metrics<const KIND: usize> {
|
||||
count_accounted_start: GlobalAndPerTenantIntCounter,
|
||||
count_accounted_finish: GlobalAndPerTenantIntCounter,
|
||||
wait_time: GlobalAndPerTenantIntCounter,
|
||||
count_throttled: GlobalAndPerTenantIntCounter,
|
||||
pub(super) count_accounted_start: GlobalAndPerTenantIntCounter,
|
||||
pub(super) count_accounted_finish: GlobalAndPerTenantIntCounter,
|
||||
pub(super) wait_time: GlobalAndPerTenantIntCounter,
|
||||
pub(super) count_throttled: GlobalAndPerTenantIntCounter,
|
||||
}
|
||||
|
||||
static COUNT_ACCOUNTED_START: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
|
||||
@@ -3721,26 +3732,6 @@ pub(crate) mod tenant_throttling {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Implements the throttle's `Metric` callbacks by bumping the matching counters on `Metrics`.
impl<const KIND: usize> tenant::throttle::Metric for Metrics<KIND> {
|
||||
/// Increments the accounted-start counter.
#[inline(always)]
|
||||
fn accounting_start(&self) {
|
||||
self.count_accounted_start.inc();
|
||||
}
|
||||
/// Increments the accounted-finish counter.
#[inline(always)]
|
||||
fn accounting_finish(&self) {
|
||||
self.count_accounted_finish.inc();
|
||||
}
|
||||
/// Adds the observed wait time, in microseconds, to the wait-time counter and
/// increments the throttled counter.
///
/// Panics if the wait time in microseconds does not fit in a `u64`.
#[inline(always)]
|
||||
fn observe_throttling(
|
||||
&self,
|
||||
tenant::throttle::Observation { wait_time }: &tenant::throttle::Observation,
|
||||
) {
|
||||
// `as_micros()` yields a u128; the counter takes a u64.
let val = u64::try_from(wait_time.as_micros()).unwrap();
|
||||
self.wait_time.inc_by(val);
|
||||
self.count_throttled.inc();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) mod disk_usage_based_eviction {
|
||||
|
||||
@@ -592,43 +592,21 @@ enum BatchedFeMessage {
|
||||
}
|
||||
|
||||
impl BatchedFeMessage {
|
||||
async fn throttle_and_record_start_processing(
|
||||
&mut self,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), QueryError> {
|
||||
let (shard, tokens, timers) = match self {
|
||||
BatchedFeMessage::Exists { shard, timer, .. }
|
||||
| BatchedFeMessage::Nblocks { shard, timer, .. }
|
||||
| BatchedFeMessage::DbSize { shard, timer, .. }
|
||||
| BatchedFeMessage::GetSlruSegment { shard, timer, .. } => {
|
||||
(
|
||||
shard,
|
||||
// 1 token is probably under-estimating because these
|
||||
// request handlers typically do several Timeline::get calls.
|
||||
1,
|
||||
itertools::Either::Left(std::iter::once(timer)),
|
||||
)
|
||||
fn observe_execution_start(&mut self, at: Instant) {
|
||||
match self {
|
||||
BatchedFeMessage::Exists { timer, .. }
|
||||
| BatchedFeMessage::Nblocks { timer, .. }
|
||||
| BatchedFeMessage::DbSize { timer, .. }
|
||||
| BatchedFeMessage::GetSlruSegment { timer, .. } => {
|
||||
timer.observe_execution_start(at);
|
||||
}
|
||||
BatchedFeMessage::GetPage { shard, pages, .. } => (
|
||||
shard,
|
||||
pages.len(),
|
||||
itertools::Either::Right(pages.iter_mut().map(|p| &mut p.timer)),
|
||||
),
|
||||
BatchedFeMessage::RespondError { .. } => return Ok(()),
|
||||
};
|
||||
let throttled = tokio::select! {
|
||||
throttled = shard.pagestream_throttle.throttle(tokens) => { throttled }
|
||||
_ = shard.cancel.cancelled() => {
|
||||
return Err(QueryError::Shutdown);
|
||||
BatchedFeMessage::GetPage { pages, .. } => {
|
||||
for page in pages {
|
||||
page.timer.observe_execution_start(at);
|
||||
}
|
||||
}
|
||||
_ = cancel.cancelled() => {
|
||||
return Err(QueryError::Shutdown);
|
||||
}
|
||||
};
|
||||
for timer in timers {
|
||||
timer.observe_throttle_done_execution_starting(&throttled);
|
||||
BatchedFeMessage::RespondError { .. } => {}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -720,6 +698,26 @@ impl PageServerHandler {
|
||||
let neon_fe_msg =
|
||||
PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?;
|
||||
|
||||
// Starts the smgr op timer for `op`, then waits on the shard's pagestream throttle.
//
// Returns the timer once the throttle call completes, or `QueryError::Shutdown`
// if the shard's cancellation fires first.
//
// TODO: turn into async closure once available to avoid repeating received_at
|
||||
async fn record_op_start_and_throttle(
|
||||
shard: &timeline::handle::Handle<TenantManagerTypes>,
|
||||
op: metrics::SmgrQueryType,
|
||||
received_at: Instant,
|
||||
) -> Result<SmgrOpTimer, QueryError> {
|
||||
// It's important to start the smgr op metric recorder as early as possible
|
||||
// so that the _started counters are incremented before we do
|
||||
// any serious waiting, e.g., for throttle, batching, or actual request handling.
|
||||
let mut timer = shard.query_metrics.start_smgr_op(op, received_at);
|
||||
// The same instant is reported to the timer and passed to the throttle.
let now = Instant::now();
|
||||
timer.observe_throttle_start(now);
|
||||
// Race the throttle against shard cancellation.
let throttled = tokio::select! {
|
||||
res = shard.pagestream_throttle.throttle(1, now) => res,
|
||||
_ = shard.cancel.cancelled() => return Err(QueryError::Shutdown),
|
||||
};
|
||||
timer.observe_throttle_done(throttled);
|
||||
Ok(timer)
|
||||
}
|
||||
|
||||
let batched_msg = match neon_fe_msg {
|
||||
PagestreamFeMessage::Exists(req) => {
|
||||
let span = tracing::info_span!(parent: parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn);
|
||||
@@ -727,9 +725,12 @@ impl PageServerHandler {
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.instrument(span.clone()) // sets `shard_id` field
|
||||
.await?;
|
||||
let timer = shard
|
||||
.query_metrics
|
||||
.start_smgr_op(metrics::SmgrQueryType::GetRelExists, received_at);
|
||||
let timer = record_op_start_and_throttle(
|
||||
&shard,
|
||||
metrics::SmgrQueryType::GetRelExists,
|
||||
received_at,
|
||||
)
|
||||
.await?;
|
||||
BatchedFeMessage::Exists {
|
||||
span,
|
||||
timer,
|
||||
@@ -743,9 +744,12 @@ impl PageServerHandler {
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.instrument(span.clone()) // sets `shard_id` field
|
||||
.await?;
|
||||
let timer = shard
|
||||
.query_metrics
|
||||
.start_smgr_op(metrics::SmgrQueryType::GetRelSize, received_at);
|
||||
let timer = record_op_start_and_throttle(
|
||||
&shard,
|
||||
metrics::SmgrQueryType::GetRelSize,
|
||||
received_at,
|
||||
)
|
||||
.await?;
|
||||
BatchedFeMessage::Nblocks {
|
||||
span,
|
||||
timer,
|
||||
@@ -759,9 +763,12 @@ impl PageServerHandler {
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.instrument(span.clone()) // sets `shard_id` field
|
||||
.await?;
|
||||
let timer = shard
|
||||
.query_metrics
|
||||
.start_smgr_op(metrics::SmgrQueryType::GetDbSize, received_at);
|
||||
let timer = record_op_start_and_throttle(
|
||||
&shard,
|
||||
metrics::SmgrQueryType::GetDbSize,
|
||||
received_at,
|
||||
)
|
||||
.await?;
|
||||
BatchedFeMessage::DbSize {
|
||||
span,
|
||||
timer,
|
||||
@@ -775,9 +782,12 @@ impl PageServerHandler {
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.instrument(span.clone()) // sets `shard_id` field
|
||||
.await?;
|
||||
let timer = shard
|
||||
.query_metrics
|
||||
.start_smgr_op(metrics::SmgrQueryType::GetSlruSegment, received_at);
|
||||
let timer = record_op_start_and_throttle(
|
||||
&shard,
|
||||
metrics::SmgrQueryType::GetSlruSegment,
|
||||
received_at,
|
||||
)
|
||||
.await?;
|
||||
BatchedFeMessage::GetSlruSegment {
|
||||
span,
|
||||
timer,
|
||||
@@ -826,12 +836,12 @@ impl PageServerHandler {
|
||||
}
|
||||
};
|
||||
|
||||
// It's important to start the timer before waiting for the LSN
|
||||
// so that the _started counters are incremented before we do
|
||||
// any serious waiting, e.g., for LSNs.
|
||||
let timer = shard
|
||||
.query_metrics
|
||||
.start_smgr_op(metrics::SmgrQueryType::GetPageAtLsn, received_at);
|
||||
let timer = record_op_start_and_throttle(
|
||||
&shard,
|
||||
metrics::SmgrQueryType::GetPageAtLsn,
|
||||
received_at,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let effective_request_lsn = match Self::wait_or_get_last_lsn(
|
||||
&shard,
|
||||
@@ -937,6 +947,13 @@ impl PageServerHandler {
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
let started_at = Instant::now();
|
||||
let batch = {
|
||||
let mut batch = batch;
|
||||
batch.observe_execution_start(started_at);
|
||||
batch
|
||||
};
|
||||
|
||||
// invoke handler function
|
||||
let (handler_results, span): (
|
||||
Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>>,
|
||||
@@ -1103,8 +1120,11 @@ impl PageServerHandler {
|
||||
// The timer's underlying metric is used for a storage-internal latency SLO and
|
||||
// we don't want to include latency in it that we can't control.
|
||||
// And as pointed out above, in this case, we don't control the time that flush will take.
|
||||
let flushing_timer =
|
||||
timer.map(|timer| timer.observe_smgr_op_completion_and_start_flushing());
|
||||
let flushing_timer = timer.map(|mut timer| {
|
||||
timer
|
||||
.observe_execution_end_flush_start(Instant::now())
|
||||
.expect("we are the first caller")
|
||||
});
|
||||
|
||||
// what we want to do
|
||||
let flush_fut = pgb_writer.flush();
|
||||
@@ -1258,7 +1278,7 @@ impl PageServerHandler {
|
||||
Ok(msg) => msg,
|
||||
Err(e) => break e,
|
||||
};
|
||||
let mut msg = match msg {
|
||||
let msg = match msg {
|
||||
Some(msg) => msg,
|
||||
None => {
|
||||
debug!("pagestream subprotocol end observed");
|
||||
@@ -1266,10 +1286,6 @@ impl PageServerHandler {
|
||||
}
|
||||
};
|
||||
|
||||
if let Err(cancelled) = msg.throttle_and_record_start_processing(&self.cancel).await {
|
||||
break cancelled;
|
||||
}
|
||||
|
||||
let err = self
|
||||
.pagesteam_handle_batched_message(pgb_writer, msg, &cancel, protocol_version, ctx)
|
||||
.await;
|
||||
@@ -1429,15 +1445,12 @@ impl PageServerHandler {
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
let mut batch = match batch {
|
||||
let batch = match batch {
|
||||
Ok(batch) => batch,
|
||||
Err(e) => {
|
||||
return Err(e);
|
||||
}
|
||||
};
|
||||
batch
|
||||
.throttle_and_record_start_processing(&self.cancel)
|
||||
.await?;
|
||||
self.pagesteam_handle_batched_message(
|
||||
pgb_writer,
|
||||
batch,
|
||||
|
||||
@@ -365,8 +365,9 @@ pub struct Tenant {
|
||||
|
||||
/// Throttle applied at the top of [`Timeline::get`].
|
||||
/// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance.
|
||||
pub(crate) pagestream_throttle:
|
||||
Arc<throttle::Throttle<crate::metrics::tenant_throttling::Pagestream>>,
|
||||
pub(crate) pagestream_throttle: Arc<throttle::Throttle>,
|
||||
|
||||
pub(crate) pagestream_throttle_metrics: Arc<crate::metrics::tenant_throttling::Pagestream>,
|
||||
|
||||
/// An ongoing timeline detach concurrency limiter.
|
||||
///
|
||||
@@ -1687,6 +1688,7 @@ impl Tenant {
|
||||
TimelineResources {
|
||||
remote_client,
|
||||
pagestream_throttle: self.pagestream_throttle.clone(),
|
||||
pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(),
|
||||
l0_flush_global_state: self.l0_flush_global_state.clone(),
|
||||
},
|
||||
LoadTimelineCause::Attach,
|
||||
@@ -3992,6 +3994,9 @@ impl Tenant {
|
||||
Ok(timeline)
|
||||
}
|
||||
|
||||
/// [`Tenant::shutdown`] must be called before dropping the returned [`Tenant`] object
|
||||
/// to ensure proper cleanup of background tasks and metrics.
|
||||
//
|
||||
// Allow too_many_arguments because a constructor's argument list naturally grows with the
|
||||
// number of attributes in the struct: breaking these out into a builder wouldn't be helpful.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
@@ -4100,8 +4105,10 @@ impl Tenant {
|
||||
gate: Gate::default(),
|
||||
pagestream_throttle: Arc::new(throttle::Throttle::new(
|
||||
Tenant::get_pagestream_throttle_config(conf, &attached_conf.tenant_conf),
|
||||
crate::metrics::tenant_throttling::Metrics::new(&tenant_shard_id),
|
||||
)),
|
||||
pagestream_throttle_metrics: Arc::new(
|
||||
crate::metrics::tenant_throttling::Pagestream::new(&tenant_shard_id),
|
||||
),
|
||||
tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
|
||||
ongoing_timeline_detach: std::sync::Mutex::default(),
|
||||
gc_block: Default::default(),
|
||||
@@ -5008,6 +5015,7 @@ impl Tenant {
|
||||
TimelineResources {
|
||||
remote_client: self.build_timeline_remote_client(timeline_id),
|
||||
pagestream_throttle: self.pagestream_throttle.clone(),
|
||||
pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(),
|
||||
l0_flush_global_state: self.l0_flush_global_state.clone(),
|
||||
}
|
||||
}
|
||||
@@ -5682,7 +5690,7 @@ mod tests {
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use hex_literal::hex;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
|
||||
use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
|
||||
use pageserver_api::value::Value;
|
||||
@@ -7741,7 +7749,18 @@ mod tests {
|
||||
let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||
let base_key_child = Key::from_hex("620000000033333333444444445500000001").unwrap();
|
||||
let base_key_nonexist = Key::from_hex("620000000033333333444444445500000002").unwrap();
|
||||
let base_key_overwrite = Key::from_hex("620000000033333333444444445500000003").unwrap();
|
||||
|
||||
let base_inherited_key = Key::from_hex("610000000033333333444444445500000000").unwrap();
|
||||
let base_inherited_key_child =
|
||||
Key::from_hex("610000000033333333444444445500000001").unwrap();
|
||||
let base_inherited_key_nonexist =
|
||||
Key::from_hex("610000000033333333444444445500000002").unwrap();
|
||||
let base_inherited_key_overwrite =
|
||||
Key::from_hex("610000000033333333444444445500000003").unwrap();
|
||||
|
||||
assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix...
|
||||
assert_eq!(base_inherited_key.field1, RELATION_SIZE_PREFIX);
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
@@ -7750,7 +7769,18 @@ mod tests {
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
Vec::new(), // delta layers
|
||||
vec![(Lsn(0x20), vec![(base_key, test_img("metadata key 1"))])], // image layers
|
||||
vec![(
|
||||
Lsn(0x20),
|
||||
vec![
|
||||
(base_inherited_key, test_img("metadata inherited key 1")),
|
||||
(
|
||||
base_inherited_key_overwrite,
|
||||
test_img("metadata key overwrite 1a"),
|
||||
),
|
||||
(base_key, test_img("metadata key 1")),
|
||||
(base_key_overwrite, test_img("metadata key overwrite 1b")),
|
||||
],
|
||||
)], // image layers
|
||||
Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN
|
||||
)
|
||||
.await?;
|
||||
@@ -7764,7 +7794,18 @@ mod tests {
|
||||
Vec::new(), // delta layers
|
||||
vec![(
|
||||
Lsn(0x30),
|
||||
vec![(base_key_child, test_img("metadata key 2"))],
|
||||
vec![
|
||||
(
|
||||
base_inherited_key_child,
|
||||
test_img("metadata inherited key 2"),
|
||||
),
|
||||
(
|
||||
base_inherited_key_overwrite,
|
||||
test_img("metadata key overwrite 2a"),
|
||||
),
|
||||
(base_key_child, test_img("metadata key 2")),
|
||||
(base_key_overwrite, test_img("metadata key overwrite 2b")),
|
||||
],
|
||||
)], // image layers
|
||||
Lsn(0x30),
|
||||
)
|
||||
@@ -7786,6 +7827,26 @@ mod tests {
|
||||
get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx).await?,
|
||||
None
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&tline, base_key_overwrite, lsn, &ctx).await?,
|
||||
Some(test_img("metadata key overwrite 1b"))
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&tline, base_inherited_key, lsn, &ctx).await?,
|
||||
Some(test_img("metadata inherited key 1"))
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&tline, base_inherited_key_child, lsn, &ctx).await?,
|
||||
None
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&tline, base_inherited_key_nonexist, lsn, &ctx).await?,
|
||||
None
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&tline, base_inherited_key_overwrite, lsn, &ctx).await?,
|
||||
Some(test_img("metadata key overwrite 1a"))
|
||||
);
|
||||
|
||||
// test vectored get on child timeline
|
||||
assert_eq!(
|
||||
@@ -7800,6 +7861,82 @@ mod tests {
|
||||
get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx).await?,
|
||||
None
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&child, base_inherited_key, lsn, &ctx).await?,
|
||||
Some(test_img("metadata inherited key 1"))
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&child, base_inherited_key_child, lsn, &ctx).await?,
|
||||
Some(test_img("metadata inherited key 2"))
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&child, base_inherited_key_nonexist, lsn, &ctx).await?,
|
||||
None
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&child, base_key_overwrite, lsn, &ctx).await?,
|
||||
Some(test_img("metadata key overwrite 2b"))
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&child, base_inherited_key_overwrite, lsn, &ctx).await?,
|
||||
Some(test_img("metadata key overwrite 2a"))
|
||||
);
|
||||
|
||||
// test vectored scan on parent timeline
|
||||
let mut reconstruct_state = ValuesReconstructState::new();
|
||||
let res = tline
|
||||
.get_vectored_impl(
|
||||
KeySpace::single(Key::metadata_key_range()),
|
||||
lsn,
|
||||
&mut reconstruct_state,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
assert_eq!(
|
||||
res.into_iter()
|
||||
.map(|(k, v)| (k, v.unwrap()))
|
||||
.collect::<Vec<_>>(),
|
||||
vec![
|
||||
(base_inherited_key, test_img("metadata inherited key 1")),
|
||||
(
|
||||
base_inherited_key_overwrite,
|
||||
test_img("metadata key overwrite 1a")
|
||||
),
|
||||
(base_key, test_img("metadata key 1")),
|
||||
(base_key_overwrite, test_img("metadata key overwrite 1b")),
|
||||
]
|
||||
);
|
||||
|
||||
// test vectored scan on child timeline
|
||||
let mut reconstruct_state = ValuesReconstructState::new();
|
||||
let res = child
|
||||
.get_vectored_impl(
|
||||
KeySpace::single(Key::metadata_key_range()),
|
||||
lsn,
|
||||
&mut reconstruct_state,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
assert_eq!(
|
||||
res.into_iter()
|
||||
.map(|(k, v)| (k, v.unwrap()))
|
||||
.collect::<Vec<_>>(),
|
||||
vec![
|
||||
(base_inherited_key, test_img("metadata inherited key 1")),
|
||||
(
|
||||
base_inherited_key_child,
|
||||
test_img("metadata inherited key 2")
|
||||
),
|
||||
(
|
||||
base_inherited_key_overwrite,
|
||||
test_img("metadata key overwrite 2a")
|
||||
),
|
||||
(base_key_child, test_img("metadata key 2")),
|
||||
(base_key_overwrite, test_img("metadata key overwrite 2b")),
|
||||
]
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf;
|
||||
use pageserver_api::models::CompactionAlgorithmSettings;
|
||||
use pageserver_api::models::EvictionPolicy;
|
||||
use pageserver_api::models::{self, TenantConfigPatch, ThrottleConfig};
|
||||
use pageserver_api::models::{self, TenantConfigPatch};
|
||||
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
|
||||
use serde::de::IntoDeserializer;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -597,7 +597,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
|
||||
.map(humantime),
|
||||
heatmap_period: value.heatmap_period.map(humantime),
|
||||
lazy_slru_download: value.lazy_slru_download,
|
||||
timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
|
||||
timeline_get_throttle: value.timeline_get_throttle,
|
||||
image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
|
||||
lsn_lease_length: value.lsn_lease_length.map(humantime),
|
||||
lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime),
|
||||
|
||||
@@ -84,17 +84,17 @@ impl Value {
|
||||
|
||||
fn to_u64(self) -> u64 {
|
||||
let b = &self.0;
|
||||
(b[0] as u64) << 32
|
||||
| (b[1] as u64) << 24
|
||||
| (b[2] as u64) << 16
|
||||
| (b[3] as u64) << 8
|
||||
((b[0] as u64) << 32)
|
||||
| ((b[1] as u64) << 24)
|
||||
| ((b[2] as u64) << 16)
|
||||
| ((b[3] as u64) << 8)
|
||||
| b[4] as u64
|
||||
}
|
||||
|
||||
fn to_blknum(self) -> u32 {
|
||||
let b = &self.0;
|
||||
assert!(b[0] == 0x80);
|
||||
(b[1] as u32) << 24 | (b[2] as u32) << 16 | (b[3] as u32) << 8 | b[4] as u32
|
||||
((b[1] as u32) << 24) | ((b[2] as u32) << 16) | ((b[3] as u32) << 8) | b[4] as u32
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -320,7 +320,6 @@ impl TimelineMetadata {
|
||||
|
||||
// Checksums make it awkward to build a valid instance by hand. This helper
|
||||
// provides a TimelineMetadata with a valid checksum in its header.
|
||||
#[cfg(test)]
|
||||
pub fn example() -> Self {
|
||||
let instance = Self::new(
|
||||
"0/16960E8".parse::<Lsn>().unwrap(),
|
||||
|
||||
@@ -63,22 +63,18 @@
|
||||
//! The contract between client and its user is that the user is responsible of
|
||||
//! scheduling operations in an order that keeps the remote consistent as
|
||||
//! described above.
|
||||
//!
|
||||
//! From the user's perspective, the operations are executed sequentially.
|
||||
//! Internally, the client knows which operations can be performed in parallel,
|
||||
//! and which operations act like a "barrier" that require preceding operations
|
||||
//! to finish. The calling code just needs to call the schedule-functions in the
|
||||
//! correct order, and the client will parallelize the operations in a way that
|
||||
//! is safe.
|
||||
//!
|
||||
//! The caller should be careful with deletion, though. They should not delete
|
||||
//! local files that have been scheduled for upload but not yet finished uploading.
|
||||
//! Otherwise the upload will fail. To wait for an upload to finish, use
|
||||
//! the 'wait_completion' function (more on that later.)
|
||||
//! is safe. For more details, see `UploadOp::can_bypass`.
|
||||
//!
|
||||
//! All of this relies on the following invariants:
|
||||
//!
|
||||
//! - We rely on read-after write consistency in the remote storage.
|
||||
//! - Layer files are immutable
|
||||
//! - Layer files are immutable.
|
||||
//!
|
||||
//! NB: Pageserver assumes that it has exclusive write access to the tenant in remote
|
||||
//! storage. Different tenants can be attached to different pageservers, but if the
|
||||
@@ -429,8 +425,16 @@ impl RemoteTimelineClient {
|
||||
/// an index file upload, i.e., it's not empty.
|
||||
/// The given `index_part` must be the one on the remote.
|
||||
pub fn init_upload_queue(&self, index_part: &IndexPart) -> anyhow::Result<()> {
|
||||
// Set the maximum number of inprogress tasks to the remote storage concurrency. There's
|
||||
// certainly no point in starting more upload tasks than this.
|
||||
let inprogress_limit = self
|
||||
.conf
|
||||
.remote_storage_config
|
||||
.as_ref()
|
||||
.and_then(|r| r.concurrency_limit())
|
||||
.unwrap_or(0);
|
||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||
upload_queue.initialize_with_current_remote_index_part(index_part)?;
|
||||
upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?;
|
||||
self.update_remote_physical_size_gauge(Some(index_part));
|
||||
info!(
|
||||
"initialized upload queue from remote index with {} layer files",
|
||||
@@ -445,8 +449,16 @@ impl RemoteTimelineClient {
|
||||
&self,
|
||||
local_metadata: &TimelineMetadata,
|
||||
) -> anyhow::Result<()> {
|
||||
// Set the maximum number of inprogress tasks to the remote storage concurrency. There's
|
||||
// certainly no point in starting more upload tasks than this.
|
||||
let inprogress_limit = self
|
||||
.conf
|
||||
.remote_storage_config
|
||||
.as_ref()
|
||||
.and_then(|r| r.concurrency_limit())
|
||||
.unwrap_or(0);
|
||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||
upload_queue.initialize_empty_remote(local_metadata)?;
|
||||
upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?;
|
||||
self.update_remote_physical_size_gauge(None);
|
||||
info!("initialized upload queue as empty");
|
||||
Ok(())
|
||||
@@ -462,9 +474,15 @@ impl RemoteTimelineClient {
|
||||
let deleted_at = index_part.deleted_at.ok_or(anyhow::anyhow!(
|
||||
"bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted"
|
||||
))?;
|
||||
let inprogress_limit = self
|
||||
.conf
|
||||
.remote_storage_config
|
||||
.as_ref()
|
||||
.and_then(|r| r.concurrency_limit())
|
||||
.unwrap_or(0);
|
||||
|
||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||
upload_queue.initialize_with_current_remote_index_part(index_part)?;
|
||||
upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?;
|
||||
self.update_remote_physical_size_gauge(Some(index_part));
|
||||
self.stop_impl(&mut upload_queue);
|
||||
|
||||
@@ -1855,57 +1873,17 @@ impl RemoteTimelineClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Pick next tasks from the queue, and start as many of them as possible without violating
|
||||
/// the ordering constraints.
|
||||
///
|
||||
/// The caller needs to already hold the `upload_queue` lock.
|
||||
/// TODO: consider limiting the number of in-progress tasks, beyond what remote_storage does.
|
||||
/// This can launch an unbounded number of queued tasks. `UploadQueue::next_ready()` also has
|
||||
/// worst-case quadratic cost in the number of tasks, and may struggle beyond 10,000 tasks.
|
||||
fn launch_queued_tasks(self: &Arc<Self>, upload_queue: &mut UploadQueueInitialized) {
|
||||
while let Some(next_op) = upload_queue.queued_operations.front() {
|
||||
// Can we run this task now?
|
||||
let can_run_now = match next_op {
|
||||
UploadOp::UploadLayer(..) => {
|
||||
// Can always be scheduled.
|
||||
true
|
||||
}
|
||||
UploadOp::UploadMetadata { .. } => {
|
||||
// These can only be performed after all the preceding operations
|
||||
// have finished.
|
||||
upload_queue.inprogress_tasks.is_empty()
|
||||
}
|
||||
UploadOp::Delete(..) => {
|
||||
// Wait for preceding uploads to finish. Concurrent deletions are OK, though.
|
||||
upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len()
|
||||
}
|
||||
while let Some((mut next_op, coalesced_ops)) = upload_queue.next_ready() {
|
||||
debug!("starting op: {next_op}");
|
||||
|
||||
UploadOp::Barrier(_) | UploadOp::Shutdown => {
|
||||
upload_queue.inprogress_tasks.is_empty()
|
||||
}
|
||||
};
|
||||
|
||||
// If we cannot launch this task, don't look any further.
|
||||
//
|
||||
// In some cases, we could let some non-frontmost tasks to "jump the queue" and launch
|
||||
// them now, but we don't try to do that currently. For example, if the frontmost task
|
||||
// is an index-file upload that cannot proceed until preceding uploads have finished, we
|
||||
// could still start layer uploads that were scheduled later.
|
||||
if !can_run_now {
|
||||
break;
|
||||
}
|
||||
|
||||
if let UploadOp::Shutdown = next_op {
|
||||
// leave the op in the queue but do not start more tasks; it will be dropped when
|
||||
// the stop is called.
|
||||
upload_queue.shutdown_ready.close();
|
||||
break;
|
||||
}
|
||||
|
||||
// We can launch this task. Remove it from the queue first.
|
||||
let mut next_op = upload_queue.queued_operations.pop_front().unwrap();
|
||||
|
||||
debug!("starting op: {}", next_op);
|
||||
|
||||
// Update the counters and prepare
|
||||
// Prepare upload.
|
||||
match &mut next_op {
|
||||
UploadOp::UploadLayer(layer, meta, mode) => {
|
||||
if upload_queue
|
||||
@@ -1916,18 +1894,14 @@ impl RemoteTimelineClient {
|
||||
} else {
|
||||
*mode = Some(OpType::MayReorder)
|
||||
}
|
||||
upload_queue.num_inprogress_layer_uploads += 1;
|
||||
}
|
||||
UploadOp::UploadMetadata { .. } => {
|
||||
upload_queue.num_inprogress_metadata_uploads += 1;
|
||||
}
|
||||
UploadOp::UploadMetadata { .. } => {}
|
||||
UploadOp::Delete(Delete { layers }) => {
|
||||
for (name, meta) in layers {
|
||||
upload_queue
|
||||
.recently_deleted
|
||||
.insert((name.clone(), meta.generation));
|
||||
}
|
||||
upload_queue.num_inprogress_deletions += 1;
|
||||
}
|
||||
UploadOp::Barrier(sender) => {
|
||||
sender.send_replace(());
|
||||
@@ -1944,6 +1918,7 @@ impl RemoteTimelineClient {
|
||||
let task = Arc::new(UploadTask {
|
||||
task_id: upload_task_id,
|
||||
op: next_op,
|
||||
coalesced_ops,
|
||||
retries: AtomicU32::new(0),
|
||||
});
|
||||
upload_queue
|
||||
@@ -2027,6 +2002,8 @@ impl RemoteTimelineClient {
|
||||
|
||||
let upload_result: anyhow::Result<()> = match &task.op {
|
||||
UploadOp::UploadLayer(ref layer, ref layer_metadata, mode) => {
|
||||
// TODO: check if this mechanism can be removed now that can_bypass() performs
|
||||
// conflict checks during scheduling.
|
||||
if let Some(OpType::FlushDeletion) = mode {
|
||||
if self.config.read().unwrap().block_deletions {
|
||||
// Of course, this is not efficient... but usually the queue should be empty.
|
||||
@@ -2249,13 +2226,8 @@ impl RemoteTimelineClient {
|
||||
upload_queue.inprogress_tasks.remove(&task.task_id);
|
||||
|
||||
let lsn_update = match task.op {
|
||||
UploadOp::UploadLayer(_, _, _) => {
|
||||
upload_queue.num_inprogress_layer_uploads -= 1;
|
||||
None
|
||||
}
|
||||
UploadOp::UploadLayer(_, _, _) => None,
|
||||
UploadOp::UploadMetadata { ref uploaded } => {
|
||||
upload_queue.num_inprogress_metadata_uploads -= 1;
|
||||
|
||||
// the task id is reused as a monotonicity check for storing the "clean"
|
||||
// IndexPart.
|
||||
let last_updater = upload_queue.clean.1;
|
||||
@@ -2289,10 +2261,7 @@ impl RemoteTimelineClient {
|
||||
None
|
||||
}
|
||||
}
|
||||
UploadOp::Delete(_) => {
|
||||
upload_queue.num_inprogress_deletions -= 1;
|
||||
None
|
||||
}
|
||||
UploadOp::Delete(_) => None,
|
||||
UploadOp::Barrier(..) | UploadOp::Shutdown => unreachable!(),
|
||||
};
|
||||
|
||||
@@ -2317,6 +2286,9 @@ impl RemoteTimelineClient {
|
||||
}
|
||||
|
||||
self.metric_end(&task.op);
|
||||
for coalesced_op in &task.coalesced_ops {
|
||||
self.metric_end(coalesced_op);
|
||||
}
|
||||
}
|
||||
|
||||
fn metric_impl(
|
||||
@@ -2409,6 +2381,7 @@ impl RemoteTimelineClient {
|
||||
// but for this use case it doesnt really makes sense to bring unsafe code only for this usage point.
|
||||
// Deletion is not really perf sensitive so there shouldnt be any problems with cloning a fraction of it.
|
||||
let upload_queue_for_deletion = UploadQueueInitialized {
|
||||
inprogress_limit: initialized.inprogress_limit,
|
||||
task_counter: 0,
|
||||
dirty: initialized.dirty.clone(),
|
||||
clean: initialized.clean.clone(),
|
||||
@@ -2416,9 +2389,6 @@ impl RemoteTimelineClient {
|
||||
visible_remote_consistent_lsn: initialized
|
||||
.visible_remote_consistent_lsn
|
||||
.clone(),
|
||||
num_inprogress_layer_uploads: 0,
|
||||
num_inprogress_metadata_uploads: 0,
|
||||
num_inprogress_deletions: 0,
|
||||
inprogress_tasks: HashMap::default(),
|
||||
queued_operations: VecDeque::default(),
|
||||
#[cfg(feature = "testing")]
|
||||
@@ -2445,14 +2415,6 @@ impl RemoteTimelineClient {
|
||||
}
|
||||
};
|
||||
|
||||
// consistency check
|
||||
assert_eq!(
|
||||
qi.num_inprogress_layer_uploads
|
||||
+ qi.num_inprogress_metadata_uploads
|
||||
+ qi.num_inprogress_deletions,
|
||||
qi.inprogress_tasks.len()
|
||||
);
|
||||
|
||||
// We don't need to do anything here for in-progress tasks. They will finish
|
||||
// on their own, decrement the unfinished-task counter themselves, and observe
|
||||
// that the queue is Stopped.
|
||||
@@ -2899,8 +2861,8 @@ mod tests {
|
||||
let mut guard = client.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut().unwrap();
|
||||
assert!(upload_queue.queued_operations.is_empty());
|
||||
assert!(upload_queue.inprogress_tasks.len() == 2);
|
||||
assert!(upload_queue.num_inprogress_layer_uploads == 2);
|
||||
assert_eq!(upload_queue.inprogress_tasks.len(), 2);
|
||||
assert_eq!(upload_queue.num_inprogress_layer_uploads(), 2);
|
||||
|
||||
// also check that `latest_file_changes` was updated
|
||||
assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 2);
|
||||
@@ -2970,8 +2932,8 @@ mod tests {
|
||||
// Deletion schedules upload of the index file, and the file deletion itself
|
||||
assert_eq!(upload_queue.queued_operations.len(), 2);
|
||||
assert_eq!(upload_queue.inprogress_tasks.len(), 1);
|
||||
assert_eq!(upload_queue.num_inprogress_layer_uploads, 1);
|
||||
assert_eq!(upload_queue.num_inprogress_deletions, 0);
|
||||
assert_eq!(upload_queue.num_inprogress_layer_uploads(), 1);
|
||||
assert_eq!(upload_queue.num_inprogress_deletions(), 0);
|
||||
assert_eq!(
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled,
|
||||
0
|
||||
|
||||
@@ -104,7 +104,7 @@ impl IndexPart {
|
||||
|
||||
pub const FILE_NAME: &'static str = "index_part.json";
|
||||
|
||||
pub(crate) fn empty(metadata: TimelineMetadata) -> Self {
|
||||
pub fn empty(metadata: TimelineMetadata) -> Self {
|
||||
IndexPart {
|
||||
version: Self::LATEST_VERSION,
|
||||
layer_metadata: Default::default(),
|
||||
|
||||
@@ -12,7 +12,7 @@ pub mod merge_iterator;
|
||||
|
||||
use crate::context::{AccessStatsBehavior, RequestContext};
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::key::{Key, NON_INHERITED_SPARSE_RANGE};
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::value::Value;
|
||||
@@ -209,7 +209,7 @@ impl ValuesReconstructState {
|
||||
.keys
|
||||
.entry(*key)
|
||||
.or_insert(Ok(VectoredValueReconstructState::default()));
|
||||
let is_sparse_key = NON_INHERITED_SPARSE_RANGE.contains(key);
|
||||
let is_sparse_key = key.is_sparse();
|
||||
if let Ok(state) = state {
|
||||
let key_done = match state.situation {
|
||||
ValueReconstructSituation::Complete => {
|
||||
|
||||
@@ -112,8 +112,8 @@ const MAX_SUPPORTED_BLOB_LEN_BITS: usize = {
|
||||
///
|
||||
/// Layout:
|
||||
/// - 1 bit: `will_init`
|
||||
/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`]: `len`
|
||||
/// - [`MAX_SUPPORTED_POS_BITS`]: `pos`
|
||||
/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`][]: `len`
|
||||
/// - [`MAX_SUPPORTED_POS_BITS`](IndexEntry::MAX_SUPPORTED_POS_BITS): `pos`
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub struct IndexEntry(u64);
|
||||
|
||||
|
||||
@@ -1812,7 +1812,7 @@ enum LayerKind {
|
||||
|
||||
/// Guard for forcing a layer be resident while it exists.
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct ResidentLayer {
|
||||
pub struct ResidentLayer {
|
||||
owner: Layer,
|
||||
downloaded: Arc<DownloadedLayer>,
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::{
|
||||
atomic::{AtomicU64, Ordering},
|
||||
Arc,
|
||||
},
|
||||
time::{Duration, Instant},
|
||||
time::Instant,
|
||||
};
|
||||
|
||||
use arc_swap::ArcSwap;
|
||||
@@ -16,9 +16,8 @@ use utils::leaky_bucket::{LeakyBucketConfig, RateLimiter};
|
||||
/// To share a throttle among multiple entities, wrap it in an [`Arc`].
|
||||
///
|
||||
/// The intial use case for this is tenant-wide throttling of getpage@lsn requests.
|
||||
pub struct Throttle<M: Metric> {
|
||||
pub struct Throttle {
|
||||
inner: ArcSwap<Inner>,
|
||||
metric: M,
|
||||
/// will be turned into [`Stats::count_accounted_start`]
|
||||
count_accounted_start: AtomicU64,
|
||||
/// will be turned into [`Stats::count_accounted_finish`]
|
||||
@@ -36,15 +35,6 @@ pub struct Inner {
|
||||
|
||||
pub type Config = pageserver_api::models::ThrottleConfig;
|
||||
|
||||
pub struct Observation {
|
||||
pub wait_time: Duration,
|
||||
}
|
||||
pub trait Metric {
|
||||
fn accounting_start(&self);
|
||||
fn accounting_finish(&self);
|
||||
fn observe_throttling(&self, observation: &Observation);
|
||||
}
|
||||
|
||||
/// See [`Throttle::reset_stats`].
|
||||
pub struct Stats {
|
||||
/// Number of requests that started [`Throttle::throttle`] calls.
|
||||
@@ -59,18 +49,14 @@ pub struct Stats {
|
||||
}
|
||||
|
||||
pub enum ThrottleResult {
|
||||
NotThrottled { start: Instant },
|
||||
Throttled { start: Instant, end: Instant },
|
||||
NotThrottled { end: Instant },
|
||||
Throttled { end: Instant },
|
||||
}
|
||||
|
||||
impl<M> Throttle<M>
|
||||
where
|
||||
M: Metric,
|
||||
{
|
||||
pub fn new(config: Config, metric: M) -> Self {
|
||||
impl Throttle {
|
||||
pub fn new(config: Config) -> Self {
|
||||
Self {
|
||||
inner: ArcSwap::new(Arc::new(Self::new_inner(config))),
|
||||
metric,
|
||||
count_accounted_start: AtomicU64::new(0),
|
||||
count_accounted_finish: AtomicU64::new(0),
|
||||
count_throttled: AtomicU64::new(0),
|
||||
@@ -127,32 +113,27 @@ where
|
||||
self.inner.load().rate_limiter.steady_rps()
|
||||
}
|
||||
|
||||
pub async fn throttle(&self, key_count: usize) -> ThrottleResult {
|
||||
/// `start` must be [`Instant::now`] or earlier.
|
||||
pub async fn throttle(&self, key_count: usize, start: Instant) -> ThrottleResult {
|
||||
let inner = self.inner.load_full(); // clones the `Inner` Arc
|
||||
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
if !inner.enabled {
|
||||
return ThrottleResult::NotThrottled { start };
|
||||
return ThrottleResult::NotThrottled { end: start };
|
||||
}
|
||||
|
||||
self.metric.accounting_start();
|
||||
self.count_accounted_start.fetch_add(1, Ordering::Relaxed);
|
||||
let did_throttle = inner.rate_limiter.acquire(key_count).await;
|
||||
self.count_accounted_finish.fetch_add(1, Ordering::Relaxed);
|
||||
self.metric.accounting_finish();
|
||||
|
||||
if did_throttle {
|
||||
self.count_throttled.fetch_add(1, Ordering::Relaxed);
|
||||
let now = Instant::now();
|
||||
let wait_time = now - start;
|
||||
let end = Instant::now();
|
||||
let wait_time = end - start;
|
||||
self.sum_throttled_usecs
|
||||
.fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed);
|
||||
let observation = Observation { wait_time };
|
||||
self.metric.observe_throttling(&observation);
|
||||
ThrottleResult::Throttled { start, end: now }
|
||||
ThrottleResult::Throttled { end }
|
||||
} else {
|
||||
ThrottleResult::NotThrottled { start }
|
||||
ThrottleResult::NotThrottled { end: start }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,7 +27,7 @@ use pageserver_api::{
|
||||
config::tenant_conf_defaults::DEFAULT_COMPACTION_THRESHOLD,
|
||||
key::{
|
||||
KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
|
||||
NON_INHERITED_SPARSE_RANGE,
|
||||
SPARSE_RANGE,
|
||||
},
|
||||
keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
|
||||
models::{
|
||||
@@ -208,8 +208,8 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
|
||||
/// The outward-facing resources required to build a Timeline
|
||||
pub struct TimelineResources {
|
||||
pub remote_client: RemoteTimelineClient,
|
||||
pub pagestream_throttle:
|
||||
Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::Pagestream>>,
|
||||
pub pagestream_throttle: Arc<crate::tenant::throttle::Throttle>,
|
||||
pub pagestream_throttle_metrics: Arc<crate::metrics::tenant_throttling::Pagestream>,
|
||||
pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
|
||||
}
|
||||
|
||||
@@ -412,8 +412,7 @@ pub struct Timeline {
|
||||
gc_lock: tokio::sync::Mutex<()>,
|
||||
|
||||
/// Cloned from [`super::Tenant::pagestream_throttle`] on construction.
|
||||
pub(crate) pagestream_throttle:
|
||||
Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::Pagestream>>,
|
||||
pub(crate) pagestream_throttle: Arc<crate::tenant::throttle::Throttle>,
|
||||
|
||||
/// Size estimator for aux file v2
|
||||
pub(crate) aux_file_size_estimator: AuxFileSizeEstimator,
|
||||
@@ -2310,6 +2309,7 @@ impl Timeline {
|
||||
query_metrics: crate::metrics::SmgrQueryTimePerTimeline::new(
|
||||
&tenant_shard_id,
|
||||
&timeline_id,
|
||||
resources.pagestream_throttle_metrics,
|
||||
),
|
||||
|
||||
directory_metrics: array::from_fn(|_| AtomicU64::new(0)),
|
||||
@@ -3221,7 +3221,7 @@ impl Timeline {
|
||||
// We don't return a blanket [`GetVectoredError::MissingKey`] to avoid
|
||||
// stalling compaction.
|
||||
keyspace.remove_overlapping_with(&KeySpace {
|
||||
ranges: vec![NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE],
|
||||
ranges: vec![NON_INHERITED_RANGE, Key::sparse_non_inherited_keyspace()],
|
||||
});
|
||||
|
||||
// Keyspace is fully retrieved
|
||||
@@ -3242,7 +3242,11 @@ impl Timeline {
|
||||
// keys from `keyspace`, we expect there to be no overlap between it and the image covered key
|
||||
// space. If that's not the case, we had at least one key encounter a gap in the image layer
|
||||
// and stop the search as a result of that.
|
||||
let removed = keyspace.remove_overlapping_with(&image_covered_keyspace);
|
||||
let mut removed = keyspace.remove_overlapping_with(&image_covered_keyspace);
|
||||
// Do not fire missing key error for sparse keys.
|
||||
removed.remove_overlapping_with(&KeySpace {
|
||||
ranges: vec![SPARSE_RANGE],
|
||||
});
|
||||
if !removed.is_empty() {
|
||||
break Some(removed);
|
||||
}
|
||||
@@ -3257,6 +3261,21 @@ impl Timeline {
|
||||
timeline = &*timeline_owned;
|
||||
};
|
||||
|
||||
// Remove sparse keys from the keyspace so that it doesn't fire errors.
|
||||
let missing_keyspace = if let Some(missing_keyspace) = missing_keyspace {
|
||||
let mut missing_keyspace = missing_keyspace;
|
||||
missing_keyspace.remove_overlapping_with(&KeySpace {
|
||||
ranges: vec![SPARSE_RANGE],
|
||||
});
|
||||
if missing_keyspace.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(missing_keyspace)
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
if let Some(missing_keyspace) = missing_keyspace {
|
||||
return Err(GetVectoredError::MissingKey(MissingKeyError {
|
||||
key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */
|
||||
@@ -3762,36 +3781,35 @@ impl Timeline {
|
||||
return Err(FlushLayerError::Cancelled);
|
||||
}
|
||||
|
||||
let mut layers_to_upload = Vec::new();
|
||||
layers_to_upload.extend(
|
||||
self.create_image_layers(
|
||||
&rel_partition,
|
||||
self.initdb_lsn,
|
||||
ImageLayerCreationMode::Initial,
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
// Ensure that we have a single call to `create_image_layers` with a combined dense keyspace.
|
||||
// So that the key ranges don't overlap.
|
||||
let mut partitions = KeyPartitioning::default();
|
||||
partitions.parts.extend(rel_partition.parts);
|
||||
if !metadata_partition.parts.is_empty() {
|
||||
assert_eq!(
|
||||
metadata_partition.parts.len(),
|
||||
1,
|
||||
"currently sparse keyspace should only contain a single metadata keyspace"
|
||||
);
|
||||
layers_to_upload.extend(
|
||||
self.create_image_layers(
|
||||
// Safety: create_image_layers treat sparse keyspaces differently that it does not scan
|
||||
// every single key within the keyspace, and therefore, it's safe to force converting it
|
||||
// into a dense keyspace before calling this function.
|
||||
&metadata_partition.into_dense(),
|
||||
self.initdb_lsn,
|
||||
ImageLayerCreationMode::Initial,
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
// Safety: create_image_layers treat sparse keyspaces differently that it does not scan
|
||||
// every single key within the keyspace, and therefore, it's safe to force converting it
|
||||
// into a dense keyspace before calling this function.
|
||||
partitions
|
||||
.parts
|
||||
.extend(metadata_partition.into_dense().parts);
|
||||
}
|
||||
|
||||
let mut layers_to_upload = Vec::new();
|
||||
layers_to_upload.extend(
|
||||
self.create_image_layers(
|
||||
&partitions,
|
||||
self.initdb_lsn,
|
||||
ImageLayerCreationMode::Initial,
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
|
||||
(layers_to_upload, None)
|
||||
} else {
|
||||
// Normal case, write out a L0 delta layer file.
|
||||
|
||||
@@ -301,6 +301,7 @@ impl DeleteTimelineFlow {
|
||||
TimelineResources {
|
||||
remote_client,
|
||||
pagestream_throttle: tenant.pagestream_throttle.clone(),
|
||||
pagestream_throttle_metrics: tenant.pagestream_throttle_metrics.clone(),
|
||||
l0_flush_global_state: tenant.l0_flush_global_state.clone(),
|
||||
},
|
||||
// Important. We dont pass ancestor above because it can be missing.
|
||||
|
||||
@@ -264,6 +264,8 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
|
||||
let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?;
|
||||
|
||||
let shard = vec![*timeline.get_shard_identity()];
|
||||
|
||||
let interpreted_proto_config = match protocol {
|
||||
PostgresClientProtocol::Vanilla => None,
|
||||
PostgresClientProtocol::Interpreted {
|
||||
@@ -403,7 +405,7 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
// need to advance last record LSN on all shards. If we've not ingested the latest
|
||||
// record, then set the LSN of the modification past it. This way all shards
|
||||
// advance their last record LSN at the same time.
|
||||
let needs_last_record_lsn_advance = match next_record_lsn.map(Lsn::from) {
|
||||
let needs_last_record_lsn_advance = match next_record_lsn {
|
||||
Some(lsn) if lsn > modification.get_lsn() => {
|
||||
modification.set_lsn(lsn).unwrap();
|
||||
true
|
||||
@@ -476,10 +478,12 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
// Deserialize and interpret WAL record
|
||||
let interpreted = InterpretedWalRecord::from_bytes_filtered(
|
||||
recdata,
|
||||
modification.tline.get_shard_identity(),
|
||||
&shard,
|
||||
next_record_lsn,
|
||||
modification.tline.pg_version,
|
||||
)?;
|
||||
)?
|
||||
.remove(timeline.get_shard_identity())
|
||||
.unwrap();
|
||||
|
||||
if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
|
||||
&& uncommitted_records > 0
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -308,7 +308,7 @@ impl WalIngest {
|
||||
epoch -= 1;
|
||||
}
|
||||
|
||||
Ok((epoch as u64) << 32 | xid as u64)
|
||||
Ok(((epoch as u64) << 32) | xid as u64)
|
||||
}
|
||||
|
||||
async fn ingest_clear_vm_bits(
|
||||
@@ -2163,10 +2163,12 @@ mod tests {
|
||||
while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() {
|
||||
let interpreted = InterpretedWalRecord::from_bytes_filtered(
|
||||
recdata,
|
||||
modification.tline.get_shard_identity(),
|
||||
&[*modification.tline.get_shard_identity()],
|
||||
lsn,
|
||||
modification.tline.pg_version,
|
||||
)
|
||||
.unwrap()
|
||||
.remove(modification.tline.get_shard_identity())
|
||||
.unwrap();
|
||||
|
||||
walingest
|
||||
|
||||
@@ -51,6 +51,26 @@ HexDecodeString(uint8 *result, char *input, int nbytes)
|
||||
return true;
|
||||
}
|
||||
|
||||
/* --------------------------------
|
||||
* pq_getmsgint16 - get a binary 2-byte int from a message buffer
|
||||
* --------------------------------
|
||||
*/
|
||||
uint16
|
||||
pq_getmsgint16(StringInfo msg)
|
||||
{
|
||||
return pq_getmsgint(msg, 2);
|
||||
}
|
||||
|
||||
/* --------------------------------
|
||||
* pq_getmsgint32 - get a binary 4-byte int from a message buffer
|
||||
* --------------------------------
|
||||
*/
|
||||
uint32
|
||||
pq_getmsgint32(StringInfo msg)
|
||||
{
|
||||
return pq_getmsgint(msg, 4);
|
||||
}
|
||||
|
||||
/* --------------------------------
|
||||
* pq_getmsgint32_le - get a binary 4-byte int from a message buffer in native (LE) order
|
||||
* --------------------------------
|
||||
|
||||
@@ -8,6 +8,8 @@
|
||||
#endif
|
||||
|
||||
bool HexDecodeString(uint8 *result, char *input, int nbytes);
|
||||
uint16 pq_getmsgint16(StringInfo msg);
|
||||
uint32 pq_getmsgint32(StringInfo msg);
|
||||
uint32 pq_getmsgint32_le(StringInfo msg);
|
||||
uint64 pq_getmsgint64_le(StringInfo msg);
|
||||
void pq_sendint32_le(StringInfo buf, uint32 i);
|
||||
|
||||
@@ -70,6 +70,7 @@ static bool SendAppendRequests(Safekeeper *sk);
|
||||
static bool RecvAppendResponses(Safekeeper *sk);
|
||||
static XLogRecPtr CalculateMinFlushLsn(WalProposer *wp);
|
||||
static XLogRecPtr GetAcknowledgedByQuorumWALPosition(WalProposer *wp);
|
||||
static void PAMessageSerialize(WalProposer *wp, ProposerAcceptorMessage *msg, StringInfo buf, int proto_version);
|
||||
static void HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk);
|
||||
static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size);
|
||||
static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg);
|
||||
@@ -81,6 +82,8 @@ static char *FormatSafekeeperState(Safekeeper *sk);
|
||||
static void AssertEventsOkForState(uint32 events, Safekeeper *sk);
|
||||
static char *FormatEvents(WalProposer *wp, uint32 events);
|
||||
static void UpdateDonorShmem(WalProposer *wp);
|
||||
static char *MembershipConfigurationToString(MembershipConfiguration *mconf);
|
||||
static void MembershipConfigurationFree(MembershipConfiguration *mconf);
|
||||
|
||||
WalProposer *
|
||||
WalProposerCreate(WalProposerConfig *config, walproposer_api api)
|
||||
@@ -137,25 +140,21 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
|
||||
}
|
||||
wp->quorum = wp->n_safekeepers / 2 + 1;
|
||||
|
||||
if (wp->config->proto_version != 2 && wp->config->proto_version != 3)
|
||||
wp_log(FATAL, "unsupported safekeeper protocol version %d", wp->config->proto_version);
|
||||
wp_log(LOG, "using safekeeper protocol version %d", wp->config->proto_version);
|
||||
|
||||
/* Fill the greeting package */
|
||||
wp->greetRequest.tag = 'g';
|
||||
wp->greetRequest.protocolVersion = SK_PROTOCOL_VERSION;
|
||||
wp->greetRequest.pgVersion = PG_VERSION_NUM;
|
||||
wp->api.strong_random(wp, &wp->greetRequest.proposerId, sizeof(wp->greetRequest.proposerId));
|
||||
wp->greetRequest.systemId = wp->config->systemId;
|
||||
if (!wp->config->neon_timeline)
|
||||
wp_log(FATAL, "neon.timeline_id is not provided");
|
||||
if (*wp->config->neon_timeline != '\0' &&
|
||||
!HexDecodeString(wp->greetRequest.timeline_id, wp->config->neon_timeline, 16))
|
||||
wp_log(FATAL, "could not parse neon.timeline_id, %s", wp->config->neon_timeline);
|
||||
wp->greetRequest.pam.tag = 'g';
|
||||
if (!wp->config->neon_tenant)
|
||||
wp_log(FATAL, "neon.tenant_id is not provided");
|
||||
if (*wp->config->neon_tenant != '\0' &&
|
||||
!HexDecodeString(wp->greetRequest.tenant_id, wp->config->neon_tenant, 16))
|
||||
wp_log(FATAL, "could not parse neon.tenant_id, %s", wp->config->neon_tenant);
|
||||
|
||||
wp->greetRequest.timeline = wp->config->pgTimeline;
|
||||
wp->greetRequest.walSegSize = wp->config->wal_segment_size;
|
||||
wp->greetRequest.tenant_id = wp->config->neon_tenant;
|
||||
if (!wp->config->neon_timeline)
|
||||
wp_log(FATAL, "neon.timeline_id is not provided");
|
||||
wp->greetRequest.timeline_id = wp->config->neon_timeline;
|
||||
wp->greetRequest.pg_version = PG_VERSION_NUM;
|
||||
wp->greetRequest.system_id = wp->config->systemId;
|
||||
wp->greetRequest.wal_seg_size = wp->config->wal_segment_size;
|
||||
|
||||
wp->api.init_event_set(wp);
|
||||
|
||||
@@ -165,12 +164,14 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
|
||||
void
|
||||
WalProposerFree(WalProposer *wp)
|
||||
{
|
||||
MembershipConfigurationFree(&wp->mconf);
|
||||
for (int i = 0; i < wp->n_safekeepers; i++)
|
||||
{
|
||||
Safekeeper *sk = &wp->safekeeper[i];
|
||||
|
||||
Assert(sk->outbuf.data != NULL);
|
||||
pfree(sk->outbuf.data);
|
||||
MembershipConfigurationFree(&sk->greetResponse.mconf);
|
||||
if (sk->voteResponse.termHistory.entries)
|
||||
pfree(sk->voteResponse.termHistory.entries);
|
||||
sk->voteResponse.termHistory.entries = NULL;
|
||||
@@ -308,6 +309,7 @@ ShutdownConnection(Safekeeper *sk)
|
||||
sk->state = SS_OFFLINE;
|
||||
sk->streamingAt = InvalidXLogRecPtr;
|
||||
|
||||
MembershipConfigurationFree(&sk->greetResponse.mconf);
|
||||
if (sk->voteResponse.termHistory.entries)
|
||||
pfree(sk->voteResponse.termHistory.entries);
|
||||
sk->voteResponse.termHistory.entries = NULL;
|
||||
@@ -598,11 +600,14 @@ static void
|
||||
SendStartWALPush(Safekeeper *sk)
|
||||
{
|
||||
WalProposer *wp = sk->wp;
|
||||
#define CMD_LEN 512
|
||||
char cmd[CMD_LEN];
|
||||
|
||||
if (!wp->api.conn_send_query(sk, "START_WAL_PUSH"))
|
||||
snprintf(cmd, CMD_LEN, "START_WAL_PUSH (proto_version '%d')", wp->config->proto_version);
|
||||
if (!wp->api.conn_send_query(sk, cmd))
|
||||
{
|
||||
wp_log(WARNING, "failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s",
|
||||
sk->host, sk->port, wp->api.conn_error_message(sk));
|
||||
wp_log(WARNING, "failed to send %s query to safekeeper %s:%s: %s",
|
||||
cmd, sk->host, sk->port, wp->api.conn_error_message(sk));
|
||||
ShutdownConnection(sk);
|
||||
return;
|
||||
}
|
||||
@@ -658,23 +663,33 @@ RecvStartWALPushResult(Safekeeper *sk)
|
||||
|
||||
/*
|
||||
* Start handshake: first of all send information about the
|
||||
* safekeeper. After sending, we wait on SS_HANDSHAKE_RECV for
|
||||
* walproposer. After sending, we wait on SS_HANDSHAKE_RECV for
|
||||
* a response to finish the handshake.
|
||||
*/
|
||||
static void
|
||||
SendProposerGreeting(Safekeeper *sk)
|
||||
{
|
||||
WalProposer *wp = sk->wp;
|
||||
char *mconf_toml = MembershipConfigurationToString(&wp->greetRequest.mconf);
|
||||
|
||||
wp_log(LOG, "sending ProposerGreeting to safekeeper %s:%s with mconf = %s", sk->host, sk->port, mconf_toml);
|
||||
pfree(mconf_toml);
|
||||
|
||||
PAMessageSerialize(wp, (ProposerAcceptorMessage *) &wp->greetRequest,
|
||||
&sk->outbuf, wp->config->proto_version);
|
||||
|
||||
/*
|
||||
* On failure, logging & resetting the connection is handled. We just need
|
||||
* to handle the control flow.
|
||||
*/
|
||||
BlockingWrite(sk, &sk->wp->greetRequest, sizeof(sk->wp->greetRequest), SS_HANDSHAKE_RECV);
|
||||
BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_HANDSHAKE_RECV);
|
||||
}
|
||||
|
||||
static void
|
||||
RecvAcceptorGreeting(Safekeeper *sk)
|
||||
{
|
||||
WalProposer *wp = sk->wp;
|
||||
char *mconf_toml;
|
||||
|
||||
/*
|
||||
* If our reading doesn't immediately succeed, any necessary error
|
||||
@@ -685,7 +700,10 @@ RecvAcceptorGreeting(Safekeeper *sk)
|
||||
if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse))
|
||||
return;
|
||||
|
||||
wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s, term=" INT64_FORMAT, sk->host, sk->port, sk->greetResponse.term);
|
||||
mconf_toml = MembershipConfigurationToString(&sk->greetResponse.mconf);
|
||||
wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s, node_id = %lu, mconf = %s, term=" UINT64_FORMAT,
|
||||
sk->host, sk->port, sk->greetResponse.nodeId, mconf_toml, sk->greetResponse.term);
|
||||
pfree(mconf_toml);
|
||||
|
||||
/* Protocol is all good, move to voting. */
|
||||
sk->state = SS_VOTING;
|
||||
@@ -707,12 +725,9 @@ RecvAcceptorGreeting(Safekeeper *sk)
|
||||
wp->propTerm++;
|
||||
wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm);
|
||||
|
||||
wp->voteRequest = (VoteRequest)
|
||||
{
|
||||
.tag = 'v',
|
||||
.term = wp->propTerm
|
||||
};
|
||||
memcpy(wp->voteRequest.proposerId.data, wp->greetRequest.proposerId.data, UUID_LEN);
|
||||
wp->voteRequest.pam.tag = 'v';
|
||||
wp->voteRequest.generation = wp->mconf.generation;
|
||||
wp->voteRequest.term = wp->propTerm;
|
||||
}
|
||||
}
|
||||
else if (sk->greetResponse.term > wp->propTerm)
|
||||
@@ -759,12 +774,14 @@ SendVoteRequest(Safekeeper *sk)
|
||||
{
|
||||
WalProposer *wp = sk->wp;
|
||||
|
||||
/* We have quorum for voting, send our vote request */
|
||||
wp_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term);
|
||||
/* On failure, logging & resetting is handled */
|
||||
if (!BlockingWrite(sk, &wp->voteRequest, sizeof(wp->voteRequest), SS_WAIT_VERDICT))
|
||||
return;
|
||||
PAMessageSerialize(wp, (ProposerAcceptorMessage *) &wp->voteRequest,
|
||||
&sk->outbuf, wp->config->proto_version);
|
||||
|
||||
/* We have quorum for voting, send our vote request */
|
||||
wp_log(LOG, "requesting vote from %s:%s for generation %u term " UINT64_FORMAT, sk->host, sk->port,
|
||||
wp->voteRequest.generation, wp->voteRequest.term);
|
||||
/* On failure, logging & resetting is handled */
|
||||
BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_WAIT_VERDICT);
|
||||
/* If successful, wait for read-ready with SS_WAIT_VERDICT */
|
||||
}
|
||||
|
||||
@@ -778,11 +795,12 @@ RecvVoteResponse(Safekeeper *sk)
|
||||
return;
|
||||
|
||||
wp_log(LOG,
|
||||
"got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X",
|
||||
sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory),
|
||||
"got VoteResponse from acceptor %s:%s, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X",
|
||||
sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term,
|
||||
sk->voteResponse.voteGiven,
|
||||
GetHighestTerm(&sk->voteResponse.termHistory),
|
||||
LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
|
||||
LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn),
|
||||
LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn));
|
||||
LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn));
|
||||
|
||||
/*
|
||||
* In case of acceptor rejecting our vote, bail out, but only if either it
|
||||
@@ -847,9 +865,9 @@ HandleElectedProposer(WalProposer *wp)
|
||||
* otherwise we must be sync-safekeepers and we have nothing to do then.
|
||||
*
|
||||
* Proceeding is not only pointless but harmful, because we'd give
|
||||
* safekeepers term history starting with 0/0. These hacks will go away once
|
||||
* we disable implicit timeline creation on safekeepers and create it with
|
||||
* non zero LSN from the start.
|
||||
* safekeepers term history starting with 0/0. These hacks will go away
|
||||
* once we disable implicit timeline creation on safekeepers and create it
|
||||
* with non zero LSN from the start.
|
||||
*/
|
||||
if (wp->propEpochStartLsn == InvalidXLogRecPtr)
|
||||
{
|
||||
@@ -942,7 +960,6 @@ DetermineEpochStartLsn(WalProposer *wp)
|
||||
wp->propEpochStartLsn = InvalidXLogRecPtr;
|
||||
wp->donorEpoch = 0;
|
||||
wp->truncateLsn = InvalidXLogRecPtr;
|
||||
wp->timelineStartLsn = InvalidXLogRecPtr;
|
||||
|
||||
for (int i = 0; i < wp->n_safekeepers; i++)
|
||||
{
|
||||
@@ -959,20 +976,6 @@ DetermineEpochStartLsn(WalProposer *wp)
|
||||
wp->donor = i;
|
||||
}
|
||||
wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn);
|
||||
|
||||
if (wp->safekeeper[i].voteResponse.timelineStartLsn != InvalidXLogRecPtr)
|
||||
{
|
||||
/* timelineStartLsn should be the same everywhere or unknown */
|
||||
if (wp->timelineStartLsn != InvalidXLogRecPtr &&
|
||||
wp->timelineStartLsn != wp->safekeeper[i].voteResponse.timelineStartLsn)
|
||||
{
|
||||
wp_log(WARNING,
|
||||
"inconsistent timelineStartLsn: current %X/%X, received %X/%X",
|
||||
LSN_FORMAT_ARGS(wp->timelineStartLsn),
|
||||
LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn));
|
||||
}
|
||||
wp->timelineStartLsn = wp->safekeeper[i].voteResponse.timelineStartLsn;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -995,22 +998,11 @@ DetermineEpochStartLsn(WalProposer *wp)
|
||||
if (wp->propEpochStartLsn == InvalidXLogRecPtr && !wp->config->syncSafekeepers)
|
||||
{
|
||||
wp->propEpochStartLsn = wp->truncateLsn = wp->api.get_redo_start_lsn(wp);
|
||||
if (wp->timelineStartLsn == InvalidXLogRecPtr)
|
||||
{
|
||||
wp->timelineStartLsn = wp->api.get_redo_start_lsn(wp);
|
||||
}
|
||||
wp_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn));
|
||||
}
|
||||
pg_atomic_write_u64(&wp->api.get_shmem_state(wp)->propEpochStartLsn, wp->propEpochStartLsn);
|
||||
|
||||
/*
|
||||
* Safekeepers are setting truncateLsn after timelineStartLsn is known, so
|
||||
* it should never be zero at this point, if we know timelineStartLsn.
|
||||
*
|
||||
* timelineStartLsn can be zero only on the first syncSafekeepers run.
|
||||
*/
|
||||
Assert((wp->truncateLsn != InvalidXLogRecPtr) ||
|
||||
(wp->config->syncSafekeepers && wp->truncateLsn == wp->timelineStartLsn));
|
||||
Assert(wp->truncateLsn != InvalidXLogRecPtr || wp->config->syncSafekeepers);
|
||||
|
||||
/*
|
||||
* We will be generating WAL since propEpochStartLsn, so we should set
|
||||
@@ -1052,10 +1044,11 @@ DetermineEpochStartLsn(WalProposer *wp)
|
||||
if (SkipXLogPageHeader(wp, wp->propEpochStartLsn) != wp->api.get_redo_start_lsn(wp))
|
||||
{
|
||||
/*
|
||||
* However, allow to proceed if last_log_term on the node which gave
|
||||
* the highest vote (i.e. point where we are going to start writing)
|
||||
* actually had been won by me; plain restart of walproposer not
|
||||
* intervened by concurrent compute which wrote WAL is ok.
|
||||
* However, allow to proceed if last_log_term on the node which
|
||||
* gave the highest vote (i.e. point where we are going to start
|
||||
* writing) actually had been won by me; plain restart of
|
||||
* walproposer not intervened by concurrent compute which wrote
|
||||
* WAL is ok.
|
||||
*
|
||||
* This avoids compute crash after manual term_bump.
|
||||
*/
|
||||
@@ -1125,14 +1118,8 @@ SendProposerElected(Safekeeper *sk)
|
||||
{
|
||||
/* safekeeper is empty or no common point, start from the beginning */
|
||||
sk->startStreamingAt = wp->propTermHistory.entries[0].lsn;
|
||||
wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u",
|
||||
sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries);
|
||||
|
||||
/*
|
||||
* wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline
|
||||
* is created manually (test_s3_wal_replay)
|
||||
*/
|
||||
Assert(sk->startStreamingAt == wp->timelineStartLsn || wp->timelineStartLsn == InvalidXLogRecPtr);
|
||||
wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, termHistory.n_entries=%u",
|
||||
sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), wp->propTermHistory.n_entries);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -1157,29 +1144,19 @@ SendProposerElected(Safekeeper *sk)
|
||||
|
||||
Assert(sk->startStreamingAt <= wp->availableLsn);
|
||||
|
||||
msg.tag = 'e';
|
||||
msg.apm.tag = 'e';
|
||||
msg.generation = wp->mconf.generation;
|
||||
msg.term = wp->propTerm;
|
||||
msg.startStreamingAt = sk->startStreamingAt;
|
||||
msg.termHistory = &wp->propTermHistory;
|
||||
msg.timelineStartLsn = wp->timelineStartLsn;
|
||||
|
||||
lastCommonTerm = idx >= 0 ? wp->propTermHistory.entries[idx].term : 0;
|
||||
wp_log(LOG,
|
||||
"sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X",
|
||||
sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn));
|
||||
|
||||
resetStringInfo(&sk->outbuf);
|
||||
pq_sendint64_le(&sk->outbuf, msg.tag);
|
||||
pq_sendint64_le(&sk->outbuf, msg.term);
|
||||
pq_sendint64_le(&sk->outbuf, msg.startStreamingAt);
|
||||
pq_sendint32_le(&sk->outbuf, msg.termHistory->n_entries);
|
||||
for (int i = 0; i < msg.termHistory->n_entries; i++)
|
||||
{
|
||||
pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].term);
|
||||
pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].lsn);
|
||||
}
|
||||
pq_sendint64_le(&sk->outbuf, msg.timelineStartLsn);
|
||||
"sending elected msg to node " UINT64_FORMAT " generation=%u term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s",
|
||||
sk->greetResponse.nodeId, msg.generation, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt),
|
||||
lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port);
|
||||
|
||||
PAMessageSerialize(wp, (ProposerAcceptorMessage *) &msg, &sk->outbuf, wp->config->proto_version);
|
||||
if (!AsyncWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_SEND_ELECTED_FLUSH))
|
||||
return;
|
||||
|
||||
@@ -1245,14 +1222,13 @@ static void
|
||||
PrepareAppendRequest(WalProposer *wp, AppendRequestHeader *req, XLogRecPtr beginLsn, XLogRecPtr endLsn)
|
||||
{
|
||||
Assert(endLsn >= beginLsn);
|
||||
req->tag = 'a';
|
||||
req->apm.tag = 'a';
|
||||
req->generation = wp->mconf.generation;
|
||||
req->term = wp->propTerm;
|
||||
req->epochStartLsn = wp->propEpochStartLsn;
|
||||
req->beginLsn = beginLsn;
|
||||
req->endLsn = endLsn;
|
||||
req->commitLsn = wp->commitLsn;
|
||||
req->truncateLsn = wp->truncateLsn;
|
||||
req->proposerId = wp->greetRequest.proposerId;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1353,7 +1329,8 @@ SendAppendRequests(Safekeeper *sk)
|
||||
resetStringInfo(&sk->outbuf);
|
||||
|
||||
/* write AppendRequest header */
|
||||
appendBinaryStringInfo(&sk->outbuf, (char *) req, sizeof(AppendRequestHeader));
|
||||
PAMessageSerialize(wp, (ProposerAcceptorMessage *) req, &sk->outbuf, wp->config->proto_version);
|
||||
/* prepare for reading WAL into the outbuf */
|
||||
enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn);
|
||||
sk->active_state = SS_ACTIVE_READ_WAL;
|
||||
}
|
||||
@@ -1366,14 +1343,17 @@ SendAppendRequests(Safekeeper *sk)
|
||||
req = &sk->appendRequest;
|
||||
req_len = req->endLsn - req->beginLsn;
|
||||
|
||||
/* We send zero sized AppenRequests as heartbeats; don't wal_read for these. */
|
||||
/*
|
||||
* We send zero-sized AppendRequests as heartbeats; don't wal_read
|
||||
* for these.
|
||||
*/
|
||||
if (req_len > 0)
|
||||
{
|
||||
switch (wp->api.wal_read(sk,
|
||||
&sk->outbuf.data[sk->outbuf.len],
|
||||
req->beginLsn,
|
||||
req_len,
|
||||
&errmsg))
|
||||
&sk->outbuf.data[sk->outbuf.len],
|
||||
req->beginLsn,
|
||||
req_len,
|
||||
&errmsg))
|
||||
{
|
||||
case NEON_WALREAD_SUCCESS:
|
||||
break;
|
||||
@@ -1381,7 +1361,7 @@ SendAppendRequests(Safekeeper *sk)
|
||||
return true;
|
||||
case NEON_WALREAD_ERROR:
|
||||
wp_log(WARNING, "WAL reading for node %s:%s failed: %s",
|
||||
sk->host, sk->port, errmsg);
|
||||
sk->host, sk->port, errmsg);
|
||||
ShutdownConnection(sk);
|
||||
return false;
|
||||
default:
|
||||
@@ -1469,11 +1449,11 @@ RecvAppendResponses(Safekeeper *sk)
|
||||
* Term has changed to higher one, probably another compute is
|
||||
* running. If this is the case we could PANIC as well because
|
||||
* likely it inserted some data and our basebackup is unsuitable
|
||||
* anymore. However, we also bump term manually (term_bump endpoint)
|
||||
* on safekeepers for migration purposes, in this case we do want
|
||||
* compute to stay alive. So restart walproposer with FATAL instead
|
||||
* of panicking; if basebackup is spoiled next election will notice
|
||||
* this.
|
||||
* anymore. However, we also bump term manually (term_bump
|
||||
* endpoint) on safekeepers for migration purposes, in this case
|
||||
* we do want compute to stay alive. So restart walproposer with
|
||||
* FATAL instead of panicking; if basebackup is spoiled next
|
||||
* election will notice this.
|
||||
*/
|
||||
wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us",
|
||||
sk->host, sk->port,
|
||||
@@ -1749,6 +1729,208 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk)
|
||||
}
|
||||
}
|
||||
|
||||
/* Serialize MembershipConfiguration into buf. */
|
||||
static void
|
||||
MembershipConfigurationSerialize(MembershipConfiguration *mconf, StringInfo buf)
|
||||
{
|
||||
uint32 i;
|
||||
|
||||
pq_sendint32(buf, mconf->generation);
|
||||
|
||||
pq_sendint32(buf, mconf->members.len);
|
||||
for (i = 0; i < mconf->members.len; i++)
|
||||
{
|
||||
pq_sendint64(buf, mconf->members.m[i].node_id);
|
||||
pq_send_ascii_string(buf, mconf->members.m[i].host);
|
||||
pq_sendint16(buf, mconf->members.m[i].port);
|
||||
}
|
||||
|
||||
/*
|
||||
* There is no special mark for absent new_members; zero members in
|
||||
* invalid, so zero len means absent.
|
||||
*/
|
||||
pq_sendint32(buf, mconf->new_members.len);
|
||||
for (i = 0; i < mconf->new_members.len; i++)
|
||||
{
|
||||
pq_sendint64(buf, mconf->new_members.m[i].node_id);
|
||||
pq_send_ascii_string(buf, mconf->new_members.m[i].host);
|
||||
pq_sendint16(buf, mconf->new_members.m[i].port);
|
||||
}
|
||||
}
|
||||
|
||||
/* Serialize proposer -> acceptor message into buf using specified version */
|
||||
static void
|
||||
PAMessageSerialize(WalProposer *wp, ProposerAcceptorMessage *msg, StringInfo buf, int proto_version)
|
||||
{
|
||||
/* both version are supported currently until we fully migrate to 3 */
|
||||
Assert(proto_version == 3 || proto_version == 2);
|
||||
|
||||
resetStringInfo(buf);
|
||||
|
||||
if (proto_version == 3)
|
||||
{
|
||||
/*
|
||||
* v2 sends structs for some messages as is, so commonly send tag only
|
||||
* for v3
|
||||
*/
|
||||
pq_sendint8(buf, msg->tag);
|
||||
|
||||
switch (msg->tag)
|
||||
{
|
||||
case 'g':
|
||||
{
|
||||
ProposerGreeting *m = (ProposerGreeting *) msg;
|
||||
|
||||
pq_send_ascii_string(buf, m->tenant_id);
|
||||
pq_send_ascii_string(buf, m->timeline_id);
|
||||
MembershipConfigurationSerialize(&m->mconf, buf);
|
||||
pq_sendint32(buf, m->pg_version);
|
||||
pq_sendint64(buf, m->system_id);
|
||||
pq_sendint32(buf, m->wal_seg_size);
|
||||
break;
|
||||
}
|
||||
case 'v':
|
||||
{
|
||||
VoteRequest *m = (VoteRequest *) msg;
|
||||
|
||||
pq_sendint32(buf, m->generation);
|
||||
pq_sendint64(buf, m->term);
|
||||
break;
|
||||
|
||||
}
|
||||
case 'e':
|
||||
{
|
||||
ProposerElected *m = (ProposerElected *) msg;
|
||||
|
||||
pq_sendint32(buf, m->generation);
|
||||
pq_sendint64(buf, m->term);
|
||||
pq_sendint64(buf, m->startStreamingAt);
|
||||
pq_sendint32(buf, m->termHistory->n_entries);
|
||||
for (uint32 i = 0; i < m->termHistory->n_entries; i++)
|
||||
{
|
||||
pq_sendint64(buf, m->termHistory->entries[i].term);
|
||||
pq_sendint64(buf, m->termHistory->entries[i].lsn);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 'a':
|
||||
{
|
||||
/*
|
||||
* Note: this serializes only AppendRequestHeader, caller
|
||||
* is expected to append WAL data later.
|
||||
*/
|
||||
AppendRequestHeader *m = (AppendRequestHeader *) msg;
|
||||
|
||||
pq_sendint32(buf, m->generation);
|
||||
pq_sendint64(buf, m->term);
|
||||
pq_sendint64(buf, m->beginLsn);
|
||||
pq_sendint64(buf, m->endLsn);
|
||||
pq_sendint64(buf, m->commitLsn);
|
||||
pq_sendint64(buf, m->truncateLsn);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
wp_log(FATAL, "unexpected message type %c to serialize", msg->tag);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (proto_version == 2)
|
||||
{
|
||||
switch (msg->tag)
|
||||
{
|
||||
case 'g':
|
||||
{
|
||||
/* v2 sent struct as is */
|
||||
ProposerGreeting *m = (ProposerGreeting *) msg;
|
||||
ProposerGreetingV2 greetRequestV2;
|
||||
|
||||
/* Fill also v2 struct. */
|
||||
greetRequestV2.tag = 'g';
|
||||
greetRequestV2.protocolVersion = proto_version;
|
||||
greetRequestV2.pgVersion = m->pg_version;
|
||||
|
||||
/*
|
||||
* v3 removed this field because it's easier to pass as
|
||||
* libq or START_WAL_PUSH options
|
||||
*/
|
||||
memset(&greetRequestV2.proposerId, 0, sizeof(greetRequestV2.proposerId));
|
||||
greetRequestV2.systemId = wp->config->systemId;
|
||||
if (*m->timeline_id != '\0' &&
|
||||
!HexDecodeString(greetRequestV2.timeline_id, m->timeline_id, 16))
|
||||
wp_log(FATAL, "could not parse neon.timeline_id, %s", m->timeline_id);
|
||||
if (*m->tenant_id != '\0' &&
|
||||
!HexDecodeString(greetRequestV2.tenant_id, m->tenant_id, 16))
|
||||
wp_log(FATAL, "could not parse neon.tenant_id, %s", m->tenant_id);
|
||||
|
||||
greetRequestV2.timeline = wp->config->pgTimeline;
|
||||
greetRequestV2.walSegSize = wp->config->wal_segment_size;
|
||||
|
||||
pq_sendbytes(buf, (char *) &greetRequestV2, sizeof(greetRequestV2));
|
||||
break;
|
||||
}
|
||||
case 'v':
|
||||
{
|
||||
/* v2 sent struct as is */
|
||||
VoteRequest *m = (VoteRequest *) msg;
|
||||
VoteRequestV2 voteRequestV2;
|
||||
|
||||
voteRequestV2.tag = m->pam.tag;
|
||||
voteRequestV2.term = m->term;
|
||||
/* removed field */
|
||||
memset(&voteRequestV2.proposerId, 0, sizeof(voteRequestV2.proposerId));
|
||||
pq_sendbytes(buf, (char *) &voteRequestV2, sizeof(voteRequestV2));
|
||||
break;
|
||||
}
|
||||
case 'e':
|
||||
{
|
||||
ProposerElected *m = (ProposerElected *) msg;
|
||||
|
||||
pq_sendint64_le(buf, m->apm.tag);
|
||||
pq_sendint64_le(buf, m->term);
|
||||
pq_sendint64_le(buf, m->startStreamingAt);
|
||||
pq_sendint32_le(buf, m->termHistory->n_entries);
|
||||
for (int i = 0; i < m->termHistory->n_entries; i++)
|
||||
{
|
||||
pq_sendint64_le(buf, m->termHistory->entries[i].term);
|
||||
pq_sendint64_le(buf, m->termHistory->entries[i].lsn);
|
||||
}
|
||||
pq_sendint64_le(buf, 0); /* removed timeline_start_lsn */
|
||||
break;
|
||||
}
|
||||
case 'a':
|
||||
|
||||
/*
|
||||
* Note: this serializes only AppendRequestHeader, caller is
|
||||
* expected to append WAL data later.
|
||||
*/
|
||||
{
|
||||
/* v2 sent struct as is */
|
||||
AppendRequestHeader *m = (AppendRequestHeader *) msg;
|
||||
AppendRequestHeaderV2 appendRequestHeaderV2;
|
||||
|
||||
appendRequestHeaderV2.tag = m->apm.tag;
|
||||
appendRequestHeaderV2.term = m->term;
|
||||
appendRequestHeaderV2.epochStartLsn = 0; /* removed field */
|
||||
appendRequestHeaderV2.beginLsn = m->beginLsn;
|
||||
appendRequestHeaderV2.endLsn = m->endLsn;
|
||||
appendRequestHeaderV2.commitLsn = m->commitLsn;
|
||||
appendRequestHeaderV2.truncateLsn = m->truncateLsn;
|
||||
/* removed field */
|
||||
memset(&appendRequestHeaderV2.proposerId, 0, sizeof(appendRequestHeaderV2.proposerId));
|
||||
|
||||
pq_sendbytes(buf, (char *) &appendRequestHeaderV2, sizeof(appendRequestHeaderV2));
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
wp_log(FATAL, "unexpected message type %c to serialize", msg->tag);
|
||||
}
|
||||
return;
|
||||
}
|
||||
wp_log(FATAL, "unexpected proto_version %d", proto_version);
|
||||
}
|
||||
|
||||
/*
|
||||
* Try to read CopyData message from i'th safekeeper, resetting connection on
|
||||
* failure.
|
||||
@@ -1778,6 +1960,37 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size)
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Deserialize membership configuration from buf to mconf. */
|
||||
static void
|
||||
MembershipConfigurationDeserialize(MembershipConfiguration *mconf, StringInfo buf)
|
||||
{
|
||||
uint32 i;
|
||||
|
||||
mconf->generation = pq_getmsgint32(buf);
|
||||
mconf->members.len = pq_getmsgint32(buf);
|
||||
mconf->members.m = palloc0(sizeof(SafekeeperId) * mconf->members.len);
|
||||
for (i = 0; i < mconf->members.len; i++)
|
||||
{
|
||||
const char *buf_host;
|
||||
|
||||
mconf->members.m[i].node_id = pq_getmsgint64(buf);
|
||||
buf_host = pq_getmsgrawstring(buf);
|
||||
strlcpy(mconf->members.m[i].host, buf_host, sizeof(mconf->members.m[i].host));
|
||||
mconf->members.m[i].port = pq_getmsgint16(buf);
|
||||
}
|
||||
mconf->new_members.len = pq_getmsgint32(buf);
|
||||
mconf->new_members.m = palloc0(sizeof(SafekeeperId) * mconf->new_members.len);
|
||||
for (i = 0; i < mconf->new_members.len; i++)
|
||||
{
|
||||
const char *buf_host;
|
||||
|
||||
mconf->new_members.m[i].node_id = pq_getmsgint64(buf);
|
||||
buf_host = pq_getmsgrawstring(buf);
|
||||
strlcpy(mconf->new_members.m[i].host, buf_host, sizeof(mconf->new_members.m[i].host));
|
||||
mconf->new_members.m[i].port = pq_getmsgint16(buf);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Read next message with known type into provided struct, by reading a CopyData
|
||||
* block from the safekeeper's postgres connection, returning whether the read
|
||||
@@ -1786,6 +1999,8 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size)
|
||||
* If the read needs more polling, we return 'false' and keep the state
|
||||
* unmodified, waiting until it becomes read-ready to try again. If it fully
|
||||
* failed, a warning is emitted and the connection is reset.
|
||||
*
|
||||
* Note: it pallocs if needed, i.e. for AcceptorGreeting and VoteResponse fields.
|
||||
*/
|
||||
static bool
|
||||
AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
|
||||
@@ -1794,82 +2009,153 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
|
||||
|
||||
char *buf;
|
||||
int buf_size;
|
||||
uint64 tag;
|
||||
uint8 tag;
|
||||
StringInfoData s;
|
||||
|
||||
if (!(AsyncRead(sk, &buf, &buf_size)))
|
||||
return false;
|
||||
sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);
|
||||
|
||||
/* parse it */
|
||||
s.data = buf;
|
||||
s.len = buf_size;
|
||||
s.maxlen = buf_size;
|
||||
s.cursor = 0;
|
||||
|
||||
tag = pq_getmsgint64_le(&s);
|
||||
if (tag != anymsg->tag)
|
||||
if (wp->config->proto_version == 3)
|
||||
{
|
||||
wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
|
||||
sk->port, FormatSafekeeperState(sk));
|
||||
ResetConnection(sk);
|
||||
return false;
|
||||
}
|
||||
sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);
|
||||
switch (tag)
|
||||
{
|
||||
case 'g':
|
||||
{
|
||||
AcceptorGreeting *msg = (AcceptorGreeting *) anymsg;
|
||||
|
||||
msg->term = pq_getmsgint64_le(&s);
|
||||
msg->nodeId = pq_getmsgint64_le(&s);
|
||||
pq_getmsgend(&s);
|
||||
return true;
|
||||
}
|
||||
|
||||
case 'v':
|
||||
{
|
||||
VoteResponse *msg = (VoteResponse *) anymsg;
|
||||
|
||||
msg->term = pq_getmsgint64_le(&s);
|
||||
msg->voteGiven = pq_getmsgint64_le(&s);
|
||||
msg->flushLsn = pq_getmsgint64_le(&s);
|
||||
msg->truncateLsn = pq_getmsgint64_le(&s);
|
||||
msg->termHistory.n_entries = pq_getmsgint32_le(&s);
|
||||
msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries);
|
||||
for (int i = 0; i < msg->termHistory.n_entries; i++)
|
||||
tag = pq_getmsgbyte(&s);
|
||||
if (tag != anymsg->tag)
|
||||
{
|
||||
wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
|
||||
sk->port, FormatSafekeeperState(sk));
|
||||
ResetConnection(sk);
|
||||
return false;
|
||||
}
|
||||
switch (tag)
|
||||
{
|
||||
case 'g':
|
||||
{
|
||||
msg->termHistory.entries[i].term = pq_getmsgint64_le(&s);
|
||||
msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s);
|
||||
AcceptorGreeting *msg = (AcceptorGreeting *) anymsg;
|
||||
|
||||
msg->nodeId = pq_getmsgint64(&s);
|
||||
MembershipConfigurationDeserialize(&msg->mconf, &s);
|
||||
msg->term = pq_getmsgint64(&s);
|
||||
pq_getmsgend(&s);
|
||||
return true;
|
||||
}
|
||||
msg->timelineStartLsn = pq_getmsgint64_le(&s);
|
||||
pq_getmsgend(&s);
|
||||
return true;
|
||||
}
|
||||
case 'v':
|
||||
{
|
||||
VoteResponse *msg = (VoteResponse *) anymsg;
|
||||
|
||||
case 'a':
|
||||
{
|
||||
AppendResponse *msg = (AppendResponse *) anymsg;
|
||||
msg->generation = pq_getmsgint32(&s);
|
||||
msg->term = pq_getmsgint64(&s);
|
||||
msg->voteGiven = pq_getmsgbyte(&s);
|
||||
msg->flushLsn = pq_getmsgint64(&s);
|
||||
msg->truncateLsn = pq_getmsgint64(&s);
|
||||
msg->termHistory.n_entries = pq_getmsgint32(&s);
|
||||
msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries);
|
||||
for (uint32 i = 0; i < msg->termHistory.n_entries; i++)
|
||||
{
|
||||
msg->termHistory.entries[i].term = pq_getmsgint64(&s);
|
||||
msg->termHistory.entries[i].lsn = pq_getmsgint64(&s);
|
||||
}
|
||||
pq_getmsgend(&s);
|
||||
return true;
|
||||
}
|
||||
case 'a':
|
||||
{
|
||||
AppendResponse *msg = (AppendResponse *) anymsg;
|
||||
|
||||
msg->term = pq_getmsgint64_le(&s);
|
||||
msg->flushLsn = pq_getmsgint64_le(&s);
|
||||
msg->commitLsn = pq_getmsgint64_le(&s);
|
||||
msg->hs.ts = pq_getmsgint64_le(&s);
|
||||
msg->hs.xmin.value = pq_getmsgint64_le(&s);
|
||||
msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s);
|
||||
if (s.len > s.cursor)
|
||||
ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback);
|
||||
else
|
||||
msg->ps_feedback.present = false;
|
||||
pq_getmsgend(&s);
|
||||
return true;
|
||||
}
|
||||
|
||||
default:
|
||||
{
|
||||
Assert(false);
|
||||
return false;
|
||||
}
|
||||
msg->generation = pq_getmsgint32(&s);
|
||||
msg->term = pq_getmsgint64(&s);
|
||||
msg->flushLsn = pq_getmsgint64(&s);
|
||||
msg->commitLsn = pq_getmsgint64(&s);
|
||||
msg->hs.ts = pq_getmsgint64(&s);
|
||||
msg->hs.xmin.value = pq_getmsgint64(&s);
|
||||
msg->hs.catalog_xmin.value = pq_getmsgint64(&s);
|
||||
if (s.len > s.cursor)
|
||||
ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback);
|
||||
else
|
||||
msg->ps_feedback.present = false;
|
||||
pq_getmsgend(&s);
|
||||
return true;
|
||||
}
|
||||
default:
|
||||
{
|
||||
wp_log(FATAL, "unexpected message tag %c to read", (char) tag);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (wp->config->proto_version == 2)
|
||||
{
|
||||
tag = pq_getmsgint64_le(&s);
|
||||
if (tag != anymsg->tag)
|
||||
{
|
||||
wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
|
||||
sk->port, FormatSafekeeperState(sk));
|
||||
ResetConnection(sk);
|
||||
return false;
|
||||
}
|
||||
switch (tag)
|
||||
{
|
||||
case 'g':
|
||||
{
|
||||
AcceptorGreeting *msg = (AcceptorGreeting *) anymsg;
|
||||
|
||||
msg->term = pq_getmsgint64_le(&s);
|
||||
msg->nodeId = pq_getmsgint64_le(&s);
|
||||
pq_getmsgend(&s);
|
||||
return true;
|
||||
}
|
||||
|
||||
case 'v':
|
||||
{
|
||||
VoteResponse *msg = (VoteResponse *) anymsg;
|
||||
|
||||
msg->term = pq_getmsgint64_le(&s);
|
||||
msg->voteGiven = pq_getmsgint64_le(&s);
|
||||
msg->flushLsn = pq_getmsgint64_le(&s);
|
||||
msg->truncateLsn = pq_getmsgint64_le(&s);
|
||||
msg->termHistory.n_entries = pq_getmsgint32_le(&s);
|
||||
msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries);
|
||||
for (int i = 0; i < msg->termHistory.n_entries; i++)
|
||||
{
|
||||
msg->termHistory.entries[i].term = pq_getmsgint64_le(&s);
|
||||
msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s);
|
||||
}
|
||||
pq_getmsgint64_le(&s); /* timelineStartLsn */
|
||||
pq_getmsgend(&s);
|
||||
return true;
|
||||
}
|
||||
|
||||
case 'a':
|
||||
{
|
||||
AppendResponse *msg = (AppendResponse *) anymsg;
|
||||
|
||||
msg->term = pq_getmsgint64_le(&s);
|
||||
msg->flushLsn = pq_getmsgint64_le(&s);
|
||||
msg->commitLsn = pq_getmsgint64_le(&s);
|
||||
msg->hs.ts = pq_getmsgint64_le(&s);
|
||||
msg->hs.xmin.value = pq_getmsgint64_le(&s);
|
||||
msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s);
|
||||
if (s.len > s.cursor)
|
||||
ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback);
|
||||
else
|
||||
msg->ps_feedback.present = false;
|
||||
pq_getmsgend(&s);
|
||||
return true;
|
||||
}
|
||||
|
||||
default:
|
||||
{
|
||||
wp_log(FATAL, "unexpected message tag %c to read", (char) tag);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
wp_log(FATAL, "unsupported proto_version %d", wp->config->proto_version);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -2245,3 +2531,45 @@ FormatEvents(WalProposer *wp, uint32 events)
|
||||
|
||||
return (char *) &return_str;
|
||||
}
|
||||
|
||||
/* Dump mconf as toml for observability / debugging. Result is palloc'ed. */
|
||||
static char *
|
||||
MembershipConfigurationToString(MembershipConfiguration *mconf)
|
||||
{
|
||||
StringInfoData s;
|
||||
uint32 i;
|
||||
|
||||
initStringInfo(&s);
|
||||
appendStringInfo(&s, "{gen = %u", mconf->generation);
|
||||
appendStringInfoString(&s, ", members = [");
|
||||
for (i = 0; i < mconf->members.len; i++)
|
||||
{
|
||||
if (i > 0)
|
||||
appendStringInfoString(&s, ", ");
|
||||
appendStringInfo(&s, "{node_id = %lu", mconf->members.m[i].node_id);
|
||||
appendStringInfo(&s, ", host = %s", mconf->members.m[i].host);
|
||||
appendStringInfo(&s, ", port = %u }", mconf->members.m[i].port);
|
||||
}
|
||||
appendStringInfo(&s, "], new_members = [");
|
||||
for (i = 0; i < mconf->new_members.len; i++)
|
||||
{
|
||||
if (i > 0)
|
||||
appendStringInfoString(&s, ", ");
|
||||
appendStringInfo(&s, "{node_id = %lu", mconf->new_members.m[i].node_id);
|
||||
appendStringInfo(&s, ", host = %s", mconf->new_members.m[i].host);
|
||||
appendStringInfo(&s, ", port = %u }", mconf->new_members.m[i].port);
|
||||
}
|
||||
appendStringInfoString(&s, "]}");
|
||||
return s.data;
|
||||
}
|
||||
|
||||
static void
|
||||
MembershipConfigurationFree(MembershipConfiguration *mconf)
|
||||
{
|
||||
if (mconf->members.m)
|
||||
pfree(mconf->members.m);
|
||||
mconf->members.m = NULL;
|
||||
if (mconf->new_members.m)
|
||||
pfree(mconf->new_members.m);
|
||||
mconf->new_members.m = NULL;
|
||||
}
|
||||
|
||||
@@ -12,9 +12,6 @@
|
||||
#include "neon_walreader.h"
|
||||
#include "pagestore_client.h"
|
||||
|
||||
#define SK_MAGIC 0xCafeCeefu
|
||||
#define SK_PROTOCOL_VERSION 2
|
||||
|
||||
#define MAX_SAFEKEEPERS 32
|
||||
#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single* WAL
|
||||
* message */
|
||||
@@ -143,12 +140,71 @@ typedef uint64 term_t;
|
||||
/* neon storage node id */
|
||||
typedef uint64 NNodeId;
|
||||
|
||||
/*
|
||||
* Number uniquely identifying safekeeper membership configuration.
|
||||
* This and following structs pair ones in membership.rs.
|
||||
*/
|
||||
typedef uint32 Generation;
|
||||
|
||||
/* Identity of one safekeeper inside a member set: node id plus address. */
typedef struct SafekeeperId
|
||||
{
|
||||
/* neon storage node id */
NNodeId node_id;
|
||||
/* host as a NUL-terminated string in a fixed-size buffer */
char host[MAXCONNINFO];
|
||||
/* port number */
uint16 port;
|
||||
} SafekeeperId;
|
||||
|
||||
/*
 * Set of safekeepers: an array of len SafekeeperId entries. A zero len is
 * used to mean the set is absent.
 */
|
||||
typedef struct MemberSet
|
||||
{
|
||||
uint32 len; /* number of members */
|
||||
SafekeeperId *m; /* ids themselves */
|
||||
} MemberSet;
|
||||
|
||||
/*
 * Timeline safekeeper membership configuration: a generation number plus the
 * current member set and, optionally, a second set (new_members).
 */
|
||||
typedef struct MembershipConfiguration
|
||||
{
|
||||
/* number uniquely identifying this configuration */
Generation generation;
|
||||
/* current set of safekeepers */
MemberSet members;
|
||||
/* Has zero len in a non-joint conf, i.e. when new_members is absent. */
|
||||
MemberSet new_members;
|
||||
} MembershipConfiguration;
|
||||
|
||||
/*
|
||||
* Proposer <-> Acceptor messaging.
|
||||
*/
|
||||
|
||||
/*
 * Common header of proposer -> acceptor messages. Concrete message structs
 * embed it as their first member, so a pointer to any of them can be cast to
 * ProposerAcceptorMessage * to read the tag.
 */
typedef struct ProposerAcceptorMessage
|
||||
{
|
||||
/* message type: 'g' greeting, 'v' vote request, 'e' elected, 'a' append */
uint8 tag;
|
||||
} ProposerAcceptorMessage;
|
||||
|
||||
/* Initial Proposer -> Acceptor message (tag 'g'). */
|
||||
typedef struct ProposerGreeting
|
||||
{
|
||||
ProposerAcceptorMessage pam; /* message tag */
|
||||
|
||||
/*
|
||||
* tenant/timeline ids as C strings with standard hex notation for ease of
|
||||
* printing. In principle they are not strictly needed as ttid is also
|
||||
* passed as libpq options.
|
||||
*/
|
||||
char *tenant_id;
|
||||
char *timeline_id;
|
||||
/* Full conf is carried to allow safekeeper switch */
|
||||
MembershipConfiguration mconf;
|
||||
|
||||
/*
|
||||
* pg_version and wal_seg_size are used for timeline creation until we
|
||||
* fully migrate to doing externally. systemId is only used as a sanity
|
||||
* cross check.
|
||||
*/
|
||||
uint32 pg_version; /* in PG_VERSION_NUM format */
|
||||
uint64 system_id; /* Postgres system identifier. */
|
||||
/* WAL segment size (presumably in bytes -- TODO confirm) */
uint32 wal_seg_size;
|
||||
} ProposerGreeting;
|
||||
|
||||
/* protocol v2 variant, kept while wp supports it */
|
||||
typedef struct ProposerGreetingV2
|
||||
{
|
||||
uint64 tag; /* message tag */
|
||||
uint32 protocolVersion; /* proposer-safekeeper protocol version */
|
||||
@@ -159,32 +215,42 @@ typedef struct ProposerGreeting
|
||||
uint8 tenant_id[16];
|
||||
TimeLineID timeline;
|
||||
uint32 walSegSize;
|
||||
} ProposerGreeting;
|
||||
} ProposerGreetingV2;
|
||||
|
||||
typedef struct AcceptorProposerMessage
|
||||
{
|
||||
uint64 tag;
|
||||
uint8 tag;
|
||||
} AcceptorProposerMessage;
|
||||
|
||||
/*
|
||||
* Acceptor -> Proposer initial response: the highest term acceptor voted for.
|
||||
* Acceptor -> Proposer initial response: the highest term acceptor voted for,
|
||||
* its node id and configuration.
|
||||
*/
|
||||
typedef struct AcceptorGreeting
|
||||
{
|
||||
AcceptorProposerMessage apm;
|
||||
term_t term;
|
||||
NNodeId nodeId;
|
||||
MembershipConfiguration mconf;
|
||||
term_t term;
|
||||
} AcceptorGreeting;
|
||||
|
||||
/*
|
||||
* Proposer -> Acceptor vote request.
|
||||
*/
|
||||
typedef struct VoteRequest
|
||||
{
|
||||
ProposerAcceptorMessage pam; /* message tag */
|
||||
Generation generation; /* membership conf generation */
|
||||
term_t term;
|
||||
} VoteRequest;
|
||||
|
||||
/* protocol v2 variant, kept while wp supports it */
|
||||
typedef struct VoteRequestV2
|
||||
{
|
||||
uint64 tag;
|
||||
term_t term;
|
||||
pg_uuid_t proposerId; /* for monitoring/debugging */
|
||||
} VoteRequest;
|
||||
} VoteRequestV2;
|
||||
|
||||
/* Element of term switching chain. */
|
||||
typedef struct TermSwitchEntry
|
||||
@@ -203,8 +269,15 @@ typedef struct TermHistory
|
||||
typedef struct VoteResponse
|
||||
{
|
||||
AcceptorProposerMessage apm;
|
||||
|
||||
/*
|
||||
* Membership conf generation. It's not strictly required because on
|
||||
* mismatch safekeeper is expected to ERROR the connection, but let's
|
||||
* sanity check it.
|
||||
*/
|
||||
Generation generation;
|
||||
term_t term;
|
||||
uint64 voteGiven;
|
||||
uint8 voteGiven;
|
||||
|
||||
/*
|
||||
* Safekeeper flush_lsn (end of WAL) + history of term switches allow
|
||||
@@ -214,7 +287,6 @@ typedef struct VoteResponse
|
||||
XLogRecPtr truncateLsn; /* minimal LSN which may be needed for*
|
||||
* recovery of some safekeeper */
|
||||
TermHistory termHistory;
|
||||
XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */
|
||||
} VoteResponse;
|
||||
|
||||
/*
|
||||
@@ -223,20 +295,37 @@ typedef struct VoteResponse
|
||||
*/
|
||||
typedef struct ProposerElected
|
||||
{
|
||||
uint64 tag;
|
||||
AcceptorProposerMessage apm;
|
||||
Generation generation; /* membership conf generation */
|
||||
term_t term;
|
||||
/* proposer will send since this point */
|
||||
XLogRecPtr startStreamingAt;
|
||||
/* history of term switches up to this proposer */
|
||||
TermHistory *termHistory;
|
||||
/* timeline globally starts at this LSN */
|
||||
XLogRecPtr timelineStartLsn;
|
||||
} ProposerElected;
|
||||
|
||||
/*
|
||||
* Header of request with WAL message sent from proposer to safekeeper.
|
||||
*/
|
||||
typedef struct AppendRequestHeader
|
||||
{
|
||||
AcceptorProposerMessage apm;
|
||||
Generation generation; /* membership conf generation */
|
||||
term_t term; /* term of the proposer */
|
||||
XLogRecPtr beginLsn; /* start position of message in WAL */
|
||||
XLogRecPtr endLsn; /* end position of message in WAL */
|
||||
XLogRecPtr commitLsn; /* LSN committed by quorum of safekeepers */
|
||||
|
||||
/*
|
||||
* minimal LSN which may be needed for recovery of some safekeeper (end
|
||||
* lsn + 1 of last chunk streamed to everyone)
|
||||
*/
|
||||
XLogRecPtr truncateLsn;
|
||||
/* in the AppendRequest message, WAL data follows */
|
||||
} AppendRequestHeader;
|
||||
|
||||
/* protocol v2 variant, kept while wp supports it */
|
||||
typedef struct AppendRequestHeaderV2
|
||||
{
|
||||
uint64 tag;
|
||||
term_t term; /* term of the proposer */
|
||||
@@ -256,7 +345,8 @@ typedef struct AppendRequestHeader
|
||||
*/
|
||||
XLogRecPtr truncateLsn;
|
||||
pg_uuid_t proposerId; /* for monitoring/debugging */
|
||||
} AppendRequestHeader;
|
||||
/* in the AppendRequest message, WAL data follows */
|
||||
} AppendRequestHeaderV2;
|
||||
|
||||
/*
|
||||
* Hot standby feedback received from replica
|
||||
@@ -309,6 +399,13 @@ typedef struct AppendResponse
|
||||
{
|
||||
AcceptorProposerMessage apm;
|
||||
|
||||
/*
|
||||
* Membership conf generation. It's not strictly required because on
|
||||
* mismatch safekeeper is expected to ERROR the connection, but let's
|
||||
* sanity check it.
|
||||
*/
|
||||
Generation generation;
|
||||
|
||||
/*
|
||||
* Current term of the safekeeper; if it is higher than proposer's, the
|
||||
* compute is out of date.
|
||||
@@ -644,6 +741,8 @@ typedef struct WalProposerConfig
|
||||
/* Will be passed to safekeepers in greet request. */
|
||||
TimeLineID pgTimeline;
|
||||
|
||||
int proto_version;
|
||||
|
||||
#ifdef WALPROPOSER_LIB
|
||||
void *callback_data;
|
||||
#endif
|
||||
@@ -656,11 +755,14 @@ typedef struct WalProposerConfig
|
||||
typedef struct WalProposer
|
||||
{
|
||||
WalProposerConfig *config;
|
||||
int n_safekeepers;
|
||||
/* Current walproposer membership configuration */
|
||||
MembershipConfiguration mconf;
|
||||
|
||||
/* (n_safekeepers / 2) + 1 */
|
||||
int quorum;
|
||||
|
||||
/* Number of occupied slots in safekeeper[] */
|
||||
int n_safekeepers;
|
||||
Safekeeper safekeeper[MAX_SAFEKEEPERS];
|
||||
|
||||
/* WAL has been generated up to this point */
|
||||
@@ -670,6 +772,7 @@ typedef struct WalProposer
|
||||
XLogRecPtr commitLsn;
|
||||
|
||||
ProposerGreeting greetRequest;
|
||||
ProposerGreetingV2 greetRequestV2;
|
||||
|
||||
/* Vote request for safekeeper */
|
||||
VoteRequest voteRequest;
|
||||
|
||||
@@ -155,6 +155,16 @@ pq_getmsgend(StringInfo msg)
|
||||
ExceptionalCondition("invalid msg format", __FILE__, __LINE__);
|
||||
}
|
||||
|
||||
/* --------------------------------
|
||||
* pq_sendbytes - append raw data to a StringInfo buffer
|
||||
* --------------------------------
|
||||
*/
|
||||
void
|
||||
pq_sendbytes(StringInfo buf, const void *data, int datalen)
|
||||
{
|
||||
/* use variant that maintains a trailing null-byte, out of caution */
|
||||
appendBinaryStringInfo(buf, data, datalen);
|
||||
}
|
||||
|
||||
/*
|
||||
* Produce a C-string representation of a TimestampTz.
|
||||
|
||||
@@ -59,9 +59,11 @@
|
||||
|
||||
#define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot"
|
||||
|
||||
/* GUCs */
|
||||
char *wal_acceptors_list = "";
|
||||
int wal_acceptor_reconnect_timeout = 1000;
|
||||
int wal_acceptor_connection_timeout = 10000;
|
||||
int safekeeper_proto_version = 2;
|
||||
|
||||
/* Set to true in the walproposer bgw. */
|
||||
static bool am_walproposer;
|
||||
@@ -126,6 +128,7 @@ init_walprop_config(bool syncSafekeepers)
|
||||
else
|
||||
walprop_config.systemId = 0;
|
||||
walprop_config.pgTimeline = walprop_pg_get_timeline_id();
|
||||
walprop_config.proto_version = safekeeper_proto_version;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -219,25 +222,37 @@ nwp_register_gucs(void)
|
||||
PGC_SIGHUP,
|
||||
GUC_UNIT_MS,
|
||||
NULL, NULL, NULL);
|
||||
|
||||
DefineCustomIntVariable(
|
||||
"neon.safekeeper_proto_version",
|
||||
"Version of compute <-> safekeeper protocol.",
|
||||
"Used while migrating from 2 to 3.",
|
||||
&safekeeper_proto_version,
|
||||
2, 0, INT_MAX,
|
||||
PGC_POSTMASTER,
|
||||
0,
|
||||
NULL, NULL, NULL);
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
split_safekeepers_list(char *safekeepers_list, char *safekeepers[])
|
||||
{
|
||||
int n_safekeepers = 0;
|
||||
char *curr_sk = safekeepers_list;
|
||||
int n_safekeepers = 0;
|
||||
char *curr_sk = safekeepers_list;
|
||||
|
||||
for (char *coma = safekeepers_list; coma != NULL && *coma != '\0'; curr_sk = coma)
|
||||
{
|
||||
if (++n_safekeepers >= MAX_SAFEKEEPERS) {
|
||||
if (++n_safekeepers >= MAX_SAFEKEEPERS)
|
||||
{
|
||||
wpg_log(FATAL, "too many safekeepers");
|
||||
}
|
||||
|
||||
coma = strchr(coma, ',');
|
||||
safekeepers[n_safekeepers-1] = curr_sk;
|
||||
safekeepers[n_safekeepers - 1] = curr_sk;
|
||||
|
||||
if (coma != NULL) {
|
||||
if (coma != NULL)
|
||||
{
|
||||
*coma++ = '\0';
|
||||
}
|
||||
}
|
||||
@@ -252,10 +267,10 @@ split_safekeepers_list(char *safekeepers_list, char *safekeepers[])
|
||||
static bool
|
||||
safekeepers_cmp(char *old, char *new)
|
||||
{
|
||||
char *safekeepers_old[MAX_SAFEKEEPERS];
|
||||
char *safekeepers_new[MAX_SAFEKEEPERS];
|
||||
int len_old = 0;
|
||||
int len_new = 0;
|
||||
char *safekeepers_old[MAX_SAFEKEEPERS];
|
||||
char *safekeepers_new[MAX_SAFEKEEPERS];
|
||||
int len_old = 0;
|
||||
int len_new = 0;
|
||||
|
||||
len_old = split_safekeepers_list(old, safekeepers_old);
|
||||
len_new = split_safekeepers_list(new, safekeepers_new);
|
||||
@@ -292,7 +307,8 @@ assign_neon_safekeepers(const char *newval, void *extra)
|
||||
if (!am_walproposer)
|
||||
return;
|
||||
|
||||
if (!newval) {
|
||||
if (!newval)
|
||||
{
|
||||
/* should never happen */
|
||||
wpg_log(FATAL, "neon.safekeepers is empty");
|
||||
}
|
||||
@@ -301,11 +317,11 @@ assign_neon_safekeepers(const char *newval, void *extra)
|
||||
newval_copy = pstrdup(newval);
|
||||
oldval = pstrdup(wal_acceptors_list);
|
||||
|
||||
/*
|
||||
/*
|
||||
* TODO: restarting through FATAL is stupid and introduces 1s delay before
|
||||
* next bgw start. We should refactor walproposer to allow graceful exit and
|
||||
* thus remove this delay.
|
||||
* XXX: If you change anything here, sync with test_safekeepers_reconfigure_reorder.
|
||||
* next bgw start. We should refactor walproposer to allow graceful exit
|
||||
* and thus remove this delay. XXX: If you change anything here, sync with
|
||||
* test_safekeepers_reconfigure_reorder.
|
||||
*/
|
||||
if (!safekeepers_cmp(oldval, newval_copy))
|
||||
{
|
||||
@@ -454,7 +470,8 @@ backpressure_throttling_impl(void)
|
||||
memcpy(new_status, old_status, len);
|
||||
snprintf(new_status + len, 64, "backpressure throttling: lag %lu", lag);
|
||||
set_ps_display(new_status);
|
||||
new_status[len] = '\0'; /* truncate off " backpressure ..." to later reset the ps */
|
||||
new_status[len] = '\0'; /* truncate off " backpressure ..." to later
|
||||
* reset the ps */
|
||||
|
||||
elog(DEBUG2, "backpressure throttling: lag %lu", lag);
|
||||
start = GetCurrentTimestamp();
|
||||
@@ -621,7 +638,7 @@ walprop_pg_start_streaming(WalProposer *wp, XLogRecPtr startpos)
|
||||
wpg_log(LOG, "WAL proposer starts streaming at %X/%X",
|
||||
LSN_FORMAT_ARGS(startpos));
|
||||
cmd.slotname = WAL_PROPOSER_SLOT_NAME;
|
||||
cmd.timeline = wp->greetRequest.timeline;
|
||||
cmd.timeline = wp->config->pgTimeline;
|
||||
cmd.startpoint = startpos;
|
||||
StartProposerReplication(wp, &cmd);
|
||||
}
|
||||
@@ -1963,10 +1980,11 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk)
|
||||
FullTransactionId xmin = hsFeedback.xmin;
|
||||
FullTransactionId catalog_xmin = hsFeedback.catalog_xmin;
|
||||
FullTransactionId next_xid = ReadNextFullTransactionId();
|
||||
|
||||
/*
|
||||
* Page server is updating nextXid in checkpoint each 1024 transactions,
|
||||
* so feedback xmin can be actually larger than nextXid and
|
||||
* function TransactionIdInRecentPast returns false in this case,
|
||||
* Page server is updating nextXid in checkpoint each 1024
|
||||
* transactions, so feedback xmin can be actually larger than nextXid
|
||||
* and function TransactionIdInRecentPast returns false in this case,
|
||||
* preventing update of slot's xmin.
|
||||
*/
|
||||
if (FullTransactionIdPrecedes(next_xid, xmin))
|
||||
|
||||
@@ -106,6 +106,7 @@ jose-jwk = { version = "0.1.2", features = ["p256", "p384", "rsa"] }
|
||||
signature = "2"
|
||||
ecdsa = "0.16"
|
||||
p256 = { version = "0.13", features = ["jwk"] }
|
||||
ed25519-dalek = { version = "2", default-features = false, features = ["rand_core"] }
|
||||
rsa = "0.9"
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -187,10 +187,6 @@ pub async fn worker(
|
||||
let rx = futures::stream::poll_fn(move |cx| rx.poll_recv(cx));
|
||||
let rx = rx.map(RequestData::from);
|
||||
|
||||
let storage = GenericRemoteStorage::from_config(&remote_storage_config)
|
||||
.await
|
||||
.context("remote storage init")?;
|
||||
|
||||
let properties = WriterProperties::builder()
|
||||
.set_data_page_size_limit(config.parquet_upload_page_size)
|
||||
.set_compression(config.parquet_upload_compression);
|
||||
@@ -224,18 +220,18 @@ pub async fn worker(
|
||||
let rx_disconnect = futures::stream::poll_fn(move |cx| rx_disconnect.poll_recv(cx));
|
||||
let rx_disconnect = rx_disconnect.map(RequestData::from);
|
||||
|
||||
let storage_disconnect =
|
||||
GenericRemoteStorage::from_config(&disconnect_events_storage_config)
|
||||
.await
|
||||
.context("remote storage for disconnect events init")?;
|
||||
let parquet_config_disconnect = parquet_config.clone();
|
||||
tokio::try_join!(
|
||||
worker_inner(storage, rx, parquet_config),
|
||||
worker_inner(storage_disconnect, rx_disconnect, parquet_config_disconnect)
|
||||
worker_inner(remote_storage_config, rx, parquet_config),
|
||||
worker_inner(
|
||||
disconnect_events_storage_config,
|
||||
rx_disconnect,
|
||||
parquet_config_disconnect
|
||||
)
|
||||
)
|
||||
.map(|_| ())
|
||||
} else {
|
||||
worker_inner(storage, rx, parquet_config).await
|
||||
worker_inner(remote_storage_config, rx, parquet_config).await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -251,18 +247,32 @@ struct ParquetConfig {
|
||||
test_remote_failures: u64,
|
||||
}
|
||||
|
||||
impl ParquetConfig {
|
||||
async fn storage(
|
||||
&self,
|
||||
storage_config: &RemoteStorageConfig,
|
||||
) -> anyhow::Result<GenericRemoteStorage> {
|
||||
let storage = GenericRemoteStorage::from_config(storage_config)
|
||||
.await
|
||||
.context("remote storage init")?;
|
||||
|
||||
#[cfg(any(test, feature = "testing"))]
|
||||
if self.test_remote_failures > 0 {
|
||||
return Ok(GenericRemoteStorage::unreliable_wrapper(
|
||||
storage,
|
||||
self.test_remote_failures,
|
||||
));
|
||||
}
|
||||
|
||||
Ok(storage)
|
||||
}
|
||||
}
|
||||
|
||||
async fn worker_inner(
|
||||
storage: GenericRemoteStorage,
|
||||
storage_config: RemoteStorageConfig,
|
||||
rx: impl Stream<Item = RequestData>,
|
||||
config: ParquetConfig,
|
||||
) -> anyhow::Result<()> {
|
||||
#[cfg(any(test, feature = "testing"))]
|
||||
let storage = if config.test_remote_failures > 0 {
|
||||
GenericRemoteStorage::unreliable_wrapper(storage, config.test_remote_failures)
|
||||
} else {
|
||||
storage
|
||||
};
|
||||
|
||||
let mut rx = std::pin::pin!(rx);
|
||||
|
||||
let mut rows = Vec::with_capacity(config.rows_per_group);
|
||||
@@ -285,7 +295,7 @@ async fn worker_inner(
|
||||
}
|
||||
if len > config.file_size || force {
|
||||
last_upload = time::Instant::now();
|
||||
let file = upload_parquet(w, len, &storage).await?;
|
||||
let file = upload_parquet(w, len, &storage_config, &config).await?;
|
||||
w = SerializedFileWriter::new(file, schema.clone(), config.propeties.clone())?;
|
||||
len = 0;
|
||||
}
|
||||
@@ -298,7 +308,7 @@ async fn worker_inner(
|
||||
}
|
||||
|
||||
if !w.flushed_row_groups().is_empty() {
|
||||
let _rtchk: Writer<BytesMut> = upload_parquet(w, len, &storage).await?;
|
||||
let _rtchk: Writer<BytesMut> = upload_parquet(w, len, &storage_config, &config).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -340,7 +350,8 @@ where
|
||||
async fn upload_parquet(
|
||||
mut w: SerializedFileWriter<Writer<BytesMut>>,
|
||||
len: i64,
|
||||
storage: &GenericRemoteStorage,
|
||||
storage_config: &RemoteStorageConfig,
|
||||
config: &ParquetConfig,
|
||||
) -> anyhow::Result<Writer<BytesMut>> {
|
||||
let len_uncompressed = w
|
||||
.flushed_row_groups()
|
||||
@@ -377,6 +388,15 @@ async fn upload_parquet(
|
||||
size, compression, "uploading request parquet file"
|
||||
);
|
||||
|
||||
// A bug in azure-sdk means that the identity-token-file that expires after
|
||||
// 1 hour is not refreshed. This identity-token is used to fetch the actual azure storage
|
||||
// tokens that last for 24 hours. After this 24 hour period, azure-sdk tries to refresh
|
||||
// the storage token, but the identity token has now expired.
|
||||
// <https://github.com/Azure/azure-sdk-for-rust/issues/1739>
|
||||
//
|
||||
// To work around this, we recreate the storage every time.
|
||||
let storage = config.storage(storage_config).await?;
|
||||
|
||||
let year = now.year();
|
||||
let month = now.month();
|
||||
let day = now.day();
|
||||
@@ -431,8 +451,8 @@ mod tests {
|
||||
use rand::rngs::StdRng;
|
||||
use rand::{Rng, SeedableRng};
|
||||
use remote_storage::{
|
||||
GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind, S3Config,
|
||||
DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
|
||||
RemoteStorageConfig, RemoteStorageKind, S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
|
||||
DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
|
||||
};
|
||||
use tokio::sync::mpsc;
|
||||
use tokio::time;
|
||||
@@ -559,12 +579,11 @@ mod tests {
|
||||
timeout: std::time::Duration::from_secs(120),
|
||||
small_timeout: std::time::Duration::from_secs(30),
|
||||
};
|
||||
let storage = GenericRemoteStorage::from_config(&remote_storage_config)
|
||||
|
||||
worker_inner(remote_storage_config, rx, config)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
worker_inner(storage, rx, config).await.unwrap();
|
||||
|
||||
let mut files = WalkDir::new(tmpdir.as_std_path())
|
||||
.into_iter()
|
||||
.filter_map(|entry| entry.ok())
|
||||
|
||||
@@ -3,9 +3,9 @@ use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use ed25519_dalek::SigningKey;
|
||||
use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer};
|
||||
use p256::ecdsa::SigningKey;
|
||||
use p256::elliptic_curve::JwkEcKey;
|
||||
use jose_jwk::jose_b64;
|
||||
use rand::rngs::OsRng;
|
||||
use tokio::net::{lookup_host, TcpStream};
|
||||
use tracing::field::display;
|
||||
@@ -354,9 +354,15 @@ impl PoolingBackend {
|
||||
}
|
||||
}
|
||||
|
||||
fn create_random_jwk() -> (SigningKey, JwkEcKey) {
|
||||
let key = SigningKey::random(&mut OsRng);
|
||||
let jwk = p256::PublicKey::from(key.verifying_key()).to_jwk();
|
||||
fn create_random_jwk() -> (SigningKey, jose_jwk::Key) {
|
||||
let key = SigningKey::generate(&mut OsRng);
|
||||
|
||||
let jwk = jose_jwk::Key::Okp(jose_jwk::Okp {
|
||||
crv: jose_jwk::OkpCurves::Ed25519,
|
||||
x: jose_b64::serde::Bytes::from(key.verifying_key().to_bytes().to_vec()),
|
||||
d: None,
|
||||
});
|
||||
|
||||
(key, jwk)
|
||||
}
|
||||
|
||||
|
||||
@@ -16,17 +16,16 @@ use std::sync::Arc;
|
||||
use std::task::{ready, Poll};
|
||||
use std::time::Duration;
|
||||
|
||||
use ed25519_dalek::{Signature, Signer, SigningKey};
|
||||
use futures::future::poll_fn;
|
||||
use futures::Future;
|
||||
use indexmap::IndexMap;
|
||||
use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding};
|
||||
use p256::ecdsa::{Signature, SigningKey};
|
||||
use parking_lot::RwLock;
|
||||
use postgres_client::tls::NoTlsStream;
|
||||
use postgres_client::types::ToSql;
|
||||
use postgres_client::AsyncMessage;
|
||||
use serde_json::value::RawValue;
|
||||
use signature::Signer;
|
||||
use tokio::net::TcpStream;
|
||||
use tokio::time::Instant;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -42,7 +41,7 @@ use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
|
||||
use crate::metrics::Metrics;
|
||||
|
||||
pub(crate) const EXT_NAME: &str = "pg_session_jwt";
|
||||
pub(crate) const EXT_VERSION: &str = "0.1.2";
|
||||
pub(crate) const EXT_VERSION: &str = "0.2.0";
|
||||
pub(crate) const EXT_SCHEMA: &str = "auth";
|
||||
|
||||
#[derive(Clone)]
|
||||
@@ -339,8 +338,8 @@ fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String {
|
||||
let cap = jwt.capacity();
|
||||
|
||||
// we only need an empty header with the alg specified.
|
||||
// base64url(r#"{"alg":"ES256"}"#) == "eyJhbGciOiJFUzI1NiJ9"
|
||||
jwt.push_str("eyJhbGciOiJFUzI1NiJ9.");
|
||||
// base64url(r#"{"alg":"EdDSA"}"#) == "eyJhbGciOiJFZERTQSJ9"
|
||||
jwt.push_str("eyJhbGciOiJFZERTQSJ9.");
|
||||
|
||||
// encode the jwt payload in-place
|
||||
base64::encode_config_buf(payload, base64::URL_SAFE_NO_PAD, &mut jwt);
|
||||
@@ -366,14 +365,14 @@ fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String {
|
||||
#[cfg(test)]
|
||||
#[expect(clippy::unwrap_used)]
|
||||
mod tests {
|
||||
use p256::ecdsa::SigningKey;
|
||||
use ed25519_dalek::SigningKey;
|
||||
use typed_json::json;
|
||||
|
||||
use super::resign_jwt;
|
||||
|
||||
#[test]
|
||||
fn jwt_token_snapshot() {
|
||||
let key = SigningKey::from_bytes(&[1; 32].into()).unwrap();
|
||||
let key = SigningKey::from_bytes(&[1; 32]);
|
||||
let data =
|
||||
json!({"foo":"bar","jti":"foo\nbar","nested":{"jti":"tricky nesting"}}).to_string();
|
||||
|
||||
@@ -381,12 +380,17 @@ mod tests {
|
||||
|
||||
// To validate the JWT, copy the JWT string and paste it into https://jwt.io/.
|
||||
// In the public-key box, paste the following jwk public key
|
||||
// `{"kty":"EC","crv":"P-256","x":"b_A7lJJBzh2t1DUZ5pYOCoW0GmmgXDKBA6orzhWUyhY","y":"PE91OlW_AdxT9sCwx-7ni0DG_30lqW4igrmJzvccFEo"}`
|
||||
// `{"kty":"OKP","crv":"Ed25519","x":"iojj3XQJ8ZX9UtstPLpdcspnCb8dlBIb83SIAbQPb1w"}`
|
||||
// Note - jwt.io doesn't support EdDSA :(
|
||||
// https://github.com/jsonwebtoken/jsonwebtoken.github.io/issues/509
|
||||
|
||||
// let pub_key = p256::ecdsa::VerifyingKey::from(&key);
|
||||
// let pub_key = p256::PublicKey::from(pub_key);
|
||||
// println!("{}", pub_key.to_jwk_string());
|
||||
// let jwk = jose_jwk::Key::Okp(jose_jwk::Okp {
|
||||
// crv: jose_jwk::OkpCurves::Ed25519,
|
||||
// x: jose_jwk::jose_b64::serde::Bytes::from(key.verifying_key().to_bytes().to_vec()),
|
||||
// d: None,
|
||||
// });
|
||||
// println!("{}", serde_json::to_string(&jwk).unwrap());
|
||||
|
||||
assert_eq!(jwt, "eyJhbGciOiJFUzI1NiJ9.eyJmb28iOiJiYXIiLCJqdGkiOjIsIm5lc3RlZCI6eyJqdGkiOiJ0cmlja3kgbmVzdGluZyJ9fQ.pYf0LxoJ8sDgpmsYOgrbNecOSipnPBEGwnZzB-JhW2cONrKlqRsgXwK8_cOsyolGy-hTTe8GXbWTl_UdpF5RyA");
|
||||
assert_eq!(jwt, "eyJhbGciOiJFZERTQSJ9.eyJmb28iOiJiYXIiLCJqdGkiOjIsIm5lc3RlZCI6eyJqdGkiOiJ0cmlja3kgbmVzdGluZyJ9fQ.Cvyc2By33KI0f0obystwdy8PN111L3Sc9_Mr2CU3XshtSqSdxuRxNEZGbb_RvyJf2IzheC_s7aBZ-jLeQ9N0Bg");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -21,14 +21,13 @@ const KB: usize = 1024;
|
||||
const MB: usize = 1024 * KB;
|
||||
const GB: usize = 1024 * MB;
|
||||
|
||||
/// Use jemalloc, and configure it to sample allocations for profiles every 1 MB.
|
||||
/// This mirrors the configuration in bin/safekeeper.rs.
|
||||
/// Use jemalloc and enable profiling, to mirror bin/safekeeper.rs.
|
||||
#[global_allocator]
|
||||
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
|
||||
|
||||
#[allow(non_upper_case_globals)]
|
||||
#[export_name = "malloc_conf"]
|
||||
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0";
|
||||
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0";
|
||||
|
||||
// Register benchmarks with Criterion.
|
||||
criterion_group!(
|
||||
|
||||
@@ -51,10 +51,12 @@ use utils::{
|
||||
#[global_allocator]
|
||||
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
|
||||
|
||||
/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20).
|
||||
/// Configure jemalloc to profile heap allocations by sampling stack traces every 2 MB (1 << 21).
|
||||
/// This adds roughly 3% overhead for allocations on average, which is acceptable considering
|
||||
/// performance-sensitive code will avoid allocations as far as possible anyway.
|
||||
#[allow(non_upper_case_globals)]
|
||||
#[export_name = "malloc_conf"]
|
||||
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0";
|
||||
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0";
|
||||
|
||||
const PID_FILE_NAME: &str = "safekeeper.pid";
|
||||
const ID_FILE_NAME: &str = "safekeeper.id";
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use safekeeper_api::membership::INVALID_GENERATION;
|
||||
use tokio::fs::File;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use utils::crashsafe::durable_rename;
|
||||
@@ -13,14 +14,14 @@ use std::ops::Deref;
|
||||
use std::path::Path;
|
||||
use std::time::Instant;
|
||||
|
||||
use crate::control_file_upgrade::downgrade_v9_to_v8;
|
||||
use crate::control_file_upgrade::downgrade_v10_to_v9;
|
||||
use crate::control_file_upgrade::upgrade_control_file;
|
||||
use crate::metrics::PERSIST_CONTROL_FILE_SECONDS;
|
||||
use crate::state::{EvictionState, TimelinePersistentState};
|
||||
use utils::bin_ser::LeSer;
|
||||
|
||||
pub const SK_MAGIC: u32 = 0xcafeceefu32;
|
||||
pub const SK_FORMAT_VERSION: u32 = 9;
|
||||
pub const SK_FORMAT_VERSION: u32 = 10;
|
||||
|
||||
// contains persistent metadata for safekeeper
|
||||
pub const CONTROL_FILE_NAME: &str = "safekeeper.control";
|
||||
@@ -169,10 +170,11 @@ impl TimelinePersistentState {
|
||||
let mut buf: Vec<u8> = Vec::new();
|
||||
WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_MAGIC)?;
|
||||
|
||||
if self.eviction_state == EvictionState::Present {
|
||||
// temp hack for forward compatibility
|
||||
const PREV_FORMAT_VERSION: u32 = 8;
|
||||
let prev = downgrade_v9_to_v8(self);
|
||||
if self.mconf.generation == INVALID_GENERATION {
|
||||
// Temp hack for forward compatibility test: if there is no
|
||||
// configuration, save the control file in the previous v9 format.
|
||||
const PREV_FORMAT_VERSION: u32 = 9;
|
||||
let prev = downgrade_v10_to_v9(self);
|
||||
WriteBytesExt::write_u32::<LittleEndian>(&mut buf, PREV_FORMAT_VERSION)?;
|
||||
prev.ser_into(&mut buf)?;
|
||||
} else {
|
||||
@@ -233,6 +235,7 @@ impl Storage for FileStorage {
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
use safekeeper_api::membership::{Configuration, MemberSet};
|
||||
use tokio::fs;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
@@ -242,6 +245,11 @@ mod test {
|
||||
async fn test_read_write_safekeeper_state() -> anyhow::Result<()> {
|
||||
let tempdir = camino_tempfile::tempdir()?;
|
||||
let mut state = TimelinePersistentState::empty();
|
||||
state.mconf = Configuration {
|
||||
generation: 42,
|
||||
members: MemberSet::empty(),
|
||||
new_members: None,
|
||||
};
|
||||
let mut storage = FileStorage::create_new(tempdir.path(), state.clone(), NO_SYNC).await?;
|
||||
|
||||
// Make a change.
|
||||
|
||||
@@ -1,17 +1,22 @@
|
||||
//! Code to deal with safekeeper control file upgrades
|
||||
use std::vec;
|
||||
|
||||
use crate::{
|
||||
safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn},
|
||||
state::{EvictionState, PersistedPeers, TimelinePersistentState},
|
||||
state::{EvictionState, TimelinePersistentState},
|
||||
wal_backup_partial,
|
||||
};
|
||||
use anyhow::{bail, Result};
|
||||
use pq_proto::SystemId;
|
||||
use safekeeper_api::{ServerInfo, Term};
|
||||
use safekeeper_api::{
|
||||
membership::{Configuration, INVALID_GENERATION},
|
||||
ServerInfo, Term,
|
||||
};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::*;
|
||||
use utils::{
|
||||
bin_ser::LeSer,
|
||||
id::{TenantId, TimelineId},
|
||||
id::{NodeId, TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
@@ -233,6 +238,90 @@ pub struct SafeKeeperStateV8 {
|
||||
pub partial_backup: wal_backup_partial::State,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>);
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct PersistedPeerInfo {
|
||||
/// LSN up to which safekeeper offloaded WAL to s3.
|
||||
pub backup_lsn: Lsn,
|
||||
/// Term of the last entry.
|
||||
pub term: Term,
|
||||
/// LSN of the last record.
|
||||
pub flush_lsn: Lsn,
|
||||
/// Up to which LSN safekeeper regards its WAL as committed.
|
||||
pub commit_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl PersistedPeerInfo {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
backup_lsn: Lsn::INVALID,
|
||||
term: safekeeper_api::INITIAL_TERM,
|
||||
flush_lsn: Lsn(0),
|
||||
commit_lsn: Lsn(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// make clippy happy
|
||||
impl Default for PersistedPeerInfo {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
/// Note: SafekeeperStateVn is old name for TimelinePersistentStateVn.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct TimelinePersistentStateV9 {
|
||||
#[serde(with = "hex")]
|
||||
pub tenant_id: TenantId,
|
||||
#[serde(with = "hex")]
|
||||
pub timeline_id: TimelineId,
|
||||
/// persistent acceptor state
|
||||
pub acceptor_state: AcceptorState,
|
||||
/// information about server
|
||||
pub server: ServerInfo,
|
||||
/// Unique id of the last *elected* proposer we dealt with. Not needed
|
||||
/// for correctness, exists for monitoring purposes.
|
||||
#[serde(with = "hex")]
|
||||
pub proposer_uuid: PgUuid,
|
||||
/// Since which LSN this timeline generally starts. Safekeeper might have
|
||||
/// joined later.
|
||||
pub timeline_start_lsn: Lsn,
|
||||
/// Since which LSN safekeeper has (had) WAL for this timeline.
|
||||
/// All WAL segments next to one containing local_start_lsn are
|
||||
/// filled with data from the beginning.
|
||||
pub local_start_lsn: Lsn,
|
||||
/// Part of WAL acknowledged by quorum *and available locally*. Always points
|
||||
/// to record boundary.
|
||||
pub commit_lsn: Lsn,
|
||||
/// LSN that points to the end of the last backed up segment. Useful to
|
||||
/// persist to avoid finding out offloading progress on boot.
|
||||
pub backup_lsn: Lsn,
|
||||
/// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn
|
||||
/// of last record streamed to everyone). Persisting it helps skipping
|
||||
/// recovery in walproposer, generally we compute it from peers. In
|
||||
/// walproposer proto called 'truncate_lsn'. Updates are currently driven
|
||||
/// only by walproposer.
|
||||
pub peer_horizon_lsn: Lsn,
|
||||
/// LSN of the oldest known checkpoint made by pageserver and successfully
|
||||
/// pushed to s3. We don't remove WAL beyond it. Persisted only for
|
||||
/// informational purposes, we receive it from pageserver (or broker).
|
||||
pub remote_consistent_lsn: Lsn,
|
||||
/// Peers and their state as we remember it. Knowing peers themselves is
|
||||
/// fundamental; but state is saved here only for informational purposes and
|
||||
/// obviously can be stale. (Currently not saved at all, but let's provision
|
||||
/// place to have less file version upgrades).
|
||||
pub peers: PersistedPeers,
|
||||
/// Holds names of partial segments uploaded to remote storage. Used to
|
||||
/// clean up old objects without leaving garbage in remote storage.
|
||||
pub partial_backup: wal_backup_partial::State,
|
||||
/// Eviction state of the timeline. If it's Offloaded, we should download
|
||||
/// WAL files from remote storage to serve the timeline.
|
||||
pub eviction_state: EvictionState,
|
||||
}
|
||||
|
||||
pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersistentState> {
|
||||
// migrate to storing full term history
|
||||
if version == 1 {
|
||||
@@ -248,6 +337,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
return Ok(TimelinePersistentState {
|
||||
tenant_id: oldstate.server.tenant_id,
|
||||
timeline_id: oldstate.server.timeline_id,
|
||||
mconf: Configuration::empty(),
|
||||
acceptor_state: ac,
|
||||
server: ServerInfo {
|
||||
pg_version: oldstate.server.pg_version,
|
||||
@@ -261,9 +351,9 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
backup_lsn: Lsn(0),
|
||||
peer_horizon_lsn: oldstate.truncate_lsn,
|
||||
remote_consistent_lsn: Lsn(0),
|
||||
peers: PersistedPeers(vec![]),
|
||||
partial_backup: wal_backup_partial::State::default(),
|
||||
eviction_state: EvictionState::Present,
|
||||
creation_ts: std::time::SystemTime::UNIX_EPOCH,
|
||||
});
|
||||
// migrate to hexing some ids
|
||||
} else if version == 2 {
|
||||
@@ -277,6 +367,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
return Ok(TimelinePersistentState {
|
||||
tenant_id: oldstate.server.tenant_id,
|
||||
timeline_id: oldstate.server.timeline_id,
|
||||
mconf: Configuration::empty(),
|
||||
acceptor_state: oldstate.acceptor_state,
|
||||
server,
|
||||
proposer_uuid: oldstate.proposer_uuid,
|
||||
@@ -286,9 +377,9 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
backup_lsn: Lsn(0),
|
||||
peer_horizon_lsn: oldstate.truncate_lsn,
|
||||
remote_consistent_lsn: Lsn(0),
|
||||
peers: PersistedPeers(vec![]),
|
||||
partial_backup: wal_backup_partial::State::default(),
|
||||
eviction_state: EvictionState::Present,
|
||||
creation_ts: std::time::SystemTime::UNIX_EPOCH,
|
||||
});
|
||||
// migrate to moving tenant_id/timeline_id to the top and adding some lsns
|
||||
} else if version == 3 {
|
||||
@@ -302,6 +393,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
return Ok(TimelinePersistentState {
|
||||
tenant_id: oldstate.server.tenant_id,
|
||||
timeline_id: oldstate.server.timeline_id,
|
||||
mconf: Configuration::empty(),
|
||||
acceptor_state: oldstate.acceptor_state,
|
||||
server,
|
||||
proposer_uuid: oldstate.proposer_uuid,
|
||||
@@ -311,9 +403,9 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
backup_lsn: Lsn(0),
|
||||
peer_horizon_lsn: oldstate.truncate_lsn,
|
||||
remote_consistent_lsn: Lsn(0),
|
||||
peers: PersistedPeers(vec![]),
|
||||
partial_backup: wal_backup_partial::State::default(),
|
||||
eviction_state: EvictionState::Present,
|
||||
creation_ts: std::time::SystemTime::UNIX_EPOCH,
|
||||
});
|
||||
// migrate to having timeline_start_lsn
|
||||
} else if version == 4 {
|
||||
@@ -327,6 +419,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
return Ok(TimelinePersistentState {
|
||||
tenant_id: oldstate.tenant_id,
|
||||
timeline_id: oldstate.timeline_id,
|
||||
mconf: Configuration::empty(),
|
||||
acceptor_state: oldstate.acceptor_state,
|
||||
server,
|
||||
proposer_uuid: oldstate.proposer_uuid,
|
||||
@@ -336,9 +429,9 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
backup_lsn: Lsn::INVALID,
|
||||
peer_horizon_lsn: oldstate.peer_horizon_lsn,
|
||||
remote_consistent_lsn: Lsn(0),
|
||||
peers: PersistedPeers(vec![]),
|
||||
partial_backup: wal_backup_partial::State::default(),
|
||||
eviction_state: EvictionState::Present,
|
||||
creation_ts: std::time::SystemTime::UNIX_EPOCH,
|
||||
});
|
||||
} else if version == 5 {
|
||||
info!("reading safekeeper control file version {}", version);
|
||||
@@ -372,6 +465,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
return Ok(TimelinePersistentState {
|
||||
tenant_id: oldstate.tenant_id,
|
||||
timeline_id: oldstate.timeline_id,
|
||||
mconf: Configuration::empty(),
|
||||
acceptor_state: oldstate.acceptor_state,
|
||||
server: oldstate.server,
|
||||
proposer_uuid: oldstate.proposer_uuid,
|
||||
@@ -381,9 +475,9 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
backup_lsn: oldstate.backup_lsn,
|
||||
peer_horizon_lsn: oldstate.peer_horizon_lsn,
|
||||
remote_consistent_lsn: oldstate.remote_consistent_lsn,
|
||||
peers: oldstate.peers,
|
||||
partial_backup: wal_backup_partial::State::default(),
|
||||
eviction_state: EvictionState::Present,
|
||||
creation_ts: std::time::SystemTime::UNIX_EPOCH,
|
||||
});
|
||||
} else if version == 8 {
|
||||
let oldstate = SafeKeeperStateV8::des(&buf[..buf.len()])?;
|
||||
@@ -391,6 +485,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
return Ok(TimelinePersistentState {
|
||||
tenant_id: oldstate.tenant_id,
|
||||
timeline_id: oldstate.timeline_id,
|
||||
mconf: Configuration::empty(),
|
||||
acceptor_state: oldstate.acceptor_state,
|
||||
server: oldstate.server,
|
||||
proposer_uuid: oldstate.proposer_uuid,
|
||||
@@ -400,9 +495,28 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
backup_lsn: oldstate.backup_lsn,
|
||||
peer_horizon_lsn: oldstate.peer_horizon_lsn,
|
||||
remote_consistent_lsn: oldstate.remote_consistent_lsn,
|
||||
peers: oldstate.peers,
|
||||
partial_backup: oldstate.partial_backup,
|
||||
eviction_state: EvictionState::Present,
|
||||
creation_ts: std::time::SystemTime::UNIX_EPOCH,
|
||||
});
|
||||
} else if version == 9 {
|
||||
let oldstate = TimelinePersistentStateV9::des(&buf[..buf.len()])?;
|
||||
return Ok(TimelinePersistentState {
|
||||
tenant_id: oldstate.tenant_id,
|
||||
timeline_id: oldstate.timeline_id,
|
||||
mconf: Configuration::empty(),
|
||||
acceptor_state: oldstate.acceptor_state,
|
||||
server: oldstate.server,
|
||||
proposer_uuid: oldstate.proposer_uuid,
|
||||
timeline_start_lsn: oldstate.timeline_start_lsn,
|
||||
local_start_lsn: oldstate.local_start_lsn,
|
||||
commit_lsn: oldstate.commit_lsn,
|
||||
backup_lsn: oldstate.backup_lsn,
|
||||
peer_horizon_lsn: oldstate.peer_horizon_lsn,
|
||||
remote_consistent_lsn: oldstate.remote_consistent_lsn,
|
||||
partial_backup: oldstate.partial_backup,
|
||||
eviction_state: oldstate.eviction_state,
|
||||
creation_ts: std::time::SystemTime::UNIX_EPOCH,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -412,9 +526,11 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
bail!("unsupported safekeeper control file version {}", version)
|
||||
}
|
||||
|
||||
pub fn downgrade_v9_to_v8(state: &TimelinePersistentState) -> SafeKeeperStateV8 {
|
||||
assert!(state.eviction_state == EvictionState::Present);
|
||||
SafeKeeperStateV8 {
|
||||
// Used as a temp hack to make forward compatibility test work. Should be
|
||||
// removed after PR adding v10 is merged.
|
||||
pub fn downgrade_v10_to_v9(state: &TimelinePersistentState) -> TimelinePersistentStateV9 {
|
||||
assert!(state.mconf.generation == INVALID_GENERATION);
|
||||
TimelinePersistentStateV9 {
|
||||
tenant_id: state.tenant_id,
|
||||
timeline_id: state.timeline_id,
|
||||
acceptor_state: state.acceptor_state.clone(),
|
||||
@@ -426,8 +542,9 @@ pub fn downgrade_v9_to_v8(state: &TimelinePersistentState) -> SafeKeeperStateV8
|
||||
backup_lsn: state.backup_lsn,
|
||||
peer_horizon_lsn: state.peer_horizon_lsn,
|
||||
remote_consistent_lsn: state.remote_consistent_lsn,
|
||||
peers: state.peers.clone(),
|
||||
peers: PersistedPeers(vec![]),
|
||||
partial_backup: state.partial_backup.clone(),
|
||||
eviction_state: state.eviction_state,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -437,7 +554,7 @@ mod tests {
|
||||
|
||||
use utils::{id::NodeId, Hex};
|
||||
|
||||
use crate::safekeeper::PersistedPeerInfo;
|
||||
use crate::control_file_upgrade::PersistedPeerInfo;
|
||||
|
||||
use super::*;
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use anyhow::{bail, Result};
|
||||
use camino::Utf8PathBuf;
|
||||
use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE};
|
||||
use safekeeper_api::membership::Configuration;
|
||||
use std::sync::Arc;
|
||||
use tokio::{
|
||||
fs::OpenOptions,
|
||||
@@ -147,10 +148,10 @@ pub async fn handle_request(
|
||||
|
||||
let mut new_state = TimelinePersistentState::new(
|
||||
&request.destination_ttid,
|
||||
Configuration::empty(),
|
||||
state.server.clone(),
|
||||
vec![],
|
||||
request.until_lsn,
|
||||
start_lsn,
|
||||
request.until_lsn,
|
||||
)?;
|
||||
new_state.timeline_start_lsn = start_lsn;
|
||||
new_state.peer_horizon_lsn = request.until_lsn;
|
||||
|
||||
@@ -52,16 +52,70 @@ pub struct SafekeeperPostgresHandler {
|
||||
|
||||
/// Parsed Postgres command.
|
||||
enum SafekeeperPostgresCommand {
|
||||
StartWalPush,
|
||||
StartReplication { start_lsn: Lsn, term: Option<Term> },
|
||||
StartWalPush {
|
||||
proto_version: u32,
|
||||
// Eventually timelines will be always created explicitly by storcon.
|
||||
// This option allows legacy behaviour for compute to do that until we
|
||||
// fully migrate.
|
||||
allow_timeline_creation: bool,
|
||||
},
|
||||
StartReplication {
|
||||
start_lsn: Lsn,
|
||||
term: Option<Term>,
|
||||
},
|
||||
IdentifySystem,
|
||||
TimelineStatus,
|
||||
JSONCtrl { cmd: AppendLogicalMessage },
|
||||
JSONCtrl {
|
||||
cmd: AppendLogicalMessage,
|
||||
},
|
||||
}
|
||||
|
||||
fn parse_cmd(cmd: &str) -> anyhow::Result<SafekeeperPostgresCommand> {
|
||||
if cmd.starts_with("START_WAL_PUSH") {
|
||||
Ok(SafekeeperPostgresCommand::StartWalPush)
|
||||
// Allow additional options in postgres START_REPLICATION style like
|
||||
// START_WAL_PUSH (proto_version '3', allow_timeline_creation 'false').
|
||||
// Parsing here is very naive and breaks in case of commas or
|
||||
// whitespace in values, but is enough for our purposes.
|
||||
let re = Regex::new(r"START_WAL_PUSH(\s+?\((.*)\))?").unwrap();
|
||||
let caps = re
|
||||
.captures(cmd)
|
||||
.context(format!("failed to parse START_WAL_PUSH command {}", cmd))?;
|
||||
// capture () content
|
||||
let options = caps.get(2).map(|m| m.as_str()).unwrap_or("");
|
||||
// default values
|
||||
let mut proto_version = 2;
|
||||
let mut allow_timeline_creation = true;
|
||||
for kvstr in options.split(",") {
|
||||
if kvstr.is_empty() {
|
||||
continue;
|
||||
}
|
||||
let mut kvit = kvstr.split_whitespace();
|
||||
let key = kvit.next().context(format!(
|
||||
"failed to parse key in kv {} in command {}",
|
||||
kvstr, cmd
|
||||
))?;
|
||||
let value = kvit.next().context(format!(
|
||||
"failed to parse value in kv {} in command {}",
|
||||
kvstr, cmd
|
||||
))?;
|
||||
let value_trimmed = value.trim_matches('\'');
|
||||
if key == "proto_version" {
|
||||
proto_version = value_trimmed.parse::<u32>().context(format!(
|
||||
"failed to parse proto_version value {} in command {}",
|
||||
value, cmd
|
||||
))?;
|
||||
}
|
||||
if key == "allow_timeline_creation" {
|
||||
allow_timeline_creation = value_trimmed.parse::<bool>().context(format!(
|
||||
"failed to parse allow_timeline_creation value {} in command {}",
|
||||
value, cmd
|
||||
))?;
|
||||
}
|
||||
}
|
||||
Ok(SafekeeperPostgresCommand::StartWalPush {
|
||||
proto_version,
|
||||
allow_timeline_creation,
|
||||
})
|
||||
} else if cmd.starts_with("START_REPLICATION") {
|
||||
let re = Regex::new(
|
||||
// We follow postgres START_REPLICATION LOGICAL options to pass term.
|
||||
@@ -95,7 +149,7 @@ fn parse_cmd(cmd: &str) -> anyhow::Result<SafekeeperPostgresCommand> {
|
||||
|
||||
fn cmd_to_string(cmd: &SafekeeperPostgresCommand) -> &str {
|
||||
match cmd {
|
||||
SafekeeperPostgresCommand::StartWalPush => "START_WAL_PUSH",
|
||||
SafekeeperPostgresCommand::StartWalPush { .. } => "START_WAL_PUSH",
|
||||
SafekeeperPostgresCommand::StartReplication { .. } => "START_REPLICATION",
|
||||
SafekeeperPostgresCommand::TimelineStatus => "TIMELINE_STATUS",
|
||||
SafekeeperPostgresCommand::IdentifySystem => "IDENTIFY_SYSTEM",
|
||||
@@ -293,8 +347,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
|
||||
self.ttid = TenantTimelineId::new(tenant_id, timeline_id);
|
||||
|
||||
match cmd {
|
||||
SafekeeperPostgresCommand::StartWalPush => {
|
||||
self.handle_start_wal_push(pgb)
|
||||
SafekeeperPostgresCommand::StartWalPush {
|
||||
proto_version,
|
||||
allow_timeline_creation,
|
||||
} => {
|
||||
self.handle_start_wal_push(pgb, proto_version, allow_timeline_creation)
|
||||
.instrument(info_span!("WAL receiver"))
|
||||
.await
|
||||
}
|
||||
@@ -467,3 +524,39 @@ impl SafekeeperPostgresHandler {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::SafekeeperPostgresCommand;
|
||||
|
||||
/// Test parsing of START_WAL_PUSH command
|
||||
#[test]
|
||||
fn test_start_wal_push_parse() {
|
||||
let cmd = "START_WAL_PUSH";
|
||||
let parsed = super::parse_cmd(cmd).expect("failed to parse");
|
||||
match parsed {
|
||||
SafekeeperPostgresCommand::StartWalPush {
|
||||
proto_version,
|
||||
allow_timeline_creation,
|
||||
} => {
|
||||
assert_eq!(proto_version, 2);
|
||||
assert!(allow_timeline_creation);
|
||||
}
|
||||
_ => panic!("unexpected command"),
|
||||
}
|
||||
|
||||
let cmd =
|
||||
"START_WAL_PUSH (proto_version '3', allow_timeline_creation 'false', unknown 'hoho')";
|
||||
let parsed = super::parse_cmd(cmd).expect("failed to parse");
|
||||
match parsed {
|
||||
SafekeeperPostgresCommand::StartWalPush {
|
||||
proto_version,
|
||||
allow_timeline_creation,
|
||||
} => {
|
||||
assert_eq!(proto_version, 3);
|
||||
assert!(!allow_timeline_creation);
|
||||
}
|
||||
_ => panic!("unexpected command"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use hyper::{Body, Request, Response, StatusCode};
|
||||
use safekeeper_api::models;
|
||||
use safekeeper_api::models::AcceptorStateStatus;
|
||||
use safekeeper_api::models::SafekeeperStatus;
|
||||
use safekeeper_api::models::TermSwitchApiEntry;
|
||||
@@ -111,14 +112,15 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
|
||||
system_id: request_data.system_id.unwrap_or(0),
|
||||
wal_seg_size: request_data.wal_seg_size.unwrap_or(WAL_SEGMENT_SIZE as u32),
|
||||
};
|
||||
let local_start_lsn = request_data.local_start_lsn.unwrap_or_else(|| {
|
||||
request_data
|
||||
.commit_lsn
|
||||
.segment_lsn(server_info.wal_seg_size as usize)
|
||||
});
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
global_timelines
|
||||
.create(ttid, server_info, request_data.commit_lsn, local_start_lsn)
|
||||
.create(
|
||||
ttid,
|
||||
request_data.mconf,
|
||||
server_info,
|
||||
request_data.start_lsn,
|
||||
request_data.commit_lsn.unwrap_or(request_data.start_lsn),
|
||||
)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -182,6 +184,7 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
|
||||
let status = TimelineStatus {
|
||||
tenant_id: ttid.tenant_id,
|
||||
timeline_id: ttid.timeline_id,
|
||||
mconf: state.mconf,
|
||||
acceptor_state: acc_state,
|
||||
pg_info: state.server,
|
||||
flush_lsn,
|
||||
@@ -267,6 +270,28 @@ async fn timeline_snapshot_handler(request: Request<Body>) -> Result<Response<Bo
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
/// Consider switching timeline membership configuration to the provided one.
|
||||
async fn timeline_membership_handler(
|
||||
mut request: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let ttid = TenantTimelineId::new(
|
||||
parse_request_param(&request, "tenant_id")?,
|
||||
parse_request_param(&request, "timeline_id")?,
|
||||
);
|
||||
check_permission(&request, Some(ttid.tenant_id))?;
|
||||
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
|
||||
|
||||
let data: models::TimelineMembershipSwitchRequest = json_request(&mut request).await?;
|
||||
let response = tli
|
||||
.membership_switch(data.mconf)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, response)
|
||||
}
|
||||
|
||||
async fn timeline_copy_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
|
||||
@@ -618,6 +643,10 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/snapshot/:destination_id",
|
||||
|r| request_span(r, timeline_snapshot_handler),
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/membership",
|
||||
|r| request_span(r, timeline_membership_handler),
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_id/timeline/:source_timeline_id/copy",
|
||||
|r| request_span(r, timeline_copy_handler),
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
|
||||
use anyhow::Context;
|
||||
use postgres_backend::QueryError;
|
||||
use safekeeper_api::membership::{Configuration, INVALID_GENERATION};
|
||||
use safekeeper_api::{ServerInfo, Term};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
@@ -105,6 +106,7 @@ async fn prepare_safekeeper(
|
||||
.global_timelines
|
||||
.create(
|
||||
spg.ttid,
|
||||
Configuration::empty(),
|
||||
ServerInfo {
|
||||
pg_version,
|
||||
wal_seg_size: WAL_SEGMENT_SIZE as u32,
|
||||
@@ -131,10 +133,10 @@ async fn send_proposer_elected(
|
||||
let history = TermHistory(history_entries);
|
||||
|
||||
let proposer_elected_request = ProposerAcceptorMessage::Elected(ProposerElected {
|
||||
generation: INVALID_GENERATION,
|
||||
term,
|
||||
start_streaming_at: lsn,
|
||||
term_history: history,
|
||||
timeline_start_lsn: lsn,
|
||||
});
|
||||
|
||||
tli.process_msg(&proposer_elected_request).await?;
|
||||
@@ -168,13 +170,12 @@ pub async fn append_logical_message(
|
||||
|
||||
let append_request = ProposerAcceptorMessage::AppendRequest(AppendRequest {
|
||||
h: AppendRequestHeader {
|
||||
generation: INVALID_GENERATION,
|
||||
term: msg.term,
|
||||
term_start_lsn: begin_lsn,
|
||||
begin_lsn,
|
||||
end_lsn,
|
||||
commit_lsn,
|
||||
truncate_lsn: msg.truncate_lsn,
|
||||
proposer_uuid: [0u8; 16],
|
||||
},
|
||||
wal_data,
|
||||
});
|
||||
|
||||
@@ -21,6 +21,7 @@ use postgres_backend::PostgresBackend;
|
||||
use postgres_backend::PostgresBackendReader;
|
||||
use postgres_backend::QueryError;
|
||||
use pq_proto::BeMessage;
|
||||
use safekeeper_api::membership::Configuration;
|
||||
use safekeeper_api::models::{ConnectionId, WalReceiverState, WalReceiverStatus};
|
||||
use safekeeper_api::ServerInfo;
|
||||
use std::future;
|
||||
@@ -199,9 +200,14 @@ impl SafekeeperPostgresHandler {
|
||||
pub async fn handle_start_wal_push<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
proto_version: u32,
|
||||
allow_timeline_creation: bool,
|
||||
) -> Result<(), QueryError> {
|
||||
let mut tli: Option<WalResidentTimeline> = None;
|
||||
if let Err(end) = self.handle_start_wal_push_guts(pgb, &mut tli).await {
|
||||
if let Err(end) = self
|
||||
.handle_start_wal_push_guts(pgb, &mut tli, proto_version, allow_timeline_creation)
|
||||
.await
|
||||
{
|
||||
// Log the result and probably send it to the client, closing the stream.
|
||||
let handle_end_fut = pgb.handle_copy_stream_end(end);
|
||||
// If we managed to create the timeline, augment logging with current LSNs etc.
|
||||
@@ -221,6 +227,8 @@ impl SafekeeperPostgresHandler {
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
tli: &mut Option<WalResidentTimeline>,
|
||||
proto_version: u32,
|
||||
allow_timeline_creation: bool,
|
||||
) -> Result<(), CopyStreamHandlerEnd> {
|
||||
// The `tli` parameter is only used for passing _out_ a timeline, one should
|
||||
// not have been passed in.
|
||||
@@ -249,12 +257,17 @@ impl SafekeeperPostgresHandler {
|
||||
conn_id: self.conn_id,
|
||||
pgb_reader: &mut pgb_reader,
|
||||
peer_addr,
|
||||
proto_version,
|
||||
acceptor_handle: &mut acceptor_handle,
|
||||
global_timelines: self.global_timelines.clone(),
|
||||
};
|
||||
|
||||
// Read first message and create timeline if needed.
|
||||
let res = network_reader.read_first_message().await;
|
||||
// Read first message and create timeline if needed and allowed. This
|
||||
// will no longer happen once timelines are always created by storcon and
|
||||
// allow_timeline_creation becomes false.
|
||||
let res = network_reader
|
||||
.read_first_message(allow_timeline_creation)
|
||||
.await;
|
||||
|
||||
let network_res = if let Ok((timeline, next_msg)) = res {
|
||||
let pageserver_feedback_rx: tokio::sync::broadcast::Receiver<PageserverFeedback> =
|
||||
@@ -268,7 +281,7 @@ impl SafekeeperPostgresHandler {
|
||||
tokio::select! {
|
||||
// todo: add read|write .context to these errors
|
||||
r = network_reader.run(msg_tx, msg_rx, reply_tx, timeline, next_msg) => r,
|
||||
r = network_write(pgb, reply_rx, pageserver_feedback_rx) => r,
|
||||
r = network_write(pgb, reply_rx, pageserver_feedback_rx, proto_version) => r,
|
||||
_ = timeline_cancel.cancelled() => {
|
||||
return Err(CopyStreamHandlerEnd::Cancelled);
|
||||
}
|
||||
@@ -312,6 +325,7 @@ struct NetworkReader<'a, IO> {
|
||||
conn_id: ConnectionId,
|
||||
pgb_reader: &'a mut PostgresBackendReader<IO>,
|
||||
peer_addr: SocketAddr,
|
||||
proto_version: u32,
|
||||
// WalAcceptor is spawned when we learn server info from walproposer and
|
||||
// create timeline; handle is put here.
|
||||
acceptor_handle: &'a mut Option<JoinHandle<anyhow::Result<()>>>,
|
||||
@@ -321,25 +335,37 @@ struct NetworkReader<'a, IO> {
|
||||
impl<IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'_, IO> {
|
||||
async fn read_first_message(
|
||||
&mut self,
|
||||
allow_timeline_creation: bool,
|
||||
) -> Result<(WalResidentTimeline, ProposerAcceptorMessage), CopyStreamHandlerEnd> {
|
||||
// Receive information about server to create timeline, if not yet.
|
||||
let next_msg = read_message(self.pgb_reader).await?;
|
||||
let next_msg = read_message(self.pgb_reader, self.proto_version).await?;
|
||||
let tli = match next_msg {
|
||||
ProposerAcceptorMessage::Greeting(ref greeting) => {
|
||||
info!(
|
||||
"start handshake with walproposer {} sysid {} timeline {}",
|
||||
self.peer_addr, greeting.system_id, greeting.tli,
|
||||
"start handshake with walproposer {} sysid {}",
|
||||
self.peer_addr, greeting.system_id,
|
||||
);
|
||||
let server_info = ServerInfo {
|
||||
pg_version: greeting.pg_version,
|
||||
system_id: greeting.system_id,
|
||||
wal_seg_size: greeting.wal_seg_size,
|
||||
};
|
||||
let tli = self
|
||||
.global_timelines
|
||||
.create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID)
|
||||
.await
|
||||
.context("create timeline")?;
|
||||
let tli = if allow_timeline_creation {
|
||||
self.global_timelines
|
||||
.create(
|
||||
self.ttid,
|
||||
Configuration::empty(),
|
||||
server_info,
|
||||
Lsn::INVALID,
|
||||
Lsn::INVALID,
|
||||
)
|
||||
.await
|
||||
.context("create timeline")?
|
||||
} else {
|
||||
self.global_timelines
|
||||
.get(self.ttid)
|
||||
.context("get timeline")?
|
||||
};
|
||||
tli.wal_residence_guard().await?
|
||||
}
|
||||
_ => {
|
||||
@@ -368,7 +394,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'_, IO> {
|
||||
));
|
||||
|
||||
// Forward all messages to WalAcceptor
|
||||
read_network_loop(self.pgb_reader, msg_tx, next_msg).await
|
||||
read_network_loop(self.pgb_reader, msg_tx, next_msg, self.proto_version).await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -376,9 +402,10 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'_, IO> {
|
||||
/// TODO: Return Ok(None) on graceful termination.
|
||||
async fn read_message<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
pgb_reader: &mut PostgresBackendReader<IO>,
|
||||
proto_version: u32,
|
||||
) -> Result<ProposerAcceptorMessage, CopyStreamHandlerEnd> {
|
||||
let copy_data = pgb_reader.read_copy_message().await?;
|
||||
let msg = ProposerAcceptorMessage::parse(copy_data)?;
|
||||
let msg = ProposerAcceptorMessage::parse(copy_data, proto_version)?;
|
||||
Ok(msg)
|
||||
}
|
||||
|
||||
@@ -386,6 +413,7 @@ async fn read_network_loop<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
pgb_reader: &mut PostgresBackendReader<IO>,
|
||||
msg_tx: Sender<ProposerAcceptorMessage>,
|
||||
mut next_msg: ProposerAcceptorMessage,
|
||||
proto_version: u32,
|
||||
) -> Result<(), CopyStreamHandlerEnd> {
|
||||
/// Threshold for logging slow WalAcceptor sends.
|
||||
const SLOW_THRESHOLD: Duration = Duration::from_secs(5);
|
||||
@@ -418,7 +446,7 @@ async fn read_network_loop<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
WAL_RECEIVER_QUEUE_DEPTH_TOTAL.inc();
|
||||
WAL_RECEIVER_QUEUE_SIZE_TOTAL.add(size as i64);
|
||||
|
||||
next_msg = read_message(pgb_reader).await?;
|
||||
next_msg = read_message(pgb_reader, proto_version).await?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -431,6 +459,7 @@ async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
pgb_writer: &mut PostgresBackend<IO>,
|
||||
mut reply_rx: Receiver<AcceptorProposerMessage>,
|
||||
mut pageserver_feedback_rx: tokio::sync::broadcast::Receiver<PageserverFeedback>,
|
||||
proto_version: u32,
|
||||
) -> Result<(), CopyStreamHandlerEnd> {
|
||||
let mut buf = BytesMut::with_capacity(128);
|
||||
|
||||
@@ -468,7 +497,7 @@ async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
};
|
||||
|
||||
buf.clear();
|
||||
msg.serialize(&mut buf)?;
|
||||
msg.serialize(&mut buf, proto_version)?;
|
||||
pgb_writer.write_message(&BeMessage::CopyData(&buf)).await?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,6 +7,7 @@ use std::{fmt, pin::pin};
|
||||
use anyhow::{bail, Context};
|
||||
use futures::StreamExt;
|
||||
use postgres_protocol::message::backend::ReplicationMessage;
|
||||
use safekeeper_api::membership::INVALID_GENERATION;
|
||||
use safekeeper_api::models::{PeerInfo, TimelineStatus};
|
||||
use safekeeper_api::Term;
|
||||
use tokio::sync::mpsc::{channel, Receiver, Sender};
|
||||
@@ -267,7 +268,10 @@ async fn recover(
|
||||
);
|
||||
|
||||
// Now understand our term history.
|
||||
let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: donor.term });
|
||||
let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest {
|
||||
generation: INVALID_GENERATION,
|
||||
term: donor.term,
|
||||
});
|
||||
let vote_response = match tli
|
||||
.process_msg(&vote_request)
|
||||
.await
|
||||
@@ -302,10 +306,10 @@ async fn recover(
|
||||
|
||||
// truncate WAL locally
|
||||
let pe = ProposerAcceptorMessage::Elected(ProposerElected {
|
||||
generation: INVALID_GENERATION,
|
||||
term: donor.term,
|
||||
start_streaming_at: last_common_point.lsn,
|
||||
term_history: donor_th,
|
||||
timeline_start_lsn: Lsn::INVALID,
|
||||
});
|
||||
// Successful ProposerElected handling always returns None. If term changed,
|
||||
// we'll find out that during the streaming. Note: it is expected to get
|
||||
@@ -434,13 +438,12 @@ async fn network_io(
|
||||
match msg {
|
||||
ReplicationMessage::XLogData(xlog_data) => {
|
||||
let ar_hdr = AppendRequestHeader {
|
||||
generation: INVALID_GENERATION,
|
||||
term: donor.term,
|
||||
term_start_lsn: Lsn::INVALID, // unused
|
||||
begin_lsn: Lsn(xlog_data.wal_start()),
|
||||
end_lsn: Lsn(xlog_data.wal_start()) + xlog_data.data().len() as u64,
|
||||
commit_lsn: Lsn::INVALID, // do not attempt to advance, peer communication anyway does it
|
||||
truncate_lsn: Lsn::INVALID, // do not attempt to advance
|
||||
proposer_uuid: [0; 16],
|
||||
};
|
||||
let ar = AppendRequest {
|
||||
h: ar_hdr,
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -57,6 +57,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> InterpretedWalSender<'_, IO> {
|
||||
keepalive_ticker.reset();
|
||||
|
||||
let (tx, mut rx) = tokio::sync::mpsc::channel::<Batch>(2);
|
||||
let shard = vec![self.shard];
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
@@ -80,14 +81,17 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> InterpretedWalSender<'_, IO> {
|
||||
assert!(next_record_lsn.is_aligned());
|
||||
max_next_record_lsn = Some(next_record_lsn);
|
||||
|
||||
|
||||
// Deserialize and interpret WAL record
|
||||
let interpreted = InterpretedWalRecord::from_bytes_filtered(
|
||||
recdata,
|
||||
&self.shard,
|
||||
&shard,
|
||||
next_record_lsn,
|
||||
self.pg_version,
|
||||
)
|
||||
.with_context(|| "Failed to interpret WAL")?;
|
||||
.with_context(|| "Failed to interpret WAL")?
|
||||
.remove(&self.shard)
|
||||
.unwrap();
|
||||
|
||||
if !interpreted.is_empty() {
|
||||
records.push(interpreted);
|
||||
|
||||
@@ -1,20 +1,25 @@
|
||||
//! Defines per timeline data stored persistently (SafeKeeperPersistentState)
|
||||
//! and its wrapper with in memory layer (SafekeeperState).
|
||||
|
||||
use std::{cmp::max, ops::Deref};
|
||||
use std::{cmp::max, ops::Deref, time::SystemTime};
|
||||
|
||||
use anyhow::{bail, Result};
|
||||
use postgres_ffi::WAL_SEGMENT_SIZE;
|
||||
use safekeeper_api::{models::TimelineTermBumpResponse, ServerInfo, Term};
|
||||
use safekeeper_api::{
|
||||
membership::Configuration,
|
||||
models::{TimelineMembershipSwitchResponse, TimelineTermBumpResponse},
|
||||
ServerInfo, Term, INITIAL_TERM,
|
||||
};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::info;
|
||||
use utils::{
|
||||
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
|
||||
id::{TenantId, TenantTimelineId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use crate::{
|
||||
control_file,
|
||||
safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, TermHistory, UNKNOWN_SERVER_VERSION},
|
||||
safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn, UNKNOWN_SERVER_VERSION},
|
||||
timeline::TimelineError,
|
||||
wal_backup_partial::{self},
|
||||
};
|
||||
@@ -27,6 +32,8 @@ pub struct TimelinePersistentState {
|
||||
pub tenant_id: TenantId,
|
||||
#[serde(with = "hex")]
|
||||
pub timeline_id: TimelineId,
|
||||
/// Membership configuration.
|
||||
pub mconf: Configuration,
|
||||
/// persistent acceptor state
|
||||
pub acceptor_state: AcceptorState,
|
||||
/// information about server
|
||||
@@ -58,22 +65,15 @@ pub struct TimelinePersistentState {
|
||||
/// pushed to s3. We don't remove WAL beyond it. Persisted only for
|
||||
/// informational purposes, we receive it from pageserver (or broker).
|
||||
pub remote_consistent_lsn: Lsn,
|
||||
/// Peers and their state as we remember it. Knowing peers themselves is
|
||||
/// fundamental; but state is saved here only for informational purposes and
|
||||
/// obviously can be stale. (Currently not saved at all, but let's provision
|
||||
/// place to have less file version upgrades).
|
||||
pub peers: PersistedPeers,
|
||||
/// Holds names of partial segments uploaded to remote storage. Used to
|
||||
/// clean up old objects without leaving garbage in remote storage.
|
||||
pub partial_backup: wal_backup_partial::State,
|
||||
/// Eviction state of the timeline. If it's Offloaded, we should download
|
||||
/// WAL files from remote storage to serve the timeline.
|
||||
pub eviction_state: EvictionState,
|
||||
pub creation_ts: SystemTime,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>);
|
||||
|
||||
/// State of the local WAL files. Used to track current timeline state,
|
||||
/// that can be either WAL files are present on disk or last partial segment
|
||||
/// is offloaded to remote storage.
|
||||
@@ -87,12 +87,14 @@ pub enum EvictionState {
|
||||
}
|
||||
|
||||
impl TimelinePersistentState {
|
||||
/// commit_lsn is the same as start_lsn in the normal creation; see
|
||||
/// `TimelineCreateRequest` comments.
|
||||
pub fn new(
|
||||
ttid: &TenantTimelineId,
|
||||
mconf: Configuration,
|
||||
server_info: ServerInfo,
|
||||
peers: Vec<NodeId>,
|
||||
start_lsn: Lsn,
|
||||
commit_lsn: Lsn,
|
||||
local_start_lsn: Lsn,
|
||||
) -> anyhow::Result<TimelinePersistentState> {
|
||||
if server_info.wal_seg_size == 0 {
|
||||
bail!(TimelineError::UninitializedWalSegSize(*ttid));
|
||||
@@ -102,49 +104,59 @@ impl TimelinePersistentState {
|
||||
bail!(TimelineError::UninitialinzedPgVersion(*ttid));
|
||||
}
|
||||
|
||||
if commit_lsn < local_start_lsn {
|
||||
if commit_lsn < start_lsn {
|
||||
bail!(
|
||||
"commit_lsn {} is smaller than local_start_lsn {}",
|
||||
"commit_lsn {} is smaller than start_lsn {}",
|
||||
commit_lsn,
|
||||
local_start_lsn
|
||||
start_lsn
|
||||
);
|
||||
}
|
||||
|
||||
// If we are given an init LSN, initialize term history with it. It
|
||||
// ensures that walproposer always must be able to find a common point
|
||||
// in histories; if it can't something is corrupted. Not having LSN here
|
||||
// is so far left for legacy case where timeline is created by compute
|
||||
// and LSN during creation is not known yet.
|
||||
let term_history = if commit_lsn != Lsn::INVALID {
|
||||
TermHistory(vec![TermLsn {
|
||||
term: INITIAL_TERM,
|
||||
lsn: start_lsn,
|
||||
}])
|
||||
} else {
|
||||
TermHistory::empty()
|
||||
};
|
||||
|
||||
Ok(TimelinePersistentState {
|
||||
tenant_id: ttid.tenant_id,
|
||||
timeline_id: ttid.timeline_id,
|
||||
mconf,
|
||||
acceptor_state: AcceptorState {
|
||||
term: 0,
|
||||
term_history: TermHistory::empty(),
|
||||
term: INITIAL_TERM,
|
||||
term_history,
|
||||
},
|
||||
server: server_info,
|
||||
proposer_uuid: [0; 16],
|
||||
timeline_start_lsn: Lsn(0),
|
||||
local_start_lsn,
|
||||
timeline_start_lsn: start_lsn,
|
||||
local_start_lsn: start_lsn,
|
||||
commit_lsn,
|
||||
backup_lsn: local_start_lsn,
|
||||
peer_horizon_lsn: local_start_lsn,
|
||||
backup_lsn: start_lsn,
|
||||
peer_horizon_lsn: start_lsn,
|
||||
remote_consistent_lsn: Lsn(0),
|
||||
peers: PersistedPeers(
|
||||
peers
|
||||
.iter()
|
||||
.map(|p| (*p, PersistedPeerInfo::new()))
|
||||
.collect(),
|
||||
),
|
||||
partial_backup: wal_backup_partial::State::default(),
|
||||
eviction_state: EvictionState::Present,
|
||||
creation_ts: SystemTime::now(),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn empty() -> Self {
|
||||
TimelinePersistentState::new(
|
||||
&TenantTimelineId::empty(),
|
||||
Configuration::empty(),
|
||||
ServerInfo {
|
||||
pg_version: 170000, /* Postgres server version (major * 10000) */
|
||||
system_id: 0, /* Postgres system identifier */
|
||||
wal_seg_size: WAL_SEGMENT_SIZE as u32,
|
||||
},
|
||||
vec![],
|
||||
Lsn::INVALID,
|
||||
Lsn::INVALID,
|
||||
)
|
||||
@@ -249,6 +261,31 @@ where
|
||||
current_term: after,
|
||||
})
|
||||
}
|
||||
|
||||
/// Switch into membership configuration `to` if it is higher than the
|
||||
/// current one.
|
||||
pub async fn membership_switch(
|
||||
&mut self,
|
||||
to: Configuration,
|
||||
) -> Result<TimelineMembershipSwitchResponse> {
|
||||
let before = self.mconf.clone();
|
||||
// Is switch allowed?
|
||||
if to.generation <= self.mconf.generation {
|
||||
info!(
|
||||
"ignoring request to switch membership conf to lower {}, current conf {}",
|
||||
to, self.mconf
|
||||
);
|
||||
} else {
|
||||
let mut state = self.start_change();
|
||||
state.mconf = to.clone();
|
||||
self.finish_change(&state).await?;
|
||||
info!("switched membership conf to {} from {}", to, before);
|
||||
}
|
||||
Ok(TimelineMembershipSwitchResponse {
|
||||
previous_conf: before,
|
||||
current_conf: self.mconf.clone(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<CTRL> Deref for TimelineState<CTRL>
|
||||
|
||||
@@ -4,7 +4,10 @@
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use remote_storage::RemotePath;
|
||||
use safekeeper_api::models::{PeerInfo, TimelineTermBumpResponse};
|
||||
use safekeeper_api::membership::Configuration;
|
||||
use safekeeper_api::models::{
|
||||
PeerInfo, TimelineMembershipSwitchResponse, TimelineTermBumpResponse,
|
||||
};
|
||||
use safekeeper_api::Term;
|
||||
use tokio::fs::{self};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -188,6 +191,13 @@ impl StateSK {
|
||||
self.state_mut().term_bump(to).await
|
||||
}
|
||||
|
||||
pub async fn membership_switch(
|
||||
&mut self,
|
||||
to: Configuration,
|
||||
) -> Result<TimelineMembershipSwitchResponse> {
|
||||
self.state_mut().membership_switch(to).await
|
||||
}
|
||||
|
||||
/// Close open WAL files to release FDs.
|
||||
fn close_wal_store(&mut self) {
|
||||
if let StateSK::Loaded(sk) = self {
|
||||
@@ -768,6 +778,14 @@ impl Timeline {
|
||||
state.sk.term_bump(to).await
|
||||
}
|
||||
|
||||
pub async fn membership_switch(
|
||||
self: &Arc<Self>,
|
||||
to: Configuration,
|
||||
) -> Result<TimelineMembershipSwitchResponse> {
|
||||
let mut state = self.write_shared_state().await;
|
||||
state.sk.membership_switch(to).await
|
||||
}
|
||||
|
||||
/// Guts of [`Self::wal_residence_guard`] and [`Self::try_wal_residence_guard`]
|
||||
async fn do_wal_residence_guard(
|
||||
self: &Arc<Self>,
|
||||
|
||||
@@ -495,7 +495,6 @@ impl Manager {
|
||||
}
|
||||
|
||||
/// Update is_active flag and returns its value.
|
||||
// Timelines marked active are pushed to the broker by the `push_loop` task.
|
||||
fn update_is_active(
|
||||
&mut self,
|
||||
is_wal_backup_required: bool,
|
||||
|
||||
@@ -12,6 +12,7 @@ use crate::{control_file, wal_storage, SafeKeeperConf};
|
||||
use anyhow::{bail, Context, Result};
|
||||
use camino::Utf8PathBuf;
|
||||
use camino_tempfile::Utf8TempDir;
|
||||
use safekeeper_api::membership::Configuration;
|
||||
use safekeeper_api::ServerInfo;
|
||||
use serde::Serialize;
|
||||
use std::collections::HashMap;
|
||||
@@ -214,9 +215,10 @@ impl GlobalTimelines {
|
||||
pub(crate) async fn create(
|
||||
&self,
|
||||
ttid: TenantTimelineId,
|
||||
mconf: Configuration,
|
||||
server_info: ServerInfo,
|
||||
start_lsn: Lsn,
|
||||
commit_lsn: Lsn,
|
||||
local_start_lsn: Lsn,
|
||||
) -> Result<Arc<Timeline>> {
|
||||
let (conf, _, _) = {
|
||||
let state = self.state.lock().unwrap();
|
||||
@@ -239,8 +241,7 @@ impl GlobalTimelines {
|
||||
|
||||
// TODO: currently we create only cfile. It would be reasonable to
|
||||
// immediately initialize first WAL segment as well.
|
||||
let state =
|
||||
TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn)?;
|
||||
let state = TimelinePersistentState::new(&ttid, mconf, server_info, start_lsn, commit_lsn)?;
|
||||
control_file::FileStorage::create_new(&tmp_dir_path, state, conf.no_sync).await?;
|
||||
let timeline = self.load_temp_timeline(ttid, &tmp_dir_path, true).await?;
|
||||
Ok(timeline)
|
||||
|
||||
@@ -61,9 +61,7 @@ pub(crate) fn is_wal_backup_required(
|
||||
state: &StateSnapshot,
|
||||
) -> bool {
|
||||
num_computes > 0 ||
|
||||
// This task backups completed segments only.
|
||||
// The current partial segment is backed up by a separate task/code module (wal_backup_partial).
|
||||
// So, need for completed segment backup <=> last backup was at an older segment.
|
||||
// Currently only the whole segment is offloaded, so compare segment numbers.
|
||||
(state.commit_lsn.segment_number(wal_seg_size) > state.backup_lsn.segment_number(wal_seg_size))
|
||||
}
|
||||
|
||||
@@ -71,11 +69,6 @@ pub(crate) fn is_wal_backup_required(
|
||||
/// is me, run (per timeline) task, if not yet. OTOH, if it is not me and task
|
||||
/// is running, kill it.
|
||||
pub(crate) async fn update_task(mgr: &mut Manager, need_backup: bool, state: &StateSnapshot) {
|
||||
// Based on the peer information received from broker, each safekeeper figures out
|
||||
// whether it, or one of the peers, is the offloader.
|
||||
// The algorithm is deterministic, so, if all peers have the same information,
|
||||
// the system converges. In unconverged state, multiple peers upload the same
|
||||
// segments, which is inefficient but safe.
|
||||
let (offloader, election_dbg_str) =
|
||||
determine_offloader(&state.peers, state.backup_lsn, mgr.tli.ttid, &mgr.conf);
|
||||
let elected_me = Some(mgr.conf.my_id) == offloader;
|
||||
|
||||
@@ -15,13 +15,15 @@ use desim::{
|
||||
};
|
||||
use http::Uri;
|
||||
use safekeeper::{
|
||||
safekeeper::{ProposerAcceptorMessage, SafeKeeper, UNKNOWN_SERVER_VERSION},
|
||||
safekeeper::{
|
||||
ProposerAcceptorMessage, SafeKeeper, SK_PROTOCOL_VERSION, UNKNOWN_SERVER_VERSION,
|
||||
},
|
||||
state::{TimelinePersistentState, TimelineState},
|
||||
timeline::TimelineError,
|
||||
wal_storage::Storage,
|
||||
SafeKeeperConf,
|
||||
};
|
||||
use safekeeper_api::ServerInfo;
|
||||
use safekeeper_api::{membership::Configuration, ServerInfo};
|
||||
use tracing::{debug, info_span, warn};
|
||||
use utils::{
|
||||
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
|
||||
@@ -96,8 +98,13 @@ impl GlobalMap {
|
||||
let commit_lsn = Lsn::INVALID;
|
||||
let local_start_lsn = Lsn::INVALID;
|
||||
|
||||
let state =
|
||||
TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn)?;
|
||||
let state = TimelinePersistentState::new(
|
||||
&ttid,
|
||||
Configuration::empty(),
|
||||
server_info,
|
||||
commit_lsn,
|
||||
local_start_lsn,
|
||||
)?;
|
||||
|
||||
let disk_timeline = self.disk.put_state(&ttid, state);
|
||||
let control_store = DiskStateStorage::new(disk_timeline.clone());
|
||||
@@ -278,7 +285,7 @@ impl ConnState {
|
||||
bail!("finished processing START_REPLICATION")
|
||||
}
|
||||
|
||||
let msg = ProposerAcceptorMessage::parse(copy_data)?;
|
||||
let msg = ProposerAcceptorMessage::parse(copy_data, SK_PROTOCOL_VERSION)?;
|
||||
debug!("got msg: {:?}", msg);
|
||||
self.process(msg, global)
|
||||
} else {
|
||||
|
||||
@@ -55,4 +55,4 @@ r2d2 = { version = "0.8.10" }
|
||||
utils = { path = "../libs/utils/" }
|
||||
metrics = { path = "../libs/metrics/" }
|
||||
control_plane = { path = "../control_plane" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
@@ -124,7 +124,10 @@ impl ComputeHookTenant {
|
||||
if let Some(shard_idx) = shard_idx {
|
||||
sharded.shards.remove(shard_idx);
|
||||
} else {
|
||||
tracing::warn!("Shard not found while handling detach")
|
||||
// This is a valid but niche case, where the tenant was previously attached
|
||||
// as a Secondary location and then detached, so has no previously notified
|
||||
// state.
|
||||
tracing::info!("Shard not found while handling detach")
|
||||
}
|
||||
}
|
||||
ComputeHookTenant::Unsharded(_) => {
|
||||
@@ -761,7 +764,10 @@ impl ComputeHook {
|
||||
let mut state_locked = self.state.lock().unwrap();
|
||||
match state_locked.entry(tenant_shard_id.tenant_id) {
|
||||
Entry::Vacant(_) => {
|
||||
tracing::warn!("Compute hook tenant not found for detach");
|
||||
// This is a valid but niche case, where the tenant was previously attached
|
||||
// as a Secondary location and then detached, so has no previously notified
|
||||
// state.
|
||||
tracing::info!("Compute hook tenant not found for detach");
|
||||
}
|
||||
Entry::Occupied(mut e) => {
|
||||
let sharded = e.get().is_sharded();
|
||||
|
||||
@@ -112,7 +112,7 @@ impl TenantShardDrain {
|
||||
}
|
||||
}
|
||||
|
||||
match scheduler.node_preferred(tenant_shard.intent.get_secondary()) {
|
||||
match tenant_shard.preferred_secondary(scheduler) {
|
||||
Some(node) => Some(node),
|
||||
None => {
|
||||
tracing::warn!(
|
||||
|
||||
@@ -653,6 +653,10 @@ async fn handle_tenant_list(
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let limit: Option<usize> = parse_query_param(&req, "limit")?;
|
||||
let start_after: Option<TenantId> = parse_query_param(&req, "start_after")?;
|
||||
tracing::info!("start_after: {:?}", start_after);
|
||||
|
||||
match maybe_forward(req).await {
|
||||
ForwardOutcome::Forwarded(res) => {
|
||||
return res;
|
||||
@@ -660,7 +664,7 @@ async fn handle_tenant_list(
|
||||
ForwardOutcome::NotForwarded(_req) => {}
|
||||
};
|
||||
|
||||
json_response(StatusCode::OK, service.tenant_list())
|
||||
json_response(StatusCode::OK, service.tenant_list(limit, start_after))
|
||||
}
|
||||
|
||||
async fn handle_node_register(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
@@ -690,7 +694,8 @@ async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError
|
||||
};
|
||||
|
||||
let state = get_state(&req);
|
||||
let nodes = state.service.node_list().await?;
|
||||
let mut nodes = state.service.node_list().await?;
|
||||
nodes.sort_by_key(|n| n.get_id());
|
||||
let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::<Vec<_>>();
|
||||
|
||||
json_response(StatusCode::OK, api_nodes)
|
||||
@@ -1005,6 +1010,29 @@ async fn handle_tenant_shard_migrate(
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_tenant_shard_migrate_secondary(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let mut req = match maybe_forward(req).await {
|
||||
ForwardOutcome::Forwarded(res) => {
|
||||
return res;
|
||||
}
|
||||
ForwardOutcome::NotForwarded(req) => req,
|
||||
};
|
||||
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
|
||||
let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
service
|
||||
.tenant_shard_migrate_secondary(tenant_shard_id, migrate_req)
|
||||
.await?,
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_tenant_shard_cancel_reconcile(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
@@ -1855,6 +1883,16 @@ pub fn make_router(
|
||||
RequestName("control_v1_tenant_migrate"),
|
||||
)
|
||||
})
|
||||
.put(
|
||||
"/control/v1/tenant/:tenant_shard_id/migrate_secondary",
|
||||
|r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
handle_tenant_shard_migrate_secondary,
|
||||
RequestName("control_v1_tenant_migrate_secondary"),
|
||||
)
|
||||
},
|
||||
)
|
||||
.put(
|
||||
"/control/v1/tenant/:tenant_shard_id/cancel_reconcile",
|
||||
|r| {
|
||||
|
||||
@@ -53,6 +53,16 @@ pub(crate) struct StorageControllerMetricGroup {
|
||||
/// How many shards are not scheduled into their preferred AZ
|
||||
pub(crate) storage_controller_schedule_az_violation: measured::Gauge,
|
||||
|
||||
/// How many shard locations (secondary or attached) on each node
|
||||
pub(crate) storage_controller_node_shards: measured::GaugeVec<NodeLabelGroupSet>,
|
||||
|
||||
/// How many _attached_ shard locations on each node
|
||||
pub(crate) storage_controller_node_attached_shards: measured::GaugeVec<NodeLabelGroupSet>,
|
||||
|
||||
/// How many _home_ shard locations on each node (i.e. the node's AZ matches the shard's
|
||||
/// preferred AZ)
|
||||
pub(crate) storage_controller_node_home_shards: measured::GaugeVec<NodeLabelGroupSet>,
|
||||
|
||||
/// How many shards would like to reconcile but were blocked by concurrency limits
|
||||
pub(crate) storage_controller_pending_reconciles: measured::Gauge,
|
||||
|
||||
@@ -132,6 +142,15 @@ impl Default for StorageControllerMetrics {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(measured::LabelGroup, Clone)]
|
||||
#[label(set = NodeLabelGroupSet)]
|
||||
pub(crate) struct NodeLabelGroup<'a> {
|
||||
#[label(dynamic_with = lasso::ThreadedRodeo, default)]
|
||||
pub(crate) az: &'a str,
|
||||
#[label(dynamic_with = lasso::ThreadedRodeo, default)]
|
||||
pub(crate) node_id: &'a str,
|
||||
}
|
||||
|
||||
#[derive(measured::LabelGroup)]
|
||||
#[label(set = ReconcileCompleteLabelGroupSet)]
|
||||
pub(crate) struct ReconcileCompleteLabelGroup {
|
||||
|
||||
@@ -299,6 +299,7 @@ impl Node {
|
||||
id: self.id,
|
||||
availability: self.availability.clone().into(),
|
||||
scheduling: self.scheduling,
|
||||
availability_zone_id: self.availability_zone_id.0.clone(),
|
||||
listen_http_addr: self.listen_http_addr.clone(),
|
||||
listen_http_port: self.listen_http_port,
|
||||
listen_pg_addr: self.listen_pg_addr.clone(),
|
||||
|
||||
@@ -708,10 +708,11 @@ impl Persistence {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Note that passing None for a shard clears the preferred AZ (rather than leaving it unmodified)
|
||||
pub(crate) async fn set_tenant_shard_preferred_azs(
|
||||
&self,
|
||||
preferred_azs: Vec<(TenantShardId, AvailabilityZone)>,
|
||||
) -> DatabaseResult<Vec<(TenantShardId, AvailabilityZone)>> {
|
||||
preferred_azs: Vec<(TenantShardId, Option<AvailabilityZone>)>,
|
||||
) -> DatabaseResult<Vec<(TenantShardId, Option<AvailabilityZone>)>> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
|
||||
self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| {
|
||||
@@ -722,7 +723,7 @@ impl Persistence {
|
||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
|
||||
.set(preferred_az_id.eq(preferred_az.0.clone()))
|
||||
.set(preferred_az_id.eq(preferred_az.as_ref().map(|az| az.0.clone())))
|
||||
.execute(conn)?;
|
||||
|
||||
if updated == 1 {
|
||||
|
||||
@@ -696,6 +696,11 @@ impl Reconciler {
|
||||
/// First we apply special case handling (e.g. for live migrations), and then a
|
||||
/// general case reconciliation where we walk through the intent by pageserver
|
||||
/// and call out to the pageserver to apply the desired state.
|
||||
///
|
||||
/// An Ok(()) result indicates that we successfully attached the tenant, but _not_ that
|
||||
/// all locations for the tenant are in the expected state. When nodes that are to be detached
|
||||
/// or configured as secondary are unavailable, we may return Ok(()) but leave the shard in a
|
||||
/// state where it still requires later reconciliation.
|
||||
pub(crate) async fn reconcile(&mut self) -> Result<(), ReconcileError> {
|
||||
// Prepare: if we have uncertain `observed` state for our would-be attachment location, then refresh it
|
||||
self.maybe_refresh_observed().await?;
|
||||
@@ -784,10 +789,18 @@ impl Reconciler {
|
||||
tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.")
|
||||
}
|
||||
_ => {
|
||||
// In all cases other than a matching observed configuration, we will
|
||||
// reconcile this location.
|
||||
tracing::info!(node_id=%node.get_id(), "Observed configuration requires update.");
|
||||
changes.push((node.clone(), wanted_conf))
|
||||
// Only try and configure secondary locations on nodes that are available. This
|
||||
// allows the reconciler to "succeed" while some secondaries are offline (e.g. after
|
||||
// a node failure, where the failed node will have a secondary intent)
|
||||
if node.is_available() {
|
||||
tracing::info!(node_id=%node.get_id(), "Observed configuration requires update.");
|
||||
changes.push((node.clone(), wanted_conf))
|
||||
} else {
|
||||
tracing::info!(node_id=%node.get_id(), "Skipping configuration as secondary, node is unavailable");
|
||||
self.observed
|
||||
.locations
|
||||
.insert(node.get_id(), ObservedStateLocation { conf: None });
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -813,7 +826,21 @@ impl Reconciler {
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(ReconcileError::Cancel);
|
||||
}
|
||||
self.location_config(&node, conf, None, false).await?;
|
||||
// We only try to configure secondary locations if the node is available. This does
|
||||
// not stop us succeeding with the reconcile, because our core goal is to make the
|
||||
// shard _available_ (the attached location), and configuring secondary locations
|
||||
// can be done lazily when the node becomes available (via background reconciliation).
|
||||
if node.is_available() {
|
||||
self.location_config(&node, conf, None, false).await?;
|
||||
} else {
|
||||
// If the node is unavailable, we skip and consider the reconciliation successful: this
|
||||
// is a common case where a pageserver is marked unavailable: we demote a location on
|
||||
// that unavailable pageserver to secondary.
|
||||
tracing::info!("Skipping configuring secondary location {node}, it is unavailable");
|
||||
self.observed
|
||||
.locations
|
||||
.insert(node.get_id(), ObservedStateLocation { conf: None });
|
||||
}
|
||||
}
|
||||
|
||||
// The condition below identifies a detach. We must have no attached intent and
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use crate::{node::Node, tenant_shard::TenantShard};
|
||||
use crate::{metrics::NodeLabelGroup, node::Node, tenant_shard::TenantShard};
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::{controller_api::AvailabilityZone, models::PageserverUtilization};
|
||||
use serde::Serialize;
|
||||
@@ -32,6 +32,9 @@ pub(crate) struct SchedulerNode {
|
||||
shard_count: usize,
|
||||
/// How many shards are currently attached on this node, via their [`crate::tenant_shard::IntentState`].
|
||||
attached_shard_count: usize,
|
||||
/// How many shards have a location on this node (via [`crate::tenant_shard::IntentState`]) _and_ this node
|
||||
/// is in their preferred AZ (i.e. this is their 'home' location)
|
||||
home_shard_count: usize,
|
||||
/// Availability zone id in which the node resides
|
||||
az: AvailabilityZone,
|
||||
|
||||
@@ -47,6 +50,12 @@ pub(crate) trait NodeSchedulingScore: Debug + Ord + Copy + Sized {
|
||||
preferred_az: &Option<AvailabilityZone>,
|
||||
context: &ScheduleContext,
|
||||
) -> Option<Self>;
|
||||
|
||||
/// Return a score that drops any components based on node utilization: this is useful
|
||||
/// for finding scores for scheduling optimisation, when we want to avoid rescheduling
|
||||
/// shards due to e.g. disk usage, to avoid flapping.
|
||||
fn for_optimization(&self) -> Self;
|
||||
|
||||
fn is_overloaded(&self) -> bool;
|
||||
fn node_id(&self) -> NodeId;
|
||||
}
|
||||
@@ -136,17 +145,13 @@ impl PartialOrd for SecondaryAzMatch {
|
||||
/// Ordering is given by member declaration order (top to bottom).
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
|
||||
pub(crate) struct NodeAttachmentSchedulingScore {
|
||||
/// The number of shards belonging to the tenant currently being
|
||||
/// scheduled that are attached to this node.
|
||||
affinity_score: AffinityScore,
|
||||
/// Flag indicating whether this node matches the preferred AZ
|
||||
/// of the shard. For equal affinity scores, nodes in the matching AZ
|
||||
/// are considered first.
|
||||
az_match: AttachmentAzMatch,
|
||||
/// Size of [`ScheduleContext::attached_nodes`] for the current node.
|
||||
/// This normally tracks the number of attached shards belonging to the
|
||||
/// tenant being scheduled that are already on this node.
|
||||
attached_shards_in_context: usize,
|
||||
/// The number of shards belonging to the tenant currently being
|
||||
/// scheduled that are attached to this node.
|
||||
affinity_score: AffinityScore,
|
||||
/// Utilisation score that combines shard count and disk utilisation
|
||||
utilization_score: u64,
|
||||
/// Total number of shards attached to this node. When nodes have identical utilisation, this
|
||||
@@ -177,13 +182,25 @@ impl NodeSchedulingScore for NodeAttachmentSchedulingScore {
|
||||
.copied()
|
||||
.unwrap_or(AffinityScore::FREE),
|
||||
az_match: AttachmentAzMatch(AzMatch::new(&node.az, preferred_az.as_ref())),
|
||||
attached_shards_in_context: context.attached_nodes.get(node_id).copied().unwrap_or(0),
|
||||
utilization_score: utilization.cached_score(),
|
||||
total_attached_shard_count: node.attached_shard_count,
|
||||
node_id: *node_id,
|
||||
})
|
||||
}
|
||||
|
||||
/// For use in scheduling optimisation, where we only want to consider the aspects
|
||||
/// of the score that can only be resolved by moving things (such as inter-shard affinity
|
||||
/// and AZ affinity), and ignore aspects that reflect the total utilization of a node (which
|
||||
/// can fluctuate for other reasons)
|
||||
fn for_optimization(&self) -> Self {
|
||||
Self {
|
||||
utilization_score: 0,
|
||||
total_attached_shard_count: 0,
|
||||
node_id: NodeId(0),
|
||||
..*self
|
||||
}
|
||||
}
|
||||
|
||||
fn is_overloaded(&self) -> bool {
|
||||
PageserverUtilization::is_overloaded(self.utilization_score)
|
||||
}
|
||||
@@ -208,9 +225,9 @@ pub(crate) struct NodeSecondarySchedulingScore {
|
||||
affinity_score: AffinityScore,
|
||||
/// Utilisation score that combines shard count and disk utilisation
|
||||
utilization_score: u64,
|
||||
/// Total number of shards attached to this node. When nodes have identical utilisation, this
|
||||
/// acts as an anti-affinity between attached shards.
|
||||
total_attached_shard_count: usize,
|
||||
/// Anti-affinity with other non-home locations: this gives the behavior that secondaries
|
||||
/// will spread out across the nodes in an AZ.
|
||||
total_non_home_shard_count: usize,
|
||||
/// Convenience to make selection deterministic in tests and empty systems
|
||||
node_id: NodeId,
|
||||
}
|
||||
@@ -237,11 +254,20 @@ impl NodeSchedulingScore for NodeSecondarySchedulingScore {
|
||||
.copied()
|
||||
.unwrap_or(AffinityScore::FREE),
|
||||
utilization_score: utilization.cached_score(),
|
||||
total_attached_shard_count: node.attached_shard_count,
|
||||
total_non_home_shard_count: (node.shard_count - node.home_shard_count),
|
||||
node_id: *node_id,
|
||||
})
|
||||
}
|
||||
|
||||
fn for_optimization(&self) -> Self {
|
||||
Self {
|
||||
utilization_score: 0,
|
||||
total_non_home_shard_count: 0,
|
||||
node_id: NodeId(0),
|
||||
..*self
|
||||
}
|
||||
}
|
||||
|
||||
fn is_overloaded(&self) -> bool {
|
||||
PageserverUtilization::is_overloaded(self.utilization_score)
|
||||
}
|
||||
@@ -293,6 +319,10 @@ impl AffinityScore {
|
||||
pub(crate) fn inc(&mut self) {
|
||||
self.0 += 1;
|
||||
}
|
||||
|
||||
pub(crate) fn dec(&mut self) {
|
||||
self.0 -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::Add for AffinityScore {
|
||||
@@ -324,9 +354,6 @@ pub(crate) struct ScheduleContext {
|
||||
/// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`]
|
||||
pub(crate) nodes: HashMap<NodeId, AffinityScore>,
|
||||
|
||||
/// Specifically how many _attached_ locations are on each node
|
||||
pub(crate) attached_nodes: HashMap<NodeId, usize>,
|
||||
|
||||
pub(crate) mode: ScheduleMode,
|
||||
}
|
||||
|
||||
@@ -334,7 +361,6 @@ impl ScheduleContext {
|
||||
pub(crate) fn new(mode: ScheduleMode) -> Self {
|
||||
Self {
|
||||
nodes: HashMap::new(),
|
||||
attached_nodes: HashMap::new(),
|
||||
mode,
|
||||
}
|
||||
}
|
||||
@@ -348,25 +374,31 @@ impl ScheduleContext {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn push_attached(&mut self, node_id: NodeId) {
|
||||
let entry = self.attached_nodes.entry(node_id).or_default();
|
||||
*entry += 1;
|
||||
}
|
||||
|
||||
pub(crate) fn get_node_affinity(&self, node_id: NodeId) -> AffinityScore {
|
||||
self.nodes
|
||||
.get(&node_id)
|
||||
.copied()
|
||||
.unwrap_or(AffinityScore::FREE)
|
||||
}
|
||||
|
||||
pub(crate) fn get_node_attachments(&self, node_id: NodeId) -> usize {
|
||||
self.attached_nodes.get(&node_id).copied().unwrap_or(0)
|
||||
/// Remove `shard`'s contributions to this context. This is useful when considering scheduling
|
||||
/// this shard afresh, where we don't want it to e.g. experience anti-affinity to its current location.
|
||||
pub(crate) fn project_detach(&self, shard: &TenantShard) -> Self {
|
||||
let mut new_context = self.clone();
|
||||
|
||||
if let Some(attached) = shard.intent.get_attached() {
|
||||
if let Some(score) = new_context.nodes.get_mut(attached) {
|
||||
score.dec();
|
||||
}
|
||||
}
|
||||
|
||||
for secondary in shard.intent.get_secondary() {
|
||||
if let Some(score) = new_context.nodes.get_mut(secondary) {
|
||||
score.dec();
|
||||
}
|
||||
}
|
||||
|
||||
new_context
|
||||
}
|
||||
|
||||
/// For test, track the sum of AffinityScore values, which is effectively how many
|
||||
/// attached or secondary locations have been registered with this context.
|
||||
#[cfg(test)]
|
||||
pub(crate) fn attach_count(&self) -> usize {
|
||||
self.attached_nodes.values().sum()
|
||||
pub(crate) fn location_count(&self) -> usize {
|
||||
self.nodes.values().map(|i| i.0).sum()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -388,6 +420,7 @@ impl Scheduler {
|
||||
SchedulerNode {
|
||||
shard_count: 0,
|
||||
attached_shard_count: 0,
|
||||
home_shard_count: 0,
|
||||
may_schedule: node.may_schedule(),
|
||||
az: node.get_availability_zone_id().clone(),
|
||||
},
|
||||
@@ -415,6 +448,7 @@ impl Scheduler {
|
||||
SchedulerNode {
|
||||
shard_count: 0,
|
||||
attached_shard_count: 0,
|
||||
home_shard_count: 0,
|
||||
may_schedule: node.may_schedule(),
|
||||
az: node.get_availability_zone_id().clone(),
|
||||
},
|
||||
@@ -427,6 +461,9 @@ impl Scheduler {
|
||||
Some(node) => {
|
||||
node.shard_count += 1;
|
||||
node.attached_shard_count += 1;
|
||||
if Some(&node.az) == shard.preferred_az() {
|
||||
node.home_shard_count += 1;
|
||||
}
|
||||
}
|
||||
None => anyhow::bail!(
|
||||
"Tenant {} references nonexistent node {}",
|
||||
@@ -438,7 +475,12 @@ impl Scheduler {
|
||||
|
||||
for node_id in shard.intent.get_secondary() {
|
||||
match expect_nodes.get_mut(node_id) {
|
||||
Some(node) => node.shard_count += 1,
|
||||
Some(node) => {
|
||||
node.shard_count += 1;
|
||||
if Some(&node.az) == shard.preferred_az() {
|
||||
node.home_shard_count += 1;
|
||||
}
|
||||
}
|
||||
None => anyhow::bail!(
|
||||
"Tenant {} references nonexistent node {}",
|
||||
shard.tenant_shard_id,
|
||||
@@ -482,13 +524,20 @@ impl Scheduler {
|
||||
///
|
||||
/// It is an error to call this for a node that is not known to the scheduler (i.e. passed into
|
||||
/// [`Self::new`] or [`Self::node_upsert`])
|
||||
pub(crate) fn update_node_ref_counts(&mut self, node_id: NodeId, update: RefCountUpdate) {
|
||||
pub(crate) fn update_node_ref_counts(
|
||||
&mut self,
|
||||
node_id: NodeId,
|
||||
preferred_az: Option<&AvailabilityZone>,
|
||||
update: RefCountUpdate,
|
||||
) {
|
||||
let Some(node) = self.nodes.get_mut(&node_id) else {
|
||||
debug_assert!(false);
|
||||
tracing::error!("Scheduler missing node {node_id}");
|
||||
return;
|
||||
};
|
||||
|
||||
let is_home_az = Some(&node.az) == preferred_az;
|
||||
|
||||
match update {
|
||||
RefCountUpdate::PromoteSecondary => {
|
||||
node.attached_shard_count += 1;
|
||||
@@ -496,19 +545,31 @@ impl Scheduler {
|
||||
RefCountUpdate::Attach => {
|
||||
node.shard_count += 1;
|
||||
node.attached_shard_count += 1;
|
||||
if is_home_az {
|
||||
node.home_shard_count += 1;
|
||||
}
|
||||
}
|
||||
RefCountUpdate::Detach => {
|
||||
node.shard_count -= 1;
|
||||
node.attached_shard_count -= 1;
|
||||
if is_home_az {
|
||||
node.home_shard_count -= 1;
|
||||
}
|
||||
}
|
||||
RefCountUpdate::DemoteAttached => {
|
||||
node.attached_shard_count -= 1;
|
||||
}
|
||||
RefCountUpdate::AddSecondary => {
|
||||
node.shard_count += 1;
|
||||
if is_home_az {
|
||||
node.home_shard_count += 1;
|
||||
}
|
||||
}
|
||||
RefCountUpdate::RemoveSecondary => {
|
||||
node.shard_count -= 1;
|
||||
if is_home_az {
|
||||
node.home_shard_count -= 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -594,6 +655,7 @@ impl Scheduler {
|
||||
entry.insert(SchedulerNode {
|
||||
shard_count: 0,
|
||||
attached_shard_count: 0,
|
||||
home_shard_count: 0,
|
||||
may_schedule: node.may_schedule(),
|
||||
az: node.get_availability_zone_id().clone(),
|
||||
});
|
||||
@@ -607,33 +669,20 @@ impl Scheduler {
|
||||
}
|
||||
}
|
||||
|
||||
/// Where we have several nodes to choose from, for example when picking a secondary location
|
||||
/// to promote to an attached location, this method may be used to pick the best choice based
|
||||
/// on the scheduler's knowledge of utilization and availability.
|
||||
///
|
||||
/// If the input is empty, or all the nodes are not eligible for scheduling, return None: the
|
||||
/// caller can pick a node some other way.
|
||||
pub(crate) fn node_preferred(&self, nodes: &[NodeId]) -> Option<NodeId> {
|
||||
if nodes.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
// TODO: When the utilization score returned by the pageserver becomes meaningful,
|
||||
// schedule based on that instead of the shard count.
|
||||
let node = nodes
|
||||
.iter()
|
||||
.map(|node_id| {
|
||||
let may_schedule = self
|
||||
.nodes
|
||||
.get(node_id)
|
||||
.map(|n| !matches!(n.may_schedule, MaySchedule::No))
|
||||
.unwrap_or(false);
|
||||
(*node_id, may_schedule)
|
||||
})
|
||||
.max_by_key(|(_n, may_schedule)| *may_schedule);
|
||||
|
||||
// If even the preferred node has may_schedule==false, return None
|
||||
node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None })
|
||||
/// Calculate a single node's score, used in optimizer logic to compare specific
|
||||
/// nodes' scores.
|
||||
pub(crate) fn compute_node_score<Score>(
|
||||
&mut self,
|
||||
node_id: NodeId,
|
||||
preferred_az: &Option<AvailabilityZone>,
|
||||
context: &ScheduleContext,
|
||||
) -> Option<Score>
|
||||
where
|
||||
Score: NodeSchedulingScore,
|
||||
{
|
||||
self.nodes
|
||||
.get_mut(&node_id)
|
||||
.and_then(|node| Score::generate(&node_id, node, preferred_az, context))
|
||||
}
|
||||
|
||||
/// Compute a scheduling score for each node that the scheduler knows of
|
||||
@@ -727,7 +776,7 @@ impl Scheduler {
|
||||
tracing::info!(
|
||||
"scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})",
|
||||
scores.iter().map(|i| i.node_id().0).collect::<Vec<_>>()
|
||||
);
|
||||
);
|
||||
}
|
||||
|
||||
// Note that we do not update shard count here to reflect the scheduling: that
|
||||
@@ -743,47 +792,74 @@ impl Scheduler {
|
||||
}
|
||||
|
||||
/// For choosing which AZ to schedule a new shard into, use this. It will return the
|
||||
/// AZ with the lowest median utilization.
|
||||
/// AZ with the lowest number of shards currently scheduled in this AZ as their home
|
||||
/// location.
|
||||
///
|
||||
/// We use an AZ-wide measure rather than simply selecting the AZ of the least-loaded
|
||||
/// node, because while tenants start out single sharded, when they grow and undergo
|
||||
/// shard-split, they will occupy space on many nodes within an AZ.
|
||||
/// shard-split, they will occupy space on many nodes within an AZ. It is important
|
||||
/// that we pick the AZ in a way that balances this _future_ load.
|
||||
///
|
||||
/// We use median rather than total free space or mean utilization, because
|
||||
/// we wish to avoid preferring AZs that have low-load nodes resulting from
|
||||
/// recent replacements.
|
||||
///
|
||||
/// The practical result is that we will pick an AZ based on its median node, and
|
||||
/// then actually _schedule_ the new shard onto the lowest-loaded node in that AZ.
|
||||
/// Once we've picked an AZ, subsequent scheduling within that AZ will be driven by
|
||||
/// nodes' utilization scores.
|
||||
pub(crate) fn get_az_for_new_tenant(&self) -> Option<AvailabilityZone> {
|
||||
if self.nodes.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut scores_by_az = HashMap::new();
|
||||
for (node_id, node) in &self.nodes {
|
||||
let az_scores = scores_by_az.entry(&node.az).or_insert_with(Vec::new);
|
||||
let score = match &node.may_schedule {
|
||||
MaySchedule::Yes(utilization) => utilization.score(),
|
||||
MaySchedule::No => PageserverUtilization::full().score(),
|
||||
};
|
||||
az_scores.push((node_id, node, score));
|
||||
#[derive(Default)]
|
||||
struct AzScore {
|
||||
home_shard_count: usize,
|
||||
scheduleable: bool,
|
||||
}
|
||||
|
||||
// Sort by utilization. Also include the node ID to break ties.
|
||||
for scores in scores_by_az.values_mut() {
|
||||
scores.sort_by_key(|i| (i.2, i.0));
|
||||
let mut azs: HashMap<&AvailabilityZone, AzScore> = HashMap::new();
|
||||
for node in self.nodes.values() {
|
||||
let az = azs.entry(&node.az).or_default();
|
||||
az.home_shard_count += node.home_shard_count;
|
||||
az.scheduleable |= matches!(node.may_schedule, MaySchedule::Yes(_));
|
||||
}
|
||||
|
||||
let mut median_by_az = scores_by_az
|
||||
// If any AZs are schedulable, then filter out the non-schedulable ones (i.e. AZs where
|
||||
// all nodes are overloaded or otherwise unschedulable).
|
||||
if azs.values().any(|i| i.scheduleable) {
|
||||
azs.retain(|_, i| i.scheduleable);
|
||||
}
|
||||
|
||||
// Find the AZ with the lowest number of shards currently allocated
|
||||
Some(
|
||||
azs.into_iter()
|
||||
.min_by_key(|i| (i.1.home_shard_count, i.0))
|
||||
.unwrap()
|
||||
.0
|
||||
.clone(),
|
||||
)
|
||||
}
|
||||
|
||||
/// Look up the availability zone recorded for `node_id`.
///
/// Returns a clone of the node's AZ, or `None` when the scheduler has no entry
/// for that node.
pub(crate) fn get_node_az(&self, node_id: &NodeId) -> Option<AvailabilityZone> {
    let node = self.nodes.get(node_id)?;
    Some(node.az.clone())
}
|
||||
|
||||
/// For use when choosing a preferred secondary location: filter out nodes that are not
|
||||
/// available, and gather their AZs.
|
||||
pub(crate) fn filter_usable_nodes(
|
||||
&self,
|
||||
nodes: &[NodeId],
|
||||
) -> Vec<(NodeId, Option<AvailabilityZone>)> {
|
||||
nodes
|
||||
.iter()
|
||||
.map(|(az, nodes)| (*az, nodes.get(nodes.len() / 2).unwrap().2))
|
||||
.collect::<Vec<_>>();
|
||||
// Sort by utilization. Also include the AZ to break ties.
|
||||
median_by_az.sort_by_key(|i| (i.1, i.0));
|
||||
|
||||
// Return the AZ with the lowest median utilization
|
||||
Some(median_by_az.first().unwrap().0.clone())
|
||||
.filter_map(|node_id| {
|
||||
let node = self
|
||||
.nodes
|
||||
.get(node_id)
|
||||
.expect("Referenced nodes always exist");
|
||||
if matches!(node.may_schedule, MaySchedule::Yes(_)) {
|
||||
Some((*node_id, Some(node.az.clone())))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Unit test access to internal state
|
||||
@@ -796,6 +872,33 @@ impl Scheduler {
|
||||
/// Return the attached shard count that the scheduler currently tracks for `node_id`.
///
/// Panics if the scheduler has no entry for `node_id`.
pub(crate) fn get_node_attached_shard_count(&self, node_id: NodeId) -> usize {
    let node = self.nodes.get(&node_id).unwrap();
    node.attached_shard_count
}
|
||||
|
||||
/// Some metrics that we only calculate periodically: this is simpler than
|
||||
/// rigorously updating them on every change.
|
||||
pub(crate) fn update_metrics(&self) {
|
||||
for (node_id, node) in &self.nodes {
|
||||
let node_id_str = format!("{}", node_id);
|
||||
let label_group = NodeLabelGroup {
|
||||
az: &node.az.0,
|
||||
node_id: &node_id_str,
|
||||
};
|
||||
|
||||
crate::metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_node_shards
|
||||
.set(label_group.clone(), node.shard_count as i64);
|
||||
|
||||
crate::metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_node_attached_shards
|
||||
.set(label_group.clone(), node.attached_shard_count as i64);
|
||||
|
||||
crate::metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_node_home_shards
|
||||
.set(label_group.clone(), node.home_shard_count as i64);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -843,7 +946,14 @@ pub(crate) mod test_utils {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use pageserver_api::{controller_api::NodeAvailability, models::utilization::test_utilization};
|
||||
use pageserver_api::{
|
||||
controller_api::NodeAvailability, models::utilization::test_utilization,
|
||||
shard::ShardIdentity,
|
||||
};
|
||||
use utils::{
|
||||
id::TenantId,
|
||||
shard::{ShardCount, ShardNumber, TenantShardId},
|
||||
};
|
||||
|
||||
use super::*;
|
||||
|
||||
@@ -853,8 +963,8 @@ mod tests {
|
||||
let nodes = test_utils::make_test_nodes(2, &[]);
|
||||
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
let mut t1_intent = IntentState::new();
|
||||
let mut t2_intent = IntentState::new();
|
||||
let mut t1_intent = IntentState::new(None);
|
||||
let mut t2_intent = IntentState::new(None);
|
||||
|
||||
let context = ScheduleContext::default();
|
||||
|
||||
@@ -930,7 +1040,7 @@ mod tests {
|
||||
let scheduled = scheduler
|
||||
.schedule_shard::<AttachedShardTag>(&[], &None, context)
|
||||
.unwrap();
|
||||
let mut intent = IntentState::new();
|
||||
let mut intent = IntentState::new(None);
|
||||
intent.set_attached(scheduler, Some(scheduled));
|
||||
scheduled_intents.push(intent);
|
||||
assert_eq!(scheduled, expect_node);
|
||||
@@ -1063,7 +1173,7 @@ mod tests {
|
||||
let scheduled = scheduler
|
||||
.schedule_shard::<Tag>(&[], &preferred_az, context)
|
||||
.unwrap();
|
||||
let mut intent = IntentState::new();
|
||||
let mut intent = IntentState::new(preferred_az.clone());
|
||||
intent.set_attached(scheduler, Some(scheduled));
|
||||
scheduled_intents.push(intent);
|
||||
assert_eq!(scheduled, expect_node);
|
||||
@@ -1089,9 +1199,9 @@ mod tests {
|
||||
&mut context,
|
||||
);
|
||||
|
||||
// Node 2 is not in "az-a", but it has the lowest affinity so we prefer that.
|
||||
// Node 1 and 3 (az-a) have same affinity score, so prefer the lowest node id.
|
||||
assert_scheduler_chooses::<AttachedShardTag>(
|
||||
NodeId(2),
|
||||
NodeId(1),
|
||||
Some(az_a_tag.clone()),
|
||||
&mut scheduled_intents,
|
||||
&mut scheduler,
|
||||
@@ -1107,26 +1217,6 @@ mod tests {
|
||||
&mut context,
|
||||
);
|
||||
|
||||
// Avoid nodes in "az-b" for the secondary location.
|
||||
// Nodes 1 and 3 are identically loaded, so prefer the lowest node id.
|
||||
assert_scheduler_chooses::<SecondaryShardTag>(
|
||||
NodeId(1),
|
||||
Some(az_b_tag.clone()),
|
||||
&mut scheduled_intents,
|
||||
&mut scheduler,
|
||||
&mut context,
|
||||
);
|
||||
|
||||
// Avoid nodes in "az-b" for the secondary location.
|
||||
// Node 3 has lower affinity score than 1, so prefer that.
|
||||
assert_scheduler_chooses::<SecondaryShardTag>(
|
||||
NodeId(3),
|
||||
Some(az_b_tag.clone()),
|
||||
&mut scheduled_intents,
|
||||
&mut scheduler,
|
||||
&mut context,
|
||||
);
|
||||
|
||||
for mut intent in scheduled_intents {
|
||||
intent.clear(&mut scheduler);
|
||||
}
|
||||
@@ -1150,34 +1240,292 @@ mod tests {
|
||||
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
|
||||
/// Force the utilization of a node in Scheduler's state to a particular
|
||||
/// number of bytes used.
|
||||
fn set_utilization(scheduler: &mut Scheduler, node_id: NodeId, shard_count: u32) {
|
||||
let mut node = Node::new(
|
||||
node_id,
|
||||
"".to_string(),
|
||||
0,
|
||||
"".to_string(),
|
||||
0,
|
||||
scheduler.nodes.get(&node_id).unwrap().az.clone(),
|
||||
);
|
||||
node.set_availability(NodeAvailability::Active(test_utilization::simple(
|
||||
shard_count,
|
||||
0,
|
||||
)));
|
||||
scheduler.node_upsert(&node);
|
||||
/// Force the `home_shard_count` of a node directly: this is the metric used
|
||||
/// by the scheduler when picking AZs.
|
||||
fn set_shard_count(scheduler: &mut Scheduler, node_id: NodeId, shard_count: usize) {
|
||||
let node = scheduler.nodes.get_mut(&node_id).unwrap();
|
||||
node.home_shard_count = shard_count;
|
||||
}
|
||||
|
||||
// Initial empty state. Scores are tied, scheduler prefers lower AZ ID.
|
||||
assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone()));
|
||||
|
||||
// Put some utilization on one node in AZ A: this should change nothing, as the median hasn't changed
|
||||
set_utilization(&mut scheduler, NodeId(1), 1000000);
|
||||
assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone()));
|
||||
|
||||
// Put some utilization on a second node in AZ A: now the median has changed, so the scheduler
|
||||
// should prefer the other AZ.
|
||||
set_utilization(&mut scheduler, NodeId(2), 1000000);
|
||||
// Home shard count is higher in AZ A, so AZ B will be preferred
|
||||
set_shard_count(&mut scheduler, NodeId(1), 10);
|
||||
assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_b_tag.clone()));
|
||||
|
||||
// Total home shard count is higher in AZ B, so we revert to preferring AZ A
|
||||
set_shard_count(&mut scheduler, NodeId(4), 6);
|
||||
set_shard_count(&mut scheduler, NodeId(5), 6);
|
||||
assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone()));
|
||||
}
|
||||
|
||||
/// Check that choosing a home AZ for a long series of new tenants yields the expected
/// balance of home shards across nodes.
#[test]
fn az_selection_many() {
    // Six nodes laid out round-robin over three AZs: a, b, c, a, b, c.
    let node_azs: Vec<AvailabilityZone> = ["az-a", "az-b", "az-c"]
        .iter()
        .cycle()
        .take(6)
        .map(|name| AvailabilityZone(name.to_string()))
        .collect();
    let nodes = test_utils::make_test_nodes(6, &node_azs);

    let mut scheduler = Scheduler::new(nodes.values());

    // Each node should end up as home for 1/6th of these, give or take a few...
    let total_tenants = 300;

    // ...where the 'few' is the number of AZs: scheduling sometimes overshoots on one AZ
    // before correcting itself, because the 'home' AZ is chosen from an AZ-wide metric
    // while secondaries are placed on a purely node-based metric (excluding the home AZ).
    let grace = 3;

    let mut scheduled_shards = Vec::new();
    for _ in 0..total_tenants {
        let preferred_az = scheduler.get_az_for_new_tenant().unwrap();

        // Log the per-node home counts that the AZ choice was made against.
        let mut node_home_counts: Vec<_> = scheduler
            .nodes
            .iter()
            .map(|(id, state)| (id, state.home_shard_count))
            .collect();
        node_home_counts.sort_by_key(|entry| entry.0);
        eprintln!("Selected {}, vs nodes {:?}", preferred_az, node_home_counts);

        // A fresh single-sharded tenant whose preferred AZ is the one just selected.
        let tenant_shard_id = TenantShardId {
            tenant_id: TenantId::generate(),
            shard_number: ShardNumber(0),
            shard_count: ShardCount(1),
        };
        let shard_identity = ShardIdentity::new(
            tenant_shard_id.shard_number,
            tenant_shard_id.shard_count,
            pageserver_api::shard::ShardStripeSize(1),
        )
        .unwrap();
        let mut shard = TenantShard::new(
            tenant_shard_id,
            shard_identity,
            pageserver_api::controller_api::PlacementPolicy::Attached(1),
            Some(preferred_az),
        );

        let mut context = ScheduleContext::default();
        shard.schedule(&mut scheduler, &mut context).unwrap();
        eprintln!("Scheduled shard at {:?}", shard.intent);

        scheduled_shards.push(shard);
    }

    for (node_id, node) in &scheduler.nodes {
        eprintln!(
            "Node {}: {} {} {}",
            node_id, node.shard_count, node.attached_shard_count, node.home_shard_count
        );
    }

    // Every node's home shard count must be within `grace` of an even six-way split.
    let expected_per_node = total_tenants as i64 / 6;
    for node in scheduler.nodes.values() {
        assert!((node.home_shard_count as i64 - expected_per_node).abs() < grace);
    }

    // Release the scheduler references held by each shard's intent.
    for mut shard in scheduled_shards {
        shard.intent.clear(&mut scheduler);
    }
}
|
||||
|
||||
#[test]
/// Make sure that when we have an odd number of nodes (five per AZ here) and an even
/// number of shards (eight), the initial scheduling is stable: the optimizer must not
/// suggest any change afterwards.
fn odd_nodes_stability() {
    let az_a = AvailabilityZone("az-a".to_string());
    let az_b = AvailabilityZone("az-b".to_string());

    // Nodes 1-5 live in az-a, nodes 6-10 in az-b.
    let node_azs: Vec<AvailabilityZone> = std::iter::repeat(az_a.clone())
        .take(5)
        .chain(std::iter::repeat(az_b.clone()).take(5))
        .collect();
    let nodes = test_utils::make_test_nodes(10, &node_azs);
    let mut scheduler = Scheduler::new(nodes.values());

    // Need to keep these alive because they contribute to shard counts via RAII
    let mut scheduled_shards = Vec::new();

    let mut context = ScheduleContext::default();

    /// Schedule one shard, check where its attached and first secondary locations
    /// landed, and hand the shard back so the caller can keep it alive.
    fn schedule_and_check(
        tenant_shard_id: TenantShardId,
        expect_attached: NodeId,
        expect_secondary: NodeId,
        preferred_az: Option<AvailabilityZone>,
        scheduler: &mut Scheduler,
        context: &mut ScheduleContext,
    ) -> TenantShard {
        let shard_identity = ShardIdentity::new(
            tenant_shard_id.shard_number,
            tenant_shard_id.shard_count,
            pageserver_api::shard::ShardStripeSize(1),
        )
        .unwrap();
        let mut shard = TenantShard::new(
            tenant_shard_id,
            shard_identity,
            pageserver_api::controller_api::PlacementPolicy::Attached(1),
            preferred_az,
        );

        shard.schedule(scheduler, context).unwrap();

        assert_eq!(shard.intent.get_attached().unwrap(), expect_attached);
        assert_eq!(
            shard.intent.get_secondary().first().unwrap(),
            &expect_secondary
        );

        shard
    }

    let tenant_id = TenantId::generate();

    // Expected (attached, secondary) node ids for shard numbers 0..8, in scheduling order.
    let expected_placements: [(u64, u64); 8] = [
        (1, 6),
        (2, 7),
        (3, 8),
        (4, 9),
        (5, 10),
        (1, 6),
        (2, 7),
        (3, 8),
    ];

    for (index, &(attached, secondary)) in expected_placements.iter().enumerate() {
        let tenant_shard_id = TenantShardId {
            tenant_id,
            shard_number: ShardNumber(index as u8),
            shard_count: ShardCount(8),
        };
        let shard = schedule_and_check(
            tenant_shard_id,
            NodeId(attached),
            NodeId(secondary),
            Some(az_a.clone()),
            &mut scheduler,
            &mut context,
        );
        scheduled_shards.push(shard);
    }

    // Assert that the optimizer suggests no changes, i.e. our initial scheduling was stable.
    for shard in &scheduled_shards {
        assert_eq!(shard.optimize_attachment(&mut scheduler, &context), None);
    }

    // Release the scheduler references held by each shard's intent.
    for mut shard in scheduled_shards {
        shard.intent.clear(&mut scheduler);
    }
}
|
||||
}
|
||||
|
||||
@@ -1404,7 +1404,11 @@ impl Service {
|
||||
|
||||
// We will populate intent properly later in [`Self::startup_reconcile`], initially populate
|
||||
// it with what we can infer: the node for which a generation was most recently issued.
|
||||
let mut intent = IntentState::new();
|
||||
let mut intent = IntentState::new(
|
||||
tsp.preferred_az_id
|
||||
.as_ref()
|
||||
.map(|az| AvailabilityZone(az.clone())),
|
||||
);
|
||||
if let Some(generation_pageserver) = tsp.generation_pageserver.map(|n| NodeId(n as u64))
|
||||
{
|
||||
if nodes.contains_key(&generation_pageserver) {
|
||||
@@ -2474,18 +2478,29 @@ impl Service {
|
||||
tenant_id: TenantId,
|
||||
_guard: &TracingExclusiveGuard<TenantOperations>,
|
||||
) -> Result<(), ApiError> {
|
||||
let present_in_memory = {
|
||||
// Check if the tenant is present in memory, and select an AZ to use when loading
|
||||
// if we will load it.
|
||||
let load_in_az = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
locked
|
||||
let existing = locked
|
||||
.tenants
|
||||
.range(TenantShardId::tenant_range(tenant_id))
|
||||
.next()
|
||||
.is_some()
|
||||
};
|
||||
.next();
|
||||
|
||||
if present_in_memory {
|
||||
return Ok(());
|
||||
}
|
||||
// If the tenant is not present in memory, we expect to load it from database,
|
||||
// so let's figure out what AZ to load it into while we have self.inner locked.
|
||||
if existing.is_none() {
|
||||
locked
|
||||
.scheduler
|
||||
.get_az_for_new_tenant()
|
||||
.ok_or(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"No AZ with nodes found to load tenant"
|
||||
)))?
|
||||
} else {
|
||||
// We already have this tenant in memory
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
|
||||
let tenant_shards = self.persistence.load_tenant(tenant_id).await?;
|
||||
if tenant_shards.is_empty() {
|
||||
@@ -2494,8 +2509,20 @@ impl Service {
|
||||
));
|
||||
}
|
||||
|
||||
// TODO: choose a fresh AZ to use for this tenant when un-detaching: there definitely isn't a running
|
||||
// compute, so no benefit to making AZ sticky across detaches.
|
||||
// Update the persistent shards with the AZ that we are about to apply to in-memory state
|
||||
self.persistence
|
||||
.set_tenant_shard_preferred_azs(
|
||||
tenant_shards
|
||||
.iter()
|
||||
.map(|t| {
|
||||
(
|
||||
t.get_tenant_shard_id().expect("Corrupt shard in database"),
|
||||
Some(load_in_az.clone()),
|
||||
)
|
||||
})
|
||||
.collect(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
tracing::info!(
|
||||
@@ -2505,7 +2532,7 @@ impl Service {
|
||||
);
|
||||
|
||||
locked.tenants.extend(tenant_shards.into_iter().map(|p| {
|
||||
let intent = IntentState::new();
|
||||
let intent = IntentState::new(Some(load_in_az.clone()));
|
||||
let shard =
|
||||
TenantShard::from_persistent(p, intent).expect("Corrupt shard row in database");
|
||||
|
||||
@@ -4131,17 +4158,42 @@ impl Service {
|
||||
.ok_or_else(|| ApiError::NotFound(anyhow::anyhow!("Tenant {tenant_id} not found").into()))
|
||||
}
|
||||
|
||||
pub(crate) fn tenant_list(&self) -> Vec<TenantDescribeResponse> {
|
||||
/// limit & offset are pagination parameters. Since we are walking an in-memory HashMap, `offset` does not
|
||||
/// avoid traversing data, it just avoids returning it. This is suitable for our purposes, since our in-memory
|
||||
/// maps are small enough to traverse fast, our pagination is just to avoid serializing huge JSON responses
|
||||
/// in our external API.
|
||||
pub(crate) fn tenant_list(
|
||||
&self,
|
||||
limit: Option<usize>,
|
||||
start_after: Option<TenantId>,
|
||||
) -> Vec<TenantDescribeResponse> {
|
||||
let locked = self.inner.read().unwrap();
|
||||
|
||||
// Apply start_after parameter
|
||||
let shard_range = match start_after {
|
||||
None => locked.tenants.range(..),
|
||||
Some(tenant_id) => locked.tenants.range(
|
||||
TenantShardId {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(u8::MAX),
|
||||
shard_count: ShardCount(u8::MAX),
|
||||
}..,
|
||||
),
|
||||
};
|
||||
|
||||
let mut result = Vec::new();
|
||||
for (_tenant_id, tenant_shards) in
|
||||
&locked.tenants.iter().group_by(|(id, _shard)| id.tenant_id)
|
||||
{
|
||||
for (_tenant_id, tenant_shards) in &shard_range.group_by(|(id, _shard)| id.tenant_id) {
|
||||
result.push(
|
||||
self.tenant_describe_impl(tenant_shards.map(|(_k, v)| v))
|
||||
.expect("Groups are always non-empty"),
|
||||
);
|
||||
|
||||
// Enforce `limit` parameter
|
||||
if let Some(limit) = limit {
|
||||
if result.len() >= limit {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
result
|
||||
@@ -4236,6 +4288,22 @@ impl Service {
|
||||
}
|
||||
|
||||
tracing::info!("Restoring parent shard {tenant_shard_id}");
|
||||
|
||||
// Drop any intents that refer to unavailable nodes, to enable this abort to proceed even
|
||||
// if the original attachment location is offline.
|
||||
if let Some(node_id) = shard.intent.get_attached() {
|
||||
if !nodes.get(node_id).unwrap().is_available() {
|
||||
tracing::info!("Demoting attached intent for {tenant_shard_id} on unavailable node {node_id}");
|
||||
shard.intent.demote_attached(scheduler, *node_id);
|
||||
}
|
||||
}
|
||||
for node_id in shard.intent.get_secondary().clone() {
|
||||
if !nodes.get(&node_id).unwrap().is_available() {
|
||||
tracing::info!("Dropping secondary intent for {tenant_shard_id} on unavailable node {node_id}");
|
||||
shard.intent.remove_secondary(scheduler, node_id);
|
||||
}
|
||||
}
|
||||
|
||||
shard.splitting = SplitState::Idle;
|
||||
if let Err(e) = shard.schedule(scheduler, &mut ScheduleContext::default()) {
|
||||
// If this shard can't be scheduled now (perhaps due to offline nodes or
|
||||
@@ -4389,15 +4457,13 @@ impl Service {
|
||||
|
||||
let mut child_state =
|
||||
TenantShard::new(child, child_shard, policy.clone(), preferred_az.clone());
|
||||
child_state.intent = IntentState::single(scheduler, Some(pageserver));
|
||||
child_state.intent =
|
||||
IntentState::single(scheduler, Some(pageserver), preferred_az.clone());
|
||||
child_state.observed = ObservedState {
|
||||
locations: child_observed,
|
||||
};
|
||||
child_state.generation = Some(generation);
|
||||
child_state.config = config.clone();
|
||||
if let Some(preferred_az) = &preferred_az {
|
||||
child_state.set_preferred_az(preferred_az.clone());
|
||||
}
|
||||
|
||||
// The child's TenantShard::splitting is intentionally left at the default value of Idle,
|
||||
// as at this point in the split process we have succeeded and this part is infallible:
|
||||
@@ -5014,6 +5080,8 @@ impl Service {
|
||||
// If our new attached node was a secondary, it no longer should be.
|
||||
shard.intent.remove_secondary(scheduler, migrate_req.node_id);
|
||||
|
||||
shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
|
||||
|
||||
// If we were already attached to something, demote that to a secondary
|
||||
if let Some(old_attached) = old_attached {
|
||||
if n > 0 {
|
||||
@@ -5025,8 +5093,6 @@ impl Service {
|
||||
shard.intent.push_secondary(scheduler, old_attached);
|
||||
}
|
||||
}
|
||||
|
||||
shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
|
||||
}
|
||||
PlacementPolicy::Secondary => {
|
||||
shard.intent.clear(scheduler);
|
||||
@@ -5055,6 +5121,69 @@ impl Service {
|
||||
Ok(TenantShardMigrateResponse {})
|
||||
}
|
||||
|
||||
pub(crate) async fn tenant_shard_migrate_secondary(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
migrate_req: TenantShardMigrateRequest,
|
||||
) -> Result<TenantShardMigrateResponse, ApiError> {
|
||||
let waiter = {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let (nodes, tenants, scheduler) = locked.parts_mut();
|
||||
|
||||
let Some(node) = nodes.get(&migrate_req.node_id) else {
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Node {} not found",
|
||||
migrate_req.node_id
|
||||
)));
|
||||
};
|
||||
|
||||
if !node.is_available() {
|
||||
// Warn but proceed: the caller may intend to manually adjust the placement of
|
||||
// a shard even if the node is down, e.g. if intervening during an incident.
|
||||
tracing::warn!("Migrating to unavailable node {node}");
|
||||
}
|
||||
|
||||
let Some(shard) = tenants.get_mut(&tenant_shard_id) else {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Tenant shard not found").into(),
|
||||
));
|
||||
};
|
||||
|
||||
if shard.intent.get_secondary().len() == 1
|
||||
&& shard.intent.get_secondary()[0] == migrate_req.node_id
|
||||
{
|
||||
tracing::info!(
|
||||
"Migrating secondary to {node}: intent is unchanged {:?}",
|
||||
shard.intent
|
||||
);
|
||||
} else if shard.intent.get_attached() == &Some(migrate_req.node_id) {
|
||||
tracing::info!("Migrating secondary to {node}: already attached where we were asked to create a secondary");
|
||||
} else {
|
||||
let old_secondaries = shard.intent.get_secondary().clone();
|
||||
for secondary in old_secondaries {
|
||||
shard.intent.remove_secondary(scheduler, secondary);
|
||||
}
|
||||
|
||||
shard.intent.push_secondary(scheduler, migrate_req.node_id);
|
||||
shard.sequence = shard.sequence.next();
|
||||
tracing::info!(
|
||||
"Migrating secondary to {node}: new intent {:?}",
|
||||
shard.intent
|
||||
);
|
||||
}
|
||||
|
||||
self.maybe_reconcile_shard(shard, nodes)
|
||||
};
|
||||
|
||||
if let Some(waiter) = waiter {
|
||||
waiter.wait_timeout(RECONCILE_TIMEOUT).await?;
|
||||
} else {
|
||||
tracing::info!("Migration is a no-op");
|
||||
}
|
||||
|
||||
Ok(TenantShardMigrateResponse {})
|
||||
}
|
||||
|
||||
/// 'cancel' in this context means cancel any ongoing reconcile
|
||||
pub(crate) async fn tenant_shard_cancel_reconcile(
|
||||
&self,
|
||||
@@ -5256,7 +5385,8 @@ impl Service {
|
||||
expect_nodes.sort_by_key(|n| n.node_id);
|
||||
nodes.sort_by_key(|n| n.node_id);
|
||||
|
||||
if nodes != expect_nodes {
|
||||
// Errors relating to nodes are deferred so that we don't skip the shard checks below if we have a node error
|
||||
let node_result = if nodes != expect_nodes {
|
||||
tracing::error!("Consistency check failed on nodes.");
|
||||
tracing::error!(
|
||||
"Nodes in memory: {}",
|
||||
@@ -5268,10 +5398,12 @@ impl Service {
|
||||
serde_json::to_string(&nodes)
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?
|
||||
);
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Node consistency failure"
|
||||
)));
|
||||
}
|
||||
)))
|
||||
} else {
|
||||
Ok(())
|
||||
};
|
||||
|
||||
let mut persistent_shards = self.persistence.load_active_tenant_shards().await?;
|
||||
persistent_shards
|
||||
@@ -5281,6 +5413,7 @@ impl Service {
|
||||
|
||||
if persistent_shards != expect_shards {
|
||||
tracing::error!("Consistency check failed on shards.");
|
||||
|
||||
tracing::error!(
|
||||
"Shards in memory: {}",
|
||||
serde_json::to_string(&expect_shards)
|
||||
@@ -5291,12 +5424,57 @@ impl Service {
|
||||
serde_json::to_string(&persistent_shards)
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?
|
||||
);
|
||||
|
||||
// The total dump log lines above are useful in testing but in the field grafana will
|
||||
// usually just drop them because they're so large. So we also do some explicit logging
|
||||
// of just the diffs.
|
||||
let persistent_shards = persistent_shards
|
||||
.into_iter()
|
||||
.map(|tsp| (tsp.get_tenant_shard_id().unwrap(), tsp))
|
||||
.collect::<HashMap<_, _>>();
|
||||
let expect_shards = expect_shards
|
||||
.into_iter()
|
||||
.map(|tsp| (tsp.get_tenant_shard_id().unwrap(), tsp))
|
||||
.collect::<HashMap<_, _>>();
|
||||
for (tenant_shard_id, persistent_tsp) in &persistent_shards {
|
||||
match expect_shards.get(tenant_shard_id) {
|
||||
None => {
|
||||
tracing::error!(
|
||||
"Shard {} found in database but not in memory",
|
||||
tenant_shard_id
|
||||
);
|
||||
}
|
||||
Some(expect_tsp) => {
|
||||
if expect_tsp != persistent_tsp {
|
||||
tracing::error!(
|
||||
"Shard {} is inconsistent. In memory: {}, database has: {}",
|
||||
tenant_shard_id,
|
||||
serde_json::to_string(expect_tsp).unwrap(),
|
||||
serde_json::to_string(&persistent_tsp).unwrap()
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Having already logged any differences, log any shards that simply aren't present in the database
|
||||
for (tenant_shard_id, memory_tsp) in &expect_shards {
|
||||
if !persistent_shards.contains_key(tenant_shard_id) {
|
||||
tracing::error!(
|
||||
"Shard {} found in memory but not in database: {}",
|
||||
tenant_shard_id,
|
||||
serde_json::to_string(memory_tsp)
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Shard consistency failure"
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
node_result
|
||||
}
|
||||
|
||||
/// For debug/support: a JSON dump of the [`Scheduler`]. Returns a response so that
|
||||
@@ -5600,7 +5778,7 @@ impl Service {
|
||||
register_req.listen_http_port,
|
||||
register_req.listen_pg_addr,
|
||||
register_req.listen_pg_port,
|
||||
register_req.availability_zone_id,
|
||||
register_req.availability_zone_id.clone(),
|
||||
);
|
||||
|
||||
// TODO: idempotency if the node already exists in the database
|
||||
@@ -5620,8 +5798,9 @@ impl Service {
|
||||
.set(locked.nodes.len() as i64);
|
||||
|
||||
tracing::info!(
|
||||
"Registered pageserver {}, now have {} pageservers",
|
||||
"Registered pageserver {} ({}), now have {} pageservers",
|
||||
register_req.node_id,
|
||||
register_req.availability_zone_id,
|
||||
locked.nodes.len()
|
||||
);
|
||||
Ok(())
|
||||
@@ -6236,7 +6415,7 @@ impl Service {
|
||||
/// available. A return value of 0 indicates that everything is fully reconciled already.
|
||||
fn reconcile_all(&self) -> usize {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let (nodes, tenants, _scheduler) = locked.parts_mut();
|
||||
let (nodes, tenants, scheduler) = locked.parts_mut();
|
||||
let pageservers = nodes.clone();
|
||||
|
||||
// This function is an efficient place to update lazy statistics, since we are walking
|
||||
@@ -6297,6 +6476,9 @@ impl Service {
|
||||
}
|
||||
}
|
||||
|
||||
// Some metrics are calculated from SchedulerNode state, update these periodically
|
||||
scheduler.update_metrics();
|
||||
|
||||
// Process any deferred tenant drops
|
||||
for (tenant_id, guard) in drop_detached_tenants {
|
||||
self.maybe_drop_tenant(tenant_id, &mut locked, &guard);
|
||||
@@ -6355,6 +6537,7 @@ impl Service {
|
||||
// Shard was dropped between planning and execution;
|
||||
continue;
|
||||
};
|
||||
tracing::info!(tenant_shard_id=%tenant_shard_id, "Applying optimization: {optimization:?}");
|
||||
if shard.apply_optimization(scheduler, optimization) {
|
||||
optimizations_applied += 1;
|
||||
if self.maybe_reconcile_shard(shard, nodes).is_some() {
|
||||
@@ -6385,7 +6568,13 @@ impl Service {
|
||||
|
||||
let mut work = Vec::new();
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let (nodes, tenants, scheduler) = locked.parts_mut();
|
||||
let (_nodes, tenants, scheduler) = locked.parts_mut();
|
||||
|
||||
// We are going to plan a bunch of optimisations before applying any of them, so the
|
||||
// utilisation stats on nodes will be effectively stale for the >1st optimisation we
|
||||
// generate. To avoid this causing unstable migrations/flapping, it's important that the
|
||||
// code in TenantShard for finding optimisations uses [`NodeAttachmentSchedulingScore::disregard_utilization`]
|
||||
// to ignore the utilisation component of the score.
|
||||
|
||||
for (_tenant_id, schedule_context, shards) in
|
||||
TenantShardContextIterator::new(tenants, ScheduleMode::Speculative)
|
||||
@@ -6416,13 +6605,28 @@ impl Service {
|
||||
continue;
|
||||
}
|
||||
|
||||
// TODO: optimization calculations are relatively expensive: create some fast-path for
|
||||
// the common idle case (avoiding the search on tenants that we have recently checked)
|
||||
// Fast path: we may quickly identify shards that don't have any possible optimisations
|
||||
if !shard.maybe_optimizable(scheduler, &schedule_context) {
|
||||
if cfg!(feature = "testing") {
|
||||
// Check that maybe_optimizable doesn't disagree with the actual optimization functions.
|
||||
// Only do this in testing builds because it is not a correctness-critical check, so we shouldn't
|
||||
// panic in prod if we hit this, or spend cycles on it in prod.
|
||||
assert!(shard
|
||||
.optimize_attachment(scheduler, &schedule_context)
|
||||
.is_none());
|
||||
assert!(shard
|
||||
.optimize_secondary(scheduler, &schedule_context)
|
||||
.is_none());
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Some(optimization) =
|
||||
// If idle, maybe ptimize attachments: if a shard has a secondary location that is preferable to
|
||||
// If idle, maybe optimize attachments: if a shard has a secondary location that is preferable to
|
||||
// its primary location based on soft constraints, cut it over.
|
||||
shard.optimize_attachment(nodes, &schedule_context)
|
||||
shard.optimize_attachment(scheduler, &schedule_context)
|
||||
{
|
||||
tracing::info!(tenant_shard_id=%shard.tenant_shard_id, "Identified optimization for attachment: {optimization:?}");
|
||||
work.push((shard.tenant_shard_id, optimization));
|
||||
break;
|
||||
} else if let Some(optimization) =
|
||||
@@ -6432,6 +6636,7 @@ impl Service {
|
||||
// in the same tenant with secondary locations on the node where they originally split.
|
||||
shard.optimize_secondary(scheduler, &schedule_context)
|
||||
{
|
||||
tracing::info!(tenant_shard_id=%shard.tenant_shard_id, "Identified optimization for secondary: {optimization:?}");
|
||||
work.push((shard.tenant_shard_id, optimization));
|
||||
break;
|
||||
}
|
||||
@@ -6480,8 +6685,10 @@ impl Service {
|
||||
}
|
||||
}
|
||||
}
|
||||
ScheduleOptimizationAction::ReplaceSecondary(_) => {
|
||||
// No extra checks needed to replace a secondary: this does not interrupt client access
|
||||
ScheduleOptimizationAction::ReplaceSecondary(_)
|
||||
| ScheduleOptimizationAction::CreateSecondary(_)
|
||||
| ScheduleOptimizationAction::RemoveSecondary(_) => {
|
||||
// No extra checks needed to manage secondaries: this does not interrupt client access
|
||||
validated_work.push((tenant_shard_id, optimization))
|
||||
}
|
||||
};
|
||||
@@ -6553,26 +6760,35 @@ impl Service {
|
||||
/// we have this helper to move things along faster.
|
||||
#[cfg(feature = "testing")]
|
||||
async fn kick_secondary_download(&self, tenant_shard_id: TenantShardId) {
|
||||
let (attached_node, secondary_node) = {
|
||||
let (attached_node, secondaries) = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
let Some(shard) = locked.tenants.get(&tenant_shard_id) else {
|
||||
tracing::warn!(
|
||||
"Skipping kick of secondary download for {tenant_shard_id}: not found"
|
||||
);
|
||||
return;
|
||||
};
|
||||
let (Some(attached), Some(secondary)) = (
|
||||
shard.intent.get_attached(),
|
||||
shard.intent.get_secondary().first(),
|
||||
) else {
|
||||
|
||||
let Some(attached) = shard.intent.get_attached() else {
|
||||
tracing::warn!(
|
||||
"Skipping kick of secondary download for {tenant_shard_id}: no attached"
|
||||
);
|
||||
return;
|
||||
};
|
||||
(
|
||||
locked.nodes.get(attached).unwrap().clone(),
|
||||
locked.nodes.get(secondary).unwrap().clone(),
|
||||
)
|
||||
|
||||
let secondaries = shard
|
||||
.intent
|
||||
.get_secondary()
|
||||
.iter()
|
||||
.map(|n| locked.nodes.get(n).unwrap().clone())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
(locked.nodes.get(attached).unwrap().clone(), secondaries)
|
||||
};
|
||||
|
||||
// Make remote API calls to upload + download heatmaps: we ignore errors because this is just
|
||||
// a 'kick' to let scheduling optimisation run more promptly.
|
||||
attached_node
|
||||
match attached_node
|
||||
.with_client_retries(
|
||||
|client| async move { client.tenant_heatmap_upload(tenant_shard_id).await },
|
||||
&self.config.jwt_token,
|
||||
@@ -6581,22 +6797,57 @@ impl Service {
|
||||
SHORT_RECONCILE_TIMEOUT,
|
||||
&self.cancel,
|
||||
)
|
||||
.await;
|
||||
.await
|
||||
{
|
||||
Some(Err(e)) => {
|
||||
tracing::info!(
|
||||
"Failed to upload heatmap from {attached_node} for {tenant_shard_id}: {e}"
|
||||
);
|
||||
}
|
||||
None => {
|
||||
tracing::info!(
|
||||
"Cancelled while uploading heatmap from {attached_node} for {tenant_shard_id}"
|
||||
);
|
||||
}
|
||||
Some(Ok(_)) => {
|
||||
tracing::info!(
|
||||
"Successfully uploaded heatmap from {attached_node} for {tenant_shard_id}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
secondary_node
|
||||
.with_client_retries(
|
||||
|client| async move {
|
||||
client
|
||||
.tenant_secondary_download(tenant_shard_id, Some(Duration::from_secs(1)))
|
||||
.await
|
||||
},
|
||||
&self.config.jwt_token,
|
||||
3,
|
||||
10,
|
||||
SHORT_RECONCILE_TIMEOUT,
|
||||
&self.cancel,
|
||||
)
|
||||
.await;
|
||||
for secondary_node in secondaries {
|
||||
match secondary_node
|
||||
.with_client_retries(
|
||||
|client| async move {
|
||||
client
|
||||
.tenant_secondary_download(
|
||||
tenant_shard_id,
|
||||
Some(Duration::from_secs(1)),
|
||||
)
|
||||
.await
|
||||
},
|
||||
&self.config.jwt_token,
|
||||
3,
|
||||
10,
|
||||
SHORT_RECONCILE_TIMEOUT,
|
||||
&self.cancel,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Some(Err(e)) => {
|
||||
tracing::info!(
|
||||
"Failed to download heatmap from {secondary_node} for {tenant_shard_id}: {e}"
|
||||
);
|
||||
}
|
||||
None => {
|
||||
tracing::info!("Cancelled while downloading heatmap from {secondary_node} for {tenant_shard_id}");
|
||||
}
|
||||
Some(Ok(progress)) => {
|
||||
tracing::info!("Successfully downloaded heatmap from {secondary_node} for {tenant_shard_id}: {progress:?}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Look for shards which are oversized and in need of splitting
|
||||
@@ -7032,9 +7283,15 @@ impl Service {
|
||||
fn fill_node_plan(&self, node_id: NodeId) -> Vec<TenantShardId> {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let fill_requirement = locked.scheduler.compute_fill_requirement(node_id);
|
||||
let (nodes, tenants, _scheduler) = locked.parts_mut();
|
||||
|
||||
let mut tids_by_node = locked
|
||||
.tenants
|
||||
let node_az = nodes
|
||||
.get(&node_id)
|
||||
.expect("Node must exist")
|
||||
.get_availability_zone_id()
|
||||
.clone();
|
||||
|
||||
let mut tids_by_node = tenants
|
||||
.iter_mut()
|
||||
.filter_map(|(tid, tenant_shard)| {
|
||||
if !matches!(
|
||||
@@ -7047,6 +7304,25 @@ impl Service {
|
||||
return None;
|
||||
}
|
||||
|
||||
// AZ check: when filling nodes after a restart, our intent is to move _back_ the
|
||||
// shards which belong on this node, not to promote shards whose scheduling preference
|
||||
// would be on their currently attached node. So we will avoid promoting shards whose
|
||||
// home AZ doesn't match the AZ of the node we're filling.
|
||||
match tenant_shard.preferred_az() {
|
||||
None => {
|
||||
// Shard doesn't have an AZ preference: it is eligible to be moved.
|
||||
}
|
||||
Some(az) if az == &node_az => {
|
||||
// This shard's home AZ is equal to the node we're filling: it is
|
||||
// eligible to be moved: fall through;
|
||||
}
|
||||
Some(_) => {
|
||||
// This shard's home AZ is somewhere other than the node we're filling:
|
||||
// do not include it in the fill plan.
|
||||
return None;
|
||||
}
|
||||
}
|
||||
|
||||
if tenant_shard.intent.get_secondary().contains(&node_id) {
|
||||
if let Some(primary) = tenant_shard.intent.get_attached() {
|
||||
return Some((*primary, *tid));
|
||||
|
||||
@@ -43,9 +43,6 @@ impl<'a> Iterator for TenantShardContextIterator<'a> {
|
||||
|
||||
// Accumulate the schedule context for all the shards in a tenant
|
||||
schedule_context.avoid(&shard.intent.all_pageservers());
|
||||
if let Some(attached) = shard.intent.get_attached() {
|
||||
schedule_context.push_attached(*attached);
|
||||
}
|
||||
tenant_shards.push(shard);
|
||||
|
||||
if tenant_shard_id.shard_number.0 == tenant_shard_id.shard_count.count() - 1 {
|
||||
@@ -115,7 +112,7 @@ mod tests {
|
||||
assert_eq!(tenant_id, t1_id);
|
||||
assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0));
|
||||
assert_eq!(shards.len(), 1);
|
||||
assert_eq!(context.attach_count(), 1);
|
||||
assert_eq!(context.location_count(), 2);
|
||||
|
||||
let (tenant_id, context, shards) = iter.next().unwrap();
|
||||
assert_eq!(tenant_id, t2_id);
|
||||
@@ -124,13 +121,13 @@ mod tests {
|
||||
assert_eq!(shards[2].tenant_shard_id.shard_number, ShardNumber(2));
|
||||
assert_eq!(shards[3].tenant_shard_id.shard_number, ShardNumber(3));
|
||||
assert_eq!(shards.len(), 4);
|
||||
assert_eq!(context.attach_count(), 4);
|
||||
assert_eq!(context.location_count(), 8);
|
||||
|
||||
let (tenant_id, context, shards) = iter.next().unwrap();
|
||||
assert_eq!(tenant_id, t3_id);
|
||||
assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0));
|
||||
assert_eq!(shards.len(), 1);
|
||||
assert_eq!(context.attach_count(), 1);
|
||||
assert_eq!(context.location_count(), 2);
|
||||
|
||||
for shard in tenants.values_mut() {
|
||||
shard.intent.clear(&mut scheduler);
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1884,7 +1884,10 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
)
|
||||
return response.json()
|
||||
|
||||
def tenant_list(self):
|
||||
def tenant_shard_dump(self):
|
||||
"""
|
||||
Debug listing API: dumps the internal map of tenant shards
|
||||
"""
|
||||
response = self.request(
|
||||
"GET",
|
||||
f"{self.api}/debug/v1/tenant",
|
||||
@@ -1892,6 +1895,18 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
)
|
||||
return response.json()
|
||||
|
||||
def tenant_list(self, **kwargs):
    """
    Control API tenant listing: a vector of the same content returned by tenant_describe.

    Any keyword arguments are forwarded unchanged as the query parameters of
    the GET request; the decoded JSON body of the response is returned.
    """
    url = f"{self.api}/control/v1/tenant"
    admin_headers = self.headers(TokenScope.ADMIN)
    return self.request("GET", url, headers=admin_headers, params=kwargs).json()
|
||||
|
||||
def node_configure(self, node_id, body: dict[str, Any]):
|
||||
log.info(f"node_configure({node_id}, {body})")
|
||||
body["node_id"] = node_id
|
||||
@@ -2238,7 +2253,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
"""
|
||||
Get the intent and observed placements of all tenants known to the storage controller.
|
||||
"""
|
||||
tenants = self.tenant_list()
|
||||
tenants = self.tenant_shard_dump()
|
||||
|
||||
tenant_placement: defaultdict[str, dict[str, Any]] = defaultdict(
|
||||
lambda: {
|
||||
|
||||
@@ -15,7 +15,6 @@ from requests.adapters import HTTPAdapter
|
||||
from urllib3.util.retry import Retry
|
||||
|
||||
from fixtures.common_types import (
|
||||
Id,
|
||||
Lsn,
|
||||
TenantId,
|
||||
TenantShardId,
|
||||
@@ -25,7 +24,7 @@ from fixtures.common_types import (
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
|
||||
from fixtures.pg_version import PgVersion
|
||||
from fixtures.utils import Fn
|
||||
from fixtures.utils import EnhancedJSONEncoder, Fn
|
||||
|
||||
|
||||
class PageserverApiException(Exception):
|
||||
@@ -83,14 +82,6 @@ class TimelineCreateRequest:
|
||||
mode: TimelineCreateRequestMode
|
||||
|
||||
def to_json(self) -> str:
|
||||
class EnhancedJSONEncoder(json.JSONEncoder):
|
||||
def default(self, o):
|
||||
if dataclasses.is_dataclass(o) and not isinstance(o, type):
|
||||
return dataclasses.asdict(o)
|
||||
elif isinstance(o, Id):
|
||||
return o.id.hex()
|
||||
return super().default(o)
|
||||
|
||||
# mode is flattened
|
||||
this = dataclasses.asdict(self)
|
||||
mode = this.pop("mode")
|
||||
|
||||
@@ -10,7 +10,7 @@ import requests
|
||||
from fixtures.common_types import Lsn, TenantId, TenantTimelineId, TimelineId
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
|
||||
from fixtures.utils import wait_until
|
||||
from fixtures.utils import EnhancedJSONEncoder, wait_until
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing import Any
|
||||
@@ -25,6 +25,7 @@ class Walreceiver:
|
||||
|
||||
@dataclass
|
||||
class SafekeeperTimelineStatus:
|
||||
mconf: Configuration | None
|
||||
term: int
|
||||
last_log_term: int
|
||||
pg_version: int # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
|
||||
@@ -69,6 +70,56 @@ class TermBumpResponse:
|
||||
)
|
||||
|
||||
|
||||
@dataclass
class SafekeeperId:
    """
    Plain record naming one safekeeper by its integer id, host and pg_port.
    """

    id: int
    host: str
    pg_port: int
|
||||
|
||||
|
||||
@dataclass
class Configuration:
    """
    Timeline membership configuration, as parsed from and serialized for the
    safekeeper HTTP API.
    """

    generation: int
    members: list[SafekeeperId]
    # None when the decoded JSON carries no "new_members" key.
    new_members: list[SafekeeperId] | None

    @classmethod
    def from_json(cls, d: dict[str, Any]) -> Configuration:
        """
        Build a configuration from an already-decoded JSON object.

        "generation" and "members" are required (a missing key raises
        KeyError); "new_members" is optional and defaults to None. The member
        entries are stored exactly as decoded, without any conversion.
        """
        # Construct through `cls` rather than the hard-coded class name so
        # that subclasses get an instance of their own type.
        return cls(
            generation=d["generation"],
            members=d["members"],
            new_members=d.get("new_members"),
        )

    def to_json(self) -> str:
        """
        Serialize this configuration to a JSON string with EnhancedJSONEncoder.
        """
        return json.dumps(self, cls=EnhancedJSONEncoder)
|
||||
|
||||
|
||||
@dataclass
class TimelineCreateRequest:
    """
    Request object for creating a timeline on a safekeeper; `to_json` produces
    the string that is sent as the request body.
    """

    tenant_id: TenantId
    timeline_id: TimelineId
    mconf: Configuration
    # not exactly PgVersion, for example 150002 for 15.2
    pg_version: int
    start_lsn: Lsn
    commit_lsn: Lsn | None

    def to_json(self) -> str:
        """
        Serialize the whole request to a JSON string with EnhancedJSONEncoder.
        """
        encoded = json.dumps(self, cls=EnhancedJSONEncoder)
        return encoded
|
||||
|
||||
|
||||
@dataclass
class TimelineMembershipSwitchResponse:
    """
    Parsed response of the membership switch call: the "previous_conf" and
    "current_conf" objects of the response, each as a Configuration.
    """

    previous_conf: Configuration
    current_conf: Configuration

    @classmethod
    def from_json(cls, d: dict[str, Any]) -> TimelineMembershipSwitchResponse:
        """
        Build the response from an already-decoded JSON object.

        Both "previous_conf" and "current_conf" are required (a missing key
        raises KeyError) and are parsed with Configuration.from_json.
        """
        # Construct through `cls` rather than the hard-coded class name so
        # that subclasses get an instance of their own type.
        return cls(
            previous_conf=Configuration.from_json(d["previous_conf"]),
            current_conf=Configuration.from_json(d["current_conf"]),
        )
|
||||
|
||||
|
||||
class SafekeeperHttpClient(requests.Session, MetricsGetter):
|
||||
HTTPError = requests.HTTPError
|
||||
|
||||
@@ -131,20 +182,8 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
|
||||
resj = res.json()
|
||||
return [TenantTimelineId.from_json(ttidj) for ttidj in resj]
|
||||
|
||||
def timeline_create(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
pg_version: int, # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
|
||||
commit_lsn: Lsn,
|
||||
):
|
||||
body = {
|
||||
"tenant_id": str(tenant_id),
|
||||
"timeline_id": str(timeline_id),
|
||||
"pg_version": pg_version,
|
||||
"commit_lsn": str(commit_lsn),
|
||||
}
|
||||
res = self.post(f"http://localhost:{self.port}/v1/tenant/timeline", json=body)
|
||||
def timeline_create(self, r: TimelineCreateRequest):
|
||||
res = self.post(f"http://localhost:{self.port}/v1/tenant/timeline", data=r.to_json())
|
||||
res.raise_for_status()
|
||||
|
||||
def timeline_status(
|
||||
@@ -154,7 +193,10 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
|
||||
res.raise_for_status()
|
||||
resj = res.json()
|
||||
walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]]
|
||||
# Normally mconf is never None; None is allowed only to keep forward compatibility tests happy.
|
||||
mconf = Configuration.from_json(resj["mconf"]) if "mconf" in resj else None
|
||||
return SafekeeperTimelineStatus(
|
||||
mconf=mconf,
|
||||
term=resj["acceptor_state"]["term"],
|
||||
last_log_term=resj["acceptor_state"]["epoch"],
|
||||
pg_version=resj["pg_info"]["pg_version"],
|
||||
@@ -180,6 +222,11 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
|
||||
def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn:
|
||||
return self.timeline_status(tenant_id, timeline_id).commit_lsn
|
||||
|
||||
# Get timeline membership configuration: the `mconf` field of the
# timeline status for the given tenant and timeline.
def get_membership(self, tenant_id: TenantId, timeline_id: TimelineId) -> Configuration:
    status = self.timeline_status(tenant_id, timeline_id)
    # make mypy happy
    return status.mconf  # type: ignore
|
||||
|
||||
# only_local doesn't remove segments in the remote storage.
|
||||
def timeline_delete(
|
||||
self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False
|
||||
@@ -226,6 +273,16 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
|
||||
assert isinstance(res_json, dict)
|
||||
return res_json
|
||||
|
||||
def membership_switch(
    self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration
) -> TimelineMembershipSwitchResponse:
    # POST the JSON form of `to` to the timeline's membership endpoint and
    # parse the reply; a non-success HTTP status raises via raise_for_status.
    url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/membership"
    payload = to.to_json()
    res = self.post(url, data=payload)
    res.raise_for_status()
    return TimelineMembershipSwitchResponse.from_json(res.json())
|
||||
|
||||
def copy_timeline(self, tenant_id: TenantId, timeline_id: TimelineId, body: dict[str, Any]):
|
||||
res = self.post(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy",
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import dataclasses
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
@@ -21,6 +22,7 @@ import zstandard
|
||||
from psycopg2.extensions import cursor
|
||||
from typing_extensions import override
|
||||
|
||||
from fixtures.common_types import Id, Lsn
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.pageserver.common_types import (
|
||||
parse_delta_layer,
|
||||
@@ -605,6 +607,22 @@ class PropagatingThread(threading.Thread):
|
||||
return self.ret
|
||||
|
||||
|
||||
class EnhancedJSONEncoder(json.JSONEncoder):
    """
    The stock json.JSONEncoder only handles primitive builtins. This encoder
    additionally accepts any dataclass instance and our custom Id and Lsn types.
    """

    def default(self, o):
        # Dataclass *instances* only: is_dataclass() is also true for the
        # dataclass type itself, which must not be passed to asdict().
        is_dataclass_instance = dataclasses.is_dataclass(o) and not isinstance(o, type)
        if is_dataclass_instance:
            return dataclasses.asdict(o)
        if isinstance(o, Id):
            return o.id.hex()
        if isinstance(o, Lsn):
            return str(o)  # standard hex notation
        # Anything else: defer to the base class, which raises TypeError.
        return super().default(o)
|
||||
|
||||
|
||||
def human_bytes(amt: float) -> str:
|
||||
"""
|
||||
Render a bytes amount into nice IEC bytes string.
|
||||
|
||||
@@ -2,6 +2,7 @@ from __future__ import annotations
|
||||
|
||||
import concurrent.futures
|
||||
import re
|
||||
import threading
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
@@ -188,7 +189,20 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
|
||||
|
||||
check_pgbench_output(out_path)
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads:
|
||||
stop_pump = threading.Event()
|
||||
|
||||
def pump_controller():
    # Background loop that forces the storage controller to run its
    # background work faster than it otherwise would. This helps us:
    #  A) to create a test that runs in a shorter time
    #  B) to create a test that is more intensive by doing the shard migrations
    #     after splits happen more rapidly.
    # The loop exits once `stop_pump` is set.
    while True:
        if stop_pump.is_set():
            return
        env.storage_controller.reconcile_all()
        stop_pump.wait(0.1)
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count + 1) as pgbench_threads:
|
||||
pgbench_futs = []
|
||||
for tenant_state in tenants.values():
|
||||
fut = pgbench_threads.submit(run_pgbench_init, tenant_state.endpoint)
|
||||
@@ -198,6 +212,8 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
|
||||
for fut in pgbench_futs:
|
||||
fut.result()
|
||||
|
||||
pump_fut = pgbench_threads.submit(pump_controller)
|
||||
|
||||
pgbench_futs = []
|
||||
for tenant_state in tenants.values():
|
||||
fut = pgbench_threads.submit(run_pgbench_main, tenant_state.endpoint)
|
||||
@@ -207,6 +223,9 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
|
||||
for fut in pgbench_futs:
|
||||
fut.result()
|
||||
|
||||
stop_pump.set()
|
||||
pump_fut.result()
|
||||
|
||||
def assert_all_split():
|
||||
for tenant_id in tenants.keys():
|
||||
shards = tenant_get_shards(env, tenant_id)
|
||||
|
||||
@@ -13,11 +13,13 @@ from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnv,
|
||||
NeonEnvBuilder,
|
||||
NeonPageserver,
|
||||
PageserverAvailability,
|
||||
PageserverSchedulingPolicy,
|
||||
)
|
||||
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
|
||||
from fixtures.pg_version import PgVersion
|
||||
from fixtures.utils import wait_until
|
||||
|
||||
|
||||
def get_consistent_node_shard_counts(env: NeonEnv, total_shards) -> defaultdict[str, int]:
|
||||
@@ -85,8 +87,12 @@ def test_storage_controller_many_tenants(
|
||||
)
|
||||
|
||||
AZS = ["alpha", "bravo", "charlie"]
|
||||
|
||||
def az_selector(node_id):
    # Cycle through AZS by node id; node id 1 maps to the first entry.
    az_name = AZS[(node_id - 1) % len(AZS)]
    return f"az-{az_name}"
|
||||
|
||||
neon_env_builder.pageserver_config_override = lambda ps_cfg: ps_cfg.update(
|
||||
{"availability_zone": f"az-{AZS[ps_cfg['id'] % len(AZS)]}"}
|
||||
{"availability_zone": az_selector(ps_cfg["id"])}
|
||||
)
|
||||
|
||||
# A small sleep on each call into the notify hook, to simulate the latency of doing a database write
|
||||
@@ -168,6 +174,31 @@ def test_storage_controller_many_tenants(
|
||||
log.info(f"Resident memory: {rss} ({ rss / total_shards} per shard)")
|
||||
assert rss < expect_memory_per_shard * total_shards
|
||||
|
||||
def assert_all_tenants_scheduled_in_home_az():
    """
    Check the placement of every shard of every tenant in `tenant_ids`, as
    reported by the storage controller's tenant_describe.

    Raises AssertionError unless, for each tenant, all shards share one
    preferred AZ, each shard is attached in that AZ, none of its secondaries
    are in that AZ, and it has exactly one secondary.
    """
    for tenant_id in tenant_ids:
        desc = env.storage_controller.tenant_describe(tenant_id)
        preferred_az = None
        for shard in desc["shards"]:
            # All shards in a tenant should have the same preferred AZ
            if preferred_az is None:
                preferred_az = shard["preferred_az_id"]
            else:
                assert preferred_az == shard["preferred_az_id"]

            # Attachment should be in the preferred AZ
            assert shard["preferred_az_id"] == az_selector(
                shard["node_attached"]
            ), f"Shard {shard['tenant_shard_id']} not in {shard['preferred_az_id']}"

            # Secondary locations should not be in the preferred AZ
            for node_secondary in shard["node_secondary"]:
                assert (
                    shard["preferred_az_id"] != az_selector(node_secondary)
                ), f"Shard {shard['tenant_shard_id']} secondary should not be in {shard['preferred_az_id']}"

            # There should only be one secondary location (i.e. no migrations in flight)
            assert len(shard["node_secondary"]) == 1
|
||||
|
||||
# Issue more concurrent operations than the storage controller's reconciler concurrency semaphore
|
||||
# permits, to ensure that we are stressing that limit.
|
||||
api_concurrency = 135
|
||||
@@ -242,6 +273,22 @@ def test_storage_controller_many_tenants(
|
||||
f"Created {len(tenants_with_timelines)} timelines in {time.time() - t1}, {len(tenants_with_timelines) / (time.time() - t1)}/s"
|
||||
)
|
||||
|
||||
# Check initial scheduling
|
||||
assert_all_tenants_scheduled_in_home_az()
|
||||
az_attached_counts: defaultdict[str, int] = defaultdict(int)
|
||||
az_secondary_counts: defaultdict[str, int] = defaultdict(int)
|
||||
node_attached_counts: defaultdict[str, int] = defaultdict(int)
|
||||
for tenant_id in tenants.keys():
|
||||
desc = env.storage_controller.tenant_describe(tenant_id)
|
||||
for shard in desc["shards"]:
|
||||
az_attached_counts[az_selector(shard["node_attached"])] += 1
|
||||
node_attached_counts[shard["node_attached"]] += 1
|
||||
for node_secondary in shard["node_secondary"]:
|
||||
az_secondary_counts[az_selector(node_secondary)] += 1
|
||||
|
||||
log.info(f"Initial node attached counts: {node_attached_counts}")
|
||||
log.info(f"Initial AZ shard counts: {az_attached_counts}, {az_secondary_counts}")
|
||||
|
||||
# Plan operations: ensure each tenant with a timeline gets at least
|
||||
# one of each operation type. Then add other tenants to make up the
|
||||
# numbers.
|
||||
@@ -450,11 +497,77 @@ def test_storage_controller_many_tenants(
|
||||
env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120)
|
||||
env.storage_controller.consistency_check()
|
||||
|
||||
# Since we did `reconcile_until_idle` during the above loop, the system should be left in
|
||||
# an optimally scheduled state. Validate that this includes all the tenants being scheduled
|
||||
# in their home AZ.
|
||||
assert_all_tenants_scheduled_in_home_az()
|
||||
|
||||
# Consistency check is safe here: restarting pageservers should not have caused any Reconcilers to spawn,
|
||||
# as they were not offline long enough to trigger any scheduling changes.
|
||||
env.storage_controller.consistency_check()
|
||||
check_memory()
|
||||
|
||||
# Simulate loss of an AZ
|
||||
victim_az = "az-alpha"
|
||||
killed_pageservers = []
|
||||
for ps in env.pageservers:
|
||||
if az_selector(ps.id) == victim_az:
|
||||
ps.stop(immediate=True)
|
||||
killed_pageservers.append(ps)
|
||||
log.info(f"Killed pageserver {ps.id}")
|
||||
|
||||
assert killed_pageservers
|
||||
|
||||
# Wait for the controller to notice the pageservers are dead
|
||||
def assert_pageservers_availability(
    pageservers: list[NeonPageserver], expected_availability: PageserverAvailability
):
    """
    Assert that every node listed by the storage controller whose id belongs to
    one of `pageservers` reports `expected_availability`, and that at least one
    such node was found.
    """
    listed_nodes = env.storage_controller.nodes()
    wanted_ids = [ps.id for ps in pageservers]
    matched_any = False
    for node in listed_nodes:
        if node["id"] not in wanted_ids:
            continue
        matched_any = True
        assert (
            node["availability"] == expected_availability
        ), f"Node {node['id']} is not {expected_availability} yet: {node['availability']}"

    # Guard against passing vacuously when none of the pageservers are listed.
    assert matched_any
|
||||
|
||||
wait_until(
|
||||
lambda: assert_pageservers_availability(killed_pageservers, PageserverAvailability.OFFLINE),
|
||||
timeout=60,
|
||||
)
|
||||
|
||||
# Let the controller finish all its rescheduling
|
||||
env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120)
|
||||
|
||||
# Check that all the tenants are rescheduled to the remaining pageservers
|
||||
for tenant_id in tenant_ids:
|
||||
desc = env.storage_controller.tenant_describe(tenant_id)
|
||||
for shard in desc["shards"]:
|
||||
# Attachment should be outside the AZ where we killed the pageservers
|
||||
assert (
|
||||
az_selector(shard["node_attached"]) != victim_az
|
||||
), f"Shard {shard['tenant_shard_id']} still in {victim_az} (node {shard['node_attached']})"
|
||||
|
||||
# Bring back the pageservers
|
||||
for ps in killed_pageservers:
|
||||
ps.start()
|
||||
|
||||
wait_until(
|
||||
lambda: assert_pageservers_availability(killed_pageservers, PageserverAvailability.ACTIVE),
|
||||
timeout=60,
|
||||
)
|
||||
|
||||
# A very long timeout is required: we will be migrating all the tenants on all the pageservers
|
||||
# in the region that we just restored. Assume it'll take up to twice as long as it took to fill
|
||||
# a single node
|
||||
env.storage_controller.reconcile_until_idle(
|
||||
max_interval=0.1, timeout_secs=DRAIN_FILL_TIMEOUT * 4
|
||||
)
|
||||
assert_all_tenants_scheduled_in_home_az()
|
||||
|
||||
# Stop the storage controller before tearing down fixtures, because it otherwise might log
|
||||
# errors trying to call our `ComputeReconfigure`.
|
||||
env.storage_controller.stop()
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user