diff --git a/Cargo.lock b/Cargo.lock index 44edbabaf6..dbbf330cf9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2915,6 +2915,12 @@ version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" +[[package]] +name = "linux-raw-sys" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0b5399f6804fbab912acbd8878ed3532d506b7c951b8f9f164ef90fef39e3f4" + [[package]] name = "lock_api" version = "0.4.10" @@ -3564,6 +3570,7 @@ dependencies = [ "serde", "serde_json", "svg_fmt", + "thiserror", "tokio", "tokio-util", "toml_edit", @@ -5122,6 +5129,7 @@ dependencies = [ "futures-util", "hex", "histogram", + "humantime", "itertools", "once_cell", "pageserver", @@ -5812,6 +5820,7 @@ dependencies = [ "anyhow", "clap", "comfy-table", + "humantime", "hyper 0.14.26", "pageserver_api", "pageserver_client", @@ -6157,7 +6166,7 @@ dependencies = [ [[package]] name = "tokio-epoll-uring" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6" dependencies = [ "futures", "nix 0.26.4", @@ -6669,11 +6678,12 @@ dependencies = [ [[package]] name = "uring-common" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6" dependencies = [ "bytes", "io-uring", "libc", + "linux-raw-sys 0.6.4", ] [[package]] diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 87fb218245..90b8868b43 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -89,7 +89,7 @@ RUN apt update && \ # SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's 
libcgal-dev is 5.2 RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \ echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \ - mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \ + mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . && \ cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \ DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \ make clean && cp -R /sfcgal/* / @@ -98,7 +98,7 @@ ENV PATH "/usr/local/pgsql/bin:$PATH" RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \ echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \ - mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \ + mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ ./autogen.sh && \ ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \ @@ -124,7 +124,7 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postg RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \ echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \ - mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \ + mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C . && \ mkdir build && cd build && \ cmake -DCMAKE_BUILD_TYPE=Release .. 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -149,7 +149,7 @@ RUN apt update && \ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \ echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \ - mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \ + mkdir plv8-src && cd plv8-src && tar xzf ../plv8.tar.gz --strip-components=1 -C . && \ # generate and copy upgrade scripts mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \ cp upgrade/* /usr/local/pgsql/share/extension/ && \ @@ -194,7 +194,7 @@ RUN case "$(uname -m)" in \ RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \ echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \ - mkdir h3-src && cd h3-src && tar xvzf ../h3.tar.gz --strip-components=1 -C . && \ + mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . && \ mkdir build && cd build && \ cmake .. -DCMAKE_BUILD_TYPE=Release && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -204,7 +204,7 @@ RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \ echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \ - mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . && \ + mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . 
&& \ export PATH="/usr/local/pgsql/bin:$PATH" && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -222,7 +222,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \ echo "411d05beeb97e5a4abf17572bfcfbb5a68d98d1018918feff995f6ee3bb03e79 postgresql-unit.tar.gz" | sha256sum --check && \ - mkdir postgresql-unit-src && cd postgresql-unit-src && tar xvzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \ + mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ # unit extension's "create extension" script relies on absolute install path to fill some reference tables. @@ -243,12 +243,12 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY patches/pgvector.patch /pgvector.patch -# By default, pgvector Makefile uses `-march=native`. We don't want that, +# By default, pgvector Makefile uses `-march=native`. We don't want that, # because we build the images on different machines than where we run them. # Pass OPTFLAGS="" to remove it. -RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -O pgvector.tar.gz && \ - echo "1b5503a35c265408b6eb282621c5e1e75f7801afc04eecb950796cfee2e3d1d8 pgvector.tar.gz" | sha256sum --check && \ - mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \ +RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.1.tar.gz -O pgvector.tar.gz && \ + echo "fe6c8cb4e0cd1a8cb60f5badf9e1701e0fcabcfc260931c26d01e155c4dd21d1 pgvector.tar.gz" | sha256sum --check && \ + mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . 
&& \ patch -p1 < /pgvector.patch && \ make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -266,7 +266,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021 RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \ echo "cfdefb15007286f67d3d45510f04a6a7a495004be5b3aecb12cda667e774203f pgjwt.tar.gz" | sha256sum --check && \ - mkdir pgjwt-src && cd pgjwt-src && tar xvzf ../pgjwt.tar.gz --strip-components=1 -C . && \ + mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control @@ -281,7 +281,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \ echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \ - mkdir hypopg-src && cd hypopg-src && tar xvzf ../hypopg.tar.gz --strip-components=1 -C . && \ + mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control @@ -297,7 +297,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \ echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \ - mkdir pg_hashids-src && cd pg_hashids-src && tar xvzf ../pg_hashids.tar.gz --strip-components=1 -C . && \ + mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control @@ -313,7 +313,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \ echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \ - mkdir rum-src && cd rum-src && tar xvzf ../rum.tar.gz --strip-components=1 -C . && \ + mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control @@ -329,7 +329,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \ echo "9c7c3de67ea41638e14f06da5da57bac6f5bd03fea05c165a0ec862205a5c052 pgtap.tar.gz" | sha256sum --check && \ - mkdir pgtap-src && cd pgtap-src && tar xvzf ../pgtap.tar.gz --strip-components=1 -C . && \ + mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control @@ -345,7 +345,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \ echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \ - mkdir ip4r-src && cd ip4r-src && tar xvzf ../ip4r.tar.gz --strip-components=1 -C . && \ + mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/ip4r.control @@ -361,7 +361,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \ echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \ - mkdir prefix-src && cd prefix-src && tar xvzf ../prefix.tar.gz --strip-components=1 -C . && \ + mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control @@ -377,7 +377,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \ echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \ - mkdir hll-src && cd hll-src && tar xvzf ../hll.tar.gz --strip-components=1 -C . && \ + mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control @@ -393,7 +393,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \ echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \ - mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \ + mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control @@ -424,7 +424,7 @@ RUN case "${PG_VERSION}" in \ apt-get install -y cmake && \ wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \ echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \ - mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \ + mkdir timescaledb-src && cd timescaledb-src && tar xzf ../timescaledb.tar.gz --strip-components=1 -C . 
&& \ ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \ cd build && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -462,7 +462,7 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/ossc-db/pg_hint_plan/archive/refs/tags/REL${PG_HINT_PLAN_VERSION}.tar.gz -O pg_hint_plan.tar.gz && \ echo "${PG_HINT_PLAN_CHECKSUM} pg_hint_plan.tar.gz" | sha256sum --check && \ - mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xvzf ../pg_hint_plan.tar.gz --strip-components=1 -C . && \ + mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xzf ../pg_hint_plan.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make install -j $(getconf _NPROCESSORS_ONLN) && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_hint_plan.control @@ -481,7 +481,7 @@ RUN apt-get update && \ apt-get install -y git libgtk2.0-dev libpq-dev libpam-dev libxslt-dev libkrb5-dev cmake && \ wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \ echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \ - mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \ + mkdir kq_imcx-src && cd kq_imcx-src && tar xzf ../kq_imcx.tar.gz --strip-components=1 -C . && \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ mkdir build && cd build && \ cmake -DCMAKE_BUILD_TYPE=Release .. && \ @@ -505,7 +505,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \ echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \ - mkdir pg_cron-src && cd pg_cron-src && tar xvzf ../pg_cron.tar.gz --strip-components=1 -C . 
&& \ + mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control @@ -531,7 +531,7 @@ RUN apt-get update && \ ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \ echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \ - mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . && \ + mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \ cmake \ -D RDK_BUILD_CAIRO_SUPPORT=OFF \ -D RDK_BUILD_INCHI_SUPPORT=ON \ @@ -571,7 +571,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \ echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \ - mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xvzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \ + mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_uuidv7.control @@ -588,7 +588,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \ echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \ - mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xvzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \ + mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control @@ -605,7 +605,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \ echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \ - mkdir pg_semver-src && cd pg_semver-src && tar xvzf ../pg_semver.tar.gz --strip-components=1 -C . && \ + mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/semver.control @@ -631,7 +631,7 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/${PG_EMBEDDING_VERSION}.tar.gz -O pg_embedding.tar.gz && \ echo "${PG_EMBEDDING_CHECKSUM} pg_embedding.tar.gz" | sha256sum --check && \ - mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \ + mkdir pg_embedding-src && cd pg_embedding-src && tar xzf ../pg_embedding.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install @@ -647,7 +647,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \ - mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \ + mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \ @@ -696,7 +696,7 @@ ARG PG_VERSION RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar.gz -O pg_jsonschema.tar.gz && \ echo "9118fc508a6e231e7a39acaa6f066fcd79af17a5db757b47d2eefbe14f7794f0 pg_jsonschema.tar.gz" | sha256sum --check && \ - mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . 
&& \ + mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \ sed -i 's/pgrx = "0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control @@ -713,7 +713,7 @@ ARG PG_VERSION RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.4.0.tar.gz -O pg_graphql.tar.gz && \ echo "bd8dc7230282b3efa9ae5baf053a54151ed0e66881c7c53750e2d0c765776edc pg_graphql.tar.gz" | sha256sum --check && \ - mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . && \ + mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \ sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ # it's needed to enable extension because it uses untrusted C language @@ -733,7 +733,7 @@ ARG PG_VERSION # 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023 RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \ echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \ - mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \ + mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . 
&& \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control @@ -749,7 +749,7 @@ ARG PG_VERSION RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -O pgx_ulid.tar.gz && \ echo "ee5db82945d2d9f2d15597a80cf32de9dca67b897f605beb830561705f12683c pgx_ulid.tar.gz" | sha256sum --check && \ - mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \ + mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \ echo "******************* Apply a patch for Postgres 16 support; delete in the next release ******************" && \ wget https://github.com/pksunkara/pgx_ulid/commit/f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \ patch -p1 < f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \ @@ -771,7 +771,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \ echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \ - mkdir wal2json-src && cd wal2json-src && tar xvzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \ + mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install @@ -787,7 +787,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \ echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \ - mkdir pg_ivm-src && cd pg_ivm-src && tar xvzf ../pg_ivm.tar.gz --strip-components=1 -C . && \ + mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_ivm.control @@ -804,7 +804,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \ echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \ - mkdir pg_partman-src && cd pg_partman-src && tar xvzf ../pg_partman.tar.gz --strip-components=1 -C . && \ + mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control diff --git a/control_plane/storcon_cli/Cargo.toml b/control_plane/storcon_cli/Cargo.toml index 61eb7fa4e4..ed3462961f 100644 --- a/control_plane/storcon_cli/Cargo.toml +++ b/control_plane/storcon_cli/Cargo.toml @@ -9,6 +9,7 @@ license.workspace = true anyhow.workspace = true clap.workspace = true comfy-table.workspace = true +humantime.workspace = true hyper.workspace = true pageserver_api.workspace = true pageserver_client.workspace = true diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index c19bc96cdb..05c4acdf90 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -7,8 +7,9 @@ use pageserver_api::{ TenantDescribeResponse, TenantPolicyRequest, }, models::{ - LocationConfigSecondary, ShardParameters, TenantConfig, TenantConfigRequest, - TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse, + EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, + ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest, + TenantShardSplitRequest, 
TenantShardSplitResponse, }, shard::{ShardStripeSize, TenantShardId}, }; @@ -125,6 +126,28 @@ enum Command { #[arg(long)] tenant_id: TenantId, }, + /// Uncleanly drop a tenant from the storage controller: this doesn't delete anything from pageservers. Appropriate + /// if you e.g. used `tenant-warmup` by mistake on a tenant ID that doesn't really exist, or is in some other region. + TenantDrop { + #[arg(long)] + tenant_id: TenantId, + #[arg(long)] + unclean: bool, + }, + NodeDrop { + #[arg(long)] + node_id: NodeId, + #[arg(long)] + unclean: bool, + }, + TenantSetTimeBasedEviction { + #[arg(long)] + tenant_id: TenantId, + #[arg(long)] + period: humantime::Duration, + #[arg(long)] + threshold: humantime::Duration, + }, } #[derive(Parser)] @@ -674,6 +697,46 @@ async fn main() -> anyhow::Result<()> { } } } + Command::TenantDrop { tenant_id, unclean } => { + if !unclean { + anyhow::bail!("This command is not a tenant deletion, and uncleanly drops all controller state for the tenant. If you know what you're doing, add `--unclean` to proceed.") + } + storcon_client + .dispatch::<(), ()>( + Method::POST, + format!("debug/v1/tenant/{tenant_id}/drop"), + None, + ) + .await?; + } + Command::NodeDrop { node_id, unclean } => { + if !unclean { + anyhow::bail!("This command is not a clean node decommission, and uncleanly drops all controller state for the node, without checking if any tenants still refer to it. 
If you know what you're doing, add `--unclean` to proceed.") + } + storcon_client + .dispatch::<(), ()>(Method::POST, format!("debug/v1/node/{node_id}/drop"), None) + .await?; + } + Command::TenantSetTimeBasedEviction { + tenant_id, + period, + threshold, + } => { + vps_client + .tenant_config(&TenantConfigRequest { + tenant_id, + config: TenantConfig { + eviction_policy: Some(EvictionPolicy::LayerAccessThreshold( + EvictionPolicyLayerAccessThreshold { + period: period.into(), + threshold: threshold.into(), + }, + )), + ..Default::default() + }, + }) + .await?; + } } Ok(()) diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 2511de00d5..997c1cc43a 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -1,7 +1,7 @@ use anyhow::{bail, Result}; use byteorder::{ByteOrder, BE}; -use bytes::BufMut; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; +use postgres_ffi::RepOriginId; use postgres_ffi::{Oid, TransactionId}; use serde::{Deserialize, Serialize}; use std::{fmt, ops::Range}; @@ -39,6 +39,9 @@ pub const RELATION_SIZE_PREFIX: u8 = 0x61; /// The key prefix of AUX file keys. pub const AUX_KEY_PREFIX: u8 = 0x62; +/// The key prefix of ReplOrigin keys. +pub const REPL_ORIGIN_KEY_PREFIX: u8 = 0x63; + /// Check if the key falls in the range of metadata keys. pub const fn is_metadata_key_slice(key: &[u8]) -> bool { key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX @@ -53,14 +56,8 @@ impl Key { /// Encode a metadata key to a storage key. 
pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self { assert!(is_metadata_key_slice(key), "key not in metadata key range"); - Key { - field1: key[0], - field2: u16::from_be_bytes(key[1..3].try_into().unwrap()) as u32, - field3: u32::from_be_bytes(key[3..7].try_into().unwrap()), - field4: u32::from_be_bytes(key[7..11].try_into().unwrap()), - field5: key[11], - field6: u32::from_be_bytes(key[12..16].try_into().unwrap()), - } + // Metadata key space ends at 0x7F so it's fine to directly convert it to i128. + Self::from_i128(i128::from_be_bytes(*key)) } /// Encode a metadata key to a storage key. @@ -68,17 +65,6 @@ impl Key { Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key")) } - /// Extract a metadata key to a writer. The result should always be 16 bytes. - pub fn extract_metadata_key_to_writer(&self, mut writer: impl BufMut) { - writer.put_u8(self.field1); - assert!(self.field2 <= 0xFFFF); - writer.put_u16(self.field2 as u16); - writer.put_u32(self.field3); - writer.put_u32(self.field4); - writer.put_u8(self.field5); - writer.put_u32(self.field6); - } - /// Get the range of metadata keys. pub const fn metadata_key_range() -> Range { Key { @@ -121,7 +107,7 @@ impl Key { /// As long as Neon does not support tablespace (because of lack of access to local file system), /// we can assume that only some predefined namespace OIDs are used which can fit in u16 pub fn to_i128(&self) -> i128 { - assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222); + assert!(self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222); (((self.field1 & 0x7F) as i128) << 120) | (((self.field2 & 0xFFFF) as i128) << 104) | ((self.field3 as i128) << 72) @@ -175,7 +161,7 @@ impl Key { } /// Convert a 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently. - /// Use [`Key::from_metadata_key`] instead. 
+ /// Use [`Key::from_i128`] instead if you want to handle 16B keys (i.e., metadata keys). pub fn from_slice(b: &[u8]) -> Self { Key { field1: b[0], @@ -188,7 +174,7 @@ impl Key { } /// Convert a key to a 18B slice. This function should not be used for metadata keys because field2 is handled differently. - /// Use [`Key::extract_metadata_key_to_writer`] instead. + /// Use [`Key::to_i128`] instead if you want to get a 16B key (i.e., metadata keys). pub fn write_to_byte_slice(&self, buf: &mut [u8]) { buf[0] = self.field1; BE::write_u32(&mut buf[1..5], self.field2); @@ -399,7 +385,14 @@ pub fn rel_size_to_key(rel: RelTag) -> Key { field3: rel.dbnode, field4: rel.relnode, field5: rel.forknum, - field6: 0xffffffff, + field6: 0xffff_ffff, + } +} + +impl Key { + #[inline(always)] + pub fn is_rel_size_key(&self) -> bool { + self.field1 == 0 && self.field6 == u32::MAX } } @@ -440,6 +433,25 @@ pub fn slru_dir_to_key(kind: SlruKind) -> Key { } } +#[inline(always)] +pub fn slru_dir_kind(key: &Key) -> Option> { + if key.field1 == 0x01 + && key.field3 == 0 + && key.field4 == 0 + && key.field5 == 0 + && key.field6 == 0 + { + match key.field2 { + 0 => Some(Ok(SlruKind::Clog)), + 1 => Some(Ok(SlruKind::MultiXactMembers)), + 2 => Some(Ok(SlruKind::MultiXactOffsets)), + x => Some(Err(x)), + } + } else { + None + } +} + #[inline(always)] pub fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key { Key { @@ -468,7 +480,17 @@ pub fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key { field3: 1, field4: segno, field5: 0, - field6: 0xffffffff, + field6: 0xffff_ffff, + } +} + +impl Key { + pub fn is_slru_segment_size_key(&self) -> bool { + self.field1 == 0x01 + && self.field2 < 0x03 + && self.field3 == 0x01 + && self.field5 == 0 + && self.field6 == u32::MAX } } @@ -569,6 +591,37 @@ pub const AUX_FILES_KEY: Key = Key { field6: 2, }; +#[inline(always)] +pub fn repl_origin_key(origin_id: RepOriginId) -> Key { + Key { + field1: REPL_ORIGIN_KEY_PREFIX, + field2: 
0, + field3: 0, + field4: 0, + field5: 0, + field6: origin_id as u32, + } +} + +/// Get the range of replorigin keys. +pub fn repl_origin_key_range() -> Range { + Key { + field1: REPL_ORIGIN_KEY_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: REPL_ORIGIN_KEY_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0x10000, + } +} + // Reverse mappings for a few Keys. // These are needed by WAL redo manager. @@ -577,73 +630,78 @@ pub const NON_INHERITED_RANGE: Range = AUX_FILES_KEY..AUX_FILES_KEY.next(); /// Sparse keyspace range for vectored get. Missing key error will be ignored for this range. pub const NON_INHERITED_SPARSE_RANGE: Range = Key::metadata_key_range(); -// AUX_FILES currently stores only data for logical replication (slots etc), and -// we don't preserve these on a branch because safekeepers can't follow timeline -// switch (and generally it likely should be optional), so ignore these. -#[inline(always)] -pub fn is_inherited_key(key: Key) -> bool { - !NON_INHERITED_RANGE.contains(&key) && !NON_INHERITED_SPARSE_RANGE.contains(&key) -} +impl Key { + // AUX_FILES currently stores only data for logical replication (slots etc), and + // we don't preserve these on a branch because safekeepers can't follow timeline + // switch (and generally it likely should be optional), so ignore these. 
+ #[inline(always)] + pub fn is_inherited_key(self) -> bool { + !NON_INHERITED_RANGE.contains(&self) && !NON_INHERITED_SPARSE_RANGE.contains(&self) + } -#[inline(always)] -pub fn is_rel_fsm_block_key(key: Key) -> bool { - key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff -} + #[inline(always)] + pub fn is_rel_fsm_block_key(self) -> bool { + self.field1 == 0x00 + && self.field4 != 0 + && self.field5 == FSM_FORKNUM + && self.field6 != 0xffffffff + } -#[inline(always)] -pub fn is_rel_vm_block_key(key: Key) -> bool { - key.field1 == 0x00 - && key.field4 != 0 - && key.field5 == VISIBILITYMAP_FORKNUM - && key.field6 != 0xffffffff -} + #[inline(always)] + pub fn is_rel_vm_block_key(self) -> bool { + self.field1 == 0x00 + && self.field4 != 0 + && self.field5 == VISIBILITYMAP_FORKNUM + && self.field6 != 0xffffffff + } -#[inline(always)] -pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> { - Ok(match key.field1 { - 0x01 => { - let kind = match key.field2 { - 0x00 => SlruKind::Clog, - 0x01 => SlruKind::MultiXactMembers, - 0x02 => SlruKind::MultiXactOffsets, - _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2), - }; - let segno = key.field4; - let blknum = key.field6; + #[inline(always)] + pub fn to_slru_block(self) -> anyhow::Result<(SlruKind, u32, BlockNumber)> { + Ok(match self.field1 { + 0x01 => { + let kind = match self.field2 { + 0x00 => SlruKind::Clog, + 0x01 => SlruKind::MultiXactMembers, + 0x02 => SlruKind::MultiXactOffsets, + _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", self.field2), + }; + let segno = self.field4; + let blknum = self.field6; - (kind, segno, blknum) - } - _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1), - }) -} + (kind, segno, blknum) + } + _ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1), + }) + } -#[inline(always)] -pub fn is_slru_block_key(key: Key) -> bool { - key.field1 == 0x01 // SLRU-related - && key.field3 == 
0x00000001 // but not SlruDir - && key.field6 != 0xffffffff // and not SlruSegSize -} + #[inline(always)] + pub fn is_slru_block_key(self) -> bool { + self.field1 == 0x01 // SLRU-related + && self.field3 == 0x00000001 // but not SlruDir + && self.field6 != 0xffffffff // and not SlruSegSize + } -#[inline(always)] -pub fn is_rel_block_key(key: &Key) -> bool { - key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff -} + #[inline(always)] + pub fn is_rel_block_key(&self) -> bool { + self.field1 == 0x00 && self.field4 != 0 && self.field6 != 0xffffffff + } -/// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`. -#[inline(always)] -pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> { - Ok(match key.field1 { - 0x00 => ( - RelTag { - spcnode: key.field2, - dbnode: key.field3, - relnode: key.field4, - forknum: key.field5, - }, - key.field6, - ), - _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1), - }) + /// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`. 
+ #[inline(always)] + pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> { + Ok(match self.field1 { + 0x00 => ( + RelTag { + spcnode: self.field2, + dbnode: self.field3, + relnode: self.field4, + forknum: self.field5, + }, + self.field6, + ), + _ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1), + }) + } } impl std::str::FromStr for Key { @@ -687,10 +745,15 @@ mod tests { let mut metadata_key = vec![AUX_KEY_PREFIX]; metadata_key.extend_from_slice(&[0xFF; 15]); let encoded_key = Key::from_metadata_key(&metadata_key); - let mut output_key = Vec::new(); - encoded_key.extract_metadata_key_to_writer(&mut output_key); + let output_key = encoded_key.to_i128().to_be_bytes(); assert_eq!(metadata_key, output_key); assert!(encoded_key.is_metadata_key()); assert!(is_metadata_key_slice(&metadata_key)); } + + #[test] + fn test_possible_largest_key() { + Key::from_i128(0x7FFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF); + // TODO: put this key into the system and see if anything breaks. 
+ } } diff --git a/libs/pageserver_api/src/reltag.rs b/libs/pageserver_api/src/reltag.rs index 38693ab847..010a9c2932 100644 --- a/libs/pageserver_api/src/reltag.rs +++ b/libs/pageserver_api/src/reltag.rs @@ -3,7 +3,7 @@ use std::cmp::Ordering; use std::fmt; use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID; -use postgres_ffi::relfile_utils::forknumber_to_name; +use postgres_ffi::relfile_utils::{forkname_to_number, forknumber_to_name, MAIN_FORKNUM}; use postgres_ffi::Oid; /// @@ -68,6 +68,57 @@ impl fmt::Display for RelTag { } } +#[derive(Debug, thiserror::Error)] +pub enum ParseRelTagError { + #[error("invalid forknum")] + InvalidForknum(#[source] std::num::ParseIntError), + #[error("missing triplet member {}", .0)] + MissingTripletMember(usize), + #[error("invalid triplet member {}", .0)] + InvalidTripletMember(usize, #[source] std::num::ParseIntError), +} + +impl std::str::FromStr for RelTag { + type Err = ParseRelTagError; + + fn from_str(s: &str) -> Result { + use ParseRelTagError::*; + + // FIXME: in postgres logs this separator is dot + // Example: + // could not read block 2 in rel 1663/208101/2620.1 from page server at lsn 0/2431E6F0 + // with a regex we could get this more painlessly + let (triplet, forknum) = match s.split_once('_').or_else(|| s.split_once('.')) { + Some((t, f)) => { + let forknum = forkname_to_number(Some(f)); + let forknum = if let Ok(f) = forknum { + f + } else { + f.parse::().map_err(InvalidForknum)? 
+ }; + + (t, Some(forknum)) + } + None => (s, None), + }; + + let mut split = triplet + .splitn(3, '/') + .enumerate() + .map(|(i, s)| s.parse::().map_err(|e| InvalidTripletMember(i, e))); + let spcnode = split.next().ok_or(MissingTripletMember(0))??; + let dbnode = split.next().ok_or(MissingTripletMember(1))??; + let relnode = split.next().ok_or(MissingTripletMember(2))??; + + Ok(RelTag { + spcnode, + forknum: forknum.unwrap_or(MAIN_FORKNUM), + dbnode, + relnode, + }) + } +} + impl RelTag { pub fn to_segfile_name(&self, segno: u32) -> String { let mut name = if self.spcnode == GLOBALTABLESPACE_OID { diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 1c05a01926..8c5a4e6168 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -1,9 +1,6 @@ use std::{ops::RangeInclusive, str::FromStr}; -use crate::{ - key::{is_rel_block_key, Key}, - models::ShardParameters, -}; +use crate::{key::Key, models::ShardParameters}; use hex::FromHex; use postgres_ffi::relfile_utils::INIT_FORKNUM; use serde::{Deserialize, Serialize}; @@ -428,6 +425,12 @@ impl<'de> Deserialize<'de> for TenantShardId { #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] pub struct ShardStripeSize(pub u32); +impl Default for ShardStripeSize { + fn default() -> Self { + DEFAULT_STRIPE_SIZE + } +} + /// Layout version: for future upgrades where we might change how the key->shard mapping works #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] pub struct ShardLayout(u8); @@ -666,7 +669,7 @@ fn key_is_shard0(key: &Key) -> bool { // because they must be included in basebackups. 
let is_initfork = key.field5 == INIT_FORKNUM; - !is_rel_block_key(key) || is_initfork + !key.is_rel_block_key() || is_initfork } /// Provide the same result as the function in postgres `hashfn.h` with the same name @@ -713,6 +716,25 @@ fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Ke ShardNumber((hash % count.0 as u32) as u8) } +/// For debugging, while not exposing the internals. +#[derive(Debug)] +#[allow(unused)] // used by debug formatting by pagectl +struct KeyShardingInfo { + shard0: bool, + shard_number: ShardNumber, +} + +pub fn describe( + key: &Key, + shard_count: ShardCount, + stripe_size: ShardStripeSize, +) -> impl std::fmt::Debug { + KeyShardingInfo { + shard0: key_is_shard0(key), + shard_number: key_to_shard_number(shard_count, stripe_size, key), + } +} + #[cfg(test)] mod tests { use utils::Hex; diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index 8e6761d6d3..370d9e9a6f 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -126,6 +126,7 @@ fn main() -> anyhow::Result<()> { .allowlist_type("PageHeaderData") .allowlist_type("DBState") .allowlist_type("RelMapFile") + .allowlist_type("RepOriginId") // Because structs are used for serialization, tell bindgen to emit // explicit padding fields. 
.explicit_padding(true) diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 0d6986778a..729f57f829 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -110,6 +110,7 @@ pub mod pg_constants; pub mod relfile_utils; // Export some widely used datatypes that are unlikely to change across Postgres versions +pub use v14::bindings::RepOriginId; pub use v14::bindings::{uint32, uint64, Oid}; pub use v14::bindings::{BlockNumber, OffsetNumber}; pub use v14::bindings::{MultiXactId, TransactionId}; diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index 2701ddf5e0..54b032d138 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -102,7 +102,7 @@ pub const XACT_XINFO_HAS_SUBXACTS: u32 = 1u32 << 1; pub const XACT_XINFO_HAS_RELFILENODES: u32 = 1u32 << 2; pub const XACT_XINFO_HAS_INVALS: u32 = 1u32 << 3; pub const XACT_XINFO_HAS_TWOPHASE: u32 = 1u32 << 4; -// pub const XACT_XINFO_HAS_ORIGIN: u32 = 1u32 << 5; +pub const XACT_XINFO_HAS_ORIGIN: u32 = 1u32 << 5; // pub const XACT_XINFO_HAS_AE_LOCKS: u32 = 1u32 << 6; // pub const XACT_XINFO_HAS_GID: u32 = 1u32 << 7; @@ -167,6 +167,7 @@ pub const RM_RELMAP_ID: u8 = 7; pub const RM_STANDBY_ID: u8 = 8; pub const RM_HEAP2_ID: u8 = 9; pub const RM_HEAP_ID: u8 = 10; +pub const RM_REPLORIGIN_ID: u8 = 19; pub const RM_LOGICALMSG_ID: u8 = 21; // from neon_rmgr.h @@ -223,6 +224,10 @@ pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10; pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001; pub const XLP_LONG_HEADER: u16 = 0x0002; +/* From xlog.h */ +pub const XLOG_REPLORIGIN_SET: u8 = 0x00; +pub const XLOG_REPLORIGIN_DROP: u8 = 0x10; + /* From replication/slot.h */ pub const REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN: usize = 4*4 /* offset of `slotdata` in ReplicationSlotOnDisk */ + 64 /* NameData */ + 4*4; @@ -237,6 +242,9 @@ pub const SLOTS_PER_FSM_PAGE: u32 = FSM_LEAF_NODES_PER_PAGE as u32; pub const VM_HEAPBLOCKS_PER_PAGE: 
u32 = (BLCKSZ as usize - SIZEOF_PAGE_HEADER_DATA) as u32 * (8 / 2); // MAPSIZE * (BITS_PER_BYTE / BITS_PER_HEAPBLOCK) +/* From origin.c */ +pub const REPLICATION_STATE_MAGIC: u32 = 0x1257DADE; + // List of subdirectories inside pgdata. // Copied from src/bin/initdb/initdb.c pub const PGDATA_SUBDIRS: [&str; 22] = [ diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 24c1248304..aca22c6b3e 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -26,13 +26,14 @@ use futures::stream::Stream; use futures_util::StreamExt; use futures_util::TryStreamExt; use http_types::{StatusCode, Url}; +use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; use tracing::debug; +use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind}; use crate::{ - error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download, - DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata, - TimeTravelError, TimeoutOrCancel, + error::Cancelled, AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, + ListingMode, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel, }; pub struct AzureBlobStorage { @@ -137,6 +138,8 @@ impl AzureBlobStorage { let mut last_modified = None; let mut metadata = HashMap::new(); + let started_at = start_measuring_requests(kind); + let download = async { let response = builder // convert to concrete Pageable @@ -200,13 +203,22 @@ impl AzureBlobStorage { }) }; - tokio::select! { + let download = tokio::select! 
{ bufs = download => bufs, cancel_or_timeout = cancel_or_timeout => match cancel_or_timeout { - TimeoutOrCancel::Timeout => Err(DownloadError::Timeout), - TimeoutOrCancel::Cancel => Err(DownloadError::Cancelled), + TimeoutOrCancel::Timeout => return Err(DownloadError::Timeout), + TimeoutOrCancel::Cancel => return Err(DownloadError::Cancelled), }, - } + }; + let started_at = ScopeGuard::into_inner(started_at); + let outcome = match &download { + Ok(_) => AttemptOutcome::Ok, + Err(_) => AttemptOutcome::Err, + }; + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, outcome, started_at); + download } async fn permit( @@ -340,7 +352,10 @@ impl RemoteStorage for AzureBlobStorage { metadata: Option, cancel: &CancellationToken, ) -> anyhow::Result<()> { - let _permit = self.permit(RequestKind::Put, cancel).await?; + let kind = RequestKind::Put; + let _permit = self.permit(kind, cancel).await?; + + let started_at = start_measuring_requests(kind); let op = async { let blob_client = self.client.blob_client(self.relative_path_to_name(to)); @@ -364,14 +379,25 @@ impl RemoteStorage for AzureBlobStorage { match fut.await { Ok(Ok(_response)) => Ok(()), Ok(Err(azure)) => Err(azure.into()), - Err(_timeout) => Err(TimeoutOrCancel::Cancel.into()), + Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()), } }; - tokio::select! { + let res = tokio::select! 
{ res = op => res, - _ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()), - } + _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()), + }; + + let outcome = match res { + Ok(_) => AttemptOutcome::Ok, + Err(_) => AttemptOutcome::Err, + }; + let started_at = ScopeGuard::into_inner(started_at); + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, outcome, started_at); + + res } async fn download( @@ -417,12 +443,13 @@ impl RemoteStorage for AzureBlobStorage { paths: &'a [RemotePath], cancel: &CancellationToken, ) -> anyhow::Result<()> { - let _permit = self.permit(RequestKind::Delete, cancel).await?; + let kind = RequestKind::Delete; + let _permit = self.permit(kind, cancel).await?; + let started_at = start_measuring_requests(kind); let op = async { - // TODO batch requests are also not supported by the SDK + // TODO batch requests are not supported by the SDK // https://github.com/Azure/azure-sdk-for-rust/issues/1068 - // https://github.com/Azure/azure-sdk-for-rust/issues/1249 for path in paths { let blob_client = self.client.blob_client(self.relative_path_to_name(path)); @@ -447,10 +474,16 @@ impl RemoteStorage for AzureBlobStorage { Ok(()) }; - tokio::select! { + let res = tokio::select! 
{ res = op => res, - _ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()), - } + _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()), + }; + + let started_at = ScopeGuard::into_inner(started_at); + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, &res, started_at); + res } async fn copy( @@ -459,7 +492,9 @@ impl RemoteStorage for AzureBlobStorage { to: &RemotePath, cancel: &CancellationToken, ) -> anyhow::Result<()> { - let _permit = self.permit(RequestKind::Copy, cancel).await?; + let kind = RequestKind::Copy; + let _permit = self.permit(kind, cancel).await?; + let started_at = start_measuring_requests(kind); let timeout = tokio::time::sleep(self.timeout); @@ -503,15 +538,21 @@ impl RemoteStorage for AzureBlobStorage { } }; - tokio::select! { + let res = tokio::select! { res = op => res, - _ = cancel.cancelled() => Err(anyhow::Error::new(TimeoutOrCancel::Cancel)), + _ = cancel.cancelled() => return Err(anyhow::Error::new(TimeoutOrCancel::Cancel)), _ = timeout => { let e = anyhow::Error::new(TimeoutOrCancel::Timeout); let e = e.context(format!("Timeout, last status: {copy_status:?}")); Err(e) }, - } + }; + + let started_at = ScopeGuard::into_inner(started_at); + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, &res, started_at); + res } async fn time_travel_recover( diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 708662f20f..8c984abed2 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -12,6 +12,7 @@ mod azure_blob; mod error; mod local_fs; +mod metrics; mod s3_bucket; mod simulate_failures; mod support; @@ -121,8 +122,8 @@ impl RemotePath { self.0.file_name() } - pub fn join(&self, segment: &Utf8Path) -> Self { - Self(self.0.join(segment)) + pub fn join(&self, path: impl AsRef) -> Self { + Self(self.0.join(path)) } pub fn get_path(&self) -> &Utf8PathBuf { diff --git a/libs/remote_storage/src/s3_bucket/metrics.rs 
b/libs/remote_storage/src/metrics.rs similarity index 76% rename from libs/remote_storage/src/s3_bucket/metrics.rs rename to libs/remote_storage/src/metrics.rs index beca755920..bbb51590f3 100644 --- a/libs/remote_storage/src/s3_bucket/metrics.rs +++ b/libs/remote_storage/src/metrics.rs @@ -15,6 +15,7 @@ pub(crate) enum RequestKind { TimeTravel = 5, } +use scopeguard::ScopeGuard; use RequestKind::*; impl RequestKind { @@ -33,10 +34,10 @@ impl RequestKind { } } -pub(super) struct RequestTyped([C; 6]); +pub(crate) struct RequestTyped([C; 6]); impl RequestTyped { - pub(super) fn get(&self, kind: RequestKind) -> &C { + pub(crate) fn get(&self, kind: RequestKind) -> &C { &self.0[kind.as_index()] } @@ -58,19 +59,19 @@ impl RequestTyped { } impl RequestTyped { - pub(super) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) { + pub(crate) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) { self.get(kind).observe(started_at.elapsed().as_secs_f64()) } } -pub(super) struct PassFailCancelledRequestTyped { +pub(crate) struct PassFailCancelledRequestTyped { success: RequestTyped, fail: RequestTyped, cancelled: RequestTyped, } #[derive(Debug, Clone, Copy)] -pub(super) enum AttemptOutcome { +pub(crate) enum AttemptOutcome { Ok, Err, Cancelled, @@ -86,7 +87,7 @@ impl From<&Result> for AttemptOutcome { } impl AttemptOutcome { - pub(super) fn as_str(&self) -> &'static str { + pub(crate) fn as_str(&self) -> &'static str { match self { AttemptOutcome::Ok => "ok", AttemptOutcome::Err => "err", @@ -96,7 +97,7 @@ impl AttemptOutcome { } impl PassFailCancelledRequestTyped { - pub(super) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C { + pub(crate) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C { let target = match outcome { AttemptOutcome::Ok => &self.success, AttemptOutcome::Err => &self.fail, @@ -119,7 +120,7 @@ impl PassFailCancelledRequestTyped { } impl PassFailCancelledRequestTyped { - pub(super) fn 
observe_elapsed( + pub(crate) fn observe_elapsed( &self, kind: RequestKind, outcome: impl Into, @@ -130,19 +131,44 @@ impl PassFailCancelledRequestTyped { } } -pub(super) struct BucketMetrics { +/// On drop (cancellation) count towards [`BucketMetrics::cancelled_waits`]. +pub(crate) fn start_counting_cancelled_wait( + kind: RequestKind, +) -> ScopeGuard { + scopeguard::guard_on_success(std::time::Instant::now(), move |_| { + crate::metrics::BUCKET_METRICS + .cancelled_waits + .get(kind) + .inc() + }) +} + +/// On drop (cancellation) add time to [`BucketMetrics::req_seconds`]. +pub(crate) fn start_measuring_requests( + kind: RequestKind, +) -> ScopeGuard { + scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| { + crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed( + kind, + AttemptOutcome::Cancelled, + started_at, + ) + }) +} + +pub(crate) struct BucketMetrics { /// Full request duration until successful completion, error or cancellation. - pub(super) req_seconds: PassFailCancelledRequestTyped, + pub(crate) req_seconds: PassFailCancelledRequestTyped, /// Total amount of seconds waited on queue. - pub(super) wait_seconds: RequestTyped, + pub(crate) wait_seconds: RequestTyped, /// Track how many semaphore awaits were cancelled per request type. /// /// This is in case cancellations are happening more than expected. - pub(super) cancelled_waits: RequestTyped, + pub(crate) cancelled_waits: RequestTyped, /// Total amount of deleted objects in batches or single requests. 
- pub(super) deleted_objects_total: IntCounter, + pub(crate) deleted_objects_total: IntCounter, } impl Default for BucketMetrics { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index c3d6c75e20..76cf3eac80 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -46,15 +46,16 @@ use utils::backoff; use super::StorageMetadata; use crate::{ - error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError, - Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel, - MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR, + error::Cancelled, + metrics::{start_counting_cancelled_wait, start_measuring_requests}, + support::PermitCarrying, + ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, + S3Config, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, + REMOTE_STORAGE_PREFIX_SEPARATOR, }; -pub(super) mod metrics; - -use self::metrics::AttemptOutcome; -pub(super) use self::metrics::RequestKind; +use crate::metrics::AttemptOutcome; +pub(super) use crate::metrics::RequestKind; /// AWS S3 storage. pub struct S3Bucket { @@ -227,7 +228,7 @@ impl S3Bucket { }; let started_at = ScopeGuard::into_inner(started_at); - metrics::BUCKET_METRICS + crate::metrics::BUCKET_METRICS .wait_seconds .observe_elapsed(kind, started_at); @@ -248,7 +249,7 @@ impl S3Bucket { }; let started_at = ScopeGuard::into_inner(started_at); - metrics::BUCKET_METRICS + crate::metrics::BUCKET_METRICS .wait_seconds .observe_elapsed(kind, started_at); Ok(permit) @@ -287,7 +288,7 @@ impl S3Bucket { // Count this in the AttemptOutcome::Ok bucket, because 404 is not // an error: we expect to sometimes fetch an object and find it missing, // e.g. when probing for timeline indices. 
- metrics::BUCKET_METRICS.req_seconds.observe_elapsed( + crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed( kind, AttemptOutcome::Ok, started_at, @@ -295,7 +296,7 @@ impl S3Bucket { return Err(DownloadError::NotFound); } Err(e) => { - metrics::BUCKET_METRICS.req_seconds.observe_elapsed( + crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed( kind, AttemptOutcome::Err, started_at, @@ -371,12 +372,12 @@ impl S3Bucket { }; let started_at = ScopeGuard::into_inner(started_at); - metrics::BUCKET_METRICS + crate::metrics::BUCKET_METRICS .req_seconds .observe_elapsed(kind, &resp, started_at); let resp = resp.context("request deletion")?; - metrics::BUCKET_METRICS + crate::metrics::BUCKET_METRICS .deleted_objects_total .inc_by(chunk.len() as u64); @@ -435,14 +436,14 @@ pin_project_lite::pin_project! { /// Times and tracks the outcome of the request. struct TimedDownload { started_at: std::time::Instant, - outcome: metrics::AttemptOutcome, + outcome: AttemptOutcome, #[pin] inner: S } impl PinnedDrop for TimedDownload { fn drop(mut this: Pin<&mut Self>) { - metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at); + crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at); } } } @@ -451,7 +452,7 @@ impl TimedDownload { fn new(started_at: std::time::Instant, inner: S) -> Self { TimedDownload { started_at, - outcome: metrics::AttemptOutcome::Cancelled, + outcome: AttemptOutcome::Cancelled, inner, } } @@ -468,8 +469,8 @@ impl>> Stream for TimedDownload { let res = ready!(this.inner.poll_next(cx)); match &res { Some(Ok(_)) => {} - Some(Err(_)) => *this.outcome = metrics::AttemptOutcome::Err, - None => *this.outcome = metrics::AttemptOutcome::Ok, + Some(Err(_)) => *this.outcome = AttemptOutcome::Err, + None => *this.outcome = AttemptOutcome::Ok, } Poll::Ready(res) @@ -543,7 +544,7 @@ impl RemoteStorage for S3Bucket { let started_at = ScopeGuard::into_inner(started_at); - 
metrics::BUCKET_METRICS + crate::metrics::BUCKET_METRICS .req_seconds .observe_elapsed(kind, &response, started_at); @@ -625,7 +626,7 @@ impl RemoteStorage for S3Bucket { if let Ok(inner) = &res { // do not incl. timeouts as errors in metrics but cancellations let started_at = ScopeGuard::into_inner(started_at); - metrics::BUCKET_METRICS + crate::metrics::BUCKET_METRICS .req_seconds .observe_elapsed(kind, inner, started_at); } @@ -673,7 +674,7 @@ impl RemoteStorage for S3Bucket { }; let started_at = ScopeGuard::into_inner(started_at); - metrics::BUCKET_METRICS + crate::metrics::BUCKET_METRICS .req_seconds .observe_elapsed(kind, &res, started_at); @@ -977,28 +978,6 @@ impl RemoteStorage for S3Bucket { } } -/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`]. -fn start_counting_cancelled_wait( - kind: RequestKind, -) -> ScopeGuard { - scopeguard::guard_on_success(std::time::Instant::now(), move |_| { - metrics::BUCKET_METRICS.cancelled_waits.get(kind).inc() - }) -} - -/// On drop (cancellation) add time to [`metrics::BucketMetrics::req_seconds`]. -fn start_measuring_requests( - kind: RequestKind, -) -> ScopeGuard { - scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| { - metrics::BUCKET_METRICS.req_seconds.observe_elapsed( - kind, - AttemptOutcome::Cancelled, - started_at, - ) - }) -} - // Save RAM and only store the needed data instead of the entire ObjectVersion/DeleteMarkerEntry struct VerOrDelete { kind: VerOrDeleteKind, diff --git a/libs/remote_storage/src/support.rs b/libs/remote_storage/src/support.rs index d146b5445b..1ed9ed9305 100644 --- a/libs/remote_storage/src/support.rs +++ b/libs/remote_storage/src/support.rs @@ -78,6 +78,10 @@ where let e = Err(std::io::Error::from(e)); return Poll::Ready(Some(e)); } + } else { + // this would be perfectly valid behaviour for doing a graceful completion on the + // download for example, but not one we expect to do right now. 
+ tracing::warn!("continuing polling after having cancelled or timeouted"); } this.inner.poll_next(cx) @@ -89,13 +93,22 @@ where } /// Fires only on the first cancel or timeout, not on both. -pub(crate) async fn cancel_or_timeout( +pub(crate) fn cancel_or_timeout( timeout: Duration, cancel: CancellationToken, -) -> TimeoutOrCancel { - tokio::select! { - _ = tokio::time::sleep(timeout) => TimeoutOrCancel::Timeout, - _ = cancel.cancelled() => TimeoutOrCancel::Cancel, +) -> impl std::future::Future + 'static { + // futures are lazy, they don't do anything before being polled. + // + // "precalculate" the wanted deadline before returning the future, so that we can use pause + // failpoint to trigger a timeout in test. + let deadline = tokio::time::Instant::now() + timeout; + async move { + tokio::select! { + _ = tokio::time::sleep_until(deadline) => TimeoutOrCancel::Timeout, + _ = cancel.cancelled() => { + TimeoutOrCancel::Cancel + }, + } } } @@ -172,4 +185,31 @@ mod tests { _ = tokio::time::sleep(Duration::from_secs(121)) => {}, } } + + #[tokio::test] + async fn notified_but_pollable_after() { + let inner = futures::stream::once(futures::future::ready(Ok(bytes::Bytes::from_static( + b"hello world", + )))); + let timeout = Duration::from_secs(120); + let cancel = CancellationToken::new(); + + cancel.cancel(); + let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner); + let mut stream = std::pin::pin!(stream); + + let next = stream.next().await; + let ioe = next.unwrap().unwrap_err(); + assert!( + matches!( + ioe.get_ref().unwrap().downcast_ref::(), + Some(&DownloadError::Cancelled) + ), + "{ioe:?}" + ); + + let next = stream.next().await; + let bytes = next.unwrap().unwrap(); + assert_eq!(&b"hello world"[..], bytes); + } } diff --git a/libs/utils/src/fs_ext.rs b/libs/utils/src/fs_ext.rs index 90ba348a02..8e53d2c79b 100644 --- a/libs/utils/src/fs_ext.rs +++ b/libs/utils/src/fs_ext.rs @@ -3,6 +3,9 @@ use std::{fs, io, path::Path}; use 
anyhow::Context; +mod rename_noreplace; +pub use rename_noreplace::rename_noreplace; + pub trait PathExt { /// Returns an error if `self` is not a directory. fn is_empty_dir(&self) -> io::Result; diff --git a/libs/utils/src/fs_ext/rename_noreplace.rs b/libs/utils/src/fs_ext/rename_noreplace.rs new file mode 100644 index 0000000000..897e30d7f1 --- /dev/null +++ b/libs/utils/src/fs_ext/rename_noreplace.rs @@ -0,0 +1,109 @@ +use nix::NixPath; + +/// Rename a file without replacing an existing file. +/// +/// This is a wrapper around platform-specific APIs. +pub fn rename_noreplace( + src: &P1, + dst: &P2, +) -> nix::Result<()> { + { + #[cfg(target_os = "linux")] + { + nix::fcntl::renameat2( + None, + src, + None, + dst, + nix::fcntl::RenameFlags::RENAME_NOREPLACE, + ) + } + #[cfg(target_os = "macos")] + { + let res = src.with_nix_path(|src| { + dst.with_nix_path(|dst| + // SAFETY: `src` and `dst` are valid C strings as per the NixPath trait and they outlive the call to renamex_np. + unsafe { + nix::libc::renamex_np(src.as_ptr(), dst.as_ptr(), nix::libc::RENAME_EXCL) + }) + })??; + nix::errno::Errno::result(res).map(drop) + } + #[cfg(not(any(target_os = "linux", target_os = "macos")))] + { + std::compile_error!("OS does not support no-replace renames"); + } + } +} + +#[cfg(test)] +mod test { + use std::{fs, path::PathBuf}; + + use super::*; + + fn testdir() -> camino_tempfile::Utf8TempDir { + match crate::env::var("NEON_UTILS_RENAME_NOREPLACE_TESTDIR") { + Some(path) => { + let path: camino::Utf8PathBuf = path; + camino_tempfile::tempdir_in(path).unwrap() + } + None => camino_tempfile::tempdir().unwrap(), + } + } + + #[test] + fn test_absolute_paths() { + let testdir = testdir(); + println!("testdir: {}", testdir.path()); + + let src = testdir.path().join("src"); + let dst = testdir.path().join("dst"); + + fs::write(&src, b"").unwrap(); + fs::write(&dst, b"").unwrap(); + + let src = src.canonicalize().unwrap(); + assert!(src.is_absolute()); + let dst = 
dst.canonicalize().unwrap(); + assert!(dst.is_absolute()); + + let result = rename_noreplace(&src, &dst); + assert_eq!(result.unwrap_err(), nix::Error::EEXIST); + } + + #[test] + fn test_relative_paths() { + let testdir = testdir(); + println!("testdir: {}", testdir.path()); + + // this is fine because we run in nextest => process per test + std::env::set_current_dir(testdir.path()).unwrap(); + + let src = PathBuf::from("src"); + let dst = PathBuf::from("dst"); + + fs::write(&src, b"").unwrap(); + fs::write(&dst, b"").unwrap(); + + let result = rename_noreplace(&src, &dst); + assert_eq!(result.unwrap_err(), nix::Error::EEXIST); + } + + #[test] + fn test_works_when_not_exists() { + let testdir = testdir(); + println!("testdir: {}", testdir.path()); + + let src = testdir.path().join("src"); + let dst = testdir.path().join("dst"); + + fs::write(&src, b"content").unwrap(); + + rename_noreplace(src.as_std_path(), dst.as_std_path()).unwrap(); + assert_eq!( + "content", + String::from_utf8(std::fs::read(&dst).unwrap()).unwrap() + ); + } +} diff --git a/libs/utils/src/hex.rs b/libs/utils/src/hex.rs index fc0bb7e4a2..382f805a96 100644 --- a/libs/utils/src/hex.rs +++ b/libs/utils/src/hex.rs @@ -19,13 +19,13 @@ /// // right: [0x68; 1] /// # fn serialize_something() -> Vec { "hello world".as_bytes().to_vec() } /// ``` -#[derive(PartialEq)] -pub struct Hex<'a>(pub &'a [u8]); +pub struct Hex(pub S); -impl std::fmt::Debug for Hex<'_> { +impl> std::fmt::Debug for Hex { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "[")?; - for (i, c) in self.0.chunks(16).enumerate() { + let chunks = self.0.as_ref().chunks(16); + for (i, c) in chunks.enumerate() { if i > 0 && !c.is_empty() { writeln!(f, ", ")?; } @@ -36,6 +36,15 @@ impl std::fmt::Debug for Hex<'_> { write!(f, "0x{b:02x}")?; } } - write!(f, "; {}]", self.0.len()) + write!(f, "; {}]", self.0.as_ref().len()) + } +} + +impl, L: AsRef<[u8]>> PartialEq> for Hex { + fn eq(&self, other: &Hex) -> bool { + 
let left = self.0.as_ref(); + let right = other.0.as_ref(); + + left == right } } diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml index 843f5dd862..be5626040b 100644 --- a/pageserver/ctl/Cargo.toml +++ b/pageserver/ctl/Cargo.toml @@ -17,6 +17,7 @@ pageserver = { path = ".." } pageserver_api.workspace = true remote_storage = { path = "../../libs/remote_storage" } postgres_ffi.workspace = true +thiserror.workspace = true tokio.workspace = true tokio-util.workspace = true toml_edit.workspace = true diff --git a/pageserver/ctl/src/index_part.rs b/pageserver/ctl/src/index_part.rs index 2998b5c732..a33cae6769 100644 --- a/pageserver/ctl/src/index_part.rs +++ b/pageserver/ctl/src/index_part.rs @@ -26,7 +26,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> { let output = Output { layer_metadata: &des.layer_metadata, - disk_consistent_lsn: des.get_disk_consistent_lsn(), + disk_consistent_lsn: des.metadata.disk_consistent_lsn(), timeline_metadata: &des.metadata, }; diff --git a/pageserver/ctl/src/key.rs b/pageserver/ctl/src/key.rs new file mode 100644 index 0000000000..af4b5a21ab --- /dev/null +++ b/pageserver/ctl/src/key.rs @@ -0,0 +1,475 @@ +use anyhow::Context; +use clap::Parser; +use pageserver_api::{ + key::Key, + reltag::{BlockNumber, RelTag, SlruKind}, + shard::{ShardCount, ShardStripeSize}, +}; +use std::str::FromStr; + +#[derive(Parser)] +pub(super) struct DescribeKeyCommand { + /// Key material in one of the forms: hex, span attributes captured from log, reltag blocknum + input: Vec, + + /// The number of shards to calculate what Keys placement would be. + #[arg(long)] + shard_count: Option, + + /// The sharding stripe size. + /// + /// The default is hardcoded. It makes no sense to provide this without providing + /// `--shard-count`. + #[arg(long, requires = "shard_count")] + stripe_size: Option, +} + +/// Sharded shard count without unsharded count, which the actual ShardCount supports. 
+#[derive(Clone, Copy)] +pub(super) struct CustomShardCount(std::num::NonZeroU8); + +#[derive(Debug, thiserror::Error)] +pub(super) enum InvalidShardCount { + #[error(transparent)] + ParsingFailed(#[from] std::num::ParseIntError), + #[error("too few shards")] + TooFewShards, +} + +impl FromStr for CustomShardCount { + type Err = InvalidShardCount; + + fn from_str(s: &str) -> Result { + let inner: std::num::NonZeroU8 = s.parse()?; + if inner.get() < 2 { + Err(InvalidShardCount::TooFewShards) + } else { + Ok(CustomShardCount(inner)) + } + } +} + +impl From for ShardCount { + fn from(value: CustomShardCount) -> Self { + ShardCount::new(value.0.get()) + } +} + +impl DescribeKeyCommand { + pub(super) fn execute(self) { + let DescribeKeyCommand { + input, + shard_count, + stripe_size, + } = self; + + let material = KeyMaterial::try_from(input.as_slice()).unwrap(); + let kind = material.kind(); + let key = Key::from(material); + + println!("parsed from {kind}: {key}:"); + println!(); + println!("{key:?}"); + + macro_rules! kind_query { + ([$($name:ident),*$(,)?]) => {{[$(kind_query!($name)),*]}}; + ($name:ident) => {{ + let s: &'static str = stringify!($name); + let s = s.strip_prefix("is_").unwrap_or(s); + let s = s.strip_suffix("_key").unwrap_or(s); + + #[allow(clippy::needless_borrow)] + (s, key.$name()) + }}; + } + + // the current characterization is a mess of these boolean queries and separate + // "recognization". I think it accurately represents how strictly we model the Key + // right now, but could of course be made less confusing. 
+ + let queries = kind_query!([ + is_rel_block_key, + is_rel_vm_block_key, + is_rel_fsm_block_key, + is_slru_block_key, + is_inherited_key, + is_rel_size_key, + is_slru_segment_size_key, + ]); + + let recognized_kind = "recognized kind"; + let metadata_key = "metadata key"; + let shard_placement = "shard placement"; + + let longest = queries + .iter() + .map(|t| t.0) + .chain([recognized_kind, metadata_key, shard_placement]) + .map(|s| s.len()) + .max() + .unwrap(); + + let colon = 1; + let padding = 1; + + for (name, is) in queries { + let width = longest - name.len() + colon + padding; + println!("{}{:width$}{}", name, ":", is); + } + + let width = longest - recognized_kind.len() + colon + padding; + println!( + "{}{:width$}{:?}", + recognized_kind, + ":", + RecognizedKeyKind::new(key), + ); + + if let Some(shard_count) = shard_count { + // seeing the sharding placement might be confusing, so leave it out unless shard + // count was given. + + let stripe_size = stripe_size.map(ShardStripeSize).unwrap_or_default(); + println!( + "# placement with shard_count: {} and stripe_size: {}:", + shard_count.0, stripe_size.0 + ); + let width = longest - shard_placement.len() + colon + padding; + println!( + "{}{:width$}{:?}", + shard_placement, + ":", + pageserver_api::shard::describe(&key, shard_count.into(), stripe_size) + ); + } + } +} + +/// Hand-wavy "inputs we accept" for a key. 
+#[derive(Debug)] +pub(super) enum KeyMaterial { + Hex(Key), + String(SpanAttributesFromLogs), + Split(RelTag, BlockNumber), +} + +impl KeyMaterial { + fn kind(&self) -> &'static str { + match self { + KeyMaterial::Hex(_) => "hex", + KeyMaterial::String(_) | KeyMaterial::Split(_, _) => "split", + } + } +} + +impl From for Key { + fn from(value: KeyMaterial) -> Self { + match value { + KeyMaterial::Hex(key) => key, + KeyMaterial::String(SpanAttributesFromLogs(rt, blocknum)) + | KeyMaterial::Split(rt, blocknum) => { + pageserver_api::key::rel_block_to_key(rt, blocknum) + } + } + } +} + +impl> TryFrom<&[S]> for KeyMaterial { + type Error = anyhow::Error; + + fn try_from(value: &[S]) -> Result { + match value { + [] => anyhow::bail!( + "need 1..N positional arguments describing the key, try hex or a log line" + ), + [one] => { + let one = one.as_ref(); + + let key = Key::from_hex(one).map(KeyMaterial::Hex); + + let attrs = SpanAttributesFromLogs::from_str(one).map(KeyMaterial::String); + + match (key, attrs) { + (Ok(key), _) => Ok(key), + (_, Ok(s)) => Ok(s), + (Err(e1), Err(e2)) => anyhow::bail!( + "failed to parse {one:?} as hex or span attributes:\n- {e1:#}\n- {e2:#}" + ), + } + } + more => { + // assume going left to right one of these is a reltag and then we find a blocknum + // this works, because we don't have plain numbers at least right after reltag in + // logs. for some definition of "works". 
+ + let Some((reltag_at, reltag)) = more + .iter() + .map(AsRef::as_ref) + .enumerate() + .find_map(|(i, s)| { + s.split_once("rel=") + .map(|(_garbage, actual)| actual) + .unwrap_or(s) + .parse::() + .ok() + .map(|rt| (i, rt)) + }) + else { + anyhow::bail!("found no RelTag in arguments"); + }; + + let Some(blocknum) = more + .iter() + .map(AsRef::as_ref) + .skip(reltag_at) + .find_map(|s| { + s.split_once("blkno=") + .map(|(_garbage, actual)| actual) + .unwrap_or(s) + .parse::() + .ok() + }) + else { + anyhow::bail!("found no blocknum in arguments"); + }; + + Ok(KeyMaterial::Split(reltag, blocknum)) + } + } + } +} + +#[derive(Debug)] +pub(super) struct SpanAttributesFromLogs(RelTag, BlockNumber); + +impl std::str::FromStr for SpanAttributesFromLogs { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + // accept the span separator but do not require or fail if either is missing + // "whatever{rel=1663/16389/24615 blkno=1052204 req_lsn=FFFFFFFF/FFFFFFFF}" + let (_, reltag) = s + .split_once("rel=") + .ok_or_else(|| anyhow::anyhow!("cannot find 'rel='"))?; + let reltag = reltag.split_whitespace().next().unwrap(); + + let (_, blocknum) = s + .split_once("blkno=") + .ok_or_else(|| anyhow::anyhow!("cannot find 'blkno='"))?; + let blocknum = blocknum.split_whitespace().next().unwrap(); + + let reltag = reltag + .parse() + .with_context(|| format!("parse reltag from {reltag:?}"))?; + let blocknum = blocknum + .parse() + .with_context(|| format!("parse blocknum from {blocknum:?}"))?; + + Ok(Self(reltag, blocknum)) + } +} + +#[derive(Debug)] +#[allow(dead_code)] // debug print is used +enum RecognizedKeyKind { + DbDir, + ControlFile, + Checkpoint, + AuxFilesV1, + SlruDir(Result), + RelMap(RelTagish<2>), + RelDir(RelTagish<2>), + AuxFileV2(Result>), +} + +#[derive(Debug, PartialEq)] +#[allow(unused)] +enum AuxFileV2 { + Recognized(&'static str, utils::Hex<[u8; 13]>), + OtherWithPrefix(&'static str, utils::Hex<[u8; 13]>), + Other(utils::Hex<[u8; 13]>), +} + 
+impl RecognizedKeyKind { + fn new(key: Key) -> Option { + use RecognizedKeyKind::{ + AuxFilesV1, Checkpoint, ControlFile, DbDir, RelDir, RelMap, SlruDir, + }; + + let slru_dir_kind = pageserver_api::key::slru_dir_kind(&key); + + Some(match key { + pageserver_api::key::DBDIR_KEY => DbDir, + pageserver_api::key::CONTROLFILE_KEY => ControlFile, + pageserver_api::key::CHECKPOINT_KEY => Checkpoint, + pageserver_api::key::AUX_FILES_KEY => AuxFilesV1, + _ if slru_dir_kind.is_some() => SlruDir(slru_dir_kind.unwrap()), + _ if key.field1 == 0 && key.field4 == 0 && key.field5 == 0 && key.field6 == 0 => { + RelMap([key.field2, key.field3].into()) + } + _ if key.field1 == 0 && key.field4 == 0 && key.field5 == 0 && key.field6 == 1 => { + RelDir([key.field2, key.field3].into()) + } + _ if key.is_metadata_key() => RecognizedKeyKind::AuxFileV2( + AuxFileV2::new(key).ok_or_else(|| utils::Hex(key.to_i128().to_be_bytes())), + ), + _ => return None, + }) + } +} + +impl AuxFileV2 { + fn new(key: Key) -> Option { + const EMPTY_HASH: [u8; 13] = { + let mut out = [0u8; 13]; + let hash = pageserver::aux_file::fnv_hash(b"").to_be_bytes(); + let mut i = 3; + while i < 16 { + out[i - 3] = hash[i]; + i += 1; + } + out + }; + + let bytes = key.to_i128().to_be_bytes(); + let hash = utils::Hex(<[u8; 13]>::try_from(&bytes[3..]).unwrap()); + + assert_eq!(EMPTY_HASH.len(), hash.0.len()); + + // TODO: we could probably find the preimages for the hashes + + Some(match (bytes[1], bytes[2]) { + (1, 1) => AuxFileV2::Recognized("pg_logical/mappings/", hash), + (1, 2) => AuxFileV2::Recognized("pg_logical/snapshots/", hash), + (1, 3) if hash.0 == EMPTY_HASH => { + AuxFileV2::Recognized("pg_logical/replorigin_checkpoint", hash) + } + (2, 1) => AuxFileV2::Recognized("pg_replslot/", hash), + (1, 0xff) => AuxFileV2::OtherWithPrefix("pg_logical/", hash), + (0xff, 0xff) => AuxFileV2::Other(hash), + _ => return None, + }) + } +} + +/// Prefix of RelTag, currently only known use cases are the two item versions. 
+/// +/// Renders like a reltag with `/`, nothing else. +struct RelTagish([u32; N]); + +impl From<[u32; N]> for RelTagish { + fn from(val: [u32; N]) -> Self { + RelTagish(val) + } +} + +impl std::fmt::Debug for RelTagish { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use std::fmt::Write as _; + let mut first = true; + self.0.iter().try_for_each(|x| { + if !first { + f.write_char('/')?; + } + first = false; + write!(f, "{}", x) + }) + } +} + +#[cfg(test)] +mod tests { + use pageserver::aux_file::encode_aux_file_key; + + use super::*; + + #[test] + fn hex_is_key_material() { + let m = KeyMaterial::try_from(&["000000067F0000400200DF927900FFFFFFFF"][..]).unwrap(); + assert!(matches!(m, KeyMaterial::Hex(_)), "{m:?}"); + } + + #[test] + fn single_positional_spanalike_is_key_material() { + // why is this needed? if you are checking many, then copypaste starts to appeal + let strings = [ + (line!(), "2024-05-15T15:33:49.873906Z ERROR page_service_conn_main{peer_addr=A:B}:process_query{tenant_id=C timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm blkno=2 req_lsn=0/238D98C8}: error reading relation or page version: Read error: could not find data for key 000000067F00032CE5000000000000000001 (shard ShardNumber(0)) at LSN 0/1D0A16C1, request LSN 0/238D98C8, ancestor 0/0"), + (line!(), "rel=1663/208101/2620_fsm blkno=2"), + (line!(), "rel=1663/208101/2620.1 blkno=2"), + ]; + + let mut first: Option = None; + + for (line, example) in strings { + let m = KeyMaterial::try_from(&[example][..]) + .unwrap_or_else(|e| panic!("failed to parse example from line {line}: {e:?}")); + let key = Key::from(m); + if let Some(first) = first { + assert_eq!(first, key); + } else { + first = Some(key); + } + } + + // not supporting this is rather accidential, but I think the input parsing is lenient + // enough already + KeyMaterial::try_from(&["1663/208101/2620_fsm 2"][..]).unwrap_err(); + } + + #[test] + fn 
multiple_spanlike_args() { + let strings = [ + (line!(), &["process_query{tenant_id=C", "timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm", "blkno=2", "req_lsn=0/238D98C8}"][..]), + (line!(), &["rel=1663/208101/2620_fsm", "blkno=2"][..]), + (line!(), &["1663/208101/2620_fsm", "2"][..]), + ]; + + let mut first: Option = None; + + for (line, example) in strings { + let m = KeyMaterial::try_from(example) + .unwrap_or_else(|e| panic!("failed to parse example from line {line}: {e:?}")); + let key = Key::from(m); + if let Some(first) = first { + assert_eq!(first, key); + } else { + first = Some(key); + } + } + } + #[test] + fn recognized_auxfiles() { + use AuxFileV2::*; + + let empty = [ + 0x2e, 0x07, 0xbb, 0x01, 0x42, 0x62, 0xb8, 0x21, 0x75, 0x62, 0x95, 0xc5, 0x8d, + ]; + let foobar = [ + 0x62, 0x79, 0x3c, 0x64, 0xbf, 0x6f, 0x0d, 0x35, 0x97, 0xba, 0x44, 0x6f, 0x18, + ]; + + #[rustfmt::skip] + let examples = [ + (line!(), "pg_logical/mappings/foobar", Recognized("pg_logical/mappings/", utils::Hex(foobar))), + (line!(), "pg_logical/snapshots/foobar", Recognized("pg_logical/snapshots/", utils::Hex(foobar))), + (line!(), "pg_logical/replorigin_checkpoint", Recognized("pg_logical/replorigin_checkpoint", utils::Hex(empty))), + (line!(), "pg_logical/foobar", OtherWithPrefix("pg_logical/", utils::Hex(foobar))), + (line!(), "pg_replslot/foobar", Recognized("pg_replslot/", utils::Hex(foobar))), + (line!(), "foobar", Other(utils::Hex(foobar))), + ]; + + for (line, path, expected) in examples { + let key = encode_aux_file_key(path); + let recognized = + AuxFileV2::new(key).unwrap_or_else(|| panic!("line {line} example failed")); + + assert_eq!(recognized, expected); + } + + assert_eq!( + AuxFileV2::new(Key::from_hex("600000102000000000000000000000000000").unwrap()), + None, + "example key has one too few 0 after 6 before 1" + ); + } +} diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index e92c352dab..50c3ac4c61 100644 
--- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -6,6 +6,7 @@ mod draw_timeline_dir; mod index_part; +mod key; mod layer_map_analyzer; mod layers; @@ -61,6 +62,8 @@ enum Commands { AnalyzeLayerMap(AnalyzeLayerMapCmd), #[command(subcommand)] Layer(LayerCmd), + /// Debug print a hex key found from logs + Key(key::DescribeKeyCommand), } /// Read and update pageserver metadata file @@ -183,6 +186,7 @@ async fn main() -> anyhow::Result<()> { .time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel) .await?; } + Commands::Key(dkc) => dkc.execute(), }; Ok(()) } diff --git a/pageserver/pagebench/src/cmd/aux_files.rs b/pageserver/pagebench/src/cmd/aux_files.rs index eb5b242a5f..bce3285606 100644 --- a/pageserver/pagebench/src/cmd/aux_files.rs +++ b/pageserver/pagebench/src/cmd/aux_files.rs @@ -5,6 +5,7 @@ use utils::lsn::Lsn; use std::collections::HashMap; use std::sync::Arc; +use std::time::Instant; /// Ingest aux files into the pageserver. #[derive(clap::Parser)] @@ -88,11 +89,17 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { println!("ingested {file_cnt} files"); } - let files = mgmt_api_client - .list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1)) - .await?; - - println!("{} files found", files.len()); + for _ in 0..100 { + let start = Instant::now(); + let files = mgmt_api_client + .list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1)) + .await?; + println!( + "{} files found in {}s", + files.len(), + start.elapsed().as_secs_f64() + ); + } anyhow::Ok(()) } diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 5043a207fc..4992f37465 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -1,6 +1,6 @@ use anyhow::Context; use camino::Utf8PathBuf; -use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key}; +use pageserver_api::key::Key; use 
pageserver_api::keyspace::KeySpaceAccum; use pageserver_api::models::PagestreamGetPageRequest; @@ -187,7 +187,7 @@ async fn main_impl( for r in partitioning.keys.ranges.iter() { let mut i = r.start; while i != r.end { - if is_rel_block_key(&i) { + if i.is_rel_block_key() { filtered.add_key(i); } i = i.next(); @@ -308,9 +308,10 @@ async fn main_impl( let r = &ranges[weights.sample(&mut rng)]; let key: i128 = rng.gen_range(r.start..r.end); let key = Key::from_i128(key); - assert!(is_rel_block_key(&key)); - let (rel_tag, block_no) = - key_to_rel_block(key).expect("we filter non-rel-block keys out above"); + assert!(key.is_rel_block_key()); + let (rel_tag, block_no) = key + .to_rel_block() + .expect("we filter non-rel-block keys out above"); PagestreamGetPageRequest { request_lsn: if rng.gen_bool(args.req_latest_probability) { Lsn::MAX diff --git a/pageserver/src/aux_file.rs b/pageserver/src/aux_file.rs index 38e1875db1..5e527b7d61 100644 --- a/pageserver/src/aux_file.rs +++ b/pageserver/src/aux_file.rs @@ -178,7 +178,8 @@ impl AuxFileSizeEstimator { } } - pub fn on_base_backup(&self, new_size: usize) { + /// When generating base backup or doing initial logical size calculation + pub fn on_initial(&self, new_size: usize) { let mut guard = self.size.lock().unwrap(); *guard = Some(new_size as isize); self.report(new_size as isize); diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index dca1510810..0f057a4368 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -13,7 +13,7 @@ use anyhow::{anyhow, Context}; use bytes::{BufMut, Bytes, BytesMut}; use fail::fail_point; -use pageserver_api::key::{key_to_slru_block, Key}; +use pageserver_api::key::Key; use postgres_ffi::pg_constants; use std::fmt::Write as FmtWrite; use std::time::SystemTime; @@ -170,7 +170,7 @@ where } async fn add_block(&mut self, key: &Key, block: Bytes) -> Result<(), BasebackupError> { - let (kind, segno, _) = key_to_slru_block(*key)?; + let (kind, segno, 
_) = key.to_slru_block()?; match kind { SlruKind::Clog => { @@ -362,6 +362,13 @@ where )); info!("Replication slot {} restart LSN={}", path, restart_lsn); min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn); + } else if path == "pg_logical/replorigin_checkpoint" { + // replorigin_checkoint is written only on compute shutdown, so it contains + // deteriorated values. So we generate our own version of this file for the particular LSN + // based on information about replorigins extracted from transaction commit records. + // In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all, + // but now we should handle (skip) it for backward compatibility. + continue; } let header = new_tar_header(&path, content.len() as u64)?; self.ar @@ -390,6 +397,32 @@ where { self.add_twophase_file(xid).await?; } + let repl_origins = self + .timeline + .get_replorigins(self.lsn, self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))?; + let n_origins = repl_origins.len(); + if n_origins != 0 { + // + // Construct "pg_logical/replorigin_checkpoint" file based on information about replication origins + // extracted from transaction commit record. We are using this file to pass information about replication + // origins to compute to allow logical replication to restart from proper point. 
+ // + let mut content = Vec::with_capacity(n_origins * 16 + 8); + content.extend_from_slice(&pg_constants::REPLICATION_STATE_MAGIC.to_le_bytes()); + for (origin_id, origin_lsn) in repl_origins { + content.extend_from_slice(&origin_id.to_le_bytes()); + content.extend_from_slice(&[0u8; 6]); // align to 8 bytes + content.extend_from_slice(&origin_lsn.0.to_le_bytes()); + } + let crc32 = crc32c::crc32c(&content); + content.extend_from_slice(&crc32.to_le_bytes()); + let header = new_tar_header("pg_logical/replorigin_checkpoint", content.len() as u64)?; + self.ar.append(&header, &*content).await.context( + "could not add pg_logical/replorigin_checkpoint file to basebackup tarball", + )?; + } fail_point!("basebackup-before-control-file", |_| { Err(BasebackupError::Server(anyhow!( diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index b0afb6414b..b4a0d1ac02 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -99,8 +99,6 @@ pub mod defaults { pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; - pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "async"; - /// /// Default built-in configuration file. /// @@ -146,8 +144,6 @@ pub mod defaults { #validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}' -#walredo_process_kind = '{DEFAULT_WALREDO_PROCESS_KIND}' - [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -300,8 +296,6 @@ pub struct PageServerConf { /// /// Setting this to zero disables limits on total ephemeral layer size. 
pub ephemeral_bytes_per_memory_kb: usize, - - pub walredo_process_kind: crate::walredo::ProcessKind, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -407,8 +401,6 @@ struct PageServerConfigBuilder { validate_vectored_get: BuilderValue, ephemeral_bytes_per_memory_kb: BuilderValue, - - walredo_process_kind: BuilderValue, } impl PageServerConfigBuilder { @@ -497,8 +489,6 @@ impl PageServerConfigBuilder { )), validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET), ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), - - walredo_process_kind: Set(DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap()), } } } @@ -686,10 +676,6 @@ impl PageServerConfigBuilder { self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value); } - pub fn get_walredo_process_kind(&mut self, value: crate::walredo::ProcessKind) { - self.walredo_process_kind = BuilderValue::Set(value); - } - pub fn build(self) -> anyhow::Result { let default = Self::default_values(); @@ -747,7 +733,6 @@ impl PageServerConfigBuilder { max_vectored_read_bytes, validate_vectored_get, ephemeral_bytes_per_memory_kb, - walredo_process_kind, } CUSTOM LOGIC { @@ -1044,9 +1029,6 @@ impl PageServerConf { "ephemeral_bytes_per_memory_kb" => { builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize) } - "walredo_process_kind" => { - builder.get_walredo_process_kind(parse_toml_from_str("walredo_process_kind", item)?) 
- } _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -1130,7 +1112,6 @@ impl PageServerConf { ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, - walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(), } } } @@ -1370,7 +1351,6 @@ background_task_maximum_delay = '334 s' ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, - walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(), }, "Correct defaults should be used when no config values are provided" ); @@ -1444,7 +1424,6 @@ background_task_maximum_delay = '334 s' ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, - walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(), }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 62bbde42f4..540d0d2e8c 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -358,7 +358,7 @@ async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &Re // mean the synthetic size worker should terminate. 
let shutting_down = matches!( e.downcast_ref::(), - Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_)) + Some(PageReconstructError::Cancelled) ); if !shutting_down { diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 8790a9b0a8..3960fc1b99 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -311,7 +311,7 @@ impl DeletionList { result.extend( timeline_layers .into_iter() - .map(|l| timeline_remote_path.join(&Utf8PathBuf::from(l))), + .map(|l| timeline_remote_path.join(Utf8PathBuf::from(l))), ); } } diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index e5eafc51f4..71b486a4d3 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -81,8 +81,10 @@ paths: Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved. 404 means that deletion successfully finished" responses: + "200": + description: Tenant was successfully deleted, or was already not found. "404": - description: Tenant not found. This is the success path. + description: Tenant not found. This is a success result, equivalent to 200. 
content: application/json: schema: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 8a061f3ae1..7fa6c35ad6 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -74,6 +74,7 @@ use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::storage_layer::LayerName; use crate::tenant::timeline::CompactFlags; +use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::Timeline; use crate::tenant::GetTimelineError; use crate::tenant::SpawnMode; @@ -183,9 +184,6 @@ impl From for ApiError { PageReconstructError::Cancelled => { ApiError::InternalServerError(anyhow::anyhow!("request was cancelled")) } - PageReconstructError::AncestorStopping(_) => { - ApiError::ResourceUnavailable(format!("{pre}").into()) - } PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()), PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre), } @@ -1075,7 +1073,7 @@ async fn tenant_delete_handler( let state = get_state(&request); - state + let status = state .tenant_manager .delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT) .instrument(info_span!("tenant_delete_handler", @@ -1084,7 +1082,14 @@ async fn tenant_delete_handler( )) .await?; - json_response(StatusCode::ACCEPTED, ()) + // Callers use 404 as success for deletions, for historical reasons. + if status == StatusCode::NOT_FOUND { + return Err(ApiError::NotFound( + anyhow::anyhow!("Deletion complete").into(), + )); + } + + json_response(status, ()) } /// HTTP endpoint to query the current tenant_size of a tenant. 
@@ -1813,11 +1818,22 @@ async fn timeline_checkpoint_handler( timeline .freeze_and_flush() .await - .map_err(ApiError::InternalServerError)?; + .map_err(|e| { + match e { + tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown, + other => ApiError::InternalServerError(other.into()), + + } + })?; timeline .compact(&cancel, flags, &ctx) .await - .map_err(|e| ApiError::InternalServerError(e.into()))?; + .map_err(|e| + match e { + CompactionError::ShuttingDown => ApiError::ShuttingDown, + CompactionError::Other(e) => ApiError::InternalServerError(e) + } + )?; if wait_until_uploaded { timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?; @@ -2173,7 +2189,7 @@ async fn tenant_scan_remote_handler( { Ok((index_part, index_generation)) => { tracing::info!("Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)", - index_part.layer_metadata.len(), index_part.get_disk_consistent_lsn()); + index_part.layer_metadata.len(), index_part.metadata.disk_consistent_lsn()); generation = std::cmp::max(generation, index_generation); } Err(DownloadError::NotFound) => { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index e9651165b1..ae389826d5 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -66,6 +66,7 @@ use crate::tenant::mgr::GetTenantError; use crate::tenant::mgr::ShardResolveResult; use crate::tenant::mgr::ShardSelector; use crate::tenant::mgr::TenantManager; +use crate::tenant::timeline::FlushLayerError; use crate::tenant::timeline::WaitLsnError; use crate::tenant::GetTimelineError; use crate::tenant::PageReconstructError; @@ -372,7 +373,7 @@ impl From for PageStreamError { match value { e @ WaitLsnError::Timeout(_) => Self::LsnTimeout(e), WaitLsnError::Shutdown => Self::Shutdown, - WaitLsnError::BadState => Self::Reconnect("Timeline is not active".into()), + e @ WaitLsnError::BadState { .. 
} => Self::Reconnect(format!("{e}").into()), } } } @@ -382,7 +383,7 @@ impl From for QueryError { match value { e @ WaitLsnError::Timeout(_) => Self::Other(anyhow::Error::new(e)), WaitLsnError::Shutdown => Self::Shutdown, - WaitLsnError::BadState => Self::Reconnect, + WaitLsnError::BadState { .. } => Self::Reconnect, } } } @@ -830,7 +831,10 @@ impl PageServerHandler { // We only want to persist the data, and it doesn't matter if it's in the // shape of deltas or images. info!("flushing layers"); - timeline.freeze_and_flush().await?; + timeline.freeze_and_flush().await.map_err(|e| match e { + FlushLayerError::Cancelled => QueryError::Shutdown, + other => QueryError::Other(other.into()), + })?; info!("done"); Ok(()) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index afba34c6d1..0bff4be150 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -17,8 +17,8 @@ use bytes::{Buf, Bytes, BytesMut}; use enum_map::Enum; use itertools::Itertools; use pageserver_api::key::{ - dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key, - rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key, + dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key, + relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key, slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, }; @@ -27,7 +27,7 @@ use pageserver_api::models::AuxFilePolicy; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::BLCKSZ; -use postgres_ffi::{Oid, TimestampTz, TransactionId}; +use postgres_ffi::{Oid, RepOriginId, TimestampTz, TransactionId}; use serde::{Deserialize, Serialize}; use std::collections::{hash_map, 
HashMap, HashSet}; use std::ops::ControlFlow; @@ -78,11 +78,19 @@ pub enum LsnForTimestamp { } #[derive(Debug, thiserror::Error)] -pub enum CalculateLogicalSizeError { +pub(crate) enum CalculateLogicalSizeError { #[error("cancelled")] Cancelled, + + /// Something went wrong while reading the metadata we use to calculate logical size + /// Note that cancellation variants of `PageReconstructError` are transformed to [`Self::Cancelled`] + /// in the `From` implementation for this variant. #[error(transparent)] - Other(#[from] anyhow::Error), + PageRead(PageReconstructError), + + /// Something went wrong deserializing metadata that we read to calculate logical size + #[error("decode error: {0}")] + Decode(#[from] DeserializeError), } #[derive(Debug, thiserror::Error)] @@ -107,10 +115,8 @@ impl From for CollectKeySpaceError { impl From for CalculateLogicalSizeError { fn from(pre: PageReconstructError) -> Self { match pre { - PageReconstructError::AncestorStopping(_) | PageReconstructError::Cancelled => { - Self::Cancelled - } - _ => Self::Other(pre.into()), + PageReconstructError::Cancelled => Self::Cancelled, + _ => Self::PageRead(pre), } } } @@ -712,10 +718,22 @@ impl Timeline { result.insert(fname, content); } } - self.aux_file_size_estimator.on_base_backup(sz); + self.aux_file_size_estimator.on_initial(sz); Ok(result) } + pub(crate) async fn trigger_aux_file_size_computation( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result<(), PageReconstructError> { + let current_policy = self.last_aux_file_policy.load(); + if let Some(AuxFilePolicy::V2) | Some(AuxFilePolicy::CrossValidation) = current_policy { + self.list_aux_files_v2(lsn, ctx).await?; + } + Ok(()) + } + pub(crate) async fn list_aux_files( &self, lsn: Lsn, @@ -754,6 +772,27 @@ impl Timeline { } } + pub(crate) async fn get_replorigins( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result, PageReconstructError> { + let kv = self + .scan(KeySpace::single(repl_origin_key_range()), lsn, ctx) + .await + 
.context("scan")?; + let mut result = HashMap::new(); + for (k, v) in kv { + let v = v.context("get value")?; + let origin_id = k.field6 as RepOriginId; + let origin_lsn = Lsn::des(&v).unwrap(); + if origin_lsn != Lsn::INVALID { + result.insert(origin_id, origin_lsn); + } + } + Ok(result) + } + /// Does the same as get_current_logical_size but counted on demand. /// Used to initialize the logical size tracking on startup. /// @@ -763,7 +802,7 @@ impl Timeline { /// # Cancel-Safety /// /// This method is cancellation-safe. - pub async fn get_current_logical_size_non_incremental( + pub(crate) async fn get_current_logical_size_non_incremental( &self, lsn: Lsn, ctx: &RequestContext, @@ -772,7 +811,7 @@ impl Timeline { // Fetch list of database dirs and iterate them let buf = self.get(DBDIR_KEY, lsn, ctx).await?; - let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?; + let dbdir = DbDirectory::des(&buf)?; let mut total_size: u64 = 0; for (spcnode, dbnode) in dbdir.dbdirs.keys() { @@ -879,7 +918,9 @@ impl Timeline { Ok(( result.to_keyspace(), /* AUX sparse key space */ - SparseKeySpace(KeySpace::single(Key::metadata_aux_key_range())), + SparseKeySpace(KeySpace { + ranges: vec![repl_origin_key_range(), Key::metadata_aux_key_range()], + }), )) } @@ -1148,6 +1189,20 @@ impl<'a> DatadirModification<'a> { Ok(()) } + pub async fn set_replorigin( + &mut self, + origin_id: RepOriginId, + origin_lsn: Lsn, + ) -> anyhow::Result<()> { + let key = repl_origin_key(origin_id); + self.put(key, Value::Image(origin_lsn.ser().unwrap().into())); + Ok(()) + } + + pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> anyhow::Result<()> { + self.set_replorigin(origin_id, Lsn::INVALID).await + } + pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> { self.put(CONTROLFILE_KEY, Value::Image(img)); Ok(()) @@ -1552,7 +1607,7 @@ impl<'a> DatadirModification<'a> { self.tline.aux_file_size_estimator.on_add(content.len()); new_files.push((path, 
content)); } - (None, true) => anyhow::bail!("removing non-existing aux file: {}", path), + (None, true) => warn!("removing non-existing aux file: {}", path), } let new_val = aux_file::encode_file_value(&new_files)?; self.put(key, Value::Image(new_val.into())); @@ -1606,8 +1661,7 @@ impl<'a> DatadirModification<'a> { aux_files.dir = Some(dir); } Err( - e @ (PageReconstructError::AncestorStopping(_) - | PageReconstructError::Cancelled + e @ (PageReconstructError::Cancelled | PageReconstructError::AncestorLsnTimeout(_)), ) => { // Important that we do not interpret a shutdown error as "not found" and thereby @@ -1679,7 +1733,7 @@ impl<'a> DatadirModification<'a> { let mut retained_pending_updates = HashMap::<_, Vec<_>>::new(); for (key, values) in self.pending_updates.drain() { for (lsn, value) in values { - if is_rel_block_key(&key) || is_slru_block_key(key) { + if key.is_rel_block_key() || key.is_slru_block_key() { // This bails out on first error without modifying pending_updates. // That's Ok, cf this function's doc comment. 
writer.put(key, lsn, &value, ctx).await?; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index e6bfd57a44..60cd5c9695 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -487,6 +487,33 @@ enum CreateTimelineCause { Delete, } +#[derive(thiserror::Error, Debug)] +pub(crate) enum GcError { + // The tenant is shutting down + #[error("tenant shutting down")] + TenantCancelled, + + // The tenant is shutting down + #[error("timeline shutting down")] + TimelineCancelled, + + // The tenant is in a state inelegible to run GC + #[error("not active")] + NotActive, + + // A requested GC cutoff LSN was invalid, for example it tried to move backwards + #[error("not active")] + BadLsn { why: String }, + + // A remote storage error while scheduling updates after compaction + #[error(transparent)] + Remote(anyhow::Error), + + // If GC was invoked for a particular timeline, this error means it didn't exist + #[error("timeline not found")] + TimelineNotFound, +} + impl Tenant { /// Yet another helper for timeline initialization. /// @@ -1393,6 +1420,36 @@ impl Tenant { Ok(tl) } + /// Helper for unit tests to create a timeline with some pre-loaded states. + #[cfg(test)] + #[allow(clippy::too_many_arguments)] + pub async fn create_test_timeline_with_layers( + &self, + new_timeline_id: TimelineId, + initdb_lsn: Lsn, + pg_version: u32, + ctx: &RequestContext, + delta_layer_desc: Vec>, + image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>, + end_lsn: Lsn, + ) -> anyhow::Result> { + let tline = self + .create_test_timeline(new_timeline_id, initdb_lsn, pg_version, ctx) + .await?; + tline.force_advance_lsn(end_lsn); + for deltas in delta_layer_desc { + tline + .force_create_delta_layer(deltas, Some(initdb_lsn), ctx) + .await?; + } + for (lsn, images) in image_layer_desc { + tline + .force_create_image_layer(lsn, images, Some(initdb_lsn), ctx) + .await?; + } + Ok(tline) + } + /// Create a new timeline. 
/// /// Returns the new timeline ID and reference to its Timeline object. @@ -1507,7 +1564,7 @@ impl Tenant { .wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx) .await .map_err(|e| match e { - e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => { + e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState { .. }) => { CreateTimelineError::AncestorLsn(anyhow::anyhow!(e)) } WaitLsnError::Shutdown => CreateTimelineError::ShuttingDown, @@ -1575,24 +1632,23 @@ impl Tenant { /// GC cutoff point is determined conservatively by either `horizon` and `pitr`, whichever /// requires more history to be retained. // - pub async fn gc_iteration( + pub(crate) async fn gc_iteration( &self, target_timeline_id: Option, horizon: u64, pitr: Duration, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { // Don't start doing work during shutdown if let TenantState::Stopping { .. } = self.current_state() { return Ok(GcResult::default()); } // there is a global allowed_error for this - anyhow::ensure!( - self.is_active(), - "Cannot run GC iteration on inactive tenant" - ); + if !self.is_active() { + return Err(GcError::NotActive); + } { let conf = self.tenant_conf.load(); @@ -2760,28 +2816,13 @@ impl Tenant { pitr: Duration, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { let mut totals: GcResult = Default::default(); let now = Instant::now(); - let gc_timelines = match self + let gc_timelines = self .refresh_gc_info_internal(target_timeline_id, horizon, pitr, cancel, ctx) - .await - { - Ok(result) => result, - Err(e) => { - if let Some(PageReconstructError::Cancelled) = - e.downcast_ref::() - { - // Handle cancellation - totals.elapsed = now.elapsed(); - return Ok(totals); - } else { - // Propagate other errors - return Err(e); - } - } - }; + .await?; failpoint_support::sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines"); @@ -2806,7 +2847,19 @@ impl Tenant { // made. 
break; } - let result = timeline.gc().await?; + let result = match timeline.gc().await { + Err(GcError::TimelineCancelled) => { + if target_timeline_id.is_some() { + // If we were targetting this specific timeline, surface cancellation to caller + return Err(GcError::TimelineCancelled); + } else { + // A timeline may be shutting down independently of the tenant's lifecycle: we should + // skip past this and proceed to try GC on other timelines. + continue; + } + } + r => r?, + }; totals += result; } @@ -2819,11 +2872,11 @@ impl Tenant { /// [`Tenant::get_gc_horizon`]. /// /// This is usually executed as part of periodic gc, but can now be triggered more often. - pub async fn refresh_gc_info( + pub(crate) async fn refresh_gc_info( &self, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result>> { + ) -> Result>, GcError> { // since this method can now be called at different rates than the configured gc loop, it // might be that these configuration values get applied faster than what it was previously, // since these were only read from the gc task. @@ -2844,7 +2897,7 @@ impl Tenant { pitr: Duration, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result>> { + ) -> Result>, GcError> { // before taking the gc_cs lock, do the heavier weight finding of gc_cutoff points for // currently visible timelines. let timelines = self @@ -2881,8 +2934,8 @@ impl Tenant { } } - if !self.is_active() { - anyhow::bail!("shutting down"); + if !self.is_active() || self.cancel.is_cancelled() { + return Err(GcError::TenantCancelled); } // grab mutex to prevent new timelines from being created here; avoid doing long operations @@ -2891,19 +2944,19 @@ impl Tenant { // Scan all timelines. For each timeline, remember the timeline ID and // the branch point where it was created. 
- let (all_branchpoints, timeline_ids): (BTreeSet<(TimelineId, Lsn)>, _) = { + let (all_branchpoints, timelines): (BTreeSet<(TimelineId, Lsn)>, _) = { let timelines = self.timelines.lock().unwrap(); let mut all_branchpoints = BTreeSet::new(); - let timeline_ids = { + let timelines = { if let Some(target_timeline_id) = target_timeline_id.as_ref() { if timelines.get(target_timeline_id).is_none() { - bail!("gc target timeline does not exist") + return Err(GcError::TimelineNotFound); } }; timelines .iter() - .map(|(timeline_id, timeline_entry)| { + .map(|(_timeline_id, timeline_entry)| { if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() { @@ -2925,33 +2978,28 @@ impl Tenant { } } - *timeline_id + timeline_entry.clone() }) .collect::>() }; - (all_branchpoints, timeline_ids) + (all_branchpoints, timelines) }; // Ok, we now know all the branch points. // Update the GC information for each timeline. - let mut gc_timelines = Vec::with_capacity(timeline_ids.len()); - for timeline_id in timeline_ids { - // Timeline is known to be local and loaded. 
- let timeline = self - .get_timeline(timeline_id, false) - .with_context(|| format!("Timeline {timeline_id} was not found"))?; - + let mut gc_timelines = Vec::with_capacity(timelines.len()); + for timeline in timelines { // If target_timeline is specified, ignore all other timelines if let Some(target_timeline_id) = target_timeline_id { - if timeline_id != target_timeline_id { + if timeline.timeline_id != target_timeline_id { continue; } } let branchpoints: Vec = all_branchpoints .range(( - Included((timeline_id, Lsn(0))), - Included((timeline_id, Lsn(u64::MAX))), + Included((timeline.timeline_id, Lsn(0))), + Included((timeline.timeline_id, Lsn(u64::MAX))), )) .map(|&x| x.1) .collect(); @@ -2959,7 +3007,7 @@ impl Tenant { { let mut target = timeline.gc_info.write().unwrap(); - match gc_cutoffs.remove(&timeline_id) { + match gc_cutoffs.remove(&timeline.timeline_id) { Some(cutoffs) => { *target = GcInfo { retain_lsns: branchpoints, @@ -2992,17 +3040,53 @@ impl Tenant { &self, src_timeline: &Arc, dst_id: TimelineId, - start_lsn: Option, + ancestor_lsn: Option, ctx: &RequestContext, ) -> Result, CreateTimelineError> { let create_guard = self.create_timeline_create_guard(dst_id).unwrap(); let tl = self - .branch_timeline_impl(src_timeline, dst_id, start_lsn, create_guard, ctx) + .branch_timeline_impl(src_timeline, dst_id, ancestor_lsn, create_guard, ctx) .await?; tl.set_state(TimelineState::Active); Ok(tl) } + /// Helper for unit tests to branch a timeline with some pre-loaded states. 
+ #[cfg(test)] + #[allow(clippy::too_many_arguments)] + pub async fn branch_timeline_test_with_layers( + &self, + src_timeline: &Arc, + dst_id: TimelineId, + ancestor_lsn: Option, + ctx: &RequestContext, + delta_layer_desc: Vec>, + image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>, + end_lsn: Lsn, + ) -> anyhow::Result> { + let tline = self + .branch_timeline_test(src_timeline, dst_id, ancestor_lsn, ctx) + .await?; + let ancestor_lsn = if let Some(ancestor_lsn) = ancestor_lsn { + ancestor_lsn + } else { + tline.get_last_record_lsn() + }; + assert!(end_lsn >= ancestor_lsn); + tline.force_advance_lsn(end_lsn); + for deltas in delta_layer_desc { + tline + .force_create_delta_layer(deltas, Some(ancestor_lsn), ctx) + .await?; + } + for (lsn, images) in image_layer_desc { + tline + .force_create_image_layer(lsn, images, Some(ancestor_lsn), ctx) + .await?; + } + Ok(tline) + } + /// Branch an existing timeline. /// /// The caller is responsible for activating the returned timeline. @@ -3781,6 +3865,9 @@ pub(crate) mod harness { pub fn create_custom( test_name: &'static str, tenant_conf: TenantConf, + tenant_id: TenantId, + shard_identity: ShardIdentity, + generation: Generation, ) -> anyhow::Result { setup_logging(); @@ -3793,8 +3880,12 @@ pub(crate) mod harness { // OK in a test. 
let conf: &'static PageServerConf = Box::leak(Box::new(conf)); - let tenant_id = TenantId::generate(); - let tenant_shard_id = TenantShardId::unsharded(tenant_id); + let shard = shard_identity.shard_index(); + let tenant_shard_id = TenantShardId { + tenant_id, + shard_number: shard.shard_number, + shard_count: shard.shard_count, + }; fs::create_dir_all(conf.tenant_path(&tenant_shard_id))?; fs::create_dir_all(conf.timelines_path(&tenant_shard_id))?; @@ -3812,8 +3903,8 @@ pub(crate) mod harness { conf, tenant_conf, tenant_shard_id, - generation: Generation::new(0xdeadbeef), - shard: ShardIndex::unsharded(), + generation, + shard, remote_storage, remote_fs_dir, deletion_queue, @@ -3828,8 +3919,15 @@ pub(crate) mod harness { compaction_period: Duration::ZERO, ..TenantConf::default() }; - - Self::create_custom(test_name, tenant_conf) + let tenant_id = TenantId::generate(); + let shard = ShardIdentity::unsharded(); + Self::create_custom( + test_name, + tenant_conf, + tenant_id, + shard, + Generation::new(0xdeadbeef), + ) } pub fn span(&self) -> tracing::Span { @@ -3908,8 +4006,8 @@ pub(crate) mod harness { let base_img = base_img.expect("Neon WAL redo requires base image").1; let mut page = BytesMut::new(); page.extend_from_slice(&base_img); - for (_record_lsn, record) in records { - apply_neon::apply_in_neon(&record, key, &mut page)?; + for (record_lsn, record) in records { + apply_neon::apply_in_neon(&record, record_lsn, key, &mut page)?; } Ok(page.freeze()) } else { @@ -3953,6 +4051,7 @@ mod tests { use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; use utils::bin_ser::BeSer; + use utils::id::TenantId; static TEST_KEY: Lazy = Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001"))); @@ -4154,7 +4253,7 @@ mod tests { .await?; writer.finish_write(lsn); } - tline.freeze_and_flush().await + tline.freeze_and_flush().await.map_err(|e| e.into()) } #[tokio::test] @@ -4308,9 +4407,10 @@ mod tests { // 
This needs to traverse to the parent, and fails. let err = newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await.unwrap_err(); - assert!(err - .to_string() - .contains("will not become active. Current state: Broken")); + assert!(err.to_string().starts_with(&format!( + "Bad state on timeline {}: Broken", + tline.timeline_id + ))); Ok(()) } @@ -4851,7 +4951,13 @@ mod tests { ..TenantConf::default() }; - let harness = TenantHarness::create_custom("test_get_vectored_key_gap", tenant_conf)?; + let harness = TenantHarness::create_custom( + "test_get_vectored_key_gap", + tenant_conf, + TenantId::generate(), + ShardIdentity::unsharded(), + Generation::new(0xdeadbeef), + )?; let (tenant, ctx) = harness.load().await; let mut current_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); @@ -6205,75 +6311,36 @@ mod tests { async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> { let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?; let (tenant, ctx) = harness.load().await; - let tline = tenant - .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) - .await?; - - let cancel = CancellationToken::new(); let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); let base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap(); let base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap(); - let mut lsn = Lsn(0x20); - - { - let mut writer = tline.writer().await; - writer - .put(base_key, lsn, &Value::Image(test_img("data key 1")), &ctx) - .await?; - writer.finish_write(lsn); - drop(writer); - - tline.freeze_and_flush().await?; // this will create a image layer - } + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + Vec::new(), // delta layers + vec![(Lsn(0x20), vec![(base_key, test_img("data key 1"))])], // image layers + Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 
to get below because `get_vectored_impl` does not wait for LSN + ) + .await?; let child = tenant - .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx) + .branch_timeline_test_with_layers( + &tline, + NEW_TIMELINE_ID, + Some(Lsn(0x20)), + &ctx, + Vec::new(), // delta layers + vec![(Lsn(0x30), vec![(base_key_child, test_img("data key 2"))])], // image layers + Lsn(0x30), + ) .await .unwrap(); - lsn.0 += 0x10; - - { - let mut writer = child.writer().await; - writer - .put( - base_key_child, - lsn, - &Value::Image(test_img("data key 2")), - &ctx, - ) - .await?; - writer.finish_write(lsn); - drop(writer); - - child.freeze_and_flush().await?; // this will create a delta - - { - // update the partitioning to include the test key space, otherwise they - // will be dropped by image layer creation - let mut guard = child.partitioning.lock().await; - let ((partitioning, _), partition_lsn) = &mut *guard; - partitioning - .parts - .push(KeySpace::single(base_key..base_key_nonexist)); // exclude the nonexist key - *partition_lsn = lsn; - } - - child - .compact( - &cancel, - { - let mut set = EnumSet::empty(); - set.insert(CompactFlags::ForceImageLayerCreation); - set - }, - &ctx, - ) - .await?; // force create an image layer for the keys, TODO: check if the image layer is created - } - async fn get_vectored_impl_wrapper( tline: &Arc, key: Key, @@ -6295,6 +6362,8 @@ mod tests { })) } + let lsn = Lsn(0x30); + // test vectored get on parent timeline assert_eq!( get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?, @@ -6332,94 +6401,42 @@ mod tests { #[tokio::test] async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?; + let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?; let (tenant, ctx) = harness.load().await; + + let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + let base_key_child = 
Key::from_hex("620000000033333333444444445500000001").unwrap(); + let base_key_nonexist = Key::from_hex("620000000033333333444444445500000002").unwrap(); + assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix... + let tline = tenant - .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + Vec::new(), // delta layers + vec![(Lsn(0x20), vec![(base_key, test_img("metadata key 1"))])], // image layers + Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN + ) .await?; - let cancel = CancellationToken::new(); - - let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); - let mut base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap(); - let mut base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap(); - base_key.field1 = AUX_KEY_PREFIX; - base_key_child.field1 = AUX_KEY_PREFIX; - base_key_nonexist.field1 = AUX_KEY_PREFIX; - - let mut lsn = Lsn(0x20); - - { - let mut writer = tline.writer().await; - writer - .put( - base_key, - lsn, - &Value::Image(test_img("metadata key 1")), - &ctx, - ) - .await?; - writer.finish_write(lsn); - drop(writer); - - tline.freeze_and_flush().await?; // this will create an image layer - - tline - .compact( - &cancel, - { - let mut set = EnumSet::empty(); - set.insert(CompactFlags::ForceImageLayerCreation); - set.insert(CompactFlags::ForceRepartition); - set - }, - &ctx, - ) - .await?; // force create an image layer for metadata keys - tenant - .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) - .await?; - } - let child = tenant - .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx) + .branch_timeline_test_with_layers( + &tline, + NEW_TIMELINE_ID, + Some(Lsn(0x20)), + &ctx, + Vec::new(), // delta layers + 
vec![( + Lsn(0x30), + vec![(base_key_child, test_img("metadata key 2"))], + )], // image layers + Lsn(0x30), + ) .await .unwrap(); - lsn.0 += 0x10; - - { - let mut writer = child.writer().await; - writer - .put( - base_key_child, - lsn, - &Value::Image(test_img("metadata key 2")), - &ctx, - ) - .await?; - writer.finish_write(lsn); - drop(writer); - - child.freeze_and_flush().await?; - - child - .compact( - &cancel, - { - let mut set = EnumSet::empty(); - set.insert(CompactFlags::ForceImageLayerCreation); - set.insert(CompactFlags::ForceRepartition); - set - }, - &ctx, - ) - .await?; // force create an image layer for metadata keys - tenant - .gc_iteration(Some(child.timeline_id), 0, Duration::ZERO, &cancel, &ctx) - .await?; - } - async fn get_vectored_impl_wrapper( tline: &Arc, key: Key, @@ -6441,6 +6458,8 @@ mod tests { })) } + let lsn = Lsn(0x30); + // test vectored get on parent timeline assert_eq!( get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?, @@ -6471,4 +6490,208 @@ mod tests { Ok(()) } + + async fn get_vectored_impl_wrapper( + tline: &Arc, + key: Key, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result, GetVectoredError> { + let mut reconstruct_state = ValuesReconstructState::new(); + let mut res = tline + .get_vectored_impl( + KeySpace::single(key..key.next()), + lsn, + &mut reconstruct_state, + ctx, + ) + .await?; + Ok(res.pop_last().map(|(k, v)| { + assert_eq!(k, key); + v.unwrap() + })) + } + + #[tokio::test] + async fn test_metadata_tombstone_reads() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_metadata_tombstone_reads")?; + let (tenant, ctx) = harness.load().await; + let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap(); + let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); + let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap(); + let key3 = Key::from_hex("620000000033333333444444445500000003").unwrap(); + + // We emulate the situation that the 
compaction algorithm creates an image layer that removes the tombstones + // Lsn 0x30 key0, key3, no key1+key2 + // Lsn 0x20 key1+key2 tomestones + // Lsn 0x10 key1 in image, key2 in delta + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + // delta layers + vec![ + vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], + vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], + vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + ], + // image layers + vec![ + (Lsn(0x10), vec![(key1, test_img("metadata key 1"))]), + ( + Lsn(0x30), + vec![ + (key0, test_img("metadata key 0")), + (key3, test_img("metadata key 3")), + ], + ), + ], + Lsn(0x30), + ) + .await?; + + let lsn = Lsn(0x30); + let old_lsn = Lsn(0x20); + + assert_eq!( + get_vectored_impl_wrapper(&tline, key0, lsn, &ctx).await?, + Some(test_img("metadata key 0")) + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, key1, lsn, &ctx).await?, + None, + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, key2, lsn, &ctx).await?, + None, + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, key1, old_lsn, &ctx).await?, + Some(Bytes::new()), + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, key2, old_lsn, &ctx).await?, + Some(Bytes::new()), + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, key3, lsn, &ctx).await?, + Some(test_img("metadata key 3")) + ); + + Ok(()) + } + + #[tokio::test] + async fn test_metadata_tombstone_image_creation() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_metadata_tombstone_image_creation")?; + let (tenant, ctx) = harness.load().await; + + let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap(); + let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); + let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap(); + let key3 = Key::from_hex("620000000033333333444444445500000003").unwrap(); + + let tline = tenant + 
.create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + // delta layers + vec![ + vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], + vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], + vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + vec![ + (key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))), + (key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))), + ], + ], + // image layers + vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])], + Lsn(0x30), + ) + .await?; + + let cancel = CancellationToken::new(); + + tline + .compact( + &cancel, + { + let mut flags = EnumSet::new(); + flags.insert(CompactFlags::ForceImageLayerCreation); + flags.insert(CompactFlags::ForceRepartition); + flags + }, + &ctx, + ) + .await?; + + // Image layers are created at last_record_lsn + let images = tline + .inspect_image_layers(Lsn(0x30), &ctx) + .await? + .into_iter() + .filter(|(k, _)| k.is_metadata_key()) + .collect::>(); + assert_eq!(images.len(), 2); // the image layer should only contain two existing keys, tombstones should be removed. 
+ + Ok(()) + } + + #[tokio::test] + async fn test_metadata_tombstone_empty_image_creation() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_metadata_tombstone_image_creation")?; + let (tenant, ctx) = harness.load().await; + + let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); + let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap(); + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + // delta layers + vec![ + vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], + vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], + vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + ], + // image layers + vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])], + Lsn(0x30), + ) + .await?; + + let cancel = CancellationToken::new(); + + tline + .compact( + &cancel, + { + let mut flags = EnumSet::new(); + flags.insert(CompactFlags::ForceImageLayerCreation); + flags.insert(CompactFlags::ForceRepartition); + flags + }, + &ctx, + ) + .await?; + + // Image layers are created at last_record_lsn + let images = tline + .inspect_image_layers(Lsn(0x30), &ctx) + .await? 
+ .into_iter() + .filter(|(k, _)| k.is_metadata_key()) + .collect::>(); + assert_eq!(images.len(), 0); // the image layer should not contain tombstones, or it is not created + + Ok(()) + } } diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index 7c6640eaac..8b36aa15e5 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -16,6 +16,7 @@ use crate::{ task_mgr::{self, TaskKind}, tenant::{ mgr::{TenantSlot, TenantsMapRemoveResult}, + remote_timeline_client::remote_heatmap_path, timeline::ShutdownMode, }, }; @@ -531,6 +532,25 @@ impl DeleteTenantFlow { } } + // Remove top-level tenant objects that don't belong to a timeline, such as heatmap + let heatmap_path = remote_heatmap_path(&tenant.tenant_shard_id()); + if let Some(Err(e)) = backoff::retry( + || async { + remote_storage + .delete(&heatmap_path, &task_mgr::shutdown_token()) + .await + }, + TimeoutOrCancel::caused_by_cancel, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "remove_remote_tenant_heatmap", + &task_mgr::shutdown_token(), + ) + .await + { + tracing::warn!("Failed to delete heatmap at {heatmap_path}: {e}"); + } + let timelines_path = conf.timelines_path(&tenant.tenant_shard_id); // May not exist if we fail in cleanup_remaining_fs_traces after removing it if timelines_path.exists() { diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index fc71ea7642..c00672895a 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -267,7 +267,7 @@ impl<'de> Deserialize<'de> for TimelineMetadata { D: serde::Deserializer<'de>, { let bytes = Vec::::deserialize(deserializer)?; - Self::from_bytes(bytes.as_slice()).map_err(|e| D::Error::custom(format!("{e}"))) + Self::from_bytes(bytes.as_slice()).map_err(D::Error::custom) } } @@ -276,13 +276,163 @@ impl Serialize for TimelineMetadata { where S: Serializer, { - let bytes = self - .to_bytes() - .map_err(|e| 
serde::ser::Error::custom(format!("{e}")))?; + let bytes = self.to_bytes().map_err(serde::ser::Error::custom)?; bytes.serialize(serializer) } } +pub(crate) mod modern_serde { + use crate::tenant::metadata::METADATA_FORMAT_VERSION; + + use super::{ + TimelineMetadata, TimelineMetadataBodyV2, TimelineMetadataHeader, METADATA_HDR_SIZE, + }; + use serde::{Deserialize, Serialize}; + + pub(crate) fn deserialize<'de, D>(deserializer: D) -> Result + where + D: serde::de::Deserializer<'de>, + { + // for legacy reasons versions 1-5 had TimelineMetadata serialized as a Vec field with + // BeSer. + struct Visitor; + + impl<'d> serde::de::Visitor<'d> for Visitor { + type Value = TimelineMetadata; + + fn expecting(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + f.write_str("BeSer bytes or json structure") + } + + fn visit_seq(self, seq: A) -> Result + where + A: serde::de::SeqAccess<'d>, + { + use serde::de::Error; + let de = serde::de::value::SeqAccessDeserializer::new(seq); + Vec::::deserialize(de) + .map(|v| TimelineMetadata::from_bytes(&v).map_err(A::Error::custom))? 
+ } + + fn visit_map(self, map: A) -> Result + where + A: serde::de::MapAccess<'d>, + { + use serde::de::Error; + + let de = serde::de::value::MapAccessDeserializer::new(map); + let body = TimelineMetadataBodyV2::deserialize(de)?; + + // jump through hoops to calculate the crc32 so that TimelineMetadata::ne works + // across serialization versions + let mut sink = Crc32Sink::default(); + ::ser_into(&body, &mut sink) + .map_err(|e| A::Error::custom(Crc32CalculationFailed(e)))?; + + let size = METADATA_HDR_SIZE + sink.count; + + Ok(TimelineMetadata { + hdr: TimelineMetadataHeader { + checksum: sink.crc, + size: size as u16, + format_version: METADATA_FORMAT_VERSION, + }, + body, + }) + } + } + + deserializer.deserialize_any(Visitor) + } + + #[derive(Default)] + struct Crc32Sink { + crc: u32, + count: usize, + } + + impl std::io::Write for Crc32Sink { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + self.crc = crc32c::crc32c_append(self.crc, buf); + self.count += buf.len(); + Ok(buf.len()) + } + + fn flush(&mut self) -> std::io::Result<()> { + Ok(()) + } + } + + #[derive(thiserror::Error)] + #[error("re-serializing for crc32 failed")] + struct Crc32CalculationFailed(#[source] E); + + // this should be true for one release, after that we can change it to false + // remember to check the IndexPart::metadata field TODO comment as well + const LEGACY_BINCODED_BYTES: bool = true; + + #[derive(serde::Serialize)] + #[serde(transparent)] + struct LegacyPaddedBytes<'a>(&'a TimelineMetadata); + + struct JustTheBodyV2<'a>(&'a TimelineMetadata); + + impl serde::Serialize for JustTheBodyV2<'_> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + // header is not needed, upon reading we've upgraded all v1 to v2 + self.0.body.serialize(serializer) + } + } + + pub(crate) fn serialize( + metadata: &TimelineMetadata, + serializer: S, + ) -> Result + where + S: serde::Serializer, + { + // we cannot use TimelineMetadata::serialize for now because 
it'll do + // TimelineMetadata::to_bytes + if LEGACY_BINCODED_BYTES { + LegacyPaddedBytes(metadata).serialize(serializer) + } else { + JustTheBodyV2(metadata).serialize(serializer) + } + } + + #[test] + fn deserializes_bytes_as_well_as_equivalent_body_v2() { + #[derive(serde::Deserialize, serde::Serialize)] + struct Wrapper(#[serde(deserialize_with = "deserialize")] TimelineMetadata); + + let too_many_bytes = "[216,111,252,208,0,54,0,4,0,0,0,0,1,73,253,144,1,0,0,0,0,1,73,253,24,0,0,0,0,0,0,0,0,0,0,0,0,0,1,73,253,24,0,0,0,0,1,73,253,24,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]"; + + let wrapper_from_bytes = serde_json::from_str::(too_many_bytes).unwrap(); + + let serialized = serde_json::to_value(JustTheBodyV2(&wrapper_from_bytes.0)).unwrap(); + + assert_eq!( + serialized, + serde_json::json! 
{{ + "disk_consistent_lsn": "0/149FD90", + "prev_record_lsn": "0/149FD18", + "ancestor_timeline": null, + "ancestor_lsn": "0/0", + "latest_gc_cutoff_lsn": "0/149FD18", + "initdb_lsn": "0/149FD18", + "pg_version": 15 + }} + ); + + let wrapper_from_json = serde_json::value::from_value::(serialized).unwrap(); + + assert_eq!(wrapper_from_bytes.0, wrapper_from_json.0); + } +} + /// Parts of the metadata which are regularly modified. pub(crate) struct MetadataUpdate { disk_consistent_lsn: Lsn, diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 89fdf31849..4520bb9295 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -3,6 +3,7 @@ use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf}; use futures::StreamExt; +use hyper::StatusCode; use itertools::Itertools; use pageserver_api::key::Key; use pageserver_api::models::LocationConfigMode; @@ -45,7 +46,7 @@ use crate::tenant::delete::DeleteTenantFlow; use crate::tenant::span::debug_assert_current_span_has_tenant_id; use crate::tenant::storage_layer::inmemory_layer; use crate::tenant::timeline::ShutdownMode; -use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState}; +use crate::tenant::{AttachedTenantConf, GcError, SpawnMode, Tenant, TenantState}; use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX}; use utils::crashsafe::path_with_suffix_extension; @@ -54,6 +55,7 @@ use utils::generation::Generation; use utils::id::{TenantId, TimelineId}; use super::delete::DeleteTenantError; +use super::remote_timeline_client::remote_tenant_path; use super::secondary::SecondaryTenant; use super::timeline::detach_ancestor::PreparedTimelineDetach; use super::TenantSharedResources; @@ -1369,7 +1371,7 @@ impl TenantManager { &self, tenant_shard_id: TenantShardId, activation_timeout: Duration, - ) -> Result<(), DeleteTenantError> { + ) -> Result { super::span::debug_assert_current_span_has_tenant_id(); // We acquire a SlotGuard during this function to protect 
against concurrent // changes while the ::prepare phase of DeleteTenantFlow executes, but then @@ -1382,18 +1384,79 @@ impl TenantManager { // // See https://github.com/neondatabase/neon/issues/5080 - let slot_guard = - tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?; + // Tenant deletion can happen two ways: + // - Legacy: called on an attached location. The attached Tenant object stays alive in Stopping + // state until deletion is complete. + // - New: called on a pageserver without an attached location. We proceed with deletion from + // remote storage. + // + // See https://github.com/neondatabase/neon/issues/5080 for more context on this transition. - // unwrap is safe because we used MustExist mode when acquiring - let tenant = match slot_guard.get_old_value().as_ref().unwrap() { - TenantSlot::Attached(tenant) => tenant.clone(), - _ => { - // Express "not attached" as equivalent to "not found" - return Err(DeleteTenantError::NotAttached); + let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; + match &slot_guard.old_value { + Some(TenantSlot::Attached(tenant)) => { + // Legacy deletion flow: the tenant remains attached, goes to Stopping state, and + // deletion will be resumed across restarts. 
+ let tenant = tenant.clone(); + return self + .delete_tenant_attached(slot_guard, tenant, activation_timeout) + .await; } + Some(TenantSlot::Secondary(secondary_tenant)) => { + secondary_tenant.shutdown().await; + let local_tenant_directory = self.conf.tenant_path(&tenant_shard_id); + let tmp_dir = safe_rename_tenant_dir(&local_tenant_directory) + .await + .with_context(|| { + format!("local tenant directory {local_tenant_directory:?} rename") + })?; + spawn_background_purge(tmp_dir); + } + Some(TenantSlot::InProgress(_)) => unreachable!(), + None => {} }; + // Fall through: local state for this tenant is no longer present, proceed with remote delete + let remote_path = remote_tenant_path(&tenant_shard_id); + let keys = match self + .resources + .remote_storage + .list( + Some(&remote_path), + remote_storage::ListingMode::NoDelimiter, + None, + &self.cancel, + ) + .await + { + Ok(listing) => listing.keys, + Err(remote_storage::DownloadError::Cancelled) => { + return Err(DeleteTenantError::Cancelled) + } + Err(remote_storage::DownloadError::NotFound) => return Ok(StatusCode::NOT_FOUND), + Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))), + }; + + if keys.is_empty() { + tracing::info!("Remote storage already deleted"); + } else { + tracing::info!("Deleting {} keys from remote storage", keys.len()); + self.resources + .remote_storage + .delete_objects(&keys, &self.cancel) + .await?; + } + + // Callers use 404 as success for deletions, for historical reasons. + Ok(StatusCode::NOT_FOUND) + } + + async fn delete_tenant_attached( + &self, + slot_guard: SlotGuard, + tenant: Arc, + activation_timeout: Duration, + ) -> Result { match tenant.current_state() { TenantState::Broken { .. } | TenantState::Stopping { .. 
} => { // If deletion is already in progress, return success (the semantics of this @@ -1403,7 +1466,7 @@ impl TenantManager { // The `delete_progress` lock is held: deletion is already happening // in the bacckground slot_guard.revert(); - return Ok(()); + return Ok(StatusCode::ACCEPTED); } } _ => { @@ -1436,7 +1499,8 @@ impl TenantManager { // The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFLow slot_guard.revert(); - result + let () = result?; + Ok(StatusCode::ACCEPTED) } #[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))] @@ -2833,7 +2897,13 @@ pub(crate) async fn immediate_gc( } } - result.map_err(ApiError::InternalServerError) + result.map_err(|e| match e { + GcError::TenantCancelled | GcError::TimelineCancelled => ApiError::ShuttingDown, + GcError::TimelineNotFound => { + ApiError::NotFound(anyhow::anyhow!("Timeline not found").into()) + } + other => ApiError::InternalServerError(anyhow::anyhow!(other)), + }) } #[cfg(test)] diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 73438a790f..e33e4b84aa 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -91,8 +91,7 @@ //! //! The *actual* remote state lags behind the *desired* remote state while //! there are in-flight operations. -//! We keep track of the desired remote state in -//! [`UploadQueueInitialized::latest_files`] and [`UploadQueueInitialized::latest_metadata`]. +//! We keep track of the desired remote state in [`UploadQueueInitialized::dirty`]. //! It is initialized based on the [`IndexPart`] that was passed during init //! and updated with every `schedule_*` function call. //! All this is necessary necessary to compute the future [`IndexPart`]s @@ -115,8 +114,7 @@ //! //! # Completion //! -//! 
Once an operation has completed, we update -//! [`UploadQueueInitialized::projected_remote_consistent_lsn`] immediately, +//! Once an operation has completed, we update [`UploadQueueInitialized::clean`] immediately, //! and submit a request through the DeletionQueue to update //! [`UploadQueueInitialized::visible_remote_consistent_lsn`] after it has //! validated that our generation is not stale. It is this visible value @@ -416,6 +414,7 @@ impl RemoteTimelineClient { Ok(()) } + /// Returns `None` if nothing is yet uplodaded, `Some(disk_consistent_lsn)` otherwise. pub fn remote_consistent_lsn_projected(&self) -> Option { match &mut *self.upload_queue.lock().unwrap() { UploadQueue::Uninitialized => None, @@ -442,13 +441,11 @@ impl RemoteTimelineClient { /// Returns true if this timeline was previously detached at this Lsn and the remote timeline /// client is currently initialized. pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool { - // technically this is a dirty read, but given how timeline detach ancestor is implemented - // via tenant restart, the lineage has always been uploaded. self.upload_queue .lock() .unwrap() .initialized_mut() - .map(|uq| uq.latest_lineage.is_previous_ancestor_lsn(lsn)) + .map(|uq| uq.clean.0.lineage.is_previous_ancestor_lsn(lsn)) .unwrap_or(false) } @@ -457,7 +454,6 @@ impl RemoteTimelineClient { current_remote_index_part .layer_metadata .values() - // If we don't have the file size for the layer, don't account for it in the metric. .map(|ilmd| ilmd.file_size) .sum() } else { @@ -585,9 +581,9 @@ impl RemoteTimelineClient { // As documented in the struct definition, it's ok for latest_metadata to be // ahead of what's _actually_ on the remote during index upload. 
- upload_queue.latest_metadata = metadata.clone(); + upload_queue.dirty.metadata = metadata.clone(); - self.schedule_index_upload(upload_queue); + self.schedule_index_upload(upload_queue)?; Ok(()) } @@ -606,9 +602,9 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - upload_queue.latest_metadata.apply(update); + upload_queue.dirty.metadata.apply(update); - self.schedule_index_upload(upload_queue); + self.schedule_index_upload(upload_queue)?; Ok(()) } @@ -620,8 +616,8 @@ impl RemoteTimelineClient { ) -> anyhow::Result<()> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - upload_queue.last_aux_file_policy = last_aux_file_policy; - self.schedule_index_upload(upload_queue); + upload_queue.dirty.last_aux_file_policy = last_aux_file_policy; + self.schedule_index_upload(upload_queue)?; Ok(()) } /// @@ -639,30 +635,44 @@ impl RemoteTimelineClient { let upload_queue = guard.initialized_mut()?; if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 { - self.schedule_index_upload(upload_queue); + self.schedule_index_upload(upload_queue)?; } Ok(()) } /// Launch an index-file upload operation in the background (internal function) - fn schedule_index_upload(self: &Arc, upload_queue: &mut UploadQueueInitialized) { - let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn(); + fn schedule_index_upload( + self: &Arc, + upload_queue: &mut UploadQueueInitialized, + ) -> anyhow::Result<()> { + let disk_consistent_lsn = upload_queue.dirty.metadata.disk_consistent_lsn(); + // fix up the duplicated field + upload_queue.dirty.disk_consistent_lsn = disk_consistent_lsn; + + // make sure it serializes before doing it in perform_upload_task so that it doesn't + // look like a retryable error + let void = std::io::sink(); + serde_json::to_writer(void, &upload_queue.dirty).context("serialize index_part.json")?; + + let index_part = 
&upload_queue.dirty; info!( "scheduling metadata upload up to consistent LSN {disk_consistent_lsn} with {} files ({} changed)", - upload_queue.latest_files.len(), + index_part.layer_metadata.len(), upload_queue.latest_files_changes_since_metadata_upload_scheduled, ); - let index_part = IndexPart::from(&*upload_queue); - let op = UploadOp::UploadMetadata(Box::new(index_part), disk_consistent_lsn); + let op = UploadOp::UploadMetadata { + uploaded: Box::new(index_part.clone()), + }; self.metric_begin(&op); upload_queue.queued_operations.push_back(op); upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0; // Launch the task immediately, if possible self.launch_queued_tasks(upload_queue); + Ok(()) } pub(crate) async fn schedule_reparenting_and_wait( @@ -675,16 +685,16 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - let Some(prev) = upload_queue.latest_metadata.ancestor_timeline() else { + let Some(prev) = upload_queue.dirty.metadata.ancestor_timeline() else { return Err(anyhow::anyhow!( "cannot reparent without a current ancestor" )); }; - upload_queue.latest_metadata.reparent(new_parent); - upload_queue.latest_lineage.record_previous_ancestor(&prev); + upload_queue.dirty.metadata.reparent(new_parent); + upload_queue.dirty.lineage.record_previous_ancestor(&prev); - self.schedule_index_upload(upload_queue); + self.schedule_index_upload(upload_queue)?; self.schedule_barrier0(upload_queue) }; @@ -705,16 +715,17 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - upload_queue.latest_metadata.detach_from_ancestor(&adopted); - upload_queue.latest_lineage.record_detaching(&adopted); + upload_queue.dirty.metadata.detach_from_ancestor(&adopted); + upload_queue.dirty.lineage.record_detaching(&adopted); for layer in layers { upload_queue - .latest_files + .dirty + .layer_metadata 
.insert(layer.layer_desc().layer_name(), layer.metadata()); } - self.schedule_index_upload(upload_queue); + self.schedule_index_upload(upload_queue)?; let barrier = self.schedule_barrier0(upload_queue); self.launch_queued_tasks(upload_queue); @@ -746,7 +757,8 @@ impl RemoteTimelineClient { let metadata = layer.metadata(); upload_queue - .latest_files + .dirty + .layer_metadata .insert(layer.layer_desc().layer_name(), metadata.clone()); upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; @@ -776,8 +788,8 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - let with_metadata = - self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned()); + let with_metadata = self + .schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned())?; self.schedule_deletion_of_unlinked0(upload_queue, with_metadata); @@ -801,7 +813,7 @@ impl RemoteTimelineClient { let names = gc_layers.iter().map(|x| x.layer_desc().layer_name()); - self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names); + self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names)?; self.launch_queued_tasks(upload_queue); @@ -814,7 +826,7 @@ impl RemoteTimelineClient { self: &Arc, upload_queue: &mut UploadQueueInitialized, names: I, - ) -> Vec<(LayerName, LayerFileMetadata)> + ) -> anyhow::Result> where I: IntoIterator, { @@ -824,7 +836,7 @@ impl RemoteTimelineClient { let with_metadata: Vec<_> = names .into_iter() .filter_map(|name| { - let meta = upload_queue.latest_files.remove(&name); + let meta = upload_queue.dirty.layer_metadata.remove(&name); if let Some(meta) = meta { upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; @@ -856,10 +868,10 @@ impl RemoteTimelineClient { // index_part update, because that needs to be uploaded before we can actually delete the // files. 
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 { - self.schedule_index_upload(upload_queue); + self.schedule_index_upload(upload_queue)?; } - with_metadata + Ok(with_metadata) } /// Schedules deletion for layer files which have previously been unlinked from the @@ -950,7 +962,7 @@ impl RemoteTimelineClient { let names = compacted_from.iter().map(|x| x.layer_desc().layer_name()); - self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names); + self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names)?; self.launch_queued_tasks(upload_queue); Ok(()) @@ -1085,7 +1097,7 @@ impl RemoteTimelineClient { let deleted_at = Utc::now().naive_utc(); stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at); - let mut index_part = IndexPart::from(&stopped.upload_queue_for_deletion); + let mut index_part = stopped.upload_queue_for_deletion.dirty.clone(); index_part.deleted_at = Some(deleted_at); index_part }; @@ -1296,7 +1308,8 @@ impl RemoteTimelineClient { stopped .upload_queue_for_deletion - .latest_files + .dirty + .layer_metadata .drain() .map(|(file_name, meta)| { remote_layer_path( @@ -1433,7 +1446,7 @@ impl RemoteTimelineClient { // Can always be scheduled. true } - UploadOp::UploadMetadata(_, _) => { + UploadOp::UploadMetadata { .. } => { // These can only be performed after all the preceding operations // have finished. upload_queue.inprogress_tasks.is_empty() @@ -1475,7 +1488,7 @@ impl RemoteTimelineClient { UploadOp::UploadLayer(_, _) => { upload_queue.num_inprogress_layer_uploads += 1; } - UploadOp::UploadMetadata(_, _) => { + UploadOp::UploadMetadata { .. 
} => { upload_queue.num_inprogress_metadata_uploads += 1; } UploadOp::Delete(_) => { @@ -1584,22 +1597,13 @@ impl RemoteTimelineClient { ) .await } - UploadOp::UploadMetadata(ref index_part, _lsn) => { - let mention_having_future_layers = if cfg!(feature = "testing") { - index_part - .layer_metadata - .keys() - .any(|x| x.is_in_future(*_lsn)) - } else { - false - }; - + UploadOp::UploadMetadata { ref uploaded } => { let res = upload::upload_index_part( &self.storage_impl, &self.tenant_shard_id, &self.timeline_id, self.generation, - index_part, + uploaded, &self.cancel, ) .measure_remote_op( @@ -1609,10 +1613,21 @@ impl RemoteTimelineClient { ) .await; if res.is_ok() { - self.update_remote_physical_size_gauge(Some(index_part)); + self.update_remote_physical_size_gauge(Some(uploaded)); + let mention_having_future_layers = if cfg!(feature = "testing") { + uploaded + .layer_metadata + .keys() + .any(|x| x.is_in_future(uploaded.metadata.disk_consistent_lsn())) + } else { + false + }; if mention_having_future_layers { // find rationale near crate::tenant::timeline::init::cleanup_future_layer - tracing::info!(disk_consistent_lsn=%_lsn, "uploaded an index_part.json with future layers -- this is ok! if shutdown now, expect future layer cleanup"); + tracing::info!( + disk_consistent_lsn = %uploaded.metadata.disk_consistent_lsn(), + "uploaded an index_part.json with future layers -- this is ok! if shutdown now, expect future layer cleanup" + ); } } res @@ -1713,11 +1728,23 @@ impl RemoteTimelineClient { upload_queue.num_inprogress_layer_uploads -= 1; None } - UploadOp::UploadMetadata(_, lsn) => { + UploadOp::UploadMetadata { ref uploaded } => { upload_queue.num_inprogress_metadata_uploads -= 1; - // XXX monotonicity check? - upload_queue.projected_remote_consistent_lsn = Some(lsn); + // the task id is reused as a monotonicity check for storing the "clean" + // IndexPart. 
+ let last_updater = upload_queue.clean.1; + let is_later = last_updater.is_some_and(|task_id| task_id < task.task_id); + let monotone = is_later || last_updater.is_none(); + + assert!(monotone, "no two index uploads should be completing at the same time, prev={last_updater:?}, task.task_id={}", task.task_id); + + // not taking ownership is wasteful + upload_queue.clean.0.clone_from(uploaded); + upload_queue.clean.1 = Some(task.task_id); + + let lsn = upload_queue.clean.0.metadata.disk_consistent_lsn(); + if self.generation.is_none() { // Legacy mode: skip validating generation upload_queue.visible_remote_consistent_lsn.store(lsn); @@ -1771,7 +1798,7 @@ impl RemoteTimelineClient { RemoteOpKind::Upload, RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size), ), - UploadOp::UploadMetadata(_, _) => ( + UploadOp::UploadMetadata { .. } => ( RemoteOpFileKind::Index, RemoteOpKind::Upload, DontTrackSize { @@ -1847,11 +1874,9 @@ impl RemoteTimelineClient { // Deletion is not really perf sensitive so there shouldnt be any problems with cloning a fraction of it. 
let upload_queue_for_deletion = UploadQueueInitialized { task_counter: 0, - latest_files: initialized.latest_files.clone(), + dirty: initialized.dirty.clone(), + clean: initialized.clean.clone(), latest_files_changes_since_metadata_upload_scheduled: 0, - latest_metadata: initialized.latest_metadata.clone(), - latest_lineage: initialized.latest_lineage.clone(), - projected_remote_consistent_lsn: None, visible_remote_consistent_lsn: initialized .visible_remote_consistent_lsn .clone(), @@ -1864,7 +1889,6 @@ impl RemoteTimelineClient { dangling_files: HashMap::default(), shutting_down: false, shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)), - last_aux_file_policy: initialized.last_aux_file_policy, }; let upload_queue = std::mem::replace( diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index bd75f980e8..d0385e4aee 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -28,6 +28,7 @@ use crate::TEMP_FILE_SUFFIX; use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath}; use utils::crashsafe::path_with_suffix_extension; use utils::id::{TenantId, TimelineId}; +use utils::pausable_failpoint; use super::index::{IndexPart, LayerFileMetadata}; use super::{ @@ -152,6 +153,8 @@ async fn download_object<'a>( let download = storage.download(src_path, cancel).await?; + pausable_failpoint!("before-downloading-layer-stream-pausable"); + let mut buf_writer = tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file); @@ -199,6 +202,8 @@ async fn download_object<'a>( let mut download = storage.download(src_path, cancel).await?; + pausable_failpoint!("before-downloading-layer-stream-pausable"); + // TODO: use vectored write (writev) once supported by tokio-epoll-uring. // There's chunks_vectored() on the stream. 
let (bytes_amount, destination_file) = async { diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index f5d939c747..7d2e9b9a91 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -11,7 +11,6 @@ use utils::id::TimelineId; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::storage_layer::LayerName; -use crate::tenant::upload_queue::UploadQueueInitialized; use crate::tenant::Generation; use pageserver_api::shard::ShardIndex; @@ -42,9 +41,13 @@ pub struct IndexPart { // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata. // It's duplicated for convenience when reading the serialized structure, but is // private because internally we would read from metadata instead. - disk_consistent_lsn: Lsn, + pub(super) disk_consistent_lsn: Lsn, - #[serde(rename = "metadata_bytes")] + // TODO: later make this "rename" to "alias", rename field as "legacy_metadata" + #[serde( + rename = "metadata_bytes", + with = "crate::tenant::metadata::modern_serde" + )] pub metadata: TimelineMetadata, #[serde(default)] @@ -80,23 +83,15 @@ impl IndexPart { pub const FILE_NAME: &'static str = "index_part.json"; - fn new( - layers_and_metadata: &HashMap, - disk_consistent_lsn: Lsn, - metadata: TimelineMetadata, - lineage: Lineage, - last_aux_file_policy: Option, - ) -> Self { - let layer_metadata = layers_and_metadata.clone(); - - Self { + pub(crate) fn empty(metadata: TimelineMetadata) -> Self { + IndexPart { version: Self::LATEST_VERSION, - layer_metadata, - disk_consistent_lsn, + layer_metadata: Default::default(), + disk_consistent_lsn: metadata.disk_consistent_lsn(), metadata, deleted_at: None, - lineage, - last_aux_file_policy, + lineage: Default::default(), + last_aux_file_policy: None, } } @@ -106,7 +101,7 @@ impl IndexPart { /// If you want this under normal operations, read it from self.metadata: /// 
this method is just for the scrubber to use when validating an index. - pub fn get_disk_consistent_lsn(&self) -> Lsn { + pub fn duplicated_disk_consistent_lsn(&self) -> Lsn { self.disk_consistent_lsn } @@ -120,14 +115,7 @@ impl IndexPart { #[cfg(test)] pub(crate) fn example() -> Self { - let example_metadata = TimelineMetadata::example(); - Self::new( - &HashMap::new(), - example_metadata.disk_consistent_lsn(), - example_metadata, - Default::default(), - Some(AuxFilePolicy::V1), - ) + Self::empty(TimelineMetadata::example()) } pub(crate) fn last_aux_file_policy(&self) -> Option { @@ -135,22 +123,6 @@ impl IndexPart { } } -impl From<&UploadQueueInitialized> for IndexPart { - fn from(uq: &UploadQueueInitialized) -> Self { - let disk_consistent_lsn = uq.latest_metadata.disk_consistent_lsn(); - let metadata = uq.latest_metadata.clone(); - let lineage = uq.latest_lineage.clone(); - - Self::new( - &uq.latest_files, - disk_consistent_lsn, - metadata, - lineage, - uq.last_aux_file_policy, - ) - } -} - /// Metadata gathered for each of the layer files. /// /// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which @@ -236,11 +208,10 @@ impl Lineage { /// The queried lsn is most likely the basebackup lsn, and this answers question "is it allowed /// to start a read/write primary at this lsn". /// - /// Returns true if the Lsn was previously a branch point. + /// Returns true if the Lsn was previously our branch point. pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool { self.original_ancestor - .as_ref() - .is_some_and(|(_, ancestor_lsn, _)| lsn == *ancestor_lsn) + .is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn) } } diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index e8e824f415..c4dd184610 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -1,6 +1,7 @@ //! 
Helper functions to upload files to remote storage with a RemoteStorage use anyhow::{bail, Context}; +use bytes::Bytes; use camino::Utf8Path; use fail::fail_point; use pageserver_api::shard::TenantShardId; @@ -11,10 +12,10 @@ use tokio::io::AsyncSeekExt; use tokio_util::sync::CancellationToken; use utils::{backoff, pausable_failpoint}; +use super::index::IndexPart; use super::Generation; use crate::tenant::remote_timeline_client::{ - index::IndexPart, remote_index_path, remote_initdb_archive_path, - remote_initdb_preserved_archive_path, + remote_index_path, remote_initdb_archive_path, remote_initdb_preserved_archive_path, }; use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError}; use utils::id::{TenantId, TimelineId}; @@ -27,7 +28,7 @@ pub(crate) async fn upload_index_part<'a>( tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, generation: Generation, - index_part: &'a IndexPart, + index_part: &IndexPart, cancel: &CancellationToken, ) -> anyhow::Result<()> { tracing::trace!("uploading new index part"); @@ -37,16 +38,16 @@ pub(crate) async fn upload_index_part<'a>( }); pausable_failpoint!("before-upload-index-pausable"); - let index_part_bytes = index_part - .to_s3_bytes() - .context("serialize index part file into bytes")?; - let index_part_size = index_part_bytes.len(); - let index_part_bytes = bytes::Bytes::from(index_part_bytes); + // FIXME: this error comes too late + let serialized = index_part.to_s3_bytes()?; + let serialized = Bytes::from(serialized); + + let index_part_size = serialized.len(); let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation); storage .upload_storage_object( - futures::stream::once(futures::future::ready(Ok(index_part_bytes))), + futures::stream::once(futures::future::ready(Ok(serialized))), index_part_size, &remote_path, cancel, diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 5c915d6b53..62803c7838 100644 --- 
a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -1000,7 +1000,7 @@ impl<'a> TenantDownloader<'a> { layer.name, layer.metadata.file_size ); - let downloaded_bytes = match download_layer_file( + let downloaded_bytes = download_layer_file( self.conf, self.remote_storage, *tenant_shard_id, @@ -1011,8 +1011,9 @@ impl<'a> TenantDownloader<'a> { &self.secondary_state.cancel, ctx, ) - .await - { + .await; + + let downloaded_bytes = match downloaded_bytes { Ok(bytes) => bytes, Err(DownloadError::NotFound) => { // A heatmap might be out of date and refer to a layer that doesn't exist any more. diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs index 0ec1c7872a..28cf2125df 100644 --- a/pageserver/src/tenant/secondary/scheduler.rs +++ b/pageserver/src/tenant/secondary/scheduler.rs @@ -334,8 +334,11 @@ where let tenant_shard_id = job.get_tenant_shard_id(); let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) { - tracing::info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), - "Command already running, waiting for it"); + tracing::info!( + tenant_id=%tenant_shard_id.tenant_id, + shard_id=%tenant_shard_id.shard_slug(), + "Command already running, waiting for it" + ); barrier } else { let running = self.spawn_now(job); diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 9ccf20c0d4..0b3f841ccf 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -318,7 +318,7 @@ pub(crate) struct LayerFringe { #[derive(Debug)] struct LayerKeyspace { layer: ReadableLayer, - target_keyspace: KeySpace, + target_keyspace: Vec, } impl LayerFringe { @@ -336,6 +336,7 @@ impl LayerFringe { }; let removed = self.layers.remove_entry(&read_desc.layer_id); + match removed { Some(( _, @@ -343,7 +344,15 @@ impl LayerFringe { layer, target_keyspace, }, - )) => 
Some((layer, target_keyspace, read_desc.lsn_range)), + )) => { + let mut keyspace = KeySpaceRandomAccum::new(); + for ks in target_keyspace { + for part in ks.ranges { + keyspace.add_range(part); + } + } + Some((layer, keyspace.consume_keyspace(), read_desc.lsn_range)) + } None => unreachable!("fringe internals are always consistent"), } } @@ -358,7 +367,7 @@ impl LayerFringe { let entry = self.layers.entry(layer_id.clone()); match entry { Entry::Occupied(mut entry) => { - entry.get_mut().target_keyspace.merge(&keyspace); + entry.get_mut().target_keyspace.push(keyspace); } Entry::Vacant(entry) => { self.planned_reads_by_lsn.push(ReadDesc { @@ -367,7 +376,7 @@ impl LayerFringe { }); entry.insert(LayerKeyspace { layer, - target_keyspace: keyspace, + target_keyspace: vec![keyspace], }); } } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 1b3802840f..999e2e8679 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -478,6 +478,23 @@ impl DeltaLayerWriterInner { key_end: Key, timeline: &Arc, ctx: &RequestContext, + ) -> anyhow::Result { + let temp_path = self.path.clone(); + let result = self.finish0(key_end, timeline, ctx).await; + if result.is_err() { + tracing::info!(%temp_path, "cleaning up temporary file after error during writing"); + if let Err(e) = std::fs::remove_file(&temp_path) { + tracing::warn!(error=%e, %temp_path, "error cleaning up temporary layer file after error during writing"); + } + } + result + } + + async fn finish0( + self, + key_end: Key, + timeline: &Arc, + ctx: &RequestContext, ) -> anyhow::Result { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; @@ -651,19 +668,11 @@ impl DeltaLayerWriter { timeline: &Arc, ctx: &RequestContext, ) -> anyhow::Result { - let inner = self.inner.take().unwrap(); - let temp_path = inner.path.clone(); - let result = 
inner.finish(key_end, timeline, ctx).await; - // The delta layer files can sometimes be really large. Clean them up. - if result.is_err() { - tracing::warn!( - "Cleaning up temporary delta file {temp_path} after error during writing" - ); - if let Err(e) = std::fs::remove_file(&temp_path) { - tracing::warn!("Error cleaning up temporary delta layer file {temp_path}: {e:?}") - } - } - result + self.inner + .take() + .unwrap() + .finish(key_end, timeline, ctx) + .await } } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 8394b33f19..285618b146 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -917,26 +917,57 @@ impl Drop for ImageLayerWriter { #[cfg(test)] mod test { + use std::time::Duration; + use bytes::Bytes; use pageserver_api::{ key::Key, shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}, }; - use utils::{id::TimelineId, lsn::Lsn}; + use utils::{ + generation::Generation, + id::{TenantId, TimelineId}, + lsn::Lsn, + }; - use crate::{tenant::harness::TenantHarness, DEFAULT_PG_VERSION}; + use crate::{ + tenant::{config::TenantConf, harness::TenantHarness}, + DEFAULT_PG_VERSION, + }; use super::ImageLayerWriter; #[tokio::test] async fn image_layer_rewrite() { - let harness = TenantHarness::create("test_image_layer_rewrite").unwrap(); - let (tenant, ctx) = harness.load().await; - + let tenant_conf = TenantConf { + gc_period: Duration::ZERO, + compaction_period: Duration::ZERO, + ..TenantConf::default() + }; + let tenant_id = TenantId::generate(); + let mut gen = Generation::new(0xdead0001); + let mut get_next_gen = || { + let ret = gen; + gen = gen.next(); + ret + }; // The LSN at which we will create an image layer to filter let lsn = Lsn(0xdeadbeef0000); - let timeline_id = TimelineId::generate(); + + // + // Create an unsharded parent with a layer. 
+ // + + let harness = TenantHarness::create_custom( + "test_image_layer_rewrite--parent", + tenant_conf.clone(), + tenant_id, + ShardIdentity::unsharded(), + get_next_gen(), + ) + .unwrap(); + let (tenant, ctx) = harness.load().await; let timeline = tenant .create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx) .await @@ -971,9 +1002,47 @@ mod test { }; let original_size = resident.metadata().file_size; + // + // Create child shards and do the rewrite, exercising filter(). + // TODO: abstraction in TenantHarness for splits. + // + // Filter for various shards: this exercises cases like values at start of key range, end of key // range, middle of key range. - for shard_number in 0..4 { + let shard_count = ShardCount::new(4); + for shard_number in 0..shard_count.count() { + // + // mimic the shard split + // + let shard_identity = ShardIdentity::new( + ShardNumber(shard_number), + shard_count, + ShardStripeSize(0x8000), + ) + .unwrap(); + let harness = TenantHarness::create_custom( + Box::leak(Box::new(format!( + "test_image_layer_rewrite--child{}", + shard_identity.shard_slug() + ))), + tenant_conf.clone(), + tenant_id, + shard_identity, + // NB: in reality, the shards would each fork off their own gen number sequence from the parent. + // But here, all we care about is that the gen number is unique. 
+ get_next_gen(), + ) + .unwrap(); + let (tenant, ctx) = harness.load().await; + let timeline = tenant + .create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + // + // use filter() and make assertions + // + let mut filtered_writer = ImageLayerWriter::new( harness.conf, timeline_id, @@ -985,15 +1054,6 @@ mod test { .await .unwrap(); - // TenantHarness gave us an unsharded tenant, but we'll use a sharded ShardIdentity - // to exercise filter() - let shard_identity = ShardIdentity::new( - ShardNumber(shard_number), - ShardCount::new(4), - ShardStripeSize(0x8000), - ) - .unwrap(); - let wrote_keys = resident .filter(&shard_identity, &mut filtered_writer, &ctx) .await diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 3ac799c69a..18f9ba4ef8 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -277,9 +277,10 @@ impl Layer { let downloaded = resident.expect("just initialized"); - // if the rename works, the path is as expected - // TODO: sync system call - std::fs::rename(temp_path, owner.local_path()) + // We never want to overwrite an existing file, so we use `RENAME_NOREPLACE`. + // TODO: this leaves the temp file in place if the rename fails, risking us running + // out of space. Should we clean it up here or does the calling context deal with this? 
+ utils::fs_ext::rename_noreplace(temp_path.as_std_path(), owner.local_path().as_std_path()) .with_context(|| format!("rename temporary file as correct path for {owner}"))?; Ok(ResidentLayer { downloaded, owner }) @@ -366,7 +367,10 @@ impl Layer { .0 .get_or_maybe_download(true, Some(ctx)) .await - .map_err(|err| GetVectoredError::Other(anyhow::anyhow!(err)))?; + .map_err(|err| match err { + DownloadError::DownloadCancelled => GetVectoredError::Cancelled, + other => GetVectoredError::Other(anyhow::anyhow!(other)), + })?; self.0 .access_stats @@ -1158,6 +1162,11 @@ impl LayerInner { let consecutive_failures = 1 + self.consecutive_failures.fetch_add(1, Ordering::Relaxed); + if timeline.cancel.is_cancelled() { + // If we're shutting down, drop out before logging the error + return Err(e); + } + tracing::error!(consecutive_failures, "layer file download failed: {e:#}"); let backoff = utils::backoff::exponential_backoff_duration_seconds( diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index bf2d8a47b4..a6dfa84f35 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -380,21 +380,28 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { let res = tenant .gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx) .await; - if let Err(e) = res { - let wait_duration = backoff::exponential_backoff_duration_seconds( - error_run_count + 1, - 1.0, - MAX_BACKOFF_SECS, - ); - error_run_count += 1; - let wait_duration = Duration::from_secs_f64(wait_duration); - error!( + match res { + Ok(_) => { + error_run_count = 0; + period + } + Err(crate::tenant::GcError::TenantCancelled) => { + return; + } + Err(e) => { + let wait_duration = backoff::exponential_backoff_duration_seconds( + error_run_count + 1, + 1.0, + MAX_BACKOFF_SECS, + ); + error_run_count += 1; + let wait_duration = Duration::from_secs_f64(wait_duration); + + error!( "Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}", 
); - wait_duration - } else { - error_run_count = 0; - period + wait_duration + } } }; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d4f6e25843..4c46c4e635 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -102,7 +102,6 @@ use crate::metrics::{ }; use crate::pgdatadir_mapping::CalculateLogicalSizeError; use crate::tenant::config::TenantConfOpt; -use pageserver_api::key::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key}; use pageserver_api::reltag::RelTag; use pageserver_api::shard::ShardIndex; @@ -131,14 +130,17 @@ use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; -use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline}; use super::{config::TenantConf, storage_layer::VectoredValueReconstructState}; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer}; +use super::{ + secondary::heatmap::{HeatMapLayer, HeatMapTimeline}, + GcError, +}; #[derive(Debug, PartialEq, Eq, Clone, Copy)] -pub(super) enum FlushLoopState { +pub(crate) enum FlushLoopState { NotStarted, Running { #[cfg(test)] @@ -496,15 +498,11 @@ pub(crate) enum PageReconstructError { Other(#[from] anyhow::Error), #[error("Ancestor LSN wait error: {0}")] - AncestorLsnTimeout(#[from] WaitLsnError), + AncestorLsnTimeout(WaitLsnError), #[error("timeline shutting down")] Cancelled, - /// The ancestor of this is being stopped - #[error("ancestor timeline {0} is being stopped")] - AncestorStopping(TimelineId), - /// An error happened replaying WAL records #[error(transparent)] WalRedo(anyhow::Error), @@ -569,7 +567,7 @@ impl PageReconstructError { match self { Other(_) => false, AncestorLsnTimeout(_) => false, - Cancelled | 
AncestorStopping(_) => true, + Cancelled => true, WalRedo(_) => false, MissingKey { .. } => false, } @@ -577,7 +575,7 @@ impl PageReconstructError { } #[derive(thiserror::Error, Debug)] -enum CreateImageLayersError { +pub(crate) enum CreateImageLayersError { #[error("timeline shutting down")] Cancelled, @@ -591,17 +589,35 @@ enum CreateImageLayersError { Other(#[from] anyhow::Error), } -#[derive(thiserror::Error, Debug)] -enum FlushLayerError { +#[derive(thiserror::Error, Debug, Clone)] +pub(crate) enum FlushLayerError { /// Timeline cancellation token was cancelled #[error("timeline shutting down")] Cancelled, + /// We tried to flush a layer while the Timeline is in an unexpected state + #[error("cannot flush frozen layers when flush_loop is not running, state is {0:?}")] + NotRunning(FlushLoopState), + + // Arc<> the following non-clonable error types: we must be Clone-able because the flush error is propagated from the flush + // loop via a watch channel, where we can only borrow it. #[error(transparent)] - CreateImageLayersError(CreateImageLayersError), + CreateImageLayersError(Arc), #[error(transparent)] - Other(#[from] anyhow::Error), + Other(#[from] Arc), +} + +impl FlushLayerError { + // When crossing from generic anyhow errors to this error type, we explicitly check + // for timeline cancellation to avoid logging inoffensive shutdown errors as warn/err. 
+ fn from_anyhow(timeline: &Timeline, err: anyhow::Error) -> Self { + if timeline.cancel.is_cancelled() { + Self::Cancelled + } else { + Self::Other(Arc::new(err)) + } + } } #[derive(thiserror::Error, Debug)] @@ -627,17 +643,17 @@ pub(crate) enum GetVectoredError { #[derive(thiserror::Error, Debug)] pub(crate) enum GetReadyAncestorError { - #[error("ancestor timeline {0} is being stopped")] - AncestorStopping(TimelineId), - #[error("Ancestor LSN wait error: {0}")] AncestorLsnTimeout(#[from] WaitLsnError), + #[error("Bad state on timeline {timeline_id}: {state:?}")] + BadState { + timeline_id: TimelineId, + state: TimelineState, + }, + #[error("Cancelled")] Cancelled, - - #[error(transparent)] - Other(#[from] anyhow::Error), } #[derive(Clone, Copy)] @@ -672,8 +688,8 @@ pub(crate) enum WaitLsnError { Shutdown, // Called on an timeline not in active state or shutting down - #[error("Bad state (not active)")] - BadState, + #[error("Bad timeline state: {0:?}")] + BadState(TimelineState), // Timeout expired while waiting for LSN to catch up with goal. #[error("{0}")] @@ -696,7 +712,7 @@ impl From for FlushLayerError { fn from(e: CreateImageLayersError) -> Self { match e { CreateImageLayersError::Cancelled => FlushLayerError::Cancelled, - any => FlushLayerError::CreateImageLayersError(any), + any => FlushLayerError::CreateImageLayersError(Arc::new(any)), } } } @@ -736,10 +752,9 @@ impl From for PageReconstructError { fn from(e: GetReadyAncestorError) -> Self { use GetReadyAncestorError::*; match e { - AncestorStopping(tid) => PageReconstructError::AncestorStopping(tid), AncestorLsnTimeout(wait_err) => PageReconstructError::AncestorLsnTimeout(wait_err), + bad_state @ BadState { .. 
} => PageReconstructError::Other(anyhow::anyhow!(bad_state)), Cancelled => PageReconstructError::Cancelled, - Other(other) => PageReconstructError::Other(other), } } } @@ -1171,9 +1186,7 @@ impl Timeline { use PageReconstructError::*; match block { - Err(Cancelled | AncestorStopping(_)) => { - return Err(GetVectoredError::Cancelled) - } + Err(Cancelled) => return Err(GetVectoredError::Cancelled), Err(MissingKey(_)) if NON_INHERITED_RANGE.contains(&key) || NON_INHERITED_SPARSE_RANGE.contains(&key) => @@ -1448,10 +1461,11 @@ impl Timeline { who_is_waiting: WaitLsnWaiter<'_>, ctx: &RequestContext, /* Prepare for use by cancellation */ ) -> Result<(), WaitLsnError> { - if self.cancel.is_cancelled() { + let state = self.current_state(); + if self.cancel.is_cancelled() || matches!(state, TimelineState::Stopping) { return Err(WaitLsnError::Shutdown); - } else if !self.is_active() { - return Err(WaitLsnError::BadState); + } else if !matches!(state, TimelineState::Active) { + return Err(WaitLsnError::BadState(state)); } if cfg!(debug_assertions) { @@ -1547,13 +1561,13 @@ impl Timeline { /// Flush to disk all data that was written with the put_* functions #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))] - pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> { + pub(crate) async fn freeze_and_flush(&self) -> Result<(), FlushLayerError> { self.freeze_and_flush0().await } // This exists to provide a non-span creating version of `freeze_and_flush` we can call without // polluting the span hierarchy. 
- pub(crate) async fn freeze_and_flush0(&self) -> anyhow::Result<()> { + pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> { let to_lsn = self.freeze_inmem_layer(false).await; self.flush_frozen_layers_and_wait(to_lsn).await } @@ -2735,11 +2749,6 @@ impl Timeline { self.current_logical_size.initialized.add_permits(1); } - enum BackgroundCalculationError { - Cancelled, - Other(anyhow::Error), - } - let try_once = |attempt: usize| { let background_ctx = &background_ctx; let self_ref = &self; @@ -2757,10 +2766,10 @@ impl Timeline { (Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit) } _ = self_ref.cancel.cancelled() => { - return Err(BackgroundCalculationError::Cancelled); + return Err(CalculateLogicalSizeError::Cancelled); } _ = cancel.cancelled() => { - return Err(BackgroundCalculationError::Cancelled); + return Err(CalculateLogicalSizeError::Cancelled); }, () = skip_concurrency_limiter.cancelled() => { // Some action that is part of a end user interaction requested logical size @@ -2778,28 +2787,21 @@ impl Timeline { crate::metrics::initial_logical_size::START_CALCULATION.retry(circumstances) }; - match self_ref + let calculated_size = self_ref .logical_size_calculation_task( initial_part_end, LogicalSizeCalculationCause::Initial, background_ctx, ) - .await - { - Ok(calculated_size) => Ok((calculated_size, metrics_guard)), - Err(CalculateLogicalSizeError::Cancelled) => { - Err(BackgroundCalculationError::Cancelled) - } - Err(CalculateLogicalSizeError::Other(err)) => { - if let Some(PageReconstructError::AncestorStopping(_)) = - err.root_cause().downcast_ref() - { - Err(BackgroundCalculationError::Cancelled) - } else { - Err(BackgroundCalculationError::Other(err)) - } - } - } + .await?; + + self_ref + .trigger_aux_file_size_computation(initial_part_end, background_ctx) + .await?; + + // TODO: add aux file size to logical size + + Ok((calculated_size, metrics_guard)) } }; @@ -2810,8 +2812,11 @@ impl Timeline { match 
try_once(attempt).await { Ok(res) => return ControlFlow::Continue(res), - Err(BackgroundCalculationError::Cancelled) => return ControlFlow::Break(()), - Err(BackgroundCalculationError::Other(e)) => { + Err(CalculateLogicalSizeError::Cancelled) => return ControlFlow::Break(()), + Err( + e @ (CalculateLogicalSizeError::Decode(_) + | CalculateLogicalSizeError::PageRead(_)), + ) => { warn!(attempt, "initial size calculation failed: {e:?}"); // exponential back-off doesn't make sense at these long intervals; // use fixed retry interval with generous jitter instead @@ -3188,17 +3193,21 @@ impl Timeline { } // Recurse into ancestor if needed - if is_inherited_key(key) && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { - trace!( - "going into ancestor {}, cont_lsn is {}", - timeline.ancestor_lsn, - cont_lsn - ); + if let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() { + if key.is_inherited_key() && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { + trace!( + "going into ancestor {}, cont_lsn is {}", + timeline.ancestor_lsn, + cont_lsn + ); - timeline_owned = timeline.get_ready_ancestor_timeline(ctx).await?; - timeline = &*timeline_owned; - prev_lsn = None; - continue 'outer; + timeline_owned = timeline + .get_ready_ancestor_timeline(ancestor_timeline, ctx) + .await?; + timeline = &*timeline_owned; + prev_lsn = None; + continue 'outer; + } } let guard = timeline.layers.read().await; @@ -3347,10 +3356,10 @@ impl Timeline { break None; } - // Not fully retrieved but no ancestor timeline. - if timeline.ancestor_timeline.is_none() { + let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() else { + // Not fully retrieved but no ancestor timeline. break Some(keyspace); - } + }; // Now we see if there are keys covered by the image layer but does not exist in the // image layer, which means that the key does not exist. @@ -3370,7 +3379,7 @@ impl Timeline { // Take the min to avoid reconstructing a page with data newer than request Lsn. 
cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1)); timeline_owned = timeline - .get_ready_ancestor_timeline(ctx) + .get_ready_ancestor_timeline(ancestor_timeline, ctx) .await .map_err(GetVectoredError::GetReadyAncestorError)?; timeline = &*timeline_owned; @@ -3542,13 +3551,9 @@ impl Timeline { async fn get_ready_ancestor_timeline( &self, + ancestor: &Arc, ctx: &RequestContext, ) -> Result, GetReadyAncestorError> { - let ancestor = match self.get_ancestor_timeline() { - Ok(timeline) => timeline, - Err(e) => return Err(GetReadyAncestorError::from(e)), - }; - // It's possible that the ancestor timeline isn't active yet, or // is active but hasn't yet caught up to the branch point. Wait // for it. @@ -3576,16 +3581,14 @@ impl Timeline { match ancestor.wait_to_become_active(ctx).await { Ok(()) => {} Err(TimelineState::Stopping) => { - return Err(GetReadyAncestorError::AncestorStopping( - ancestor.timeline_id, - )); + // If an ancestor is stopping, it means the tenant is stopping: handle this the same as if this timeline was stopping. + return Err(GetReadyAncestorError::Cancelled); } Err(state) => { - return Err(GetReadyAncestorError::Other(anyhow::anyhow!( - "Timeline {} will not become active. 
Current state: {:?}", - ancestor.timeline_id, - &state, - ))); + return Err(GetReadyAncestorError::BadState { + timeline_id: ancestor.timeline_id, + state, + }); } } ancestor @@ -3594,21 +3597,17 @@ impl Timeline { .map_err(|e| match e { e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e), WaitLsnError::Shutdown => GetReadyAncestorError::Cancelled, - e @ WaitLsnError::BadState => GetReadyAncestorError::Other(anyhow::anyhow!(e)), + WaitLsnError::BadState(state) => GetReadyAncestorError::BadState { + timeline_id: ancestor.timeline_id, + state, + }, })?; - Ok(ancestor) + Ok(ancestor.clone()) } - pub(crate) fn get_ancestor_timeline(&self) -> anyhow::Result> { - let ancestor = self.ancestor_timeline.as_ref().with_context(|| { - format!( - "Ancestor is missing. Timeline id: {} Ancestor id {:?}", - self.timeline_id, - self.get_ancestor_timeline_id(), - ) - })?; - Ok(Arc::clone(ancestor)) + pub(crate) fn get_ancestor_timeline(&self) -> Option> { + self.ancestor_timeline.clone() } pub(crate) fn get_shard_identity(&self) -> &ShardIdentity { @@ -3717,7 +3716,9 @@ impl Timeline { return; } err @ Err( - FlushLayerError::Other(_) | FlushLayerError::CreateImageLayersError(_), + FlushLayerError::NotRunning(_) + | FlushLayerError::Other(_) + | FlushLayerError::CreateImageLayersError(_), ) => { error!("could not flush frozen layer: {err:?}"); break err.map(|_| ()); @@ -3763,7 +3764,10 @@ impl Timeline { /// `last_record_lsn` may be higher than the highest LSN of a frozen layer: if this is the case, /// it means no data will be written between the top of the highest frozen layer and to_lsn, /// e.g. because this tenant shard has ingested up to to_lsn and not written any data locally for that part of the WAL. 
- async fn flush_frozen_layers_and_wait(&self, last_record_lsn: Lsn) -> anyhow::Result<()> { + async fn flush_frozen_layers_and_wait( + &self, + last_record_lsn: Lsn, + ) -> Result<(), FlushLayerError> { let mut rx = self.layer_flush_done_tx.subscribe(); // Increment the flush cycle counter and wake up the flush task. @@ -3774,7 +3778,7 @@ impl Timeline { let flush_loop_state = { *self.flush_loop_state.lock().unwrap() }; if !matches!(flush_loop_state, FlushLoopState::Running { .. }) { - anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}") + return Err(FlushLayerError::NotRunning(flush_loop_state)); } self.layer_flush_start_tx.send_modify(|(counter, lsn)| { @@ -3787,14 +3791,11 @@ impl Timeline { { let (last_result_counter, last_result) = &*rx.borrow(); if *last_result_counter >= my_flush_request { - if let Err(_err) = last_result { + if let Err(err) = last_result { // We already logged the original error in // flush_loop. We cannot propagate it to the caller // here, because it might not be Cloneable - anyhow::bail!( - "Could not flush frozen layer. Request id: {}", - my_flush_request - ); + return Err(err.clone()); } else { return Ok(()); } @@ -3803,7 +3804,7 @@ impl Timeline { trace!("waiting for flush to complete"); tokio::select! { rx_e = rx.changed() => { - rx_e?; + rx_e.map_err(|_| FlushLayerError::NotRunning(*self.flush_loop_state.lock().unwrap()))?; }, // Cancellation safety: we are not leaving an I/O in-flight for the flush, we're just ignoring // the notification from [`flush_loop`] that it completed. @@ -3875,31 +3876,36 @@ impl Timeline { EnumSet::empty(), ctx, ) - .await?; + .await + .map_err(|e| FlushLayerError::from_anyhow(self, e))?; if self.cancel.is_cancelled() { return Err(FlushLayerError::Cancelled); } + // FIXME(auxfilesv2): support multiple metadata key partitions might need initdb support as well? + // This code path will not be hit during regression tests. 
After #7099 we have a single partition + // with two key ranges. If someone wants to fix initdb optimization in the future, this might need + // to be fixed. + // For metadata, always create delta layers. let delta_layer = if !metadata_partition.parts.is_empty() { assert_eq!( metadata_partition.parts.len(), 1, - "currently sparse keyspace should only contain a single aux file keyspace" + "currently sparse keyspace should only contain a single metadata keyspace" ); let metadata_keyspace = &metadata_partition.parts[0]; - assert_eq!( - metadata_keyspace.0.ranges.len(), - 1, - "aux file keyspace should be a single range" - ); self.create_delta_layer( &frozen_layer, - Some(metadata_keyspace.0.ranges[0].clone()), + Some( + metadata_keyspace.0.ranges.first().unwrap().start + ..metadata_keyspace.0.ranges.last().unwrap().end, + ), ctx, ) - .await? + .await + .map_err(|e| FlushLayerError::from_anyhow(self, e))? } else { None }; @@ -3926,7 +3932,11 @@ impl Timeline { // Normal case, write out a L0 delta layer file. // `create_delta_layer` will not modify the layer map. // We will remove frozen layer and add delta layer in one atomic operation later. - let Some(layer) = self.create_delta_layer(&frozen_layer, None, ctx).await? else { + let Some(layer) = self + .create_delta_layer(&frozen_layer, None, ctx) + .await + .map_err(|e| FlushLayerError::from_anyhow(self, e))? 
+ else { panic!("delta layer cannot be empty if no filter is applied"); }; ( @@ -3959,7 +3969,8 @@ impl Timeline { if self.set_disk_consistent_lsn(disk_consistent_lsn) { // Schedule remote uploads that will reflect our new disk_consistent_lsn - self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?; + self.schedule_uploads(disk_consistent_lsn, layers_to_upload) + .map_err(|e| FlushLayerError::from_anyhow(self, e))?; } // release lock on 'layers' }; @@ -4257,7 +4268,7 @@ impl Timeline { // Unfortunately we cannot do this for the main fork, or for // any metadata keys, keys, as that would lead to actual data // loss. - if is_rel_fsm_block_key(img_key) || is_rel_vm_block_key(img_key) { + if img_key.is_rel_fsm_block_key() || img_key.is_rel_vm_block_key() { warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}"); ZERO_PAGE.clone() } else { @@ -4307,6 +4318,7 @@ impl Timeline { ctx: &RequestContext, img_range: Range, mode: ImageLayerCreationMode, + start: Key, ) -> Result { assert!(!matches!(mode, ImageLayerCreationMode::Initial)); @@ -4315,39 +4327,43 @@ impl Timeline { let data = self .get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx) .await?; - let (data, total_kb_retrieved, total_key_retrieved) = { + let (data, total_kb_retrieved, total_keys_retrieved) = { let mut new_data = BTreeMap::new(); let mut total_kb_retrieved = 0; - let mut total_key_retrieved = 0; + let mut total_keys_retrieved = 0; for (k, v) in data { let v = v.map_err(CreateImageLayersError::PageReconstructError)?; total_kb_retrieved += KEY_SIZE + v.len(); - total_key_retrieved += 1; + total_keys_retrieved += 1; new_data.insert(k, v); } - (new_data, total_kb_retrieved / 1024, total_key_retrieved) + (new_data, total_kb_retrieved / 1024, total_keys_retrieved) }; - let delta_file_accessed = reconstruct_state.get_delta_layers_visited(); + let delta_files_accessed = reconstruct_state.get_delta_layers_visited(); - let trigger_generation = 
delta_file_accessed as usize >= MAX_AUX_FILE_V2_DELTAS; + let trigger_generation = delta_files_accessed as usize >= MAX_AUX_FILE_V2_DELTAS; debug!( - "generate image layers for metadata keys: trigger_generation={trigger_generation}, \ - delta_file_accessed={delta_file_accessed}, total_kb_retrieved={total_kb_retrieved}, \ - total_key_retrieved={total_key_retrieved}" + trigger_generation, + delta_files_accessed, + total_kb_retrieved, + total_keys_retrieved, + "generate metadata images" ); + if !trigger_generation && mode == ImageLayerCreationMode::Try { return Ok(ImageLayerCreationOutcome { image: None, next_start_key: img_range.end, }); } - let has_keys = !data.is_empty(); + let mut wrote_any_image = false; for (k, v) in data { - // Even if the value is empty (deleted), we do not delete it for now until we can ensure vectored get - // considers this situation properly. - // if v.is_empty() { - // continue; - // } + if v.is_empty() { + // the key has been deleted, it does not need an image + // in metadata keyspace, an empty image == tombstone + continue; + } + wrote_any_image = true; // No need to handle sharding b/c metadata keys are always on the 0-th shard. @@ -4355,16 +4371,26 @@ impl Timeline { // on the normal data path either. image_layer_writer.put_image(k, v, ctx).await?; } - Ok(ImageLayerCreationOutcome { - image: if has_keys { - let image_layer = image_layer_writer.finish(self, ctx).await?; - Some(image_layer) - } else { - tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); - None - }, - next_start_key: img_range.end, - }) + + if wrote_any_image { + // Normal path: we have written some data into the new image layer for this + // partition, so flush it to disk. 
+ let image_layer = image_layer_writer.finish(self, ctx).await?; + Ok(ImageLayerCreationOutcome { + image: Some(image_layer), + next_start_key: img_range.end, + }) + } else { + // Special case: the image layer may be empty if this is a sharded tenant and the + // partition does not cover any keys owned by this shard. In this case, to ensure + // we don't leave gaps between image layers, leave `start` where it is, so that the next + // layer we write will cover the key range that we just scanned. + tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); + Ok(ImageLayerCreationOutcome { + image: None, + next_start_key: start, + }) + } } #[tracing::instrument(skip_all, fields(%lsn, %mode))] @@ -4474,6 +4500,7 @@ impl Timeline { ctx, img_range, mode, + start, ) .await?; start = next_start_key; @@ -4835,7 +4862,7 @@ impl Timeline { /// Currently, we don't make any attempt at removing unneeded page versions /// within a layer file. We can only remove the whole file if it's fully /// obsolete. - pub(super) async fn gc(&self) -> anyhow::Result { + pub(super) async fn gc(&self) -> Result { // this is most likely the background tasks, but it might be the spawned task from // immediate_gc let _g = tokio::select! { @@ -4848,7 +4875,7 @@ impl Timeline { // Is the timeline being deleted? if self.is_stopping() { - anyhow::bail!("timeline is Stopping"); + return Err(GcError::TimelineCancelled); } let (horizon_cutoff, pitr_cutoff, retain_lsns) = { @@ -4906,7 +4933,7 @@ impl Timeline { pitr_cutoff: Lsn, retain_lsns: Vec, new_gc_cutoff: Lsn, - ) -> anyhow::Result { + ) -> Result { // FIXME: if there is an ongoing detach_from_ancestor, we should just skip gc let now = SystemTime::now(); @@ -4928,12 +4955,15 @@ impl Timeline { // The GC cutoff should only ever move forwards. 
let waitlist = { let write_guard = self.latest_gc_cutoff_lsn.lock_for_write(); - ensure!( - *write_guard <= new_gc_cutoff, - "Cannot move GC cutoff LSN backwards (was {}, new {})", - *write_guard, - new_gc_cutoff - ); + if *write_guard > new_gc_cutoff { + return Err(GcError::BadLsn { + why: format!( + "Cannot move GC cutoff LSN backwards (was {}, new {})", + *write_guard, new_gc_cutoff + ), + }); + } + write_guard.store_and_unlock(new_gc_cutoff) }; waitlist.wait().await; @@ -5042,7 +5072,14 @@ impl Timeline { // This unconditionally schedules also an index_part.json update, even though, we will // be doing one a bit later with the unlinked gc'd layers. let disk_consistent_lsn = self.disk_consistent_lsn.load(); - self.schedule_uploads(disk_consistent_lsn, None)?; + self.schedule_uploads(disk_consistent_lsn, None) + .map_err(|e| { + if self.cancel.is_cancelled() { + GcError::TimelineCancelled + } else { + GcError::Remote(e) + } + })?; let gc_layers = layers_to_remove .iter() @@ -5051,7 +5088,15 @@ impl Timeline { result.layers_removed = gc_layers.len() as u64; - self.remote_client.schedule_gc_update(&gc_layers)?; + self.remote_client + .schedule_gc_update(&gc_layers) + .map_err(|e| { + if self.cancel.is_cancelled() { + GcError::TimelineCancelled + } else { + GcError::Remote(e) + } + })?; guard.finish_gc_timeline(&gc_layers); @@ -5066,7 +5111,7 @@ impl Timeline { result.layers_removed, new_gc_cutoff ); - result.elapsed = now.elapsed()?; + result.elapsed = now.elapsed().unwrap_or(Duration::ZERO); Ok(result) } @@ -5358,6 +5403,133 @@ impl Timeline { shard_count: self.tenant_shard_id.shard_count, } } + + #[cfg(test)] + pub(super) fn force_advance_lsn(self: &Arc, new_lsn: Lsn) { + self.last_record_lsn.advance(new_lsn); + } + + /// Force create an image layer and place it into the layer map. + /// + /// DO NOT use this function directly. 
Use [`Tenant::branch_timeline_test_with_layers`] + /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run. + #[cfg(test)] + pub(super) async fn force_create_image_layer( + self: &Arc, + lsn: Lsn, + mut images: Vec<(Key, Bytes)>, + check_start_lsn: Option, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + let last_record_lsn = self.get_last_record_lsn(); + assert!( + lsn <= last_record_lsn, + "advance last record lsn before inserting a layer, lsn={lsn}, last_record_lsn={last_record_lsn}" + ); + if let Some(check_start_lsn) = check_start_lsn { + assert!(lsn >= check_start_lsn); + } + images.sort_unstable_by(|(ka, _), (kb, _)| ka.cmp(kb)); + let min_key = *images.first().map(|(k, _)| k).unwrap(); + let max_key = images.last().map(|(k, _)| k).unwrap().next(); + let mut image_layer_writer = ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + &(min_key..max_key), + lsn, + ctx, + ) + .await?; + for (key, img) in images { + image_layer_writer.put_image(key, img, ctx).await?; + } + let image_layer = image_layer_writer.finish(self, ctx).await?; + + { + let mut guard = self.layers.write().await; + guard.force_insert_layer(image_layer); + } + + Ok(()) + } + + /// Force create a delta layer and place it into the layer map. + /// + /// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`] + /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run. 
+ #[cfg(test)] + pub(super) async fn force_create_delta_layer( + self: &Arc, + mut deltas: Vec<(Key, Lsn, Value)>, + check_start_lsn: Option, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + let last_record_lsn = self.get_last_record_lsn(); + deltas.sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb))); + let min_key = *deltas.first().map(|(k, _, _)| k).unwrap(); + let max_key = deltas.last().map(|(k, _, _)| k).unwrap().next(); + let min_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap(); + let max_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap(); + assert!( + max_lsn <= last_record_lsn, + "advance last record lsn before inserting a layer, max_lsn={max_lsn}, last_record_lsn={last_record_lsn}" + ); + let end_lsn = Lsn(max_lsn.0 + 1); + if let Some(check_start_lsn) = check_start_lsn { + assert!(min_lsn >= check_start_lsn); + } + let mut delta_layer_writer = DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + min_key, + min_lsn..end_lsn, + ctx, + ) + .await?; + for (key, lsn, val) in deltas { + delta_layer_writer.put_value(key, lsn, val, ctx).await?; + } + let delta_layer = delta_layer_writer.finish(max_key, self, ctx).await?; + + { + let mut guard = self.layers.write().await; + guard.force_insert_layer(delta_layer); + } + + Ok(()) + } + + /// Return all keys at the LSN in the image layers + #[cfg(test)] + pub(crate) async fn inspect_image_layers( + self: &Arc, + lsn: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result> { + let mut all_data = Vec::new(); + let guard = self.layers.read().await; + for layer in guard.layer_map().iter_historic_layers() { + if !layer.is_delta() && layer.image_layer_lsn() == lsn { + let layer = guard.get_from_desc(&layer); + let mut reconstruct_data = ValuesReconstructState::default(); + layer + .get_values_reconstruct_data( + KeySpace::single(Key::MIN..Key::MAX), + lsn..Lsn(lsn.0 + 1), + &mut reconstruct_data, + ctx, + ) + .await?; + for (k, v) in reconstruct_data.keys { + 
all_data.push((k, v?.img.unwrap().1)); + } + } + } + all_data.sort(); + Ok(all_data) + } } type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId); diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 07a12f535a..d8de6aee7c 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -133,8 +133,7 @@ impl Timeline { }, &image_ctx, ) - .await - .map_err(anyhow::Error::from)?; + .await?; self.upload_new_image_layers(image_layers)?; partitioning.parts.len() @@ -422,48 +421,6 @@ impl Timeline { return Ok(CompactLevel0Phase1Result::default()); } - // This failpoint is used together with `test_duplicate_layers` integration test. - // It returns the compaction result exactly the same layers as input to compaction. - // We want to ensure that this will not cause any problem when updating the layer map - // after the compaction is finished. - // - // Currently, there are two rare edge cases that will cause duplicated layers being - // inserted. - // 1. The compaction job is inturrupted / did not finish successfully. Assume we have file 1, 2, 3, 4, which - // is compacted to 5, but the page server is shut down, next time we start page server we will get a layer - // map containing 1, 2, 3, 4, and 5, whereas 5 has the same content as 4. If we trigger L0 compation at this - // point again, it is likely that we will get a file 6 which has the same content and the key range as 5, - // and this causes an overwrite. This is acceptable because the content is the same, and we should do a - // layer replace instead of the normal remove / upload process. - // 2. The input workload pattern creates exactly n files that are sorted, non-overlapping and is of target file - // size length. Compaction will likely create the same set of n files afterwards. - // - // This failpoint is a superset of both of the cases. 
- if cfg!(feature = "testing") { - let active = (|| { - ::fail::fail_point!("compact-level0-phase1-return-same", |_| true); - false - })(); - - if active { - let mut new_layers = Vec::with_capacity(level0_deltas.len()); - for delta in &level0_deltas { - // we are just faking these layers as being produced again for this failpoint - new_layers.push( - delta - .download_and_keep_resident() - .await - .context("download layer for failpoint")?, - ); - } - tracing::info!("compact-level0-phase1-return-same"); // so that we can check if we hit the failpoint - return Ok(CompactLevel0Phase1Result { - new_layers, - deltas_to_compact: level0_deltas, - }); - } - } - // Gather the files to compact in this iteration. // // Start with the oldest Level 0 delta file, and collect any other diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index e6ddabe5b5..4fc89330ba 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use super::{layer_manager::LayerManager, Timeline}; +use super::{layer_manager::LayerManager, FlushLayerError, Timeline}; use crate::{ context::{DownloadBehavior, RequestContext}, task_mgr::TaskKind, @@ -23,7 +23,7 @@ pub(crate) enum Error { #[error("shutting down, please retry later")] ShuttingDown, #[error("flushing failed")] - FlushAncestor(#[source] anyhow::Error), + FlushAncestor(#[source] FlushLayerError), #[error("layer download failed")] RewrittenDeltaDownloadFailed(#[source] anyhow::Error), #[error("copying LSN prefix locally failed")] diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 884b71df75..b78c98a506 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -255,6 +255,13 @@ impl LayerManager { updates.flush() } + #[cfg(test)] + pub(crate) fn 
force_insert_layer(&mut self, layer: ResidentLayer) { + let mut updates = self.layer_map.batch_update(); + Self::insert_historic_layer(layer.as_ref().clone(), &mut updates, &mut self.layer_fmgr); + updates.flush() + } + /// Helper function to insert a layer into the layer map and file manager. fn insert_historic_layer( layer: Layer, diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index 02f87303d1..50c977a950 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -3,12 +3,10 @@ use super::storage_layer::ResidentLayer; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::remote_timeline_client::index::IndexPart; use crate::tenant::remote_timeline_client::index::LayerFileMetadata; -use crate::tenant::remote_timeline_client::index::Lineage; use std::collections::{HashMap, VecDeque}; use std::fmt::Debug; use chrono::NaiveDateTime; -use pageserver_api::models::AuxFilePolicy; use std::sync::Arc; use tracing::info; use utils::lsn::AtomicLsn; @@ -45,34 +43,25 @@ pub(crate) struct UploadQueueInitialized { /// Counter to assign task IDs pub(crate) task_counter: u64, - /// All layer files stored in the remote storage, taking into account all - /// in-progress and queued operations - pub(crate) latest_files: HashMap, + /// The next uploaded index_part.json; assumed to be dirty. + /// + /// Should not be read, directly except for layer file updates. Instead you should add a + /// projected field. + pub(crate) dirty: IndexPart, + + /// The latest remote persisted IndexPart. + /// + /// Each completed metadata upload will update this. The second item is the task_id which last + /// updated the value, used to ensure we never store an older value over a newer one. + pub(crate) clean: (IndexPart, Option), /// How many file uploads or deletions been scheduled, since the /// last (scheduling of) metadata index upload? 
pub(crate) latest_files_changes_since_metadata_upload_scheduled: u64, - /// Metadata stored in the remote storage, taking into account all - /// in-progress and queued operations. - /// DANGER: do not return to outside world, e.g., safekeepers. - pub(crate) latest_metadata: TimelineMetadata, - - /// Part of the flattened "next" `index_part.json`. - pub(crate) latest_lineage: Lineage, - - /// The last aux file policy used on this timeline. - pub(crate) last_aux_file_policy: Option, - - /// `disk_consistent_lsn` from the last metadata file that was successfully - /// uploaded. `Lsn(0)` if nothing was uploaded yet. - /// Unlike `latest_files` or `latest_metadata`, this value is never ahead. - /// Safekeeper can rely on it to make decisions for WAL storage. - /// - /// visible_remote_consistent_lsn is only updated after our generation has been validated with + /// The Lsn is only updated after our generation has been validated with /// the control plane (unlesss a timeline's generation is None, in which case /// we skip validation) - pub(crate) projected_remote_consistent_lsn: Option, pub(crate) visible_remote_consistent_lsn: Arc, // Breakdown of different kinds of tasks currently in-progress @@ -118,7 +107,8 @@ impl UploadQueueInitialized { } pub(super) fn get_last_remote_consistent_lsn_projected(&self) -> Option { - self.projected_remote_consistent_lsn + let lsn = self.clean.0.metadata.disk_consistent_lsn(); + self.clean.1.map(|_| lsn) } } @@ -174,13 +164,12 @@ impl UploadQueue { info!("initializing upload queue for empty remote"); + let index_part = IndexPart::empty(metadata.clone()); + let state = UploadQueueInitialized { - // As described in the doc comment, it's ok for `latest_files` and `latest_metadata` to be ahead. 
- latest_files: HashMap::new(), + dirty: index_part.clone(), + clean: (index_part, None), latest_files_changes_since_metadata_upload_scheduled: 0, - latest_metadata: metadata.clone(), - latest_lineage: Lineage::default(), - projected_remote_consistent_lsn: None, visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)), // what follows are boring default initializations task_counter: 0, @@ -193,7 +182,6 @@ impl UploadQueue { dangling_files: HashMap::new(), shutting_down: false, shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)), - last_aux_file_policy: Default::default(), }; *self = UploadQueue::Initialized(state); @@ -211,22 +199,15 @@ impl UploadQueue { } } - let mut files = HashMap::with_capacity(index_part.layer_metadata.len()); - for (layer_name, layer_metadata) in &index_part.layer_metadata { - files.insert(layer_name.to_owned(), layer_metadata.clone()); - } - info!( "initializing upload queue with remote index_part.disk_consistent_lsn: {}", index_part.metadata.disk_consistent_lsn() ); let state = UploadQueueInitialized { - latest_files: files, + dirty: index_part.clone(), + clean: (index_part.clone(), None), latest_files_changes_since_metadata_upload_scheduled: 0, - latest_metadata: index_part.metadata.clone(), - latest_lineage: index_part.lineage.clone(), - projected_remote_consistent_lsn: Some(index_part.metadata.disk_consistent_lsn()), visible_remote_consistent_lsn: Arc::new( index_part.metadata.disk_consistent_lsn().into(), ), @@ -241,7 +222,6 @@ impl UploadQueue { dangling_files: HashMap::new(), shutting_down: false, shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)), - last_aux_file_policy: index_part.last_aux_file_policy(), }; *self = UploadQueue::Initialized(state); @@ -298,13 +278,16 @@ pub(crate) enum UploadOp { /// Upload a layer file UploadLayer(ResidentLayer, LayerFileMetadata), - /// Upload the metadata file - UploadMetadata(Box, Lsn), + /// Upload a index_part.json file + UploadMetadata { + /// The next 
[`UploadQueueInitialized::clean`] after this upload succeeds. + uploaded: Box, + }, /// Delete layer files Delete(Delete), - /// Barrier. When the barrier operation is reached, + /// Barrier. When the barrier operation is reached, the channel is closed. Barrier(tokio::sync::watch::Sender<()>), /// Shutdown; upon encountering this operation no new operations will be spawned, otherwise @@ -322,8 +305,12 @@ impl std::fmt::Display for UploadOp { layer, metadata.file_size, metadata.generation ) } - UploadOp::UploadMetadata(_, lsn) => { - write!(f, "UploadMetadata(lsn: {})", lsn) + UploadOp::UploadMetadata { uploaded, .. } => { + write!( + f, + "UploadMetadata(lsn: {})", + uploaded.metadata.disk_consistent_lsn() + ) } UploadOp::Delete(delete) => { write!(f, "Delete({} layers)", delete.layers.len()) diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index b68f3a0e89..04d9386fab 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -344,21 +344,21 @@ macro_rules! with_file { impl VirtualFile { /// Open a file in read-only mode. Like File::open. - pub async fn open( - path: &Utf8Path, + pub async fn open>( + path: P, ctx: &RequestContext, ) -> Result { - Self::open_with_options(path, OpenOptions::new().read(true), ctx).await + Self::open_with_options(path.as_ref(), OpenOptions::new().read(true), ctx).await } /// Create a new file for writing. If the file exists, it will be truncated. /// Like File::create. - pub async fn create( - path: &Utf8Path, + pub async fn create>( + path: P, ctx: &RequestContext, ) -> Result { Self::open_with_options( - path, + path.as_ref(), OpenOptions::new().write(true).create(true).truncate(true), ctx, ) @@ -370,12 +370,13 @@ impl VirtualFile { /// Note: If any custom flags were set in 'open_options' through OpenOptionsExt, /// they will be applied also when the file is subsequently re-opened, not only /// on the first time. Make sure that's sane! 
- pub async fn open_with_options( - path: &Utf8Path, + pub async fn open_with_options>( + path: P, open_options: &OpenOptions, _ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */ ) -> Result { - let path_str = path.to_string(); + let path_ref = path.as_ref(); + let path_str = path_ref.to_string(); let parts = path_str.split('/').collect::>(); let (tenant_id, shard_id, timeline_id) = if parts.len() > 5 && parts[parts.len() - 5] == TENANTS_SEGMENT_NAME { @@ -401,7 +402,7 @@ impl VirtualFile { // where our caller doesn't get to use the returned VirtualFile before its // slot gets re-used by someone else. let file = observe_duration!(StorageIoOperation::Open, { - open_options.open(path.as_std_path()).await? + open_options.open(path_ref.as_std_path()).await? }); // Strip all options other than read and write. @@ -417,7 +418,7 @@ impl VirtualFile { let vfile = VirtualFile { handle: RwLock::new(handle), pos: 0, - path: path.to_path_buf(), + path: path_ref.to_path_buf(), open_options: reopen_options, tenant_id, shard_id, diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 79f075b877..4f26f2f6d1 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -234,6 +234,7 @@ impl WalIngest { modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT, + decoded.origin_id, ctx, ) .await?; @@ -246,6 +247,7 @@ impl WalIngest { modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT_PREPARED, + decoded.origin_id, ctx, ) .await?; @@ -375,6 +377,18 @@ impl WalIngest { self.checkpoint.oldestActiveXid = xlrec.oldest_running_xid; } } + pg_constants::RM_REPLORIGIN_ID => { + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + if info == pg_constants::XLOG_REPLORIGIN_SET { + let xlrec = crate::walrecord::XlReploriginSet::decode(&mut buf); + modification + .set_replorigin(xlrec.node_id, xlrec.remote_lsn) + 
.await? + } else if info == pg_constants::XLOG_REPLORIGIN_DROP { + let xlrec = crate::walrecord::XlReploriginDrop::decode(&mut buf); + modification.drop_replorigin(xlrec.node_id).await? + } + } _x => { // TODO: should probably log & fail here instead of blindly // doing something without understanding the protocol @@ -1178,6 +1192,7 @@ impl WalIngest { modification: &mut DatadirModification<'_>, parsed: &XlXactParsedRecord, is_commit: bool, + origin_id: u16, ctx: &RequestContext, ) -> anyhow::Result<()> { // Record update of CLOG pages @@ -1243,6 +1258,11 @@ impl WalIngest { } } } + if origin_id != 0 { + modification + .set_replorigin(origin_id, parsed.origin_lsn) + .await?; + } Ok(()) } diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index 02f6f49694..205f8dee4d 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -9,10 +9,10 @@ use postgres_ffi::pg_constants; use postgres_ffi::BLCKSZ; use postgres_ffi::{BlockNumber, TimestampTz}; use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, TransactionId}; -use postgres_ffi::{XLogRecord, XLOG_SIZE_OF_XLOG_RECORD}; +use postgres_ffi::{RepOriginId, XLogRecord, XLOG_SIZE_OF_XLOG_RECORD}; use serde::{Deserialize, Serialize}; use tracing::*; -use utils::bin_ser::DeserializeError; +use utils::{bin_ser::DeserializeError, lsn::Lsn}; /// Each update to a page is represented by a NeonWalRecord. It can be a wrapper /// around a PostgreSQL WAL record, or a custom neon-specific "record". 
@@ -116,6 +116,7 @@ pub struct DecodedWALRecord { pub blocks: Vec, pub main_data_offset: usize, + pub origin_id: u16, } #[repr(C)] @@ -573,6 +574,7 @@ pub struct XlXactParsedRecord { pub subxacts: Vec, pub xnodes: Vec, + pub origin_lsn: Lsn, } impl XlXactParsedRecord { @@ -651,6 +653,11 @@ impl XlXactParsedRecord { debug!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE xid {}", xid); } + let origin_lsn = if xinfo & pg_constants::XACT_XINFO_HAS_ORIGIN != 0 { + Lsn(buf.get_u64_le()) + } else { + Lsn::INVALID + }; XlXactParsedRecord { xid, info, @@ -660,6 +667,7 @@ impl XlXactParsedRecord { ts_id, subxacts, xnodes, + origin_lsn, } } } @@ -810,6 +818,36 @@ impl XlRunningXacts { } } +#[repr(C)] +#[derive(Debug)] +pub struct XlReploriginDrop { + pub node_id: RepOriginId, +} + +impl XlReploriginDrop { + pub fn decode(buf: &mut Bytes) -> XlReploriginDrop { + XlReploriginDrop { + node_id: buf.get_u16_le(), + } + } +} + +#[repr(C)] +#[derive(Debug)] +pub struct XlReploriginSet { + pub remote_lsn: Lsn, + pub node_id: RepOriginId, +} + +impl XlReploriginSet { + pub fn decode(buf: &mut Bytes) -> XlReploriginSet { + XlReploriginSet { + remote_lsn: Lsn(buf.get_u64_le()), + node_id: buf.get_u16_le(), + } + } +} + /// Main routine to decode a WAL record and figure out which blocks are modified // // See xlogrecord.h for details @@ -844,6 +882,7 @@ pub fn decode_wal_record( let mut rnode_dbnode: u32 = 0; let mut rnode_relnode: u32 = 0; let mut got_rnode = false; + let mut origin_id: u16 = 0; let mut buf = record.clone(); @@ -891,7 +930,7 @@ pub fn decode_wal_record( pg_constants::XLR_BLOCK_ID_ORIGIN => { // RepOriginId is uint16 - buf.advance(2); + origin_id = buf.get_u16_le(); } pg_constants::XLR_BLOCK_ID_TOPLEVEL_XID => { @@ -1088,6 +1127,7 @@ pub fn decode_wal_record( decoded.xl_info = xlogrec.xl_info; decoded.xl_rmid = xlogrec.xl_rmid; decoded.record = record; + decoded.origin_id = origin_id; decoded.main_data_offset = main_data_offset; Ok(()) diff --git a/pageserver/src/walredo.rs 
b/pageserver/src/walredo.rs index 3decea0c6d..d562540bde 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -20,7 +20,6 @@ /// Process lifecycle and abstracction for the IPC protocol. mod process; -pub use process::Kind as ProcessKind; /// Code to apply [`NeonWalRecord`]s. pub(crate) mod apply_neon; @@ -34,7 +33,6 @@ use crate::repository::Key; use crate::walrecord::NeonWalRecord; use anyhow::Context; use bytes::{Bytes, BytesMut}; -use pageserver_api::key::key_to_rel_block; use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus}; use pageserver_api::shard::TenantShardId; use std::sync::Arc; @@ -55,7 +53,7 @@ pub struct PostgresRedoManager { tenant_shard_id: TenantShardId, conf: &'static PageServerConf, last_redo_at: std::sync::Mutex>, - /// The current [`process::Process`] that is used by new redo requests. + /// The current [`process::WalRedoProcess`] that is used by new redo requests. /// We use [`heavier_once_cell`] for coalescing the spawning, but the redo /// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the /// their process object; we use [`Arc::clone`] for that. @@ -67,7 +65,7 @@ pub struct PostgresRedoManager { /// still be using the old redo process. But, those other tasks will most likely /// encounter an error as well, and errors are an unexpected condition anyway. /// So, probably we could get rid of the `Arc` in the future. 
- redo_process: heavier_once_cell::OnceCell>, + redo_process: heavier_once_cell::OnceCell>, } /// @@ -208,30 +206,35 @@ impl PostgresRedoManager { ) -> anyhow::Result { *(self.last_redo_at.lock().unwrap()) = Some(Instant::now()); - let (rel, blknum) = key_to_rel_block(key).context("invalid record")?; + let (rel, blknum) = key.to_rel_block().context("invalid record")?; const MAX_RETRY_ATTEMPTS: u32 = 1; let mut n_attempts = 0u32; loop { - let proc: Arc = match self.redo_process.get_or_init_detached().await { - Ok(guard) => Arc::clone(&guard), - Err(permit) => { - // don't hold poison_guard, the launch code can bail - let start = Instant::now(); - let proc = Arc::new( - process::Process::launch(self.conf, self.tenant_shard_id, pg_version) + let proc: Arc = + match self.redo_process.get_or_init_detached().await { + Ok(guard) => Arc::clone(&guard), + Err(permit) => { + // don't hold poison_guard, the launch code can bail + let start = Instant::now(); + let proc = Arc::new( + process::WalRedoProcess::launch( + self.conf, + self.tenant_shard_id, + pg_version, + ) .context("launch walredo process")?, - ); - let duration = start.elapsed(); - WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); - info!( - duration_ms = duration.as_millis(), - pid = proc.id(), - "launched walredo process" - ); - self.redo_process.set(Arc::clone(&proc), permit); - proc - } - }; + ); + let duration = start.elapsed(); + WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); + info!( + duration_ms = duration.as_millis(), + pid = proc.id(), + "launched walredo process" + ); + self.redo_process.set(Arc::clone(&proc), permit); + proc + } + }; let started_at = std::time::Instant::now(); @@ -362,10 +365,10 @@ impl PostgresRedoManager { &self, key: Key, page: &mut BytesMut, - _record_lsn: Lsn, + record_lsn: Lsn, record: &NeonWalRecord, ) -> anyhow::Result<()> { - apply_neon::apply_in_neon(record, key, page)?; + apply_neon::apply_in_neon(record, record_lsn, 
key, page)?; Ok(()) } diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs index 247704e2a5..24e8d8b01c 100644 --- a/pageserver/src/walredo/apply_neon.rs +++ b/pageserver/src/walredo/apply_neon.rs @@ -3,7 +3,7 @@ use crate::walrecord::NeonWalRecord; use anyhow::Context; use byteorder::{ByteOrder, LittleEndian}; use bytes::{BufMut, BytesMut}; -use pageserver_api::key::{key_to_rel_block, key_to_slru_block, Key}; +use pageserver_api::key::Key; use pageserver_api::reltag::SlruKind; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; @@ -14,6 +14,7 @@ use postgres_ffi::v14::nonrelfile_utils::{ use postgres_ffi::BLCKSZ; use tracing::*; use utils::bin_ser::BeSer; +use utils::lsn::Lsn; /// Can this request be served by neon redo functions /// or we need to pass it to wal-redo postgres process? @@ -32,6 +33,7 @@ pub(crate) fn can_apply_in_neon(rec: &NeonWalRecord) -> bool { pub(crate) fn apply_in_neon( record: &NeonWalRecord, + lsn: Lsn, key: Key, page: &mut BytesMut, ) -> Result<(), anyhow::Error> { @@ -48,7 +50,7 @@ pub(crate) fn apply_in_neon( flags, } => { // sanity check that this is modifying the correct relation - let (rel, blknum) = key_to_rel_block(key).context("invalid record")?; + let (rel, blknum) = key.to_rel_block().context("invalid record")?; assert!( rel.forknum == VISIBILITYMAP_FORKNUM, "ClearVisibilityMapFlags record on unexpected rel {}", @@ -67,6 +69,7 @@ pub(crate) fn apply_in_neon( let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; map[map_byte as usize] &= !(flags << map_offset); + postgres_ffi::page_set_lsn(page, lsn); } // Repeat for 'old_heap_blkno', if any @@ -80,12 +83,13 @@ pub(crate) fn apply_in_neon( let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; map[map_byte as usize] &= !(flags << map_offset); + postgres_ffi::page_set_lsn(page, lsn); } } // Non-relational WAL records are handled here, with custom code that has the // same 
effects as the corresponding Postgres WAL redo function. NeonWalRecord::ClogSetCommitted { xids, timestamp } => { - let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?; + let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?; assert_eq!( slru_kind, SlruKind::Clog, @@ -130,7 +134,7 @@ pub(crate) fn apply_in_neon( } } NeonWalRecord::ClogSetAborted { xids } => { - let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?; + let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?; assert_eq!( slru_kind, SlruKind::Clog, @@ -160,7 +164,7 @@ pub(crate) fn apply_in_neon( } } NeonWalRecord::MultixactOffsetCreate { mid, moff } => { - let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?; + let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?; assert_eq!( slru_kind, SlruKind::MultiXactOffsets, @@ -192,7 +196,7 @@ pub(crate) fn apply_in_neon( LittleEndian::write_u32(&mut page[offset..offset + 4], *moff); } NeonWalRecord::MultixactMembersCreate { moff, members } => { - let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?; + let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?; assert_eq!( slru_kind, SlruKind::MultiXactMembers, @@ -285,7 +289,7 @@ mod test { let mut page = BytesMut::from_iter(base_image); for record in deltas { - apply_in_neon(&record, file_path, &mut page)?; + apply_in_neon(&record, Lsn(8), file_path, &mut page)?; } let reconstructed = AuxFilesDirectory::des(&page)?; diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs index 02c9c04bf1..5b0af334ee 100644 --- a/pageserver/src/walredo/process.rs +++ b/pageserver/src/walredo/process.rs @@ -1,64 +1,184 @@ -/// Layer of indirection previously used to support multiple implementations. 
-/// Subject to removal: -use std::time::Duration; - -use bytes::Bytes; -use pageserver_api::{reltag::RelTag, shard::TenantShardId}; -use tracing::warn; -use utils::lsn::Lsn; - -use crate::{config::PageServerConf, walrecord::NeonWalRecord}; - mod no_leak_child; /// The IPC protocol that pageserver and walredo process speak over their shared pipe. mod protocol; -mod process_impl { - pub(super) mod process_async; +use self::no_leak_child::NoLeakChild; +use crate::{ + config::PageServerConf, + metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, + walrecord::NeonWalRecord, +}; +use anyhow::Context; +use bytes::Bytes; +use pageserver_api::{reltag::RelTag, shard::TenantShardId}; +use postgres_ffi::BLCKSZ; +#[cfg(feature = "testing")] +use std::sync::atomic::AtomicUsize; +use std::{ + collections::VecDeque, + process::{Command, Stdio}, + time::Duration, +}; +use tokio::io::{AsyncReadExt, AsyncWriteExt}; +use tracing::{debug, error, instrument, Instrument}; +use utils::{lsn::Lsn, poison::Poison}; + +pub struct WalRedoProcess { + #[allow(dead_code)] + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + // Some() on construction, only becomes None on Drop. + child: Option, + stdout: tokio::sync::Mutex>, + stdin: tokio::sync::Mutex>, + /// Counter to separate same sized walredo inputs failing at the same millisecond. 
+ #[cfg(feature = "testing")] + dump_sequence: AtomicUsize, } -#[derive( - Clone, - Copy, - Debug, - PartialEq, - Eq, - strum_macros::EnumString, - strum_macros::Display, - strum_macros::IntoStaticStr, - serde_with::DeserializeFromStr, - serde_with::SerializeDisplay, -)] -#[strum(serialize_all = "kebab-case")] -#[repr(u8)] -pub enum Kind { - Sync, - Async, +struct ProcessInput { + stdin: tokio::process::ChildStdin, + n_requests: usize, } -pub(crate) struct Process(process_impl::process_async::WalRedoProcess); +struct ProcessOutput { + stdout: tokio::process::ChildStdout, + pending_responses: VecDeque>, + n_processed_responses: usize, +} -impl Process { - #[inline(always)] - pub fn launch( +impl WalRedoProcess { + // + // Start postgres binary in special WAL redo mode. + // + #[instrument(skip_all,fields(pg_version=pg_version))] + pub(crate) fn launch( conf: &'static PageServerConf, tenant_shard_id: TenantShardId, pg_version: u32, ) -> anyhow::Result { - if conf.walredo_process_kind != Kind::Async { - warn!( - configured = %conf.walredo_process_kind, - "the walredo_process_kind setting has been turned into a no-op, using async implementation" - ); - } - Ok(Self(process_impl::process_async::WalRedoProcess::launch( + crate::span::debug_assert_current_span_has_tenant_id(); + + let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. 
+ let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; + + use no_leak_child::NoLeakChildCommandExt; + // Start postgres itself + let child = Command::new(pg_bin_dir_path.join("postgres")) + // the first arg must be --wal-redo so the child process enters into walredo mode + .arg("--wal-redo") + // the child doesn't process this arg, but, having it in the argv helps indentify the + // walredo process for a particular tenant when debugging a pagserver + .args(["--tenant-shard-id", &format!("{tenant_shard_id}")]) + .stdin(Stdio::piped()) + .stderr(Stdio::piped()) + .stdout(Stdio::piped()) + .env_clear() + .env("LD_LIBRARY_PATH", &pg_lib_dir_path) + .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) + // NB: The redo process is not trusted after we sent it the first + // walredo work. Before that, it is trusted. Specifically, we trust + // it to + // 1. close all file descriptors except stdin, stdout, stderr because + // pageserver might not be 100% diligent in setting FD_CLOEXEC on all + // the files it opens, and + // 2. to use seccomp to sandbox itself before processing the first + // walredo request. 
+ .spawn_no_leak_child(tenant_shard_id) + .context("spawn process")?; + WAL_REDO_PROCESS_COUNTERS.started.inc(); + let mut child = scopeguard::guard(child, |child| { + error!("killing wal-redo-postgres process due to a problem during launch"); + child.kill_and_wait(WalRedoKillCause::Startup); + }); + + let stdin = child.stdin.take().unwrap(); + let stdout = child.stdout.take().unwrap(); + let stderr = child.stderr.take().unwrap(); + let stderr = tokio::process::ChildStderr::from_std(stderr) + .context("convert to tokio::ChildStderr")?; + let stdin = + tokio::process::ChildStdin::from_std(stdin).context("convert to tokio::ChildStdin")?; + let stdout = tokio::process::ChildStdout::from_std(stdout) + .context("convert to tokio::ChildStdout")?; + + // all fallible operations post-spawn are complete, so get rid of the guard + let child = scopeguard::ScopeGuard::into_inner(child); + + tokio::spawn( + async move { + scopeguard::defer! { + debug!("wal-redo-postgres stderr_logger_task finished"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); + } + debug!("wal-redo-postgres stderr_logger_task started"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); + + use tokio::io::AsyncBufReadExt; + let mut stderr_lines = tokio::io::BufReader::new(stderr); + let mut buf = Vec::new(); + let res = loop { + buf.clear(); + // TODO we don't trust the process to cap its stderr length. + // Currently it can do unbounded Vec allocation. 
+ match stderr_lines.read_until(b'\n', &mut buf).await { + Ok(0) => break Ok(()), // eof + Ok(num_bytes) => { + let output = String::from_utf8_lossy(&buf[..num_bytes]); + error!(%output, "received output"); + } + Err(e) => { + break Err(e); + } + } + }; + match res { + Ok(()) => (), + Err(e) => { + error!(error=?e, "failed to read from walredo stderr"); + } + } + }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) + ); + + Ok(Self { conf, tenant_shard_id, - pg_version, - )?)) + child: Some(child), + stdin: tokio::sync::Mutex::new(Poison::new( + "stdin", + ProcessInput { + stdin, + n_requests: 0, + }, + )), + stdout: tokio::sync::Mutex::new(Poison::new( + "stdout", + ProcessOutput { + stdout, + pending_responses: VecDeque::new(), + n_processed_responses: 0, + }, + )), + #[cfg(feature = "testing")] + dump_sequence: AtomicUsize::default(), + }) } - #[inline(always)] + pub(crate) fn id(&self) -> u32 { + self.child + .as_ref() + .expect("must not call this during Drop") + .id() + } + + /// Apply given WAL records ('records') over an old page image. Returns + /// new page image. + /// + /// # Cancel-Safety + /// + /// Cancellation safe. + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] pub(crate) async fn apply_wal_records( &self, rel: RelTag, @@ -67,12 +187,191 @@ impl Process { records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, ) -> anyhow::Result { - self.0 - .apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout) - .await + let tag = protocol::BufferTag { rel, blknum }; + + // Serialize all the messages to send the WAL redo process first. 
+ // + // This could be problematic if there are millions of records to replay, + // but in practice the number of records is usually so small that it doesn't + // matter, and it's better to keep this code simple. + // + // Most requests start with a before-image with BLCKSZ bytes, followed by + // by some other WAL records. Start with a buffer that can hold that + // comfortably. + let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); + protocol::build_begin_redo_for_block_msg(tag, &mut writebuf); + if let Some(img) = base_img { + protocol::build_push_page_msg(tag, img, &mut writebuf); + } + for (lsn, rec) in records.iter() { + if let NeonWalRecord::Postgres { + will_init: _, + rec: postgres_rec, + } = rec + { + protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); + } else { + anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); + } + } + protocol::build_get_page_msg(tag, &mut writebuf); + WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); + + let Ok(res) = + tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await + else { + anyhow::bail!("WAL redo timed out"); + }; + + if res.is_err() { + // not all of these can be caused by this particular input, however these are so rare + // in tests so capture all. + self.record_and_log(&writebuf); + } + + res } - pub(crate) fn id(&self) -> u32 { - self.0.id() + /// # Cancel-Safety + /// + /// When not polled to completion (e.g. because in `tokio::select!` another + /// branch becomes ready before this future), concurrent and subsequent + /// calls may fail due to [`utils::poison::Poison::check_and_arm`] calls. + /// Dispose of this process instance and create a new one. 
+ async fn apply_wal_records0(&self, writebuf: &[u8]) -> anyhow::Result { + let request_no = { + let mut lock_guard = self.stdin.lock().await; + let mut poison_guard = lock_guard.check_and_arm()?; + let input = poison_guard.data_mut(); + input + .stdin + .write_all(writebuf) + .await + .context("write to walredo stdin")?; + let request_no = input.n_requests; + input.n_requests += 1; + poison_guard.disarm(); + request_no + }; + + // To improve walredo performance we separate sending requests and receiving + // responses. Them are protected by different mutexes (output and input). + // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process + // then there is not warranty that T1 will first granted output mutex lock. + // To address this issue we maintain number of sent requests, number of processed + // responses and ring buffer with pending responses. After sending response + // (under input mutex), threads remembers request number. Then it releases + // input mutex, locks output mutex and fetch in ring buffer all responses until + // its stored request number. The it takes correspondent element from + // pending responses ring buffer and truncate all empty elements from the front, + // advancing processed responses number. + + let mut lock_guard = self.stdout.lock().await; + let mut poison_guard = lock_guard.check_and_arm()?; + let output = poison_guard.data_mut(); + let n_processed_responses = output.n_processed_responses; + while n_processed_responses + output.pending_responses.len() <= request_no { + // We expect the WAL redo process to respond with an 8k page image. We read it + // into this buffer. + let mut resultbuf = vec![0; BLCKSZ.into()]; + output + .stdout + .read_exact(&mut resultbuf) + .await + .context("read walredo stdout")?; + output + .pending_responses + .push_back(Some(Bytes::from(resultbuf))); + } + // Replace our request's response with None in `pending_responses`. 
+ // Then make space in the ring buffer by clearing out any seqence of contiguous + // `None`'s from the front of `pending_responses`. + // NB: We can't pop_front() because other requests' responses because another + // requester might have grabbed the output mutex before us: + // T1: grab input mutex + // T1: send request_no 23 + // T1: release input mutex + // T2: grab input mutex + // T2: send request_no 24 + // T2: release input mutex + // T2: grab output mutex + // T2: n_processed_responses + output.pending_responses.len() <= request_no + // 23 0 24 + // T2: enters poll loop that reads stdout + // T2: put response for 23 into pending_responses + // T2: put response for 24 into pending_resposnes + // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back + // T2: takes its response_24 + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: releases output mutex + // T1: grabs output mutex + // T1: n_processed_responses + output.pending_responses.len() > request_no + // 23 2 23 + // T1: skips poll loop that reads stdout + // T1: takes its response_23 + // pending_responses now looks like this: Front None None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Back + // n_processed_responses now has value 25 + let res = output.pending_responses[request_no - n_processed_responses] + .take() + .expect("we own this request_no, nobody else is supposed to take it"); + while let Some(front) = output.pending_responses.front() { + if front.is_none() { + output.pending_responses.pop_front(); + output.n_processed_responses += 1; + } else { + break; + } + } + poison_guard.disarm(); + Ok(res) + } + + #[cfg(feature = "testing")] + fn record_and_log(&self, writebuf: &[u8]) { + use std::sync::atomic::Ordering; + + let millis = std::time::SystemTime::now() + 
.duration_since(std::time::SystemTime::UNIX_EPOCH) + .unwrap() + .as_millis(); + + let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); + + // these files will be collected to an allure report + let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); + + let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); + + use std::io::Write; + let res = std::fs::OpenOptions::new() + .write(true) + .create_new(true) + .read(true) + .open(path) + .and_then(|mut f| f.write_all(writebuf)); + + // trip up allowed_errors + if let Err(e) = res { + tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); + } else { + tracing::error!(filename, "erroring walredo input saved"); + } + } + + #[cfg(not(feature = "testing"))] + fn record_and_log(&self, _: &[u8]) {} +} + +impl Drop for WalRedoProcess { + fn drop(&mut self) { + self.child + .take() + .expect("we only do this once") + .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); + // no way to wait for stderr_logger_task from Drop because that is async only } } diff --git a/pageserver/src/walredo/process/process_impl/process_async.rs b/pageserver/src/walredo/process/process_impl/process_async.rs deleted file mode 100644 index 262858b033..0000000000 --- a/pageserver/src/walredo/process/process_impl/process_async.rs +++ /dev/null @@ -1,374 +0,0 @@ -use self::no_leak_child::NoLeakChild; -use crate::{ - config::PageServerConf, - metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, - walrecord::NeonWalRecord, - walredo::process::{no_leak_child, protocol}, -}; -use anyhow::Context; -use bytes::Bytes; -use pageserver_api::{reltag::RelTag, shard::TenantShardId}; -use postgres_ffi::BLCKSZ; -#[cfg(feature = "testing")] -use std::sync::atomic::AtomicUsize; -use std::{ - collections::VecDeque, - process::{Command, Stdio}, - time::Duration, -}; -use tokio::io::{AsyncReadExt, AsyncWriteExt}; -use tracing::{debug, 
error, instrument, Instrument}; -use utils::{lsn::Lsn, poison::Poison}; - -pub struct WalRedoProcess { - #[allow(dead_code)] - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - // Some() on construction, only becomes None on Drop. - child: Option, - stdout: tokio::sync::Mutex>, - stdin: tokio::sync::Mutex>, - /// Counter to separate same sized walredo inputs failing at the same millisecond. - #[cfg(feature = "testing")] - dump_sequence: AtomicUsize, -} - -struct ProcessInput { - stdin: tokio::process::ChildStdin, - n_requests: usize, -} - -struct ProcessOutput { - stdout: tokio::process::ChildStdout, - pending_responses: VecDeque>, - n_processed_responses: usize, -} - -impl WalRedoProcess { - // - // Start postgres binary in special WAL redo mode. - // - #[instrument(skip_all,fields(pg_version=pg_version))] - pub(crate) fn launch( - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - pg_version: u32, - ) -> anyhow::Result { - crate::span::debug_assert_current_span_has_tenant_id(); - - let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. - let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; - - use no_leak_child::NoLeakChildCommandExt; - // Start postgres itself - let child = Command::new(pg_bin_dir_path.join("postgres")) - // the first arg must be --wal-redo so the child process enters into walredo mode - .arg("--wal-redo") - // the child doesn't process this arg, but, having it in the argv helps indentify the - // walredo process for a particular tenant when debugging a pagserver - .args(["--tenant-shard-id", &format!("{tenant_shard_id}")]) - .stdin(Stdio::piped()) - .stderr(Stdio::piped()) - .stdout(Stdio::piped()) - .env_clear() - .env("LD_LIBRARY_PATH", &pg_lib_dir_path) - .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) - // NB: The redo process is not trusted after we sent it the first - // walredo work. Before that, it is trusted. 
Specifically, we trust - // it to - // 1. close all file descriptors except stdin, stdout, stderr because - // pageserver might not be 100% diligent in setting FD_CLOEXEC on all - // the files it opens, and - // 2. to use seccomp to sandbox itself before processing the first - // walredo request. - .spawn_no_leak_child(tenant_shard_id) - .context("spawn process")?; - WAL_REDO_PROCESS_COUNTERS.started.inc(); - let mut child = scopeguard::guard(child, |child| { - error!("killing wal-redo-postgres process due to a problem during launch"); - child.kill_and_wait(WalRedoKillCause::Startup); - }); - - let stdin = child.stdin.take().unwrap(); - let stdout = child.stdout.take().unwrap(); - let stderr = child.stderr.take().unwrap(); - let stderr = tokio::process::ChildStderr::from_std(stderr) - .context("convert to tokio::ChildStderr")?; - let stdin = - tokio::process::ChildStdin::from_std(stdin).context("convert to tokio::ChildStdin")?; - let stdout = tokio::process::ChildStdout::from_std(stdout) - .context("convert to tokio::ChildStdout")?; - - // all fallible operations post-spawn are complete, so get rid of the guard - let child = scopeguard::ScopeGuard::into_inner(child); - - tokio::spawn( - async move { - scopeguard::defer! { - debug!("wal-redo-postgres stderr_logger_task finished"); - crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); - } - debug!("wal-redo-postgres stderr_logger_task started"); - crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); - - use tokio::io::AsyncBufReadExt; - let mut stderr_lines = tokio::io::BufReader::new(stderr); - let mut buf = Vec::new(); - let res = loop { - buf.clear(); - // TODO we don't trust the process to cap its stderr length. - // Currently it can do unbounded Vec allocation. 
- match stderr_lines.read_until(b'\n', &mut buf).await { - Ok(0) => break Ok(()), // eof - Ok(num_bytes) => { - let output = String::from_utf8_lossy(&buf[..num_bytes]); - error!(%output, "received output"); - } - Err(e) => { - break Err(e); - } - } - }; - match res { - Ok(()) => (), - Err(e) => { - error!(error=?e, "failed to read from walredo stderr"); - } - } - }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) - ); - - Ok(Self { - conf, - tenant_shard_id, - child: Some(child), - stdin: tokio::sync::Mutex::new(Poison::new( - "stdin", - ProcessInput { - stdin, - n_requests: 0, - }, - )), - stdout: tokio::sync::Mutex::new(Poison::new( - "stdout", - ProcessOutput { - stdout, - pending_responses: VecDeque::new(), - n_processed_responses: 0, - }, - )), - #[cfg(feature = "testing")] - dump_sequence: AtomicUsize::default(), - }) - } - - pub(crate) fn id(&self) -> u32 { - self.child - .as_ref() - .expect("must not call this during Drop") - .id() - } - - /// Apply given WAL records ('records') over an old page image. Returns - /// new page image. - /// - /// # Cancel-Safety - /// - /// Cancellation safe. - #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] - pub(crate) async fn apply_wal_records( - &self, - rel: RelTag, - blknum: u32, - base_img: &Option, - records: &[(Lsn, NeonWalRecord)], - wal_redo_timeout: Duration, - ) -> anyhow::Result { - let tag = protocol::BufferTag { rel, blknum }; - - // Serialize all the messages to send the WAL redo process first. - // - // This could be problematic if there are millions of records to replay, - // but in practice the number of records is usually so small that it doesn't - // matter, and it's better to keep this code simple. 
- // - // Most requests start with a before-image with BLCKSZ bytes, followed by - // by some other WAL records. Start with a buffer that can hold that - // comfortably. - let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); - protocol::build_begin_redo_for_block_msg(tag, &mut writebuf); - if let Some(img) = base_img { - protocol::build_push_page_msg(tag, img, &mut writebuf); - } - for (lsn, rec) in records.iter() { - if let NeonWalRecord::Postgres { - will_init: _, - rec: postgres_rec, - } = rec - { - protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); - } else { - anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); - } - } - protocol::build_get_page_msg(tag, &mut writebuf); - WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); - - let Ok(res) = - tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await - else { - anyhow::bail!("WAL redo timed out"); - }; - - if res.is_err() { - // not all of these can be caused by this particular input, however these are so rare - // in tests so capture all. - self.record_and_log(&writebuf); - } - - res - } - - /// # Cancel-Safety - /// - /// When not polled to completion (e.g. because in `tokio::select!` another - /// branch becomes ready before this future), concurrent and subsequent - /// calls may fail due to [`utils::poison::Poison::check_and_arm`] calls. - /// Dispose of this process instance and create a new one. - async fn apply_wal_records0(&self, writebuf: &[u8]) -> anyhow::Result { - let request_no = { - let mut lock_guard = self.stdin.lock().await; - let mut poison_guard = lock_guard.check_and_arm()?; - let input = poison_guard.data_mut(); - input - .stdin - .write_all(writebuf) - .await - .context("write to walredo stdin")?; - let request_no = input.n_requests; - input.n_requests += 1; - poison_guard.disarm(); - request_no - }; - - // To improve walredo performance we separate sending requests and receiving - // responses. 
Them are protected by different mutexes (output and input). - // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process - // then there is not warranty that T1 will first granted output mutex lock. - // To address this issue we maintain number of sent requests, number of processed - // responses and ring buffer with pending responses. After sending response - // (under input mutex), threads remembers request number. Then it releases - // input mutex, locks output mutex and fetch in ring buffer all responses until - // its stored request number. The it takes correspondent element from - // pending responses ring buffer and truncate all empty elements from the front, - // advancing processed responses number. - - let mut lock_guard = self.stdout.lock().await; - let mut poison_guard = lock_guard.check_and_arm()?; - let output = poison_guard.data_mut(); - let n_processed_responses = output.n_processed_responses; - while n_processed_responses + output.pending_responses.len() <= request_no { - // We expect the WAL redo process to respond with an 8k page image. We read it - // into this buffer. - let mut resultbuf = vec![0; BLCKSZ.into()]; - output - .stdout - .read_exact(&mut resultbuf) - .await - .context("read walredo stdout")?; - output - .pending_responses - .push_back(Some(Bytes::from(resultbuf))); - } - // Replace our request's response with None in `pending_responses`. - // Then make space in the ring buffer by clearing out any seqence of contiguous - // `None`'s from the front of `pending_responses`. 
- // NB: We can't pop_front() because other requests' responses because another - // requester might have grabbed the output mutex before us: - // T1: grab input mutex - // T1: send request_no 23 - // T1: release input mutex - // T2: grab input mutex - // T2: send request_no 24 - // T2: release input mutex - // T2: grab output mutex - // T2: n_processed_responses + output.pending_responses.len() <= request_no - // 23 0 24 - // T2: enters poll loop that reads stdout - // T2: put response for 23 into pending_responses - // T2: put response for 24 into pending_resposnes - // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back - // T2: takes its response_24 - // pending_responses now looks like this: Front Some(response_23) None Back - // T2: does the while loop below - // pending_responses now looks like this: Front Some(response_23) None Back - // T2: releases output mutex - // T1: grabs output mutex - // T1: n_processed_responses + output.pending_responses.len() > request_no - // 23 2 23 - // T1: skips poll loop that reads stdout - // T1: takes its response_23 - // pending_responses now looks like this: Front None None Back - // T2: does the while loop below - // pending_responses now looks like this: Front Back - // n_processed_responses now has value 25 - let res = output.pending_responses[request_no - n_processed_responses] - .take() - .expect("we own this request_no, nobody else is supposed to take it"); - while let Some(front) = output.pending_responses.front() { - if front.is_none() { - output.pending_responses.pop_front(); - output.n_processed_responses += 1; - } else { - break; - } - } - poison_guard.disarm(); - Ok(res) - } - - #[cfg(feature = "testing")] - fn record_and_log(&self, writebuf: &[u8]) { - use std::sync::atomic::Ordering; - - let millis = std::time::SystemTime::now() - .duration_since(std::time::SystemTime::UNIX_EPOCH) - .unwrap() - .as_millis(); - - let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); 
- - // these files will be collected to an allure report - let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); - - let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); - - use std::io::Write; - let res = std::fs::OpenOptions::new() - .write(true) - .create_new(true) - .read(true) - .open(path) - .and_then(|mut f| f.write_all(writebuf)); - - // trip up allowed_errors - if let Err(e) = res { - tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); - } else { - tracing::error!(filename, "erroring walredo input saved"); - } - } - - #[cfg(not(feature = "testing"))] - fn record_and_log(&self, _: &[u8]) {} -} - -impl Drop for WalRedoProcess { - fn drop(&mut self) { - self.child - .take() - .expect("we only do this once") - .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); - // no way to wait for stderr_logger_task from Drop because that is async only - } -} diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c index e43f4d9d96..60eb8e1fc9 100644 --- a/pgxn/neon/neon_walreader.c +++ b/pgxn/neon/neon_walreader.c @@ -184,8 +184,8 @@ NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, Ti } else if (state->wre_errno == ENOENT) { - nwr_log(LOG, "local read failed as segment at %X/%X doesn't exist, attempting remote", - LSN_FORMAT_ARGS(startptr)); + nwr_log(LOG, "local read at %X/%X len %zu failed as segment file doesn't exist, attempting remote", + LSN_FORMAT_ARGS(startptr), count); return NeonWALReadRemote(state, buf, startptr, count, tli); } else @@ -614,6 +614,7 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun uint32 startoff; int segbytes; int readbytes; + XLogSegNo lastRemovedSegNo; startoff = XLogSegmentOffset(recptr, state->segcxt.ws_segsize); @@ -689,6 +690,23 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun return false; } + /* + * Recheck that the segment 
hasn't been removed while we were reading + * it. + */ + lastRemovedSegNo = XLogGetLastRemovedSegno(); + if (state->seg.ws_segno <= lastRemovedSegNo) + { + char fname[MAXFNAMELEN]; + + state->wre_errno = ENOENT; + + XLogFileName(fname, tli, state->seg.ws_segno, state->segcxt.ws_segsize); + snprintf(state->err_msg, sizeof(state->err_msg), "WAL segment %s has been removed during the read, lastRemovedSegNo " UINT64_FORMAT, + fname, lastRemovedSegNo); + return false; + } + /* Update state for read */ recptr += readbytes; nbytes -= readbytes; diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 4d074f98a5..634ec9042c 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -452,7 +452,7 @@ pub struct ApiLocks { #[derive(Debug, thiserror::Error)] pub enum ApiLockError { - #[error("permit could not be acquired")] + #[error("timeout acquiring resource permit")] TimeoutError(#[from] tokio::time::error::Elapsed), } @@ -504,7 +504,7 @@ impl ApiLocks { .clone() } }; - let permit = semaphore.acquire_deadline(now + self.timeout).await; + let permit = semaphore.acquire_timeout(self.timeout).await; self.metrics .semaphore_acquire_seconds diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs index 072fdb80b0..3842ce269e 100644 --- a/proxy/src/rate_limiter/limit_algorithm.rs +++ b/proxy/src/rate_limiter/limit_algorithm.rs @@ -3,7 +3,7 @@ use parking_lot::Mutex; use std::{pin::pin, sync::Arc, time::Duration}; use tokio::{ sync::Notify, - time::{error::Elapsed, timeout_at, Instant}, + time::{error::Elapsed, Instant}, }; use self::aimd::Aimd; @@ -80,7 +80,7 @@ pub struct LimiterInner { } impl LimiterInner { - fn update(&mut self, latency: Duration, outcome: Option) { + fn update_limit(&mut self, latency: Duration, outcome: Option) { if let Some(outcome) = outcome { let sample = Sample { latency, @@ -92,12 +92,12 @@ impl LimiterInner { } fn take(&mut self, ready: &Notify) -> Option<()> { - 
if self.available > 1 { + if self.available >= 1 { self.available -= 1; self.in_flight += 1; // tell the next in the queue that there is a permit ready - if self.available > 1 { + if self.available >= 1 { ready.notify_one(); } Some(()) @@ -157,16 +157,12 @@ impl DynamicLimiter { } /// Try to acquire a concurrency [Token], waiting for `duration` if there are none available. - /// - /// Returns `None` if there are none available after `duration`. pub async fn acquire_timeout(self: &Arc, duration: Duration) -> Result { - self.acquire_deadline(Instant::now() + duration).await + tokio::time::timeout(duration, self.acquire()).await? } - /// Try to acquire a concurrency [Token], waiting until `deadline` if there are none available. - /// - /// Returns `None` if there are none available after `deadline`. - pub async fn acquire_deadline(self: &Arc, deadline: Instant) -> Result { + /// Try to acquire a concurrency [Token]. + async fn acquire(self: &Arc) -> Result { if self.config.initial_limit == 0 { // If the rate limiter is disabled, we can always acquire a token. 
Ok(Token::disabled()) @@ -174,22 +170,16 @@ impl DynamicLimiter { let mut notified = pin!(self.ready.notified()); let mut ready = notified.as_mut().enable(); loop { - let mut limit = None; if ready { let mut inner = self.inner.lock(); if inner.take(&self.ready).is_some() { break Ok(Token::new(self.clone())); - } - limit = Some(inner.limit); - } - match timeout_at(deadline, notified.as_mut()).await { - Ok(()) => ready = true, - Err(e) => { - let limit = limit.unwrap_or_else(|| self.inner.lock().limit); - tracing::info!(limit, "could not acquire token in time"); - break Err(e); + } else { + notified.set(self.ready.notified()); } } + notified.as_mut().await; + ready = true; } } } @@ -208,14 +198,14 @@ impl DynamicLimiter { let mut inner = self.inner.lock(); - inner.update(start.elapsed(), outcome); + inner.update_limit(start.elapsed(), outcome); + + inner.in_flight -= 1; if inner.in_flight < inner.limit { inner.available = inner.limit - inner.in_flight; // At least 1 permit is now available self.ready.notify_one(); } - - inner.in_flight -= 1; } /// The current state of the limiter. diff --git a/proxy/src/rate_limiter/limit_algorithm/aimd.rs b/proxy/src/rate_limiter/limit_algorithm/aimd.rs index 370d4be802..ccc9c42420 100644 --- a/proxy/src/rate_limiter/limit_algorithm/aimd.rs +++ b/proxy/src/rate_limiter/limit_algorithm/aimd.rs @@ -51,7 +51,9 @@ impl LimitAlgorithm for Aimd { // E.g. 
round(2 * 0.9) = 2, but floor(2 * 0.9) = 1 let limit = limit.floor() as usize; - limit.clamp(self.min, self.max) + let limit = limit.clamp(self.min, self.max); + tracing::info!(limit, "limit decreased"); + limit } } } @@ -67,6 +69,53 @@ mod tests { use super::*; + #[tokio::test(start_paused = true)] + async fn increase_decrease() { + let config = RateLimiterConfig { + initial_limit: 1, + algorithm: RateLimitAlgorithm::Aimd { + conf: Aimd { + min: 1, + max: 2, + inc: 10, + dec: 0.5, + utilisation: 0.8, + }, + }, + }; + + let limiter = DynamicLimiter::new(config); + + let token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + token.release(Outcome::Success); + + assert_eq!(limiter.state().limit(), 2); + + let token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + token.release(Outcome::Success); + assert_eq!(limiter.state().limit(), 2); + + let token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + token.release(Outcome::Overload); + assert_eq!(limiter.state().limit(), 1); + + let token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + token.release(Outcome::Overload); + assert_eq!(limiter.state().limit(), 1); + } + #[tokio::test(start_paused = true)] async fn should_decrease_limit_on_overload() { let config = RateLimiterConfig { @@ -85,7 +134,7 @@ mod tests { let limiter = DynamicLimiter::new(config); let token = limiter - .acquire_timeout(Duration::from_millis(1)) + .acquire_timeout(Duration::from_millis(100)) .await .unwrap(); token.release(Outcome::Overload); @@ -93,6 +142,41 @@ mod tests { assert_eq!(limiter.state().limit(), 5, "overload: decrease"); } + #[tokio::test(start_paused = true)] + async fn acquire_timeout_times_out() { + let config = RateLimiterConfig { + initial_limit: 1, + algorithm: RateLimitAlgorithm::Aimd { + conf: Aimd { + min: 1, + max: 2, + inc: 10, + dec: 0.5, + utilisation: 0.8, + }, + }, + }; + + let limiter = 
DynamicLimiter::new(config); + + let token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + let now = tokio::time::Instant::now(); + limiter + .acquire_timeout(Duration::from_secs(1)) + .await + .err() + .unwrap(); + + assert!(now.elapsed() >= Duration::from_secs(1)); + + token.release(Outcome::Success); + + assert_eq!(limiter.state().limit(), 2); + } + #[tokio::test(start_paused = true)] async fn should_increase_limit_on_success_when_using_gt_util_threshold() { let config = RateLimiterConfig { diff --git a/s3_scrubber/Cargo.toml b/s3_scrubber/Cargo.toml index e56bd43fb8..48b50ca21c 100644 --- a/s3_scrubber/Cargo.toml +++ b/s3_scrubber/Cargo.toml @@ -11,6 +11,7 @@ either.workspace = true tokio-rustls.workspace = true anyhow.workspace = true hex.workspace = true +humantime.workspace = true thiserror.workspace = true rand.workspace = true bytes.workspace = true diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs index 134afa53da..44fb53696c 100644 --- a/s3_scrubber/src/checks.rs +++ b/s3_scrubber/src/checks.rs @@ -1,7 +1,7 @@ use std::collections::{HashMap, HashSet}; use anyhow::Context; -use aws_sdk_s3::{types::ObjectIdentifier, Client}; +use aws_sdk_s3::Client; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver_api::shard::ShardIndex; use tracing::{error, info, warn}; @@ -70,7 +70,7 @@ pub(crate) fn branch_cleanup_and_check_errors( match s3_data { Some(s3_data) => { - result.garbage_keys.extend(s3_data.keys_to_remove); + result.garbage_keys.extend(s3_data.unknown_keys); match s3_data.blob_data { BlobDataParseResult::Parsed { @@ -93,12 +93,12 @@ pub(crate) fn branch_cleanup_and_check_errors( } if index_part.metadata.disk_consistent_lsn() - != index_part.get_disk_consistent_lsn() + != index_part.duplicated_disk_consistent_lsn() { result.errors.push(format!( "Mismatching disk_consistent_lsn in TimelineMetadata ({}) and in the index_part ({})", index_part.metadata.disk_consistent_lsn(), 
- index_part.get_disk_consistent_lsn(), + index_part.duplicated_disk_consistent_lsn(), )) } @@ -240,7 +240,12 @@ impl TenantObjectListing { #[derive(Debug)] pub(crate) struct S3TimelineBlobData { pub(crate) blob_data: BlobDataParseResult, - pub(crate) keys_to_remove: Vec, + + // Index objects that were not used when loading `blob_data`, e.g. those from old generations + pub(crate) unused_index_keys: Vec, + + // Objects whose keys were not recognized at all, i.e. not layer files, not indices + pub(crate) unknown_keys: Vec, } #[derive(Debug)] @@ -276,12 +281,12 @@ pub(crate) async fn list_timeline_blobs( let mut s3_layers = HashSet::new(); let mut errors = Vec::new(); - let mut keys_to_remove = Vec::new(); + let mut unknown_keys = Vec::new(); let mut timeline_dir_target = s3_root.timeline_root(&id); timeline_dir_target.delimiter = String::new(); - let mut index_parts: Vec = Vec::new(); + let mut index_part_keys: Vec = Vec::new(); let mut initdb_archive: bool = false; let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target)); @@ -292,16 +297,16 @@ pub(crate) async fn list_timeline_blobs( let blob_name = key.strip_prefix(&timeline_dir_target.prefix_in_bucket); match blob_name { Some(name) if name.starts_with("index_part.json") => { - tracing::info!("Index key {key}"); - index_parts.push(obj) + tracing::debug!("Index key {key}"); + index_part_keys.push(key.to_owned()) } Some("initdb.tar.zst") => { - tracing::info!("initdb archive {key}"); + tracing::debug!("initdb archive {key}"); initdb_archive = true; } Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) { Ok((new_layer, gen)) => { - tracing::info!("Parsed layer key: {} {:?}", new_layer, gen); + tracing::debug!("Parsed layer key: {} {:?}", new_layer, gen); s3_layers.insert((new_layer, gen)); } Err(e) => { @@ -309,37 +314,37 @@ pub(crate) async fn list_timeline_blobs( errors.push( format!("S3 list response got an object with key {key} that is not a layer name: {e}"), ); - 
keys_to_remove.push(key.to_string()); + unknown_keys.push(key.to_string()); } }, None => { - tracing::info!("Peculiar key {}", key); + tracing::warn!("Unknown key {}", key); errors.push(format!("S3 list response got an object with odd key {key}")); - keys_to_remove.push(key.to_string()); + unknown_keys.push(key.to_string()); } } } - if index_parts.is_empty() && s3_layers.is_empty() && initdb_archive { - tracing::info!( + if index_part_keys.is_empty() && s3_layers.is_empty() && initdb_archive { + tracing::debug!( "Timeline is empty apart from initdb archive: expected post-deletion state." ); return Ok(S3TimelineBlobData { blob_data: BlobDataParseResult::Relic, - keys_to_remove: Vec::new(), + unused_index_keys: index_part_keys, + unknown_keys: Vec::new(), }); } // Choose the index_part with the highest generation - let (index_part_object, index_part_generation) = match index_parts + let (index_part_object, index_part_generation) = match index_part_keys .iter() - .filter_map(|k| { - let key = k.key(); + .filter_map(|key| { // Stripping the index key to the last part, because RemotePath doesn't // like absolute paths, and depending on prefix_in_bucket it's possible // for the keys we read back to start with a slash. 
let basename = key.rsplit_once('/').unwrap().1; - parse_remote_index_path(RemotePath::from_string(basename).unwrap()).map(|g| (k, g)) + parse_remote_index_path(RemotePath::from_string(basename).unwrap()).map(|g| (key, g)) }) .max_by_key(|i| i.1) .map(|(k, g)| (k.clone(), g)) @@ -347,15 +352,18 @@ pub(crate) async fn list_timeline_blobs( Some((key, gen)) => (Some(key), gen), None => { // Legacy/missing case: one or zero index parts, which did not have a generation - (index_parts.pop(), Generation::none()) + (index_part_keys.pop(), Generation::none()) } }; - if index_part_object.is_none() { - errors.push("S3 list response got no index_part.json file".to_string()); + match index_part_object.as_ref() { + Some(selected) => index_part_keys.retain(|k| k != selected), + None => { + errors.push("S3 list response got no index_part.json file".to_string()); + } } - if let Some(index_part_object_key) = index_part_object.as_ref().map(|object| object.key()) { + if let Some(index_part_object_key) = index_part_object.as_ref() { let index_part_bytes = download_object_with_retries( s3_client, &timeline_dir_target.bucket_name, @@ -372,17 +380,14 @@ pub(crate) async fn list_timeline_blobs( index_part_generation, s3_layers, }, - keys_to_remove, + unused_index_keys: index_part_keys, + unknown_keys, }) } Err(index_parse_error) => errors.push(format!( "index_part.json body parsing error: {index_parse_error}" )), } - } else { - errors.push(format!( - "Index part object {index_part_object:?} has no key" - )); } if errors.is_empty() { @@ -393,6 +398,7 @@ pub(crate) async fn list_timeline_blobs( Ok(S3TimelineBlobData { blob_data: BlobDataParseResult::Incorrect(errors), - keys_to_remove, + unused_index_keys: index_part_keys, + unknown_keys, }) } diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index e0f99ecd9c..64273432fc 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -4,6 +4,7 @@ pub mod checks; pub mod cloud_admin_api; pub mod garbage; pub mod metadata_stream; 
+pub mod pageserver_physical_gc; pub mod scan_pageserver_metadata; pub mod scan_safekeeper_metadata; pub mod tenant_snapshot; @@ -396,7 +397,7 @@ async fn download_object_with_retries( .await { Ok(bytes_read) => { - tracing::info!("Downloaded {bytes_read} bytes for object object with key {key}"); + tracing::debug!("Downloaded {bytes_read} bytes for object {key}"); return Ok(body_buf); } Err(e) => { diff --git a/s3_scrubber/src/main.rs b/s3_scrubber/src/main.rs index e49c280b99..ade8ef7d7a 100644 --- a/s3_scrubber/src/main.rs +++ b/s3_scrubber/src/main.rs @@ -2,11 +2,13 @@ use anyhow::bail; use camino::Utf8PathBuf; use pageserver_api::shard::TenantShardId; use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; +use s3_scrubber::pageserver_physical_gc::GcMode; use s3_scrubber::scan_pageserver_metadata::scan_metadata; use s3_scrubber::tenant_snapshot::SnapshotDownloader; use s3_scrubber::{ - init_logging, scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, - NodeKind, TraversingDepth, + init_logging, pageserver_physical_gc::pageserver_physical_gc, + scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, NodeKind, + TraversingDepth, }; use clap::{Parser, Subcommand}; @@ -62,6 +64,14 @@ enum Command { #[arg(short, long)] output_path: Utf8PathBuf, }, + PageserverPhysicalGc { + #[arg(long = "tenant-id", num_args = 0..)] + tenant_ids: Vec, + #[arg(long = "min-age")] + min_age: humantime::Duration, + #[arg(short, long, default_value_t = GcMode::IndicesOnly)] + mode: GcMode, + }, } #[tokio::main] @@ -75,6 +85,7 @@ async fn main() -> anyhow::Result<()> { Command::FindGarbage { .. } => "find-garbage", Command::PurgeGarbage { .. } => "purge-garbage", Command::TenantSnapshot { .. } => "tenant-snapshot", + Command::PageserverPhysicalGc { .. 
} => "pageserver-physical-gc", }; let _guard = init_logging(&format!( "{}_{}_{}_{}.log", @@ -178,5 +189,15 @@ async fn main() -> anyhow::Result<()> { SnapshotDownloader::new(bucket_config, tenant_id, output_path, concurrency)?; downloader.download().await } + Command::PageserverPhysicalGc { + tenant_ids, + min_age, + mode, + } => { + let summary = + pageserver_physical_gc(bucket_config, tenant_ids, min_age.into(), mode).await?; + println!("{}", serde_json::to_string(&summary).unwrap()); + Ok(()) + } } } diff --git a/s3_scrubber/src/pageserver_physical_gc.rs b/s3_scrubber/src/pageserver_physical_gc.rs new file mode 100644 index 0000000000..0146433128 --- /dev/null +++ b/s3_scrubber/src/pageserver_physical_gc.rs @@ -0,0 +1,239 @@ +use std::time::{Duration, UNIX_EPOCH}; + +use crate::checks::{list_timeline_blobs, BlobDataParseResult}; +use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; +use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; +use aws_sdk_s3::Client; +use futures_util::{StreamExt, TryStreamExt}; +use pageserver::tenant::remote_timeline_client::parse_remote_index_path; +use pageserver::tenant::IndexPart; +use pageserver_api::shard::TenantShardId; +use remote_storage::RemotePath; +use serde::Serialize; +use tracing::{info_span, Instrument}; +use utils::generation::Generation; + +#[derive(Serialize, Default)] +pub struct GcSummary { + indices_deleted: usize, + remote_storage_errors: usize, +} + +#[derive(clap::ValueEnum, Debug, Clone, Copy)] +pub enum GcMode { + // Delete nothing + DryRun, + + // Enable only removing old-generation indices + IndicesOnly, + // Enable all forms of GC + // TODO: this will be used when shard split ancestor layer deletion is added + // All, +} + +impl std::fmt::Display for GcMode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + GcMode::DryRun => write!(f, "dry-run"), + GcMode::IndicesOnly => write!(f, "indices-only"), + } + } +} + +async 
fn maybe_delete_index( + s3_client: &Client, + bucket_config: &BucketConfig, + min_age: &Duration, + latest_gen: Generation, + key: &str, + mode: GcMode, + summary: &mut GcSummary, +) { + // Validation: we will only delete things that parse cleanly + let basename = key.rsplit_once('/').unwrap().1; + let candidate_generation = + match parse_remote_index_path(RemotePath::from_string(basename).unwrap()) { + Some(g) => g, + None => { + if basename == IndexPart::FILE_NAME { + // A legacy pre-generation index + Generation::none() + } else { + // A strange key: we will not delete this because we don't understand it. + tracing::warn!("Bad index key"); + return; + } + } + }; + + // Validation: we will only delete indices more than one generation old, to avoid interfering + // in typical migrations, even if they are very long running. + if candidate_generation >= latest_gen { + // This shouldn't happen: when we loaded metadata, it should have selected the latest + // generation already, and only populated [`S3TimelineBlobData::unused_index_keys`] + // with older generations. + tracing::warn!("Deletion candidate is >= latest generation, this is a bug!"); + return; + } else if candidate_generation.next() == latest_gen { + // Skip deleting the latest-1th generation's index. + return; + } + + // Validation: we will only delete indices after one week, so that during incidents we will have + // easy access to recent indices. 
+ let age: Duration = match s3_client + .head_object() + .bucket(&bucket_config.bucket) + .key(key) + .send() + .await + { + Ok(response) => match response.last_modified { + None => { + tracing::warn!("Missing last_modified"); + summary.remote_storage_errors += 1; + return; + } + Some(last_modified) => { + let last_modified = + UNIX_EPOCH + Duration::from_secs_f64(last_modified.as_secs_f64()); + match last_modified.elapsed() { + Ok(e) => e, + Err(_) => { + tracing::warn!("Bad last_modified time: {last_modified:?}"); + return; + } + } + } + }, + Err(e) => { + tracing::warn!("Failed to HEAD {key}: {e}"); + summary.remote_storage_errors += 1; + return; + } + }; + if &age < min_age { + tracing::info!( + "Skipping young object {} < {}", + age.as_secs_f64(), + min_age.as_secs_f64() + ); + return; + } + + if matches!(mode, GcMode::DryRun) { + tracing::info!("Dry run: would delete this key"); + return; + } + + // All validations passed: erase the object + match s3_client + .delete_object() + .bucket(&bucket_config.bucket) + .key(key) + .send() + .await + { + Ok(_) => { + tracing::info!("Successfully deleted index"); + summary.indices_deleted += 1; + } + Err(e) => { + tracing::warn!("Failed to delete index: {e}"); + summary.remote_storage_errors += 1; + } + } +} + +/// Physical garbage collection: removing unused S3 objects. This is distinct from the garbage collection +/// done inside the pageserver, which operates at a higher level (keys, layers). This type of garbage collection +/// is about removing: +/// - Objects that were uploaded but never referenced in the remote index (e.g. because of a shutdown between +/// uploading a layer and uploading an index) +/// - Index objects from historic generations +/// +/// This type of GC is not necessary for correctness: rather it serves to reduce wasted storage capacity, and +/// make sure that object listings don't get slowed down by large numbers of garbage objects. 
+pub async fn pageserver_physical_gc( + bucket_config: BucketConfig, + tenant_ids: Vec, + min_age: Duration, + mode: GcMode, +) -> anyhow::Result { + let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver)?; + + let tenants = if tenant_ids.is_empty() { + futures::future::Either::Left(stream_tenants(&s3_client, &target)) + } else { + futures::future::Either::Right(futures::stream::iter(tenant_ids.into_iter().map(Ok))) + }; + + // How many tenants to process in parallel. We need to be mindful of pageservers + // accessing the same per tenant prefixes, so use a lower setting than pageservers. + const CONCURRENCY: usize = 32; + + // Generate a stream of TenantTimelineId + let timelines = tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, t)); + let timelines = timelines.try_buffered(CONCURRENCY); + let timelines = timelines.try_flatten(); + + // Generate a stream of S3TimelineBlobData + async fn gc_timeline( + s3_client: &Client, + bucket_config: &BucketConfig, + min_age: &Duration, + target: &RootTarget, + mode: GcMode, + ttid: TenantShardTimelineId, + ) -> anyhow::Result { + let mut summary = GcSummary::default(); + let data = list_timeline_blobs(s3_client, ttid, target).await?; + + let (latest_gen, candidates) = match &data.blob_data { + BlobDataParseResult::Parsed { + index_part: _index_part, + index_part_generation, + s3_layers: _s3_layers, + } => (*index_part_generation, data.unused_index_keys), + BlobDataParseResult::Relic => { + // Post-deletion tenant location: don't try and GC it. 
+ return Ok(summary); + } + BlobDataParseResult::Incorrect(reasons) => { + // Our primary purpose isn't to report on bad data, but log this rather than skipping silently + tracing::warn!("Skipping timeline {ttid}, bad metadata: {reasons:?}"); + return Ok(summary); + } + }; + + for key in candidates { + maybe_delete_index( + s3_client, + bucket_config, + min_age, + latest_gen, + &key, + mode, + &mut summary, + ) + .instrument(info_span!("maybe_delete_index", %ttid, ?latest_gen, key)) + .await; + } + + Ok(summary) + } + let timelines = timelines + .map_ok(|ttid| gc_timeline(&s3_client, &bucket_config, &min_age, &target, mode, ttid)); + let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); + + let mut summary = GcSummary::default(); + + while let Some(i) = timelines.next().await { + let tl_summary = i?; + + summary.indices_deleted += tl_summary.indices_deleted; + summary.remote_storage_errors += tl_summary.remote_storage_errors; + } + + Ok(summary) +} diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index aee3898ac7..7476654426 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -29,13 +29,12 @@ use safekeeper::defaults::{ DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, }; -use safekeeper::remove_wal; +use safekeeper::http; use safekeeper::wal_service; use safekeeper::GlobalTimelines; use safekeeper::SafeKeeperConf; use safekeeper::{broker, WAL_SERVICE_RUNTIME}; use safekeeper::{control_file, BROKER_RUNTIME}; -use safekeeper::{http, WAL_REMOVER_RUNTIME}; use safekeeper::{wal_backup, HTTP_RUNTIME}; use storage_broker::DEFAULT_ENDPOINT; use utils::auth::{JwtAuth, Scope, SwappableJwtAuth}; @@ -441,14 +440,6 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { .map(|res| ("broker main".to_owned(), res)); tasks_handles.push(Box::pin(broker_task_handle)); - let conf_ = conf.clone(); 
- let wal_remover_handle = current_thread_rt - .as_ref() - .unwrap_or_else(|| WAL_REMOVER_RUNTIME.handle()) - .spawn(remove_wal::task_main(conf_)) - .map(|res| ("WAL remover".to_owned(), res)); - tasks_handles.push(Box::pin(wal_remover_handle)); - set_build_info_metric(GIT_VERSION, BUILD_TAG); // TODO: update tokio-stream, convert to real async Stream with diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index fe9f2e6899..e9bb5202da 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -2,7 +2,7 @@ use anyhow::{bail, ensure, Context, Result}; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; -use camino::Utf8PathBuf; +use camino::{Utf8Path, Utf8PathBuf}; use tokio::fs::File; use tokio::io::AsyncWriteExt; use utils::crashsafe::durable_rename; @@ -12,9 +12,9 @@ use std::ops::Deref; use std::path::Path; use std::time::Instant; -use crate::control_file_upgrade::upgrade_control_file; use crate::metrics::PERSIST_CONTROL_FILE_SECONDS; use crate::state::TimelinePersistentState; +use crate::{control_file_upgrade::upgrade_control_file, timeline::get_timeline_dir}; use utils::{bin_ser::LeSer, id::TenantTimelineId}; use crate::SafeKeeperConf; @@ -43,7 +43,7 @@ pub trait Storage: Deref { pub struct FileStorage { // save timeline dir to avoid reconstructing it every time timeline_dir: Utf8PathBuf, - conf: SafeKeeperConf, + no_sync: bool, /// Last state persisted to disk. state: TimelinePersistentState, @@ -54,13 +54,12 @@ pub struct FileStorage { impl FileStorage { /// Initialize storage by loading state from disk. 
pub fn restore_new(ttid: &TenantTimelineId, conf: &SafeKeeperConf) -> Result { - let timeline_dir = conf.timeline_dir(ttid); - - let state = Self::load_control_file_conf(conf, ttid)?; + let timeline_dir = get_timeline_dir(conf, ttid); + let state = Self::load_control_file_from_dir(&timeline_dir)?; Ok(FileStorage { timeline_dir, - conf: conf.clone(), + no_sync: conf.no_sync, state, last_persist_at: Instant::now(), }) @@ -74,7 +73,7 @@ impl FileStorage { ) -> Result { let store = FileStorage { timeline_dir, - conf: conf.clone(), + no_sync: conf.no_sync, state, last_persist_at: Instant::now(), }; @@ -102,12 +101,9 @@ impl FileStorage { upgrade_control_file(buf, version) } - /// Load control file for given ttid at path specified by conf. - pub fn load_control_file_conf( - conf: &SafeKeeperConf, - ttid: &TenantTimelineId, - ) -> Result { - let path = conf.timeline_dir(ttid).join(CONTROL_FILE_NAME); + /// Load control file from given directory. + pub fn load_control_file_from_dir(timeline_dir: &Utf8Path) -> Result { + let path = timeline_dir.join(CONTROL_FILE_NAME); Self::load_control_file(path) } @@ -203,7 +199,7 @@ impl Storage for FileStorage { })?; let control_path = self.timeline_dir.join(CONTROL_FILE_NAME); - durable_rename(&control_partial_path, &control_path, !self.conf.no_sync).await?; + durable_rename(&control_partial_path, &control_path, !self.no_sync).await?; // update internal state self.state = s.clone(); @@ -233,12 +229,13 @@ mod test { conf: &SafeKeeperConf, ttid: &TenantTimelineId, ) -> Result<(FileStorage, TimelinePersistentState)> { - fs::create_dir_all(conf.timeline_dir(ttid)) + let timeline_dir = get_timeline_dir(conf, ttid); + fs::create_dir_all(&timeline_dir) .await .expect("failed to create timeline dir"); Ok(( FileStorage::restore_new(ttid, conf)?, - FileStorage::load_control_file_conf(conf, ttid)?, + FileStorage::load_control_file_from_dir(&timeline_dir)?, )) } @@ -246,11 +243,11 @@ mod test { conf: &SafeKeeperConf, ttid: &TenantTimelineId, ) -> 
Result<(FileStorage, TimelinePersistentState)> { - fs::create_dir_all(conf.timeline_dir(ttid)) + let timeline_dir = get_timeline_dir(conf, ttid); + fs::create_dir_all(&timeline_dir) .await .expect("failed to create timeline dir"); let state = TimelinePersistentState::empty(); - let timeline_dir = conf.timeline_dir(ttid); let storage = FileStorage::create_new(timeline_dir, conf, state.clone())?; Ok((storage, state)) } @@ -291,7 +288,7 @@ mod test { .await .expect("failed to persist state"); } - let control_path = conf.timeline_dir(&ttid).join(CONTROL_FILE_NAME); + let control_path = get_timeline_dir(&conf, &ttid).join(CONTROL_FILE_NAME); let mut data = fs::read(&control_path).await.unwrap(); data[0] += 1; // change the first byte of the file to fail checksum validation fs::write(&control_path, &data) diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs index 3023d4e2cb..51cf4db6b5 100644 --- a/safekeeper/src/copy_timeline.rs +++ b/safekeeper/src/copy_timeline.rs @@ -15,10 +15,10 @@ use crate::{ control_file::{FileStorage, Storage}, pull_timeline::{create_temp_timeline_dir, load_temp_timeline, validate_temp_timeline}, state::TimelinePersistentState, - timeline::{Timeline, TimelineError}, + timeline::{FullAccessTimeline, Timeline, TimelineError}, wal_backup::copy_s3_segments, wal_storage::{wal_file_paths, WalReader}, - GlobalTimelines, SafeKeeperConf, + GlobalTimelines, }; // we don't want to have more than 10 segments on disk after copy, because they take space @@ -46,12 +46,14 @@ pub async fn handle_request(request: Request) -> Result<()> { } } + let source_tli = request.source.full_access_guard().await?; + let conf = &GlobalTimelines::get_global_config(); let ttid = request.destination_ttid; let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?; - let (mem_state, state) = request.source.get_state().await; + let (mem_state, state) = source_tli.get_state().await; let start_lsn = state.timeline_start_lsn; if start_lsn == 
Lsn::INVALID { bail!("timeline is not initialized"); @@ -60,7 +62,7 @@ pub async fn handle_request(request: Request) -> Result<()> { { let commit_lsn = mem_state.commit_lsn; - let flush_lsn = request.source.get_flush_lsn().await; + let flush_lsn = source_tli.get_flush_lsn().await; info!( "collected info about source timeline: start_lsn={}, backup_lsn={}, commit_lsn={}, flush_lsn={}", @@ -127,10 +129,8 @@ pub async fn handle_request(request: Request) -> Result<()> { .await?; copy_disk_segments( - conf, - &state, + &source_tli, wal_seg_size, - &request.source.ttid, new_backup_lsn, request.until_lsn, &tli_dir_path, @@ -159,21 +159,13 @@ pub async fn handle_request(request: Request) -> Result<()> { } async fn copy_disk_segments( - conf: &SafeKeeperConf, - persisted_state: &TimelinePersistentState, + tli: &FullAccessTimeline, wal_seg_size: usize, - source_ttid: &TenantTimelineId, start_lsn: Lsn, end_lsn: Lsn, tli_dir_path: &Utf8PathBuf, ) -> Result<()> { - let mut wal_reader = WalReader::new( - conf.workdir.clone(), - conf.timeline_dir(source_ttid), - persisted_state, - start_lsn, - true, - )?; + let mut wal_reader = tli.get_walreader(start_lsn).await?; let mut buf = [0u8; MAX_SEND_SIZE]; diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs index b50f2e1158..062ff4b3db 100644 --- a/safekeeper/src/debug_dump.rs +++ b/safekeeper/src/debug_dump.rs @@ -10,6 +10,7 @@ use std::sync::Arc; use anyhow::bail; use anyhow::Result; use camino::Utf8Path; +use camino::Utf8PathBuf; use chrono::{DateTime, Utc}; use postgres_ffi::XLogSegNo; use postgres_ffi::MAX_SEND_SIZE; @@ -26,7 +27,8 @@ use crate::safekeeper::TermHistory; use crate::send_wal::WalSenderState; use crate::state::TimelineMemState; use crate::state::TimelinePersistentState; -use crate::wal_storage::WalReader; +use crate::timeline::get_timeline_dir; +use crate::timeline::FullAccessTimeline; use crate::GlobalTimelines; use crate::SafeKeeperConf; @@ -68,6 +70,7 @@ pub struct Response { pub struct 
TimelineDumpSer { pub tli: Arc, pub args: Args, + pub timeline_dir: Utf8PathBuf, pub runtime: Arc, } @@ -85,14 +88,20 @@ impl Serialize for TimelineDumpSer { where S: serde::Serializer, { - let dump = self - .runtime - .block_on(build_from_tli_dump(self.tli.clone(), self.args.clone())); + let dump = self.runtime.block_on(build_from_tli_dump( + &self.tli, + &self.args, + &self.timeline_dir, + )); dump.serialize(serializer) } } -async fn build_from_tli_dump(timeline: Arc, args: Args) -> Timeline { +async fn build_from_tli_dump( + timeline: &Arc, + args: &Args, + timeline_dir: &Utf8Path, +) -> Timeline { let control_file = if args.dump_control_file { let mut state = timeline.get_state().await.1; if !args.dump_term_history { @@ -112,7 +121,8 @@ async fn build_from_tli_dump(timeline: Arc, args: Arg let disk_content = if args.dump_disk_content { // build_disk_content can fail, but we don't want to fail the whole // request because of that. - build_disk_content(&timeline.timeline_dir).ok() + // Note: timeline can be in offloaded state, this is not a problem. 
+ build_disk_content(timeline_dir).ok() } else { None }; @@ -186,6 +196,7 @@ pub struct FileInfo { pub async fn build(args: Args) -> Result { let start_time = Utc::now(); let timelines_count = GlobalTimelines::timelines_count(); + let config = GlobalTimelines::get_global_config(); let ptrs_snapshot = if args.tenant_id.is_some() && args.timeline_id.is_some() { // If both tenant_id and timeline_id are specified, we can just get the @@ -223,12 +234,11 @@ pub async fn build(args: Args) -> Result { timelines.push(TimelineDumpSer { tli, args: args.clone(), + timeline_dir: get_timeline_dir(&config, &ttid), runtime: runtime.clone(), }); } - let config = GlobalTimelines::get_global_config(); - Ok(Response { start_time, finish_time: Utc::now(), @@ -316,27 +326,19 @@ pub struct TimelineDigest { } pub async fn calculate_digest( - tli: &Arc, + tli: &FullAccessTimeline, request: TimelineDigestRequest, ) -> Result { if request.from_lsn > request.until_lsn { bail!("from_lsn is greater than until_lsn"); } - let conf = GlobalTimelines::get_global_config(); let (_, persisted_state) = tli.get_state().await; - if persisted_state.timeline_start_lsn > request.from_lsn { bail!("requested LSN is before the start of the timeline"); } - let mut wal_reader = WalReader::new( - conf.workdir.clone(), - tli.timeline_dir.clone(), - &persisted_state, - request.from_lsn, - true, - )?; + let mut wal_reader = tli.get_walreader(request.from_lsn).await?; let mut hasher = Sha256::new(); let mut buf = [0u8; MAX_SEND_SIZE]; diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 4aacd3421d..1e29b21fac 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -85,11 +85,11 @@ impl From for TermLsn { } } -/// Augment AcceptorState with epoch for convenience +/// Augment AcceptorState with last_log_term for convenience #[derive(Debug, Serialize, Deserialize)] pub struct AcceptorStateStatus { pub term: Term, - pub epoch: Term, + pub epoch: Term, // aka 
last_log_term pub term_history: Vec, } @@ -130,7 +130,7 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result) -> Result) -> Result let filename: String = parse_request_param(&request, "filename")?; let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?; + let tli = tli + .full_access_guard() + .await + .map_err(ApiError::InternalServerError)?; - let filepath = tli.timeline_dir.join(filename); + let filepath = tli.get_timeline_dir().join(filename); let mut file = File::open(&filepath) .await .map_err(|e| ApiError::InternalServerError(e.into()))?; @@ -287,7 +295,7 @@ async fn timeline_files_handler(request: Request) -> Result .map_err(|e| ApiError::InternalServerError(e.into())) } -/// Force persist control file and remove old WAL. +/// Force persist control file. async fn timeline_checkpoint_handler(request: Request) -> Result, ApiError> { check_permission(&request, None)?; @@ -297,13 +305,13 @@ async fn timeline_checkpoint_handler(request: Request) -> Result( async fn prepare_safekeeper( ttid: TenantTimelineId, pg_version: u32, -) -> anyhow::Result> { - GlobalTimelines::create( +) -> anyhow::Result { + let tli = GlobalTimelines::create( ttid, ServerInfo { pg_version, @@ -115,10 +113,16 @@ async fn prepare_safekeeper( Lsn::INVALID, Lsn::INVALID, ) - .await + .await?; + + tli.full_access_guard().await } -async fn send_proposer_elected(tli: &Arc, term: Term, lsn: Lsn) -> anyhow::Result<()> { +async fn send_proposer_elected( + tli: &FullAccessTimeline, + term: Term, + lsn: Lsn, +) -> anyhow::Result<()> { // add new term to existing history let history = tli.get_state().await.1.acceptor_state.term_history; let history = history.up_to(lsn.checked_sub(1u64).unwrap()); @@ -147,7 +151,7 @@ pub struct InsertedWAL { /// Extend local WAL with new LogicalMessage record. To do that, /// create AppendRequest with new WAL and pass it to safekeeper. 
pub async fn append_logical_message( - tli: &Arc, + tli: &FullAccessTimeline, msg: &AppendLogicalMessage, ) -> anyhow::Result { let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message); @@ -165,7 +169,7 @@ pub async fn append_logical_message( let append_request = ProposerAcceptorMessage::AppendRequest(AppendRequest { h: AppendRequestHeader { term: msg.term, - epoch_start_lsn: begin_lsn, + term_start_lsn: begin_lsn, begin_lsn, end_lsn, commit_lsn, diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 8d8d2cf23e..1a56ff736c 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -7,10 +7,7 @@ use tokio::runtime::Runtime; use std::time::Duration; use storage_broker::Uri; -use utils::{ - auth::SwappableJwtAuth, - id::{NodeId, TenantId, TenantTimelineId}, -}; +use utils::{auth::SwappableJwtAuth, id::NodeId}; mod auth; pub mod broker; @@ -89,15 +86,6 @@ pub struct SafeKeeperConf { } impl SafeKeeperConf { - pub fn tenant_dir(&self, tenant_id: &TenantId) -> Utf8PathBuf { - self.workdir.join(tenant_id.to_string()) - } - - pub fn timeline_dir(&self, ttid: &TenantTimelineId) -> Utf8PathBuf { - self.tenant_dir(&ttid.tenant_id) - .join(ttid.timeline_id.to_string()) - } - pub fn is_wal_backup_enabled(&self) -> bool { self.remote_storage.is_some() && self.wal_backup_enabled } diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index f7cc40f58a..7b41c98cb8 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -17,7 +17,7 @@ use utils::{ use crate::{ control_file, debug_dump, http::routes::TimelineStatus, - timeline::{Timeline, TimelineError}, + timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError}, wal_storage::{self, Storage}, GlobalTimelines, SafeKeeperConf, }; @@ -283,13 +283,13 @@ pub async fn load_temp_timeline( } // Move timeline dir to the correct location - let timeline_path = conf.timeline_dir(&ttid); + let timeline_path = get_timeline_dir(conf, &ttid); info!( 
"moving timeline {} from {} to {}", ttid, tmp_path, timeline_path ); - tokio::fs::create_dir_all(conf.tenant_dir(&ttid.tenant_id)).await?; + tokio::fs::create_dir_all(get_tenant_dir(conf, &ttid.tenant_id)).await?; tokio::fs::rename(tmp_path, &timeline_path).await?; let tli = GlobalTimelines::load_timeline(&guard, ttid) diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 03cfa882c4..7943a2fd86 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -6,7 +6,7 @@ use crate::handler::SafekeeperPostgresHandler; use crate::safekeeper::AcceptorProposerMessage; use crate::safekeeper::ProposerAcceptorMessage; use crate::safekeeper::ServerInfo; -use crate::timeline::Timeline; +use crate::timeline::FullAccessTimeline; use crate::wal_service::ConnectionId; use crate::GlobalTimelines; use anyhow::{anyhow, Context}; @@ -213,7 +213,7 @@ impl SafekeeperPostgresHandler { &mut self, pgb: &mut PostgresBackend, ) -> Result<(), QueryError> { - let mut tli: Option> = None; + let mut tli: Option = None; if let Err(end) = self.handle_start_wal_push_guts(pgb, &mut tli).await { // Log the result and probably send it to the client, closing the stream. let handle_end_fut = pgb.handle_copy_stream_end(end); @@ -233,7 +233,7 @@ impl SafekeeperPostgresHandler { pub async fn handle_start_wal_push_guts( &mut self, pgb: &mut PostgresBackend, - tli: &mut Option>, + tli: &mut Option, ) -> Result<(), CopyStreamHandlerEnd> { // Notify the libpq client that it's allowed to send `CopyData` messages pgb.write_message(&BeMessage::CopyBothResponse).await?; @@ -323,7 +323,7 @@ struct NetworkReader<'a, IO> { impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { async fn read_first_message( &mut self, - ) -> Result<(Arc, ProposerAcceptorMessage), CopyStreamHandlerEnd> { + ) -> Result<(FullAccessTimeline, ProposerAcceptorMessage), CopyStreamHandlerEnd> { // Receive information about server to create timeline, if not yet. 
let next_msg = read_message(self.pgb_reader).await?; let tli = match next_msg { @@ -337,7 +337,10 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { system_id: greeting.system_id, wal_seg_size: greeting.wal_seg_size, }; - GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID).await? + let tli = + GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID) + .await?; + tli.full_access_guard().await? } _ => { return Err(CopyStreamHandlerEnd::Other(anyhow::anyhow!( @@ -353,7 +356,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { msg_tx: Sender, msg_rx: Receiver, reply_tx: Sender, - tli: Arc, + tli: FullAccessTimeline, next_msg: ProposerAcceptorMessage, ) -> Result<(), CopyStreamHandlerEnd> { *self.acceptor_handle = Some(WalAcceptor::spawn( @@ -448,7 +451,7 @@ const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1); /// replies to reply_tx; reading from socket and writing to disk in parallel is /// beneficial for performance, this struct provides writing to disk part. pub struct WalAcceptor { - tli: Arc, + tli: FullAccessTimeline, msg_rx: Receiver, reply_tx: Sender, conn_id: Option, @@ -461,7 +464,7 @@ impl WalAcceptor { /// /// conn_id None means WalAcceptor is used by recovery initiated at this safekeeper. pub fn spawn( - tli: Arc, + tli: FullAccessTimeline, msg_rx: Receiver, reply_tx: Sender, conn_id: Option, diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs index dfa1892c40..80a630b1e1 100644 --- a/safekeeper/src/recovery.rs +++ b/safekeeper/src/recovery.rs @@ -2,7 +2,7 @@ //! provide it, i.e. safekeeper lags too much. 
use std::time::SystemTime; -use std::{fmt, pin::pin, sync::Arc}; +use std::{fmt, pin::pin}; use anyhow::{bail, Context}; use futures::StreamExt; @@ -21,6 +21,7 @@ use utils::{id::NodeId, lsn::Lsn, postgres_client::wal_stream_connection_config} use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE}; use crate::safekeeper::{AppendRequest, AppendRequestHeader}; +use crate::timeline::FullAccessTimeline; use crate::{ http::routes::TimelineStatus, receive_wal::MSG_QUEUE_SIZE, @@ -28,14 +29,14 @@ use crate::{ AcceptorProposerMessage, ProposerAcceptorMessage, ProposerElected, Term, TermHistory, TermLsn, VoteRequest, }, - timeline::{PeerInfo, Timeline}, + timeline::PeerInfo, SafeKeeperConf, }; /// Entrypoint for per timeline task which always runs, checking whether /// recovery for this safekeeper is needed and starting it if so. #[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))] -pub async fn recovery_main(tli: Arc, conf: SafeKeeperConf) { +pub async fn recovery_main(tli: FullAccessTimeline, conf: SafeKeeperConf) { info!("started"); let cancel = tli.cancel.clone(); @@ -47,6 +48,87 @@ pub async fn recovery_main(tli: Arc, conf: SafeKeeperConf) { } } +/// Should we start fetching WAL from a peer safekeeper, and if yes, from +/// which? Answer is yes, i.e. .donors is not empty if 1) there is something +/// to fetch, and we can do that without running elections; 2) there is no +/// actively streaming compute, as we don't want to compete with it. +/// +/// If donor(s) are chosen, their last_log_term is guaranteed to be equal +/// to their term so we are sure such a leader ever had been elected. +/// +/// All possible donors are returned so that we could keep connection to the +/// current one if it is good even if it slightly lags behind. +/// +/// Note that term conditions above might be not met, but safekeepers are +/// still not aligned on last flush_lsn. 
Generally in this case until +/// elections are run it is not possible to say which safekeeper should +/// recover from which one -- history which would be committed is different +/// depending on assembled quorum (e.g. classic picture 8 from Raft paper). +/// Thus we don't try to predict it here. +async fn recovery_needed( + tli: &FullAccessTimeline, + heartbeat_timeout: Duration, +) -> RecoveryNeededInfo { + let ss = tli.read_shared_state().await; + let term = ss.sk.state.acceptor_state.term; + let last_log_term = ss.sk.get_last_log_term(); + let flush_lsn = ss.sk.flush_lsn(); + // note that peers contain myself, but that's ok -- we are interested only in peers which are strictly ahead of us. + let mut peers = ss.get_peers(heartbeat_timeout); + // Sort by (last_log_term, flush_lsn) pairs. + peers.sort_by(|p1, p2| { + let tl1 = TermLsn { + term: p1.last_log_term, + lsn: p1.flush_lsn, + }; + let tl2 = TermLsn { + term: p2.last_log_term, + lsn: p2.flush_lsn, + }; + tl2.cmp(&tl1) // desc + }); + let num_streaming_computes = tli.get_walreceivers().get_num_streaming(); + let donors = if num_streaming_computes > 0 { + vec![] // If there is a streaming compute, don't try to recover to not intervene. + } else { + peers + .iter() + .filter_map(|candidate| { + // Are we interested in this candidate? + let candidate_tl = TermLsn { + term: candidate.last_log_term, + lsn: candidate.flush_lsn, + }; + let my_tl = TermLsn { + term: last_log_term, + lsn: flush_lsn, + }; + if my_tl < candidate_tl { + // Yes, we are interested. Can we pull from it without + // (re)running elections? It is possible if 1) his term + // is equal to his last_log_term so we could act on + // behalf of leader of this term (we must be sure he was + // ever elected) and 2) our term is not higher, or we'll refuse data. 
+ if candidate.term == candidate.last_log_term && candidate.term >= term { + Some(Donor::from(candidate)) + } else { + None + } + } else { + None + } + }) + .collect() + }; + RecoveryNeededInfo { + term, + last_log_term, + flush_lsn, + peers, + num_streaming_computes, + donors, + } +} /// Result of Timeline::recovery_needed, contains donor(s) if recovery needed and /// fields to explain the choice. #[derive(Debug)] @@ -113,10 +195,10 @@ impl From<&PeerInfo> for Donor { const CHECK_INTERVAL_MS: u64 = 2000; /// Check regularly whether we need to start recovery. -async fn recovery_main_loop(tli: Arc, conf: SafeKeeperConf) { +async fn recovery_main_loop(tli: FullAccessTimeline, conf: SafeKeeperConf) { let check_duration = Duration::from_millis(CHECK_INTERVAL_MS); loop { - let recovery_needed_info = tli.recovery_needed(conf.heartbeat_timeout).await; + let recovery_needed_info = recovery_needed(&tli, conf.heartbeat_timeout).await; match recovery_needed_info.donors.first() { Some(donor) => { info!( @@ -146,7 +228,7 @@ async fn recovery_main_loop(tli: Arc, conf: SafeKeeperConf) { /// Recover from the specified donor. Returns message explaining normal finish /// reason or error. async fn recover( - tli: Arc, + tli: FullAccessTimeline, donor: &Donor, conf: &SafeKeeperConf, ) -> anyhow::Result { @@ -232,7 +314,7 @@ async fn recover( // Pull WAL from donor, assuming handshake is already done. 
async fn recovery_stream( - tli: Arc, + tli: FullAccessTimeline, donor: &Donor, start_streaming_at: Lsn, conf: &SafeKeeperConf, @@ -316,7 +398,7 @@ async fn network_io( physical_stream: ReplicationStream, msg_tx: Sender, donor: Donor, - tli: Arc, + tli: FullAccessTimeline, conf: SafeKeeperConf, ) -> anyhow::Result> { let mut physical_stream = pin!(physical_stream); @@ -337,7 +419,7 @@ async fn network_io( ReplicationMessage::XLogData(xlog_data) => { let ar_hdr = AppendRequestHeader { term: donor.term, - epoch_start_lsn: Lsn::INVALID, // unused + term_start_lsn: Lsn::INVALID, // unused begin_lsn: Lsn(xlog_data.wal_start()), end_lsn: Lsn(xlog_data.wal_start()) + xlog_data.data().len() as u64, commit_lsn: Lsn::INVALID, // do not attempt to advance, peer communication anyway does it @@ -365,7 +447,7 @@ async fn network_io( } ReplicationMessage::PrimaryKeepAlive(_) => { // keepalive means nothing is being streamed for a while. Check whether we need to stop. - let recovery_needed_info = tli.recovery_needed(conf.heartbeat_timeout).await; + let recovery_needed_info = recovery_needed(&tli, conf.heartbeat_timeout).await; // do current donors still contain one we currently connected to? if !recovery_needed_info .donors diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs index 3400eee9b7..b661e48cb5 100644 --- a/safekeeper/src/remove_wal.rs +++ b/safekeeper/src/remove_wal.rs @@ -1,41 +1,25 @@ -//! Thread removing old WAL. +use utils::lsn::Lsn; -use std::time::Duration; +use crate::timeline_manager::StateSnapshot; -use tokio::time::sleep; -use tracing::*; +/// Get oldest LSN we still need to keep. We hold WAL till it is consumed +/// by all of 1) pageserver (remote_consistent_lsn) 2) peers 3) s3 +/// offloading. +/// While it is safe to use inmem values for determining horizon, +/// we use persistent to make possible normal states less surprising. +/// All segments covering LSNs before horizon_lsn can be removed. 
+pub fn calc_horizon_lsn(state: &StateSnapshot, extra_horizon_lsn: Option) -> Lsn { + use std::cmp::min; -use crate::{GlobalTimelines, SafeKeeperConf}; - -pub async fn task_main(_conf: SafeKeeperConf) -> anyhow::Result<()> { - let wal_removal_interval = Duration::from_millis(5000); - loop { - let now = tokio::time::Instant::now(); - let tlis = GlobalTimelines::get_all(); - for tli in &tlis { - let ttid = tli.ttid; - async { - if let Err(e) = tli.maybe_persist_control_file(false).await { - warn!("failed to persist control file: {e}"); - } - if let Err(e) = tli.remove_old_wal().await { - error!("failed to remove WAL: {}", e); - } - } - .instrument(info_span!("WAL removal", ttid = %ttid)) - .await; - } - - let elapsed = now.elapsed(); - let total_timelines = tlis.len(); - - if elapsed > wal_removal_interval { - info!( - "WAL removal is too long, processed {} timelines in {:?}", - total_timelines, elapsed - ); - } - - sleep(wal_removal_interval).await; + let mut horizon_lsn = min( + state.cfile_remote_consistent_lsn, + state.cfile_peer_horizon_lsn, + ); + // we don't want to remove WAL that is not yet offloaded to s3 + horizon_lsn = min(horizon_lsn, state.cfile_backup_lsn); + if let Some(extra_horizon_lsn) = extra_horizon_lsn { + horizon_lsn = min(horizon_lsn, extra_horizon_lsn); } + + horizon_lsn } diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 2a620f5fef..563dbbe315 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -10,7 +10,6 @@ use std::cmp::max; use std::cmp::min; use std::fmt; use std::io::Read; -use std::time::Duration; use storage_broker::proto::SafekeeperTimelineInfo; use tracing::*; @@ -188,8 +187,8 @@ pub struct AcceptorState { } impl AcceptorState { - /// acceptor's epoch is the term of the highest entry in the log - pub fn get_epoch(&self, flush_lsn: Lsn) -> Term { + /// acceptor's last_log_term is the term of the highest entry in the log + pub fn get_last_log_term(&self, flush_lsn: Lsn) -> Term 
{ let th = self.term_history.up_to(flush_lsn); match th.0.last() { Some(e) => e.term, @@ -305,9 +304,9 @@ pub struct AppendRequest { pub struct AppendRequestHeader { // safekeeper's current term; if it is higher than proposer's, the compute is out of date. pub term: Term, - // TODO: remove this field, it in unused -- LSN of term switch can be taken - // from ProposerElected (as well as from term history). - pub epoch_start_lsn: Lsn, + // TODO: remove this field from the protocol, it is unused -- LSN of term + // switch can be taken from ProposerElected (as well as from term history). + pub term_start_lsn: Lsn, /// start position of message in WAL pub begin_lsn: Lsn, /// end position of message in WAL @@ -326,9 +325,10 @@ pub struct AppendResponse { // Current term of the safekeeper; if it is higher than proposer's, the // compute is out of date. pub term: Term, - // NOTE: this is physical end of wal on safekeeper; currently it doesn't - // make much sense without taking epoch into account, as history can be - // diverged. + // Flushed end of wal on safekeeper; one should be always mindful from what + // term history this value comes, either checking history directly or + // observing term being set to one for which WAL truncation is known to have + // happened. pub flush_lsn: Lsn, // We report back our awareness about which WAL is committed, as this is // a criterion for walproposer --sync mode exit @@ -482,8 +482,8 @@ impl AcceptorProposerMessage { /// - messages from broker peers pub struct SafeKeeper { /// LSN since the proposer safekeeper currently talking to appends WAL; - /// determines epoch switch point. - pub epoch_start_lsn: Lsn, + /// determines last_log_term switch point. 
+ pub term_start_lsn: Lsn, pub state: TimelineState, // persistent state storage pub wal_store: WAL, @@ -511,7 +511,7 @@ where } Ok(SafeKeeper { - epoch_start_lsn: Lsn(0), + term_start_lsn: Lsn(0), state: TimelineState::new(state), wal_store, node_id, @@ -531,8 +531,10 @@ where self.state.acceptor_state.term } - pub fn get_epoch(&self) -> Term { - self.state.acceptor_state.get_epoch(self.flush_lsn()) + pub fn get_last_log_term(&self) -> Term { + self.state + .acceptor_state + .get_last_log_term(self.flush_lsn()) } /// wal_store wrapper avoiding commit_lsn <= flush_lsn violation when we don't have WAL yet. @@ -713,7 +715,7 @@ where // proceed, but to prevent commit_lsn surprisingly going down we should // either refuse the session (simpler) or skip the part we already have // from the stream (can be implemented). - if msg.term == self.get_epoch() && self.flush_lsn() > msg.start_streaming_at { + if msg.term == self.get_last_log_term() && self.flush_lsn() > msg.start_streaming_at { bail!("refusing ProposerElected which is going to overwrite correct WAL: term={}, flush_lsn={}, start_streaming_at={}; restarting the handshake should help", msg.term, self.flush_lsn(), msg.start_streaming_at) } @@ -788,7 +790,7 @@ where // Cache LSN where term starts to immediately fsync control file with // commit_lsn once we reach it -- sync-safekeepers finishes when // persisted commit_lsn on majority of safekeepers aligns. - self.epoch_start_lsn = match msg.term_history.0.last() { + self.term_start_lsn = match msg.term_history.0.last() { None => bail!("proposer elected with empty term history"), Some(term_lsn_start) => term_lsn_start.lsn, }; @@ -814,35 +816,17 @@ where self.state.inmem.commit_lsn = commit_lsn; - // If new commit_lsn reached epoch switch, force sync of control + // If new commit_lsn reached term switch, force sync of control // file: walproposer in sync mode is very interested when this // happens. 
Note: this is for sync-safekeepers mode only, as - // otherwise commit_lsn might jump over epoch_start_lsn. - if commit_lsn >= self.epoch_start_lsn && self.state.commit_lsn < self.epoch_start_lsn { + // otherwise commit_lsn might jump over term_start_lsn. + if commit_lsn >= self.term_start_lsn && self.state.commit_lsn < self.term_start_lsn { self.state.flush().await?; } Ok(()) } - /// Persist control file if there is something to save and enough time - /// passed after the last save. - pub async fn maybe_persist_inmem_control_file(&mut self, force: bool) -> Result { - const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300); - if !force && self.state.pers.last_persist_at().elapsed() < CF_SAVE_INTERVAL { - return Ok(false); - } - let need_persist = self.state.inmem.commit_lsn > self.state.commit_lsn - || self.state.inmem.backup_lsn > self.state.backup_lsn - || self.state.inmem.peer_horizon_lsn > self.state.peer_horizon_lsn - || self.state.inmem.remote_consistent_lsn > self.state.remote_consistent_lsn; - if need_persist { - self.state.flush().await?; - trace!("saved control file: {CF_SAVE_INTERVAL:?} passed"); - } - Ok(need_persist) - } - /// Handle request to append WAL. #[allow(clippy::comparison_chain)] async fn handle_append_request( @@ -933,7 +917,7 @@ where // Note: the check is too restrictive, generally we can update local // commit_lsn if our history matches (is part of) history of advanced // commit_lsn provider. 
- if sk_info.last_log_term == self.get_epoch() { + if sk_info.last_log_term == self.get_last_log_term() { self.update_commit_lsn(Lsn(sk_info.commit_lsn)).await?; } } @@ -1079,7 +1063,7 @@ mod tests { } #[tokio::test] - async fn test_epoch_switch() { + async fn test_last_log_term_switch() { let storage = InMemoryState { persisted_state: test_sk_state(), }; @@ -1089,7 +1073,7 @@ mod tests { let mut ar_hdr = AppendRequestHeader { term: 1, - epoch_start_lsn: Lsn(3), + term_start_lsn: Lsn(3), begin_lsn: Lsn(1), end_lsn: Lsn(2), commit_lsn: Lsn(0), @@ -1114,14 +1098,14 @@ mod tests { .await .unwrap(); - // check that AppendRequest before epochStartLsn doesn't switch epoch + // check that AppendRequest before term_start_lsn doesn't switch last_log_term. let resp = sk .process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)) .await; assert!(resp.is_ok()); - assert_eq!(sk.get_epoch(), 0); + assert_eq!(sk.get_last_log_term(), 0); - // but record at epochStartLsn does the switch + // but record at term_start_lsn does the switch ar_hdr.begin_lsn = Lsn(2); ar_hdr.end_lsn = Lsn(3); append_request = AppendRequest { @@ -1133,7 +1117,7 @@ mod tests { .await; assert!(resp.is_ok()); sk.wal_store.truncate_wal(Lsn(3)).await.unwrap(); // imitate the complete record at 3 %) - assert_eq!(sk.get_epoch(), 1); + assert_eq!(sk.get_last_log_term(), 1); } #[test] diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 5a9745e1c9..df75893838 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -5,7 +5,7 @@ use crate::handler::SafekeeperPostgresHandler; use crate::metrics::RECEIVED_PS_FEEDBACKS; use crate::receive_wal::WalReceivers; use crate::safekeeper::{Term, TermLsn}; -use crate::timeline::Timeline; +use crate::timeline::FullAccessTimeline; use crate::wal_service::ConnectionId; use crate::wal_storage::WalReader; use crate::GlobalTimelines; @@ -387,8 +387,10 @@ impl SafekeeperPostgresHandler { term: Option, ) -> Result<(), QueryError> { let 
tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?; + let full_access = tli.full_access_guard().await?; + if let Err(end) = self - .handle_start_replication_guts(pgb, start_pos, term, tli.clone()) + .handle_start_replication_guts(pgb, start_pos, term, full_access) .await { let info = tli.get_safekeeper_info(&self.conf).await; @@ -405,7 +407,7 @@ impl SafekeeperPostgresHandler { pgb: &mut PostgresBackend, start_pos: Lsn, term: Option, - tli: Arc, + tli: FullAccessTimeline, ) -> Result<(), CopyStreamHandlerEnd> { let appname = self.appname.clone(); @@ -448,14 +450,7 @@ impl SafekeeperPostgresHandler { // switch to copy pgb.write_message(&BeMessage::CopyBothResponse).await?; - let (_, persisted_state) = tli.get_state().await; - let wal_reader = WalReader::new( - self.conf.workdir.clone(), - self.conf.timeline_dir(&tli.ttid), - &persisted_state, - start_pos, - self.conf.is_wal_backup_enabled(), - )?; + let wal_reader = tli.get_walreader(start_pos).await?; // Split to concurrently receive and send data; replies are generally // not synchronized with sends, so this avoids deadlocks. @@ -532,7 +527,7 @@ impl EndWatch { /// A half driving sending WAL. struct WalSender<'a, IO> { pgb: &'a mut PostgresBackend, - tli: Arc, + tli: FullAccessTimeline, appname: Option, // Position since which we are sending next chunk. 
start_pos: Lsn, @@ -741,7 +736,7 @@ impl WalSender<'_, IO> { struct ReplyReader { reader: PostgresBackendReader, ws_guard: Arc, - tli: Arc, + tli: FullAccessTimeline, } impl ReplyReader { diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index f30c503382..148a7e90bd 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -3,14 +3,14 @@ use anyhow::{anyhow, bail, Result}; use camino::Utf8PathBuf; -use postgres_ffi::XLogSegNo; use serde::{Deserialize, Serialize}; use tokio::fs; use tokio_util::sync::CancellationToken; +use utils::id::TenantId; use std::cmp::max; use std::ops::{Deref, DerefMut}; -use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; use std::sync::Arc; use std::time::Duration; use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; @@ -26,7 +26,6 @@ use storage_broker::proto::SafekeeperTimelineInfo; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; use crate::receive_wal::WalReceivers; -use crate::recovery::{recovery_main, Donor, RecoveryNeededInfo}; use crate::safekeeper::{ AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, ServerInfo, Term, TermLsn, INVALID_TERM, @@ -38,8 +37,8 @@ use crate::wal_backup::{self}; use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; use crate::metrics::FullTimelineInfo; -use crate::wal_storage::Storage as wal_storage_iface; -use crate::{debug_dump, timeline_manager, wal_backup_partial, wal_storage}; +use crate::wal_storage::{Storage as wal_storage_iface, WalReader}; +use crate::{debug_dump, timeline_manager, wal_storage}; use crate::{GlobalTimelines, SafeKeeperConf}; /// Things safekeeper should know about timeline state on peers. @@ -169,7 +168,6 @@ pub struct SharedState { pub(crate) sk: SafeKeeper, /// In memory list containing state of peers sent in latest messages from them. 
pub(crate) peers_info: PeersInfo, - pub(crate) last_removed_segno: XLogSegNo, } impl SharedState { @@ -197,33 +195,33 @@ impl SharedState { // We don't want to write anything to disk, because we may have existing timeline there. // These functions should not change anything on disk. - let timeline_dir = conf.timeline_dir(ttid); - let control_store = control_file::FileStorage::create_new(timeline_dir, conf, state)?; + let timeline_dir = get_timeline_dir(conf, ttid); + let control_store = + control_file::FileStorage::create_new(timeline_dir.clone(), conf, state)?; let wal_store = - wal_storage::PhysicalStorage::new(ttid, conf.timeline_dir(ttid), conf, &control_store)?; + wal_storage::PhysicalStorage::new(ttid, timeline_dir, conf, &control_store)?; let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?; Ok(Self { sk, peers_info: PeersInfo(vec![]), - last_removed_segno: 0, }) } /// Restore SharedState from control file. If file doesn't exist, bails out. fn restore(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Result { + let timeline_dir = get_timeline_dir(conf, ttid); let control_store = control_file::FileStorage::restore_new(ttid, conf)?; if control_store.server.wal_seg_size == 0 { bail!(TimelineError::UninitializedWalSegSize(*ttid)); } let wal_store = - wal_storage::PhysicalStorage::new(ttid, conf.timeline_dir(ttid), conf, &control_store)?; + wal_storage::PhysicalStorage::new(ttid, timeline_dir, conf, &control_store)?; Ok(Self { sk: SafeKeeper::new(control_store, wal_store, conf.my_id)?, peers_info: PeersInfo(vec![]), - last_removed_segno: 0, }) } @@ -244,7 +242,7 @@ impl SharedState { timeline_id: ttid.timeline_id.as_ref().to_owned(), }), term: self.sk.state.acceptor_state.term, - last_log_term: self.sk.get_epoch(), + last_log_term: self.sk.get_last_log_term(), flush_lsn: self.sk.flush_lsn().0, // note: this value is not flushed to control file yet and can be lost commit_lsn: self.sk.state.inmem.commit_lsn.0, @@ -275,24 +273,6 @@ impl SharedState { 
.cloned() .collect() } - - /// Get oldest segno we still need to keep. We hold WAL till it is consumed - /// by all of 1) pageserver (remote_consistent_lsn) 2) peers 3) s3 - /// offloading. - /// While it is safe to use inmem values for determining horizon, - /// we use persistent to make possible normal states less surprising. - fn get_horizon_segno(&self, extra_horizon_lsn: Option) -> XLogSegNo { - let state = &self.sk.state; - - use std::cmp::min; - let mut horizon_lsn = min(state.remote_consistent_lsn, state.peer_horizon_lsn); - // we don't want to remove WAL that is not yet offloaded to s3 - horizon_lsn = min(horizon_lsn, state.backup_lsn); - if let Some(extra_horizon_lsn) = extra_horizon_lsn { - horizon_lsn = min(horizon_lsn, extra_horizon_lsn); - } - horizon_lsn.segment_number(state.server.wal_seg_size as usize) - } } #[derive(Debug, thiserror::Error)] @@ -349,22 +329,15 @@ pub struct Timeline { mutex: RwLock, walsenders: Arc, walreceivers: Arc, + timeline_dir: Utf8PathBuf, /// Delete/cancel will trigger this, background tasks should drop out as soon as it fires pub(crate) cancel: CancellationToken, - /// Directory where timeline state is stored. - pub timeline_dir: Utf8PathBuf, - - /// Should we keep WAL on disk for active replication connections. - /// Especially useful for sharding, when different shards process WAL - /// with different speed. - // TODO: add `Arc` here instead of adding each field separately. 
- walsenders_keep_horizon: bool, - // timeline_manager controlled state pub(crate) broker_active: AtomicBool, pub(crate) wal_backup_active: AtomicBool, + pub(crate) last_removed_segno: AtomicU64, } impl Timeline { @@ -394,10 +367,10 @@ impl Timeline { walsenders: WalSenders::new(walreceivers.clone()), walreceivers, cancel: CancellationToken::default(), - timeline_dir: conf.timeline_dir(&ttid), - walsenders_keep_horizon: conf.walsenders_keep_horizon, + timeline_dir: get_timeline_dir(conf, &ttid), broker_active: AtomicBool::new(false), wal_backup_active: AtomicBool::new(false), + last_removed_segno: AtomicU64::new(0), }) } @@ -430,10 +403,10 @@ impl Timeline { walsenders: WalSenders::new(walreceivers.clone()), walreceivers, cancel: CancellationToken::default(), - timeline_dir: conf.timeline_dir(&ttid), - walsenders_keep_horizon: conf.walsenders_keep_horizon, + timeline_dir: get_timeline_dir(conf, &ttid), broker_active: AtomicBool::new(false), wal_backup_active: AtomicBool::new(false), + last_removed_segno: AtomicU64::new(0), }) } @@ -494,15 +467,6 @@ impl Timeline { conf.clone(), broker_active_set, )); - - // Start recovery task which always runs on the timeline. - if conf.peer_recovery_enabled { - tokio::spawn(recovery_main(self.clone(), conf.clone())); - } - // TODO: migrate to timeline_manager - if conf.is_wal_backup_enabled() && conf.partial_backup_enabled { - tokio::spawn(wal_backup_partial::main_task(self.clone(), conf.clone())); - } } /// Delete timeline from disk completely, by removing timeline directory. @@ -555,36 +519,6 @@ impl Timeline { self.mutex.read().await } - /// Returns true if walsender should stop sending WAL to pageserver. We - /// terminate it if remote_consistent_lsn reached commit_lsn and there is no - /// computes. While there might be nothing to stream already, we learn about - /// remote_consistent_lsn update through replication feedback, and we want - /// to stop pushing to the broker if pageserver is fully caughtup. 
- pub async fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool { - if self.is_cancelled() { - return true; - } - let shared_state = self.read_shared_state().await; - if self.walreceivers.get_num() == 0 { - return shared_state.sk.state.inmem.commit_lsn == Lsn(0) || // no data at all yet - reported_remote_consistent_lsn >= shared_state.sk.state.inmem.commit_lsn; - } - false - } - - /// Ensure that current term is t, erroring otherwise, and lock the state. - pub async fn acquire_term(&self, t: Term) -> Result { - let ss = self.read_shared_state().await; - if ss.sk.state.acceptor_state.term != t { - bail!( - "failed to acquire term {}, current term {}", - t, - ss.sk.state.acceptor_state.term - ); - } - Ok(ss) - } - /// Returns commit_lsn watch channel. pub fn get_commit_lsn_watch_rx(&self) -> watch::Receiver { self.commit_lsn_watch_rx.clone() @@ -600,28 +534,6 @@ impl Timeline { self.shared_state_version_rx.clone() } - /// Pass arrived message to the safekeeper. - pub async fn process_msg( - self: &Arc, - msg: &ProposerAcceptorMessage, - ) -> Result> { - if self.is_cancelled() { - bail!(TimelineError::Cancelled(self.ttid)); - } - - let mut rmsg: Option; - { - let mut shared_state = self.write_shared_state().await; - rmsg = shared_state.sk.process_msg(msg).await?; - - // if this is AppendResponse, fill in proper hot standby feedback. - if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg { - resp.hs_feedback = self.walsenders.get_hotstandby().hs_feedback; - } - } - Ok(rmsg) - } - /// Returns wal_seg_size. pub async fn get_wal_seg_size(&self) -> usize { self.read_shared_state().await.get_wal_seg_size() @@ -672,97 +584,11 @@ impl Timeline { Ok(()) } - /// Update in memory remote consistent lsn. 
- pub async fn update_remote_consistent_lsn(self: &Arc, candidate: Lsn) { - let mut shared_state = self.write_shared_state().await; - shared_state.sk.state.inmem.remote_consistent_lsn = - max(shared_state.sk.state.inmem.remote_consistent_lsn, candidate); - } - pub async fn get_peers(&self, conf: &SafeKeeperConf) -> Vec { let shared_state = self.read_shared_state().await; shared_state.get_peers(conf.heartbeat_timeout) } - /// Should we start fetching WAL from a peer safekeeper, and if yes, from - /// which? Answer is yes, i.e. .donors is not empty if 1) there is something - /// to fetch, and we can do that without running elections; 2) there is no - /// actively streaming compute, as we don't want to compete with it. - /// - /// If donor(s) are choosen, theirs last_log_term is guaranteed to be equal - /// to its last_log_term so we are sure such a leader ever had been elected. - /// - /// All possible donors are returned so that we could keep connection to the - /// current one if it is good even if it slightly lags behind. - /// - /// Note that term conditions above might be not met, but safekeepers are - /// still not aligned on last flush_lsn. Generally in this case until - /// elections are run it is not possible to say which safekeeper should - /// recover from which one -- history which would be committed is different - /// depending on assembled quorum (e.g. classic picture 8 from Raft paper). - /// Thus we don't try to predict it here. - pub async fn recovery_needed(&self, heartbeat_timeout: Duration) -> RecoveryNeededInfo { - let ss = self.read_shared_state().await; - let term = ss.sk.state.acceptor_state.term; - let last_log_term = ss.sk.get_epoch(); - let flush_lsn = ss.sk.flush_lsn(); - // note that peers contain myself, but that's ok -- we are interested only in peers which are strictly ahead of us. - let mut peers = ss.get_peers(heartbeat_timeout); - // Sort by pairs. 
- peers.sort_by(|p1, p2| { - let tl1 = TermLsn { - term: p1.last_log_term, - lsn: p1.flush_lsn, - }; - let tl2 = TermLsn { - term: p2.last_log_term, - lsn: p2.flush_lsn, - }; - tl2.cmp(&tl1) // desc - }); - let num_streaming_computes = self.walreceivers.get_num_streaming(); - let donors = if num_streaming_computes > 0 { - vec![] // If there is a streaming compute, don't try to recover to not intervene. - } else { - peers - .iter() - .filter_map(|candidate| { - // Are we interested in this candidate? - let candidate_tl = TermLsn { - term: candidate.last_log_term, - lsn: candidate.flush_lsn, - }; - let my_tl = TermLsn { - term: last_log_term, - lsn: flush_lsn, - }; - if my_tl < candidate_tl { - // Yes, we are interested. Can we pull from it without - // (re)running elections? It is possible if 1) his term - // is equal to his last_log_term so we could act on - // behalf of leader of this term (we must be sure he was - // ever elected) and 2) our term is not higher, or we'll refuse data. - if candidate.term == candidate.last_log_term && candidate.term >= term { - Some(Donor::from(candidate)) - } else { - None - } - } else { - None - } - }) - .collect() - }; - RecoveryNeededInfo { - term, - last_log_term, - flush_lsn, - peers, - num_streaming_computes, - donors, - } - } - pub fn get_walsenders(&self) -> &Arc { &self.walsenders } @@ -776,58 +602,6 @@ impl Timeline { self.read_shared_state().await.sk.wal_store.flush_lsn() } - /// Delete WAL segments from disk that are no longer needed. This is determined - /// based on pageserver's remote_consistent_lsn and local backup_lsn/peer_lsn. - pub async fn remove_old_wal(self: &Arc) -> Result<()> { - if self.is_cancelled() { - bail!(TimelineError::Cancelled(self.ttid)); - } - - // If enabled, we use LSN of the most lagging walsender as a WAL removal horizon. - // This allows to get better read speed for pageservers that are lagging behind, - // at the cost of keeping more WAL on disk. 
- let replication_horizon_lsn = if self.walsenders_keep_horizon { - self.walsenders.laggard_lsn() - } else { - None - }; - - let horizon_segno: XLogSegNo; - let remover = { - let shared_state = self.read_shared_state().await; - horizon_segno = shared_state.get_horizon_segno(replication_horizon_lsn); - if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno { - return Ok(()); // nothing to do - } - - // release the lock before removing - shared_state.sk.wal_store.remove_up_to(horizon_segno - 1) - }; - - // delete old WAL files - remover.await?; - - // update last_removed_segno - let mut shared_state = self.write_shared_state().await; - if shared_state.last_removed_segno != horizon_segno { - shared_state.last_removed_segno = horizon_segno; - } else { - shared_state.skip_update = true; - } - Ok(()) - } - - /// Persist control file if there is something to save and enough time - /// passed after the last save. This helps to keep remote_consistent_lsn up - /// to date so that storage nodes restart doesn't cause many pageserver -> - /// safekeeper reconnections. - pub async fn maybe_persist_control_file(self: &Arc, force: bool) -> Result<()> { - let mut guard = self.write_shared_state().await; - let changed = guard.sk.maybe_persist_inmem_control_file(force).await?; - guard.skip_update = !changed; - Ok(()) - } - /// Gather timeline data for metrics. 
pub async fn info_for_metrics(&self) -> Option { if self.is_cancelled() { @@ -843,8 +617,8 @@ impl Timeline { wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed), timeline_is_active: self.broker_active.load(Ordering::Relaxed), num_computes: self.walreceivers.get_num() as u32, - last_removed_segno: state.last_removed_segno, - epoch_start_lsn: state.sk.epoch_start_lsn, + last_removed_segno: self.last_removed_segno.load(Ordering::Relaxed), + epoch_start_lsn: state.sk.term_start_lsn, mem_state: state.sk.state.inmem.clone(), persisted_state: state.sk.state.clone(), flush_lsn: state.sk.wal_store.flush_lsn(), @@ -866,8 +640,8 @@ impl Timeline { wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed), active: self.broker_active.load(Ordering::Relaxed), num_computes: self.walreceivers.get_num() as u32, - last_removed_segno: state.last_removed_segno, - epoch_start_lsn: state.sk.epoch_start_lsn, + last_removed_segno: self.last_removed_segno.load(Ordering::Relaxed), + epoch_start_lsn: state.sk.term_start_lsn, mem_state: state.sk.state.inmem.clone(), write_lsn, write_record_lsn, @@ -889,6 +663,110 @@ impl Timeline { state.sk.state.finish_change(&persistent_state).await?; Ok(res) } + + /// Get the timeline guard for reading/writing WAL files. + /// TODO: if WAL files are not present on disk (evicted), they will be + /// downloaded from S3. Also there will logic for preventing eviction + /// while someone is holding FullAccessTimeline guard. + pub async fn full_access_guard(self: &Arc) -> Result { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + Ok(FullAccessTimeline { tli: self.clone() }) + } +} + +/// This is a guard that allows to read/write disk timeline state. +/// All tasks that are using the disk should use this guard. 
+#[derive(Clone)] +pub struct FullAccessTimeline { + pub tli: Arc, +} + +impl Deref for FullAccessTimeline { + type Target = Arc; + + fn deref(&self) -> &Self::Target { + &self.tli + } +} + +impl FullAccessTimeline { + /// Returns true if walsender should stop sending WAL to pageserver. We + /// terminate it if remote_consistent_lsn reached commit_lsn and there is no + /// computes. While there might be nothing to stream already, we learn about + /// remote_consistent_lsn update through replication feedback, and we want + /// to stop pushing to the broker if pageserver is fully caughtup. + pub async fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool { + if self.is_cancelled() { + return true; + } + let shared_state = self.read_shared_state().await; + if self.walreceivers.get_num() == 0 { + return shared_state.sk.state.inmem.commit_lsn == Lsn(0) || // no data at all yet + reported_remote_consistent_lsn >= shared_state.sk.state.inmem.commit_lsn; + } + false + } + + /// Ensure that current term is t, erroring otherwise, and lock the state. + pub async fn acquire_term(&self, t: Term) -> Result { + let ss = self.read_shared_state().await; + if ss.sk.state.acceptor_state.term != t { + bail!( + "failed to acquire term {}, current term {}", + t, + ss.sk.state.acceptor_state.term + ); + } + Ok(ss) + } + + /// Pass arrived message to the safekeeper. + pub async fn process_msg( + &self, + msg: &ProposerAcceptorMessage, + ) -> Result> { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + + let mut rmsg: Option; + { + let mut shared_state = self.write_shared_state().await; + rmsg = shared_state.sk.process_msg(msg).await?; + + // if this is AppendResponse, fill in proper hot standby feedback. 
+ if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg { + resp.hs_feedback = self.walsenders.get_hotstandby().hs_feedback; + } + } + Ok(rmsg) + } + + pub async fn get_walreader(&self, start_lsn: Lsn) -> Result { + let (_, persisted_state) = self.get_state().await; + let enable_remote_read = GlobalTimelines::get_global_config().is_wal_backup_enabled(); + + WalReader::new( + &self.ttid, + self.timeline_dir.clone(), + &persisted_state, + start_lsn, + enable_remote_read, + ) + } + + pub fn get_timeline_dir(&self) -> Utf8PathBuf { + self.timeline_dir.clone() + } + + /// Update in memory remote consistent lsn. + pub async fn update_remote_consistent_lsn(&self, candidate: Lsn) { + let mut shared_state = self.write_shared_state().await; + shared_state.sk.state.inmem.remote_consistent_lsn = + max(shared_state.sk.state.inmem.remote_consistent_lsn, candidate); + } } /// Deletes directory and it's contents. Returns false if directory does not exist. @@ -899,3 +777,16 @@ async fn delete_dir(path: &Utf8PathBuf) -> Result { Err(e) => Err(e.into()), } } + +/// Get a path to the tenant directory. If you just need to get a timeline directory, +/// use FullAccessTimeline::get_timeline_dir instead. +pub(crate) fn get_tenant_dir(conf: &SafeKeeperConf, tenant_id: &TenantId) -> Utf8PathBuf { + conf.workdir.join(tenant_id.to_string()) +} + +/// Get a path to the timeline directory. If you need to read WAL files from disk, +/// use FullAccessTimeline::get_timeline_dir instead. This function does not check +/// timeline eviction status and WAL files might not be present on disk. 
+pub(crate) fn get_timeline_dir(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Utf8PathBuf { + get_tenant_dir(conf, &ttid.tenant_id).join(ttid.timeline_id.to_string()) +} diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index ed544352f9..7174d843fc 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -3,23 +3,42 @@ //! It watches for changes in the timeline state and decides when to spawn or kill background tasks. //! It also can manage some reactive state, like should the timeline be active for broker pushes or not. -use std::{sync::Arc, time::Duration}; +use std::{ + sync::Arc, + time::{Duration, Instant}, +}; -use tracing::{info, instrument, warn}; +use postgres_ffi::XLogSegNo; +use tokio::task::{JoinError, JoinHandle}; +use tracing::{info, info_span, instrument, warn, Instrument}; use utils::lsn::Lsn; use crate::{ + control_file::Storage, metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL}, + recovery::recovery_main, + remove_wal::calc_horizon_lsn, + send_wal::WalSenders, timeline::{PeerInfo, ReadGuardSharedState, Timeline}, - timelines_set::TimelinesSet, + timelines_set::{TimelineSetGuard, TimelinesSet}, wal_backup::{self, WalBackupTaskHandle}, - SafeKeeperConf, + wal_backup_partial, SafeKeeperConf, }; pub struct StateSnapshot { + // inmem values pub commit_lsn: Lsn, pub backup_lsn: Lsn, pub remote_consistent_lsn: Lsn, + + // persistent control file values + pub cfile_peer_horizon_lsn: Lsn, + pub cfile_remote_consistent_lsn: Lsn, + pub cfile_backup_lsn: Lsn, + + // misc + pub cfile_last_persist_at: Instant, + pub inmem_flush_pending: bool, pub peers: Vec, } @@ -30,17 +49,34 @@ impl StateSnapshot { commit_lsn: read_guard.sk.state.inmem.commit_lsn, backup_lsn: read_guard.sk.state.inmem.backup_lsn, remote_consistent_lsn: read_guard.sk.state.inmem.remote_consistent_lsn, + cfile_peer_horizon_lsn: read_guard.sk.state.peer_horizon_lsn, + cfile_remote_consistent_lsn: 
read_guard.sk.state.remote_consistent_lsn, + cfile_backup_lsn: read_guard.sk.state.backup_lsn, + cfile_last_persist_at: read_guard.sk.state.pers.last_persist_at(), + inmem_flush_pending: Self::has_unflushed_inmem_state(&read_guard), peers: read_guard.get_peers(heartbeat_timeout), } } + + fn has_unflushed_inmem_state(read_guard: &ReadGuardSharedState) -> bool { + let state = &read_guard.sk.state; + state.inmem.commit_lsn > state.commit_lsn + || state.inmem.backup_lsn > state.backup_lsn + || state.inmem.peer_horizon_lsn > state.peer_horizon_lsn + || state.inmem.remote_consistent_lsn > state.remote_consistent_lsn + } } /// Control how often the manager task should wake up to check updates. /// There is no need to check for updates more often than this. const REFRESH_INTERVAL: Duration = Duration::from_millis(300); +/// How often to save the control file if the is no other activity. +const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300); + /// This task gets spawned alongside each timeline and is responsible for managing the timeline's /// background tasks. +/// Be careful, this task is not respawned on panic, so it should not panic. 
#[instrument(name = "manager", skip_all, fields(ttid = %tli.ttid))] pub async fn main_task( tli: Arc, @@ -55,20 +91,50 @@ pub async fn main_task( } }; - // sets whether timeline is active for broker pushes or not - let mut tli_broker_active = broker_active_set.guard(tli.clone()); - - let ttid = tli.ttid; + // configuration & dependencies let wal_seg_size = tli.get_wal_seg_size().await; let heartbeat_timeout = conf.heartbeat_timeout; - - let mut state_version_rx = tli.get_state_version_rx(); - + let walsenders = tli.get_walsenders(); let walreceivers = tli.get_walreceivers(); + + // current state + let mut state_version_rx = tli.get_state_version_rx(); let mut num_computes_rx = walreceivers.get_num_rx(); + let mut tli_broker_active = broker_active_set.guard(tli.clone()); + let mut last_removed_segno = 0 as XLogSegNo; // list of background tasks let mut backup_task: Option = None; + let mut recovery_task: Option> = None; + let mut partial_backup_task: Option> = None; + let mut wal_removal_task: Option>> = None; + + // Start recovery task which always runs on the timeline. + if conf.peer_recovery_enabled { + match tli.full_access_guard().await { + Ok(tli) => { + recovery_task = Some(tokio::spawn(recovery_main(tli, conf.clone()))); + } + Err(e) => { + warn!("failed to start recovery task: {:?}", e); + } + } + } + + // Start partial backup task which always runs on the timeline. 
+ if conf.is_wal_backup_enabled() && conf.partial_backup_enabled { + match tli.full_access_guard().await { + Ok(tli) => { + partial_backup_task = Some(tokio::spawn(wal_backup_partial::main_task( + tli, + conf.clone(), + ))); + } + Err(e) => { + warn!("failed to start partial backup task: {:?}", e); + } + } + } let last_state = 'outer: loop { MANAGER_ITERATIONS_TOTAL.inc(); @@ -76,47 +142,36 @@ pub async fn main_task( let state_snapshot = StateSnapshot::new(tli.read_shared_state().await, heartbeat_timeout); let num_computes = *num_computes_rx.borrow(); - let is_wal_backup_required = - wal_backup::is_wal_backup_required(wal_seg_size, num_computes, &state_snapshot); + let is_wal_backup_required = update_backup( + &conf, + &tli, + wal_seg_size, + num_computes, + &state_snapshot, + &mut backup_task, + ) + .await; - if conf.is_wal_backup_enabled() { - wal_backup::update_task( - &conf, - ttid, - is_wal_backup_required, - &state_snapshot, - &mut backup_task, - ) - .await; - } + let _is_active = update_is_active( + is_wal_backup_required, + num_computes, + &state_snapshot, + &mut tli_broker_active, + &tli, + ); - let is_active = is_wal_backup_required - || num_computes > 0 - || state_snapshot.remote_consistent_lsn < state_snapshot.commit_lsn; + let next_cfile_save = update_control_file_save(&state_snapshot, &tli).await; - // update the broker timeline set - if tli_broker_active.set(is_active) { - // write log if state has changed - info!( - "timeline active={} now, remote_consistent_lsn={}, commit_lsn={}", - is_active, state_snapshot.remote_consistent_lsn, state_snapshot.commit_lsn, - ); - - MANAGER_ACTIVE_CHANGES.inc(); - - if !is_active { - // TODO: maybe use tokio::spawn? 
- if let Err(e) = tli.maybe_persist_control_file(false).await { - warn!("control file save in update_status failed: {:?}", e); - } - } - } - - // update the state in Arc - tli.wal_backup_active - .store(backup_task.is_some(), std::sync::atomic::Ordering::Relaxed); - tli.broker_active - .store(is_active, std::sync::atomic::Ordering::Relaxed); + update_wal_removal( + &conf, + walsenders, + &tli, + wal_seg_size, + &state_snapshot, + last_removed_segno, + &mut wal_removal_task, + ) + .await; // wait until something changes. tx channels are stored under Arc, so they will not be // dropped until the manager task is finished. @@ -135,11 +190,192 @@ pub async fn main_task( _ = num_computes_rx.changed() => { // number of connected computes was updated } + _ = async { + if let Some(timeout) = next_cfile_save { + tokio::time::sleep_until(timeout).await + } else { + futures::future::pending().await + } + } => { + // it's time to save the control file + } + res = async { + if let Some(task) = &mut wal_removal_task { + task.await + } else { + futures::future::pending().await + } + } => { + // WAL removal task finished + wal_removal_task = None; + update_wal_removal_end(res, &tli, &mut last_removed_segno); + } } }; // shutdown background tasks if conf.is_wal_backup_enabled() { - wal_backup::update_task(&conf, ttid, false, &last_state, &mut backup_task).await; + wal_backup::update_task(&conf, &tli, false, &last_state, &mut backup_task).await; + } + + if let Some(recovery_task) = recovery_task { + if let Err(e) = recovery_task.await { + warn!("recovery task failed: {:?}", e); + } + } + + if let Some(partial_backup_task) = partial_backup_task { + if let Err(e) = partial_backup_task.await { + warn!("partial backup task failed: {:?}", e); + } + } + + if let Some(wal_removal_task) = wal_removal_task { + let res = wal_removal_task.await; + update_wal_removal_end(res, &tli, &mut last_removed_segno); } } + +/// Spawns/kills backup task and returns true if backup is required. 
+async fn update_backup( + conf: &SafeKeeperConf, + tli: &Arc, + wal_seg_size: usize, + num_computes: usize, + state: &StateSnapshot, + backup_task: &mut Option, +) -> bool { + let is_wal_backup_required = + wal_backup::is_wal_backup_required(wal_seg_size, num_computes, state); + + if conf.is_wal_backup_enabled() { + wal_backup::update_task(conf, tli, is_wal_backup_required, state, backup_task).await; + } + + // update the state in Arc + tli.wal_backup_active + .store(backup_task.is_some(), std::sync::atomic::Ordering::Relaxed); + is_wal_backup_required +} + +/// Update is_active flag and returns its value. +fn update_is_active( + is_wal_backup_required: bool, + num_computes: usize, + state: &StateSnapshot, + tli_broker_active: &mut TimelineSetGuard, + tli: &Arc, +) -> bool { + let is_active = is_wal_backup_required + || num_computes > 0 + || state.remote_consistent_lsn < state.commit_lsn; + + // update the broker timeline set + if tli_broker_active.set(is_active) { + // write log if state has changed + info!( + "timeline active={} now, remote_consistent_lsn={}, commit_lsn={}", + is_active, state.remote_consistent_lsn, state.commit_lsn, + ); + + MANAGER_ACTIVE_CHANGES.inc(); + } + + // update the state in Arc + tli.broker_active + .store(is_active, std::sync::atomic::Ordering::Relaxed); + is_active +} + +/// Save control file if needed. Returns Instant if we should persist the control file in the future. 
+async fn update_control_file_save( + state: &StateSnapshot, + tli: &Arc, +) -> Option { + if !state.inmem_flush_pending { + return None; + } + + if state.cfile_last_persist_at.elapsed() > CF_SAVE_INTERVAL { + let mut write_guard = tli.write_shared_state().await; + // this can be done in the background because it blocks manager task, but flush() should + // be fast enough not to be a problem now + if let Err(e) = write_guard.sk.state.flush().await { + warn!("failed to save control file: {:?}", e); + } + + None + } else { + // we should wait until next CF_SAVE_INTERVAL + Some((state.cfile_last_persist_at + CF_SAVE_INTERVAL).into()) + } +} + +/// Spawns WAL removal task if needed. +async fn update_wal_removal( + conf: &SafeKeeperConf, + walsenders: &Arc, + tli: &Arc, + wal_seg_size: usize, + state: &StateSnapshot, + last_removed_segno: u64, + wal_removal_task: &mut Option>>, +) { + if wal_removal_task.is_some() { + // WAL removal is already in progress + return; + } + + // If enabled, we use LSN of the most lagging walsender as a WAL removal horizon. + // This allows to get better read speed for pageservers that are lagging behind, + // at the cost of keeping more WAL on disk. + let replication_horizon_lsn = if conf.walsenders_keep_horizon { + walsenders.laggard_lsn() + } else { + None + }; + + let removal_horizon_lsn = calc_horizon_lsn(state, replication_horizon_lsn); + let removal_horizon_segno = removal_horizon_lsn + .segment_number(wal_seg_size) + .saturating_sub(1); + + if removal_horizon_segno > last_removed_segno { + // we need to remove WAL + let remover = crate::wal_storage::Storage::remove_up_to( + &tli.read_shared_state().await.sk.wal_store, + removal_horizon_segno, + ); + *wal_removal_task = Some(tokio::spawn( + async move { + remover.await?; + Ok(removal_horizon_segno) + } + .instrument(info_span!("WAL removal", ttid=%tli.ttid)), + )); + } +} + +/// Update the state after WAL removal task finished. 
+fn update_wal_removal_end( + res: Result, JoinError>, + tli: &Arc, + last_removed_segno: &mut u64, +) { + let new_last_removed_segno = match res { + Ok(Ok(segno)) => segno, + Err(e) => { + warn!("WAL removal task failed: {:?}", e); + return; + } + Ok(Err(e)) => { + warn!("WAL removal task failed: {:?}", e); + return; + } + }; + + *last_removed_segno = new_last_removed_segno; + // update the state in Arc + tli.last_removed_segno + .store(new_last_removed_segno, std::sync::atomic::Ordering::Relaxed); +} diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 8d37bd6371..45e08ede3c 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -3,7 +3,7 @@ //! all from the disk on startup and keeping them in memory. use crate::safekeeper::ServerInfo; -use crate::timeline::{Timeline, TimelineError}; +use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError}; use crate::timelines_set::TimelinesSet; use crate::SafeKeeperConf; use anyhow::{bail, Context, Result}; @@ -127,7 +127,7 @@ impl GlobalTimelines { state.get_dependencies() }; - let timelines_dir = conf.tenant_dir(&tenant_id); + let timelines_dir = get_tenant_dir(&conf, &tenant_id); for timelines_dir_entry in std::fs::read_dir(&timelines_dir) .with_context(|| format!("failed to list timelines dir {}", timelines_dir))? { @@ -348,11 +348,7 @@ impl GlobalTimelines { } Err(_) => { // Timeline is not memory, but it may still exist on disk in broken state. - let dir_path = TIMELINES_STATE - .lock() - .unwrap() - .get_conf() - .timeline_dir(ttid); + let dir_path = get_timeline_dir(TIMELINES_STATE.lock().unwrap().get_conf(), ttid); let dir_existed = delete_dir(dir_path)?; Ok(TimelineDeleteForceResult { @@ -401,13 +397,10 @@ impl GlobalTimelines { // Note that we could concurrently create new timelines while we were deleting them, // so the directory may be not empty. 
In this case timelines will have bad state // and timeline background jobs can panic. - delete_dir( - TIMELINES_STATE - .lock() - .unwrap() - .get_conf() - .tenant_dir(tenant_id), - )?; + delete_dir(get_tenant_dir( + TIMELINES_STATE.lock().unwrap().get_conf(), + tenant_id, + ))?; // FIXME: we temporarily disabled removing timelines from the map, see `delete_force` // let tlis_after_delete = Self::get_all_for_tenant(*tenant_id); diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 84680557f9..58591aecfa 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -30,9 +30,9 @@ use tracing::*; use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS}; -use crate::timeline::{PeerInfo, Timeline}; +use crate::timeline::{FullAccessTimeline, PeerInfo, Timeline}; use crate::timeline_manager::StateSnapshot; -use crate::{GlobalTimelines, SafeKeeperConf, WAL_BACKUP_RUNTIME}; +use crate::{SafeKeeperConf, WAL_BACKUP_RUNTIME}; use once_cell::sync::OnceCell; @@ -63,13 +63,13 @@ pub fn is_wal_backup_required( /// is running, kill it. 
pub async fn update_task( conf: &SafeKeeperConf, - ttid: TenantTimelineId, + tli: &Arc, need_backup: bool, state: &StateSnapshot, entry: &mut Option, ) { let (offloader, election_dbg_str) = - determine_offloader(&state.peers, state.backup_lsn, ttid, conf); + determine_offloader(&state.peers, state.backup_lsn, tli.ttid, conf); let elected_me = Some(conf.my_id) == offloader; let should_task_run = need_backup && elected_me; @@ -80,15 +80,8 @@ pub async fn update_task( info!("elected for backup: {}", election_dbg_str); let (shutdown_tx, shutdown_rx) = mpsc::channel(1); - let timeline_dir = conf.timeline_dir(&ttid); - let async_task = backup_task_main( - ttid, - timeline_dir, - conf.workdir.clone(), - conf.backup_parallel_jobs, - shutdown_rx, - ); + let async_task = backup_task_main(tli.clone(), conf.backup_parallel_jobs, shutdown_rx); let handle = if conf.current_thread_runtime { tokio::spawn(async_task) @@ -198,39 +191,32 @@ pub fn init_remote_storage(conf: &SafeKeeperConf) { } struct WalBackupTask { - timeline: Arc, + timeline: FullAccessTimeline, timeline_dir: Utf8PathBuf, - workspace_dir: Utf8PathBuf, wal_seg_size: usize, parallel_jobs: usize, commit_lsn_watch_rx: watch::Receiver, } /// Offload single timeline. 
-#[instrument(name = "WAL backup", skip_all, fields(ttid = %ttid))] -async fn backup_task_main( - ttid: TenantTimelineId, - timeline_dir: Utf8PathBuf, - workspace_dir: Utf8PathBuf, - parallel_jobs: usize, - mut shutdown_rx: Receiver<()>, -) { +#[instrument(name = "WAL backup", skip_all, fields(ttid = %tli.ttid))] +async fn backup_task_main(tli: Arc, parallel_jobs: usize, mut shutdown_rx: Receiver<()>) { let _guard = WAL_BACKUP_TASKS.guard(); + let tli = match tli.full_access_guard().await { + Ok(tli) => tli, + Err(e) => { + error!("backup error: {}", e); + return; + } + }; info!("started"); - let res = GlobalTimelines::get(ttid); - if let Err(e) = res { - error!("backup error: {}", e); - return; - } - let tli = res.unwrap(); let mut wb = WalBackupTask { wal_seg_size: tli.get_wal_seg_size().await, commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(), + timeline_dir: tli.get_timeline_dir(), timeline: tli, - timeline_dir, - workspace_dir, parallel_jobs, }; @@ -297,7 +283,6 @@ impl WalBackupTask { commit_lsn, self.wal_seg_size, &self.timeline_dir, - &self.workspace_dir, self.parallel_jobs, ) .await @@ -319,18 +304,18 @@ impl WalBackupTask { } async fn backup_lsn_range( - timeline: &Arc, + timeline: &FullAccessTimeline, backup_lsn: &mut Lsn, end_lsn: Lsn, wal_seg_size: usize, timeline_dir: &Utf8Path, - workspace_dir: &Utf8Path, parallel_jobs: usize, ) -> Result<()> { if parallel_jobs < 1 { anyhow::bail!("parallel_jobs must be >= 1"); } + let remote_timeline_path = remote_timeline_path(&timeline.ttid)?; let start_lsn = *backup_lsn; let segments = get_segments(start_lsn, end_lsn, wal_seg_size); @@ -343,7 +328,11 @@ async fn backup_lsn_range( loop { let added_task = match iter.next() { Some(s) => { - uploads.push_back(backup_single_segment(s, timeline_dir, workspace_dir)); + uploads.push_back(backup_single_segment( + s, + timeline_dir, + &remote_timeline_path, + )); true } None => false, @@ -381,18 +370,10 @@ async fn backup_lsn_range( async fn backup_single_segment( seg: 
&Segment, timeline_dir: &Utf8Path, - workspace_dir: &Utf8Path, + remote_timeline_path: &RemotePath, ) -> Result { let segment_file_path = seg.file_path(timeline_dir)?; - let remote_segment_path = segment_file_path - .strip_prefix(workspace_dir) - .context("Failed to strip workspace dir prefix") - .and_then(RemotePath::new) - .with_context(|| { - format!( - "Failed to resolve remote part of path {segment_file_path:?} for base {workspace_dir:?}", - ) - })?; + let remote_segment_path = seg.remote_path(remote_timeline_path); let res = backup_object(&segment_file_path, &remote_segment_path, seg.size()).await; if res.is_ok() { @@ -430,6 +411,10 @@ impl Segment { Ok(timeline_dir.join(self.object_name())) } + pub fn remote_path(self, remote_timeline_path: &RemotePath) -> RemotePath { + remote_timeline_path.join(self.object_name()) + } + pub fn size(self) -> usize { (u64::from(self.end_lsn) - u64::from(self.start_lsn)) as usize } @@ -530,8 +515,7 @@ pub async fn read_object( /// when called. pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { let storage = get_configured_remote_storage(); - let ttid_path = Utf8Path::new(&ttid.tenant_id.to_string()).join(ttid.timeline_id.to_string()); - let remote_path = RemotePath::new(&ttid_path)?; + let remote_path = remote_timeline_path(ttid)?; // see DEFAULT_MAX_KEYS_PER_LIST_RESPONSE // const Option unwrap is not stable, otherwise it would be const. @@ -613,15 +597,17 @@ pub async fn copy_s3_segments( .as_ref() .unwrap(); - let relative_dst_path = - Utf8Path::new(&dst_ttid.tenant_id.to_string()).join(dst_ttid.timeline_id.to_string()); - - let remote_path = RemotePath::new(&relative_dst_path)?; + let remote_dst_path = remote_timeline_path(dst_ttid)?; let cancel = CancellationToken::new(); let files = storage - .list(Some(&remote_path), ListingMode::NoDelimiter, None, &cancel) + .list( + Some(&remote_dst_path), + ListingMode::NoDelimiter, + None, + &cancel, + ) .await? 
.keys; @@ -635,9 +621,6 @@ pub async fn copy_s3_segments( uploaded_segments ); - let relative_src_path = - Utf8Path::new(&src_ttid.tenant_id.to_string()).join(src_ttid.timeline_id.to_string()); - for segno in from_segment..to_segment { if segno % SEGMENTS_PROGRESS_REPORT_INTERVAL == 0 { info!("copied all segments from {} until {}", from_segment, segno); @@ -649,8 +632,8 @@ pub async fn copy_s3_segments( } debug!("copying segment {}", segment_name); - let from = RemotePath::new(&relative_src_path.join(&segment_name))?; - let to = RemotePath::new(&relative_dst_path.join(&segment_name))?; + let from = remote_timeline_path(src_ttid)?.join(&segment_name); + let to = remote_dst_path.join(&segment_name); storage.copy_object(&from, &to, &cancel).await?; } @@ -661,3 +644,8 @@ pub async fn copy_s3_segments( ); Ok(()) } + +/// Get S3 (remote_storage) prefix path used for timeline files. +pub fn remote_timeline_path(ttid: &TenantTimelineId) -> Result { + RemotePath::new(&Utf8Path::new(&ttid.tenant_id.to_string()).join(ttid.timeline_id.to_string())) +} diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 29e944bff3..6c0f35095b 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -18,22 +18,21 @@ //! This way control file stores information about all potentially existing //! remote partial segments and can clean them up after uploading a newer version. 
-use std::sync::Arc; - use camino::Utf8PathBuf; use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; use rand::Rng; use remote_storage::RemotePath; use serde::{Deserialize, Serialize}; -use tracing::{debug, error, info, instrument}; +use tracing::{debug, error, info, instrument, warn}; use utils::lsn::Lsn; use crate::{ metrics::{PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS}, safekeeper::Term, - timeline::Timeline, - wal_backup, SafeKeeperConf, + timeline::FullAccessTimeline, + wal_backup::{self, remote_timeline_path}, + SafeKeeperConf, }; #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] @@ -83,10 +82,10 @@ impl State { struct PartialBackup { wal_seg_size: usize, - tli: Arc, + tli: FullAccessTimeline, conf: SafeKeeperConf, local_prefix: Utf8PathBuf, - remote_prefix: Utf8PathBuf, + remote_timeline_path: RemotePath, state: State, } @@ -153,7 +152,7 @@ impl PartialBackup { let backup_bytes = flush_lsn.segment_offset(self.wal_seg_size); let local_path = self.local_prefix.join(self.local_segment_name(segno)); - let remote_path = RemotePath::new(self.remote_prefix.join(&prepared.name).as_ref())?; + let remote_path = self.remote_timeline_path.join(&prepared.name); // Upload first `backup_bytes` bytes of the segment to the remote storage. 
wal_backup::backup_partial_segment(&local_path, &remote_path, backup_bytes).await?; @@ -253,7 +252,7 @@ impl PartialBackup { info!("deleting objects: {:?}", segments_to_delete); let mut objects_to_delete = vec![]; for seg in segments_to_delete.iter() { - let remote_path = RemotePath::new(self.remote_prefix.join(seg).as_ref())?; + let remote_path = self.remote_timeline_path.join(seg); objects_to_delete.push(remote_path); } @@ -273,7 +272,7 @@ impl PartialBackup { } #[instrument(name = "Partial backup", skip_all, fields(ttid = %tli.ttid))] -pub async fn main_task(tli: Arc, conf: SafeKeeperConf) { +pub async fn main_task(tli: FullAccessTimeline, conf: SafeKeeperConf) { debug!("started"); let await_duration = conf.partial_backup_timeout; @@ -289,11 +288,11 @@ pub async fn main_task(tli: Arc, conf: SafeKeeperConf) { let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx(); let wal_seg_size = tli.get_wal_seg_size().await; - let local_prefix = tli.timeline_dir.clone(); - let remote_prefix = match tli.timeline_dir.strip_prefix(&conf.workdir) { - Ok(path) => path.to_owned(), + let local_prefix = tli.get_timeline_dir(); + let remote_timeline_path = match remote_timeline_path(&tli.ttid) { + Ok(path) => path, Err(e) => { - error!("failed to strip workspace dir prefix: {:?}", e); + error!("failed to create remote path: {:?}", e); return; } }; @@ -304,12 +303,28 @@ pub async fn main_task(tli: Arc, conf: SafeKeeperConf) { state: persistent_state.partial_backup, conf, local_prefix, - remote_prefix, + remote_timeline_path, }; debug!("state: {:?}", backup.state); + // The general idea is that each safekeeper keeps only one partial segment + // both in remote storage and in local state. If this is not true, something + // went wrong. 
+ const MAX_SIMULTANEOUS_SEGMENTS: usize = 10; + 'outer: loop { + if backup.state.segments.len() > MAX_SIMULTANEOUS_SEGMENTS { + warn!( + "too many segments in control_file state, running gc: {}", + backup.state.segments.len() + ); + + backup.gc().await.unwrap_or_else(|e| { + error!("failed to run gc: {:#}", e); + }); + } + // wait until we have something to upload let uploaded_segment = backup.state.uploaded_segment(); if let Some(seg) = &uploaded_segment { diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 6bc8c7c3f9..45e27e1951 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -25,7 +25,7 @@ use utils::crashsafe::durable_rename; use crate::metrics::{time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS}; use crate::state::TimelinePersistentState; -use crate::wal_backup::read_object; +use crate::wal_backup::{read_object, remote_timeline_path}; use crate::SafeKeeperConf; use postgres_ffi::waldecoder::WalStreamDecoder; use postgres_ffi::XLogFileName; @@ -536,7 +536,7 @@ async fn remove_segments_from_disk( } pub struct WalReader { - workdir: Utf8PathBuf, + remote_path: RemotePath, timeline_dir: Utf8PathBuf, wal_seg_size: usize, pos: Lsn, @@ -558,7 +558,7 @@ pub struct WalReader { impl WalReader { pub fn new( - workdir: Utf8PathBuf, + ttid: &TenantTimelineId, timeline_dir: Utf8PathBuf, state: &TimelinePersistentState, start_pos: Lsn, @@ -586,7 +586,7 @@ impl WalReader { } Ok(Self { - workdir, + remote_path: remote_timeline_path(ttid)?, timeline_dir, wal_seg_size: state.server.wal_seg_size as usize, pos: start_pos, @@ -684,7 +684,7 @@ impl WalReader { let xlogoff = self.pos.segment_offset(self.wal_seg_size); let segno = self.pos.segment_number(self.wal_seg_size); let wal_file_name = XLogFileName(PG_TLI, segno, self.wal_seg_size); - let wal_file_path = self.timeline_dir.join(wal_file_name); + let wal_file_path = self.timeline_dir.join(&wal_file_name); // Try to open local file, if we may have WAL locally 
if self.pos >= self.local_start_lsn { @@ -712,16 +712,7 @@ impl WalReader { // Try to open remote file, if remote reads are enabled if self.enable_remote_read { - let remote_wal_file_path = wal_file_path - .strip_prefix(&self.workdir) - .context("Failed to strip workdir prefix") - .and_then(RemotePath::new) - .with_context(|| { - format!( - "Failed to resolve remote part of path {:?} for base {:?}", - wal_file_path, self.workdir, - ) - })?; + let remote_wal_file_path = self.remote_path.join(&wal_file_name); return read_object(&remote_wal_file_path, xlogoff as u64).await; } diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 604ad6fbaa..bbb6d2cb32 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -142,52 +142,6 @@ async fn handle_tenant_create( ) } -// For tenant and timeline deletions, which both implement an "initially return 202, then 404 once -// we're done" semantic, we wrap with a retry loop to expose a simpler API upstream. This avoids -// needing to track a "deleting" state for tenants. -async fn deletion_wrapper(service: Arc, f: F) -> Result, ApiError> -where - R: std::future::Future> + Send + 'static, - F: Fn(Arc) -> R + Send + Sync + 'static, -{ - let started_at = Instant::now(); - // To keep deletion reasonably snappy for small tenants, initially check after 1 second if deletion - // completed. - let mut retry_period = Duration::from_secs(1); - // On subsequent retries, wait longer. 
- let max_retry_period = Duration::from_secs(5); - // Enable callers with a 30 second request timeout to reliably get a response - let max_wait = Duration::from_secs(25); - - loop { - let status = f(service.clone()).await?; - match status { - StatusCode::ACCEPTED => { - tracing::info!("Deletion accepted, waiting to try again..."); - tokio::time::sleep(retry_period).await; - retry_period = max_retry_period; - } - StatusCode::NOT_FOUND => { - tracing::info!("Deletion complete"); - return json_response(StatusCode::OK, ()); - } - _ => { - tracing::warn!("Unexpected status {status}"); - return json_response(status, ()); - } - } - - let now = Instant::now(); - if now + retry_period > started_at + max_wait { - tracing::info!("Deletion timed out waiting for 404"); - // REQUEST_TIMEOUT would be more appropriate, but CONFLICT is already part of - // the pageserver's swagger definition for this endpoint, and has the same desired - // effect of causing the control plane to retry later. - return json_response(StatusCode::CONFLICT, ()); - } - } -} - async fn handle_tenant_location_config( service: Arc, mut req: Request, @@ -283,13 +237,17 @@ async fn handle_tenant_delete( let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; - deletion_wrapper(service, move |service| async move { - service - .tenant_delete(tenant_id) - .await - .and_then(map_reqwest_hyper_status) - }) - .await + let status_code = service + .tenant_delete(tenant_id) + .await + .and_then(map_reqwest_hyper_status)?; + + if status_code == StatusCode::NOT_FOUND { + // The pageserver uses 404 for successful deletion, but we use 200 + json_response(StatusCode::OK, ()) + } else { + json_response(status_code, ()) + } } async fn handle_tenant_timeline_create( @@ -317,6 +275,51 @@ async fn handle_tenant_timeline_delete( let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + // For timeline deletions, which both implement an "initially return 
202, then 404 once + // we're done" semantic, we wrap with a retry loop to expose a simpler API upstream. + async fn deletion_wrapper(service: Arc, f: F) -> Result, ApiError> + where + R: std::future::Future> + Send + 'static, + F: Fn(Arc) -> R + Send + Sync + 'static, + { + let started_at = Instant::now(); + // To keep deletion reasonably snappy for small tenants, initially check after 1 second if deletion + // completed. + let mut retry_period = Duration::from_secs(1); + // On subsequent retries, wait longer. + let max_retry_period = Duration::from_secs(5); + // Enable callers with a 30 second request timeout to reliably get a response + let max_wait = Duration::from_secs(25); + + loop { + let status = f(service.clone()).await?; + match status { + StatusCode::ACCEPTED => { + tracing::info!("Deletion accepted, waiting to try again..."); + tokio::time::sleep(retry_period).await; + retry_period = max_retry_period; + } + StatusCode::NOT_FOUND => { + tracing::info!("Deletion complete"); + return json_response(StatusCode::OK, ()); + } + _ => { + tracing::warn!("Unexpected status {status}"); + return json_response(status, ()); + } + } + + let now = Instant::now(); + if now + retry_period > started_at + max_wait { + tracing::info!("Deletion timed out waiting for 404"); + // REQUEST_TIMEOUT would be more appropriate, but CONFLICT is already part of + // the pageserver's swagger definition for this endpoint, and has the same desired + // effect of causing the control plane to retry later. 
+ return json_response(StatusCode::CONFLICT, ()); + } + } + } + deletion_wrapper(service, move |service| async move { service .tenant_timeline_delete(tenant_id, timeline_id) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index f914f4e0bb..756dc10a2a 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2376,61 +2376,80 @@ impl Service { let _tenant_lock = trace_exclusive_lock(&self.tenant_op_locks, tenant_id, TenantOperations::Delete).await; - self.ensure_attached_wait(tenant_id).await?; - - // TODO: refactor into helper - let targets = { - let locked = self.inner.read().unwrap(); - let mut targets = Vec::new(); - + // Detach all shards + let (detach_waiters, shard_ids, node) = { + let mut shard_ids = Vec::new(); + let mut detach_waiters = Vec::new(); + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); for (tenant_shard_id, shard) in - locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { - let node_id = shard.intent.get_attached().ok_or_else(|| { - ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled")) - })?; - let node = locked - .nodes - .get(&node_id) - .expect("Pageservers may not be deleted while referenced"); + shard_ids.push(*tenant_shard_id); - targets.push((*tenant_shard_id, node.clone())); + // Update the tenant's intent to remove all attachments + shard.policy = PlacementPolicy::Detached; + shard + .schedule(scheduler, &mut ScheduleContext::default()) + .expect("De-scheduling is infallible"); + debug_assert!(shard.intent.get_attached().is_none()); + debug_assert!(shard.intent.get_secondary().is_empty()); + + if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { + detach_waiters.push(waiter); + } } - targets + + // Pick an arbitrary node to use for remote deletions (does not have to be where the tenant + // was attached, just has to be 
able to see the S3 content) + let node_id = scheduler.schedule_shard(&[], &ScheduleContext::default())?; + let node = nodes + .get(&node_id) + .expect("Pageservers may not be deleted while lock is active"); + (detach_waiters, shard_ids, node.clone()) }; - // Phase 1: delete on the pageservers - let mut any_pending = false; - for (tenant_shard_id, node) in targets { - let client = PageserverClient::new( - node.get_id(), - node.base_url(), - self.config.jwt_token.as_deref(), - ); - // TODO: this, like many other places, requires proper retry handling for 503, timeout: those should not - // surface immediately as an error to our caller. - let status = client.tenant_delete(tenant_shard_id).await.map_err(|e| { - ApiError::InternalServerError(anyhow::anyhow!( - "Error deleting shard {tenant_shard_id} on node {node}: {e}", - )) - })?; - tracing::info!( - "Shard {tenant_shard_id} on node {node}, delete returned {}", - status - ); - if status == StatusCode::ACCEPTED { - any_pending = true; - } + if let Err(e) = self.await_waiters(detach_waiters, RECONCILE_TIMEOUT).await { + // Failing to detach shouldn't hold up deletion, e.g. if a node is offline we should be able + // to use some other node to run the remote deletion. + tracing::warn!("Failed to detach some locations: {e}"); } - if any_pending { - // Caller should call us again later. When we eventually see 404s from - // all the shards, we may proceed to delete our records of the tenant. 
- tracing::info!( - "Tenant {} has some shards pending deletion, returning 202", - tenant_id - ); - return Ok(StatusCode::ACCEPTED); + let locations = shard_ids + .into_iter() + .map(|s| (s, node.clone())) + .collect::>(); + let results = self.tenant_for_shards_api( + locations, + |tenant_shard_id, client| async move { client.tenant_delete(tenant_shard_id).await }, + 1, + 3, + RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; + for result in results { + match result { + Ok(StatusCode::ACCEPTED) => { + // This could happen if we failed detach above, and hit a pageserver where the tenant + // is still attached: it will accept the deletion in the background + tracing::warn!( + "Unexpectedly still attached on {}, client should retry", + node + ); + return Ok(StatusCode::ACCEPTED); + } + Ok(_) => {} + Err(mgmt_api::Error::Cancelled) => { + return Err(ApiError::ShuttingDown); + } + Err(e) => { + // This is unexpected: remote deletion should be infallible, unless the object store + // at large is unavailable. + tracing::error!("Error deleting via node {}: {e}", node); + return Err(ApiError::InternalServerError(anyhow::anyhow!(e))); + } + } } // Fall through: deletion of the tenant on pageservers is complete, we may proceed to drop diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py index e9be765669..147264762c 100644 --- a/test_runner/fixtures/common_types.py +++ b/test_runner/fixtures/common_types.py @@ -72,6 +72,18 @@ class Lsn: def segment_lsn(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> "Lsn": return Lsn(self.lsn_int - (self.lsn_int % seg_sz)) + def segno(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> int: + return self.lsn_int // seg_sz + + def segment_name(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> str: + segno = self.segno(seg_sz) + # The filename format is 00000001XXXXXXXX000000YY, where XXXXXXXXYY is segno in hex. 
+ # XXXXXXXX is the higher 8 hex digits of segno + high_bits = segno >> 8 + # YY is the lower 2 hex digits of segno + low_bits = segno & 0xFF + return f"00000001{high_bits:08X}000000{low_bits:02X}" + @dataclass(frozen=True) class Key: diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b8ef63faa9..a25b8bfca1 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -973,6 +973,9 @@ class NeonEnvBuilder: for pageserver in self.env.pageservers: pageserver.assert_no_errors() + for safekeeper in self.env.safekeepers: + safekeeper.assert_no_errors() + self.env.storage_controller.assert_no_errors() try: @@ -3813,6 +3816,9 @@ class Safekeeper(LogUtils): self.running = False return self + def assert_no_errors(self): + assert not self.log_contains("manager task finished prematurely") + def append_logical_message( self, tenant_id: TenantId, timeline_id: TimelineId, request: Dict[str, Any] ) -> Dict[str, Any]: @@ -3898,6 +3904,15 @@ class Safekeeper(LogUtils): """ cli = self.http_client() + target_segment_file = lsn.segment_name() + + def are_segments_removed(): + segments = self.list_segments(tenant_id, timeline_id) + log.info( + f"waiting for all segments before {target_segment_file} to be removed from sk {self.id}, current segments: {segments}" + ) + assert all(target_segment_file <= s for s in segments) + def are_lsns_advanced(): stat = cli.timeline_status(tenant_id, timeline_id) log.info( @@ -3909,6 +3924,7 @@ class Safekeeper(LogUtils): # pageserver to this safekeeper wait_until(30, 1, are_lsns_advanced) cli.checkpoint(tenant_id, timeline_id) + wait_until(30, 1, are_segments_removed) def wait_until_paused(self, failpoint: str): msg = f"at failpoint {failpoint}" @@ -3982,6 +3998,30 @@ class S3Scrubber: ) log.info(f"tenant-snapshot output: {stdout}") + def pageserver_physical_gc( + self, min_age_secs: int, tenant_ids: Optional[list[TenantId]] = None + ): + args = 
["pageserver-physical-gc", "--min-age", f"{min_age_secs}s"] + + if tenant_ids is None: + tenant_ids = [] + + for tenant_id in tenant_ids: + args.extend(["--tenant-id", str(tenant_id)]) + + stdout = self.scrubber_cli( + args, + timeout=30, + ) + try: + return json.loads(stdout) + except: + log.error( + "Failed to decode JSON output from `pageserver-physical_gc`. Dumping stdout:" + ) + log.error(stdout) + raise + def _get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str) -> Path: """Compute the path to a working directory for an individual test.""" diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index ad8bbe2021..ef412cade7 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -66,7 +66,7 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress ".*task iteration took longer than the configured period.*", # these can happen anytime we do compactions from background task and shutdown pageserver - r".*ERROR.*ancestor timeline \S+ is being stopped", + ".*could not compact.*cancelled.*", # this is expected given our collaborative shutdown approach for the UploadQueue ".*Compaction failed.*, retrying in .*: Other\\(queue is in state Stopped.*", ".*Compaction failed.*, retrying in .*: ShuttingDown", diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index ee18c53b52..6f6526d3fc 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -171,6 +171,8 @@ class S3Storage: """Is this MOCK_S3 (false) or REAL_S3 (true)""" real: bool endpoint: Optional[str] = None + """formatting deserialized with humantime crate, for example "1s".""" + custom_timeout: Optional[str] = None def access_env_vars(self) -> Dict[str, str]: if self.aws_profile is not None: @@ -208,6 +210,9 
@@ class S3Storage: if self.endpoint is not None: rv["endpoint"] = self.endpoint + if self.custom_timeout is not None: + rv["timeout"] = self.custom_timeout + return rv def to_toml_inline_table(self) -> str: diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index a5480f557f..11e6fef28f 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -19,7 +19,8 @@ class Walreceiver: @dataclass class SafekeeperTimelineStatus: - acceptor_epoch: int + term: int + last_log_term: int pg_version: int # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 flush_lsn: Lsn commit_lsn: Lsn @@ -156,7 +157,8 @@ class SafekeeperHttpClient(requests.Session): resj = res.json() walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]] return SafekeeperTimelineStatus( - acceptor_epoch=resj["acceptor_state"]["epoch"], + term=resj["acceptor_state"]["term"], + last_log_term=resj["acceptor_state"]["epoch"], pg_version=resj["pg_info"]["pg_version"], flush_lsn=Lsn(resj["flush_lsn"]), commit_lsn=Lsn(resj["commit_lsn"]), diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 4850a5c688..49dcb9b86a 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -81,11 +81,19 @@ page_cache_size=10 non_vectored_sum = metrics.query_one("pageserver_layers_visited_per_read_global_sum") non_vectored_count = metrics.query_one("pageserver_layers_visited_per_read_global_count") - non_vectored_average = non_vectored_sum.value / non_vectored_count.value - + if non_vectored_count.value != 0: + non_vectored_average = non_vectored_sum.value / non_vectored_count.value + else: + non_vectored_average = 0 vectored_sum = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_sum") vectored_count = 
metrics.query_one("pageserver_layers_visited_per_vectored_read_global_count") - vectored_average = vectored_sum.value / vectored_count.value + if vectored_count.value > 0: + assert vectored_sum.value > 0 + vectored_average = vectored_sum.value / vectored_count.value + else: + # special case: running local tests with default legacy configuration + assert vectored_sum.value == 0 + vectored_average = 0 log.info(f"{non_vectored_average=} {vectored_average=}") @@ -230,7 +238,7 @@ def test_uploads_and_deletions( # https://github.com/neondatabase/neon/issues/7707 # https://github.com/neondatabase/neon/issues/7759 allowed_errors = [ - ".*duplicated L1 layer.*", + ".*/checkpoint.*rename temporary file as correct path for.*", # EEXIST ".*delta layer created with.*duplicate values.*", ".*assertion failed: self.lsn_range.start <= lsn.*", ".*HTTP request handler task panicked: task.*panicked.*", diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index b137fb3a5c..4a25dfd874 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -3,8 +3,10 @@ import time from collections import defaultdict +from concurrent.futures import ThreadPoolExecutor from typing import Any, DefaultDict, Dict, Tuple +import pytest from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import ( @@ -13,7 +15,7 @@ from fixtures.neon_fixtures import ( last_flush_lsn_upload, wait_for_last_flush_lsn, ) -from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pageserver.utils import ( assert_tenant_state, wait_for_last_record_lsn, @@ -21,7 +23,7 @@ from fixtures.pageserver.utils import ( wait_for_upload_queue_empty, wait_until_tenant_active, ) -from fixtures.remote_storage import RemoteStorageKind +from fixtures.remote_storage import RemoteStorageKind, 
S3Storage, s3_storage from fixtures.utils import query_scalar, wait_until @@ -402,7 +404,7 @@ def test_download_remote_layers_api( env.pageserver.allowed_errors.extend( [ ".*download failed: downloading evicted layer file failed.*", - f".*initial_size_calculation.*{tenant_id}.*{timeline_id}.*initial size calculation failed: downloading evicted layer file failed", + f".*initial_size_calculation.*{tenant_id}.*{timeline_id}.*initial size calculation failed.*downloading evicted layer file failed", ] ) @@ -656,5 +658,200 @@ def test_compaction_downloads_on_demand_with_image_creation(neon_env_builder: Ne assert dict(kinds_after) == {"Delta": 4, "Image": 1} +def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBuilder): + """ + Demonstrates that tenant shutdown will cancel on-demand download and secondary doing warmup. + """ + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + + # turn off background tasks so that they don't interfere with the downloads + env = neon_env_builder.init_start( + initial_tenant_conf={ + "gc_period": "0s", + "compaction_period": "0s", + } + ) + client = env.pageserver.http_client() + failpoint = "before-downloading-layer-stream-pausable" + client.configure_failpoints((failpoint, "pause")) + + env.pageserver.allowed_errors.extend( + [ + ".*downloading failed, possibly for shutdown.*", + ] + ) + + info = client.layer_map_info(env.initial_tenant, env.initial_timeline) + assert len(info.delta_layers()) == 1 + + layer = info.delta_layers()[0] + + client.tenant_heatmap_upload(env.initial_tenant) + + # evict the initdb layer so we can download it + client.evict_layer(env.initial_tenant, env.initial_timeline, layer.layer_file_name) + + with ThreadPoolExecutor(max_workers=2) as exec: + download = exec.submit( + client.download_layer, + env.initial_tenant, + env.initial_timeline, + layer.layer_file_name, + ) + + _, offset = wait_until( + 20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint 
{failpoint}") + ) + + location_conf = {"mode": "Detached", "tenant_conf": {}} + # assume detach removes the layers + detach = exec.submit(client.tenant_location_conf, env.initial_tenant, location_conf) + + _, offset = wait_until( + 20, + 0.5, + lambda: env.pageserver.assert_log_contains( + "closing is taking longer than expected", offset + ), + ) + + client.configure_failpoints((failpoint, "off")) + + with pytest.raises( + PageserverApiException, match="downloading failed, possibly for shutdown" + ): + download.result() + + env.pageserver.assert_log_contains(".*downloading failed, possibly for shutdown.*") + + detach.result() + + client.configure_failpoints((failpoint, "pause")) + + _, offset = wait_until( + 20, + 0.5, + lambda: env.pageserver.assert_log_contains(f"cfg failpoint: {failpoint} pause", offset), + ) + + location_conf = { + "mode": "Secondary", + "secondary_conf": {"warm": True}, + "tenant_conf": {}, + } + + client.tenant_location_conf(env.initial_tenant, location_conf) + + warmup = exec.submit(client.tenant_secondary_download, env.initial_tenant, wait_ms=30000) + + _, offset = wait_until( + 20, + 0.5, + lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}", offset), + ) + + client.configure_failpoints((failpoint, "off")) + location_conf = {"mode": "Detached", "tenant_conf": {}} + client.tenant_location_conf(env.initial_tenant, location_conf) + + client.configure_failpoints((failpoint, "off")) + + # here we have nothing in the log, but we see that the warmup and conf location update worked + warmup.result() + + +def test_layer_download_timeouted(neon_env_builder: NeonEnvBuilder): + """ + Pause using a pausable_failpoint longer than the client timeout to simulate the timeout happening. 
+ """ + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + assert isinstance(neon_env_builder.pageserver_remote_storage, S3Storage) + neon_env_builder.pageserver_remote_storage.custom_timeout = "1s" + + # turn off background tasks so that they don't interfere with the downloads + env = neon_env_builder.init_start( + initial_tenant_conf={ + "gc_period": "0s", + "compaction_period": "0s", + } + ) + client = env.pageserver.http_client() + failpoint = "before-downloading-layer-stream-pausable" + client.configure_failpoints((failpoint, "pause")) + + info = client.layer_map_info(env.initial_tenant, env.initial_timeline) + assert len(info.delta_layers()) == 1 + + layer = info.delta_layers()[0] + + client.tenant_heatmap_upload(env.initial_tenant) + + # evict so we can download it + client.evict_layer(env.initial_tenant, env.initial_timeline, layer.layer_file_name) + + with ThreadPoolExecutor(max_workers=2) as exec: + download = exec.submit( + client.download_layer, + env.initial_tenant, + env.initial_timeline, + layer.layer_file_name, + ) + + _, offset = wait_until( + 20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + ) + # ensure enough time while paused to trip the timeout + time.sleep(2) + + client.configure_failpoints((failpoint, "off")) + download.result() + + _, offset = env.pageserver.assert_log_contains( + ".*failed, will retry \\(attempt 0\\): timeout.*" + ) + _, offset = env.pageserver.assert_log_contains(".*succeeded after [0-9]+ retries.*", offset) + + client.evict_layer(env.initial_tenant, env.initial_timeline, layer.layer_file_name) + + client.configure_failpoints((failpoint, "pause")) + + # capture the next offset for a new synchronization with the failpoint + _, offset = wait_until( + 20, + 0.5, + lambda: env.pageserver.assert_log_contains(f"cfg failpoint: {failpoint} pause", offset), + ) + + location_conf = { + "mode": "Secondary", + "secondary_conf": {"warm": True}, + "tenant_conf": {}, + } + + 
client.tenant_location_conf( + env.initial_tenant, + location_conf, + ) + + started = time.time() + + warmup = exec.submit(client.tenant_secondary_download, env.initial_tenant, wait_ms=30000) + # ensure enough time while paused to trip the timeout + time.sleep(2) + + client.configure_failpoints((failpoint, "off")) + + warmup.result() + + elapsed = time.time() - started + + _, offset = env.pageserver.assert_log_contains( + ".*failed, will retry \\(attempt 0\\): timeout.*", offset + ) + _, offset = env.pageserver.assert_log_contains(".*succeeded after [0-9]+ retries.*", offset) + + assert elapsed < 30, "too long passed: {elapsed=}" + + def stringify(conf: Dict[str, Any]) -> Dict[str, str]: return dict(map(lambda x: (x[0], str(x[1])), conf.items())) diff --git a/test_runner/regress/test_duplicate_layers.py b/test_runner/regress/test_pageserver_crash_consistency.py similarity index 66% rename from test_runner/regress/test_duplicate_layers.py rename to test_runner/regress/test_pageserver_crash_consistency.py index 0ebb99c712..3831d2f917 100644 --- a/test_runner/regress/test_duplicate_layers.py +++ b/test_runner/regress/test_pageserver_crash_consistency.py @@ -12,42 +12,14 @@ from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from requests.exceptions import ConnectionError -def test_duplicate_layers(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): - env = neon_env_builder.init_start() - pageserver_http = env.pageserver.http_client() - - # use a failpoint to return all L0s as L1s - message = ".*duplicated L1 layer layer=.*" - env.pageserver.allowed_errors.append(message) - - # Use aggressive compaction and checkpoint settings - tenant_id, _ = env.neon_cli.create_tenant( - conf={ - "checkpoint_distance": f"{1024 ** 2}", - "compaction_target_size": f"{1024 ** 2}", - "compaction_period": "5 s", - "compaction_threshold": "3", - } - ) - - pageserver_http.configure_failpoints(("compact-level0-phase1-return-same", "return")) - - endpoint = 
env.endpoints.create_start("main", tenant_id=tenant_id) - connstr = endpoint.connstr(options="-csynchronous_commit=off") - pg_bin.run_capture(["pgbench", "-i", "-s1", connstr]) - - time.sleep(10) # let compaction to be performed - env.pageserver.assert_log_contains("compact-level0-phase1-return-same") - - -def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): +def test_local_only_layers_after_crash(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): """ - Test sets fail point at the end of first compaction phase: after - flushing new L1 layer but before deletion of L0 layers. + Test case for docs/rfcs/027-crash-consistent-layer-map-through-index-part.md. - The L1 used to be overwritten, but with crash-consistency via remote - index_part.json, we end up deleting the not yet uploaded L1 layer on - startup. + Simulate crash after compaction has written layers to disk + but before they have been uploaded/linked into remote index_part.json. + + Startup handles this situation by deleting the not yet uploaded L1 layer files. """ neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) @@ -126,13 +98,6 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) # give time for log flush time.sleep(1) - message = f".*duplicated L1 layer layer={l1_found}" - found_msg = env.pageserver.log_contains(message) - # resident or evicted, it should not be overwritten, however it should had been non-existing at startup - assert ( - found_msg is None - ), "layer should had been removed during startup, did it live on as evicted?" 
- assert env.pageserver.layer_exists(tenant_id, timeline_id, l1_found), "the L1 reappears" wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id) @@ -141,3 +106,6 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) tenant_id, timeline_id, l1_found.to_str() ) assert uploaded.exists(), "the L1 is uploaded" + + +# TODO: same test for L0s produced by ingest. diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 759e845927..4ce53df214 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -163,11 +163,6 @@ def test_pageserver_chaos( env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) - # these can happen, if we shutdown at a good time. to be fixed as part of #5172. - message = ".*duplicated L1 layer layer=.*" - for ps in env.pageservers: - ps.allowed_errors.append(message) - # Use a tiny checkpoint distance, to create a lot of layers quickly. # That allows us to stress the compaction and layer flushing logic more. 
tenant, _ = env.neon_cli.create_tenant( diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 25a3f8521c..5bfa9cce8c 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -15,7 +15,7 @@ from fixtures.pageserver.utils import ( tenant_delete_wait_completed, wait_for_upload_queue_empty, ) -from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage +from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage, s3_storage from fixtures.utils import wait_until from fixtures.workload import Workload @@ -73,7 +73,7 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): """ neon_env_builder.num_pageservers = 3 neon_env_builder.enable_pageserver_remote_storage( - remote_storage_kind=RemoteStorageKind.MOCK_S3, + remote_storage_kind=s3_storage(), ) env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) @@ -100,10 +100,6 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): ] ) - # these can happen, if we shutdown at a good time. to be fixed as part of #5172. - message = ".*duplicated L1 layer layer=.*" - ps.allowed_errors.append(message) - workload = Workload(env, tenant_id, timeline_id) workload.init(env.pageservers[0].id) workload.write_rows(256, env.pageservers[0].id) @@ -215,6 +211,13 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): ) workload.validate(pageserver.id) + # Having done a bunch of attach/detach cycles, we will have generated some index garbage: check + # that the scrubber sees it and cleans it up. We do this before the final attach+validate pass, + # to also validate that the scrubber isn't breaking anything. 
+ gc_summary = S3Scrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1) + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] > 0 + # Attach all pageservers for ps in env.pageservers: location_conf = {"mode": "AttachedMulti", "secondary_conf": None, "tenant_conf": {}} @@ -227,10 +230,11 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): # Detach all pageservers for ps in env.pageservers: location_conf = {"mode": "Detached", "secondary_conf": None, "tenant_conf": {}} + assert ps.list_layers(tenant_id, timeline_id) != [] ps.tenant_location_configure(tenant_id, location_conf) - # Confirm that all local disk state was removed on detach - # TODO + # Confirm that all local disk state was removed on detach + assert ps.list_layers(tenant_id, timeline_id) == [] def test_live_migration(neon_env_builder: NeonEnvBuilder): diff --git a/test_runner/regress/test_s3_scrubber.py b/test_runner/regress/test_s3_scrubber.py index 8981000c24..6baba190f3 100644 --- a/test_runner/regress/test_s3_scrubber.py +++ b/test_runner/regress/test_s3_scrubber.py @@ -3,7 +3,7 @@ import shutil from typing import Optional import pytest -from fixtures.common_types import TenantShardId +from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.neon_fixtures import ( NeonEnvBuilder, S3Scrubber, @@ -109,3 +109,52 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: # Check we can read everything workload.validate() + + +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.num_pageservers = 2 + + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant(tenant_id, timeline_id, shard_count=shard_count) + + 
workload = Workload(env, tenant_id, timeline_id) + workload.init() + + # We will end up with an index per shard, per cycle, plus one for the initial startup + n_cycles = 4 + expect_indices_per_shard = n_cycles + 1 + shard_count = 1 if shard_count is None else shard_count + + # For each cycle, detach and attach the tenant to bump the generation, and do some writes to generate uploads + for _i in range(0, n_cycles): + env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"}) + env.storage_controller.reconcile_until_idle() + + env.storage_controller.tenant_policy_update(tenant_id, {"placement": {"Attached": 0}}) + env.storage_controller.reconcile_until_idle() + + # This write includes remote upload, will generate an index in this generation + workload.write_rows(1) + + # With a high min_age, the scrubber should decline to delete anything + gc_summary = S3Scrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=3600) + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + + # If targeting a different tenant, the scrubber shouldn't do anything + gc_summary = S3Scrubber(neon_env_builder).pageserver_physical_gc( + min_age_secs=1, tenant_ids=[TenantId.generate()] + ) + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + + # With a low min_age, the scrubber should go ahead and clean up all but the latest 2 generations + gc_summary = S3Scrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1) + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == (expect_indices_per_shard - 2) * shard_count diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 3a9a522f3f..2031feaa83 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -7,6 +7,7 @@ from typing import Any, Dict, List, Union import pytest from 
fixtures.common_types import TenantId, TenantShardId, TimelineId +from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, @@ -18,6 +19,8 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( MANY_SMALL_LAYERS_TENANT_CONFIG, + assert_prefix_empty, + assert_prefix_not_empty, enable_remote_storage_versioning, list_prefix, remote_storage_delete_key, @@ -839,6 +842,86 @@ def test_storage_controller_tenant_conf(neon_env_builder: NeonEnvBuilder): env.storage_controller.consistency_check() +def test_storage_controller_tenant_deletion( + neon_env_builder: NeonEnvBuilder, + compute_reconfigure_listener: ComputeReconfigure, +): + """ + Validate that: + - Deleting a tenant deletes all its shards + - Deletion does not require the compute notification hook to be responsive + - Deleting a tenant also removes all secondary locations + """ + neon_env_builder.num_pageservers = 4 + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.control_plane_compute_hook_api = ( + compute_reconfigure_listener.control_plane_compute_hook_api + ) + + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant( + tenant_id, timeline_id, shard_count=2, placement_policy='{"Attached":1}' + ) + + # Ensure all the locations are configured, including secondaries + env.storage_controller.reconcile_until_idle() + + shard_ids = [ + TenantShardId.parse(shard["shard_id"]) for shard in env.storage_controller.locate(tenant_id) + ] + + # Assert attachments all have local content + for shard_id in shard_ids: + pageserver = env.get_tenant_pageserver(shard_id) + assert pageserver.tenant_dir(shard_id).exists() + + # Assert all shards have some content in remote storage + for shard_id in shard_ids: + assert_prefix_not_empty( + 
neon_env_builder.pageserver_remote_storage, + prefix="/".join( + ( + "tenants", + str(shard_id), + ) + ), + ) + + # Break the compute hook: we are checking that deletion does not depend on the compute hook being available + def break_hook(): + raise RuntimeError("Unexpected call to compute hook") + + compute_reconfigure_listener.register_on_notify(break_hook) + + # No retry loop: deletion should complete in one shot without polling for 202 responses, because + # it cleanly detaches all the shards first, and then deletes them in remote storage + env.storage_controller.pageserver_api().tenant_delete(tenant_id) + + # Assert no pageservers have any local content + for pageserver in env.pageservers: + for shard_id in shard_ids: + assert not pageserver.tenant_dir(shard_id).exists() + + for shard_id in shard_ids: + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix="/".join( + ( + "tenants", + str(shard_id), + ) + ), + ) + + # Assert the tenant is not visible in storage controller API + with pytest.raises(StorageControllerApiException): + env.storage_controller.tenant_describe(tenant_id) + + class Failure: pageserver_id: int diff --git a/test_runner/regress/test_subscriber_restart.py b/test_runner/regress/test_subscriber_restart.py new file mode 100644 index 0000000000..d7f3962620 --- /dev/null +++ b/test_runner/regress/test_subscriber_restart.py @@ -0,0 +1,57 @@ +import threading +import time + +from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import wait_until + + +# This test checks of logical replication subscriber is able to correctly restart replication without receiving duplicates. 
+# It requires tracking information about replication origins at page server side +def test_subscriber_restart(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("publisher") + pub = env.endpoints.create("publisher") + pub.start() + + env.neon_cli.create_branch("subscriber") + sub = env.endpoints.create("subscriber") + sub.start() + + n_records = 100000 + n_restarts = 100 + + def check_that_changes_propagated(): + scur.execute("SELECT count(*) FROM t") + res = scur.fetchall() + assert res[0][0] == n_records + + def insert_data(pub): + with pub.cursor() as pcur: + for i in range(0, n_records): + pcur.execute("INSERT into t values (%s,random()*100000)", (i,)) + + with pub.cursor() as pcur: + with sub.cursor() as scur: + pcur.execute("CREATE TABLE t (pk integer primary key, sk integer)") + pcur.execute("CREATE PUBLICATION pub FOR TABLE t") + scur.execute("CREATE TABLE t (pk integer primary key, sk integer)") + # scur.execute("CREATE INDEX on t(sk)") # slowdown applying WAL at replica + pub_conn = f"host=localhost port={pub.pg_port} dbname=postgres user=cloud_admin" + query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub" + scur.execute(query) + time.sleep(2) # let initial table sync complete + + thread = threading.Thread(target=insert_data, args=(pub,), daemon=True) + thread.start() + + for _ in range(n_restarts): + # restart subscriber + # time.sleep(2) + sub.stop("immediate") + sub.start() + + thread.join() + pcur.execute(f"INSERT into t values ({n_records}, 0)") + n_records += 1 + with sub.cursor() as scur: + wait_until(10, 0.5, check_that_changes_propagated) diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 3fc44de6fa..fa7cead1bd 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -54,9 +54,26 @@ def test_tenant_delete_smoke( # first try to delete non existing tenant tenant_id = TenantId.generate() - 
env.pageserver.allowed_errors.append(f".*NotFound: tenant {tenant_id}.*") - with pytest.raises(PageserverApiException, match=f"NotFound: tenant {tenant_id}"): - ps_http.tenant_delete(tenant_id=tenant_id) + env.pageserver.allowed_errors.append(".*NotFound.*") + env.pageserver.allowed_errors.append(".*simulated failure.*") + + # Check that deleting a non-existent tenant gives the expected result: this is a loop because we + # may need to retry on some remote storage errors injected by the test harness + while True: + try: + ps_http.tenant_delete(tenant_id=tenant_id) + except PageserverApiException as e: + if e.status_code == 500: + # This test uses failure injection, which can produce 500s as the pageserver expects + # the object store to always be available, and the ListObjects during deletion is generally + # an infallible operation + assert "simulated failure of remote operation" in e.message + elif e.status_code == 404: + # This is our expected result: trying to erase a non-existent tenant gives us 404 + assert "NotFound" in e.message + break + else: + raise env.neon_cli.create_tenant( tenant_id=tenant_id, @@ -88,6 +105,9 @@ def test_tenant_delete_smoke( parent = timeline + # Upload a heatmap so that we exercise deletion of that too + ps_http.tenant_heatmap_upload(tenant_id) + iterations = poll_for_remote_storage_iterations(remote_storage_kind) assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 2 diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 12a4730e69..871351b2d5 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -302,7 +302,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): # gc should not try to even start on a timeline that doesn't exist with pytest.raises( - expected_exception=PageserverApiException, match="gc target timeline does not exist" + expected_exception=PageserverApiException, 
match="NotFound: Timeline not found" ): bogus_timeline_id = TimelineId.generate() pageserver_http.timeline_gc(tenant_id, bogus_timeline_id, 0) @@ -310,7 +310,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.extend( [ # the error will be printed to the log too - ".*gc target timeline does not exist.*", + ".*NotFound: Timeline not found.*", # Timelines get stopped during detach, ignore the gc calls that error, witnessing that ".*InternalServerError\\(timeline is Stopping.*", ] diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index cff13e74ee..dce30f5388 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -17,6 +17,7 @@ import psycopg2 import psycopg2.errors import psycopg2.extras import pytest +import requests from fixtures.broker import NeonBroker from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log @@ -841,7 +842,7 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): # fetch something sensible from status tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id) - epoch = tli_status.acceptor_epoch + term = tli_status.term timeline_start_lsn = tli_status.timeline_start_lsn if auth_enabled: @@ -862,8 +863,8 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): endpoint.safe_psql("insert into t values(10)") tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id) - epoch_after_reboot = tli_status.acceptor_epoch - assert epoch_after_reboot > epoch + term_after_reboot = tli_status.term + assert term_after_reboot > term # and timeline_start_lsn stays the same assert tli_status.timeline_start_lsn == timeline_start_lsn @@ -1104,11 +1105,11 @@ def cmp_sk_wal(sks: List[Safekeeper], tenant_id: TenantId, timeline_id: Timeline # First check that term / flush_lsn are the same: it is easier to # report/understand if 
WALs are different due to that. statuses = [sk_http_cli.timeline_status(tenant_id, timeline_id) for sk_http_cli in sk_http_clis] - term_flush_lsns = [(s.acceptor_epoch, s.flush_lsn) for s in statuses] + term_flush_lsns = [(s.last_log_term, s.flush_lsn) for s in statuses] for tfl, sk in zip(term_flush_lsns[1:], sks[1:]): assert ( term_flush_lsns[0] == tfl - ), f"(term, flush_lsn) are not equal on sks {sks[0].id} and {sk.id}: {term_flush_lsns[0]} != {tfl}" + ), f"(last_log_term, flush_lsn) are not equal on sks {sks[0].id} and {sk.id}: {term_flush_lsns[0]} != {tfl}" # check that WALs are identic. segs = [sk.list_segments(tenant_id, timeline_id) for sk in sks] @@ -1867,6 +1868,65 @@ def test_pull_timeline_gc(neon_env_builder: NeonEnvBuilder): assert digests[0] == digests[1], f"digest on src is {digests[0]} but on dst is {digests[1]}" +# Test pull_timeline while concurrently changing term on the donor: +# 1) Start pull_timeline, listing files to fetch. +# 2) Change term on the donor +# 3) Finish pull_timeline. +# +# Currently (until proper membership change procedure), we want to pull_timeline +# to fetch the log up to . This is unsafe if term +# changes during the procedure (unless timeline is locked all the time but we +# don't want that): recepient might end up with mix of WAL from different +# histories. Thus the schedule above is expected to fail. Later we'd allow +# pull_timeline to only initialize timeline to any valid state (up to +# commit_lsn), holding switch to fully new configuration until it recovers +# enough, so it won't be affected by term change anymore. +# +# Expected to fail while term check is not implemented. 
+@pytest.mark.xfail +def test_pull_timeline_term_change(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage()) + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + (src_sk, dst_sk) = (env.safekeepers[0], env.safekeepers[2]) + + log.info("use only first 2 safekeepers, 3rd will be seeded") + ep = env.endpoints.create("main") + ep.active_safekeepers = [1, 2] + ep.start() + ep.safe_psql("create table t(key int, value text)") + ep.safe_psql("insert into t select generate_series(1, 1000), 'pear'") + + dst_http = dst_sk.http_client() + # run pull_timeline which will halt before downloading files + dst_http.configure_failpoints(("sk-pull-timeline-after-list-pausable", "pause")) + pt_handle = PropagatingThread( + target=dst_sk.pull_timeline, args=([src_sk], tenant_id, timeline_id) + ) + pt_handle.start() + dst_sk.wait_until_paused("sk-pull-timeline-after-list-pausable") + + src_http = src_sk.http_client() + term_before = src_http.timeline_status(tenant_id, timeline_id).term + + # restart compute to bump term + ep.stop() + ep = env.endpoints.create("main") + ep.active_safekeepers = [1, 2] + ep.start() + ep.safe_psql("insert into t select generate_series(1, 100), 'pear'") + + term_after = src_http.timeline_status(tenant_id, timeline_id).term + assert term_after > term_before, f"term_after={term_after}, term_before={term_before}" + + dst_http.configure_failpoints(("sk-pull-timeline-after-list-pausable", "off")) + with pytest.raises(requests.exceptions.HTTPError): + pt_handle.join() + + # In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries # when compute is active, but there are no writes to the timeline. In that case # pageserver should maintain a single connection to safekeeper and don't attempt