mirror of
https://github.com/neondatabase/neon.git
synced 2026-02-09 21:50:37 +00:00
Compare commits
50 Commits
fcdm/combi
...
fcdm/image
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1d0829c0bf | ||
|
|
bc484109f9 | ||
|
|
b428c6ad7a | ||
|
|
36c845dcb7 | ||
|
|
cd296c962d | ||
|
|
70a9d1a5b1 | ||
|
|
4c10cce1d5 | ||
|
|
3e4a625cc5 | ||
|
|
ced8c8c046 | ||
|
|
4c15633b4b | ||
|
|
32609329bb | ||
|
|
7a4470f705 | ||
|
|
16236be84d | ||
|
|
56b9f49fe0 | ||
|
|
87c8353d1d | ||
|
|
57f700b1cc | ||
|
|
5d039c6e9b | ||
|
|
36e1100949 | ||
|
|
59c5b374de | ||
|
|
0f3b87d023 | ||
|
|
c19625a29c | ||
|
|
f2e5212fed | ||
|
|
568bc1fde3 | ||
|
|
45e929c069 | ||
|
|
6b980f38da | ||
|
|
f0d8bd7855 | ||
|
|
046d9c69e6 | ||
|
|
c72cb44213 | ||
|
|
cd3e4ac18d | ||
|
|
9ad940086c | ||
|
|
936f2ee2a5 | ||
|
|
1af047dd3e | ||
|
|
5fa747e493 | ||
|
|
80854b98ff | ||
|
|
024372a3db | ||
|
|
fff2468aa2 | ||
|
|
c7538a2c20 | ||
|
|
a2d0d44b42 | ||
|
|
7d3cdc05d4 | ||
|
|
840abe3954 | ||
|
|
774a6e7475 | ||
|
|
df5d588f63 | ||
|
|
f39b0fce9b | ||
|
|
a9ec4eb4fc | ||
|
|
a97b54e3b9 | ||
|
|
a5114a99b2 | ||
|
|
ee7bbdda0e | ||
|
|
b6e070bf85 | ||
|
|
7fa732c96c | ||
|
|
331935df91 |
2
.github/workflows/build_and_test.yml
vendored
2
.github/workflows/build_and_test.yml
vendored
@@ -253,7 +253,7 @@ jobs:
|
||||
done
|
||||
|
||||
if [ "${FAILED}" = "true" ]; then
|
||||
echo >&2 "Please update vendors/revisions.json if these changes are intentional"
|
||||
echo >&2 "Please update vendor/revisions.json if these changes are intentional"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
9
Cargo.lock
generated
9
Cargo.lock
generated
@@ -2263,11 +2263,11 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "hashlink"
|
||||
version = "0.8.2"
|
||||
version = "0.8.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0761a1b9491c4f2e3d66aa0f62d0fba0af9a0e2852e4d48ea506632a4b56e6aa"
|
||||
checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7"
|
||||
dependencies = [
|
||||
"hashbrown 0.13.2",
|
||||
"hashbrown 0.14.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3952,6 +3952,7 @@ dependencies = [
|
||||
"pin-project-lite",
|
||||
"postgres-protocol",
|
||||
"rand 0.8.5",
|
||||
"serde",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tracing",
|
||||
@@ -4435,6 +4436,7 @@ dependencies = [
|
||||
"futures",
|
||||
"futures-util",
|
||||
"http-types",
|
||||
"humantime",
|
||||
"hyper",
|
||||
"itertools",
|
||||
"metrics",
|
||||
@@ -4446,6 +4448,7 @@ dependencies = [
|
||||
"serde_json",
|
||||
"test-context",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
"tracing",
|
||||
|
||||
@@ -81,7 +81,7 @@ futures-core = "0.3"
|
||||
futures-util = "0.3"
|
||||
git-version = "0.3"
|
||||
hashbrown = "0.13"
|
||||
hashlink = "0.8.1"
|
||||
hashlink = "0.8.4"
|
||||
hdrhistogram = "7.5.2"
|
||||
hex = "0.4"
|
||||
hex-literal = "0.4"
|
||||
|
||||
@@ -47,7 +47,7 @@ COPY --chown=nonroot . .
|
||||
# Show build caching stats to check if it was used in the end.
|
||||
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
|
||||
RUN set -e \
|
||||
&& mold -run cargo build \
|
||||
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \
|
||||
--bin pg_sni_router \
|
||||
--bin pageserver \
|
||||
--bin pagectl \
|
||||
|
||||
@@ -72,703 +72,6 @@ RUN cd postgres && \
|
||||
fi; \
|
||||
done
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "postgis-build"
|
||||
# Build PostGIS from the upstream PostGIS mirror.
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS postgis-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
RUN apt update && \
|
||||
apt install -y cmake gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \
|
||||
libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \
|
||||
libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \
|
||||
protobuf-c-compiler xsltproc
|
||||
|
||||
# SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2
|
||||
RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
|
||||
echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \
|
||||
mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
|
||||
cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make clean && cp -R /sfcgal/* /
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin:$PATH"
|
||||
|
||||
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \
|
||||
echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \
|
||||
mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
|
||||
./autogen.sh && \
|
||||
./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
cd extensions/postgis && \
|
||||
make clean && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_raster.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_sfcgal.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control && \
|
||||
mkdir -p /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/postgis.control /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/postgis_raster.control /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/postgis_sfcgal.control /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/postgis_topology.control /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/address_standardizer.control /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/address_standardizer_data_us.control /extensions/postgis
|
||||
|
||||
RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
|
||||
echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \
|
||||
mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
|
||||
mkdir build && cd build && \
|
||||
cmake -DCMAKE_BUILD_TYPE=Release .. && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
|
||||
cp /usr/local/pgsql/share/extension/pgrouting.control /extensions/postgis && \
|
||||
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
|
||||
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/postgis.tar.zst -T -
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "plv8-build"
|
||||
# Build plv8
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS plv8-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN apt update && \
|
||||
apt install -y ninja-build python3-dev libncurses5 binutils clang
|
||||
|
||||
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \
|
||||
echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \
|
||||
mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
|
||||
# generate and copy upgrade scripts
|
||||
mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \
|
||||
cp upgrade/* /usr/local/pgsql/share/extension/ && \
|
||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||
make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
rm -rf /plv8-* && \
|
||||
find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \
|
||||
# don't break computes with installed old version of plv8
|
||||
cd /usr/local/pgsql/lib/ && \
|
||||
ln -s plv8-3.1.10.so plv8-3.1.5.so && \
|
||||
ln -s plv8-3.1.10.so plv8-3.1.8.so && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "h3-pg-build"
|
||||
# Build h3_pg
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS h3-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN case "$(uname -m)" in \
|
||||
"x86_64") \
|
||||
export CMAKE_CHECKSUM=739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 \
|
||||
;; \
|
||||
"aarch64") \
|
||||
export CMAKE_CHECKSUM=281b42627c9a1beed03e29706574d04c6c53fae4994472e90985ef018dd29c02 \
|
||||
;; \
|
||||
*) \
|
||||
echo "Unsupported architecture '$(uname -m)'. Supported are x86_64 and aarch64" && exit 1 \
|
||||
;; \
|
||||
esac && \
|
||||
wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-$(uname -m).sh \
|
||||
-q -O /tmp/cmake-install.sh \
|
||||
&& echo "${CMAKE_CHECKSUM} /tmp/cmake-install.sh" | sha256sum --check \
|
||||
&& chmod u+x /tmp/cmake-install.sh \
|
||||
&& /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
|
||||
&& rm /tmp/cmake-install.sh
|
||||
|
||||
RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \
|
||||
echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \
|
||||
mkdir h3-src && cd h3-src && tar xvzf ../h3.tar.gz --strip-components=1 -C . && \
|
||||
mkdir build && cd build && \
|
||||
cmake .. -DCMAKE_BUILD_TYPE=Release && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
DESTDIR=/h3 make install && \
|
||||
cp -R /h3/usr / && \
|
||||
rm -rf build
|
||||
|
||||
RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \
|
||||
echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \
|
||||
mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . && \
|
||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "unit-pg-build"
|
||||
# compile unit extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS unit-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \
|
||||
echo "411d05beeb97e5a4abf17572bfcfbb5a68d98d1018918feff995f6ee3bb03e79 postgresql-unit.tar.gz" | sha256sum --check && \
|
||||
mkdir postgresql-unit-src && cd postgresql-unit-src && tar xvzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
# unit extension's "create extension" script relies on absolute install path to fill some reference tables.
|
||||
# We move the extension from '/usr/local/pgsql/' to '/usr/local/' after it is build. So we need to adjust the path.
|
||||
# This one-liner removes pgsql/ part of the path.
|
||||
# NOTE: Other extensions that rely on MODULEDIR variable after building phase will need the same fix.
|
||||
find /usr/local/pgsql/share/extension/ -name "unit*.sql" -print0 | xargs -0 sed -i "s|pgsql/||g" && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/unit.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "vector-pg-build"
|
||||
# compile pgvector extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS vector-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \
|
||||
echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \
|
||||
mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pgjwt-pg-build"
|
||||
# compile pgjwt extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pgjwt-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021
|
||||
RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \
|
||||
echo "cfdefb15007286f67d3d45510f04a6a7a495004be5b3aecb12cda667e774203f pgjwt.tar.gz" | sha256sum --check && \
|
||||
mkdir pgjwt-src && cd pgjwt-src && tar xvzf ../pgjwt.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "hypopg-pg-build"
|
||||
# compile hypopg extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS hypopg-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \
|
||||
echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \
|
||||
mkdir hypopg-src && cd hypopg-src && tar xvzf ../hypopg.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-hashids-pg-build"
|
||||
# compile pg_hashids extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-hashids-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \
|
||||
echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_hashids-src && cd pg_hashids-src && tar xvzf ../pg_hashids.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "rum-pg-build"
|
||||
# compile rum extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS rum-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \
|
||||
echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \
|
||||
mkdir rum-src && cd rum-src && tar xvzf ../rum.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pgtap-pg-build"
|
||||
# compile pgTAP extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pgtap-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \
|
||||
echo "9c7c3de67ea41638e14f06da5da57bac6f5bd03fea05c165a0ec862205a5c052 pgtap.tar.gz" | sha256sum --check && \
|
||||
mkdir pgtap-src && cd pgtap-src && tar xvzf ../pgtap.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "ip4r-pg-build"
|
||||
# compile ip4r extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS ip4r-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \
|
||||
echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \
|
||||
mkdir ip4r-src && cd ip4r-src && tar xvzf ../ip4r.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/ip4r.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "prefix-pg-build"
|
||||
# compile Prefix extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS prefix-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \
|
||||
echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \
|
||||
mkdir prefix-src && cd prefix-src && tar xvzf ../prefix.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "hll-pg-build"
|
||||
# compile hll extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS hll-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \
|
||||
echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \
|
||||
mkdir hll-src && cd hll-src && tar xvzf ../hll.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "plpgsql-check-pg-build"
|
||||
# compile plpgsql_check extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS plpgsql-check-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \
|
||||
echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \
|
||||
mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "timescaledb-pg-build"
|
||||
# compile timescaledb extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS timescaledb-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ARG PG_VERSION
|
||||
ENV PATH "/usr/local/pgsql/bin:$PATH"
|
||||
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v14" | "v15") \
|
||||
export TIMESCALEDB_VERSION=2.10.1 \
|
||||
export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \
|
||||
;; \
|
||||
*) \
|
||||
export TIMESCALEDB_VERSION=2.13.0 \
|
||||
export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \
|
||||
;; \
|
||||
esac && \
|
||||
apt-get update && \
|
||||
apt-get install -y cmake && \
|
||||
wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \
|
||||
echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \
|
||||
mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
|
||||
./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \
|
||||
cd build && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make install -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/timescaledb.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-hint-plan-pg-build"
|
||||
# compile pg_hint_plan extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-hint-plan-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ARG PG_VERSION
|
||||
ENV PATH "/usr/local/pgsql/bin:$PATH"
|
||||
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v14") \
|
||||
export PG_HINT_PLAN_VERSION=14_1_4_1 \
|
||||
export PG_HINT_PLAN_CHECKSUM=c3501becf70ead27f70626bce80ea401ceac6a77e2083ee5f3ff1f1444ec1ad1 \
|
||||
;; \
|
||||
"v15") \
|
||||
export PG_HINT_PLAN_VERSION=15_1_5_0 \
|
||||
export PG_HINT_PLAN_CHECKSUM=564cbbf4820973ffece63fbf76e3c0af62c4ab23543142c7caaa682bc48918be \
|
||||
;; \
|
||||
"v16") \
|
||||
export PG_HINT_PLAN_VERSION=16_1_6_0 \
|
||||
export PG_HINT_PLAN_CHECKSUM=fc85a9212e7d2819d4ae4ac75817481101833c3cfa9f0fe1f980984e12347d00 \
|
||||
;; \
|
||||
*) \
|
||||
echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \
|
||||
;; \
|
||||
esac && \
|
||||
wget https://github.com/ossc-db/pg_hint_plan/archive/refs/tags/REL${PG_HINT_PLAN_VERSION}.tar.gz -O pg_hint_plan.tar.gz && \
|
||||
echo "${PG_HINT_PLAN_CHECKSUM} pg_hint_plan.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xvzf ../pg_hint_plan.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make install -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_hint_plan.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "kq-imcx-pg-build"
|
||||
# compile kq_imcx extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS kq-imcx-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN apt-get update && \
|
||||
apt-get install -y git libgtk2.0-dev libpq-dev libpam-dev libxslt-dev libkrb5-dev cmake && \
|
||||
wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \
|
||||
echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \
|
||||
mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
|
||||
mkdir build && cd build && \
|
||||
cmake -DCMAKE_BUILD_TYPE=Release .. && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
|
||||
mkdir -p /extensions/kq_imcx && cp /usr/local/pgsql/share/extension/kq_imcx.control /extensions/kq_imcx && \
|
||||
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
|
||||
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/kq_imcx.tar.zst -T -
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-cron-pg-build"
|
||||
# compile pg_cron extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-cron-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \
|
||||
echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_cron-src && cd pg_cron-src && tar xvzf ../pg_cron.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "rdkit-pg-build"
|
||||
# compile rdkit extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS rdkit-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y \
|
||||
cmake \
|
||||
libboost-iostreams1.74-dev \
|
||||
libboost-regex1.74-dev \
|
||||
libboost-serialization1.74-dev \
|
||||
libboost-system1.74-dev \
|
||||
libeigen3-dev
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
|
||||
RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
|
||||
echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \
|
||||
mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . && \
|
||||
cmake \
|
||||
-D RDK_BUILD_CAIRO_SUPPORT=OFF \
|
||||
-D RDK_BUILD_INCHI_SUPPORT=ON \
|
||||
-D RDK_BUILD_AVALON_SUPPORT=ON \
|
||||
-D RDK_BUILD_PYTHON_WRAPPERS=OFF \
|
||||
-D RDK_BUILD_DESCRIPTORS3D=OFF \
|
||||
-D RDK_BUILD_FREESASA_SUPPORT=OFF \
|
||||
-D RDK_BUILD_COORDGEN_SUPPORT=ON \
|
||||
-D RDK_BUILD_MOLINTERCHANGE_SUPPORT=OFF \
|
||||
-D RDK_BUILD_YAEHMOP_SUPPORT=OFF \
|
||||
-D RDK_BUILD_STRUCTCHECKER_SUPPORT=OFF \
|
||||
-D RDK_USE_URF=OFF \
|
||||
-D RDK_BUILD_PGSQL=ON \
|
||||
-D RDK_PGSQL_STATIC=ON \
|
||||
-D PostgreSQL_CONFIG=pg_config \
|
||||
-D PostgreSQL_INCLUDE_DIR=`pg_config --includedir` \
|
||||
-D PostgreSQL_TYPE_INCLUDE_DIR=`pg_config --includedir-server` \
|
||||
-D PostgreSQL_LIBRARY_DIR=`pg_config --libdir` \
|
||||
-D RDK_INSTALL_INTREE=OFF \
|
||||
-D RDK_INSTALL_COMIC_FONTS=OFF \
|
||||
-D RDK_BUILD_FREETYPE_SUPPORT=OFF \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
. && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/rdkit.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-uuidv7-pg-build"
|
||||
# compile pg_uuidv7 extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-uuidv7-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \
|
||||
echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xvzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_uuidv7.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-roaringbitmap-pg-build"
|
||||
# compile pg_roaringbitmap extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-roaringbitmap-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
|
||||
echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xvzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-semver-pg-build"
|
||||
# compile pg_semver extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-semver-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \
|
||||
echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_semver-src && cd pg_semver-src && tar xvzf ../pg_semver.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/semver.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-embedding-pg-build"
|
||||
# compile pg_embedding extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-embedding-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ARG PG_VERSION
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v14" | "v15") \
|
||||
export PG_EMBEDDING_VERSION=0.3.5 \
|
||||
export PG_EMBEDDING_CHECKSUM=0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 \
|
||||
;; \
|
||||
*) \
|
||||
echo "pg_embedding not supported on this PostgreSQL version. Use pgvector instead." && exit 0;; \
|
||||
esac && \
|
||||
wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/${PG_EMBEDDING_VERSION}.tar.gz -O pg_embedding.tar.gz && \
|
||||
echo "${PG_EMBEDDING_CHECKSUM} pg_embedding.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-anon-pg-build"
|
||||
# compile anon extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-anon-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
|
||||
echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
|
||||
mkdir -p /extensions/anon && cp /usr/local/pgsql/share/extension/anon.control /extensions/anon && \
|
||||
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
|
||||
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/anon.tar.zst -T -
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "rust extensions"
|
||||
# This layer is used to build `pgrx` deps
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS rust-extensions-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y curl libclang-dev cmake && \
|
||||
useradd -ms /bin/bash nonroot -b /home
|
||||
|
||||
ENV HOME=/home/nonroot
|
||||
ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
|
||||
USER nonroot
|
||||
WORKDIR /home/nonroot
|
||||
ARG PG_VERSION
|
||||
|
||||
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
|
||||
chmod +x rustup-init && \
|
||||
./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
|
||||
rm rustup-init && \
|
||||
cargo install --locked --version 0.10.2 cargo-pgrx && \
|
||||
/bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
|
||||
|
||||
USER root
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-jsonschema-pg-build"
|
||||
# Compile "pg_jsonschema" extension
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM rust-extensions-build AS pg-jsonschema-pg-build
|
||||
ARG PG_VERSION
|
||||
|
||||
RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar.gz -O pg_jsonschema.tar.gz && \
|
||||
echo "9118fc508a6e231e7a39acaa6f066fcd79af17a5db757b47d2eefbe14f7794f0 pg_jsonschema.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
|
||||
sed -i 's/pgrx = "0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
cargo pgrx install --release && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-graphql-pg-build"
|
||||
# Compile "pg_graphql" extension
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM rust-extensions-build AS pg-graphql-pg-build
|
||||
ARG PG_VERSION
|
||||
|
||||
RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.4.0.tar.gz -O pg_graphql.tar.gz && \
|
||||
echo "bd8dc7230282b3efa9ae5baf053a54151ed0e66881c7c53750e2d0c765776edc pg_graphql.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
|
||||
sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
cargo pgrx install --release && \
|
||||
# it's needed to enable extension because it uses untrusted C language
|
||||
sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_graphql.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-tiktoken-build"
|
||||
# Compile "pg_tiktoken" extension
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM rust-extensions-build AS pg-tiktoken-pg-build
|
||||
ARG PG_VERSION
|
||||
|
||||
# 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023
|
||||
RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \
|
||||
echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
|
||||
cargo pgrx install --release && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-pgx-ulid-build"
|
||||
# Compile "pgx_ulid" extension
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM rust-extensions-build AS pg-pgx-ulid-build
|
||||
ARG PG_VERSION
|
||||
|
||||
RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -O pgx_ulid.tar.gz && \
|
||||
echo "ee5db82945d2d9f2d15597a80cf32de9dca67b897f605beb830561705f12683c pgx_ulid.tar.gz" | sha256sum --check && \
|
||||
mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
|
||||
echo "******************* Apply a patch for Postgres 16 support; delete in the next release ******************" && \
|
||||
wget https://github.com/pksunkara/pgx_ulid/commit/f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
|
||||
patch -p1 < f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
|
||||
echo "********************************************************************************************************" && \
|
||||
sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "=0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
cargo pgrx install --release && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "wal2json-build"
|
||||
# Compile "wal2json" extension
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM build-deps AS wal2json-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \
|
||||
echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \
|
||||
mkdir wal2json-src && cd wal2json-src && tar xvzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "neon-pg-ext-build"
|
||||
@@ -778,38 +81,6 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.
|
||||
FROM build-deps AS neon-pg-ext-build
|
||||
ARG PG_VERSION
|
||||
|
||||
# Public extensions
|
||||
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=postgis-build /sfcgal/* /
|
||||
COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=h3-pg-build /h3/usr /
|
||||
COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=vector-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pgjwt-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-jsonschema-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-graphql-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-tiktoken-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=hypopg-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-hashids-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=rum-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pgtap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=ip4r-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=prefix-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=hll-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-semver-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
|
||||
COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY pgxn/ pgxn/
|
||||
|
||||
RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
@@ -820,6 +91,10 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
|
||||
-C pgxn/neon_utils \
|
||||
-s install && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
|
||||
-C pgxn/neon_test_utils \
|
||||
-s install && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
|
||||
-C pgxn/neon_rmgr \
|
||||
@@ -891,7 +166,7 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
|
||||
# create folder for file cache
|
||||
mkdir -p -m 777 /neon/cache
|
||||
|
||||
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
|
||||
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local/pgsql-$PG_VERSION
|
||||
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
|
||||
|
||||
# Install:
|
||||
|
||||
179
Dockerfile.compute-node-combined
Normal file
179
Dockerfile.compute-node-combined
Normal file
@@ -0,0 +1,179 @@
|
||||
ARG REPOSITORY=neondatabase
|
||||
ARG IMAGE=build-tools
|
||||
ARG TAG=pinned
|
||||
ARG BUILD_TAG
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "build-deps"
|
||||
#
|
||||
#########################################################################################
|
||||
FROM debian:bullseye-slim AS build-deps
|
||||
RUN apt update && \
|
||||
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
|
||||
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
|
||||
libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-build"
|
||||
# Build Postgres from the neon postgres repository.
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-build
|
||||
COPY "vendor/postgres-v14" /postgres-v14
|
||||
COPY "vendor/postgres-v15" /postgres-v15
|
||||
COPY "vendor/postgres-v16" /postgres-v16
|
||||
RUN for pg_version in v14 v15 v16; do \
|
||||
install_dir="/postgres-$pg_version"; \
|
||||
cd "$install_dir"; \
|
||||
prefix="/usr/local/pgsql-${pg_version}"; \
|
||||
export CONFIGURE_CMD="./configure --prefix ${prefix} CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp \
|
||||
--with-icu --with-libxml --with-libxslt --with-lz4" && \
|
||||
if [ "${pg_version}" != "v14" ]; then \
|
||||
# zstd is available only from PG15
|
||||
export CONFIGURE_CMD="${CONFIGURE_CMD} --with-zstd"; \
|
||||
fi && \
|
||||
eval $CONFIGURE_CMD && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
|
||||
# Install headers
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \
|
||||
extension_dir="${prefix}/share/extension" && \
|
||||
# Enable some of contrib extensions
|
||||
echo 'trusted = true' >> $extension_dir/autoinc.control && \
|
||||
echo 'trusted = true' >> $extension_dir/bloom.control && \
|
||||
echo 'trusted = true' >> $extension_dir/earthdistance.control && \
|
||||
echo 'trusted = true' >> $extension_dir/insert_username.control && \
|
||||
echo 'trusted = true' >> $extension_dir/intagg.control && \
|
||||
echo 'trusted = true' >> $extension_dir/moddatetime.control && \
|
||||
echo 'trusted = true' >> $extension_dir/pg_stat_statements.control && \
|
||||
echo 'trusted = true' >> $extension_dir/pgrowlocks.control && \
|
||||
echo 'trusted = true' >> $extension_dir/pgstattuple.control && \
|
||||
echo 'trusted = true' >> $extension_dir/refint.control && \
|
||||
echo 'trusted = true' >> $extension_dir/xml2.control && \
|
||||
# We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser.
|
||||
# In vanilla postgres this function is limited to Postgres role superuser.
|
||||
# In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases.
|
||||
# We could add the additional grant statements to the postgres repository but it would be hard to maintain,
|
||||
# whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
|
||||
# so we do it here.
|
||||
old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \
|
||||
# the first loop is for pg_stat_statement extension version <= 1.6
|
||||
for file in $prefix/share/extension/pg_stat_statements--*.sql; do \
|
||||
filename=$(basename "$file"); \
|
||||
if echo "$old_list" | grep -q -F "$filename"; then \
|
||||
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
|
||||
fi; \
|
||||
done; \
|
||||
# the second loop is for pg_stat_statement extension versions >= 1.7,
|
||||
# where pg_stat_statement_reset() got 3 additional arguments
|
||||
for file in $prefix/share/extension/pg_stat_statements--*.sql; do \
|
||||
filename=$(basename "$file"); \
|
||||
if ! echo "$old_list" | grep -q -F "$filename"; then \
|
||||
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
|
||||
fi; \
|
||||
done; \
|
||||
# Go back to root dir from `/postgres-v<version>` dir
|
||||
cd ..; \
|
||||
done
|
||||
|
||||
# #########################################################################################
|
||||
# #
|
||||
# # Compile and run the Neon-specific `compute_ctl` binary
|
||||
# #
|
||||
# #########################################################################################
|
||||
# FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools
|
||||
# ARG BUILD_TAG
|
||||
# ENV BUILD_TAG=$BUILD_TAG
|
||||
#
|
||||
# USER nonroot
|
||||
# # Copy entire project to get Cargo.* files with proper dependencies for the whole project
|
||||
# COPY --chown=nonroot . .
|
||||
# RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
|
||||
#
|
||||
# #########################################################################################
|
||||
# #
|
||||
# # Clean up postgres folder before inclusion
|
||||
# #
|
||||
# #########################################################################################
|
||||
# FROM pg-build AS postgres-cleanup-layer
|
||||
# # COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql
|
||||
#
|
||||
# RUN for pg_version in v14 v15 v16; do \
|
||||
# prefix="/usr/local/pgsql-${pg_version}"; \
|
||||
# # Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise)
|
||||
# cd "${prefix}/bin" && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp; \
|
||||
# cd ..; \
|
||||
# # Remove headers that we won't need anymore - we've completed installation of all extensions
|
||||
# rm -r "${prefix}/include"; \
|
||||
# # Remove static postgresql libraries - all compilation is finished, so we
|
||||
# # can now remove these files - they must be included in other binaries by now
|
||||
# # if they were to be used by other libraries.
|
||||
# rm ${prefix}/lib/lib*.a; \
|
||||
# done
|
||||
#
|
||||
# #########################################################################################
|
||||
# #
|
||||
# # Final layer
|
||||
# # Put it all together into the final image
|
||||
# #
|
||||
# #########################################################################################
|
||||
# FROM debian:bullseye-slim
|
||||
# # Add user postgres
|
||||
# RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
|
||||
# echo "postgres:test_console_pass" | chpasswd && \
|
||||
# mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
|
||||
# mkdir /var/db/postgres/pgbouncer && \
|
||||
# chown -R postgres:postgres /var/db/postgres && \
|
||||
# chmod 0750 /var/db/postgres/compute && \
|
||||
# chmod 0750 /var/db/postgres/pgbouncer && \
|
||||
# echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig && \
|
||||
# # create folder for file cache
|
||||
# mkdir -p -m 777 /neon/cache
|
||||
#
|
||||
# COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql-v14 /usr/local/pgsql-v14
|
||||
# COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql-v15 /usr/local/pgsql-v15
|
||||
# COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql-v16 /usr/local/pgsql-v16
|
||||
# COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
|
||||
#
|
||||
# # Install:
|
||||
# # libreadline8 for psql
|
||||
# # libicu67, locales for collations (including ICU and plpgsql_check)
|
||||
# # liblz4-1 for lz4
|
||||
# # libossp-uuid16 for extension ossp-uuid
|
||||
# # libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
|
||||
# # libxml2, libxslt1.1 for xml2
|
||||
# # libzstd1 for zstd
|
||||
# # libboost* for rdkit
|
||||
# # ca-certificates for communicating with s3 by compute_ctl
|
||||
# RUN apt update && \
|
||||
# apt install --no-install-recommends -y \
|
||||
# gdb \
|
||||
# libicu67 \
|
||||
# liblz4-1 \
|
||||
# libreadline8 \
|
||||
# libboost-iostreams1.74.0 \
|
||||
# libboost-regex1.74.0 \
|
||||
# libboost-serialization1.74.0 \
|
||||
# libboost-system1.74.0 \
|
||||
# libossp-uuid16 \
|
||||
# libgeos-c1v5 \
|
||||
# libgdal28 \
|
||||
# libproj19 \
|
||||
# libprotobuf-c1 \
|
||||
# libsfcgal1 \
|
||||
# libxml2 \
|
||||
# libxslt1.1 \
|
||||
# libzstd1 \
|
||||
# libcurl4-openssl-dev \
|
||||
# locales \
|
||||
# procps \
|
||||
# ca-certificates && \
|
||||
# rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
|
||||
# localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
|
||||
#
|
||||
# ENV LANG en_US.utf8
|
||||
# USER postgres
|
||||
# ENTRYPOINT ["/usr/local/bin/compute_ctl"]
|
||||
65
Dockerfile.compute-node-merged
Normal file
65
Dockerfile.compute-node-merged
Normal file
@@ -0,0 +1,65 @@
|
||||
FROM debian:bullseye-slim
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Final layer
|
||||
# Put it all together into the final image
|
||||
#
|
||||
#########################################################################################
|
||||
FROM debian:bullseye-slim
|
||||
# Add user postgres
|
||||
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
|
||||
echo "postgres:test_console_pass" | chpasswd && \
|
||||
mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
|
||||
mkdir /var/db/postgres/pgbouncer && \
|
||||
chown -R postgres:postgres /var/db/postgres && \
|
||||
chmod 0750 /var/db/postgres/compute && \
|
||||
chmod 0750 /var/db/postgres/pgbouncer && \
|
||||
echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig && \
|
||||
# create folder for file cache
|
||||
mkdir -p -m 777 /neon/cache
|
||||
|
||||
COPY --from=base:v14 --chown=postgres /usr/local/pgsql /usr/local/pgsql-v14
|
||||
COPY --from=base:v15 --chown=postgres /usr/local/pgsql /usr/local/pgsql-v15
|
||||
COPY --from=base:v16 --chown=postgres /usr/local/pgsql /usr/local/pgsql-v16
|
||||
COPY --from=tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
|
||||
|
||||
# Install:
|
||||
# libreadline8 for psql
|
||||
# libicu67, locales for collations (including ICU and plpgsql_check)
|
||||
# liblz4-1 for lz4
|
||||
# libossp-uuid16 for extension ossp-uuid
|
||||
# libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
|
||||
# libxml2, libxslt1.1 for xml2
|
||||
# libzstd1 for zstd
|
||||
# libboost* for rdkit
|
||||
# ca-certificates for communicating with s3 by compute_ctl
|
||||
RUN apt update && \
|
||||
apt install --no-install-recommends -y \
|
||||
gdb \
|
||||
libicu67 \
|
||||
liblz4-1 \
|
||||
libreadline8 \
|
||||
libboost-iostreams1.74.0 \
|
||||
libboost-regex1.74.0 \
|
||||
libboost-serialization1.74.0 \
|
||||
libboost-system1.74.0 \
|
||||
libossp-uuid16 \
|
||||
libgeos-c1v5 \
|
||||
libgdal28 \
|
||||
libproj19 \
|
||||
libprotobuf-c1 \
|
||||
libsfcgal1 \
|
||||
libxml2 \
|
||||
libxslt1.1 \
|
||||
libzstd1 \
|
||||
libcurl4-openssl-dev \
|
||||
locales \
|
||||
procps \
|
||||
ca-certificates && \
|
||||
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
|
||||
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
|
||||
|
||||
ENV LANG en_US.utf8
|
||||
USER postgres
|
||||
ENTRYPOINT ["/usr/local/bin/compute_ctl"]
|
||||
159
Dockerfile.compute-node-simple
Normal file
159
Dockerfile.compute-node-simple
Normal file
@@ -0,0 +1,159 @@
|
||||
ARG PG_VERSION
|
||||
ARG REPOSITORY=neondatabase
|
||||
ARG IMAGE=build-tools
|
||||
ARG TAG=pinned
|
||||
ARG BUILD_TAG
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "build-deps"
|
||||
#
|
||||
#########################################################################################
|
||||
FROM debian:bullseye-slim AS build-deps
|
||||
RUN apt update && \
|
||||
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
|
||||
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
|
||||
libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-build"
|
||||
# Build Postgres from the neon postgres repository.
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-build
|
||||
ARG PG_VERSION
|
||||
COPY vendor/postgres-${PG_VERSION} postgres
|
||||
RUN cd postgres && \
|
||||
export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp \
|
||||
--with-icu --with-libxml --with-libxslt --with-lz4" && \
|
||||
if [ "${PG_VERSION}" != "v14" ]; then \
|
||||
# zstd is available only from PG15
|
||||
export CONFIGURE_CMD="${CONFIGURE_CMD} --with-zstd"; \
|
||||
fi && \
|
||||
eval $CONFIGURE_CMD && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
|
||||
# Install headers
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \
|
||||
# Enable some of contrib extensions
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/moddatetime.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_stat_statements.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control && \
|
||||
# We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser.
|
||||
# In vanilla postgres this function is limited to Postgres role superuser.
|
||||
# In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases.
|
||||
# We could add the additional grant statements to the postgres repository but it would be hard to maintain,
|
||||
# whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
|
||||
# so we do it here.
|
||||
old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \
|
||||
# the first loop is for pg_stat_statement extension version <= 1.6
|
||||
for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
|
||||
filename=$(basename "$file"); \
|
||||
if echo "$old_list" | grep -q -F "$filename"; then \
|
||||
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
|
||||
fi; \
|
||||
done; \
|
||||
# the second loop is for pg_stat_statement extension versions >= 1.7,
|
||||
# where pg_stat_statement_reset() got 3 additional arguments
|
||||
for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
|
||||
filename=$(basename "$file"); \
|
||||
if ! echo "$old_list" | grep -q -F "$filename"; then \
|
||||
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
|
||||
fi; \
|
||||
done
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "neon-pg-ext-build"
|
||||
# compile neon extensions
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS neon-pg-ext-build
|
||||
ARG PG_VERSION
|
||||
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY pgxn/ pgxn/
|
||||
|
||||
RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
|
||||
-C pgxn/neon \
|
||||
-s install && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
|
||||
-C pgxn/neon_utils \
|
||||
-s install && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
|
||||
-C pgxn/neon_test_utils \
|
||||
-s install && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
|
||||
-C pgxn/neon_rmgr \
|
||||
-s install && \
|
||||
case "${PG_VERSION}" in \
|
||||
"v14" | "v15") \
|
||||
;; \
|
||||
"v16") \
|
||||
echo "Skipping HNSW for PostgreSQL 16" && exit 0 \
|
||||
;; \
|
||||
*) \
|
||||
echo "unexpected PostgreSQL version" && exit 1 \
|
||||
;; \
|
||||
esac && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
|
||||
-C pgxn/hnsw \
|
||||
-s install
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Compile and run the Neon-specific `compute_ctl` binary
|
||||
#
|
||||
#########################################################################################
|
||||
FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools
|
||||
ARG BUILD_TAG
|
||||
ENV BUILD_TAG=$BUILD_TAG
|
||||
|
||||
USER nonroot
|
||||
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
|
||||
COPY --chown=nonroot . .
|
||||
RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Clean up postgres folder before inclusion
|
||||
#
|
||||
#########################################################################################
|
||||
FROM neon-pg-ext-build AS postgres-cleanup-layer
|
||||
COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql
|
||||
|
||||
# Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise)
|
||||
RUN cd /usr/local/pgsql/bin && rm ecpg
|
||||
|
||||
# Remove headers that we won't need anymore - we've completed installation of all extensions
|
||||
RUN rm -r /usr/local/pgsql/include
|
||||
|
||||
# Remove static postgresql libraries - all compilation is finished, so we
|
||||
# can now remove these files - they must be included in other binaries by now
|
||||
# if they were to be used by other libraries.
|
||||
RUN rm /usr/local/pgsql/lib/lib*.a
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Final layer
|
||||
# Put it all together into the final image
|
||||
#
|
||||
#########################################################################################
|
||||
FROM debian:bullseye-slim
|
||||
|
||||
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local/pgsql
|
||||
18
Dockerfile.compute-node-tools
Normal file
18
Dockerfile.compute-node-tools
Normal file
@@ -0,0 +1,18 @@
|
||||
ARG REPOSITORY=neondatabase
|
||||
ARG IMAGE=build-tools
|
||||
ARG TAG=pinned
|
||||
ARG BUILD_TAG
|
||||
#########################################################################################
|
||||
#
|
||||
# Compile and run the Neon-specific `compute_ctl` binary
|
||||
#
|
||||
#########################################################################################
|
||||
FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools
|
||||
ARG BUILD_TAG
|
||||
ENV BUILD_TAG=$BUILD_TAG
|
||||
|
||||
USER nonroot
|
||||
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
|
||||
COPY --chown=nonroot . .
|
||||
RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
|
||||
|
||||
16
Makefile
16
Makefile
@@ -159,8 +159,8 @@ neon-pg-ext-%: postgres-%
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install
|
||||
|
||||
.PHONY: neon-pg-ext-clean-%
|
||||
neon-pg-ext-clean-%:
|
||||
.PHONY: neon-pg-clean-ext-%
|
||||
neon-pg-clean-ext-%:
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
|
||||
@@ -216,11 +216,11 @@ neon-pg-ext: \
|
||||
neon-pg-ext-v15 \
|
||||
neon-pg-ext-v16
|
||||
|
||||
.PHONY: neon-pg-ext-clean
|
||||
neon-pg-ext-clean: \
|
||||
neon-pg-ext-clean-v14 \
|
||||
neon-pg-ext-clean-v15 \
|
||||
neon-pg-ext-clean-v16
|
||||
.PHONY: neon-pg-clean-ext
|
||||
neon-pg-clean-ext: \
|
||||
neon-pg-clean-ext-v14 \
|
||||
neon-pg-clean-ext-v15 \
|
||||
neon-pg-clean-ext-v16
|
||||
|
||||
# shorthand to build all Postgres versions
|
||||
.PHONY: postgres
|
||||
@@ -249,7 +249,7 @@ postgres-check: \
|
||||
|
||||
# This doesn't remove the effects of 'configure'.
|
||||
.PHONY: clean
|
||||
clean: postgres-clean neon-pg-ext-clean
|
||||
clean: postgres-clean neon-pg-clean-ext
|
||||
$(CARGO_CMD_PREFIX) cargo clean
|
||||
|
||||
# This removes everything
|
||||
|
||||
10
README.md
10
README.md
@@ -249,6 +249,16 @@ testing locally, it is convenient to run just one set of permutations, like this
|
||||
DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest
|
||||
```
|
||||
|
||||
## Flamegraphs
|
||||
|
||||
You may find yourself in need of flamegraphs for software in this repository.
|
||||
You can use [`flamegraph-rs`](https://github.com/flamegraph-rs/flamegraph) or the original [`flamegraph.pl`](https://github.com/brendangregg/FlameGraph). Your choice!
|
||||
|
||||
>[!IMPORTANT]
|
||||
> If you're using `lld` or `mold`, you need the `--no-rosegment` linker argument.
|
||||
> It's a [general thing with Rust / lld / mold](https://crbug.com/919499#c16), not specific to this repository.
|
||||
> See [this PR for further instructions](https://github.com/neondatabase/neon/pull/6764).
|
||||
|
||||
## Documentation
|
||||
|
||||
[docs](/docs) Contains a top-level overview of all available markdown documentation.
|
||||
|
||||
@@ -2,7 +2,7 @@ use std::collections::HashMap;
|
||||
use std::env;
|
||||
use std::fs;
|
||||
use std::io::BufRead;
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
use std::os::unix::fs::{symlink, PermissionsExt};
|
||||
use std::path::Path;
|
||||
use std::process::{Command, Stdio};
|
||||
use std::str::FromStr;
|
||||
@@ -324,7 +324,8 @@ impl ComputeNode {
|
||||
let spec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
let start_time = Instant::now();
|
||||
|
||||
let mut config = postgres::Config::from_str(&spec.pageserver_connstr)?;
|
||||
let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
|
||||
let mut config = postgres::Config::from_str(shard0_connstr)?;
|
||||
|
||||
// Use the storage auth token from the config file, if given.
|
||||
// Note: this overrides any password set in the connection string.
|
||||
@@ -634,6 +635,48 @@ impl ComputeNode {
|
||||
// Update pg_hba.conf received with basebackup.
|
||||
update_pg_hba(pgdata_path)?;
|
||||
|
||||
// Place pg_dynshmem under /dev/shm. This allows us to use
|
||||
// 'dynamic_shared_memory_type = mmap' so that the files are placed in
|
||||
// /dev/shm, similar to how 'dynamic_shared_memory_type = posix' works.
|
||||
//
|
||||
// Why on earth don't we just stick to the 'posix' default, you might
|
||||
// ask. It turns out that making large allocations with 'posix' doesn't
|
||||
// work very well with autoscaling. The behavior we want is that:
|
||||
//
|
||||
// 1. You can make large DSM allocations, larger than the current RAM
|
||||
// size of the VM, without errors
|
||||
//
|
||||
// 2. If the allocated memory is really used, the VM is scaled up
|
||||
// automatically to accommodate that
|
||||
//
|
||||
// We try to make that possible by having swap in the VM. But with the
|
||||
// default 'posix' DSM implementation, we fail step 1, even when there's
|
||||
// plenty of swap available. PostgreSQL uses posix_fallocate() to create
|
||||
// the shmem segment, which is really just a file in /dev/shm in Linux,
|
||||
// but posix_fallocate() on tmpfs returns ENOMEM if the size is larger
|
||||
// than available RAM.
|
||||
//
|
||||
// Using 'dynamic_shared_memory_type = mmap' works around that, because
|
||||
// the Postgres 'mmap' DSM implementation doesn't use
|
||||
// posix_fallocate(). Instead, it uses repeated calls to write(2) to
|
||||
// fill the file with zeros. It's weird that that differs between
|
||||
// 'posix' and 'mmap', but we take advantage of it. When the file is
|
||||
// filled slowly with write(2), the kernel allows it to grow larger, as
|
||||
// long as there's swap available.
|
||||
//
|
||||
// In short, using 'dynamic_shared_memory_type = mmap' allows us one DSM
|
||||
// segment to be larger than currently available RAM. But because we
|
||||
// don't want to store it on a real file, which the kernel would try to
|
||||
// flush to disk, so symlink pg_dynshm to /dev/shm.
|
||||
//
|
||||
// We don't set 'dynamic_shared_memory_type = mmap' here, we let the
|
||||
// control plane control that option. If 'mmap' is not used, this
|
||||
// symlink doesn't affect anything.
|
||||
//
|
||||
// See https://github.com/neondatabase/autoscaling/issues/800
|
||||
std::fs::remove_dir(pgdata_path.join("pg_dynshmem"))?;
|
||||
symlink("/dev/shm/", pgdata_path.join("pg_dynshmem"))?;
|
||||
|
||||
match spec.mode {
|
||||
ComputeMode::Primary => {}
|
||||
ComputeMode::Replica | ComputeMode::Static(..) => {
|
||||
|
||||
@@ -4,6 +4,11 @@ version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[features]
|
||||
default = []
|
||||
# Enables test-only APIs and behaviors
|
||||
testing = []
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
aws-config.workspace = true
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::{collections::HashMap, time::Duration};
|
||||
use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
|
||||
use control_plane::local_env::LocalEnv;
|
||||
use hyper::{Method, StatusCode};
|
||||
use pageserver_api::shard::{ShardCount, ShardIndex, ShardNumber, TenantShardId};
|
||||
use pageserver_api::shard::{ShardIndex, ShardNumber, TenantShardId};
|
||||
use postgres_connection::parse_host_port;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -77,7 +77,7 @@ impl ComputeHookTenant {
|
||||
self.shards
|
||||
.sort_by_key(|(shard, _node_id)| shard.shard_number);
|
||||
|
||||
if self.shards.len() == shard_count.0 as usize || shard_count == ShardCount(0) {
|
||||
if self.shards.len() == shard_count.count() as usize || shard_count.is_unsharded() {
|
||||
// We have pageservers for all the shards: emit a configuration update
|
||||
return Some(ComputeHookNotifyRequest {
|
||||
tenant_id,
|
||||
@@ -94,7 +94,7 @@ impl ComputeHookTenant {
|
||||
tracing::info!(
|
||||
"ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
|
||||
self.shards.len(),
|
||||
shard_count.0
|
||||
shard_count.count()
|
||||
);
|
||||
}
|
||||
|
||||
@@ -155,7 +155,7 @@ impl ComputeHook {
|
||||
|
||||
for (endpoint_name, endpoint) in &cplane.endpoints {
|
||||
if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running {
|
||||
tracing::info!("🔁 Reconfiguring endpoint {}", endpoint_name,);
|
||||
tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
|
||||
endpoint.reconfigure(compute_pageservers.clone()).await?;
|
||||
}
|
||||
}
|
||||
@@ -177,7 +177,7 @@ impl ComputeHook {
|
||||
req
|
||||
};
|
||||
|
||||
tracing::debug!(
|
||||
tracing::info!(
|
||||
"Sending notify request to {} ({:?})",
|
||||
url,
|
||||
reconfigure_request
|
||||
@@ -266,7 +266,7 @@ impl ComputeHook {
|
||||
/// periods, but we don't retry forever. The **caller** is responsible for handling failures and
|
||||
/// ensuring that they eventually call again to ensure that the compute is eventually notified of
|
||||
/// the proper pageserver nodes for a tenant.
|
||||
#[tracing::instrument(skip_all, fields(tenant_shard_id, node_id))]
|
||||
#[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), node_id))]
|
||||
pub(super) async fn notify(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
@@ -298,7 +298,7 @@ impl ComputeHook {
|
||||
let Some(reconfigure_request) = reconfigure_request else {
|
||||
// The tenant doesn't yet have pageservers for all its shards: we won't notify anything
|
||||
// until it does.
|
||||
tracing::debug!("Tenant isn't yet ready to emit a notification",);
|
||||
tracing::info!("Tenant isn't yet ready to emit a notification");
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
|
||||
@@ -37,6 +37,12 @@ impl std::fmt::Display for Sequence {
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for Sequence {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
write!(f, "{}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl MonotonicCounter<Sequence> for Sequence {
|
||||
fn cnt_advance(&mut self, v: Sequence) {
|
||||
assert!(*self <= v);
|
||||
|
||||
@@ -15,6 +15,7 @@ use diesel::Connection;
|
||||
use metrics::launch_timestamp::LaunchTimestamp;
|
||||
use std::sync::Arc;
|
||||
use tokio::signal::unix::SignalKind;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::auth::{JwtAuth, SwappableJwtAuth};
|
||||
use utils::logging::{self, LogFormat};
|
||||
|
||||
@@ -237,15 +238,23 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
let auth = secrets
|
||||
.public_key
|
||||
.map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth)));
|
||||
let router = make_router(service, auth)
|
||||
let router = make_router(service.clone(), auth)
|
||||
.build()
|
||||
.map_err(|err| anyhow!(err))?;
|
||||
let router_service = utils::http::RouterService::new(router).unwrap();
|
||||
let server = hyper::Server::from_tcp(http_listener)?.serve(router_service);
|
||||
|
||||
// Start HTTP server
|
||||
let server_shutdown = CancellationToken::new();
|
||||
let server = hyper::Server::from_tcp(http_listener)?
|
||||
.serve(router_service)
|
||||
.with_graceful_shutdown({
|
||||
let server_shutdown = server_shutdown.clone();
|
||||
async move {
|
||||
server_shutdown.cancelled().await;
|
||||
}
|
||||
});
|
||||
tracing::info!("Serving on {0}", args.listen);
|
||||
|
||||
tokio::task::spawn(server);
|
||||
let server_task = tokio::task::spawn(server);
|
||||
|
||||
// Wait until we receive a signal
|
||||
let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?;
|
||||
@@ -266,5 +275,16 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
}
|
||||
}
|
||||
|
||||
// Stop HTTP server first, so that we don't have to service requests
|
||||
// while shutting down Service
|
||||
server_shutdown.cancel();
|
||||
if let Err(e) = server_task.await {
|
||||
tracing::error!("Error joining HTTP server task: {e}")
|
||||
}
|
||||
tracing::info!("Joined HTTP server task");
|
||||
|
||||
service.shutdown().await;
|
||||
tracing::info!("Service shutdown complete");
|
||||
|
||||
std::process::exit(0);
|
||||
}
|
||||
|
||||
@@ -222,7 +222,7 @@ impl Persistence {
|
||||
let tenant_shard_id = TenantShardId {
|
||||
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
|
||||
shard_number: ShardNumber(tsp.shard_number as u8),
|
||||
shard_count: ShardCount(tsp.shard_count as u8),
|
||||
shard_count: ShardCount::new(tsp.shard_count as u8),
|
||||
};
|
||||
|
||||
tenants_map.insert(tenant_shard_id, tsp);
|
||||
@@ -318,7 +318,7 @@ impl Persistence {
|
||||
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())
|
||||
.map_err(|e| DatabaseError::Logical(format!("Malformed tenant id: {e}")))?,
|
||||
shard_number: ShardNumber(tsp.shard_number as u8),
|
||||
shard_count: ShardCount(tsp.shard_count as u8),
|
||||
shard_count: ShardCount::new(tsp.shard_count as u8),
|
||||
};
|
||||
result.insert(tenant_shard_id, Generation::new(tsp.generation as u32));
|
||||
}
|
||||
@@ -340,7 +340,7 @@ impl Persistence {
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||
.filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
|
||||
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
|
||||
.set((
|
||||
generation.eq(generation + 1),
|
||||
generation_pageserver.eq(node_id.0 as i64),
|
||||
@@ -362,7 +362,7 @@ impl Persistence {
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||
.filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
|
||||
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
|
||||
.set((
|
||||
generation_pageserver.eq(i64::MAX),
|
||||
placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()),
|
||||
@@ -381,7 +381,6 @@ impl Persistence {
|
||||
//
|
||||
// We create the child shards here, so that they will be available for increment_generation calls
|
||||
// if some pageserver holding a child shard needs to restart before the overall tenant split is complete.
|
||||
#[allow(dead_code)]
|
||||
pub(crate) async fn begin_shard_split(
|
||||
&self,
|
||||
old_shard_count: ShardCount,
|
||||
@@ -393,21 +392,19 @@ impl Persistence {
|
||||
conn.transaction(|conn| -> DatabaseResult<()> {
|
||||
// Mark parent shards as splitting
|
||||
|
||||
let expect_parent_records = std::cmp::max(1, old_shard_count.0);
|
||||
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.filter(shard_count.eq(old_shard_count.0 as i32))
|
||||
.filter(shard_count.eq(old_shard_count.literal() as i32))
|
||||
.set((splitting.eq(1),))
|
||||
.execute(conn)?;
|
||||
if u8::try_from(updated)
|
||||
.map_err(|_| DatabaseError::Logical(
|
||||
format!("Overflow existing shard count {} while splitting", updated))
|
||||
)? != expect_parent_records {
|
||||
)? != old_shard_count.count() {
|
||||
// Perhaps a deletion or another split raced with this attempt to split, mutating
|
||||
// the parent shards that we intend to split. In this case the split request should fail.
|
||||
return Err(DatabaseError::Logical(
|
||||
format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {expect_parent_records})")
|
||||
format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {})", old_shard_count.count())
|
||||
));
|
||||
}
|
||||
|
||||
@@ -419,7 +416,7 @@ impl Persistence {
|
||||
let mut parent = crate::schema::tenant_shards::table
|
||||
.filter(tenant_id.eq(parent_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(parent_shard_id.shard_number.0 as i32))
|
||||
.filter(shard_count.eq(parent_shard_id.shard_count.0 as i32))
|
||||
.filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32))
|
||||
.load::<TenantShardPersistence>(conn)?;
|
||||
let parent = if parent.len() != 1 {
|
||||
return Err(DatabaseError::Logical(format!(
|
||||
@@ -449,7 +446,6 @@ impl Persistence {
|
||||
|
||||
// When we finish shard splitting, we must atomically clean up the old shards
|
||||
// and insert the new shards, and clear the splitting marker.
|
||||
#[allow(dead_code)]
|
||||
pub(crate) async fn complete_shard_split(
|
||||
&self,
|
||||
split_tenant_id: TenantId,
|
||||
@@ -461,7 +457,7 @@ impl Persistence {
|
||||
// Drop parent shards
|
||||
diesel::delete(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.filter(shard_count.eq(old_shard_count.0 as i32))
|
||||
.filter(shard_count.eq(old_shard_count.literal() as i32))
|
||||
.execute(conn)?;
|
||||
|
||||
// Clear sharding flag
|
||||
|
||||
@@ -13,6 +13,7 @@ use tokio_util::sync::CancellationToken;
|
||||
use utils::generation::Generation;
|
||||
use utils::id::{NodeId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::sync::gate::GateGuard;
|
||||
|
||||
use crate::compute_hook::{ComputeHook, NotifyError};
|
||||
use crate::node::Node;
|
||||
@@ -53,6 +54,10 @@ pub(super) struct Reconciler {
|
||||
/// the tenant is changed.
|
||||
pub(crate) cancel: CancellationToken,
|
||||
|
||||
/// Reconcilers are registered with a Gate so that during a graceful shutdown we
|
||||
/// can wait for all the reconcilers to respond to their cancellation tokens.
|
||||
pub(crate) _gate_guard: GateGuard,
|
||||
|
||||
/// Access to persistent storage for updating generation numbers
|
||||
pub(crate) persistence: Arc<Persistence>,
|
||||
}
|
||||
@@ -263,7 +268,7 @@ impl Reconciler {
|
||||
secondary_conf,
|
||||
tenant_conf: config.clone(),
|
||||
shard_number: shard.number.0,
|
||||
shard_count: shard.count.0,
|
||||
shard_count: shard.count.literal(),
|
||||
shard_stripe_size: shard.stripe_size.0,
|
||||
}
|
||||
}
|
||||
@@ -458,7 +463,7 @@ impl Reconciler {
|
||||
generation: None,
|
||||
secondary_conf: None,
|
||||
shard_number: self.shard.number.0,
|
||||
shard_count: self.shard.count.0,
|
||||
shard_count: self.shard.count.literal(),
|
||||
shard_stripe_size: self.shard.stripe_size.0,
|
||||
tenant_conf: self.config.clone(),
|
||||
},
|
||||
@@ -506,7 +511,7 @@ pub(crate) fn attached_location_conf(
|
||||
generation: generation.into(),
|
||||
secondary_conf: None,
|
||||
shard_number: shard.number.0,
|
||||
shard_count: shard.count.0,
|
||||
shard_count: shard.count.literal(),
|
||||
shard_stripe_size: shard.stripe_size.0,
|
||||
tenant_conf: config.clone(),
|
||||
}
|
||||
@@ -521,7 +526,7 @@ pub(crate) fn secondary_location_conf(
|
||||
generation: None,
|
||||
secondary_conf: Some(LocationConfigSecondary { warm: true }),
|
||||
shard_number: shard.number.0,
|
||||
shard_count: shard.count.0,
|
||||
shard_count: shard.count.literal(),
|
||||
shard_stripe_size: shard.stripe_size.0,
|
||||
tenant_conf: config.clone(),
|
||||
}
|
||||
|
||||
@@ -77,12 +77,11 @@ impl Scheduler {
|
||||
return Err(ScheduleError::ImpossibleConstraint);
|
||||
}
|
||||
|
||||
for (node_id, count) in &tenant_counts {
|
||||
tracing::info!("tenant_counts[{node_id}]={count}");
|
||||
}
|
||||
|
||||
let node_id = tenant_counts.first().unwrap().0;
|
||||
tracing::info!("scheduler selected node {node_id}");
|
||||
tracing::info!(
|
||||
"scheduler selected node {node_id} (elegible nodes {:?}, exclude: {hard_exclude:?})",
|
||||
tenant_counts.iter().map(|i| i.0 .0).collect::<Vec<_>>()
|
||||
);
|
||||
*self.tenant_counts.get_mut(&node_id).unwrap() += 1;
|
||||
Ok(node_id)
|
||||
}
|
||||
|
||||
@@ -30,6 +30,7 @@ use pageserver_api::{
|
||||
};
|
||||
use pageserver_client::mgmt_api;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::instrument;
|
||||
use utils::{
|
||||
backoff,
|
||||
completion::Barrier,
|
||||
@@ -37,6 +38,7 @@ use utils::{
|
||||
http::error::ApiError,
|
||||
id::{NodeId, TenantId, TimelineId},
|
||||
seqwait::SeqWait,
|
||||
sync::gate::Gate,
|
||||
};
|
||||
|
||||
use crate::{
|
||||
@@ -124,6 +126,12 @@ pub struct Service {
|
||||
config: Config,
|
||||
persistence: Arc<Persistence>,
|
||||
|
||||
// Process shutdown will fire this token
|
||||
cancel: CancellationToken,
|
||||
|
||||
// Background tasks will hold this gate
|
||||
gate: Gate,
|
||||
|
||||
/// This waits for initial reconciliation with pageservers to complete. Until this barrier
|
||||
/// passes, it isn't safe to do any actions that mutate tenants.
|
||||
pub(crate) startup_complete: Barrier,
|
||||
@@ -144,8 +152,9 @@ impl Service {
|
||||
&self.config
|
||||
}
|
||||
|
||||
/// TODO: don't allow other API calls until this is done, don't start doing any background housekeeping
|
||||
/// until this is done.
|
||||
/// Called once on startup, this function attempts to contact all pageservers to build an up-to-date
|
||||
/// view of the world, and determine which pageservers are responsive.
|
||||
#[instrument(skip_all)]
|
||||
async fn startup_reconcile(&self) {
|
||||
// For all tenant shards, a vector of observed states on nodes (where None means
|
||||
// indeterminate, same as in [`ObservedStateLocation`])
|
||||
@@ -153,9 +162,6 @@ impl Service {
|
||||
|
||||
let mut nodes_online = HashSet::new();
|
||||
|
||||
// TODO: give Service a cancellation token for clean shutdown
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
// TODO: issue these requests concurrently
|
||||
{
|
||||
let nodes = {
|
||||
@@ -190,7 +196,7 @@ impl Service {
|
||||
1,
|
||||
5,
|
||||
"Location config listing",
|
||||
&cancel,
|
||||
&self.cancel,
|
||||
)
|
||||
.await;
|
||||
let Some(list_response) = list_response else {
|
||||
@@ -292,7 +298,7 @@ impl Service {
|
||||
generation: None,
|
||||
secondary_conf: None,
|
||||
shard_number: tenant_shard_id.shard_number.0,
|
||||
shard_count: tenant_shard_id.shard_count.0,
|
||||
shard_count: tenant_shard_id.shard_count.literal(),
|
||||
shard_stripe_size: 0,
|
||||
tenant_conf: models::TenantConfig::default(),
|
||||
},
|
||||
@@ -331,7 +337,7 @@ impl Service {
|
||||
let stream = futures::stream::iter(compute_notifications.into_iter())
|
||||
.map(|(tenant_shard_id, node_id)| {
|
||||
let compute_hook = compute_hook.clone();
|
||||
let cancel = cancel.clone();
|
||||
let cancel = self.cancel.clone();
|
||||
async move {
|
||||
if let Err(e) = compute_hook.notify(tenant_shard_id, node_id, &cancel).await {
|
||||
tracing::error!(
|
||||
@@ -368,8 +374,98 @@ impl Service {
|
||||
tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)");
|
||||
}
|
||||
|
||||
/// Long running background task that periodically wakes up and looks for shards that need
|
||||
/// reconciliation. Reconciliation is fallible, so any reconciliation tasks that fail during
|
||||
/// e.g. a tenant create/attach/migrate must eventually be retried: this task is responsible
|
||||
/// for those retries.
|
||||
#[instrument(skip_all)]
|
||||
async fn background_reconcile(&self) {
|
||||
self.startup_complete.clone().wait().await;
|
||||
|
||||
const BACKGROUND_RECONCILE_PERIOD: Duration = Duration::from_secs(20);
|
||||
|
||||
let mut interval = tokio::time::interval(BACKGROUND_RECONCILE_PERIOD);
|
||||
while !self.cancel.is_cancelled() {
|
||||
tokio::select! {
|
||||
_ = interval.tick() => { self.reconcile_all(); }
|
||||
_ = self.cancel.cancelled() => return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
async fn process_results(
|
||||
&self,
|
||||
mut result_rx: tokio::sync::mpsc::UnboundedReceiver<ReconcileResult>,
|
||||
) {
|
||||
loop {
|
||||
// Wait for the next result, or for cancellation
|
||||
let result = tokio::select! {
|
||||
r = result_rx.recv() => {
|
||||
match r {
|
||||
Some(result) => {result},
|
||||
None => {break;}
|
||||
}
|
||||
}
|
||||
_ = self.cancel.cancelled() => {
|
||||
break;
|
||||
}
|
||||
};
|
||||
|
||||
tracing::info!(
|
||||
"Reconcile result for sequence {}, ok={}",
|
||||
result.sequence,
|
||||
result.result.is_ok()
|
||||
);
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let Some(tenant) = locked.tenants.get_mut(&result.tenant_shard_id) else {
|
||||
// A reconciliation result might race with removing a tenant: drop results for
|
||||
// tenants that aren't in our map.
|
||||
continue;
|
||||
};
|
||||
|
||||
// Usually generation should only be updated via this path, so the max() isn't
|
||||
// needed, but it is used to handle out-of-band updates via. e.g. test hook.
|
||||
tenant.generation = std::cmp::max(tenant.generation, result.generation);
|
||||
|
||||
// If the reconciler signals that it failed to notify compute, set this state on
|
||||
// the shard so that a future [`TenantState::maybe_reconcile`] will try again.
|
||||
tenant.pending_compute_notification = result.pending_compute_notification;
|
||||
|
||||
match result.result {
|
||||
Ok(()) => {
|
||||
for (node_id, loc) in &result.observed.locations {
|
||||
if let Some(conf) = &loc.conf {
|
||||
tracing::info!("Updating observed location {}: {:?}", node_id, conf);
|
||||
} else {
|
||||
tracing::info!("Setting observed location {} to None", node_id,)
|
||||
}
|
||||
}
|
||||
tenant.observed = result.observed;
|
||||
tenant.waiter.advance(result.sequence);
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
"Reconcile error on tenant {}: {}",
|
||||
tenant.tenant_shard_id,
|
||||
e
|
||||
);
|
||||
|
||||
// Ordering: populate last_error before advancing error_seq,
|
||||
// so that waiters will see the correct error after waiting.
|
||||
*(tenant.last_error.lock().unwrap()) = format!("{e}");
|
||||
tenant.error_waiter.advance(result.sequence);
|
||||
|
||||
for (node_id, o) in result.observed.locations {
|
||||
tenant.observed.locations.insert(node_id, o);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn spawn(config: Config, persistence: Arc<Persistence>) -> anyhow::Result<Arc<Self>> {
|
||||
let (result_tx, mut result_rx) = tokio::sync::mpsc::unbounded_channel();
|
||||
let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel();
|
||||
|
||||
tracing::info!("Loading nodes from database...");
|
||||
let nodes = persistence.list_nodes().await?;
|
||||
@@ -389,14 +485,14 @@ impl Service {
|
||||
let tenant_shard_id = TenantShardId {
|
||||
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
|
||||
shard_number: ShardNumber(tsp.shard_number as u8),
|
||||
shard_count: ShardCount(tsp.shard_count as u8),
|
||||
shard_count: ShardCount::new(tsp.shard_count as u8),
|
||||
};
|
||||
let shard_identity = if tsp.shard_count == 0 {
|
||||
ShardIdentity::unsharded()
|
||||
} else {
|
||||
ShardIdentity::new(
|
||||
ShardNumber(tsp.shard_number as u8),
|
||||
ShardCount(tsp.shard_count as u8),
|
||||
ShardCount::new(tsp.shard_count as u8),
|
||||
ShardStripeSize(tsp.shard_stripe_size as u32),
|
||||
)?
|
||||
};
|
||||
@@ -418,6 +514,7 @@ impl Service {
|
||||
observed: ObservedState::new(),
|
||||
config: serde_json::from_str(&tsp.config).unwrap(),
|
||||
reconciler: None,
|
||||
splitting: tsp.splitting,
|
||||
waiter: Arc::new(SeqWait::new(Sequence::initial())),
|
||||
error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
|
||||
last_error: Arc::default(),
|
||||
@@ -439,73 +536,35 @@ impl Service {
|
||||
config,
|
||||
persistence,
|
||||
startup_complete: startup_complete.clone(),
|
||||
cancel: CancellationToken::new(),
|
||||
gate: Gate::default(),
|
||||
});
|
||||
|
||||
let result_task_this = this.clone();
|
||||
tokio::task::spawn(async move {
|
||||
while let Some(result) = result_rx.recv().await {
|
||||
tracing::info!(
|
||||
"Reconcile result for sequence {}, ok={}",
|
||||
result.sequence,
|
||||
result.result.is_ok()
|
||||
);
|
||||
let mut locked = result_task_this.inner.write().unwrap();
|
||||
let Some(tenant) = locked.tenants.get_mut(&result.tenant_shard_id) else {
|
||||
// A reconciliation result might race with removing a tenant: drop results for
|
||||
// tenants that aren't in our map.
|
||||
continue;
|
||||
};
|
||||
|
||||
// Usually generation should only be updated via this path, so the max() isn't
|
||||
// needed, but it is used to handle out-of-band updates via. e.g. test hook.
|
||||
tenant.generation = std::cmp::max(tenant.generation, result.generation);
|
||||
|
||||
// If the reconciler signals that it failed to notify compute, set this state on
|
||||
// the shard so that a future [`TenantState::maybe_reconcile`] will try again.
|
||||
tenant.pending_compute_notification = result.pending_compute_notification;
|
||||
|
||||
match result.result {
|
||||
Ok(()) => {
|
||||
for (node_id, loc) in &result.observed.locations {
|
||||
if let Some(conf) = &loc.conf {
|
||||
tracing::info!(
|
||||
"Updating observed location {}: {:?}",
|
||||
node_id,
|
||||
conf
|
||||
);
|
||||
} else {
|
||||
tracing::info!("Setting observed location {} to None", node_id,)
|
||||
}
|
||||
}
|
||||
tenant.observed = result.observed;
|
||||
tenant.waiter.advance(result.sequence);
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
"Reconcile error on tenant {}: {}",
|
||||
tenant.tenant_shard_id,
|
||||
e
|
||||
);
|
||||
|
||||
// Ordering: populate last_error before advancing error_seq,
|
||||
// so that waiters will see the correct error after waiting.
|
||||
*(tenant.last_error.lock().unwrap()) = format!("{e}");
|
||||
tenant.error_waiter.advance(result.sequence);
|
||||
|
||||
for (node_id, o) in result.observed.locations {
|
||||
tenant.observed.locations.insert(node_id, o);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Block shutdown until we're done (we must respect self.cancel)
|
||||
if let Ok(_gate) = result_task_this.gate.enter() {
|
||||
result_task_this.process_results(result_rx).await
|
||||
}
|
||||
});
|
||||
|
||||
let startup_reconcile_this = this.clone();
|
||||
tokio::task::spawn(async move {
|
||||
// Block the [`Service::startup_complete`] barrier until we're done
|
||||
let _completion = startup_completion;
|
||||
tokio::task::spawn({
|
||||
let this = this.clone();
|
||||
// We will block the [`Service::startup_complete`] barrier until [`Self::startup_reconcile`]
|
||||
// is done.
|
||||
let startup_completion = startup_completion.clone();
|
||||
async move {
|
||||
// Block shutdown until we're done (we must respect self.cancel)
|
||||
let Ok(_gate) = this.gate.enter() else {
|
||||
return;
|
||||
};
|
||||
|
||||
startup_reconcile_this.startup_reconcile().await
|
||||
this.startup_reconcile().await;
|
||||
|
||||
drop(startup_completion);
|
||||
|
||||
this.background_reconcile().await;
|
||||
}
|
||||
});
|
||||
|
||||
Ok(this)
|
||||
@@ -526,7 +585,7 @@ impl Service {
|
||||
let tsp = TenantShardPersistence {
|
||||
tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(),
|
||||
shard_number: attach_req.tenant_shard_id.shard_number.0 as i32,
|
||||
shard_count: attach_req.tenant_shard_id.shard_count.0 as i32,
|
||||
shard_count: attach_req.tenant_shard_id.shard_count.literal() as i32,
|
||||
shard_stripe_size: 0,
|
||||
generation: 0,
|
||||
generation_pageserver: i64::MAX,
|
||||
@@ -620,6 +679,28 @@ impl Service {
|
||||
attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
|
||||
);
|
||||
|
||||
// Trick the reconciler into not doing anything for this tenant: this helps
|
||||
// tests that manually configure a tenant on the pagesrever, and then call this
|
||||
// attach hook: they don't want background reconciliation to modify what they
|
||||
// did to the pageserver.
|
||||
#[cfg(feature = "testing")]
|
||||
{
|
||||
if let Some(node_id) = attach_req.node_id {
|
||||
tenant_state.observed.locations = HashMap::from([(
|
||||
node_id,
|
||||
ObservedStateLocation {
|
||||
conf: Some(attached_location_conf(
|
||||
tenant_state.generation,
|
||||
&tenant_state.shard,
|
||||
&tenant_state.config,
|
||||
)),
|
||||
},
|
||||
)]);
|
||||
} else {
|
||||
tenant_state.observed.locations.clear();
|
||||
}
|
||||
}
|
||||
|
||||
Ok(AttachHookResponse {
|
||||
gen: attach_req
|
||||
.node_id
|
||||
@@ -726,16 +807,9 @@ impl Service {
|
||||
&self,
|
||||
create_req: TenantCreateRequest,
|
||||
) -> Result<TenantCreateResponse, ApiError> {
|
||||
// Shard count 0 is valid: it means create a single shard (ShardCount(0) means "unsharded")
|
||||
let literal_shard_count = if create_req.shard_parameters.is_unsharded() {
|
||||
1
|
||||
} else {
|
||||
create_req.shard_parameters.count.0
|
||||
};
|
||||
|
||||
// This service expects to handle sharding itself: it is an error to try and directly create
|
||||
// a particular shard here.
|
||||
let tenant_id = if create_req.new_tenant_id.shard_count > ShardCount(1) {
|
||||
let tenant_id = if !create_req.new_tenant_id.is_unsharded() {
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Attempted to create a specific shard, this API is for creating the whole tenant"
|
||||
)));
|
||||
@@ -749,7 +823,7 @@ impl Service {
|
||||
create_req.shard_parameters.count,
|
||||
);
|
||||
|
||||
let create_ids = (0..literal_shard_count)
|
||||
let create_ids = (0..create_req.shard_parameters.count.count())
|
||||
.map(|i| TenantShardId {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(i),
|
||||
@@ -769,7 +843,7 @@ impl Service {
|
||||
.map(|tenant_shard_id| TenantShardPersistence {
|
||||
tenant_id: tenant_shard_id.tenant_id.to_string(),
|
||||
shard_number: tenant_shard_id.shard_number.0 as i32,
|
||||
shard_count: tenant_shard_id.shard_count.0 as i32,
|
||||
shard_count: tenant_shard_id.shard_count.literal() as i32,
|
||||
shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32,
|
||||
generation: create_req.generation.map(|g| g as i32).unwrap_or(0),
|
||||
generation_pageserver: i64::MAX,
|
||||
@@ -875,6 +949,8 @@ impl Service {
|
||||
&compute_hook,
|
||||
&self.config,
|
||||
&self.persistence,
|
||||
&self.gate,
|
||||
&self.cancel,
|
||||
)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
@@ -914,7 +990,7 @@ impl Service {
|
||||
tenant_id: TenantId,
|
||||
req: TenantLocationConfigRequest,
|
||||
) -> Result<TenantLocationConfigResponse, ApiError> {
|
||||
if req.tenant_id.shard_count.0 > 1 {
|
||||
if !req.tenant_id.is_unsharded() {
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"This API is for importing single-sharded or unsharded tenants"
|
||||
)));
|
||||
@@ -977,6 +1053,8 @@ impl Service {
|
||||
&compute_hook,
|
||||
&self.config,
|
||||
&self.persistence,
|
||||
&self.gate,
|
||||
&self.cancel,
|
||||
);
|
||||
if let Some(waiter) = maybe_waiter {
|
||||
waiters.push(waiter);
|
||||
@@ -1066,6 +1144,8 @@ impl Service {
|
||||
}
|
||||
|
||||
pub(crate) async fn tenant_delete(&self, tenant_id: TenantId) -> Result<StatusCode, ApiError> {
|
||||
self.ensure_attached_wait(tenant_id).await?;
|
||||
|
||||
// TODO: refactor into helper
|
||||
let targets = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
@@ -1087,8 +1167,6 @@ impl Service {
|
||||
targets
|
||||
};
|
||||
|
||||
// TODO: error out if the tenant is not attached anywhere.
|
||||
|
||||
// Phase 1: delete on the pageservers
|
||||
let mut any_pending = false;
|
||||
for (tenant_shard_id, node) in targets {
|
||||
@@ -1424,9 +1502,6 @@ impl Service {
|
||||
let mut policy = None;
|
||||
let mut shard_ident = None;
|
||||
|
||||
// TODO: put a cancellation token on Service for clean shutdown
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
// A parent shard which will be split
|
||||
struct SplitTarget {
|
||||
parent_id: TenantShardId,
|
||||
@@ -1449,7 +1524,7 @@ impl Service {
|
||||
for (tenant_shard_id, shard) in
|
||||
locked.tenants.range(TenantShardId::tenant_range(tenant_id))
|
||||
{
|
||||
match shard.shard.count.0.cmp(&split_req.new_shard_count) {
|
||||
match shard.shard.count.count().cmp(&split_req.new_shard_count) {
|
||||
Ordering::Equal => {
|
||||
// Already split this
|
||||
children_found.push(*tenant_shard_id);
|
||||
@@ -1459,7 +1534,7 @@ impl Service {
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Requested count {} but already have shards at count {}",
|
||||
split_req.new_shard_count,
|
||||
shard.shard.count.0
|
||||
shard.shard.count.count()
|
||||
)));
|
||||
}
|
||||
Ordering::Less => {
|
||||
@@ -1489,7 +1564,7 @@ impl Service {
|
||||
shard_ident = Some(shard.shard);
|
||||
}
|
||||
|
||||
if tenant_shard_id.shard_count == ShardCount(split_req.new_shard_count) {
|
||||
if tenant_shard_id.shard_count.count() == split_req.new_shard_count {
|
||||
tracing::info!(
|
||||
"Tenant shard {} already has shard count {}",
|
||||
tenant_shard_id,
|
||||
@@ -1515,7 +1590,7 @@ impl Service {
|
||||
targets.push(SplitTarget {
|
||||
parent_id: *tenant_shard_id,
|
||||
node: node.clone(),
|
||||
child_ids: tenant_shard_id.split(ShardCount(split_req.new_shard_count)),
|
||||
child_ids: tenant_shard_id.split(ShardCount::new(split_req.new_shard_count)),
|
||||
});
|
||||
}
|
||||
|
||||
@@ -1562,7 +1637,7 @@ impl Service {
|
||||
this_child_tsps.push(TenantShardPersistence {
|
||||
tenant_id: child.tenant_id.to_string(),
|
||||
shard_number: child.shard_number.0 as i32,
|
||||
shard_count: child.shard_count.0 as i32,
|
||||
shard_count: child.shard_count.literal() as i32,
|
||||
shard_stripe_size: shard_ident.stripe_size.0 as i32,
|
||||
// Note: this generation is a placeholder, [`Persistence::begin_shard_split`] will
|
||||
// populate the correct generation as part of its transaction, to protect us
|
||||
@@ -1598,6 +1673,18 @@ impl Service {
|
||||
}
|
||||
}
|
||||
|
||||
// Now that I have persisted the splitting state, apply it in-memory. This is infallible, so
|
||||
// callers may assume that if splitting is set in memory, then it was persisted, and if splitting
|
||||
// is not set in memory, then it was not persisted.
|
||||
{
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
for target in &targets {
|
||||
if let Some(parent_shard) = locked.tenants.get_mut(&target.parent_id) {
|
||||
parent_shard.splitting = SplitState::Splitting;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: we have now committed the shard split state to the database, so any subsequent
|
||||
// failure needs to roll it back. We will later wrap this function in logic to roll back
|
||||
// the split if it fails.
|
||||
@@ -1657,7 +1744,7 @@ impl Service {
|
||||
.complete_shard_split(tenant_id, old_shard_count)
|
||||
.await?;
|
||||
|
||||
// Replace all the shards we just split with their children
|
||||
// Replace all the shards we just split with their children: this phase is infallible.
|
||||
let mut response = TenantShardSplitResponse {
|
||||
new_shards: Vec::new(),
|
||||
};
|
||||
@@ -1705,6 +1792,10 @@ impl Service {
|
||||
child_state.generation = generation;
|
||||
child_state.config = config.clone();
|
||||
|
||||
// The child's TenantState::splitting is intentionally left at the default value of Idle,
|
||||
// as at this point in the split process we have succeeded and this part is infallible:
|
||||
// we will never need to do any special recovery from this state.
|
||||
|
||||
child_locations.push((child, pageserver));
|
||||
|
||||
locked.tenants.insert(child, child_state);
|
||||
@@ -1716,7 +1807,7 @@ impl Service {
|
||||
// Send compute notifications for all the new shards
|
||||
let mut failed_notifications = Vec::new();
|
||||
for (child_id, child_ps) in child_locations {
|
||||
if let Err(e) = compute_hook.notify(child_id, child_ps, &cancel).await {
|
||||
if let Err(e) = compute_hook.notify(child_id, child_ps, &self.cancel).await {
|
||||
tracing::warn!("Failed to update compute of {}->{} during split, proceeding anyway to complete split ({e})",
|
||||
child_id, child_ps);
|
||||
failed_notifications.push(child_id);
|
||||
@@ -1792,6 +1883,8 @@ impl Service {
|
||||
&compute_hook,
|
||||
&self.config,
|
||||
&self.persistence,
|
||||
&self.gate,
|
||||
&self.cancel,
|
||||
)
|
||||
};
|
||||
|
||||
@@ -1993,6 +2086,8 @@ impl Service {
|
||||
&compute_hook,
|
||||
&self.config,
|
||||
&self.persistence,
|
||||
&self.gate,
|
||||
&self.cancel,
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -2014,6 +2109,8 @@ impl Service {
|
||||
&compute_hook,
|
||||
&self.config,
|
||||
&self.persistence,
|
||||
&self.gate,
|
||||
&self.cancel,
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -2053,6 +2150,8 @@ impl Service {
|
||||
&compute_hook,
|
||||
&self.config,
|
||||
&self.persistence,
|
||||
&self.gate,
|
||||
&self.cancel,
|
||||
) {
|
||||
waiters.push(waiter);
|
||||
}
|
||||
@@ -2064,6 +2163,17 @@ impl Service {
|
||||
let ensure_waiters = {
|
||||
let locked = self.inner.write().unwrap();
|
||||
|
||||
// Check if the tenant is splitting: in this case, even if it is attached,
|
||||
// we must act as if it is not: this blocks e.g. timeline creation/deletion
|
||||
// operations during the split.
|
||||
for (_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) {
|
||||
if !matches!(shard.splitting, SplitState::Idle) {
|
||||
return Err(ApiError::ResourceUnavailable(
|
||||
"Tenant shards are currently splitting".into(),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
self.ensure_attached_schedule(locked, tenant_id)
|
||||
.map_err(ApiError::InternalServerError)?
|
||||
};
|
||||
@@ -2095,8 +2205,25 @@ impl Service {
|
||||
&compute_hook,
|
||||
&self.config,
|
||||
&self.persistence,
|
||||
&self.gate,
|
||||
&self.cancel,
|
||||
)
|
||||
})
|
||||
.count()
|
||||
}
|
||||
|
||||
pub async fn shutdown(&self) {
|
||||
// Note that this already stops processing any results from reconciles: so
|
||||
// we do not expect that our [`TenantState`] objects will reach a neat
|
||||
// final state.
|
||||
self.cancel.cancel();
|
||||
|
||||
// The cancellation tokens in [`crate::reconciler::Reconciler`] are children
|
||||
// of our cancellation token, so we do not need to explicitly cancel each of
|
||||
// them.
|
||||
|
||||
// Background tasks and reconcilers hold gate guards: this waits for them all
|
||||
// to complete.
|
||||
self.gate.close().await;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,16 +7,18 @@ use pageserver_api::{
|
||||
};
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{instrument, Instrument};
|
||||
use utils::{
|
||||
generation::Generation,
|
||||
id::NodeId,
|
||||
seqwait::{SeqWait, SeqWaitError},
|
||||
sync::gate::Gate,
|
||||
};
|
||||
|
||||
use crate::{
|
||||
compute_hook::ComputeHook,
|
||||
node::Node,
|
||||
persistence::Persistence,
|
||||
persistence::{split_state::SplitState, Persistence},
|
||||
reconciler::{attached_location_conf, secondary_location_conf, ReconcileError, Reconciler},
|
||||
scheduler::{ScheduleError, Scheduler},
|
||||
service, PlacementPolicy, Sequence,
|
||||
@@ -58,6 +60,11 @@ pub(crate) struct TenantState {
|
||||
/// cancellation token has been fired)
|
||||
pub(crate) reconciler: Option<ReconcilerHandle>,
|
||||
|
||||
/// If a tenant is being split, then all shards with that TenantId will have a
|
||||
/// SplitState set, this acts as a guard against other operations such as background
|
||||
/// reconciliation, and timeline creation.
|
||||
pub(crate) splitting: SplitState,
|
||||
|
||||
/// Optionally wait for reconciliation to complete up to a particular
|
||||
/// sequence number.
|
||||
pub(crate) waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,
|
||||
@@ -238,6 +245,7 @@ impl TenantState {
|
||||
observed: ObservedState::default(),
|
||||
config: TenantConfig::default(),
|
||||
reconciler: None,
|
||||
splitting: SplitState::Idle,
|
||||
sequence: Sequence(1),
|
||||
waiter: Arc::new(SeqWait::new(Sequence(0))),
|
||||
error_waiter: Arc::new(SeqWait::new(Sequence(0))),
|
||||
@@ -415,6 +423,8 @@ impl TenantState {
|
||||
false
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
|
||||
pub(crate) fn maybe_reconcile(
|
||||
&mut self,
|
||||
result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
|
||||
@@ -422,6 +432,8 @@ impl TenantState {
|
||||
compute_hook: &Arc<ComputeHook>,
|
||||
service_config: &service::Config,
|
||||
persistence: &Arc<Persistence>,
|
||||
gate: &Gate,
|
||||
cancel: &CancellationToken,
|
||||
) -> Option<ReconcilerWaiter> {
|
||||
// If there are any ambiguous observed states, and the nodes they refer to are available,
|
||||
// we should reconcile to clean them up.
|
||||
@@ -443,6 +455,14 @@ impl TenantState {
|
||||
return None;
|
||||
}
|
||||
|
||||
// If we are currently splitting, then never start a reconciler task: the splitting logic
|
||||
// requires that shards are not interfered with while it runs. Do this check here rather than
|
||||
// up top, so that we only log this message if we would otherwise have done a reconciliation.
|
||||
if !matches!(self.splitting, SplitState::Idle) {
|
||||
tracing::info!("Refusing to reconcile, splitting in progress");
|
||||
return None;
|
||||
}
|
||||
|
||||
// Reconcile already in flight for the current sequence?
|
||||
if let Some(handle) = &self.reconciler {
|
||||
if handle.sequence == self.sequence {
|
||||
@@ -460,7 +480,12 @@ impl TenantState {
|
||||
// doing our sequence's work.
|
||||
let old_handle = self.reconciler.take();
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
let Ok(gate_guard) = gate.enter() else {
|
||||
// Shutting down, don't start a reconciler
|
||||
return None;
|
||||
};
|
||||
|
||||
let reconciler_cancel = cancel.child_token();
|
||||
let mut reconciler = Reconciler {
|
||||
tenant_shard_id: self.tenant_shard_id,
|
||||
shard: self.shard,
|
||||
@@ -471,59 +496,66 @@ impl TenantState {
|
||||
pageservers: pageservers.clone(),
|
||||
compute_hook: compute_hook.clone(),
|
||||
service_config: service_config.clone(),
|
||||
cancel: cancel.clone(),
|
||||
_gate_guard: gate_guard,
|
||||
cancel: reconciler_cancel.clone(),
|
||||
persistence: persistence.clone(),
|
||||
compute_notify_failure: false,
|
||||
};
|
||||
|
||||
let reconcile_seq = self.sequence;
|
||||
|
||||
tracing::info!("Spawning Reconciler for sequence {}", self.sequence);
|
||||
tracing::info!(seq=%reconcile_seq, "Spawning Reconciler for sequence {}", self.sequence);
|
||||
let must_notify = self.pending_compute_notification;
|
||||
let join_handle = tokio::task::spawn(async move {
|
||||
// Wait for any previous reconcile task to complete before we start
|
||||
if let Some(old_handle) = old_handle {
|
||||
old_handle.cancel.cancel();
|
||||
if let Err(e) = old_handle.handle.await {
|
||||
// We can't do much with this other than log it: the task is done, so
|
||||
// we may proceed with our work.
|
||||
tracing::error!("Unexpected join error waiting for reconcile task: {e}");
|
||||
let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq,
|
||||
tenant_id=%reconciler.tenant_shard_id.tenant_id,
|
||||
shard_id=%reconciler.tenant_shard_id.shard_slug());
|
||||
let join_handle = tokio::task::spawn(
|
||||
async move {
|
||||
// Wait for any previous reconcile task to complete before we start
|
||||
if let Some(old_handle) = old_handle {
|
||||
old_handle.cancel.cancel();
|
||||
if let Err(e) = old_handle.handle.await {
|
||||
// We can't do much with this other than log it: the task is done, so
|
||||
// we may proceed with our work.
|
||||
tracing::error!("Unexpected join error waiting for reconcile task: {e}");
|
||||
}
|
||||
}
|
||||
|
||||
// Early check for cancellation before doing any work
|
||||
// TODO: wrap all remote API operations in cancellation check
|
||||
// as well.
|
||||
if reconciler.cancel.is_cancelled() {
|
||||
return;
|
||||
}
|
||||
|
||||
// Attempt to make observed state match intent state
|
||||
let result = reconciler.reconcile().await;
|
||||
|
||||
// If we know we had a pending compute notification from some previous action, send a notification irrespective
|
||||
// of whether the above reconcile() did any work
|
||||
if result.is_ok() && must_notify {
|
||||
// If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`]
|
||||
reconciler.compute_notify().await.ok();
|
||||
}
|
||||
|
||||
result_tx
|
||||
.send(ReconcileResult {
|
||||
sequence: reconcile_seq,
|
||||
result,
|
||||
tenant_shard_id: reconciler.tenant_shard_id,
|
||||
generation: reconciler.generation,
|
||||
observed: reconciler.observed,
|
||||
pending_compute_notification: reconciler.compute_notify_failure,
|
||||
})
|
||||
.ok();
|
||||
}
|
||||
|
||||
// Early check for cancellation before doing any work
|
||||
// TODO: wrap all remote API operations in cancellation check
|
||||
// as well.
|
||||
if reconciler.cancel.is_cancelled() {
|
||||
return;
|
||||
}
|
||||
|
||||
// Attempt to make observed state match intent state
|
||||
let result = reconciler.reconcile().await;
|
||||
|
||||
// If we know we had a pending compute notification from some previous action, send a notification irrespective
|
||||
// of whether the above reconcile() did any work
|
||||
if result.is_ok() && must_notify {
|
||||
// If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`]
|
||||
reconciler.compute_notify().await.ok();
|
||||
}
|
||||
|
||||
result_tx
|
||||
.send(ReconcileResult {
|
||||
sequence: reconcile_seq,
|
||||
result,
|
||||
tenant_shard_id: reconciler.tenant_shard_id,
|
||||
generation: reconciler.generation,
|
||||
observed: reconciler.observed,
|
||||
pending_compute_notification: reconciler.compute_notify_failure,
|
||||
})
|
||||
.ok();
|
||||
});
|
||||
.instrument(reconciler_span),
|
||||
);
|
||||
|
||||
self.reconciler = Some(ReconcilerHandle {
|
||||
sequence: self.sequence,
|
||||
handle: join_handle,
|
||||
cancel,
|
||||
cancel: reconciler_cancel,
|
||||
});
|
||||
|
||||
Some(ReconcilerWaiter {
|
||||
|
||||
@@ -450,7 +450,7 @@ async fn handle_tenant(
|
||||
new_tenant_id: TenantShardId::unsharded(tenant_id),
|
||||
generation: None,
|
||||
shard_parameters: ShardParameters {
|
||||
count: ShardCount(shard_count),
|
||||
count: ShardCount::new(shard_count),
|
||||
stripe_size: shard_stripe_size
|
||||
.map(ShardStripeSize)
|
||||
.unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE),
|
||||
|
||||
@@ -115,7 +115,6 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) {
|
||||
// performed by the process.
|
||||
// We know the size of the block, so we can determine the I/O bytes out of it.
|
||||
// The value might be not 100% exact, but should be fine for Prometheus metrics in this case.
|
||||
#[allow(clippy::unnecessary_cast)]
|
||||
fn update_rusage_metrics() {
|
||||
let rusage_stats = get_rusage_stats();
|
||||
|
||||
|
||||
@@ -214,14 +214,14 @@ impl ShardParameters {
|
||||
pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
|
||||
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.count == ShardCount(0)
|
||||
self.count.is_unsharded()
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ShardParameters {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
count: ShardCount(0),
|
||||
count: ShardCount::new(0),
|
||||
stripe_size: Self::DEFAULT_STRIPE_SIZE,
|
||||
}
|
||||
}
|
||||
@@ -494,6 +494,8 @@ pub struct TimelineInfo {
|
||||
pub current_logical_size: u64,
|
||||
pub current_logical_size_is_accurate: bool,
|
||||
|
||||
pub directory_entries_counts: Vec<u64>,
|
||||
|
||||
/// Sum of the size of all layer files.
|
||||
/// If a layer is present in both local FS and S3, it counts only once.
|
||||
pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
|
||||
|
||||
@@ -124,6 +124,7 @@ impl RelTag {
|
||||
Ord,
|
||||
strum_macros::EnumIter,
|
||||
strum_macros::FromRepr,
|
||||
enum_map::Enum,
|
||||
)]
|
||||
#[repr(u8)]
|
||||
pub enum SlruKind {
|
||||
|
||||
@@ -13,10 +13,41 @@ use utils::id::TenantId;
|
||||
pub struct ShardNumber(pub u8);
|
||||
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
||||
pub struct ShardCount(pub u8);
|
||||
pub struct ShardCount(u8);
|
||||
|
||||
impl ShardCount {
|
||||
pub const MAX: Self = Self(u8::MAX);
|
||||
|
||||
/// The internal value of a ShardCount may be zero, which means "1 shard, but use
|
||||
/// legacy format for TenantShardId that excludes the shard suffix", also known
|
||||
/// as `TenantShardId::unsharded`.
|
||||
///
|
||||
/// This method returns the actual number of shards, i.e. if our internal value is
|
||||
/// zero, we return 1 (unsharded tenants have 1 shard).
|
||||
pub fn count(&self) -> u8 {
|
||||
if self.0 > 0 {
|
||||
self.0
|
||||
} else {
|
||||
1
|
||||
}
|
||||
}
|
||||
|
||||
/// The literal internal value: this is **not** the number of shards in the
|
||||
/// tenant, as we have a special zero value for legacy unsharded tenants. Use
|
||||
/// [`Self::count`] if you want to know the cardinality of shards.
|
||||
pub fn literal(&self) -> u8 {
|
||||
self.0
|
||||
}
|
||||
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.0 == 0
|
||||
}
|
||||
|
||||
/// `v` may be zero, or the number of shards in the tenant. `v` is what
|
||||
/// [`Self::literal`] would return.
|
||||
pub fn new(val: u8) -> Self {
|
||||
Self(val)
|
||||
}
|
||||
}
|
||||
|
||||
impl ShardNumber {
|
||||
@@ -86,7 +117,7 @@ impl TenantShardId {
|
||||
}
|
||||
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
|
||||
self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
|
||||
}
|
||||
|
||||
/// Convenience for dropping the tenant_id and just getting the ShardIndex: this
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
#![allow(non_snake_case)]
|
||||
// bindgen creates some unsafe code with no doc comments.
|
||||
#![allow(clippy::missing_safety_doc)]
|
||||
// noted at 1.63 that in many cases there's a u32 -> u32 transmutes in bindgen code.
|
||||
// noted at 1.63 that in many cases there's u32 -> u32 transmutes in bindgen code.
|
||||
#![allow(clippy::useless_transmute)]
|
||||
// modules included with the postgres_ffi macro depend on the types of the specific version's
|
||||
// types, and trigger a too eager lint.
|
||||
|
||||
@@ -13,5 +13,6 @@ rand.workspace = true
|
||||
tokio.workspace = true
|
||||
tracing.workspace = true
|
||||
thiserror.workspace = true
|
||||
serde.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -7,6 +7,7 @@ pub mod framed;
|
||||
|
||||
use byteorder::{BigEndian, ReadBytesExt};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{borrow::Cow, collections::HashMap, fmt, io, str};
|
||||
|
||||
// re-export for use in utils pageserver_feedback.rs
|
||||
@@ -123,7 +124,7 @@ impl StartupMessageParams {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
|
||||
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)]
|
||||
pub struct CancelKeyData {
|
||||
pub backend_pid: i32,
|
||||
pub cancel_key: i32,
|
||||
|
||||
@@ -15,11 +15,13 @@ aws-sdk-s3.workspace = true
|
||||
aws-credential-types.workspace = true
|
||||
bytes.workspace = true
|
||||
camino.workspace = true
|
||||
humantime.workspace = true
|
||||
hyper = { workspace = true, features = ["stream"] }
|
||||
futures.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
|
||||
tokio-stream.workspace = true
|
||||
tokio-util = { workspace = true, features = ["compat"] }
|
||||
toml_edit.workspace = true
|
||||
tracing.workspace = true
|
||||
|
||||
@@ -22,16 +22,15 @@ use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerCl
|
||||
use bytes::Bytes;
|
||||
use futures::stream::Stream;
|
||||
use futures_util::StreamExt;
|
||||
use futures_util::TryStreamExt;
|
||||
use http_types::{StatusCode, Url};
|
||||
use tokio::time::Instant;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::debug;
|
||||
|
||||
use crate::s3_bucket::RequestKind;
|
||||
use crate::TimeTravelError;
|
||||
use crate::{
|
||||
AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath,
|
||||
RemoteStorage, StorageMetadata,
|
||||
error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download,
|
||||
DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
|
||||
TimeTravelError, TimeoutOrCancel,
|
||||
};
|
||||
|
||||
pub struct AzureBlobStorage {
|
||||
@@ -39,10 +38,12 @@ pub struct AzureBlobStorage {
|
||||
prefix_in_container: Option<String>,
|
||||
max_keys_per_list_response: Option<NonZeroU32>,
|
||||
concurrency_limiter: ConcurrencyLimiter,
|
||||
// Per-request timeout. Accessible for tests.
|
||||
pub timeout: Duration,
|
||||
}
|
||||
|
||||
impl AzureBlobStorage {
|
||||
pub fn new(azure_config: &AzureConfig) -> Result<Self> {
|
||||
pub fn new(azure_config: &AzureConfig, timeout: Duration) -> Result<Self> {
|
||||
debug!(
|
||||
"Creating azure remote storage for azure container {}",
|
||||
azure_config.container_name
|
||||
@@ -79,6 +80,7 @@ impl AzureBlobStorage {
|
||||
prefix_in_container: azure_config.prefix_in_container.to_owned(),
|
||||
max_keys_per_list_response,
|
||||
concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()),
|
||||
timeout,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -121,8 +123,11 @@ impl AzureBlobStorage {
|
||||
async fn download_for_builder(
|
||||
&self,
|
||||
builder: GetBlobBuilder,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError> {
|
||||
let mut response = builder.into_stream();
|
||||
let kind = RequestKind::Get;
|
||||
|
||||
let _permit = self.permit(kind, cancel).await?;
|
||||
|
||||
let mut etag = None;
|
||||
let mut last_modified = None;
|
||||
@@ -130,39 +135,70 @@ impl AzureBlobStorage {
|
||||
// TODO give proper streaming response instead of buffering into RAM
|
||||
// https://github.com/neondatabase/neon/issues/5563
|
||||
|
||||
let mut bufs = Vec::new();
|
||||
while let Some(part) = response.next().await {
|
||||
let part = part.map_err(to_download_error)?;
|
||||
let etag_str: &str = part.blob.properties.etag.as_ref();
|
||||
if etag.is_none() {
|
||||
etag = Some(etag.unwrap_or_else(|| etag_str.to_owned()));
|
||||
let download = async {
|
||||
let response = builder
|
||||
// convert to concrete Pageable
|
||||
.into_stream()
|
||||
// convert to TryStream
|
||||
.into_stream()
|
||||
.map_err(to_download_error);
|
||||
|
||||
// apply per request timeout
|
||||
let response = tokio_stream::StreamExt::timeout(response, self.timeout);
|
||||
|
||||
// flatten
|
||||
let response = response.map(|res| match res {
|
||||
Ok(res) => res,
|
||||
Err(_elapsed) => Err(DownloadError::Timeout),
|
||||
});
|
||||
|
||||
let mut response = std::pin::pin!(response);
|
||||
|
||||
let mut bufs = Vec::new();
|
||||
while let Some(part) = response.next().await {
|
||||
let part = part?;
|
||||
let etag_str: &str = part.blob.properties.etag.as_ref();
|
||||
if etag.is_none() {
|
||||
etag = Some(etag.unwrap_or_else(|| etag_str.to_owned()));
|
||||
}
|
||||
if last_modified.is_none() {
|
||||
last_modified = Some(part.blob.properties.last_modified.into());
|
||||
}
|
||||
if let Some(blob_meta) = part.blob.metadata {
|
||||
metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
|
||||
}
|
||||
let data = part
|
||||
.data
|
||||
.collect()
|
||||
.await
|
||||
.map_err(|e| DownloadError::Other(e.into()))?;
|
||||
bufs.push(data);
|
||||
}
|
||||
if last_modified.is_none() {
|
||||
last_modified = Some(part.blob.properties.last_modified.into());
|
||||
}
|
||||
if let Some(blob_meta) = part.blob.metadata {
|
||||
metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
|
||||
}
|
||||
let data = part
|
||||
.data
|
||||
.collect()
|
||||
.await
|
||||
.map_err(|e| DownloadError::Other(e.into()))?;
|
||||
bufs.push(data);
|
||||
Ok(Download {
|
||||
download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
|
||||
etag,
|
||||
last_modified,
|
||||
metadata: Some(StorageMetadata(metadata)),
|
||||
})
|
||||
};
|
||||
|
||||
tokio::select! {
|
||||
bufs = download => bufs,
|
||||
_ = cancel.cancelled() => Err(DownloadError::Cancelled),
|
||||
}
|
||||
Ok(Download {
|
||||
download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
|
||||
etag,
|
||||
last_modified,
|
||||
metadata: Some(StorageMetadata(metadata)),
|
||||
})
|
||||
}
|
||||
|
||||
async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
|
||||
self.concurrency_limiter
|
||||
.acquire(kind)
|
||||
.await
|
||||
.expect("semaphore is never closed")
|
||||
async fn permit(
|
||||
&self,
|
||||
kind: RequestKind,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<tokio::sync::SemaphorePermit<'_>, Cancelled> {
|
||||
let acquire = self.concurrency_limiter.acquire(kind);
|
||||
|
||||
tokio::select! {
|
||||
permit = acquire => Ok(permit.expect("never closed")),
|
||||
_ = cancel.cancelled() => Err(Cancelled),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -192,66 +228,87 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
prefix: Option<&RemotePath>,
|
||||
mode: ListingMode,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<Listing, DownloadError> {
|
||||
// get the passed prefix or if it is not set use prefix_in_bucket value
|
||||
let list_prefix = prefix
|
||||
.map(|p| self.relative_path_to_name(p))
|
||||
.or_else(|| self.prefix_in_container.clone())
|
||||
.map(|mut p| {
|
||||
// required to end with a separator
|
||||
// otherwise request will return only the entry of a prefix
|
||||
if matches!(mode, ListingMode::WithDelimiter)
|
||||
&& !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
|
||||
{
|
||||
p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
}
|
||||
p
|
||||
let _permit = self.permit(RequestKind::List, cancel).await?;
|
||||
|
||||
let op = async {
|
||||
// get the passed prefix or if it is not set use prefix_in_bucket value
|
||||
let list_prefix = prefix
|
||||
.map(|p| self.relative_path_to_name(p))
|
||||
.or_else(|| self.prefix_in_container.clone())
|
||||
.map(|mut p| {
|
||||
// required to end with a separator
|
||||
// otherwise request will return only the entry of a prefix
|
||||
if matches!(mode, ListingMode::WithDelimiter)
|
||||
&& !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
|
||||
{
|
||||
p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
}
|
||||
p
|
||||
});
|
||||
|
||||
let mut builder = self.client.list_blobs();
|
||||
|
||||
if let ListingMode::WithDelimiter = mode {
|
||||
builder = builder.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
|
||||
}
|
||||
|
||||
if let Some(prefix) = list_prefix {
|
||||
builder = builder.prefix(Cow::from(prefix.to_owned()));
|
||||
}
|
||||
|
||||
if let Some(limit) = self.max_keys_per_list_response {
|
||||
builder = builder.max_results(MaxResults::new(limit));
|
||||
}
|
||||
|
||||
let response = builder.into_stream();
|
||||
let response = response.into_stream().map_err(to_download_error);
|
||||
let response = tokio_stream::StreamExt::timeout(response, self.timeout);
|
||||
let response = response.map(|res| match res {
|
||||
Ok(res) => res,
|
||||
Err(_elapsed) => Err(DownloadError::Timeout),
|
||||
});
|
||||
|
||||
let mut builder = self.client.list_blobs();
|
||||
let mut response = std::pin::pin!(response);
|
||||
|
||||
if let ListingMode::WithDelimiter = mode {
|
||||
builder = builder.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
|
||||
}
|
||||
let mut res = Listing::default();
|
||||
|
||||
if let Some(prefix) = list_prefix {
|
||||
builder = builder.prefix(Cow::from(prefix.to_owned()));
|
||||
}
|
||||
let mut max_keys = max_keys.map(|mk| mk.get());
|
||||
while let Some(entry) = response.next().await {
|
||||
let entry = entry?;
|
||||
let prefix_iter = entry
|
||||
.blobs
|
||||
.prefixes()
|
||||
.map(|prefix| self.name_to_relative_path(&prefix.name));
|
||||
res.prefixes.extend(prefix_iter);
|
||||
|
||||
if let Some(limit) = self.max_keys_per_list_response {
|
||||
builder = builder.max_results(MaxResults::new(limit));
|
||||
}
|
||||
let blob_iter = entry
|
||||
.blobs
|
||||
.blobs()
|
||||
.map(|k| self.name_to_relative_path(&k.name));
|
||||
|
||||
let mut response = builder.into_stream();
|
||||
let mut res = Listing::default();
|
||||
// NonZeroU32 doesn't support subtraction apparently
|
||||
let mut max_keys = max_keys.map(|mk| mk.get());
|
||||
while let Some(l) = response.next().await {
|
||||
let entry = l.map_err(to_download_error)?;
|
||||
let prefix_iter = entry
|
||||
.blobs
|
||||
.prefixes()
|
||||
.map(|prefix| self.name_to_relative_path(&prefix.name));
|
||||
res.prefixes.extend(prefix_iter);
|
||||
for key in blob_iter {
|
||||
res.keys.push(key);
|
||||
|
||||
let blob_iter = entry
|
||||
.blobs
|
||||
.blobs()
|
||||
.map(|k| self.name_to_relative_path(&k.name));
|
||||
|
||||
for key in blob_iter {
|
||||
res.keys.push(key);
|
||||
if let Some(mut mk) = max_keys {
|
||||
assert!(mk > 0);
|
||||
mk -= 1;
|
||||
if mk == 0 {
|
||||
return Ok(res); // limit reached
|
||||
if let Some(mut mk) = max_keys {
|
||||
assert!(mk > 0);
|
||||
mk -= 1;
|
||||
if mk == 0 {
|
||||
return Ok(res); // limit reached
|
||||
}
|
||||
max_keys = Some(mk);
|
||||
}
|
||||
max_keys = Some(mk);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(res)
|
||||
};
|
||||
|
||||
tokio::select! {
|
||||
res = op => res,
|
||||
_ = cancel.cancelled() => Err(DownloadError::Cancelled),
|
||||
}
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
@@ -260,35 +317,52 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
data_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let _permit = self.permit(RequestKind::Put).await;
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(to));
|
||||
let _permit = self.permit(RequestKind::Put, cancel).await?;
|
||||
|
||||
let from: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>> =
|
||||
Box::pin(from);
|
||||
let op = async {
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(to));
|
||||
|
||||
let from = NonSeekableStream::new(from, data_size_bytes);
|
||||
let from: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>> =
|
||||
Box::pin(from);
|
||||
|
||||
let body = azure_core::Body::SeekableStream(Box::new(from));
|
||||
let from = NonSeekableStream::new(from, data_size_bytes);
|
||||
|
||||
let mut builder = blob_client.put_block_blob(body);
|
||||
let body = azure_core::Body::SeekableStream(Box::new(from));
|
||||
|
||||
if let Some(metadata) = metadata {
|
||||
builder = builder.metadata(to_azure_metadata(metadata));
|
||||
let mut builder = blob_client.put_block_blob(body);
|
||||
|
||||
if let Some(metadata) = metadata {
|
||||
builder = builder.metadata(to_azure_metadata(metadata));
|
||||
}
|
||||
|
||||
let fut = builder.into_future();
|
||||
let fut = tokio::time::timeout(self.timeout, fut);
|
||||
|
||||
match fut.await {
|
||||
Ok(Ok(_response)) => Ok(()),
|
||||
Ok(Err(azure)) => Err(azure.into()),
|
||||
Err(_timeout) => Err(TimeoutOrCancel::Cancel.into()),
|
||||
}
|
||||
};
|
||||
|
||||
tokio::select! {
|
||||
res = op => res,
|
||||
_ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()),
|
||||
}
|
||||
|
||||
let _response = builder.into_future().await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
|
||||
let _permit = self.permit(RequestKind::Get).await;
|
||||
async fn download(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError> {
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(from));
|
||||
|
||||
let builder = blob_client.get();
|
||||
|
||||
self.download_for_builder(builder).await
|
||||
self.download_for_builder(builder, cancel).await
|
||||
}
|
||||
|
||||
async fn download_byte_range(
|
||||
@@ -296,8 +370,8 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
from: &RemotePath,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError> {
|
||||
let _permit = self.permit(RequestKind::Get).await;
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(from));
|
||||
|
||||
let mut builder = blob_client.get();
|
||||
@@ -309,82 +383,113 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
};
|
||||
builder = builder.range(range);
|
||||
|
||||
self.download_for_builder(builder).await
|
||||
self.download_for_builder(builder, cancel).await
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
||||
let _permit = self.permit(RequestKind::Delete).await;
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(path));
|
||||
async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> {
|
||||
self.delete_objects(std::array::from_ref(path), cancel)
|
||||
.await
|
||||
}
|
||||
|
||||
let builder = blob_client.delete();
|
||||
async fn delete_objects<'a>(
|
||||
&self,
|
||||
paths: &'a [RemotePath],
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let _permit = self.permit(RequestKind::Delete, cancel).await?;
|
||||
|
||||
match builder.into_future().await {
|
||||
Ok(_response) => Ok(()),
|
||||
Err(e) => {
|
||||
if let Some(http_err) = e.as_http_error() {
|
||||
if http_err.status() == StatusCode::NotFound {
|
||||
return Ok(());
|
||||
let op = async {
|
||||
// TODO batch requests are also not supported by the SDK
|
||||
// https://github.com/Azure/azure-sdk-for-rust/issues/1068
|
||||
// https://github.com/Azure/azure-sdk-for-rust/issues/1249
|
||||
for path in paths {
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(path));
|
||||
|
||||
let request = blob_client.delete().into_future();
|
||||
|
||||
let res = tokio::time::timeout(self.timeout, request).await;
|
||||
|
||||
match res {
|
||||
Ok(Ok(_response)) => continue,
|
||||
Ok(Err(e)) => {
|
||||
if let Some(http_err) = e.as_http_error() {
|
||||
if http_err.status() == StatusCode::NotFound {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
return Err(e.into());
|
||||
}
|
||||
Err(_elapsed) => return Err(TimeoutOrCancel::Timeout.into()),
|
||||
}
|
||||
Err(anyhow::Error::new(e))
|
||||
}
|
||||
|
||||
Ok(())
|
||||
};
|
||||
|
||||
tokio::select! {
|
||||
res = op => res,
|
||||
_ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()),
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
|
||||
// Permit is already obtained by inner delete function
|
||||
async fn copy(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
to: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let _permit = self.permit(RequestKind::Copy, cancel).await?;
|
||||
|
||||
// TODO batch requests are also not supported by the SDK
|
||||
// https://github.com/Azure/azure-sdk-for-rust/issues/1068
|
||||
// https://github.com/Azure/azure-sdk-for-rust/issues/1249
|
||||
for path in paths {
|
||||
self.delete(path).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
let timeout = tokio::time::sleep(self.timeout);
|
||||
|
||||
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
|
||||
let _permit = self.permit(RequestKind::Copy).await;
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(to));
|
||||
let mut copy_status = None;
|
||||
|
||||
let source_url = format!(
|
||||
"{}/{}",
|
||||
self.client.url()?,
|
||||
self.relative_path_to_name(from)
|
||||
);
|
||||
let builder = blob_client.copy(Url::from_str(&source_url)?);
|
||||
let op = async {
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(to));
|
||||
|
||||
let result = builder.into_future().await?;
|
||||
let source_url = format!(
|
||||
"{}/{}",
|
||||
self.client.url()?,
|
||||
self.relative_path_to_name(from)
|
||||
);
|
||||
|
||||
let mut copy_status = result.copy_status;
|
||||
let start_time = Instant::now();
|
||||
const MAX_WAIT_TIME: Duration = Duration::from_secs(60);
|
||||
loop {
|
||||
match copy_status {
|
||||
CopyStatus::Aborted => {
|
||||
anyhow::bail!("Received abort for copy from {from} to {to}.");
|
||||
let builder = blob_client.copy(Url::from_str(&source_url)?);
|
||||
let copy = builder.into_future();
|
||||
|
||||
let result = copy.await?;
|
||||
|
||||
copy_status = Some(result.copy_status);
|
||||
loop {
|
||||
match copy_status.as_ref().expect("we always set it to Some") {
|
||||
CopyStatus::Aborted => {
|
||||
anyhow::bail!("Received abort for copy from {from} to {to}.");
|
||||
}
|
||||
CopyStatus::Failed => {
|
||||
anyhow::bail!("Received failure response for copy from {from} to {to}.");
|
||||
}
|
||||
CopyStatus::Success => return Ok(()),
|
||||
CopyStatus::Pending => (),
|
||||
}
|
||||
CopyStatus::Failed => {
|
||||
anyhow::bail!("Received failure response for copy from {from} to {to}.");
|
||||
}
|
||||
CopyStatus::Success => return Ok(()),
|
||||
CopyStatus::Pending => (),
|
||||
// The copy is taking longer. Waiting a second and then re-trying.
|
||||
// TODO estimate time based on copy_progress and adjust time based on that
|
||||
tokio::time::sleep(Duration::from_millis(1000)).await;
|
||||
let properties = blob_client.get_properties().into_future().await?;
|
||||
let Some(status) = properties.blob.properties.copy_status else {
|
||||
tracing::warn!("copy_status for copy is None!, from={from}, to={to}");
|
||||
return Ok(());
|
||||
};
|
||||
copy_status = Some(status);
|
||||
}
|
||||
// The copy is taking longer. Waiting a second and then re-trying.
|
||||
// TODO estimate time based on copy_progress and adjust time based on that
|
||||
tokio::time::sleep(Duration::from_millis(1000)).await;
|
||||
let properties = blob_client.get_properties().into_future().await?;
|
||||
let Some(status) = properties.blob.properties.copy_status else {
|
||||
tracing::warn!("copy_status for copy is None!, from={from}, to={to}");
|
||||
return Ok(());
|
||||
};
|
||||
if start_time.elapsed() > MAX_WAIT_TIME {
|
||||
anyhow::bail!("Copy from from {from} to {to} took longer than limit MAX_WAIT_TIME={}s. copy_pogress={:?}.",
|
||||
MAX_WAIT_TIME.as_secs_f32(),
|
||||
properties.blob.properties.copy_progress,
|
||||
);
|
||||
}
|
||||
copy_status = status;
|
||||
};
|
||||
|
||||
tokio::select! {
|
||||
res = op => res,
|
||||
_ = cancel.cancelled() => Err(anyhow::Error::new(TimeoutOrCancel::Cancel)),
|
||||
_ = timeout => {
|
||||
let e = anyhow::Error::new(TimeoutOrCancel::Timeout);
|
||||
let e = e.context(format!("Timeout, last status: {copy_status:?}"));
|
||||
Err(e)
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
181
libs/remote_storage/src/error.rs
Normal file
181
libs/remote_storage/src/error.rs
Normal file
@@ -0,0 +1,181 @@
|
||||
/// Reasons for downloads or listings to fail.
|
||||
#[derive(Debug)]
|
||||
pub enum DownloadError {
|
||||
/// Validation or other error happened due to user input.
|
||||
BadInput(anyhow::Error),
|
||||
/// The file was not found in the remote storage.
|
||||
NotFound,
|
||||
/// A cancellation token aborted the download, typically during
|
||||
/// tenant detach or process shutdown.
|
||||
Cancelled,
|
||||
/// A timeout happened while executing the request. Possible reasons:
|
||||
/// - stuck tcp connection
|
||||
///
|
||||
/// Concurrency control is not timed within timeout.
|
||||
Timeout,
|
||||
/// The file was found in the remote storage, but the download failed.
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
|
||||
impl std::fmt::Display for DownloadError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
DownloadError::BadInput(e) => {
|
||||
write!(f, "Failed to download a remote file due to user input: {e}")
|
||||
}
|
||||
DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
|
||||
DownloadError::Cancelled => write!(f, "Cancelled, shutting down"),
|
||||
DownloadError::Timeout => write!(f, "timeout"),
|
||||
DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for DownloadError {}
|
||||
|
||||
impl DownloadError {
|
||||
/// Returns true if the error should not be retried with backoff
|
||||
pub fn is_permanent(&self) -> bool {
|
||||
use DownloadError::*;
|
||||
match self {
|
||||
BadInput(_) | NotFound | Cancelled => true,
|
||||
Timeout | Other(_) => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum TimeTravelError {
|
||||
/// Validation or other error happened due to user input.
|
||||
BadInput(anyhow::Error),
|
||||
/// The used remote storage does not have time travel recovery implemented
|
||||
Unimplemented,
|
||||
/// The number of versions/deletion markers is above our limit.
|
||||
TooManyVersions,
|
||||
/// A cancellation token aborted the process, typically during
|
||||
/// request closure or process shutdown.
|
||||
Cancelled,
|
||||
/// Other errors
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
|
||||
impl std::fmt::Display for TimeTravelError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
TimeTravelError::BadInput(e) => {
|
||||
write!(
|
||||
f,
|
||||
"Failed to time travel recover a prefix due to user input: {e}"
|
||||
)
|
||||
}
|
||||
TimeTravelError::Unimplemented => write!(
|
||||
f,
|
||||
"time travel recovery is not implemented for the current storage backend"
|
||||
),
|
||||
TimeTravelError::Cancelled => write!(f, "Cancelled, shutting down"),
|
||||
TimeTravelError::TooManyVersions => {
|
||||
write!(f, "Number of versions/delete markers above limit")
|
||||
}
|
||||
TimeTravelError::Other(e) => write!(f, "Failed to time travel recover a prefix: {e:?}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for TimeTravelError {}
|
||||
|
||||
/// Plain cancelled error.
|
||||
///
|
||||
/// By design this type does not not implement `std::error::Error` so it cannot be put as the root
|
||||
/// cause of `std::io::Error` or `anyhow::Error`. It should never need to be exposed out of this
|
||||
/// crate.
|
||||
///
|
||||
/// It exists to implement permit acquiring in `{Download,TimeTravel}Error` and `anyhow::Error` returning
|
||||
/// operations and ensuring that those get converted to proper versions with just `?`.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct Cancelled;
|
||||
|
||||
impl From<Cancelled> for anyhow::Error {
|
||||
fn from(_: Cancelled) -> Self {
|
||||
anyhow::Error::new(TimeoutOrCancel::Cancel)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Cancelled> for TimeTravelError {
|
||||
fn from(_: Cancelled) -> Self {
|
||||
TimeTravelError::Cancelled
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Cancelled> for TimeoutOrCancel {
|
||||
fn from(_: Cancelled) -> Self {
|
||||
TimeoutOrCancel::Cancel
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Cancelled> for DownloadError {
|
||||
fn from(_: Cancelled) -> Self {
|
||||
DownloadError::Cancelled
|
||||
}
|
||||
}
|
||||
|
||||
/// This type is used at as the root cause for timeouts and cancellations with `anyhow::Error` returning
|
||||
/// RemoteStorage methods.
|
||||
///
|
||||
/// For use with `utils::backoff::retry` and `anyhow::Error` returning operations there is
|
||||
/// `TimeoutOrCancel::caused_by_cancel` method to query "proper form" errors.
|
||||
#[derive(Debug)]
|
||||
pub enum TimeoutOrCancel {
|
||||
Timeout,
|
||||
Cancel,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for TimeoutOrCancel {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
use TimeoutOrCancel::*;
|
||||
match self {
|
||||
Timeout => write!(f, "timeout"),
|
||||
Cancel => write!(f, "cancel"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for TimeoutOrCancel {}
|
||||
|
||||
impl TimeoutOrCancel {
|
||||
pub fn caused(error: &anyhow::Error) -> Option<&Self> {
|
||||
error.root_cause().downcast_ref()
|
||||
}
|
||||
|
||||
/// Returns true if the error was caused by [`TimeoutOrCancel::Cancel`].
|
||||
pub fn caused_by_cancel(error: &anyhow::Error) -> bool {
|
||||
Self::caused(error).is_some_and(Self::is_cancel)
|
||||
}
|
||||
|
||||
pub fn is_cancel(&self) -> bool {
|
||||
matches!(self, TimeoutOrCancel::Cancel)
|
||||
}
|
||||
|
||||
pub fn is_timeout(&self) -> bool {
|
||||
matches!(self, TimeoutOrCancel::Timeout)
|
||||
}
|
||||
}
|
||||
|
||||
/// This conversion is used when [`crate::support::DownloadStream`] notices a cancellation or
|
||||
/// timeout to wrap it in an `std::io::Error`.
|
||||
impl From<TimeoutOrCancel> for std::io::Error {
|
||||
fn from(value: TimeoutOrCancel) -> Self {
|
||||
let e = DownloadError::from(value);
|
||||
std::io::Error::other(e)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<TimeoutOrCancel> for DownloadError {
|
||||
fn from(value: TimeoutOrCancel) -> Self {
|
||||
use TimeoutOrCancel::*;
|
||||
|
||||
match value {
|
||||
Timeout => DownloadError::Timeout,
|
||||
Cancel => DownloadError::Cancelled,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -10,6 +10,7 @@
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
|
||||
mod azure_blob;
|
||||
mod error;
|
||||
mod local_fs;
|
||||
mod s3_bucket;
|
||||
mod simulate_failures;
|
||||
@@ -21,7 +22,7 @@ use std::{
|
||||
num::{NonZeroU32, NonZeroUsize},
|
||||
pin::Pin,
|
||||
sync::Arc,
|
||||
time::SystemTime,
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
@@ -41,6 +42,8 @@ pub use self::{
|
||||
};
|
||||
use s3_bucket::RequestKind;
|
||||
|
||||
pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};
|
||||
|
||||
/// Currently, sync happens with AWS S3, that has two limits on requests per second:
|
||||
/// ~200 RPS for IAM services
|
||||
/// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
|
||||
@@ -158,9 +161,10 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
let result = self
|
||||
.list(prefix, ListingMode::WithDelimiter, None)
|
||||
.list(prefix, ListingMode::WithDelimiter, None, cancel)
|
||||
.await?
|
||||
.prefixes;
|
||||
Ok(result)
|
||||
@@ -182,9 +186,10 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
let result = self
|
||||
.list(prefix, ListingMode::NoDelimiter, max_keys)
|
||||
.list(prefix, ListingMode::NoDelimiter, max_keys, cancel)
|
||||
.await?
|
||||
.keys;
|
||||
Ok(result)
|
||||
@@ -195,9 +200,13 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
prefix: Option<&RemotePath>,
|
||||
_mode: ListingMode,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Listing, DownloadError>;
|
||||
|
||||
/// Streams the local file contents into remote into the remote storage entry.
|
||||
///
|
||||
/// If the operation fails because of timeout or cancellation, the root cause of the error will be
|
||||
/// set to `TimeoutOrCancel`.
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
@@ -206,27 +215,61 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
data_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()>;
|
||||
|
||||
/// Streams the remote storage entry contents into the buffered writer given, returns the filled writer.
|
||||
/// Streams the remote storage entry contents.
|
||||
///
|
||||
/// The returned download stream will obey initial timeout and cancellation signal by erroring
|
||||
/// on whichever happens first. Only one of the reasons will fail the stream, which is usually
|
||||
/// enough for `tokio::io::copy_buf` usage. If needed the error can be filtered out.
|
||||
///
|
||||
/// Returns the metadata, if any was stored with the file previously.
|
||||
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError>;
|
||||
async fn download(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError>;
|
||||
|
||||
/// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer.
|
||||
/// Streams a given byte range of the remote storage entry contents.
|
||||
///
|
||||
/// The returned download stream will obey initial timeout and cancellation signal by erroring
|
||||
/// on whichever happens first. Only one of the reasons will fail the stream, which is usually
|
||||
/// enough for `tokio::io::copy_buf` usage. If needed the error can be filtered out.
|
||||
///
|
||||
/// Returns the metadata, if any was stored with the file previously.
|
||||
async fn download_byte_range(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError>;
|
||||
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;
|
||||
/// Delete a single path from remote storage.
|
||||
///
|
||||
/// If the operation fails because of timeout or cancellation, the root cause of the error will be
|
||||
/// set to `TimeoutOrCancel`. In such situation it is unknown if the deletion went through.
|
||||
async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()>;
|
||||
|
||||
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
|
||||
/// Delete a multiple paths from remote storage.
|
||||
///
|
||||
/// If the operation fails because of timeout or cancellation, the root cause of the error will be
|
||||
/// set to `TimeoutOrCancel`. In such situation it is unknown which deletions, if any, went
|
||||
/// through.
|
||||
async fn delete_objects<'a>(
|
||||
&self,
|
||||
paths: &'a [RemotePath],
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()>;
|
||||
|
||||
/// Copy a remote object inside a bucket from one path to another.
|
||||
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>;
|
||||
async fn copy(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
to: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()>;
|
||||
|
||||
/// Resets the content of everything with the given prefix to the given state
|
||||
async fn time_travel_recover(
|
||||
@@ -238,7 +281,13 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
) -> Result<(), TimeTravelError>;
|
||||
}
|
||||
|
||||
pub type DownloadStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>;
|
||||
/// DownloadStream is sensitive to the timeout and cancellation used with the original
|
||||
/// [`RemoteStorage::download`] request. The type yields `std::io::Result<Bytes>` to be compatible
|
||||
/// with `tokio::io::copy_buf`.
|
||||
// This has 'static because safekeepers do not use cancellation tokens (yet)
|
||||
pub type DownloadStream =
|
||||
Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>>;
|
||||
|
||||
pub struct Download {
|
||||
pub download_stream: DownloadStream,
|
||||
/// The last time the file was modified (`last-modified` HTTP header)
|
||||
@@ -257,86 +306,6 @@ impl Debug for Download {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum DownloadError {
|
||||
/// Validation or other error happened due to user input.
|
||||
BadInput(anyhow::Error),
|
||||
/// The file was not found in the remote storage.
|
||||
NotFound,
|
||||
/// A cancellation token aborted the download, typically during
|
||||
/// tenant detach or process shutdown.
|
||||
Cancelled,
|
||||
/// The file was found in the remote storage, but the download failed.
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
|
||||
impl std::fmt::Display for DownloadError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
DownloadError::BadInput(e) => {
|
||||
write!(f, "Failed to download a remote file due to user input: {e}")
|
||||
}
|
||||
DownloadError::Cancelled => write!(f, "Cancelled, shutting down"),
|
||||
DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
|
||||
DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for DownloadError {}
|
||||
|
||||
impl DownloadError {
|
||||
/// Returns true if the error should not be retried with backoff
|
||||
pub fn is_permanent(&self) -> bool {
|
||||
use DownloadError::*;
|
||||
match self {
|
||||
BadInput(_) => true,
|
||||
NotFound => true,
|
||||
Cancelled => true,
|
||||
Other(_) => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum TimeTravelError {
|
||||
/// Validation or other error happened due to user input.
|
||||
BadInput(anyhow::Error),
|
||||
/// The used remote storage does not have time travel recovery implemented
|
||||
Unimplemented,
|
||||
/// The number of versions/deletion markers is above our limit.
|
||||
TooManyVersions,
|
||||
/// A cancellation token aborted the process, typically during
|
||||
/// request closure or process shutdown.
|
||||
Cancelled,
|
||||
/// Other errors
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
|
||||
impl std::fmt::Display for TimeTravelError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
TimeTravelError::BadInput(e) => {
|
||||
write!(
|
||||
f,
|
||||
"Failed to time travel recover a prefix due to user input: {e}"
|
||||
)
|
||||
}
|
||||
TimeTravelError::Unimplemented => write!(
|
||||
f,
|
||||
"time travel recovery is not implemented for the current storage backend"
|
||||
),
|
||||
TimeTravelError::Cancelled => write!(f, "Cancelled, shutting down"),
|
||||
TimeTravelError::TooManyVersions => {
|
||||
write!(f, "Number of versions/delete markers above limit")
|
||||
}
|
||||
TimeTravelError::Other(e) => write!(f, "Failed to time travel recover a prefix: {e:?}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for TimeTravelError {}
|
||||
|
||||
/// Every storage, currently supported.
|
||||
/// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
|
||||
#[derive(Clone)]
|
||||
@@ -354,12 +323,13 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
prefix: Option<&RemotePath>,
|
||||
mode: ListingMode,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<Listing, DownloadError> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.list(prefix, mode, max_keys).await,
|
||||
Self::AwsS3(s) => s.list(prefix, mode, max_keys).await,
|
||||
Self::AzureBlob(s) => s.list(prefix, mode, max_keys).await,
|
||||
Self::Unreliable(s) => s.list(prefix, mode, max_keys).await,
|
||||
Self::LocalFs(s) => s.list(prefix, mode, max_keys, cancel).await,
|
||||
Self::AwsS3(s) => s.list(prefix, mode, max_keys, cancel).await,
|
||||
Self::AzureBlob(s) => s.list(prefix, mode, max_keys, cancel).await,
|
||||
Self::Unreliable(s) => s.list(prefix, mode, max_keys, cancel).await,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -372,12 +342,13 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
&self,
|
||||
folder: Option<&RemotePath>,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.list_files(folder, max_keys).await,
|
||||
Self::AwsS3(s) => s.list_files(folder, max_keys).await,
|
||||
Self::AzureBlob(s) => s.list_files(folder, max_keys).await,
|
||||
Self::Unreliable(s) => s.list_files(folder, max_keys).await,
|
||||
Self::LocalFs(s) => s.list_files(folder, max_keys, cancel).await,
|
||||
Self::AwsS3(s) => s.list_files(folder, max_keys, cancel).await,
|
||||
Self::AzureBlob(s) => s.list_files(folder, max_keys, cancel).await,
|
||||
Self::Unreliable(s) => s.list_files(folder, max_keys, cancel).await,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -387,36 +358,43 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
pub async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.list_prefixes(prefix).await,
|
||||
Self::AwsS3(s) => s.list_prefixes(prefix).await,
|
||||
Self::AzureBlob(s) => s.list_prefixes(prefix).await,
|
||||
Self::Unreliable(s) => s.list_prefixes(prefix).await,
|
||||
Self::LocalFs(s) => s.list_prefixes(prefix, cancel).await,
|
||||
Self::AwsS3(s) => s.list_prefixes(prefix, cancel).await,
|
||||
Self::AzureBlob(s) => s.list_prefixes(prefix, cancel).await,
|
||||
Self::Unreliable(s) => s.list_prefixes(prefix, cancel).await,
|
||||
}
|
||||
}
|
||||
|
||||
/// See [`RemoteStorage::upload`]
|
||||
pub async fn upload(
|
||||
&self,
|
||||
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
data_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.upload(from, data_size_bytes, to, metadata).await,
|
||||
Self::AwsS3(s) => s.upload(from, data_size_bytes, to, metadata).await,
|
||||
Self::AzureBlob(s) => s.upload(from, data_size_bytes, to, metadata).await,
|
||||
Self::Unreliable(s) => s.upload(from, data_size_bytes, to, metadata).await,
|
||||
Self::LocalFs(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await,
|
||||
Self::AwsS3(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await,
|
||||
Self::AzureBlob(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await,
|
||||
Self::Unreliable(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
|
||||
pub async fn download(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.download(from).await,
|
||||
Self::AwsS3(s) => s.download(from).await,
|
||||
Self::AzureBlob(s) => s.download(from).await,
|
||||
Self::Unreliable(s) => s.download(from).await,
|
||||
Self::LocalFs(s) => s.download(from, cancel).await,
|
||||
Self::AwsS3(s) => s.download(from, cancel).await,
|
||||
Self::AzureBlob(s) => s.download(from, cancel).await,
|
||||
Self::Unreliable(s) => s.download(from, cancel).await,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -425,54 +403,72 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
from: &RemotePath,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError> {
|
||||
match self {
|
||||
Self::LocalFs(s) => {
|
||||
s.download_byte_range(from, start_inclusive, end_exclusive)
|
||||
s.download_byte_range(from, start_inclusive, end_exclusive, cancel)
|
||||
.await
|
||||
}
|
||||
Self::AwsS3(s) => {
|
||||
s.download_byte_range(from, start_inclusive, end_exclusive)
|
||||
s.download_byte_range(from, start_inclusive, end_exclusive, cancel)
|
||||
.await
|
||||
}
|
||||
Self::AzureBlob(s) => {
|
||||
s.download_byte_range(from, start_inclusive, end_exclusive)
|
||||
s.download_byte_range(from, start_inclusive, end_exclusive, cancel)
|
||||
.await
|
||||
}
|
||||
Self::Unreliable(s) => {
|
||||
s.download_byte_range(from, start_inclusive, end_exclusive)
|
||||
s.download_byte_range(from, start_inclusive, end_exclusive, cancel)
|
||||
.await
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
||||
/// See [`RemoteStorage::delete`]
|
||||
pub async fn delete(
|
||||
&self,
|
||||
path: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.delete(path).await,
|
||||
Self::AwsS3(s) => s.delete(path).await,
|
||||
Self::AzureBlob(s) => s.delete(path).await,
|
||||
Self::Unreliable(s) => s.delete(path).await,
|
||||
Self::LocalFs(s) => s.delete(path, cancel).await,
|
||||
Self::AwsS3(s) => s.delete(path, cancel).await,
|
||||
Self::AzureBlob(s) => s.delete(path, cancel).await,
|
||||
Self::Unreliable(s) => s.delete(path, cancel).await,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
|
||||
/// See [`RemoteStorage::delete_objects`]
|
||||
pub async fn delete_objects(
|
||||
&self,
|
||||
paths: &[RemotePath],
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.delete_objects(paths).await,
|
||||
Self::AwsS3(s) => s.delete_objects(paths).await,
|
||||
Self::AzureBlob(s) => s.delete_objects(paths).await,
|
||||
Self::Unreliable(s) => s.delete_objects(paths).await,
|
||||
Self::LocalFs(s) => s.delete_objects(paths, cancel).await,
|
||||
Self::AwsS3(s) => s.delete_objects(paths, cancel).await,
|
||||
Self::AzureBlob(s) => s.delete_objects(paths, cancel).await,
|
||||
Self::Unreliable(s) => s.delete_objects(paths, cancel).await,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn copy_object(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
|
||||
/// See [`RemoteStorage::copy`]
|
||||
pub async fn copy_object(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
to: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.copy(from, to).await,
|
||||
Self::AwsS3(s) => s.copy(from, to).await,
|
||||
Self::AzureBlob(s) => s.copy(from, to).await,
|
||||
Self::Unreliable(s) => s.copy(from, to).await,
|
||||
Self::LocalFs(s) => s.copy(from, to, cancel).await,
|
||||
Self::AwsS3(s) => s.copy(from, to, cancel).await,
|
||||
Self::AzureBlob(s) => s.copy(from, to, cancel).await,
|
||||
Self::Unreliable(s) => s.copy(from, to, cancel).await,
|
||||
}
|
||||
}
|
||||
|
||||
/// See [`RemoteStorage::time_travel_recover`].
|
||||
pub async fn time_travel_recover(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
@@ -503,10 +499,11 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
|
||||
impl GenericRemoteStorage {
|
||||
pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
|
||||
let timeout = storage_config.timeout;
|
||||
Ok(match &storage_config.storage {
|
||||
RemoteStorageKind::LocalFs(root) => {
|
||||
info!("Using fs root '{root}' as a remote storage");
|
||||
Self::LocalFs(LocalFs::new(root.clone())?)
|
||||
RemoteStorageKind::LocalFs(path) => {
|
||||
info!("Using fs root '{path}' as a remote storage");
|
||||
Self::LocalFs(LocalFs::new(path.clone(), timeout)?)
|
||||
}
|
||||
RemoteStorageKind::AwsS3(s3_config) => {
|
||||
// The profile and access key id are only printed here for debugging purposes,
|
||||
@@ -516,12 +513,12 @@ impl GenericRemoteStorage {
|
||||
std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "<none>".into());
|
||||
info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}",
|
||||
s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
|
||||
Self::AwsS3(Arc::new(S3Bucket::new(s3_config)?))
|
||||
Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout)?))
|
||||
}
|
||||
RemoteStorageKind::AzureContainer(azure_config) => {
|
||||
info!("Using azure container '{}' in region '{}' as a remote storage, prefix in container: '{:?}'",
|
||||
azure_config.container_name, azure_config.container_region, azure_config.prefix_in_container);
|
||||
Self::AzureBlob(Arc::new(AzureBlobStorage::new(azure_config)?))
|
||||
Self::AzureBlob(Arc::new(AzureBlobStorage::new(azure_config, timeout)?))
|
||||
}
|
||||
})
|
||||
}
|
||||
@@ -530,18 +527,15 @@ impl GenericRemoteStorage {
|
||||
Self::Unreliable(Arc::new(UnreliableWrapper::new(s, fail_first)))
|
||||
}
|
||||
|
||||
/// Takes storage object contents and its size and uploads to remote storage,
|
||||
/// mapping `from_path` to the corresponding remote object id in the storage.
|
||||
///
|
||||
/// The storage object does not have to be present on the `from_path`,
|
||||
/// this path is used for the remote object id conversion only.
|
||||
/// See [`RemoteStorage::upload`], which this method calls with `None` as metadata.
|
||||
pub async fn upload_storage_object(
|
||||
&self,
|
||||
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
from_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
self.upload(from, from_size_bytes, to, None)
|
||||
self.upload(from, from_size_bytes, to, None, cancel)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to upload data of length {from_size_bytes} to storage path {to:?}")
|
||||
@@ -554,10 +548,11 @@ impl GenericRemoteStorage {
|
||||
&self,
|
||||
byte_range: Option<(u64, Option<u64>)>,
|
||||
from: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError> {
|
||||
match byte_range {
|
||||
Some((start, end)) => self.download_byte_range(from, start, end).await,
|
||||
None => self.download(from).await,
|
||||
Some((start, end)) => self.download_byte_range(from, start, end, cancel).await,
|
||||
None => self.download(from, cancel).await,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -572,6 +567,9 @@ pub struct StorageMetadata(HashMap<String, String>);
|
||||
pub struct RemoteStorageConfig {
|
||||
/// The storage connection configuration.
|
||||
pub storage: RemoteStorageKind,
|
||||
/// A common timeout enforced for all requests after concurrency limiter permit has been
|
||||
/// acquired.
|
||||
pub timeout: Duration,
|
||||
}
|
||||
|
||||
/// A kind of a remote storage to connect to, with its connection configuration.
|
||||
@@ -656,6 +654,8 @@ impl Debug for AzureConfig {
|
||||
}
|
||||
|
||||
impl RemoteStorageConfig {
|
||||
pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120);
|
||||
|
||||
pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<Option<RemoteStorageConfig>> {
|
||||
let local_path = toml.get("local_path");
|
||||
let bucket_name = toml.get("bucket_name");
|
||||
@@ -685,6 +685,27 @@ impl RemoteStorageConfig {
|
||||
.map(|endpoint| parse_toml_string("endpoint", endpoint))
|
||||
.transpose()?;
|
||||
|
||||
let timeout = toml
|
||||
.get("timeout")
|
||||
.map(|timeout| {
|
||||
timeout
|
||||
.as_str()
|
||||
.ok_or_else(|| anyhow::Error::msg("timeout was not a string"))
|
||||
})
|
||||
.transpose()
|
||||
.and_then(|timeout| {
|
||||
timeout
|
||||
.map(humantime::parse_duration)
|
||||
.transpose()
|
||||
.map_err(anyhow::Error::new)
|
||||
})
|
||||
.context("parse timeout")?
|
||||
.unwrap_or(Self::DEFAULT_TIMEOUT);
|
||||
|
||||
if timeout < Duration::from_secs(1) {
|
||||
bail!("timeout was specified as {timeout:?} which is too low");
|
||||
}
|
||||
|
||||
let storage = match (
|
||||
local_path,
|
||||
bucket_name,
|
||||
@@ -746,7 +767,7 @@ impl RemoteStorageConfig {
|
||||
}
|
||||
};
|
||||
|
||||
Ok(Some(RemoteStorageConfig { storage }))
|
||||
Ok(Some(RemoteStorageConfig { storage, timeout }))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -842,4 +863,24 @@ mod tests {
|
||||
let err = RemotePath::new(Utf8Path::new("/")).expect_err("Should fail on absolute paths");
|
||||
assert_eq!(err.to_string(), "Path \"/\" is not relative");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_localfs_config_with_timeout() {
|
||||
let input = "local_path = '.'
|
||||
timeout = '5s'";
|
||||
|
||||
let toml = input.parse::<toml_edit::Document>().unwrap();
|
||||
|
||||
let config = RemoteStorageConfig::from_toml(toml.as_item())
|
||||
.unwrap()
|
||||
.expect("it exists");
|
||||
|
||||
assert_eq!(
|
||||
config,
|
||||
RemoteStorageConfig {
|
||||
storage: RemoteStorageKind::LocalFs(Utf8PathBuf::from(".")),
|
||||
timeout: Duration::from_secs(5)
|
||||
}
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,7 +5,12 @@
|
||||
//! volume is mounted to the local FS.
|
||||
|
||||
use std::{
|
||||
borrow::Cow, future::Future, io::ErrorKind, num::NonZeroU32, pin::Pin, time::SystemTime,
|
||||
borrow::Cow,
|
||||
future::Future,
|
||||
io::ErrorKind,
|
||||
num::NonZeroU32,
|
||||
pin::Pin,
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
@@ -20,7 +25,9 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken};
|
||||
use tracing::*;
|
||||
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
|
||||
|
||||
use crate::{Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError};
|
||||
use crate::{
|
||||
Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
|
||||
};
|
||||
|
||||
use super::{RemoteStorage, StorageMetadata};
|
||||
|
||||
@@ -29,12 +36,13 @@ const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp";
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct LocalFs {
|
||||
storage_root: Utf8PathBuf,
|
||||
timeout: Duration,
|
||||
}
|
||||
|
||||
impl LocalFs {
|
||||
/// Attempts to create local FS storage, along with its root directory.
|
||||
/// Storage root will be created (if does not exist) and transformed into an absolute path (if passed as relative).
|
||||
pub fn new(mut storage_root: Utf8PathBuf) -> anyhow::Result<Self> {
|
||||
pub fn new(mut storage_root: Utf8PathBuf, timeout: Duration) -> anyhow::Result<Self> {
|
||||
if !storage_root.exists() {
|
||||
std::fs::create_dir_all(&storage_root).with_context(|| {
|
||||
format!("Failed to create all directories in the given root path {storage_root:?}")
|
||||
@@ -46,7 +54,10 @@ impl LocalFs {
|
||||
})?;
|
||||
}
|
||||
|
||||
Ok(Self { storage_root })
|
||||
Ok(Self {
|
||||
storage_root,
|
||||
timeout,
|
||||
})
|
||||
}
|
||||
|
||||
// mirrors S3Bucket::s3_object_to_relative_path
|
||||
@@ -157,80 +168,14 @@ impl LocalFs {
|
||||
|
||||
Ok(files)
|
||||
}
|
||||
}
|
||||
|
||||
impl RemoteStorage for LocalFs {
|
||||
async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
mode: ListingMode,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
) -> Result<Listing, DownloadError> {
|
||||
let mut result = Listing::default();
|
||||
|
||||
if let ListingMode::NoDelimiter = mode {
|
||||
let keys = self
|
||||
.list_recursive(prefix)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
result.keys = keys
|
||||
.into_iter()
|
||||
.filter(|k| {
|
||||
let path = k.with_base(&self.storage_root);
|
||||
!path.is_dir()
|
||||
})
|
||||
.collect();
|
||||
if let Some(max_keys) = max_keys {
|
||||
result.keys.truncate(max_keys.get() as usize);
|
||||
}
|
||||
|
||||
return Ok(result);
|
||||
}
|
||||
|
||||
let path = match prefix {
|
||||
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
|
||||
None => Cow::Borrowed(&self.storage_root),
|
||||
};
|
||||
|
||||
let prefixes_to_filter = get_all_files(path.as_ref(), false)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
// filter out empty directories to mirror s3 behavior.
|
||||
for prefix in prefixes_to_filter {
|
||||
if prefix.is_dir()
|
||||
&& is_directory_empty(&prefix)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
let stripped = prefix
|
||||
.strip_prefix(&self.storage_root)
|
||||
.context("Failed to strip prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.expect(
|
||||
"We list files for storage root, hence should be able to remote the prefix",
|
||||
);
|
||||
|
||||
if prefix.is_dir() {
|
||||
result.prefixes.push(stripped);
|
||||
} else {
|
||||
result.keys.push(stripped);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
async fn upload0(
|
||||
&self,
|
||||
data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
|
||||
data_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let target_file_path = to.with_base(&self.storage_root);
|
||||
create_target_directory(&target_file_path).await?;
|
||||
@@ -265,9 +210,26 @@ impl RemoteStorage for LocalFs {
|
||||
let mut buffer_to_read = data.take(from_size_bytes);
|
||||
|
||||
// alternatively we could just write the bytes to a file, but local_fs is a testing utility
|
||||
let bytes_read = io::copy_buf(&mut buffer_to_read, &mut destination)
|
||||
.await
|
||||
.with_context(|| {
|
||||
let copy = io::copy_buf(&mut buffer_to_read, &mut destination);
|
||||
|
||||
let bytes_read = tokio::select! {
|
||||
biased;
|
||||
_ = cancel.cancelled() => {
|
||||
let file = destination.into_inner();
|
||||
// wait for the inflight operation(s) to complete so that there could be a next
|
||||
// attempt right away and our writes are not directed to their file.
|
||||
file.into_std().await;
|
||||
|
||||
// TODO: leave the temp or not? leaving is probably less racy. enabled truncate at
|
||||
// least.
|
||||
fs::remove_file(temp_file_path).await.context("remove temp_file_path after cancellation or timeout")?;
|
||||
return Err(TimeoutOrCancel::Cancel.into());
|
||||
}
|
||||
read = copy => read,
|
||||
};
|
||||
|
||||
let bytes_read =
|
||||
bytes_read.with_context(|| {
|
||||
format!(
|
||||
"Failed to upload file (write temp) to the local storage at '{temp_file_path}'",
|
||||
)
|
||||
@@ -299,6 +261,9 @@ impl RemoteStorage for LocalFs {
|
||||
})?;
|
||||
|
||||
if let Some(storage_metadata) = metadata {
|
||||
// FIXME: we must not be using metadata much, since this would forget the old metadata
|
||||
// for new writes? or perhaps metadata is sticky; could consider removing if it's never
|
||||
// used.
|
||||
let storage_metadata_path = storage_metadata_path(&target_file_path);
|
||||
fs::write(
|
||||
&storage_metadata_path,
|
||||
@@ -315,8 +280,131 @@ impl RemoteStorage for LocalFs {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
|
||||
impl RemoteStorage for LocalFs {
|
||||
async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
mode: ListingMode,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Listing, DownloadError> {
|
||||
let op = async {
|
||||
let mut result = Listing::default();
|
||||
|
||||
if let ListingMode::NoDelimiter = mode {
|
||||
let keys = self
|
||||
.list_recursive(prefix)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
result.keys = keys
|
||||
.into_iter()
|
||||
.filter(|k| {
|
||||
let path = k.with_base(&self.storage_root);
|
||||
!path.is_dir()
|
||||
})
|
||||
.collect();
|
||||
|
||||
if let Some(max_keys) = max_keys {
|
||||
result.keys.truncate(max_keys.get() as usize);
|
||||
}
|
||||
|
||||
return Ok(result);
|
||||
}
|
||||
|
||||
let path = match prefix {
|
||||
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
|
||||
None => Cow::Borrowed(&self.storage_root),
|
||||
};
|
||||
|
||||
let prefixes_to_filter = get_all_files(path.as_ref(), false)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
// filter out empty directories to mirror s3 behavior.
|
||||
for prefix in prefixes_to_filter {
|
||||
if prefix.is_dir()
|
||||
&& is_directory_empty(&prefix)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
let stripped = prefix
|
||||
.strip_prefix(&self.storage_root)
|
||||
.context("Failed to strip prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.expect(
|
||||
"We list files for storage root, hence should be able to remote the prefix",
|
||||
);
|
||||
|
||||
if prefix.is_dir() {
|
||||
result.prefixes.push(stripped);
|
||||
} else {
|
||||
result.keys.push(stripped);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
};
|
||||
|
||||
let timeout = async {
|
||||
tokio::time::sleep(self.timeout).await;
|
||||
Err(DownloadError::Timeout)
|
||||
};
|
||||
|
||||
let cancelled = async {
|
||||
cancel.cancelled().await;
|
||||
Err(DownloadError::Cancelled)
|
||||
};
|
||||
|
||||
tokio::select! {
|
||||
res = op => res,
|
||||
res = timeout => res,
|
||||
res = cancelled => res,
|
||||
}
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
|
||||
data_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let cancel = cancel.child_token();
|
||||
|
||||
let op = self.upload0(data, data_size_bytes, to, metadata, &cancel);
|
||||
let mut op = std::pin::pin!(op);
|
||||
|
||||
// race the upload0 to the timeout; if it goes over, do a graceful shutdown
|
||||
let (res, timeout) = tokio::select! {
|
||||
res = &mut op => (res, false),
|
||||
_ = tokio::time::sleep(self.timeout) => {
|
||||
cancel.cancel();
|
||||
(op.await, true)
|
||||
}
|
||||
};
|
||||
|
||||
match res {
|
||||
Err(e) if timeout && TimeoutOrCancel::caused_by_cancel(&e) => {
|
||||
// we caused this cancel (or they happened simultaneously) -- swap it out to
|
||||
// Timeout
|
||||
Err(TimeoutOrCancel::Timeout.into())
|
||||
}
|
||||
res => res,
|
||||
}
|
||||
}
|
||||
|
||||
async fn download(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError> {
|
||||
let target_path = from.with_base(&self.storage_root);
|
||||
if file_exists(&target_path).map_err(DownloadError::BadInput)? {
|
||||
let source = ReaderStream::new(
|
||||
@@ -334,6 +422,10 @@ impl RemoteStorage for LocalFs {
|
||||
.read_storage_metadata(&target_path)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
|
||||
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
|
||||
|
||||
Ok(Download {
|
||||
metadata,
|
||||
last_modified: None,
|
||||
@@ -350,6 +442,7 @@ impl RemoteStorage for LocalFs {
|
||||
from: &RemotePath,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError> {
|
||||
if let Some(end_exclusive) = end_exclusive {
|
||||
if end_exclusive <= start_inclusive {
|
||||
@@ -391,6 +484,9 @@ impl RemoteStorage for LocalFs {
|
||||
let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive);
|
||||
let source = ReaderStream::new(source);
|
||||
|
||||
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
|
||||
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
|
||||
|
||||
Ok(Download {
|
||||
metadata,
|
||||
last_modified: None,
|
||||
@@ -402,7 +498,7 @@ impl RemoteStorage for LocalFs {
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
||||
async fn delete(&self, path: &RemotePath, _cancel: &CancellationToken) -> anyhow::Result<()> {
|
||||
let file_path = path.with_base(&self.storage_root);
|
||||
match fs::remove_file(&file_path).await {
|
||||
Ok(()) => Ok(()),
|
||||
@@ -414,14 +510,23 @@ impl RemoteStorage for LocalFs {
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
|
||||
async fn delete_objects<'a>(
|
||||
&self,
|
||||
paths: &'a [RemotePath],
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
for path in paths {
|
||||
self.delete(path).await?
|
||||
self.delete(path, cancel).await?
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
|
||||
async fn copy(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
to: &RemotePath,
|
||||
_cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let from_path = from.with_base(&self.storage_root);
|
||||
let to_path = to.with_base(&self.storage_root);
|
||||
create_target_directory(&to_path).await?;
|
||||
@@ -435,7 +540,6 @@ impl RemoteStorage for LocalFs {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[allow(clippy::diverging_sub_expression)]
|
||||
async fn time_travel_recover(
|
||||
&self,
|
||||
_prefix: Option<&RemotePath>,
|
||||
@@ -529,8 +633,9 @@ mod fs_tests {
|
||||
remote_storage_path: &RemotePath,
|
||||
expected_metadata: Option<&StorageMetadata>,
|
||||
) -> anyhow::Result<String> {
|
||||
let cancel = CancellationToken::new();
|
||||
let download = storage
|
||||
.download(remote_storage_path)
|
||||
.download(remote_storage_path, &cancel)
|
||||
.await
|
||||
.map_err(|e| anyhow::anyhow!("Download failed: {e}"))?;
|
||||
ensure!(
|
||||
@@ -545,16 +650,16 @@ mod fs_tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn upload_file() -> anyhow::Result<()> {
|
||||
let storage = create_storage()?;
|
||||
let (storage, cancel) = create_storage()?;
|
||||
|
||||
let target_path_1 = upload_dummy_file(&storage, "upload_1", None).await?;
|
||||
let target_path_1 = upload_dummy_file(&storage, "upload_1", None, &cancel).await?;
|
||||
assert_eq!(
|
||||
storage.list_all().await?,
|
||||
vec![target_path_1.clone()],
|
||||
"Should list a single file after first upload"
|
||||
);
|
||||
|
||||
let target_path_2 = upload_dummy_file(&storage, "upload_2", None).await?;
|
||||
let target_path_2 = upload_dummy_file(&storage, "upload_2", None, &cancel).await?;
|
||||
assert_eq!(
|
||||
list_files_sorted(&storage).await?,
|
||||
vec![target_path_1.clone(), target_path_2.clone()],
|
||||
@@ -566,7 +671,7 @@ mod fs_tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn upload_file_negatives() -> anyhow::Result<()> {
|
||||
let storage = create_storage()?;
|
||||
let (storage, cancel) = create_storage()?;
|
||||
|
||||
let id = RemotePath::new(Utf8Path::new("dummy"))?;
|
||||
let content = Bytes::from_static(b"12345");
|
||||
@@ -575,34 +680,34 @@ mod fs_tests {
|
||||
// Check that you get an error if the size parameter doesn't match the actual
|
||||
// size of the stream.
|
||||
storage
|
||||
.upload(content(), 0, &id, None)
|
||||
.upload(content(), 0, &id, None, &cancel)
|
||||
.await
|
||||
.expect_err("upload with zero size succeeded");
|
||||
storage
|
||||
.upload(content(), 4, &id, None)
|
||||
.upload(content(), 4, &id, None, &cancel)
|
||||
.await
|
||||
.expect_err("upload with too short size succeeded");
|
||||
storage
|
||||
.upload(content(), 6, &id, None)
|
||||
.upload(content(), 6, &id, None, &cancel)
|
||||
.await
|
||||
.expect_err("upload with too large size succeeded");
|
||||
|
||||
// Correct size is 5, this should succeed.
|
||||
storage.upload(content(), 5, &id, None).await?;
|
||||
storage.upload(content(), 5, &id, None, &cancel).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn create_storage() -> anyhow::Result<LocalFs> {
|
||||
fn create_storage() -> anyhow::Result<(LocalFs, CancellationToken)> {
|
||||
let storage_root = tempdir()?.path().to_path_buf();
|
||||
LocalFs::new(storage_root)
|
||||
LocalFs::new(storage_root, Duration::from_secs(120)).map(|s| (s, CancellationToken::new()))
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_file() -> anyhow::Result<()> {
|
||||
let storage = create_storage()?;
|
||||
let (storage, cancel) = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
|
||||
let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?;
|
||||
|
||||
let contents = read_and_check_metadata(&storage, &upload_target, None).await?;
|
||||
assert_eq!(
|
||||
@@ -612,7 +717,7 @@ mod fs_tests {
|
||||
);
|
||||
|
||||
let non_existing_path = "somewhere/else";
|
||||
match storage.download(&RemotePath::new(Utf8Path::new(non_existing_path))?).await {
|
||||
match storage.download(&RemotePath::new(Utf8Path::new(non_existing_path))?, &cancel).await {
|
||||
Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys
|
||||
other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"),
|
||||
}
|
||||
@@ -621,9 +726,9 @@ mod fs_tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_file_range_positive() -> anyhow::Result<()> {
|
||||
let storage = create_storage()?;
|
||||
let (storage, cancel) = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
|
||||
let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?;
|
||||
|
||||
let full_range_download_contents =
|
||||
read_and_check_metadata(&storage, &upload_target, None).await?;
|
||||
@@ -637,7 +742,12 @@ mod fs_tests {
|
||||
let (first_part_local, second_part_local) = uploaded_bytes.split_at(3);
|
||||
|
||||
let first_part_download = storage
|
||||
.download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
|
||||
.download_byte_range(
|
||||
&upload_target,
|
||||
0,
|
||||
Some(first_part_local.len() as u64),
|
||||
&cancel,
|
||||
)
|
||||
.await?;
|
||||
assert!(
|
||||
first_part_download.metadata.is_none(),
|
||||
@@ -655,6 +765,7 @@ mod fs_tests {
|
||||
&upload_target,
|
||||
first_part_local.len() as u64,
|
||||
Some((first_part_local.len() + second_part_local.len()) as u64),
|
||||
&cancel,
|
||||
)
|
||||
.await?;
|
||||
assert!(
|
||||
@@ -669,7 +780,7 @@ mod fs_tests {
|
||||
);
|
||||
|
||||
let suffix_bytes = storage
|
||||
.download_byte_range(&upload_target, 13, None)
|
||||
.download_byte_range(&upload_target, 13, None, &cancel)
|
||||
.await?
|
||||
.download_stream;
|
||||
let suffix_bytes = aggregate(suffix_bytes).await?;
|
||||
@@ -677,7 +788,7 @@ mod fs_tests {
|
||||
assert_eq!(upload_name, suffix);
|
||||
|
||||
let all_bytes = storage
|
||||
.download_byte_range(&upload_target, 0, None)
|
||||
.download_byte_range(&upload_target, 0, None, &cancel)
|
||||
.await?
|
||||
.download_stream;
|
||||
let all_bytes = aggregate(all_bytes).await?;
|
||||
@@ -689,9 +800,9 @@ mod fs_tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_file_range_negative() -> anyhow::Result<()> {
|
||||
let storage = create_storage()?;
|
||||
let (storage, cancel) = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
|
||||
let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?;
|
||||
|
||||
let start = 1_000_000_000;
|
||||
let end = start + 1;
|
||||
@@ -700,6 +811,7 @@ mod fs_tests {
|
||||
&upload_target,
|
||||
start,
|
||||
Some(end), // exclusive end
|
||||
&cancel,
|
||||
)
|
||||
.await
|
||||
{
|
||||
@@ -716,7 +828,7 @@ mod fs_tests {
|
||||
let end = 234;
|
||||
assert!(start > end, "Should test an incorrect range");
|
||||
match storage
|
||||
.download_byte_range(&upload_target, start, Some(end))
|
||||
.download_byte_range(&upload_target, start, Some(end), &cancel)
|
||||
.await
|
||||
{
|
||||
Ok(_) => panic!("Should not allow downloading wrong ranges"),
|
||||
@@ -733,15 +845,15 @@ mod fs_tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn delete_file() -> anyhow::Result<()> {
|
||||
let storage = create_storage()?;
|
||||
let (storage, cancel) = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
|
||||
let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?;
|
||||
|
||||
storage.delete(&upload_target).await?;
|
||||
storage.delete(&upload_target, &cancel).await?;
|
||||
assert!(storage.list_all().await?.is_empty());
|
||||
|
||||
storage
|
||||
.delete(&upload_target)
|
||||
.delete(&upload_target, &cancel)
|
||||
.await
|
||||
.expect("Should allow deleting non-existing storage files");
|
||||
|
||||
@@ -750,14 +862,14 @@ mod fs_tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn file_with_metadata() -> anyhow::Result<()> {
|
||||
let storage = create_storage()?;
|
||||
let (storage, cancel) = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let metadata = StorageMetadata(HashMap::from([
|
||||
("one".to_string(), "1".to_string()),
|
||||
("two".to_string(), "2".to_string()),
|
||||
]));
|
||||
let upload_target =
|
||||
upload_dummy_file(&storage, upload_name, Some(metadata.clone())).await?;
|
||||
upload_dummy_file(&storage, upload_name, Some(metadata.clone()), &cancel).await?;
|
||||
|
||||
let full_range_download_contents =
|
||||
read_and_check_metadata(&storage, &upload_target, Some(&metadata)).await?;
|
||||
@@ -771,7 +883,12 @@ mod fs_tests {
|
||||
let (first_part_local, _) = uploaded_bytes.split_at(3);
|
||||
|
||||
let partial_download_with_metadata = storage
|
||||
.download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
|
||||
.download_byte_range(
|
||||
&upload_target,
|
||||
0,
|
||||
Some(first_part_local.len() as u64),
|
||||
&cancel,
|
||||
)
|
||||
.await?;
|
||||
let first_part_remote = aggregate(partial_download_with_metadata.download_stream).await?;
|
||||
assert_eq!(
|
||||
@@ -792,16 +909,20 @@ mod fs_tests {
|
||||
#[tokio::test]
|
||||
async fn list() -> anyhow::Result<()> {
|
||||
// No delimiter: should recursively list everything
|
||||
let storage = create_storage()?;
|
||||
let child = upload_dummy_file(&storage, "grandparent/parent/child", None).await?;
|
||||
let uncle = upload_dummy_file(&storage, "grandparent/uncle", None).await?;
|
||||
let (storage, cancel) = create_storage()?;
|
||||
let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?;
|
||||
let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?;
|
||||
|
||||
let listing = storage.list(None, ListingMode::NoDelimiter, None).await?;
|
||||
let listing = storage
|
||||
.list(None, ListingMode::NoDelimiter, None, &cancel)
|
||||
.await?;
|
||||
assert!(listing.prefixes.is_empty());
|
||||
assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());
|
||||
|
||||
// Delimiter: should only go one deep
|
||||
let listing = storage.list(None, ListingMode::WithDelimiter, None).await?;
|
||||
let listing = storage
|
||||
.list(None, ListingMode::WithDelimiter, None, &cancel)
|
||||
.await?;
|
||||
|
||||
assert_eq!(
|
||||
listing.prefixes,
|
||||
@@ -815,6 +936,7 @@ mod fs_tests {
|
||||
Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
|
||||
ListingMode::WithDelimiter,
|
||||
None,
|
||||
&cancel,
|
||||
)
|
||||
.await?;
|
||||
assert_eq!(
|
||||
@@ -827,10 +949,75 @@ mod fs_tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn overwrite_shorter_file() -> anyhow::Result<()> {
|
||||
let (storage, cancel) = create_storage()?;
|
||||
|
||||
let path = RemotePath::new("does/not/matter/file".into())?;
|
||||
|
||||
let body = Bytes::from_static(b"long file contents is long");
|
||||
{
|
||||
let len = body.len();
|
||||
let body =
|
||||
futures::stream::once(futures::future::ready(std::io::Result::Ok(body.clone())));
|
||||
storage.upload(body, len, &path, None, &cancel).await?;
|
||||
}
|
||||
|
||||
let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?;
|
||||
assert_eq!(body, read);
|
||||
|
||||
let shorter = Bytes::from_static(b"shorter body");
|
||||
{
|
||||
let len = shorter.len();
|
||||
let body =
|
||||
futures::stream::once(futures::future::ready(std::io::Result::Ok(shorter.clone())));
|
||||
storage.upload(body, len, &path, None, &cancel).await?;
|
||||
}
|
||||
|
||||
let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?;
|
||||
assert_eq!(shorter, read);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn cancelled_upload_can_later_be_retried() -> anyhow::Result<()> {
|
||||
let (storage, cancel) = create_storage()?;
|
||||
|
||||
let path = RemotePath::new("does/not/matter/file".into())?;
|
||||
|
||||
let body = Bytes::from_static(b"long file contents is long");
|
||||
{
|
||||
let len = body.len();
|
||||
let body =
|
||||
futures::stream::once(futures::future::ready(std::io::Result::Ok(body.clone())));
|
||||
let cancel = cancel.child_token();
|
||||
cancel.cancel();
|
||||
let e = storage
|
||||
.upload(body, len, &path, None, &cancel)
|
||||
.await
|
||||
.unwrap_err();
|
||||
|
||||
assert!(TimeoutOrCancel::caused_by_cancel(&e));
|
||||
}
|
||||
|
||||
{
|
||||
let len = body.len();
|
||||
let body =
|
||||
futures::stream::once(futures::future::ready(std::io::Result::Ok(body.clone())));
|
||||
storage.upload(body, len, &path, None, &cancel).await?;
|
||||
}
|
||||
|
||||
let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?;
|
||||
assert_eq!(body, read);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn upload_dummy_file(
|
||||
storage: &LocalFs,
|
||||
name: &str,
|
||||
metadata: Option<StorageMetadata>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<RemotePath> {
|
||||
let from_path = storage
|
||||
.storage_root
|
||||
@@ -852,7 +1039,9 @@ mod fs_tests {
|
||||
|
||||
let file = tokio_util::io::ReaderStream::new(file);
|
||||
|
||||
storage.upload(file, size, &relative_path, metadata).await?;
|
||||
storage
|
||||
.upload(file, size, &relative_path, metadata, cancel)
|
||||
.await?;
|
||||
Ok(relative_path)
|
||||
}
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ use std::{
|
||||
pin::Pin,
|
||||
sync::Arc,
|
||||
task::{Context, Poll},
|
||||
time::SystemTime,
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, Context as _};
|
||||
@@ -46,9 +46,9 @@ use utils::backoff;
|
||||
|
||||
use super::StorageMetadata;
|
||||
use crate::{
|
||||
support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode,
|
||||
RemotePath, RemoteStorage, S3Config, TimeTravelError, MAX_KEYS_PER_DELETE,
|
||||
REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError,
|
||||
Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel,
|
||||
MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
};
|
||||
|
||||
pub(super) mod metrics;
|
||||
@@ -63,6 +63,8 @@ pub struct S3Bucket {
|
||||
prefix_in_bucket: Option<String>,
|
||||
max_keys_per_list_response: Option<i32>,
|
||||
concurrency_limiter: ConcurrencyLimiter,
|
||||
// Per-request timeout. Accessible for tests.
|
||||
pub timeout: Duration,
|
||||
}
|
||||
|
||||
struct GetObjectRequest {
|
||||
@@ -72,7 +74,7 @@ struct GetObjectRequest {
|
||||
}
|
||||
impl S3Bucket {
|
||||
/// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
|
||||
pub fn new(aws_config: &S3Config) -> anyhow::Result<Self> {
|
||||
pub fn new(aws_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
|
||||
tracing::debug!(
|
||||
"Creating s3 remote storage for S3 bucket {}",
|
||||
aws_config.bucket_name
|
||||
@@ -152,6 +154,7 @@ impl S3Bucket {
|
||||
max_keys_per_list_response: aws_config.max_keys_per_list_response,
|
||||
prefix_in_bucket,
|
||||
concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()),
|
||||
timeout,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -185,40 +188,55 @@ impl S3Bucket {
|
||||
}
|
||||
}
|
||||
|
||||
async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
|
||||
async fn permit(
|
||||
&self,
|
||||
kind: RequestKind,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<tokio::sync::SemaphorePermit<'_>, Cancelled> {
|
||||
let started_at = start_counting_cancelled_wait(kind);
|
||||
let permit = self
|
||||
.concurrency_limiter
|
||||
.acquire(kind)
|
||||
.await
|
||||
.expect("semaphore is never closed");
|
||||
let acquire = self.concurrency_limiter.acquire(kind);
|
||||
|
||||
let permit = tokio::select! {
|
||||
permit = acquire => permit.expect("semaphore is never closed"),
|
||||
_ = cancel.cancelled() => return Err(Cancelled),
|
||||
};
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.wait_seconds
|
||||
.observe_elapsed(kind, started_at);
|
||||
|
||||
permit
|
||||
Ok(permit)
|
||||
}
|
||||
|
||||
async fn owned_permit(&self, kind: RequestKind) -> tokio::sync::OwnedSemaphorePermit {
|
||||
async fn owned_permit(
|
||||
&self,
|
||||
kind: RequestKind,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<tokio::sync::OwnedSemaphorePermit, Cancelled> {
|
||||
let started_at = start_counting_cancelled_wait(kind);
|
||||
let permit = self
|
||||
.concurrency_limiter
|
||||
.acquire_owned(kind)
|
||||
.await
|
||||
.expect("semaphore is never closed");
|
||||
let acquire = self.concurrency_limiter.acquire_owned(kind);
|
||||
|
||||
let permit = tokio::select! {
|
||||
permit = acquire => permit.expect("semaphore is never closed"),
|
||||
_ = cancel.cancelled() => return Err(Cancelled),
|
||||
};
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.wait_seconds
|
||||
.observe_elapsed(kind, started_at);
|
||||
permit
|
||||
Ok(permit)
|
||||
}
|
||||
|
||||
async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
|
||||
async fn download_object(
|
||||
&self,
|
||||
request: GetObjectRequest,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError> {
|
||||
let kind = RequestKind::Get;
|
||||
let permit = self.owned_permit(kind).await;
|
||||
|
||||
let permit = self.owned_permit(kind, cancel).await?;
|
||||
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
@@ -228,8 +246,13 @@ impl S3Bucket {
|
||||
.bucket(request.bucket)
|
||||
.key(request.key)
|
||||
.set_range(request.range)
|
||||
.send()
|
||||
.await;
|
||||
.send();
|
||||
|
||||
let get_object = tokio::select! {
|
||||
res = get_object => res,
|
||||
_ = tokio::time::sleep(self.timeout) => return Err(DownloadError::Timeout),
|
||||
_ = cancel.cancelled() => return Err(DownloadError::Cancelled),
|
||||
};
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
|
||||
@@ -259,6 +282,10 @@ impl S3Bucket {
|
||||
}
|
||||
};
|
||||
|
||||
// even if we would have no timeout left, continue anyways. the caller can decide to ignore
|
||||
// the errors considering timeouts and cancellation.
|
||||
let remaining = self.timeout.saturating_sub(started_at.elapsed());
|
||||
|
||||
let metadata = object_output.metadata().cloned().map(StorageMetadata);
|
||||
let etag = object_output.e_tag;
|
||||
let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());
|
||||
@@ -268,6 +295,9 @@ impl S3Bucket {
|
||||
let body = PermitCarrying::new(permit, body);
|
||||
let body = TimedDownload::new(started_at, body);
|
||||
|
||||
let cancel_or_timeout = crate::support::cancel_or_timeout(remaining, cancel.clone());
|
||||
let body = crate::support::DownloadStream::new(cancel_or_timeout, body);
|
||||
|
||||
Ok(Download {
|
||||
metadata,
|
||||
etag,
|
||||
@@ -278,33 +308,44 @@ impl S3Bucket {
|
||||
|
||||
async fn delete_oids(
|
||||
&self,
|
||||
kind: RequestKind,
|
||||
_permit: &tokio::sync::SemaphorePermit<'_>,
|
||||
delete_objects: &[ObjectIdentifier],
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let kind = RequestKind::Delete;
|
||||
let mut cancel = std::pin::pin!(cancel.cancelled());
|
||||
|
||||
for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let resp = self
|
||||
let req = self
|
||||
.client
|
||||
.delete_objects()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.delete(
|
||||
Delete::builder()
|
||||
.set_objects(Some(chunk.to_vec()))
|
||||
.build()?,
|
||||
.build()
|
||||
.context("build request")?,
|
||||
)
|
||||
.send()
|
||||
.await;
|
||||
.send();
|
||||
|
||||
let resp = tokio::select! {
|
||||
resp = req => resp,
|
||||
_ = tokio::time::sleep(self.timeout) => return Err(TimeoutOrCancel::Timeout.into()),
|
||||
_ = &mut cancel => return Err(TimeoutOrCancel::Cancel.into()),
|
||||
};
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &resp, started_at);
|
||||
|
||||
let resp = resp?;
|
||||
let resp = resp.context("request deletion")?;
|
||||
metrics::BUCKET_METRICS
|
||||
.deleted_objects_total
|
||||
.inc_by(chunk.len() as u64);
|
||||
|
||||
if let Some(errors) = resp.errors {
|
||||
// Log a bounded number of the errors within the response:
|
||||
// these requests can carry 1000 keys so logging each one
|
||||
@@ -320,9 +361,10 @@ impl S3Bucket {
|
||||
);
|
||||
}
|
||||
|
||||
return Err(anyhow::format_err!(
|
||||
"Failed to delete {} objects",
|
||||
errors.len()
|
||||
return Err(anyhow::anyhow!(
|
||||
"Failed to delete {}/{} objects",
|
||||
errors.len(),
|
||||
chunk.len(),
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -410,6 +452,7 @@ impl RemoteStorage for S3Bucket {
|
||||
prefix: Option<&RemotePath>,
|
||||
mode: ListingMode,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Listing, DownloadError> {
|
||||
let kind = RequestKind::List;
|
||||
// s3 sdk wants i32
|
||||
@@ -431,10 +474,11 @@ impl RemoteStorage for S3Bucket {
|
||||
p
|
||||
});
|
||||
|
||||
let _permit = self.permit(kind, cancel).await?;
|
||||
|
||||
let mut continuation_token = None;
|
||||
|
||||
loop {
|
||||
let _guard = self.permit(kind).await;
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
// min of two Options, returning Some if one is value and another is
|
||||
@@ -456,9 +500,15 @@ impl RemoteStorage for S3Bucket {
|
||||
request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
|
||||
}
|
||||
|
||||
let response = request
|
||||
.send()
|
||||
.await
|
||||
let request = request.send();
|
||||
|
||||
let response = tokio::select! {
|
||||
res = request => res,
|
||||
_ = tokio::time::sleep(self.timeout) => return Err(DownloadError::Timeout),
|
||||
_ = cancel.cancelled() => return Err(DownloadError::Cancelled),
|
||||
};
|
||||
|
||||
let response = response
|
||||
.context("Failed to list S3 prefixes")
|
||||
.map_err(DownloadError::Other);
|
||||
|
||||
@@ -511,16 +561,17 @@ impl RemoteStorage for S3Bucket {
|
||||
from_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let kind = RequestKind::Put;
|
||||
let _guard = self.permit(kind).await;
|
||||
let _permit = self.permit(kind, cancel).await?;
|
||||
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let body = Body::wrap_stream(from);
|
||||
let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body));
|
||||
|
||||
let res = self
|
||||
let upload = self
|
||||
.client
|
||||
.put_object()
|
||||
.bucket(self.bucket_name.clone())
|
||||
@@ -528,22 +579,40 @@ impl RemoteStorage for S3Bucket {
|
||||
.set_metadata(metadata.map(|m| m.0))
|
||||
.content_length(from_size_bytes.try_into()?)
|
||||
.body(bytes_stream)
|
||||
.send()
|
||||
.await;
|
||||
.send();
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &res, started_at);
|
||||
let upload = tokio::time::timeout(self.timeout, upload);
|
||||
|
||||
res?;
|
||||
let res = tokio::select! {
|
||||
res = upload => res,
|
||||
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
|
||||
};
|
||||
|
||||
Ok(())
|
||||
if let Ok(inner) = &res {
|
||||
// do not incl. timeouts as errors in metrics but cancellations
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, inner, started_at);
|
||||
}
|
||||
|
||||
match res {
|
||||
Ok(Ok(_put)) => Ok(()),
|
||||
Ok(Err(sdk)) => Err(sdk.into()),
|
||||
Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()),
|
||||
}
|
||||
}
|
||||
|
||||
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
|
||||
async fn copy(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
to: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let kind = RequestKind::Copy;
|
||||
let _guard = self.permit(kind).await;
|
||||
let _permit = self.permit(kind, cancel).await?;
|
||||
|
||||
let timeout = tokio::time::sleep(self.timeout);
|
||||
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
@@ -554,14 +623,19 @@ impl RemoteStorage for S3Bucket {
|
||||
self.relative_path_to_s3_object(from)
|
||||
);
|
||||
|
||||
let res = self
|
||||
let op = self
|
||||
.client
|
||||
.copy_object()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.key(self.relative_path_to_s3_object(to))
|
||||
.copy_source(copy_source)
|
||||
.send()
|
||||
.await;
|
||||
.send();
|
||||
|
||||
let res = tokio::select! {
|
||||
res = op => res,
|
||||
_ = timeout => return Err(TimeoutOrCancel::Timeout.into()),
|
||||
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
|
||||
};
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
@@ -573,14 +647,21 @@ impl RemoteStorage for S3Bucket {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
|
||||
async fn download(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError> {
|
||||
// if prefix is not none then download file `prefix/from`
|
||||
// if prefix is none then download file `from`
|
||||
self.download_object(GetObjectRequest {
|
||||
bucket: self.bucket_name.clone(),
|
||||
key: self.relative_path_to_s3_object(from),
|
||||
range: None,
|
||||
})
|
||||
self.download_object(
|
||||
GetObjectRequest {
|
||||
bucket: self.bucket_name.clone(),
|
||||
key: self.relative_path_to_s3_object(from),
|
||||
range: None,
|
||||
},
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -589,6 +670,7 @@ impl RemoteStorage for S3Bucket {
|
||||
from: &RemotePath,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError> {
|
||||
// S3 accepts ranges as https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35
|
||||
// and needs both ends to be exclusive
|
||||
@@ -598,31 +680,39 @@ impl RemoteStorage for S3Bucket {
|
||||
None => format!("bytes={start_inclusive}-"),
|
||||
});
|
||||
|
||||
self.download_object(GetObjectRequest {
|
||||
bucket: self.bucket_name.clone(),
|
||||
key: self.relative_path_to_s3_object(from),
|
||||
range,
|
||||
})
|
||||
self.download_object(
|
||||
GetObjectRequest {
|
||||
bucket: self.bucket_name.clone(),
|
||||
key: self.relative_path_to_s3_object(from),
|
||||
range,
|
||||
},
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
}
|
||||
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
|
||||
let kind = RequestKind::Delete;
|
||||
let _guard = self.permit(kind).await;
|
||||
|
||||
async fn delete_objects<'a>(
|
||||
&self,
|
||||
paths: &'a [RemotePath],
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let kind = RequestKind::Delete;
|
||||
let permit = self.permit(kind, cancel).await?;
|
||||
let mut delete_objects = Vec::with_capacity(paths.len());
|
||||
for path in paths {
|
||||
let obj_id = ObjectIdentifier::builder()
|
||||
.set_key(Some(self.relative_path_to_s3_object(path)))
|
||||
.build()?;
|
||||
.build()
|
||||
.context("convert path to oid")?;
|
||||
delete_objects.push(obj_id);
|
||||
}
|
||||
|
||||
self.delete_oids(kind, &delete_objects).await
|
||||
self.delete_oids(&permit, &delete_objects, cancel).await
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
||||
async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> {
|
||||
let paths = std::array::from_ref(path);
|
||||
self.delete_objects(paths).await
|
||||
self.delete_objects(paths, cancel).await
|
||||
}
|
||||
|
||||
async fn time_travel_recover(
|
||||
@@ -633,7 +723,7 @@ impl RemoteStorage for S3Bucket {
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), TimeTravelError> {
|
||||
let kind = RequestKind::TimeTravel;
|
||||
let _guard = self.permit(kind).await;
|
||||
let permit = self.permit(kind, cancel).await?;
|
||||
|
||||
let timestamp = DateTime::from(timestamp);
|
||||
let done_if_after = DateTime::from(done_if_after);
|
||||
@@ -647,7 +737,7 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
let warn_threshold = 3;
|
||||
let max_retries = 10;
|
||||
let is_permanent = |_e: &_| false;
|
||||
let is_permanent = |e: &_| matches!(e, TimeTravelError::Cancelled);
|
||||
|
||||
let mut key_marker = None;
|
||||
let mut version_id_marker = None;
|
||||
@@ -656,15 +746,19 @@ impl RemoteStorage for S3Bucket {
|
||||
loop {
|
||||
let response = backoff::retry(
|
||||
|| async {
|
||||
self.client
|
||||
let op = self
|
||||
.client
|
||||
.list_object_versions()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.set_prefix(prefix.clone())
|
||||
.set_key_marker(key_marker.clone())
|
||||
.set_version_id_marker(version_id_marker.clone())
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| TimeTravelError::Other(e.into()))
|
||||
.send();
|
||||
|
||||
tokio::select! {
|
||||
res = op => res.map_err(|e| TimeTravelError::Other(e.into())),
|
||||
_ = cancel.cancelled() => Err(TimeTravelError::Cancelled),
|
||||
}
|
||||
},
|
||||
is_permanent,
|
||||
warn_threshold,
|
||||
@@ -786,14 +880,18 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
backoff::retry(
|
||||
|| async {
|
||||
self.client
|
||||
let op = self
|
||||
.client
|
||||
.copy_object()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.key(key)
|
||||
.copy_source(&source_id)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| TimeTravelError::Other(e.into()))
|
||||
.send();
|
||||
|
||||
tokio::select! {
|
||||
res = op => res.map_err(|e| TimeTravelError::Other(e.into())),
|
||||
_ = cancel.cancelled() => Err(TimeTravelError::Cancelled),
|
||||
}
|
||||
},
|
||||
is_permanent,
|
||||
warn_threshold,
|
||||
@@ -824,10 +922,18 @@ impl RemoteStorage for S3Bucket {
|
||||
let oid = ObjectIdentifier::builder()
|
||||
.key(key.to_owned())
|
||||
.build()
|
||||
.map_err(|e| TimeTravelError::Other(anyhow::Error::new(e)))?;
|
||||
self.delete_oids(kind, &[oid])
|
||||
.map_err(|e| TimeTravelError::Other(e.into()))?;
|
||||
|
||||
self.delete_oids(&permit, &[oid], cancel)
|
||||
.await
|
||||
.map_err(TimeTravelError::Other)?;
|
||||
.map_err(|e| {
|
||||
// delete_oid0 will use TimeoutOrCancel
|
||||
if TimeoutOrCancel::caused_by_cancel(&e) {
|
||||
TimeTravelError::Cancelled
|
||||
} else {
|
||||
TimeTravelError::Other(e)
|
||||
}
|
||||
})?;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -963,7 +1069,8 @@ mod tests {
|
||||
concurrency_limit: NonZeroUsize::new(100).unwrap(),
|
||||
max_keys_per_list_response: Some(5),
|
||||
};
|
||||
let storage = S3Bucket::new(&config).expect("remote storage init");
|
||||
let storage =
|
||||
S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init");
|
||||
for (test_path_idx, test_path) in all_paths.iter().enumerate() {
|
||||
let result = storage.relative_path_to_s3_object(test_path);
|
||||
let expected = expected_outputs[prefix_idx][test_path_idx];
|
||||
|
||||
@@ -90,11 +90,16 @@ impl UnreliableWrapper {
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete_inner(&self, path: &RemotePath, attempt: bool) -> anyhow::Result<()> {
|
||||
async fn delete_inner(
|
||||
&self,
|
||||
path: &RemotePath,
|
||||
attempt: bool,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
if attempt {
|
||||
self.attempt(RemoteOp::Delete(path.clone()))?;
|
||||
}
|
||||
self.inner.delete(path).await
|
||||
self.inner.delete(path, cancel).await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -105,20 +110,22 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
|
||||
.map_err(DownloadError::Other)?;
|
||||
self.inner.list_prefixes(prefix).await
|
||||
self.inner.list_prefixes(prefix, cancel).await
|
||||
}
|
||||
|
||||
async fn list_files(
|
||||
&self,
|
||||
folder: Option<&RemotePath>,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
self.attempt(RemoteOp::ListPrefixes(folder.cloned()))
|
||||
.map_err(DownloadError::Other)?;
|
||||
self.inner.list_files(folder, max_keys).await
|
||||
self.inner.list_files(folder, max_keys, cancel).await
|
||||
}
|
||||
|
||||
async fn list(
|
||||
@@ -126,10 +133,11 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
prefix: Option<&RemotePath>,
|
||||
mode: ListingMode,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Listing, DownloadError> {
|
||||
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
|
||||
.map_err(DownloadError::Other)?;
|
||||
self.inner.list(prefix, mode, max_keys).await
|
||||
self.inner.list(prefix, mode, max_keys, cancel).await
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
@@ -140,15 +148,22 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
data_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
self.attempt(RemoteOp::Upload(to.clone()))?;
|
||||
self.inner.upload(data, data_size_bytes, to, metadata).await
|
||||
self.inner
|
||||
.upload(data, data_size_bytes, to, metadata, cancel)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
|
||||
async fn download(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError> {
|
||||
self.attempt(RemoteOp::Download(from.clone()))
|
||||
.map_err(DownloadError::Other)?;
|
||||
self.inner.download(from).await
|
||||
self.inner.download(from, cancel).await
|
||||
}
|
||||
|
||||
async fn download_byte_range(
|
||||
@@ -156,6 +171,7 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
from: &RemotePath,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError> {
|
||||
// Note: We treat any download_byte_range as an "attempt" of the same
|
||||
// operation. We don't pay attention to the ranges. That's good enough
|
||||
@@ -163,20 +179,24 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
self.attempt(RemoteOp::Download(from.clone()))
|
||||
.map_err(DownloadError::Other)?;
|
||||
self.inner
|
||||
.download_byte_range(from, start_inclusive, end_exclusive)
|
||||
.download_byte_range(from, start_inclusive, end_exclusive, cancel)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
||||
self.delete_inner(path, true).await
|
||||
async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> {
|
||||
self.delete_inner(path, true, cancel).await
|
||||
}
|
||||
|
||||
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
|
||||
async fn delete_objects<'a>(
|
||||
&self,
|
||||
paths: &'a [RemotePath],
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
self.attempt(RemoteOp::DeleteObjects(paths.to_vec()))?;
|
||||
let mut error_counter = 0;
|
||||
for path in paths {
|
||||
// Dont record attempt because it was already recorded above
|
||||
if (self.delete_inner(path, false).await).is_err() {
|
||||
if (self.delete_inner(path, false, cancel).await).is_err() {
|
||||
error_counter += 1;
|
||||
}
|
||||
}
|
||||
@@ -189,11 +209,16 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
|
||||
async fn copy(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
to: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
// copy is equivalent to download + upload
|
||||
self.attempt(RemoteOp::Download(from.clone()))?;
|
||||
self.attempt(RemoteOp::Upload(to.clone()))?;
|
||||
self.inner.copy_object(from, to).await
|
||||
self.inner.copy_object(from, to, cancel).await
|
||||
}
|
||||
|
||||
async fn time_travel_recover(
|
||||
|
||||
@@ -1,9 +1,15 @@
|
||||
use std::{
|
||||
future::Future,
|
||||
pin::Pin,
|
||||
task::{Context, Poll},
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use bytes::Bytes;
|
||||
use futures_util::Stream;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
use crate::TimeoutOrCancel;
|
||||
|
||||
pin_project_lite::pin_project! {
|
||||
/// An `AsyncRead` adapter which carries a permit for the lifetime of the value.
|
||||
@@ -31,3 +37,133 @@ impl<S: Stream> Stream for PermitCarrying<S> {
|
||||
self.inner.size_hint()
|
||||
}
|
||||
}
|
||||
|
||||
pin_project_lite::pin_project! {
|
||||
pub(crate) struct DownloadStream<F, S> {
|
||||
hit: bool,
|
||||
#[pin]
|
||||
cancellation: F,
|
||||
#[pin]
|
||||
inner: S,
|
||||
}
|
||||
}
|
||||
|
||||
impl<F, S> DownloadStream<F, S> {
|
||||
pub(crate) fn new(cancellation: F, inner: S) -> Self {
|
||||
Self {
|
||||
cancellation,
|
||||
hit: false,
|
||||
inner,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// See documentation on [`crate::DownloadStream`] on rationale why `std::io::Error` is used.
|
||||
impl<E, F, S> Stream for DownloadStream<F, S>
|
||||
where
|
||||
std::io::Error: From<E>,
|
||||
F: Future<Output = E>,
|
||||
S: Stream<Item = std::io::Result<Bytes>>,
|
||||
{
|
||||
type Item = <S as Stream>::Item;
|
||||
|
||||
fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
|
||||
let this = self.project();
|
||||
|
||||
if !*this.hit {
|
||||
if let Poll::Ready(e) = this.cancellation.poll(cx) {
|
||||
*this.hit = true;
|
||||
let e = Err(std::io::Error::from(e));
|
||||
return Poll::Ready(Some(e));
|
||||
}
|
||||
}
|
||||
|
||||
this.inner.poll_next(cx)
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
self.inner.size_hint()
|
||||
}
|
||||
}
|
||||
|
||||
/// Fires only on the first cancel or timeout, not on both.
|
||||
pub(crate) async fn cancel_or_timeout(
|
||||
timeout: Duration,
|
||||
cancel: CancellationToken,
|
||||
) -> TimeoutOrCancel {
|
||||
tokio::select! {
|
||||
_ = tokio::time::sleep(timeout) => TimeoutOrCancel::Timeout,
|
||||
_ = cancel.cancelled() => TimeoutOrCancel::Cancel,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::DownloadError;
|
||||
use futures::stream::StreamExt;
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn cancelled_download_stream() {
|
||||
let inner = futures::stream::pending();
|
||||
let timeout = Duration::from_secs(120);
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner);
|
||||
let mut stream = std::pin::pin!(stream);
|
||||
|
||||
let mut first = stream.next();
|
||||
|
||||
tokio::select! {
|
||||
_ = &mut first => unreachable!("we haven't yet cancelled nor is timeout passed"),
|
||||
_ = tokio::time::sleep(Duration::from_secs(1)) => {},
|
||||
}
|
||||
|
||||
cancel.cancel();
|
||||
|
||||
let e = first.await.expect("there must be some").unwrap_err();
|
||||
assert!(matches!(e.kind(), std::io::ErrorKind::Other), "{e:?}");
|
||||
let inner = e.get_ref().expect("inner should be set");
|
||||
assert!(
|
||||
inner
|
||||
.downcast_ref::<DownloadError>()
|
||||
.is_some_and(|e| matches!(e, DownloadError::Cancelled)),
|
||||
"{inner:?}"
|
||||
);
|
||||
|
||||
tokio::select! {
|
||||
_ = stream.next() => unreachable!("no timeout ever happens as we were already cancelled"),
|
||||
_ = tokio::time::sleep(Duration::from_secs(121)) => {},
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn timeouted_download_stream() {
|
||||
let inner = futures::stream::pending();
|
||||
let timeout = Duration::from_secs(120);
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner);
|
||||
let mut stream = std::pin::pin!(stream);
|
||||
|
||||
// because the stream uses 120s timeout we are paused, we advance to 120s right away.
|
||||
let first = stream.next();
|
||||
|
||||
let e = first.await.expect("there must be some").unwrap_err();
|
||||
assert!(matches!(e.kind(), std::io::ErrorKind::Other), "{e:?}");
|
||||
let inner = e.get_ref().expect("inner should be set");
|
||||
assert!(
|
||||
inner
|
||||
.downcast_ref::<DownloadError>()
|
||||
.is_some_and(|e| matches!(e, DownloadError::Timeout)),
|
||||
"{inner:?}"
|
||||
);
|
||||
|
||||
cancel.cancel();
|
||||
|
||||
tokio::select! {
|
||||
_ = stream.next() => unreachable!("no cancellation ever happens because we already timed out"),
|
||||
_ = tokio::time::sleep(Duration::from_secs(121)) => {},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,6 +10,7 @@ use futures::stream::Stream;
|
||||
use once_cell::sync::OnceCell;
|
||||
use remote_storage::{Download, GenericRemoteStorage, RemotePath};
|
||||
use tokio::task::JoinSet;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, error, info};
|
||||
|
||||
static LOGGING_DONE: OnceCell<()> = OnceCell::new();
|
||||
@@ -58,8 +59,12 @@ pub(crate) async fn upload_simple_remote_data(
|
||||
) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
|
||||
info!("Creating {upload_tasks_count} remote files");
|
||||
let mut upload_tasks = JoinSet::new();
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
for i in 1..upload_tasks_count + 1 {
|
||||
let task_client = Arc::clone(client);
|
||||
let cancel = cancel.clone();
|
||||
|
||||
upload_tasks.spawn(async move {
|
||||
let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
|
||||
let blob_path = RemotePath::new(
|
||||
@@ -69,7 +74,9 @@ pub(crate) async fn upload_simple_remote_data(
|
||||
debug!("Creating remote item {i} at path {blob_path:?}");
|
||||
|
||||
let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
|
||||
task_client.upload(data, len, &blob_path, None).await?;
|
||||
task_client
|
||||
.upload(data, len, &blob_path, None, &cancel)
|
||||
.await?;
|
||||
|
||||
Ok::<_, anyhow::Error>(blob_path)
|
||||
});
|
||||
@@ -107,13 +114,15 @@ pub(crate) async fn cleanup(
|
||||
"Removing {} objects from the remote storage during cleanup",
|
||||
objects_to_delete.len()
|
||||
);
|
||||
let cancel = CancellationToken::new();
|
||||
let mut delete_tasks = JoinSet::new();
|
||||
for object_to_delete in objects_to_delete {
|
||||
let task_client = Arc::clone(client);
|
||||
let cancel = cancel.clone();
|
||||
delete_tasks.spawn(async move {
|
||||
debug!("Deleting remote item at path {object_to_delete:?}");
|
||||
task_client
|
||||
.delete(&object_to_delete)
|
||||
.delete(&object_to_delete, &cancel)
|
||||
.await
|
||||
.with_context(|| format!("{object_to_delete:?} removal"))
|
||||
});
|
||||
@@ -141,8 +150,12 @@ pub(crate) async fn upload_remote_data(
|
||||
) -> ControlFlow<Uploads, Uploads> {
|
||||
info!("Creating {upload_tasks_count} remote files");
|
||||
let mut upload_tasks = JoinSet::new();
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
for i in 1..upload_tasks_count + 1 {
|
||||
let task_client = Arc::clone(client);
|
||||
let cancel = cancel.clone();
|
||||
|
||||
upload_tasks.spawn(async move {
|
||||
let prefix = format!("{base_prefix_str}/sub_prefix_{i}/");
|
||||
let blob_prefix = RemotePath::new(Utf8Path::new(&prefix))
|
||||
@@ -152,7 +165,9 @@ pub(crate) async fn upload_remote_data(
|
||||
|
||||
let (data, data_len) =
|
||||
upload_stream(format!("remote blob data {i}").into_bytes().into());
|
||||
task_client.upload(data, data_len, &blob_path, None).await?;
|
||||
task_client
|
||||
.upload(data, data_len, &blob_path, None, &cancel)
|
||||
.await?;
|
||||
|
||||
Ok::<_, anyhow::Error>((blob_prefix, blob_path))
|
||||
});
|
||||
|
||||
@@ -4,6 +4,7 @@ use remote_storage::RemotePath;
|
||||
use std::sync::Arc;
|
||||
use std::{collections::HashSet, num::NonZeroU32};
|
||||
use test_context::test_context;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::debug;
|
||||
|
||||
use crate::common::{download_to_vec, upload_stream, wrap_stream};
|
||||
@@ -45,13 +46,15 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
|
||||
}
|
||||
};
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let test_client = Arc::clone(&ctx.enabled.client);
|
||||
let expected_remote_prefixes = ctx.remote_prefixes.clone();
|
||||
|
||||
let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
|
||||
.context("common_prefix construction")?;
|
||||
let root_remote_prefixes = test_client
|
||||
.list_prefixes(None)
|
||||
.list_prefixes(None, &cancel)
|
||||
.await
|
||||
.context("client list root prefixes failure")?
|
||||
.into_iter()
|
||||
@@ -62,7 +65,7 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
|
||||
);
|
||||
|
||||
let nested_remote_prefixes = test_client
|
||||
.list_prefixes(Some(&base_prefix))
|
||||
.list_prefixes(Some(&base_prefix), &cancel)
|
||||
.await
|
||||
.context("client list nested prefixes failure")?
|
||||
.into_iter()
|
||||
@@ -99,11 +102,12 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a
|
||||
anyhow::bail!("S3 init failed: {e:?}")
|
||||
}
|
||||
};
|
||||
let cancel = CancellationToken::new();
|
||||
let test_client = Arc::clone(&ctx.enabled.client);
|
||||
let base_prefix =
|
||||
RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
|
||||
let root_files = test_client
|
||||
.list_files(None, None)
|
||||
.list_files(None, None, &cancel)
|
||||
.await
|
||||
.context("client list root files failure")?
|
||||
.into_iter()
|
||||
@@ -117,13 +121,13 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a
|
||||
// Test that max_keys limit works. In total there are about 21 files (see
|
||||
// upload_simple_remote_data call in test_real_s3.rs).
|
||||
let limited_root_files = test_client
|
||||
.list_files(None, Some(NonZeroU32::new(2).unwrap()))
|
||||
.list_files(None, Some(NonZeroU32::new(2).unwrap()), &cancel)
|
||||
.await
|
||||
.context("client list root files failure")?;
|
||||
assert_eq!(limited_root_files.len(), 2);
|
||||
|
||||
let nested_remote_files = test_client
|
||||
.list_files(Some(&base_prefix), None)
|
||||
.list_files(Some(&base_prefix), None, &cancel)
|
||||
.await
|
||||
.context("client list nested files failure")?
|
||||
.into_iter()
|
||||
@@ -150,12 +154,17 @@ async fn delete_non_exising_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Resu
|
||||
MaybeEnabledStorage::Disabled => return Ok(()),
|
||||
};
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let path = RemotePath::new(Utf8Path::new(
|
||||
format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
|
||||
))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
ctx.client.delete(&path).await.expect("should succeed");
|
||||
ctx.client
|
||||
.delete(&path, &cancel)
|
||||
.await
|
||||
.expect("should succeed");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -168,6 +177,8 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<(
|
||||
MaybeEnabledStorage::Disabled => return Ok(()),
|
||||
};
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
@@ -178,21 +189,21 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<(
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
let (data, len) = upload_stream("remote blob data1".as_bytes().into());
|
||||
ctx.client.upload(data, len, &path1, None).await?;
|
||||
ctx.client.upload(data, len, &path1, None, &cancel).await?;
|
||||
|
||||
let (data, len) = upload_stream("remote blob data2".as_bytes().into());
|
||||
ctx.client.upload(data, len, &path2, None).await?;
|
||||
ctx.client.upload(data, len, &path2, None, &cancel).await?;
|
||||
|
||||
let (data, len) = upload_stream("remote blob data3".as_bytes().into());
|
||||
ctx.client.upload(data, len, &path3, None).await?;
|
||||
ctx.client.upload(data, len, &path3, None, &cancel).await?;
|
||||
|
||||
ctx.client.delete_objects(&[path1, path2]).await?;
|
||||
ctx.client.delete_objects(&[path1, path2], &cancel).await?;
|
||||
|
||||
let prefixes = ctx.client.list_prefixes(None).await?;
|
||||
let prefixes = ctx.client.list_prefixes(None, &cancel).await?;
|
||||
|
||||
assert_eq!(prefixes.len(), 1);
|
||||
|
||||
ctx.client.delete_objects(&[path3]).await?;
|
||||
ctx.client.delete_objects(&[path3], &cancel).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -204,6 +215,8 @@ async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
@@ -211,47 +224,56 @@ async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<
|
||||
|
||||
let (data, len) = wrap_stream(orig.clone());
|
||||
|
||||
ctx.client.upload(data, len, &path, None).await?;
|
||||
ctx.client.upload(data, len, &path, None, &cancel).await?;
|
||||
|
||||
// Normal download request
|
||||
let dl = ctx.client.download(&path).await?;
|
||||
let dl = ctx.client.download(&path, &cancel).await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
assert_eq!(&buf, &orig);
|
||||
|
||||
// Full range (end specified)
|
||||
let dl = ctx
|
||||
.client
|
||||
.download_byte_range(&path, 0, Some(len as u64))
|
||||
.download_byte_range(&path, 0, Some(len as u64), &cancel)
|
||||
.await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
assert_eq!(&buf, &orig);
|
||||
|
||||
// partial range (end specified)
|
||||
let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
|
||||
let dl = ctx
|
||||
.client
|
||||
.download_byte_range(&path, 4, Some(10), &cancel)
|
||||
.await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
assert_eq!(&buf, &orig[4..10]);
|
||||
|
||||
// partial range (end beyond real end)
|
||||
let dl = ctx
|
||||
.client
|
||||
.download_byte_range(&path, 8, Some(len as u64 * 100))
|
||||
.download_byte_range(&path, 8, Some(len as u64 * 100), &cancel)
|
||||
.await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
assert_eq!(&buf, &orig[8..]);
|
||||
|
||||
// Partial range (end unspecified)
|
||||
let dl = ctx.client.download_byte_range(&path, 4, None).await?;
|
||||
let dl = ctx
|
||||
.client
|
||||
.download_byte_range(&path, 4, None, &cancel)
|
||||
.await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
assert_eq!(&buf, &orig[4..]);
|
||||
|
||||
// Full range (end unspecified)
|
||||
let dl = ctx.client.download_byte_range(&path, 0, None).await?;
|
||||
let dl = ctx
|
||||
.client
|
||||
.download_byte_range(&path, 0, None, &cancel)
|
||||
.await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
assert_eq!(&buf, &orig);
|
||||
|
||||
debug!("Cleanup: deleting file at path {path:?}");
|
||||
ctx.client
|
||||
.delete(&path)
|
||||
.delete(&path, &cancel)
|
||||
.await
|
||||
.with_context(|| format!("{path:?} removal"))?;
|
||||
|
||||
@@ -265,6 +287,8 @@ async fn copy_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let path = RemotePath::new(Utf8Path::new(
|
||||
format!("{}/file_to_copy", ctx.base_prefix).as_str(),
|
||||
))
|
||||
@@ -278,18 +302,18 @@ async fn copy_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
|
||||
|
||||
let (data, len) = wrap_stream(orig.clone());
|
||||
|
||||
ctx.client.upload(data, len, &path, None).await?;
|
||||
ctx.client.upload(data, len, &path, None, &cancel).await?;
|
||||
|
||||
// Normal download request
|
||||
ctx.client.copy_object(&path, &path_dest).await?;
|
||||
ctx.client.copy_object(&path, &path_dest, &cancel).await?;
|
||||
|
||||
let dl = ctx.client.download(&path_dest).await?;
|
||||
let dl = ctx.client.download(&path_dest, &cancel).await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
assert_eq!(&buf, &orig);
|
||||
|
||||
debug!("Cleanup: deleting file at path {path:?}");
|
||||
ctx.client
|
||||
.delete_objects(&[path.clone(), path_dest.clone()])
|
||||
.delete_objects(&[path.clone(), path_dest.clone()], &cancel)
|
||||
.await
|
||||
.with_context(|| format!("{path:?} removal"))?;
|
||||
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
use std::collections::HashSet;
|
||||
use std::env;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::ops::ControlFlow;
|
||||
use std::sync::Arc;
|
||||
use std::time::UNIX_EPOCH;
|
||||
use std::{collections::HashSet, time::Duration};
|
||||
|
||||
use anyhow::Context;
|
||||
use remote_storage::{
|
||||
@@ -39,6 +39,17 @@ impl EnabledAzure {
|
||||
base_prefix: BASE_PREFIX,
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(unused)] // this will be needed when moving the timeout integration tests back
|
||||
fn configure_request_timeout(&mut self, timeout: Duration) {
|
||||
match Arc::get_mut(&mut self.client).expect("outer Arc::get_mut") {
|
||||
GenericRemoteStorage::AzureBlob(azure) => {
|
||||
let azure = Arc::get_mut(azure).expect("inner Arc::get_mut");
|
||||
azure.timeout = timeout;
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
enum MaybeEnabledStorage {
|
||||
@@ -213,6 +224,7 @@ fn create_azure_client(
|
||||
concurrency_limit: NonZeroUsize::new(100).unwrap(),
|
||||
max_keys_per_list_response,
|
||||
}),
|
||||
timeout: Duration::from_secs(120),
|
||||
};
|
||||
Ok(Arc::new(
|
||||
GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use std::env;
|
||||
use std::fmt::{Debug, Display};
|
||||
use std::future::Future;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::ops::ControlFlow;
|
||||
use std::sync::Arc;
|
||||
@@ -9,9 +10,10 @@ use std::{collections::HashSet, time::SystemTime};
|
||||
use crate::common::{download_to_vec, upload_stream};
|
||||
use anyhow::Context;
|
||||
use camino::Utf8Path;
|
||||
use futures_util::Future;
|
||||
use futures_util::StreamExt;
|
||||
use remote_storage::{
|
||||
GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
|
||||
DownloadError, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
|
||||
S3Config,
|
||||
};
|
||||
use test_context::test_context;
|
||||
use test_context::AsyncTestContext;
|
||||
@@ -27,7 +29,6 @@ use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_re
|
||||
use utils::backoff;
|
||||
|
||||
const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
|
||||
|
||||
const BASE_PREFIX: &str = "test";
|
||||
|
||||
#[test_context(MaybeEnabledStorage)]
|
||||
@@ -69,8 +70,11 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
||||
ret
|
||||
}
|
||||
|
||||
async fn list_files(client: &Arc<GenericRemoteStorage>) -> anyhow::Result<HashSet<RemotePath>> {
|
||||
Ok(retry(|| client.list_files(None, None))
|
||||
async fn list_files(
|
||||
client: &Arc<GenericRemoteStorage>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<HashSet<RemotePath>> {
|
||||
Ok(retry(|| client.list_files(None, None, cancel))
|
||||
.await
|
||||
.context("list root files failure")?
|
||||
.into_iter()
|
||||
@@ -90,11 +94,11 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
||||
|
||||
retry(|| {
|
||||
let (data, len) = upload_stream("remote blob data1".as_bytes().into());
|
||||
ctx.client.upload(data, len, &path1, None)
|
||||
ctx.client.upload(data, len, &path1, None, &cancel)
|
||||
})
|
||||
.await?;
|
||||
|
||||
let t0_files = list_files(&ctx.client).await?;
|
||||
let t0_files = list_files(&ctx.client, &cancel).await?;
|
||||
let t0 = time_point().await;
|
||||
println!("at t0: {t0_files:?}");
|
||||
|
||||
@@ -102,17 +106,17 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
||||
|
||||
retry(|| {
|
||||
let (data, len) = upload_stream(old_data.as_bytes().into());
|
||||
ctx.client.upload(data, len, &path2, None)
|
||||
ctx.client.upload(data, len, &path2, None, &cancel)
|
||||
})
|
||||
.await?;
|
||||
|
||||
let t1_files = list_files(&ctx.client).await?;
|
||||
let t1_files = list_files(&ctx.client, &cancel).await?;
|
||||
let t1 = time_point().await;
|
||||
println!("at t1: {t1_files:?}");
|
||||
|
||||
// A little check to ensure that our clock is not too far off from the S3 clock
|
||||
{
|
||||
let dl = retry(|| ctx.client.download(&path2)).await?;
|
||||
let dl = retry(|| ctx.client.download(&path2, &cancel)).await?;
|
||||
let last_modified = dl.last_modified.unwrap();
|
||||
let half_wt = WAIT_TIME.mul_f32(0.5);
|
||||
let t0_hwt = t0 + half_wt;
|
||||
@@ -125,7 +129,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
||||
|
||||
retry(|| {
|
||||
let (data, len) = upload_stream("remote blob data3".as_bytes().into());
|
||||
ctx.client.upload(data, len, &path3, None)
|
||||
ctx.client.upload(data, len, &path3, None, &cancel)
|
||||
})
|
||||
.await?;
|
||||
|
||||
@@ -133,12 +137,12 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
||||
|
||||
retry(|| {
|
||||
let (data, len) = upload_stream(new_data.as_bytes().into());
|
||||
ctx.client.upload(data, len, &path2, None)
|
||||
ctx.client.upload(data, len, &path2, None, &cancel)
|
||||
})
|
||||
.await?;
|
||||
|
||||
retry(|| ctx.client.delete(&path1)).await?;
|
||||
let t2_files = list_files(&ctx.client).await?;
|
||||
retry(|| ctx.client.delete(&path1, &cancel)).await?;
|
||||
let t2_files = list_files(&ctx.client, &cancel).await?;
|
||||
let t2 = time_point().await;
|
||||
println!("at t2: {t2_files:?}");
|
||||
|
||||
@@ -147,10 +151,10 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
||||
ctx.client
|
||||
.time_travel_recover(None, t2, t_final, &cancel)
|
||||
.await?;
|
||||
let t2_files_recovered = list_files(&ctx.client).await?;
|
||||
let t2_files_recovered = list_files(&ctx.client, &cancel).await?;
|
||||
println!("after recovery to t2: {t2_files_recovered:?}");
|
||||
assert_eq!(t2_files, t2_files_recovered);
|
||||
let path2_recovered_t2 = download_to_vec(ctx.client.download(&path2).await?).await?;
|
||||
let path2_recovered_t2 = download_to_vec(ctx.client.download(&path2, &cancel).await?).await?;
|
||||
assert_eq!(path2_recovered_t2, new_data.as_bytes());
|
||||
|
||||
// after recovery to t1: path1 is back, path2 has the old content
|
||||
@@ -158,10 +162,10 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
||||
ctx.client
|
||||
.time_travel_recover(None, t1, t_final, &cancel)
|
||||
.await?;
|
||||
let t1_files_recovered = list_files(&ctx.client).await?;
|
||||
let t1_files_recovered = list_files(&ctx.client, &cancel).await?;
|
||||
println!("after recovery to t1: {t1_files_recovered:?}");
|
||||
assert_eq!(t1_files, t1_files_recovered);
|
||||
let path2_recovered_t1 = download_to_vec(ctx.client.download(&path2).await?).await?;
|
||||
let path2_recovered_t1 = download_to_vec(ctx.client.download(&path2, &cancel).await?).await?;
|
||||
assert_eq!(path2_recovered_t1, old_data.as_bytes());
|
||||
|
||||
// after recovery to t0: everything is gone except for path1
|
||||
@@ -169,14 +173,14 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
||||
ctx.client
|
||||
.time_travel_recover(None, t0, t_final, &cancel)
|
||||
.await?;
|
||||
let t0_files_recovered = list_files(&ctx.client).await?;
|
||||
let t0_files_recovered = list_files(&ctx.client, &cancel).await?;
|
||||
println!("after recovery to t0: {t0_files_recovered:?}");
|
||||
assert_eq!(t0_files, t0_files_recovered);
|
||||
|
||||
// cleanup
|
||||
|
||||
let paths = &[path1, path2, path3];
|
||||
retry(|| ctx.client.delete_objects(paths)).await?;
|
||||
retry(|| ctx.client.delete_objects(paths, &cancel)).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -197,6 +201,16 @@ impl EnabledS3 {
|
||||
base_prefix: BASE_PREFIX,
|
||||
}
|
||||
}
|
||||
|
||||
fn configure_request_timeout(&mut self, timeout: Duration) {
|
||||
match Arc::get_mut(&mut self.client).expect("outer Arc::get_mut") {
|
||||
GenericRemoteStorage::AwsS3(s3) => {
|
||||
let s3 = Arc::get_mut(s3).expect("inner Arc::get_mut");
|
||||
s3.timeout = timeout;
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
enum MaybeEnabledStorage {
|
||||
@@ -370,8 +384,169 @@ fn create_s3_client(
|
||||
concurrency_limit: NonZeroUsize::new(100).unwrap(),
|
||||
max_keys_per_list_response,
|
||||
}),
|
||||
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
|
||||
};
|
||||
Ok(Arc::new(
|
||||
GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
|
||||
))
|
||||
}
|
||||
|
||||
#[test_context(MaybeEnabledStorage)]
|
||||
#[tokio::test]
|
||||
async fn download_is_timeouted(ctx: &mut MaybeEnabledStorage) {
|
||||
let MaybeEnabledStorage::Enabled(ctx) = ctx else {
|
||||
return;
|
||||
};
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let path = RemotePath::new(Utf8Path::new(
|
||||
format!("{}/file_to_copy", ctx.base_prefix).as_str(),
|
||||
))
|
||||
.unwrap();
|
||||
|
||||
let len = upload_large_enough_file(&ctx.client, &path, &cancel).await;
|
||||
|
||||
let timeout = std::time::Duration::from_secs(5);
|
||||
|
||||
ctx.configure_request_timeout(timeout);
|
||||
|
||||
let started_at = std::time::Instant::now();
|
||||
let mut stream = ctx
|
||||
.client
|
||||
.download(&path, &cancel)
|
||||
.await
|
||||
.expect("download succeeds")
|
||||
.download_stream;
|
||||
|
||||
if started_at.elapsed().mul_f32(0.9) >= timeout {
|
||||
tracing::warn!(
|
||||
elapsed_ms = started_at.elapsed().as_millis(),
|
||||
"timeout might be too low, consumed most of it during headers"
|
||||
);
|
||||
}
|
||||
|
||||
let first = stream
|
||||
.next()
|
||||
.await
|
||||
.expect("should have the first blob")
|
||||
.expect("should have succeeded");
|
||||
|
||||
tracing::info!(len = first.len(), "downloaded first chunk");
|
||||
|
||||
assert!(
|
||||
first.len() < len,
|
||||
"uploaded file is too small, we downloaded all on first chunk"
|
||||
);
|
||||
|
||||
tokio::time::sleep(timeout).await;
|
||||
|
||||
{
|
||||
let started_at = std::time::Instant::now();
|
||||
let next = stream
|
||||
.next()
|
||||
.await
|
||||
.expect("stream should not have ended yet");
|
||||
|
||||
tracing::info!(
|
||||
next.is_err = next.is_err(),
|
||||
elapsed_ms = started_at.elapsed().as_millis(),
|
||||
"received item after timeout"
|
||||
);
|
||||
|
||||
let e = next.expect_err("expected an error, but got a chunk?");
|
||||
|
||||
let inner = e.get_ref().expect("std::io::Error::inner should be set");
|
||||
assert!(
|
||||
inner
|
||||
.downcast_ref::<DownloadError>()
|
||||
.is_some_and(|e| matches!(e, DownloadError::Timeout)),
|
||||
"{inner:?}"
|
||||
);
|
||||
}
|
||||
|
||||
ctx.configure_request_timeout(RemoteStorageConfig::DEFAULT_TIMEOUT);
|
||||
|
||||
ctx.client.delete_objects(&[path], &cancel).await.unwrap()
|
||||
}
|
||||
|
||||
#[test_context(MaybeEnabledStorage)]
|
||||
#[tokio::test]
|
||||
async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) {
|
||||
let MaybeEnabledStorage::Enabled(ctx) = ctx else {
|
||||
return;
|
||||
};
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let path = RemotePath::new(Utf8Path::new(
|
||||
format!("{}/file_to_copy", ctx.base_prefix).as_str(),
|
||||
))
|
||||
.unwrap();
|
||||
|
||||
let len = upload_large_enough_file(&ctx.client, &path, &cancel).await;
|
||||
|
||||
{
|
||||
let mut stream = ctx
|
||||
.client
|
||||
.download(&path, &cancel)
|
||||
.await
|
||||
.expect("download succeeds")
|
||||
.download_stream;
|
||||
|
||||
let first = stream
|
||||
.next()
|
||||
.await
|
||||
.expect("should have the first blob")
|
||||
.expect("should have succeeded");
|
||||
|
||||
tracing::info!(len = first.len(), "downloaded first chunk");
|
||||
|
||||
assert!(
|
||||
first.len() < len,
|
||||
"uploaded file is too small, we downloaded all on first chunk"
|
||||
);
|
||||
|
||||
cancel.cancel();
|
||||
|
||||
let next = stream.next().await.expect("stream should have more");
|
||||
|
||||
let e = next.expect_err("expected an error, but got a chunk?");
|
||||
|
||||
let inner = e.get_ref().expect("std::io::Error::inner should be set");
|
||||
assert!(
|
||||
inner
|
||||
.downcast_ref::<DownloadError>()
|
||||
.is_some_and(|e| matches!(e, DownloadError::Cancelled)),
|
||||
"{inner:?}"
|
||||
);
|
||||
}
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
ctx.client.delete_objects(&[path], &cancel).await.unwrap();
|
||||
}
|
||||
|
||||
/// Upload a long enough file so that we cannot download it in single chunk
|
||||
///
|
||||
/// For s3 the first chunk seems to be less than 10kB, so this has a bit of a safety margin
|
||||
async fn upload_large_enough_file(
|
||||
client: &GenericRemoteStorage,
|
||||
path: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> usize {
|
||||
let header = bytes::Bytes::from_static("remote blob data content".as_bytes());
|
||||
let body = bytes::Bytes::from(vec![0u8; 1024]);
|
||||
let contents = std::iter::once(header).chain(std::iter::repeat(body).take(128));
|
||||
|
||||
let len = contents.clone().fold(0, |acc, next| acc + next.len());
|
||||
|
||||
let contents = futures::stream::iter(contents.map(std::io::Result::Ok));
|
||||
|
||||
client
|
||||
.upload(contents, len, path, None, cancel)
|
||||
.await
|
||||
.expect("upload succeeds");
|
||||
|
||||
len
|
||||
}
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
#![allow(unused)]
|
||||
|
||||
use criterion::{criterion_group, criterion_main, Criterion};
|
||||
use utils::id;
|
||||
|
||||
|
||||
@@ -29,6 +29,9 @@ pub enum Scope {
|
||||
// Should only be used e.g. for status check.
|
||||
// Currently also used for connection from any pageserver to any safekeeper.
|
||||
SafekeeperData,
|
||||
// The scope used by pageservers in upcalls to storage controller and cloud control plane
|
||||
#[serde(rename = "generations_api")]
|
||||
GenerationsApi,
|
||||
}
|
||||
|
||||
/// JWT payload. See docs/authentication.md for the format
|
||||
|
||||
@@ -6,14 +6,28 @@
|
||||
//! There are two sets of inputs; `short` and `medium`. They were collected on postgres v14 by
|
||||
//! logging what happens when a sequential scan is requested on a small table, then picking out two
|
||||
//! suitable from logs.
|
||||
//!
|
||||
//!
|
||||
//! Reference data (git blame to see commit) on an i3en.3xlarge
|
||||
// ```text
|
||||
//! short/short/1 time: [39.175 µs 39.348 µs 39.536 µs]
|
||||
//! short/short/2 time: [51.227 µs 51.487 µs 51.755 µs]
|
||||
//! short/short/4 time: [76.048 µs 76.362 µs 76.674 µs]
|
||||
//! short/short/8 time: [128.94 µs 129.82 µs 130.74 µs]
|
||||
//! short/short/16 time: [227.84 µs 229.00 µs 230.28 µs]
|
||||
//! short/short/32 time: [455.97 µs 457.81 µs 459.90 µs]
|
||||
//! short/short/64 time: [902.46 µs 904.84 µs 907.32 µs]
|
||||
//! short/short/128 time: [1.7416 ms 1.7487 ms 1.7561 ms]
|
||||
//! ``
|
||||
|
||||
use std::sync::{Arc, Barrier};
|
||||
use std::sync::Arc;
|
||||
|
||||
use bytes::{Buf, Bytes};
|
||||
use pageserver::{
|
||||
config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use tokio::task::JoinSet;
|
||||
use utils::{id::TenantId, lsn::Lsn};
|
||||
|
||||
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
|
||||
@@ -39,11 +53,11 @@ fn redo_scenarios(c: &mut Criterion) {
|
||||
.build()
|
||||
.unwrap();
|
||||
tracing::info!("executing first");
|
||||
short().execute(rt.handle(), &manager).unwrap();
|
||||
rt.block_on(short().execute(&manager)).unwrap();
|
||||
tracing::info!("first executed");
|
||||
}
|
||||
|
||||
let thread_counts = [1, 2, 4, 8, 16];
|
||||
let thread_counts = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
|
||||
let mut group = c.benchmark_group("short");
|
||||
group.sampling_mode(criterion::SamplingMode::Flat);
|
||||
@@ -74,114 +88,69 @@ fn redo_scenarios(c: &mut Criterion) {
|
||||
drop(group);
|
||||
}
|
||||
|
||||
/// Sets up `threads` number of requesters to `request_redo`, with the given input.
|
||||
/// Sets up a multi-threaded tokio runtime with default worker thread count,
|
||||
/// then, spawn `requesters` tasks that repeatedly:
|
||||
/// - get input from `input_factor()`
|
||||
/// - call `manager.request_redo()` with their input
|
||||
///
|
||||
/// This stress-tests the scalability of a single walredo manager at high tokio-level concurrency.
|
||||
///
|
||||
/// Using tokio's default worker thread count means the results will differ on machines
|
||||
/// with different core countrs. We don't care about that, the performance will always
|
||||
/// be different on different hardware. To compare performance of different software versions,
|
||||
/// use the same hardware.
|
||||
fn add_multithreaded_walredo_requesters(
|
||||
b: &mut criterion::Bencher,
|
||||
threads: u32,
|
||||
nrequesters: usize,
|
||||
manager: &Arc<PostgresRedoManager>,
|
||||
input_factory: fn() -> Request,
|
||||
) {
|
||||
assert_ne!(threads, 0);
|
||||
assert_ne!(nrequesters, 0);
|
||||
|
||||
if threads == 1 {
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
let handle = rt.handle();
|
||||
b.iter_batched_ref(
|
||||
|| Some(input_factory()),
|
||||
|input| execute_all(input.take(), handle, manager),
|
||||
criterion::BatchSize::PerIteration,
|
||||
);
|
||||
} else {
|
||||
let (work_tx, work_rx) = std::sync::mpsc::sync_channel(threads as usize);
|
||||
let rt = tokio::runtime::Builder::new_multi_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let work_rx = std::sync::Arc::new(std::sync::Mutex::new(work_rx));
|
||||
let barrier = Arc::new(tokio::sync::Barrier::new(nrequesters + 1));
|
||||
|
||||
let barrier = Arc::new(Barrier::new(threads as usize + 1));
|
||||
|
||||
let jhs = (0..threads)
|
||||
.map(|_| {
|
||||
std::thread::spawn({
|
||||
let manager = manager.clone();
|
||||
let barrier = barrier.clone();
|
||||
let work_rx = work_rx.clone();
|
||||
move || {
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
let handle = rt.handle();
|
||||
loop {
|
||||
// queue up and wait if we want to go another round
|
||||
if work_rx.lock().unwrap().recv().is_err() {
|
||||
break;
|
||||
}
|
||||
|
||||
let input = Some(input_factory());
|
||||
|
||||
barrier.wait();
|
||||
|
||||
execute_all(input, handle, &manager).unwrap();
|
||||
|
||||
barrier.wait();
|
||||
}
|
||||
}
|
||||
})
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let _jhs = JoinOnDrop(jhs);
|
||||
|
||||
b.iter_batched(
|
||||
|| {
|
||||
for _ in 0..threads {
|
||||
work_tx.send(()).unwrap()
|
||||
}
|
||||
},
|
||||
|()| {
|
||||
// start the work
|
||||
barrier.wait();
|
||||
|
||||
// wait for work to complete
|
||||
barrier.wait();
|
||||
},
|
||||
criterion::BatchSize::PerIteration,
|
||||
);
|
||||
|
||||
drop(work_tx);
|
||||
let mut requesters = JoinSet::new();
|
||||
for _ in 0..nrequesters {
|
||||
let _entered = rt.enter();
|
||||
let manager = manager.clone();
|
||||
let barrier = barrier.clone();
|
||||
requesters.spawn(async move {
|
||||
loop {
|
||||
let input = input_factory();
|
||||
barrier.wait().await;
|
||||
let page = input.execute(&manager).await.unwrap();
|
||||
assert_eq!(page.remaining(), 8192);
|
||||
barrier.wait().await;
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
struct JoinOnDrop(Vec<std::thread::JoinHandle<()>>);
|
||||
let do_one_iteration = || {
|
||||
rt.block_on(async {
|
||||
barrier.wait().await;
|
||||
// wait for work to complete
|
||||
barrier.wait().await;
|
||||
})
|
||||
};
|
||||
|
||||
impl Drop for JoinOnDrop {
|
||||
// it's not really needless because we want join all then check for panicks
|
||||
#[allow(clippy::needless_collect)]
|
||||
fn drop(&mut self) {
|
||||
// first join all
|
||||
let results = self.0.drain(..).map(|jh| jh.join()).collect::<Vec<_>>();
|
||||
// then check the results; panicking here is not great, but it does get the message across
|
||||
// to the user, and sets an exit value.
|
||||
results.into_iter().try_for_each(|res| res).unwrap();
|
||||
}
|
||||
}
|
||||
b.iter_batched(
|
||||
|| {
|
||||
// warmup
|
||||
do_one_iteration();
|
||||
},
|
||||
|()| {
|
||||
// work loop
|
||||
do_one_iteration();
|
||||
},
|
||||
criterion::BatchSize::PerIteration,
|
||||
);
|
||||
|
||||
fn execute_all<I>(
|
||||
input: I,
|
||||
handle: &tokio::runtime::Handle,
|
||||
manager: &PostgresRedoManager,
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
I: IntoIterator<Item = Request>,
|
||||
{
|
||||
// just fire all requests as fast as possible
|
||||
input.into_iter().try_for_each(|req| {
|
||||
let page = req.execute(handle, manager)?;
|
||||
assert_eq!(page.remaining(), 8192);
|
||||
anyhow::Ok(())
|
||||
})
|
||||
rt.block_on(requesters.shutdown());
|
||||
}
|
||||
|
||||
criterion_group!(benches, redo_scenarios);
|
||||
@@ -493,11 +462,7 @@ struct Request {
|
||||
}
|
||||
|
||||
impl Request {
|
||||
fn execute(
|
||||
self,
|
||||
rt: &tokio::runtime::Handle,
|
||||
manager: &PostgresRedoManager,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
async fn execute(self, manager: &PostgresRedoManager) -> anyhow::Result<Bytes> {
|
||||
let Request {
|
||||
key,
|
||||
lsn,
|
||||
@@ -506,6 +471,8 @@ impl Request {
|
||||
pg_version,
|
||||
} = self;
|
||||
|
||||
rt.block_on(manager.request_redo(key, lsn, base_img, records, pg_version))
|
||||
manager
|
||||
.request_redo(key, lsn, base_img, records, pg_version)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,8 +14,12 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
|
||||
}
|
||||
(Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
|
||||
(Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
|
||||
(Scope::SafekeeperData, _) => Err(AuthError(
|
||||
"SafekeeperData scope makes no sense for Pageserver".into(),
|
||||
(Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError(
|
||||
format!(
|
||||
"JWT scope '{:?}' is ineligible for Pageserver auth",
|
||||
claims.scope
|
||||
)
|
||||
.into(),
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1359,6 +1359,7 @@ broker_endpoint = '{broker_endpoint}'
|
||||
parsed_remote_storage_config,
|
||||
RemoteStorageConfig {
|
||||
storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
|
||||
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
|
||||
},
|
||||
"Remote storage config should correctly parse the local FS config and fill other storage defaults"
|
||||
);
|
||||
@@ -1426,6 +1427,7 @@ broker_endpoint = '{broker_endpoint}'
|
||||
concurrency_limit: s3_concurrency_limit,
|
||||
max_keys_per_list_response: None,
|
||||
}),
|
||||
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
|
||||
},
|
||||
"Remote storage config should correctly parse the S3 config"
|
||||
);
|
||||
|
||||
@@ -234,7 +234,7 @@ impl DeletionHeader {
|
||||
let header_bytes = serde_json::to_vec(self).context("serialize deletion header")?;
|
||||
let header_path = conf.deletion_header_path();
|
||||
let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX);
|
||||
VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes)
|
||||
VirtualFile::crashsafe_overwrite(&header_path, &temp_path, header_bytes)
|
||||
.await
|
||||
.maybe_fatal_err("save deletion header")?;
|
||||
|
||||
@@ -325,7 +325,7 @@ impl DeletionList {
|
||||
let temp_path = path_with_suffix_extension(&path, TEMP_SUFFIX);
|
||||
|
||||
let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list");
|
||||
VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes)
|
||||
VirtualFile::crashsafe_overwrite(&path, &temp_path, bytes)
|
||||
.await
|
||||
.maybe_fatal_err("save deletion list")
|
||||
.map_err(Into::into)
|
||||
@@ -834,7 +834,6 @@ mod test {
|
||||
}
|
||||
|
||||
impl ControlPlaneGenerationsApi for MockControlPlane {
|
||||
#[allow(clippy::diverging_sub_expression)] // False positive via async_trait
|
||||
async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
|
||||
unimplemented!()
|
||||
}
|
||||
@@ -868,6 +867,7 @@ mod test {
|
||||
let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?;
|
||||
let storage_config = RemoteStorageConfig {
|
||||
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
|
||||
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
|
||||
};
|
||||
let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
|
||||
|
||||
@@ -1171,6 +1171,7 @@ pub(crate) mod mock {
|
||||
pub struct ConsumerState {
|
||||
rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,
|
||||
executor_rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
impl ConsumerState {
|
||||
@@ -1184,7 +1185,7 @@ pub(crate) mod mock {
|
||||
match msg {
|
||||
DeleterMessage::Delete(objects) => {
|
||||
for path in objects {
|
||||
match remote_storage.delete(&path).await {
|
||||
match remote_storage.delete(&path, &self.cancel).await {
|
||||
Ok(_) => {
|
||||
debug!("Deleted {path}");
|
||||
}
|
||||
@@ -1217,7 +1218,7 @@ pub(crate) mod mock {
|
||||
|
||||
for path in objects {
|
||||
info!("Executing deletion {path}");
|
||||
match remote_storage.delete(&path).await {
|
||||
match remote_storage.delete(&path, &self.cancel).await {
|
||||
Ok(_) => {
|
||||
debug!("Deleted {path}");
|
||||
}
|
||||
@@ -1267,7 +1268,11 @@ pub(crate) mod mock {
|
||||
executor_tx,
|
||||
executed,
|
||||
remote_storage,
|
||||
consumer: std::sync::Mutex::new(ConsumerState { rx, executor_rx }),
|
||||
consumer: std::sync::Mutex::new(ConsumerState {
|
||||
rx,
|
||||
executor_rx,
|
||||
cancel: CancellationToken::new(),
|
||||
}),
|
||||
lsn_table: Arc::new(std::sync::RwLock::new(VisibleLsnUpdates::new())),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use remote_storage::RemotePath;
|
||||
use remote_storage::TimeoutOrCancel;
|
||||
use remote_storage::MAX_KEYS_PER_DELETE;
|
||||
use std::time::Duration;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -71,9 +72,11 @@ impl Deleter {
|
||||
Err(anyhow::anyhow!("failpoint: deletion-queue-before-execute"))
|
||||
});
|
||||
|
||||
self.remote_storage.delete_objects(&self.accumulator).await
|
||||
self.remote_storage
|
||||
.delete_objects(&self.accumulator, &self.cancel)
|
||||
.await
|
||||
},
|
||||
|_| false,
|
||||
TimeoutOrCancel::caused_by_cancel,
|
||||
3,
|
||||
10,
|
||||
"executing deletion batch",
|
||||
|
||||
@@ -351,7 +351,6 @@ pub enum IterationOutcome<U> {
|
||||
Finished(IterationOutcomeFinished<U>),
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct IterationOutcomeFinished<U> {
|
||||
/// The actual usage observed before we started the iteration.
|
||||
@@ -366,7 +365,6 @@ pub struct IterationOutcomeFinished<U> {
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
#[allow(dead_code)]
|
||||
struct AssumedUsage<U> {
|
||||
/// The expected value for `after`, after phase 2.
|
||||
projected_after: U,
|
||||
@@ -374,14 +372,12 @@ struct AssumedUsage<U> {
|
||||
failed: LayerCount,
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[derive(Debug, Serialize)]
|
||||
struct PlannedUsage<U> {
|
||||
respecting_tenant_min_resident_size: U,
|
||||
fallback_to_global_lru: Option<U>,
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[derive(Debug, Default, Serialize)]
|
||||
struct LayerCount {
|
||||
file_sizes: u64,
|
||||
@@ -565,7 +561,6 @@ pub(crate) struct EvictionSecondaryLayer {
|
||||
#[derive(Clone)]
|
||||
pub(crate) enum EvictionLayer {
|
||||
Attached(Layer),
|
||||
#[allow(dead_code)]
|
||||
Secondary(EvictionSecondaryLayer),
|
||||
}
|
||||
|
||||
@@ -1105,7 +1100,6 @@ mod filesystem_level_usage {
|
||||
use super::DiskUsageEvictionTaskConfig;
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
#[allow(dead_code)]
|
||||
pub struct Usage<'a> {
|
||||
config: &'a DiskUsageEvictionTaskConfig,
|
||||
|
||||
|
||||
@@ -422,6 +422,7 @@ async fn build_timeline_info_common(
|
||||
tenant::timeline::logical_size::Accuracy::Approximate => false,
|
||||
tenant::timeline::logical_size::Accuracy::Exact => true,
|
||||
},
|
||||
directory_entries_counts: timeline.get_directory_metrics().to_vec(),
|
||||
current_physical_size,
|
||||
current_logical_size_non_incremental: None,
|
||||
timeline_dir_layer_file_size_sum: None,
|
||||
@@ -1135,7 +1136,7 @@ async fn tenant_shard_split_handler(
|
||||
|
||||
let new_shards = state
|
||||
.tenant_manager
|
||||
.shard_split(tenant_shard_id, ShardCount(req.new_shard_count), &ctx)
|
||||
.shard_split(tenant_shard_id, ShardCount::new(req.new_shard_count), &ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -1950,6 +1951,7 @@ async fn put_io_engine_handler(
|
||||
mut r: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&r, None)?;
|
||||
let kind: crate::virtual_file::IoEngineKind = json_request(&mut r).await?;
|
||||
crate::virtual_file::io_engine::set(kind);
|
||||
json_response(StatusCode::OK, ())
|
||||
@@ -2213,7 +2215,7 @@ pub fn make_router(
|
||||
)
|
||||
.get(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/keyspace",
|
||||
|r| testing_api_handler("read out the keyspace", r, timeline_collect_keyspace),
|
||||
|r| api_handler(r, timeline_collect_keyspace),
|
||||
)
|
||||
.put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
|
||||
.any(handler_404))
|
||||
|
||||
@@ -602,6 +602,15 @@ pub(crate) mod initial_logical_size {
|
||||
});
|
||||
}
|
||||
|
||||
static DIRECTORY_ENTRIES_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_directory_entries_count",
|
||||
"Sum of the entries in pageserver-stored directory listings",
|
||||
&["tenant_id", "shard_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_tenant_states_count",
|
||||
@@ -1809,6 +1818,7 @@ pub(crate) struct TimelineMetrics {
|
||||
resident_physical_size_gauge: UIntGauge,
|
||||
/// copy of LayeredTimeline.current_logical_size
|
||||
pub current_logical_size_gauge: UIntGauge,
|
||||
pub directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>>,
|
||||
pub num_persistent_files_created: IntCounter,
|
||||
pub persistent_bytes_written: IntCounter,
|
||||
pub evictions: IntCounter,
|
||||
@@ -1818,12 +1828,12 @@ pub(crate) struct TimelineMetrics {
|
||||
impl TimelineMetrics {
|
||||
pub fn new(
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
timeline_id_raw: &TimelineId,
|
||||
evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder,
|
||||
) -> Self {
|
||||
let tenant_id = tenant_shard_id.tenant_id.to_string();
|
||||
let shard_id = format!("{}", tenant_shard_id.shard_slug());
|
||||
let timeline_id = timeline_id.to_string();
|
||||
let timeline_id = timeline_id_raw.to_string();
|
||||
let flush_time_histo = StorageTimeMetrics::new(
|
||||
StorageTimeOperation::LayerFlush,
|
||||
&tenant_id,
|
||||
@@ -1876,6 +1886,22 @@ impl TimelineMetrics {
|
||||
let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
// TODO use impl Trait syntax here once we have ability to use it: https://github.com/rust-lang/rust/issues/63065
|
||||
let directory_entries_count_gauge_closure = {
|
||||
let tenant_shard_id = *tenant_shard_id;
|
||||
let timeline_id_raw = *timeline_id_raw;
|
||||
move || {
|
||||
let tenant_id = tenant_shard_id.tenant_id.to_string();
|
||||
let shard_id = format!("{}", tenant_shard_id.shard_slug());
|
||||
let timeline_id = timeline_id_raw.to_string();
|
||||
let gauge: UIntGauge = DIRECTORY_ENTRIES_COUNT
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
gauge
|
||||
}
|
||||
};
|
||||
let directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>> =
|
||||
Lazy::new(Box::new(directory_entries_count_gauge_closure));
|
||||
let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
@@ -1902,6 +1928,7 @@ impl TimelineMetrics {
|
||||
last_record_gauge,
|
||||
resident_physical_size_gauge,
|
||||
current_logical_size_gauge,
|
||||
directory_entries_count_gauge,
|
||||
num_persistent_files_created,
|
||||
persistent_bytes_written,
|
||||
evictions,
|
||||
@@ -1944,6 +1971,9 @@ impl Drop for TimelineMetrics {
|
||||
RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
|
||||
}
|
||||
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
|
||||
if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
|
||||
let _ = metric.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
|
||||
}
|
||||
let _ =
|
||||
NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
|
||||
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
|
||||
|
||||
@@ -26,7 +26,7 @@ use pageserver_api::models::{
|
||||
PagestreamNblocksResponse,
|
||||
};
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
use pageserver_api::shard::{ShardCount, ShardNumber};
|
||||
use pageserver_api::shard::ShardNumber;
|
||||
use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError};
|
||||
use pq_proto::framed::ConnectionError;
|
||||
use pq_proto::FeStartupPacket;
|
||||
@@ -998,7 +998,7 @@ impl PageServerHandler {
|
||||
) -> Result<&Arc<Timeline>, Key> {
|
||||
let key = if let Some((first_idx, first_timeline)) = self.shard_timelines.iter().next() {
|
||||
// Fastest path: single sharded case
|
||||
if first_idx.shard_count < ShardCount(2) {
|
||||
if first_idx.shard_count.count() == 1 {
|
||||
return Ok(&first_timeline.timeline);
|
||||
}
|
||||
|
||||
|
||||
@@ -14,6 +14,7 @@ use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_i
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::{ensure, Context};
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use enum_map::Enum;
|
||||
use pageserver_api::key::{
|
||||
dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
|
||||
rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
|
||||
@@ -155,6 +156,8 @@ impl Timeline {
|
||||
pending_updates: HashMap::new(),
|
||||
pending_deletions: Vec::new(),
|
||||
pending_nblocks: 0,
|
||||
pending_aux_files: None,
|
||||
pending_directory_entries: Vec::new(),
|
||||
lsn,
|
||||
}
|
||||
}
|
||||
@@ -868,6 +871,15 @@ pub struct DatadirModification<'a> {
|
||||
pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
|
||||
pending_deletions: Vec<(Range<Key>, Lsn)>,
|
||||
pending_nblocks: i64,
|
||||
|
||||
// If we already wrote any aux file changes in this modification, stash the latest dir. If set,
|
||||
// [`Self::put_file`] may assume that it is safe to emit a delta rather than checking
|
||||
// if AUX_FILES_KEY is already set.
|
||||
pending_aux_files: Option<AuxFilesDirectory>,
|
||||
|
||||
/// For special "directory" keys that store key-value maps, track the size of the map
|
||||
/// if it was updated in this modification.
|
||||
pending_directory_entries: Vec<(DirectoryKind, usize)>,
|
||||
}
|
||||
|
||||
impl<'a> DatadirModification<'a> {
|
||||
@@ -899,6 +911,7 @@ impl<'a> DatadirModification<'a> {
|
||||
let buf = DbDirectory::ser(&DbDirectory {
|
||||
dbdirs: HashMap::new(),
|
||||
})?;
|
||||
self.pending_directory_entries.push((DirectoryKind::Db, 0));
|
||||
self.put(DBDIR_KEY, Value::Image(buf.into()));
|
||||
|
||||
// Create AuxFilesDirectory
|
||||
@@ -907,16 +920,24 @@ impl<'a> DatadirModification<'a> {
|
||||
let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory {
|
||||
xids: HashSet::new(),
|
||||
})?;
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::TwoPhase, 0));
|
||||
self.put(TWOPHASEDIR_KEY, Value::Image(buf.into()));
|
||||
|
||||
let buf: Bytes = SlruSegmentDirectory::ser(&SlruSegmentDirectory::default())?.into();
|
||||
let empty_dir = Value::Image(buf);
|
||||
self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone());
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::SlruSegment(SlruKind::Clog), 0));
|
||||
self.put(
|
||||
slru_dir_to_key(SlruKind::MultiXactMembers),
|
||||
empty_dir.clone(),
|
||||
);
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::SlruSegment(SlruKind::Clog), 0));
|
||||
self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir);
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), 0));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1017,6 +1038,7 @@ impl<'a> DatadirModification<'a> {
|
||||
let buf = RelDirectory::ser(&RelDirectory {
|
||||
rels: HashSet::new(),
|
||||
})?;
|
||||
self.pending_directory_entries.push((DirectoryKind::Rel, 0));
|
||||
self.put(
|
||||
rel_dir_to_key(spcnode, dbnode),
|
||||
Value::Image(Bytes::from(buf)),
|
||||
@@ -1039,6 +1061,8 @@ impl<'a> DatadirModification<'a> {
|
||||
if !dir.xids.insert(xid) {
|
||||
anyhow::bail!("twophase file for xid {} already exists", xid);
|
||||
}
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::TwoPhase, dir.xids.len()));
|
||||
self.put(
|
||||
TWOPHASEDIR_KEY,
|
||||
Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)),
|
||||
@@ -1074,6 +1098,8 @@ impl<'a> DatadirModification<'a> {
|
||||
let mut dir = DbDirectory::des(&buf)?;
|
||||
if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() {
|
||||
let buf = DbDirectory::ser(&dir)?;
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::Db, dir.dbdirs.len()));
|
||||
self.put(DBDIR_KEY, Value::Image(buf.into()));
|
||||
} else {
|
||||
warn!(
|
||||
@@ -1111,6 +1137,8 @@ impl<'a> DatadirModification<'a> {
|
||||
// Didn't exist. Update dbdir
|
||||
dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false);
|
||||
let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::Db, dbdir.dbdirs.len()));
|
||||
self.put(DBDIR_KEY, Value::Image(buf.into()));
|
||||
|
||||
// and create the RelDirectory
|
||||
@@ -1125,6 +1153,10 @@ impl<'a> DatadirModification<'a> {
|
||||
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
|
||||
return Err(RelationError::AlreadyExists);
|
||||
}
|
||||
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::Rel, rel_dir.rels.len()));
|
||||
|
||||
self.put(
|
||||
rel_dir_key,
|
||||
Value::Image(Bytes::from(
|
||||
@@ -1216,6 +1248,9 @@ impl<'a> DatadirModification<'a> {
|
||||
let buf = self.get(dir_key, ctx).await?;
|
||||
let mut dir = RelDirectory::des(&buf)?;
|
||||
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::Rel, dir.rels.len()));
|
||||
|
||||
if dir.rels.remove(&(rel.relnode, rel.forknum)) {
|
||||
self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?)));
|
||||
} else {
|
||||
@@ -1251,6 +1286,8 @@ impl<'a> DatadirModification<'a> {
|
||||
if !dir.segments.insert(segno) {
|
||||
anyhow::bail!("slru segment {kind:?}/{segno} already exists");
|
||||
}
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::SlruSegment(kind), dir.segments.len()));
|
||||
self.put(
|
||||
dir_key,
|
||||
Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)),
|
||||
@@ -1295,6 +1332,8 @@ impl<'a> DatadirModification<'a> {
|
||||
if !dir.segments.remove(&segno) {
|
||||
warn!("slru segment {:?}/{} does not exist", kind, segno);
|
||||
}
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::SlruSegment(kind), dir.segments.len()));
|
||||
self.put(
|
||||
dir_key,
|
||||
Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)),
|
||||
@@ -1325,6 +1364,8 @@ impl<'a> DatadirModification<'a> {
|
||||
if !dir.xids.remove(&xid) {
|
||||
warn!("twophase file for xid {} does not exist", xid);
|
||||
}
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::TwoPhase, dir.xids.len()));
|
||||
self.put(
|
||||
TWOPHASEDIR_KEY,
|
||||
Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)),
|
||||
@@ -1340,6 +1381,8 @@ impl<'a> DatadirModification<'a> {
|
||||
let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
|
||||
files: HashMap::new(),
|
||||
})?;
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::AuxFiles, 0));
|
||||
self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
|
||||
Ok(())
|
||||
}
|
||||
@@ -1350,28 +1393,76 @@ impl<'a> DatadirModification<'a> {
|
||||
content: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut dir = match self.get(AUX_FILES_KEY, ctx).await {
|
||||
Ok(buf) => AuxFilesDirectory::des(&buf)?,
|
||||
Err(e) => {
|
||||
// This is expected: historical databases do not have the key.
|
||||
debug!("Failed to get info about AUX files: {}", e);
|
||||
AuxFilesDirectory {
|
||||
files: HashMap::new(),
|
||||
let file_path = path.to_string();
|
||||
let content = if content.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(Bytes::copy_from_slice(content))
|
||||
};
|
||||
|
||||
let dir = if let Some(mut dir) = self.pending_aux_files.take() {
|
||||
// We already updated aux files in `self`: emit a delta and update our latest value
|
||||
|
||||
self.put(
|
||||
AUX_FILES_KEY,
|
||||
Value::WalRecord(NeonWalRecord::AuxFile {
|
||||
file_path: file_path.clone(),
|
||||
content: content.clone(),
|
||||
}),
|
||||
);
|
||||
|
||||
dir.upsert(file_path, content);
|
||||
dir
|
||||
} else {
|
||||
// Check if the AUX_FILES_KEY is initialized
|
||||
match self.get(AUX_FILES_KEY, ctx).await {
|
||||
Ok(dir_bytes) => {
|
||||
let mut dir = AuxFilesDirectory::des(&dir_bytes)?;
|
||||
// Key is already set, we may append a delta
|
||||
self.put(
|
||||
AUX_FILES_KEY,
|
||||
Value::WalRecord(NeonWalRecord::AuxFile {
|
||||
file_path: file_path.clone(),
|
||||
content: content.clone(),
|
||||
}),
|
||||
);
|
||||
dir.upsert(file_path, content);
|
||||
dir
|
||||
}
|
||||
Err(
|
||||
e @ (PageReconstructError::AncestorStopping(_)
|
||||
| PageReconstructError::Cancelled
|
||||
| PageReconstructError::AncestorLsnTimeout(_)),
|
||||
) => {
|
||||
// Important that we do not interpret a shutdown error as "not found" and thereby
|
||||
// reset the map.
|
||||
return Err(e.into());
|
||||
}
|
||||
// FIXME: PageReconstructError doesn't have an explicit variant for key-not-found, so
|
||||
// we are assuming that all _other_ possible errors represents a missing key. If some
|
||||
// other error occurs, we may incorrectly reset the map of aux files.
|
||||
Err(PageReconstructError::Other(_) | PageReconstructError::WalRedo(_)) => {
|
||||
// Key is missing, we must insert an image as the basis for subsequent deltas.
|
||||
|
||||
let mut dir = AuxFilesDirectory {
|
||||
files: HashMap::new(),
|
||||
};
|
||||
dir.upsert(file_path, content);
|
||||
self.put(
|
||||
AUX_FILES_KEY,
|
||||
Value::Image(Bytes::from(
|
||||
AuxFilesDirectory::ser(&dir).context("serialize")?,
|
||||
)),
|
||||
);
|
||||
dir
|
||||
}
|
||||
}
|
||||
};
|
||||
let path = path.to_string();
|
||||
if content.is_empty() {
|
||||
dir.files.remove(&path);
|
||||
} else {
|
||||
dir.files.insert(path, Bytes::copy_from_slice(content));
|
||||
}
|
||||
self.put(
|
||||
AUX_FILES_KEY,
|
||||
Value::Image(Bytes::from(
|
||||
AuxFilesDirectory::ser(&dir).context("serialize")?,
|
||||
)),
|
||||
);
|
||||
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::AuxFiles, dir.files.len()));
|
||||
self.pending_aux_files = Some(dir);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1427,6 +1518,10 @@ impl<'a> DatadirModification<'a> {
|
||||
self.pending_nblocks = 0;
|
||||
}
|
||||
|
||||
for (kind, count) in std::mem::take(&mut self.pending_directory_entries) {
|
||||
writer.update_directory_entries_count(kind, count as u64);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1464,6 +1559,10 @@ impl<'a> DatadirModification<'a> {
|
||||
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
|
||||
}
|
||||
|
||||
for (kind, count) in std::mem::take(&mut self.pending_directory_entries) {
|
||||
writer.update_directory_entries_count(kind, count as u64);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1573,8 +1672,18 @@ struct RelDirectory {
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Default)]
|
||||
struct AuxFilesDirectory {
|
||||
files: HashMap<String, Bytes>,
|
||||
pub(crate) struct AuxFilesDirectory {
|
||||
pub(crate) files: HashMap<String, Bytes>,
|
||||
}
|
||||
|
||||
impl AuxFilesDirectory {
|
||||
pub(crate) fn upsert(&mut self, key: String, value: Option<Bytes>) {
|
||||
if let Some(value) = value {
|
||||
self.files.insert(key, value);
|
||||
} else {
|
||||
self.files.remove(&key);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
@@ -1588,13 +1697,82 @@ struct SlruSegmentDirectory {
|
||||
segments: HashSet<u32>,
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, PartialEq, Eq, Debug, enum_map::Enum)]
|
||||
#[repr(u8)]
|
||||
pub(crate) enum DirectoryKind {
|
||||
Db,
|
||||
TwoPhase,
|
||||
Rel,
|
||||
AuxFiles,
|
||||
SlruSegment(SlruKind),
|
||||
}
|
||||
|
||||
impl DirectoryKind {
|
||||
pub(crate) const KINDS_NUM: usize = <DirectoryKind as Enum>::LENGTH;
|
||||
pub(crate) fn offset(&self) -> usize {
|
||||
self.into_usize()
|
||||
}
|
||||
}
|
||||
|
||||
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
|
||||
|
||||
#[allow(clippy::bool_assert_comparison)]
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
//use super::repo_harness::*;
|
||||
//use super::*;
|
||||
use hex_literal::hex;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use super::*;
|
||||
|
||||
use crate::{tenant::harness::TenantHarness, DEFAULT_PG_VERSION};
|
||||
|
||||
/// Test a round trip of aux file updates, from DatadirModification to reading back from the Timeline
|
||||
#[tokio::test]
|
||||
async fn aux_files_round_trip() -> anyhow::Result<()> {
|
||||
let name = "aux_files_round_trip";
|
||||
let harness = TenantHarness::create(name)?;
|
||||
|
||||
pub const TIMELINE_ID: TimelineId =
|
||||
TimelineId::from_array(hex!("11223344556677881122334455667788"));
|
||||
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
let tline = tline.raw_timeline().unwrap();
|
||||
|
||||
// First modification: insert two keys
|
||||
let mut modification = tline.begin_modification(Lsn(0x1000));
|
||||
modification.put_file("foo/bar1", b"content1", &ctx).await?;
|
||||
modification.set_lsn(Lsn(0x1008))?;
|
||||
modification.put_file("foo/bar2", b"content2", &ctx).await?;
|
||||
modification.commit(&ctx).await?;
|
||||
let expect_1008 = HashMap::from([
|
||||
("foo/bar1".to_string(), Bytes::from_static(b"content1")),
|
||||
("foo/bar2".to_string(), Bytes::from_static(b"content2")),
|
||||
]);
|
||||
|
||||
let readback = tline.list_aux_files(Lsn(0x1008), &ctx).await?;
|
||||
assert_eq!(readback, expect_1008);
|
||||
|
||||
// Second modification: update one key, remove the other
|
||||
let mut modification = tline.begin_modification(Lsn(0x2000));
|
||||
modification.put_file("foo/bar1", b"content3", &ctx).await?;
|
||||
modification.set_lsn(Lsn(0x2008))?;
|
||||
modification.put_file("foo/bar2", b"", &ctx).await?;
|
||||
modification.commit(&ctx).await?;
|
||||
let expect_2008 =
|
||||
HashMap::from([("foo/bar1".to_string(), Bytes::from_static(b"content3"))]);
|
||||
|
||||
let readback = tline.list_aux_files(Lsn(0x2008), &ctx).await?;
|
||||
assert_eq!(readback, expect_2008);
|
||||
|
||||
// Reading back in time works
|
||||
let readback = tline.list_aux_files(Lsn(0x1008), &ctx).await?;
|
||||
assert_eq!(readback, expect_1008);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/*
|
||||
fn assert_current_logical_size<R: Repository>(timeline: &DatadirTimeline<R>, lsn: Lsn) {
|
||||
|
||||
@@ -30,10 +30,6 @@
|
||||
//! only a single tenant or timeline.
|
||||
//!
|
||||
|
||||
// Clippy 1.60 incorrectly complains about the tokio::task_local!() macro.
|
||||
// Silence it. See https://github.com/rust-lang/rust-clippy/issues/9224.
|
||||
#![allow(clippy::declare_interior_mutable_const)]
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::future::Future;
|
||||
@@ -312,7 +308,6 @@ struct MutableTaskState {
|
||||
}
|
||||
|
||||
struct PageServerTask {
|
||||
#[allow(dead_code)] // unused currently
|
||||
task_id: PageserverTaskId,
|
||||
|
||||
kind: TaskKind,
|
||||
|
||||
@@ -25,6 +25,7 @@ use pageserver_api::shard::ShardIdentity;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use remote_storage::DownloadError;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use remote_storage::TimeoutOrCancel;
|
||||
use std::fmt;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use tokio::io::BufReader;
|
||||
@@ -48,7 +49,6 @@ use self::config::AttachmentMode;
|
||||
use self::config::LocationConf;
|
||||
use self::config::TenantConf;
|
||||
use self::delete::DeleteTenantFlow;
|
||||
use self::metadata::LoadMetadataError;
|
||||
use self::metadata::TimelineMetadata;
|
||||
use self::mgr::GetActiveTenantError;
|
||||
use self::mgr::GetTenantError;
|
||||
@@ -76,7 +76,6 @@ use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::config::LocationMode;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::metadata::load_metadata;
|
||||
pub use crate::tenant::remote_timeline_client::index::IndexPart;
|
||||
use crate::tenant::remote_timeline_client::remote_initdb_archive_path;
|
||||
use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
|
||||
@@ -93,7 +92,6 @@ use std::fmt::Debug;
|
||||
use std::fmt::Display;
|
||||
use std::fs;
|
||||
use std::fs::File;
|
||||
use std::io;
|
||||
use std::ops::Bound::Included;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
use std::sync::atomic::Ordering;
|
||||
@@ -487,11 +485,6 @@ impl From<std::io::Error> for InitdbError {
|
||||
}
|
||||
}
|
||||
|
||||
struct TenantDirectoryScan {
|
||||
sorted_timelines_to_load: Vec<(TimelineId, TimelineMetadata)>,
|
||||
timelines_to_resume_deletion: Vec<(TimelineId, Option<TimelineMetadata>)>,
|
||||
}
|
||||
|
||||
enum CreateTimelineCause {
|
||||
Load,
|
||||
Delete,
|
||||
@@ -927,9 +920,7 @@ impl Tenant {
|
||||
timelines: HashMap::new(),
|
||||
},
|
||||
(None, SpawnMode::Normal) => {
|
||||
// Deprecated dev mode: load from local disk state instead of remote storage
|
||||
// https://github.com/neondatabase/neon/issues/5624
|
||||
return self.load_local(ctx).await;
|
||||
anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624");
|
||||
}
|
||||
};
|
||||
|
||||
@@ -1197,149 +1188,6 @@ impl Tenant {
|
||||
))
|
||||
}
|
||||
|
||||
fn scan_and_sort_timelines_dir(self: Arc<Tenant>) -> anyhow::Result<TenantDirectoryScan> {
|
||||
let mut timelines_to_load: HashMap<TimelineId, TimelineMetadata> = HashMap::new();
|
||||
// Note timelines_to_resume_deletion needs to be separate because it can be not sortable
|
||||
// from the point of `tree_sort_timelines`. I e some parents can be missing because deletion
|
||||
// completed in non topological order (for example because parent has smaller number of layer files in it)
|
||||
let mut timelines_to_resume_deletion: Vec<(TimelineId, Option<TimelineMetadata>)> = vec![];
|
||||
|
||||
let timelines_dir = self.conf.timelines_path(&self.tenant_shard_id);
|
||||
|
||||
for entry in timelines_dir
|
||||
.read_dir_utf8()
|
||||
.context("list timelines directory for tenant")?
|
||||
{
|
||||
let entry = entry.context("read timeline dir entry")?;
|
||||
let timeline_dir = entry.path();
|
||||
|
||||
if crate::is_temporary(timeline_dir) {
|
||||
info!("Found temporary timeline directory, removing: {timeline_dir}");
|
||||
if let Err(e) = std::fs::remove_dir_all(timeline_dir) {
|
||||
error!("Failed to remove temporary directory '{timeline_dir}': {e:?}");
|
||||
}
|
||||
} else if is_uninit_mark(timeline_dir) {
|
||||
if !timeline_dir.exists() {
|
||||
warn!("Timeline dir entry become invalid: {timeline_dir}");
|
||||
continue;
|
||||
}
|
||||
|
||||
let timeline_uninit_mark_file = &timeline_dir;
|
||||
info!(
|
||||
"Found an uninit mark file {timeline_uninit_mark_file}, removing the timeline and its uninit mark",
|
||||
);
|
||||
let timeline_id =
|
||||
TimelineId::try_from(timeline_uninit_mark_file.file_stem())
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Could not parse timeline id out of the timeline uninit mark name {timeline_uninit_mark_file}",
|
||||
)
|
||||
})?;
|
||||
let timeline_dir = self.conf.timeline_path(&self.tenant_shard_id, &timeline_id);
|
||||
if let Err(e) =
|
||||
remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file)
|
||||
{
|
||||
error!("Failed to clean up uninit marked timeline: {e:?}");
|
||||
}
|
||||
} else if crate::is_delete_mark(timeline_dir) {
|
||||
// If metadata exists, load as usual, continue deletion
|
||||
let timeline_id = TimelineId::try_from(timeline_dir.file_stem())
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Could not parse timeline id out of the timeline uninit mark name {timeline_dir}",
|
||||
)
|
||||
})?;
|
||||
|
||||
info!("Found deletion mark for timeline {}", timeline_id);
|
||||
|
||||
match load_metadata(self.conf, &self.tenant_shard_id, &timeline_id) {
|
||||
Ok(metadata) => {
|
||||
timelines_to_resume_deletion.push((timeline_id, Some(metadata)))
|
||||
}
|
||||
Err(e) => match &e {
|
||||
LoadMetadataError::Read(r) => {
|
||||
if r.kind() != io::ErrorKind::NotFound {
|
||||
return Err(anyhow::anyhow!(e)).with_context(|| {
|
||||
format!("Failed to load metadata for timeline_id {timeline_id}")
|
||||
});
|
||||
}
|
||||
|
||||
// If metadata doesnt exist it means that we've crashed without
|
||||
// completing cleanup_remaining_timeline_fs_traces in DeleteTimelineFlow.
|
||||
// So save timeline_id for later call to `DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`.
|
||||
// We cant do it here because the method is async so we'd need block_on
|
||||
// and here we're in spawn_blocking. cleanup_remaining_timeline_fs_traces uses fs operations
|
||||
// so that basically results in a cycle:
|
||||
// spawn_blocking
|
||||
// - block_on
|
||||
// - spawn_blocking
|
||||
// which can lead to running out of threads in blocing pool.
|
||||
timelines_to_resume_deletion.push((timeline_id, None));
|
||||
}
|
||||
_ => {
|
||||
return Err(anyhow::anyhow!(e)).with_context(|| {
|
||||
format!("Failed to load metadata for timeline_id {timeline_id}")
|
||||
})
|
||||
}
|
||||
},
|
||||
}
|
||||
} else {
|
||||
if !timeline_dir.exists() {
|
||||
warn!("Timeline dir entry become invalid: {timeline_dir}");
|
||||
continue;
|
||||
}
|
||||
let timeline_id = TimelineId::try_from(timeline_dir.file_name())
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Could not parse timeline id out of the timeline dir name {timeline_dir}",
|
||||
)
|
||||
})?;
|
||||
let timeline_uninit_mark_file = self
|
||||
.conf
|
||||
.timeline_uninit_mark_file_path(self.tenant_shard_id, timeline_id);
|
||||
if timeline_uninit_mark_file.exists() {
|
||||
info!(
|
||||
%timeline_id,
|
||||
"Found an uninit mark file, removing the timeline and its uninit mark",
|
||||
);
|
||||
if let Err(e) =
|
||||
remove_timeline_and_uninit_mark(timeline_dir, &timeline_uninit_mark_file)
|
||||
{
|
||||
error!("Failed to clean up uninit marked timeline: {e:?}");
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
let timeline_delete_mark_file = self
|
||||
.conf
|
||||
.timeline_delete_mark_file_path(self.tenant_shard_id, timeline_id);
|
||||
if timeline_delete_mark_file.exists() {
|
||||
// Cleanup should be done in `is_delete_mark` branch above
|
||||
continue;
|
||||
}
|
||||
|
||||
let file_name = entry.file_name();
|
||||
if let Ok(timeline_id) = file_name.parse::<TimelineId>() {
|
||||
let metadata = load_metadata(self.conf, &self.tenant_shard_id, &timeline_id)
|
||||
.context("failed to load metadata")?;
|
||||
timelines_to_load.insert(timeline_id, metadata);
|
||||
} else {
|
||||
// A file or directory that doesn't look like a timeline ID
|
||||
warn!("unexpected file or directory in timelines directory: {file_name}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Sort the array of timeline IDs into tree-order, so that parent comes before
|
||||
// all its children.
|
||||
tree_sort_timelines(timelines_to_load, |m| m.ancestor_timeline()).map(|sorted_timelines| {
|
||||
TenantDirectoryScan {
|
||||
sorted_timelines_to_load: sorted_timelines,
|
||||
timelines_to_resume_deletion,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
async fn load_timeline_metadata(
|
||||
self: &Arc<Tenant>,
|
||||
timeline_ids: HashSet<TimelineId>,
|
||||
@@ -1403,141 +1251,6 @@ impl Tenant {
|
||||
Ok(timeline_preloads)
|
||||
}
|
||||
|
||||
///
|
||||
/// Background task to load in-memory data structures for this tenant, from
|
||||
/// files on disk. Used at pageserver startup.
|
||||
///
|
||||
/// No background tasks are started as part of this routine.
|
||||
async fn load_local(self: &Arc<Tenant>, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
debug!("loading tenant task");
|
||||
|
||||
// Load in-memory state to reflect the local files on disk
|
||||
//
|
||||
// Scan the directory, peek into the metadata file of each timeline, and
|
||||
// collect a list of timelines and their ancestors.
|
||||
let span = info_span!("blocking");
|
||||
let cloned = Arc::clone(self);
|
||||
|
||||
let scan = tokio::task::spawn_blocking(move || {
|
||||
let _g = span.entered();
|
||||
cloned.scan_and_sort_timelines_dir()
|
||||
})
|
||||
.await
|
||||
.context("load spawn_blocking")
|
||||
.and_then(|res| res)?;
|
||||
|
||||
// FIXME original collect_timeline_files contained one more check:
|
||||
// 1. "Timeline has no ancestor and no layer files"
|
||||
|
||||
// Process loadable timelines first
|
||||
for (timeline_id, local_metadata) in scan.sorted_timelines_to_load {
|
||||
if let Err(e) = self
|
||||
.load_local_timeline(timeline_id, local_metadata, ctx, false)
|
||||
.await
|
||||
{
|
||||
match e {
|
||||
LoadLocalTimelineError::Load(source) => {
|
||||
return Err(anyhow::anyhow!(source)).with_context(|| {
|
||||
format!("Failed to load local timeline: {timeline_id}")
|
||||
})
|
||||
}
|
||||
LoadLocalTimelineError::ResumeDeletion(source) => {
|
||||
// Make sure resumed deletion wont fail loading for entire tenant.
|
||||
error!("Failed to resume timeline deletion: {source:#}")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Resume deletion ones with deleted_mark
|
||||
for (timeline_id, maybe_local_metadata) in scan.timelines_to_resume_deletion {
|
||||
match maybe_local_metadata {
|
||||
None => {
|
||||
// See comment in `scan_and_sort_timelines_dir`.
|
||||
if let Err(e) =
|
||||
DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces(self, timeline_id)
|
||||
.await
|
||||
{
|
||||
warn!(
|
||||
"cannot clean up deleted timeline dir timeline_id: {} error: {:#}",
|
||||
timeline_id, e
|
||||
);
|
||||
}
|
||||
}
|
||||
Some(local_metadata) => {
|
||||
if let Err(e) = self
|
||||
.load_local_timeline(timeline_id, local_metadata, ctx, true)
|
||||
.await
|
||||
{
|
||||
match e {
|
||||
LoadLocalTimelineError::Load(source) => {
|
||||
// We tried to load deleted timeline, this is a bug.
|
||||
return Err(anyhow::anyhow!(source).context(
|
||||
format!("This is a bug. We tried to load deleted timeline which is wrong and loading failed. Timeline: {timeline_id}")
|
||||
));
|
||||
}
|
||||
LoadLocalTimelineError::ResumeDeletion(source) => {
|
||||
// Make sure resumed deletion wont fail loading for entire tenant.
|
||||
error!("Failed to resume timeline deletion: {source:#}")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
trace!("Done");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Subroutine of `load_tenant`, to load an individual timeline
|
||||
///
|
||||
/// NB: The parent is assumed to be already loaded!
|
||||
#[instrument(skip(self, local_metadata, ctx))]
|
||||
async fn load_local_timeline(
|
||||
self: &Arc<Self>,
|
||||
timeline_id: TimelineId,
|
||||
local_metadata: TimelineMetadata,
|
||||
ctx: &RequestContext,
|
||||
found_delete_mark: bool,
|
||||
) -> Result<(), LoadLocalTimelineError> {
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
let resources = self.build_timeline_resources(timeline_id);
|
||||
|
||||
if found_delete_mark {
|
||||
// There is no remote client, we found local metadata.
|
||||
// Continue cleaning up local disk.
|
||||
DeleteTimelineFlow::resume_deletion(
|
||||
Arc::clone(self),
|
||||
timeline_id,
|
||||
&local_metadata,
|
||||
None,
|
||||
self.deletion_queue_client.clone(),
|
||||
)
|
||||
.await
|
||||
.context("resume deletion")
|
||||
.map_err(LoadLocalTimelineError::ResumeDeletion)?;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() {
|
||||
let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false)
|
||||
.with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}"))
|
||||
.map_err(LoadLocalTimelineError::Load)?;
|
||||
Some(ancestor_timeline)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
self.timeline_init_and_sync(timeline_id, resources, None, local_metadata, ancestor, ctx)
|
||||
.await
|
||||
.map_err(LoadLocalTimelineError::Load)
|
||||
}
|
||||
|
||||
pub(crate) fn tenant_shard_id(&self) -> TenantShardId {
|
||||
self.tenant_shard_id
|
||||
}
|
||||
@@ -2369,7 +2082,7 @@ impl Tenant {
|
||||
generation: self.generation.into(),
|
||||
secondary_conf: None,
|
||||
shard_number: self.shard_identity.number.0,
|
||||
shard_count: self.shard_identity.count.0,
|
||||
shard_count: self.shard_identity.count.literal(),
|
||||
shard_stripe_size: self.shard_identity.stripe_size.0,
|
||||
tenant_conf: tenant_config,
|
||||
}
|
||||
@@ -2880,7 +2593,7 @@ impl Tenant {
|
||||
let config_path = config_path.to_owned();
|
||||
tokio::task::spawn_blocking(move || {
|
||||
Handle::current().block_on(async move {
|
||||
let conf_content = conf_content.as_bytes();
|
||||
let conf_content = conf_content.into_bytes();
|
||||
VirtualFile::crashsafe_overwrite(&config_path, &temp_path, conf_content)
|
||||
.await
|
||||
.with_context(|| {
|
||||
@@ -2917,7 +2630,7 @@ impl Tenant {
|
||||
let target_config_path = target_config_path.to_owned();
|
||||
tokio::task::spawn_blocking(move || {
|
||||
Handle::current().block_on(async move {
|
||||
let conf_content = conf_content.as_bytes();
|
||||
let conf_content = conf_content.into_bytes();
|
||||
VirtualFile::crashsafe_overwrite(&target_config_path, &temp_path, conf_content)
|
||||
.await
|
||||
.with_context(|| {
|
||||
@@ -3275,7 +2988,7 @@ impl Tenant {
|
||||
|
||||
/// For unit tests, make this visible so that other modules can directly create timelines
|
||||
#[cfg(test)]
|
||||
#[tracing::instrument(fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))]
|
||||
#[tracing::instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))]
|
||||
pub(crate) async fn bootstrap_timeline_test(
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
@@ -3339,7 +3052,7 @@ impl Tenant {
|
||||
&self.cancel,
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| anyhow::anyhow!("Cancelled"))
|
||||
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
|
||||
.and_then(|x| x)
|
||||
}
|
||||
|
||||
@@ -3389,8 +3102,10 @@ impl Tenant {
|
||||
);
|
||||
let dest_path =
|
||||
&remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &timeline_id);
|
||||
|
||||
// if this fails, it will get retried by retried control plane requests
|
||||
storage
|
||||
.copy_object(source_path, dest_path)
|
||||
.copy_object(source_path, dest_path, &self.cancel)
|
||||
.await
|
||||
.context("copy initdb tar")?;
|
||||
}
|
||||
@@ -3784,29 +3499,6 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
|
||||
fn remove_timeline_and_uninit_mark(
|
||||
timeline_dir: &Utf8Path,
|
||||
uninit_mark: &Utf8Path,
|
||||
) -> anyhow::Result<()> {
|
||||
fs::remove_dir_all(timeline_dir)
|
||||
.or_else(|e| {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
// we can leave the uninit mark without a timeline dir,
|
||||
// just remove the mark then
|
||||
Ok(())
|
||||
} else {
|
||||
Err(e)
|
||||
}
|
||||
})
|
||||
.with_context(|| {
|
||||
format!("Failed to remove unit marked timeline directory {timeline_dir}")
|
||||
})?;
|
||||
fs::remove_file(uninit_mark)
|
||||
.with_context(|| format!("Failed to remove timeline uninit mark file {uninit_mark}"))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Create the cluster temporarily in 'initdbpath' directory inside the repository
|
||||
/// to get bootstrap data for timeline initialization.
|
||||
async fn run_initdb(
|
||||
@@ -3914,6 +3606,7 @@ pub(crate) mod harness {
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::deletion_queue::mock::MockDeletionQueue;
|
||||
use crate::walredo::apply_neon;
|
||||
use crate::{
|
||||
config::PageServerConf, repository::Key, tenant::Tenant, walrecord::NeonWalRecord,
|
||||
};
|
||||
@@ -3929,8 +3622,7 @@ pub(crate) mod harness {
|
||||
TimelineId::from_array(hex!("AA223344556677881122334455667788"));
|
||||
|
||||
/// Convenience function to create a page image with given string as the only content
|
||||
#[allow(non_snake_case)]
|
||||
pub fn TEST_IMG(s: &str) -> Bytes {
|
||||
pub fn test_img(s: &str) -> Bytes {
|
||||
let mut buf = BytesMut::new();
|
||||
buf.extend_from_slice(s.as_bytes());
|
||||
buf.resize(64, 0);
|
||||
@@ -3966,13 +3658,6 @@ pub(crate) mod harness {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[derive(Debug)]
|
||||
enum LoadMode {
|
||||
Local,
|
||||
Remote,
|
||||
}
|
||||
|
||||
pub struct TenantHarness {
|
||||
pub conf: &'static PageServerConf,
|
||||
pub tenant_conf: TenantConf,
|
||||
@@ -4030,6 +3715,7 @@ pub(crate) mod harness {
|
||||
std::fs::create_dir_all(&remote_fs_dir).unwrap();
|
||||
let config = RemoteStorageConfig {
|
||||
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
|
||||
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
|
||||
};
|
||||
let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
|
||||
let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone()));
|
||||
@@ -4053,42 +3739,17 @@ pub(crate) mod harness {
|
||||
pub(crate) async fn load(&self) -> (Arc<Tenant>, RequestContext) {
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||
(
|
||||
self.try_load(&ctx)
|
||||
self.do_try_load(&ctx)
|
||||
.await
|
||||
.expect("failed to load test tenant"),
|
||||
ctx,
|
||||
)
|
||||
}
|
||||
|
||||
/// For tests that specifically want to exercise the local load path, which does
|
||||
/// not use remote storage.
|
||||
pub(crate) async fn try_load_local(
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
|
||||
pub(crate) async fn do_try_load(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Tenant>> {
|
||||
self.do_try_load(ctx, LoadMode::Local).await
|
||||
}
|
||||
|
||||
/// The 'load' in this function is either a local load or a normal attachment,
|
||||
pub(crate) async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result<Arc<Tenant>> {
|
||||
// If we have nothing in remote storage, must use load_local instead of attach: attach
|
||||
// will error out if there are no timelines.
|
||||
//
|
||||
// See https://github.com/neondatabase/neon/issues/5456 for how we will eliminate
|
||||
// this weird state of a Tenant which exists but doesn't have any timelines.
|
||||
let mode = match self.remote_empty() {
|
||||
true => LoadMode::Local,
|
||||
false => LoadMode::Remote,
|
||||
};
|
||||
|
||||
self.do_try_load(ctx, mode).await
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), ?mode))]
|
||||
async fn do_try_load(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
mode: LoadMode,
|
||||
) -> anyhow::Result<Arc<Tenant>> {
|
||||
let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));
|
||||
|
||||
@@ -4109,17 +3770,10 @@ pub(crate) mod harness {
|
||||
self.deletion_queue.new_client(),
|
||||
));
|
||||
|
||||
match mode {
|
||||
LoadMode::Local => {
|
||||
tenant.load_local(ctx).await?;
|
||||
}
|
||||
LoadMode::Remote => {
|
||||
let preload = tenant
|
||||
.preload(&self.remote_storage, CancellationToken::new())
|
||||
.await?;
|
||||
tenant.attach(Some(preload), SpawnMode::Normal, ctx).await?;
|
||||
}
|
||||
}
|
||||
let preload = tenant
|
||||
.preload(&self.remote_storage, CancellationToken::new())
|
||||
.await?;
|
||||
tenant.attach(Some(preload), SpawnMode::Normal, ctx).await?;
|
||||
|
||||
tenant.state.send_replace(TenantState::Active);
|
||||
for timeline in tenant.timelines.lock().unwrap().values() {
|
||||
@@ -4128,31 +3782,6 @@ pub(crate) mod harness {
|
||||
Ok(tenant)
|
||||
}
|
||||
|
||||
fn remote_empty(&self) -> bool {
|
||||
let tenant_path = self.conf.tenant_path(&self.tenant_shard_id);
|
||||
let remote_tenant_dir = self
|
||||
.remote_fs_dir
|
||||
.join(tenant_path.strip_prefix(&self.conf.workdir).unwrap());
|
||||
if std::fs::metadata(&remote_tenant_dir).is_err() {
|
||||
return true;
|
||||
}
|
||||
|
||||
match std::fs::read_dir(remote_tenant_dir)
|
||||
.unwrap()
|
||||
.flatten()
|
||||
.next()
|
||||
{
|
||||
Some(entry) => {
|
||||
tracing::debug!(
|
||||
"remote_empty: not empty, found file {}",
|
||||
entry.file_name().to_string_lossy(),
|
||||
);
|
||||
false
|
||||
}
|
||||
None => true,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn timeline_path(&self, timeline_id: &TimelineId) -> Utf8PathBuf {
|
||||
self.conf.timeline_path(&self.tenant_shard_id, timeline_id)
|
||||
}
|
||||
@@ -4173,20 +3802,33 @@ pub(crate) mod harness {
|
||||
records: Vec<(Lsn, NeonWalRecord)>,
|
||||
_pg_version: u32,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
let s = format!(
|
||||
"redo for {} to get to {}, with {} and {} records",
|
||||
key,
|
||||
lsn,
|
||||
if base_img.is_some() {
|
||||
"base image"
|
||||
} else {
|
||||
"no base image"
|
||||
},
|
||||
records.len()
|
||||
);
|
||||
println!("{s}");
|
||||
let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1));
|
||||
if records_neon {
|
||||
// For Neon wal records, we can decode without spawning postgres, so do so.
|
||||
let base_img = base_img.expect("Neon WAL redo requires base image").1;
|
||||
let mut page = BytesMut::new();
|
||||
page.extend_from_slice(&base_img);
|
||||
for (_record_lsn, record) in records {
|
||||
apply_neon::apply_in_neon(&record, key, &mut page)?;
|
||||
}
|
||||
Ok(page.freeze())
|
||||
} else {
|
||||
// We never spawn a postgres walredo process in unit tests: just log what we might have done.
|
||||
let s = format!(
|
||||
"redo for {} to get to {}, with {} and {} records",
|
||||
key,
|
||||
lsn,
|
||||
if base_img.is_some() {
|
||||
"base image"
|
||||
} else {
|
||||
"no base image"
|
||||
},
|
||||
records.len()
|
||||
);
|
||||
println!("{s}");
|
||||
|
||||
Ok(TEST_IMG(&s))
|
||||
Ok(test_img(&s))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -4198,7 +3840,6 @@ mod tests {
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::tenant::harness::*;
|
||||
use crate::DEFAULT_PG_VERSION;
|
||||
use crate::METADATA_FILE_NAME;
|
||||
use bytes::BytesMut;
|
||||
use hex_literal::hex;
|
||||
use once_cell::sync::Lazy;
|
||||
@@ -4220,7 +3861,7 @@ mod tests {
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
Lsn(0x10),
|
||||
&Value::Image(TEST_IMG("foo at 0x10")),
|
||||
&Value::Image(test_img("foo at 0x10")),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -4232,7 +3873,7 @@ mod tests {
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
Lsn(0x20),
|
||||
&Value::Image(TEST_IMG("foo at 0x20")),
|
||||
&Value::Image(test_img("foo at 0x20")),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -4241,15 +3882,15 @@ mod tests {
|
||||
|
||||
assert_eq!(
|
||||
tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?,
|
||||
TEST_IMG("foo at 0x10")
|
||||
test_img("foo at 0x10")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get(*TEST_KEY, Lsn(0x1f), &ctx).await?,
|
||||
TEST_IMG("foo at 0x10")
|
||||
test_img("foo at 0x10")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?,
|
||||
TEST_IMG("foo at 0x20")
|
||||
test_img("foo at 0x20")
|
||||
);
|
||||
|
||||
Ok(())
|
||||
@@ -4358,7 +3999,6 @@ mod tests {
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut lsn = start_lsn;
|
||||
#[allow(non_snake_case)]
|
||||
{
|
||||
let writer = tline.writer().await;
|
||||
// Create a relation on the timeline
|
||||
@@ -4366,7 +4006,7 @@ mod tests {
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
|
||||
&Value::Image(test_img(&format!("foo at {}", lsn))),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -4376,7 +4016,7 @@ mod tests {
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
|
||||
&Value::Image(test_img(&format!("foo at {}", lsn))),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -4390,7 +4030,7 @@ mod tests {
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
|
||||
&Value::Image(test_img(&format!("foo at {}", lsn))),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -4400,7 +4040,7 @@ mod tests {
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
|
||||
&Value::Image(test_img(&format!("foo at {}", lsn))),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -4555,7 +4195,7 @@ mod tests {
|
||||
// Broken, as long as you don't need to access data from the parent.
|
||||
assert_eq!(
|
||||
newtline.get(*TEST_KEY, Lsn(0x70), &ctx).await?,
|
||||
TEST_IMG(&format!("foo at {}", Lsn(0x70)))
|
||||
test_img(&format!("foo at {}", Lsn(0x70)))
|
||||
);
|
||||
|
||||
// This needs to traverse to the parent, and fails.
|
||||
@@ -4632,7 +4272,7 @@ mod tests {
|
||||
// Check that the data is still accessible on the branch.
|
||||
assert_eq!(
|
||||
newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await?,
|
||||
TEST_IMG(&format!("foo at {}", Lsn(0x40)))
|
||||
test_img(&format!("foo at {}", Lsn(0x40)))
|
||||
);
|
||||
|
||||
Ok(())
|
||||
@@ -4741,60 +4381,6 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn corrupt_local_metadata() -> anyhow::Result<()> {
|
||||
const TEST_NAME: &str = "corrupt_metadata";
|
||||
let harness = TenantHarness::create(TEST_NAME)?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
drop(tline);
|
||||
// so that all uploads finish & we can call harness.try_load() below again
|
||||
tenant
|
||||
.shutdown(Default::default(), true)
|
||||
.instrument(harness.span())
|
||||
.await
|
||||
.ok()
|
||||
.unwrap();
|
||||
drop(tenant);
|
||||
|
||||
// Corrupt local metadata
|
||||
let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME);
|
||||
assert!(metadata_path.is_file());
|
||||
let mut metadata_bytes = std::fs::read(&metadata_path)?;
|
||||
assert_eq!(metadata_bytes.len(), 512);
|
||||
metadata_bytes[8] ^= 1;
|
||||
std::fs::write(metadata_path, metadata_bytes)?;
|
||||
|
||||
let err = harness.try_load_local(&ctx).await.expect_err("should fail");
|
||||
// get all the stack with all .context, not only the last one
|
||||
let message = format!("{err:#}");
|
||||
let expected = "failed to load metadata";
|
||||
assert!(
|
||||
message.contains(expected),
|
||||
"message '{message}' expected to contain {expected}"
|
||||
);
|
||||
|
||||
let mut found_error_message = false;
|
||||
let mut err_source = err.source();
|
||||
while let Some(source) = err_source {
|
||||
if source.to_string().contains("metadata checksum mismatch") {
|
||||
found_error_message = true;
|
||||
break;
|
||||
}
|
||||
err_source = source.source();
|
||||
}
|
||||
assert!(
|
||||
found_error_message,
|
||||
"didn't find the corrupted metadata error in {}",
|
||||
message
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_images() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) = TenantHarness::create("test_images")?.load().await;
|
||||
@@ -4807,7 +4393,7 @@ mod tests {
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
Lsn(0x10),
|
||||
&Value::Image(TEST_IMG("foo at 0x10")),
|
||||
&Value::Image(test_img("foo at 0x10")),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -4824,7 +4410,7 @@ mod tests {
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
Lsn(0x20),
|
||||
&Value::Image(TEST_IMG("foo at 0x20")),
|
||||
&Value::Image(test_img("foo at 0x20")),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -4841,7 +4427,7 @@ mod tests {
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
Lsn(0x30),
|
||||
&Value::Image(TEST_IMG("foo at 0x30")),
|
||||
&Value::Image(test_img("foo at 0x30")),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -4858,7 +4444,7 @@ mod tests {
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
Lsn(0x40),
|
||||
&Value::Image(TEST_IMG("foo at 0x40")),
|
||||
&Value::Image(test_img("foo at 0x40")),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -4872,23 +4458,23 @@ mod tests {
|
||||
|
||||
assert_eq!(
|
||||
tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?,
|
||||
TEST_IMG("foo at 0x10")
|
||||
test_img("foo at 0x10")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get(*TEST_KEY, Lsn(0x1f), &ctx).await?,
|
||||
TEST_IMG("foo at 0x10")
|
||||
test_img("foo at 0x10")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?,
|
||||
TEST_IMG("foo at 0x20")
|
||||
test_img("foo at 0x20")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get(*TEST_KEY, Lsn(0x30), &ctx).await?,
|
||||
TEST_IMG("foo at 0x30")
|
||||
test_img("foo at 0x30")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get(*TEST_KEY, Lsn(0x40), &ctx).await?,
|
||||
TEST_IMG("foo at 0x40")
|
||||
test_img("foo at 0x40")
|
||||
);
|
||||
|
||||
Ok(())
|
||||
@@ -4920,7 +4506,7 @@ mod tests {
|
||||
.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -4982,7 +4568,7 @@ mod tests {
|
||||
.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -5003,7 +4589,7 @@ mod tests {
|
||||
.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -5017,7 +4603,7 @@ mod tests {
|
||||
test_key.field6 = blknum as u32;
|
||||
assert_eq!(
|
||||
tline.get(test_key, lsn, &ctx).await?,
|
||||
TEST_IMG(&format!("{} at {}", blknum, last_lsn))
|
||||
test_img(&format!("{} at {}", blknum, last_lsn))
|
||||
);
|
||||
}
|
||||
|
||||
@@ -5071,7 +4657,7 @@ mod tests {
|
||||
.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -5100,7 +4686,7 @@ mod tests {
|
||||
.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -5115,7 +4701,7 @@ mod tests {
|
||||
test_key.field6 = blknum as u32;
|
||||
assert_eq!(
|
||||
tline.get(test_key, lsn, &ctx).await?,
|
||||
TEST_IMG(&format!("{} at {}", blknum, last_lsn))
|
||||
test_img(&format!("{} at {}", blknum, last_lsn))
|
||||
);
|
||||
}
|
||||
|
||||
@@ -5177,7 +4763,7 @@ mod tests {
|
||||
.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("{} {} at {}", idx, blknum, lsn))),
|
||||
&Value::Image(test_img(&format!("{} {} at {}", idx, blknum, lsn))),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -5199,7 +4785,7 @@ mod tests {
|
||||
test_key.field6 = blknum as u32;
|
||||
assert_eq!(
|
||||
tline.get(test_key, *lsn, &ctx).await?,
|
||||
TEST_IMG(&format!("{idx} {blknum} at {lsn}"))
|
||||
test_img(&format!("{idx} {blknum} at {lsn}"))
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -131,27 +131,23 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
&mut self,
|
||||
src_buf: B,
|
||||
) -> (B::Buf, Result<(), Error>) {
|
||||
let src_buf_len = src_buf.bytes_init();
|
||||
let (src_buf, res) = if src_buf_len > 0 {
|
||||
let src_buf = src_buf.slice(0..src_buf_len);
|
||||
let res = self.inner.write_all(&src_buf).await;
|
||||
let src_buf = Slice::into_inner(src_buf);
|
||||
(src_buf, res)
|
||||
} else {
|
||||
let res = self.inner.write_all(&[]).await;
|
||||
(Slice::into_inner(src_buf.slice_full()), res)
|
||||
let (src_buf, res) = self.inner.write_all(src_buf).await;
|
||||
let nbytes = match res {
|
||||
Ok(nbytes) => nbytes,
|
||||
Err(e) => return (src_buf, Err(e)),
|
||||
};
|
||||
if let Ok(()) = &res {
|
||||
self.offset += src_buf_len as u64;
|
||||
}
|
||||
(src_buf, res)
|
||||
self.offset += nbytes as u64;
|
||||
(src_buf, Ok(()))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
/// Flushes the internal buffer to the underlying `VirtualFile`.
|
||||
pub async fn flush_buffer(&mut self) -> Result<(), Error> {
|
||||
self.inner.write_all(&self.buf).await?;
|
||||
self.buf.clear();
|
||||
let buf = std::mem::take(&mut self.buf);
|
||||
let (mut buf, res) = self.inner.write_all(buf).await;
|
||||
res?;
|
||||
buf.clear();
|
||||
self.buf = buf;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -251,7 +251,7 @@ impl LocationConf {
|
||||
} else {
|
||||
ShardIdentity::new(
|
||||
ShardNumber(conf.shard_number),
|
||||
ShardCount(conf.shard_count),
|
||||
ShardCount::new(conf.shard_count),
|
||||
ShardStripeSize(conf.shard_stripe_size),
|
||||
)?
|
||||
};
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::sync::Arc;
|
||||
use anyhow::Context;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use pageserver_api::{models::TenantState, shard::TenantShardId};
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath};
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
|
||||
use tokio::sync::OwnedMutexGuard;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, instrument, Instrument};
|
||||
@@ -84,17 +84,17 @@ async fn create_remote_delete_mark(
|
||||
let data = bytes::Bytes::from_static(data);
|
||||
let stream = futures::stream::once(futures::future::ready(Ok(data)));
|
||||
remote_storage
|
||||
.upload(stream, 0, &remote_mark_path, None)
|
||||
.upload(stream, 0, &remote_mark_path, None, cancel)
|
||||
.await
|
||||
},
|
||||
|_e| false,
|
||||
TimeoutOrCancel::caused_by_cancel,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"mark_upload",
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| anyhow::anyhow!("Cancelled"))
|
||||
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
|
||||
.and_then(|x| x)
|
||||
.context("mark_upload")?;
|
||||
|
||||
@@ -184,15 +184,15 @@ async fn remove_tenant_remote_delete_mark(
|
||||
if let Some(remote_storage) = remote_storage {
|
||||
let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
|
||||
backoff::retry(
|
||||
|| async { remote_storage.delete(&path).await },
|
||||
|_e| false,
|
||||
|| async { remote_storage.delete(&path, cancel).await },
|
||||
TimeoutOrCancel::caused_by_cancel,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"remove_tenant_remote_delete_mark",
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| anyhow::anyhow!("Cancelled"))
|
||||
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
|
||||
.and_then(|x| x)
|
||||
.context("remove_tenant_remote_delete_mark")?;
|
||||
}
|
||||
|
||||
@@ -36,7 +36,6 @@ use crate::{
|
||||
pub const VALUE_SZ: usize = 5;
|
||||
pub const MAX_VALUE: u64 = 0x007f_ffff_ffff;
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub const PAGE_SZ: usize = 8192;
|
||||
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
|
||||
@@ -6,6 +6,7 @@ use crate::context::RequestContext;
|
||||
use crate::page_cache::{self, PAGE_SZ};
|
||||
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
use bytes::BytesMut;
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::cmp::min;
|
||||
@@ -26,7 +27,10 @@ pub struct EphemeralFile {
|
||||
/// An ephemeral file is append-only.
|
||||
/// We keep the last page, which can still be modified, in [`Self::mutable_tail`].
|
||||
/// The other pages, which can no longer be modified, are accessed through the page cache.
|
||||
mutable_tail: [u8; PAGE_SZ],
|
||||
///
|
||||
/// None <=> IO is ongoing.
|
||||
/// Size is fixed to PAGE_SZ at creation time and must not be changed.
|
||||
mutable_tail: Option<BytesMut>,
|
||||
}
|
||||
|
||||
impl EphemeralFile {
|
||||
@@ -60,7 +64,7 @@ impl EphemeralFile {
|
||||
_timeline_id: timeline_id,
|
||||
file,
|
||||
len: 0,
|
||||
mutable_tail: [0u8; PAGE_SZ],
|
||||
mutable_tail: Some(BytesMut::zeroed(PAGE_SZ)),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -103,7 +107,13 @@ impl EphemeralFile {
|
||||
};
|
||||
} else {
|
||||
debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64);
|
||||
Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail))
|
||||
Ok(BlockLease::EphemeralFileMutableTail(
|
||||
self.mutable_tail
|
||||
.as_deref()
|
||||
.expect("we're not doing IO, it must be Some()")
|
||||
.try_into()
|
||||
.expect("we ensure that it's always PAGE_SZ"),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -135,21 +145,27 @@ impl EphemeralFile {
|
||||
) -> Result<(), io::Error> {
|
||||
let mut src_remaining = src;
|
||||
while !src_remaining.is_empty() {
|
||||
let dst_remaining = &mut self.ephemeral_file.mutable_tail[self.off..];
|
||||
let dst_remaining = &mut self
|
||||
.ephemeral_file
|
||||
.mutable_tail
|
||||
.as_deref_mut()
|
||||
.expect("IO is not yet ongoing")[self.off..];
|
||||
let n = min(dst_remaining.len(), src_remaining.len());
|
||||
dst_remaining[..n].copy_from_slice(&src_remaining[..n]);
|
||||
self.off += n;
|
||||
src_remaining = &src_remaining[n..];
|
||||
if self.off == PAGE_SZ {
|
||||
match self
|
||||
let mutable_tail = std::mem::take(&mut self.ephemeral_file.mutable_tail)
|
||||
.expect("IO is not yet ongoing");
|
||||
let (mutable_tail, res) = self
|
||||
.ephemeral_file
|
||||
.file
|
||||
.write_all_at(
|
||||
&self.ephemeral_file.mutable_tail,
|
||||
self.blknum as u64 * PAGE_SZ as u64,
|
||||
)
|
||||
.await
|
||||
{
|
||||
.write_all_at(mutable_tail, self.blknum as u64 * PAGE_SZ as u64)
|
||||
.await;
|
||||
// TODO: If we panic before we can put the mutable_tail back, subsequent calls will fail.
|
||||
// I.e., the IO isn't retryable if we panic.
|
||||
self.ephemeral_file.mutable_tail = Some(mutable_tail);
|
||||
match res {
|
||||
Ok(_) => {
|
||||
// Pre-warm the page cache with what we just wrote.
|
||||
// This isn't necessary for coherency/correctness, but it's how we've always done it.
|
||||
@@ -169,7 +185,12 @@ impl EphemeralFile {
|
||||
Ok(page_cache::ReadBufResult::NotFound(mut write_guard)) => {
|
||||
let buf: &mut [u8] = write_guard.deref_mut();
|
||||
debug_assert_eq!(buf.len(), PAGE_SZ);
|
||||
buf.copy_from_slice(&self.ephemeral_file.mutable_tail);
|
||||
buf.copy_from_slice(
|
||||
self.ephemeral_file
|
||||
.mutable_tail
|
||||
.as_deref()
|
||||
.expect("IO is not ongoing"),
|
||||
);
|
||||
let _ = write_guard.mark_valid();
|
||||
// pre-warm successful
|
||||
}
|
||||
@@ -181,7 +202,11 @@ impl EphemeralFile {
|
||||
// Zero the buffer for re-use.
|
||||
// Zeroing is critical for correcntess because the write_blob code below
|
||||
// and similarly read_blk expect zeroed pages.
|
||||
self.ephemeral_file.mutable_tail.fill(0);
|
||||
self.ephemeral_file
|
||||
.mutable_tail
|
||||
.as_deref_mut()
|
||||
.expect("IO is not ongoing")
|
||||
.fill(0);
|
||||
// This block is done, move to next one.
|
||||
self.blknum += 1;
|
||||
self.off = 0;
|
||||
|
||||
@@ -279,7 +279,7 @@ pub async fn save_metadata(
|
||||
let path = conf.metadata_path(tenant_shard_id, timeline_id);
|
||||
let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX);
|
||||
let metadata_bytes = data.to_bytes().context("serialize metadata")?;
|
||||
VirtualFile::crashsafe_overwrite(&path, &temp_path, &metadata_bytes)
|
||||
VirtualFile::crashsafe_overwrite(&path, &temp_path, metadata_bytes)
|
||||
.await
|
||||
.context("write metadata")?;
|
||||
Ok(())
|
||||
@@ -294,17 +294,6 @@ pub enum LoadMetadataError {
|
||||
Decode(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
pub fn load_metadata(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
) -> Result<TimelineMetadata, LoadMetadataError> {
|
||||
let metadata_path = conf.metadata_path(tenant_shard_id, timeline_id);
|
||||
let metadata_bytes = std::fs::read(metadata_path)?;
|
||||
|
||||
Ok(TimelineMetadata::from_bytes(&metadata_bytes)?)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
//! page server.
|
||||
|
||||
use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
|
||||
use futures::stream::StreamExt;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::models::ShardParameters;
|
||||
@@ -793,7 +794,7 @@ pub(crate) async fn set_new_tenant_config(
|
||||
info!("configuring tenant {tenant_id}");
|
||||
let tenant = get_tenant(tenant_shard_id, true)?;
|
||||
|
||||
if tenant.tenant_shard_id().shard_count > ShardCount(0) {
|
||||
if !tenant.tenant_shard_id().shard_count.is_unsharded() {
|
||||
// Note that we use ShardParameters::default below.
|
||||
return Err(SetNewTenantConfigError::Other(anyhow::anyhow!(
|
||||
"This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants"
|
||||
@@ -1375,7 +1376,7 @@ impl TenantManager {
|
||||
result
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), new_shard_count=%new_shard_count.0))]
|
||||
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), new_shard_count=%new_shard_count.literal()))]
|
||||
pub(crate) async fn shard_split(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
@@ -1385,11 +1386,10 @@ impl TenantManager {
|
||||
let tenant = get_tenant(tenant_shard_id, true)?;
|
||||
|
||||
// Plan: identify what the new child shards will be
|
||||
let effective_old_shard_count = std::cmp::max(tenant_shard_id.shard_count.0, 1);
|
||||
if new_shard_count <= ShardCount(effective_old_shard_count) {
|
||||
if new_shard_count.count() <= tenant_shard_id.shard_count.count() {
|
||||
anyhow::bail!("Requested shard count is not an increase");
|
||||
}
|
||||
let expansion_factor = new_shard_count.0 / effective_old_shard_count;
|
||||
let expansion_factor = new_shard_count.count() / tenant_shard_id.shard_count.count();
|
||||
if !expansion_factor.is_power_of_two() {
|
||||
anyhow::bail!("Requested split is not a power of two");
|
||||
}
|
||||
@@ -1439,8 +1439,10 @@ impl TenantManager {
|
||||
}
|
||||
};
|
||||
|
||||
// TODO: hardlink layers from the parent into the child shard directories so that they don't immediately re-download
|
||||
// TODO: erase the dentries from the parent
|
||||
// Optimization: hardlink layers from the parent into the children, so that they don't have to
|
||||
// re-download & duplicate the data referenced in their initial IndexPart
|
||||
self.shard_split_hardlink(parent, child_shards.clone())
|
||||
.await?;
|
||||
|
||||
// Take a snapshot of where the parent's WAL ingest had got to: we will wait for
|
||||
// child shards to reach this point.
|
||||
@@ -1479,10 +1481,11 @@ impl TenantManager {
|
||||
|
||||
// Phase 4: wait for child chards WAL ingest to catch up to target LSN
|
||||
for child_shard_id in &child_shards {
|
||||
let child_shard_id = *child_shard_id;
|
||||
let child_shard = {
|
||||
let locked = TENANTS.read().unwrap();
|
||||
let peek_slot =
|
||||
tenant_map_peek_slot(&locked, child_shard_id, TenantSlotPeekMode::Read)?;
|
||||
tenant_map_peek_slot(&locked, &child_shard_id, TenantSlotPeekMode::Read)?;
|
||||
peek_slot.and_then(|s| s.get_attached()).cloned()
|
||||
};
|
||||
if let Some(t) = child_shard {
|
||||
@@ -1517,7 +1520,7 @@ impl TenantManager {
|
||||
}
|
||||
}
|
||||
|
||||
// Phase 5: Shut down the parent shard.
|
||||
// Phase 5: Shut down the parent shard, and erase it from disk
|
||||
let (_guard, progress) = completion::channel();
|
||||
match parent.shutdown(progress, false).await {
|
||||
Ok(()) => {}
|
||||
@@ -1525,6 +1528,24 @@ impl TenantManager {
|
||||
other.wait().await;
|
||||
}
|
||||
}
|
||||
let local_tenant_directory = self.conf.tenant_path(&tenant_shard_id);
|
||||
let tmp_path = safe_rename_tenant_dir(&local_tenant_directory)
|
||||
.await
|
||||
.with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?;
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::MgmtRequest,
|
||||
None,
|
||||
None,
|
||||
"tenant_files_delete",
|
||||
false,
|
||||
async move {
|
||||
fs::remove_dir_all(tmp_path.as_path())
|
||||
.await
|
||||
.with_context(|| format!("tenant directory {:?} deletion", tmp_path))
|
||||
},
|
||||
);
|
||||
|
||||
parent_slot_guard.drop_old_value()?;
|
||||
|
||||
// Phase 6: Release the InProgress on the parent shard
|
||||
@@ -1532,6 +1553,130 @@ impl TenantManager {
|
||||
|
||||
Ok(child_shards)
|
||||
}
|
||||
|
||||
/// Part of [`Self::shard_split`]: hard link parent shard layers into child shards, as an optimization
|
||||
/// to avoid the children downloading them again.
|
||||
///
|
||||
/// For each resident layer in the parent shard, we will hard link it into all of the child shards.
|
||||
async fn shard_split_hardlink(
|
||||
&self,
|
||||
parent_shard: &Tenant,
|
||||
child_shards: Vec<TenantShardId>,
|
||||
) -> anyhow::Result<()> {
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
|
||||
let parent_path = self.conf.tenant_path(parent_shard.get_tenant_shard_id());
|
||||
let (parent_timelines, parent_layers) = {
|
||||
let mut parent_layers = Vec::new();
|
||||
let timelines = parent_shard.timelines.lock().unwrap().clone();
|
||||
let parent_timelines = timelines.keys().cloned().collect::<Vec<_>>();
|
||||
for timeline in timelines.values() {
|
||||
let timeline_layers = timeline
|
||||
.layers
|
||||
.read()
|
||||
.await
|
||||
.resident_layers()
|
||||
.collect::<Vec<_>>()
|
||||
.await;
|
||||
for layer in timeline_layers {
|
||||
let relative_path = layer
|
||||
.local_path()
|
||||
.strip_prefix(&parent_path)
|
||||
.context("Removing prefix from parent layer path")?;
|
||||
parent_layers.push(relative_path.to_owned());
|
||||
}
|
||||
}
|
||||
debug_assert!(
|
||||
!parent_layers.is_empty(),
|
||||
"shutdown cannot empty the layermap"
|
||||
);
|
||||
(parent_timelines, parent_layers)
|
||||
};
|
||||
|
||||
let mut child_prefixes = Vec::new();
|
||||
let mut create_dirs = Vec::new();
|
||||
|
||||
for child in child_shards {
|
||||
let child_prefix = self.conf.tenant_path(&child);
|
||||
create_dirs.push(child_prefix.clone());
|
||||
create_dirs.extend(
|
||||
parent_timelines
|
||||
.iter()
|
||||
.map(|t| self.conf.timeline_path(&child, t)),
|
||||
);
|
||||
|
||||
child_prefixes.push(child_prefix);
|
||||
}
|
||||
|
||||
// Since we will do a large number of small filesystem metadata operations, batch them into
|
||||
// spawn_blocking calls rather than doing each one as a tokio::fs round-trip.
|
||||
let jh = tokio::task::spawn_blocking(move || -> anyhow::Result<usize> {
|
||||
for dir in &create_dirs {
|
||||
if let Err(e) = std::fs::create_dir_all(dir) {
|
||||
// Ignore AlreadyExists errors, drop out on all other errors
|
||||
match e.kind() {
|
||||
std::io::ErrorKind::AlreadyExists => {}
|
||||
_ => {
|
||||
return Err(anyhow::anyhow!(e).context(format!("Creating {dir}")));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for child_prefix in child_prefixes {
|
||||
for relative_layer in &parent_layers {
|
||||
let parent_path = parent_path.join(relative_layer);
|
||||
let child_path = child_prefix.join(relative_layer);
|
||||
if let Err(e) = std::fs::hard_link(&parent_path, &child_path) {
|
||||
match e.kind() {
|
||||
std::io::ErrorKind::AlreadyExists => {}
|
||||
std::io::ErrorKind::NotFound => {
|
||||
tracing::info!(
|
||||
"Layer {} not found during hard-linking, evicted during split?",
|
||||
relative_layer
|
||||
);
|
||||
}
|
||||
_ => {
|
||||
return Err(anyhow::anyhow!(e).context(format!(
|
||||
"Hard linking {relative_layer} into {child_prefix}"
|
||||
)))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Durability is not required for correctness, but if we crashed during split and
|
||||
// then came restarted with empty timeline dirs, it would be very inefficient to
|
||||
// re-populate from remote storage.
|
||||
for dir in create_dirs {
|
||||
if let Err(e) = crashsafe::fsync(&dir) {
|
||||
// Something removed a newly created timeline dir out from underneath us? Extremely
|
||||
// unexpected, but not worth panic'ing over as this whole function is just an
|
||||
// optimization.
|
||||
tracing::warn!("Failed to fsync directory {dir}: {e}")
|
||||
}
|
||||
}
|
||||
|
||||
Ok(parent_layers.len())
|
||||
});
|
||||
|
||||
match jh.await {
|
||||
Ok(Ok(layer_count)) => {
|
||||
tracing::info!(count = layer_count, "Hard linked layers into child shards");
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
// This is an optimization, so we tolerate failure.
|
||||
tracing::warn!("Error hard-linking layers, proceeding anyway: {e}")
|
||||
}
|
||||
Err(e) => {
|
||||
// This is something totally unexpected like a panic, so bail out.
|
||||
anyhow::bail!("Error joining hard linking task: {e}");
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
|
||||
@@ -196,14 +196,12 @@ pub(crate) use upload::upload_initdb_dir;
|
||||
use utils::backoff::{
|
||||
self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
};
|
||||
use utils::timeout::{timeout_cancellable, TimeoutCancellableError};
|
||||
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::sync::atomic::{AtomicU32, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::Duration;
|
||||
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath, TimeoutOrCancel};
|
||||
use std::ops::DerefMut;
|
||||
use tracing::{debug, error, info, instrument, warn};
|
||||
use tracing::{info_span, Instrument};
|
||||
@@ -263,11 +261,6 @@ pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst";
|
||||
/// Default buffer size when interfacing with [`tokio::fs::File`].
|
||||
pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
|
||||
|
||||
/// This timeout is intended to deal with hangs in lower layers, e.g. stuck TCP flows. It is not
|
||||
/// intended to be snappy enough for prompt shutdown, as we have a CancellationToken for that.
|
||||
pub(crate) const UPLOAD_TIMEOUT: Duration = Duration::from_secs(120);
|
||||
pub(crate) const DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(120);
|
||||
|
||||
pub enum MaybeDeletedIndexPart {
|
||||
IndexPart(IndexPart),
|
||||
Deleted(IndexPart),
|
||||
@@ -331,40 +324,6 @@ pub struct RemoteTimelineClient {
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to anyhow.
|
||||
///
|
||||
/// This is a convenience for the various upload functions. In future
|
||||
/// the anyhow::Error result should be replaced with a more structured type that
|
||||
/// enables callers to avoid handling shutdown as an error.
|
||||
async fn upload_cancellable<F>(cancel: &CancellationToken, future: F) -> anyhow::Result<()>
|
||||
where
|
||||
F: std::future::Future<Output = anyhow::Result<()>>,
|
||||
{
|
||||
match timeout_cancellable(UPLOAD_TIMEOUT, cancel, future).await {
|
||||
Ok(Ok(())) => Ok(()),
|
||||
Ok(Err(e)) => Err(e),
|
||||
Err(TimeoutCancellableError::Timeout) => Err(anyhow::anyhow!("Timeout")),
|
||||
Err(TimeoutCancellableError::Cancelled) => Err(anyhow::anyhow!("Shutting down")),
|
||||
}
|
||||
}
|
||||
/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to DownloaDError.
|
||||
async fn download_cancellable<F, R>(
|
||||
cancel: &CancellationToken,
|
||||
future: F,
|
||||
) -> Result<R, DownloadError>
|
||||
where
|
||||
F: std::future::Future<Output = Result<R, DownloadError>>,
|
||||
{
|
||||
match timeout_cancellable(DOWNLOAD_TIMEOUT, cancel, future).await {
|
||||
Ok(Ok(r)) => Ok(r),
|
||||
Ok(Err(e)) => Err(e),
|
||||
Err(TimeoutCancellableError::Timeout) => {
|
||||
Err(DownloadError::Other(anyhow::anyhow!("Timed out")))
|
||||
}
|
||||
Err(TimeoutCancellableError::Cancelled) => Err(DownloadError::Cancelled),
|
||||
}
|
||||
}
|
||||
|
||||
impl RemoteTimelineClient {
|
||||
///
|
||||
/// Create a remote storage client for given timeline
|
||||
@@ -1050,7 +1009,7 @@ impl RemoteTimelineClient {
|
||||
&self.cancel,
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| anyhow::anyhow!("Cancelled"))
|
||||
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
|
||||
.and_then(|x| x)?;
|
||||
|
||||
// all good, disarm the guard and mark as success
|
||||
@@ -1082,14 +1041,14 @@ impl RemoteTimelineClient {
|
||||
upload::preserve_initdb_archive(&self.storage_impl, tenant_id, timeline_id, cancel)
|
||||
.await
|
||||
},
|
||||
|_e| false,
|
||||
TimeoutOrCancel::caused_by_cancel,
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"preserve_initdb_tar_zst",
|
||||
&cancel.clone(),
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| anyhow::anyhow!("Cancellled"))
|
||||
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
|
||||
.and_then(|x| x)
|
||||
.context("backing up initdb archive")?;
|
||||
Ok(())
|
||||
@@ -1151,7 +1110,7 @@ impl RemoteTimelineClient {
|
||||
let remaining = download_retry(
|
||||
|| async {
|
||||
self.storage_impl
|
||||
.list_files(Some(&timeline_storage_path), None)
|
||||
.list_files(Some(&timeline_storage_path), None, &cancel)
|
||||
.await
|
||||
},
|
||||
"list remaining files",
|
||||
@@ -1445,6 +1404,10 @@ impl RemoteTimelineClient {
|
||||
Ok(()) => {
|
||||
break;
|
||||
}
|
||||
Err(e) if TimeoutOrCancel::caused_by_cancel(&e) => {
|
||||
// loop around to do the proper stopping
|
||||
continue;
|
||||
}
|
||||
Err(e) => {
|
||||
let retries = task.retries.fetch_add(1, Ordering::SeqCst);
|
||||
|
||||
|
||||
@@ -11,16 +11,14 @@ use camino::{Utf8Path, Utf8PathBuf};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use tokio::fs::{self, File, OpenOptions};
|
||||
use tokio::io::{AsyncSeekExt, AsyncWriteExt};
|
||||
use tokio_util::io::StreamReader;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::warn;
|
||||
use utils::timeout::timeout_cancellable;
|
||||
use utils::{backoff, crashsafe};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::remote_timeline_client::{
|
||||
download_cancellable, remote_layer_path, remote_timelines_path, DOWNLOAD_TIMEOUT,
|
||||
};
|
||||
use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
use crate::tenant::Generation;
|
||||
use crate::virtual_file::on_fatal_io_error;
|
||||
@@ -83,15 +81,13 @@ pub async fn download_layer_file<'a>(
|
||||
.with_context(|| format!("create a destination file for layer '{temp_file_path}'"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
// Cancellation safety: it is safe to cancel this future, because it isn't writing to a local
|
||||
// file: the write to local file doesn't start until after the request header is returned
|
||||
// and we start draining the body stream below
|
||||
let download = download_cancellable(cancel, storage.download(&remote_path))
|
||||
let download = storage
|
||||
.download(&remote_path, cancel)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"open a download stream for layer with remote storage path '{remote_path:?}'"
|
||||
)
|
||||
"open a download stream for layer with remote storage path '{remote_path:?}'"
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
@@ -100,43 +96,26 @@ pub async fn download_layer_file<'a>(
|
||||
|
||||
let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
|
||||
|
||||
// Cancellation safety: it is safe to cancel this future because it is writing into a temporary file,
|
||||
// and we will unlink the temporary file if there is an error. This unlink is important because we
|
||||
// are in a retry loop, and we wouldn't want to leave behind a rogue write I/O to a file that
|
||||
// we will imminiently try and write to again.
|
||||
let bytes_amount: u64 = match timeout_cancellable(
|
||||
DOWNLOAD_TIMEOUT,
|
||||
cancel,
|
||||
tokio::io::copy_buf(&mut reader, &mut destination_file),
|
||||
)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
let bytes_amount = tokio::io::copy_buf(&mut reader, &mut destination_file)
|
||||
.await
|
||||
.with_context(|| format!(
|
||||
"download layer at remote path '{remote_path:?}' into file {temp_file_path:?}"
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other)?
|
||||
{
|
||||
Ok(b) => Ok(b),
|
||||
))
|
||||
.map_err(DownloadError::Other);
|
||||
|
||||
match bytes_amount {
|
||||
Ok(bytes_amount) => {
|
||||
let destination_file = destination_file.into_inner();
|
||||
Ok((destination_file, bytes_amount))
|
||||
}
|
||||
Err(e) => {
|
||||
// Remove incomplete files: on restart Timeline would do this anyway, but we must
|
||||
// do it here for the retry case.
|
||||
if let Err(e) = tokio::fs::remove_file(&temp_file_path).await {
|
||||
on_fatal_io_error(&e, &format!("Removing temporary file {temp_file_path}"));
|
||||
}
|
||||
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"download layer at remote path '{remote_path:?}' into file {temp_file_path:?}"
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let destination_file = destination_file.into_inner();
|
||||
|
||||
Ok((destination_file, bytes_amount))
|
||||
},
|
||||
&format!("download {remote_path:?}"),
|
||||
cancel,
|
||||
@@ -218,9 +197,11 @@ pub async fn list_remote_timelines(
|
||||
|
||||
let listing = download_retry_forever(
|
||||
|| {
|
||||
download_cancellable(
|
||||
storage.list(
|
||||
Some(&remote_path),
|
||||
ListingMode::WithDelimiter,
|
||||
None,
|
||||
&cancel,
|
||||
storage.list(Some(&remote_path), ListingMode::WithDelimiter, None),
|
||||
)
|
||||
},
|
||||
&format!("list timelines for {tenant_shard_id}"),
|
||||
@@ -259,26 +240,23 @@ async fn do_download_index_part(
|
||||
index_generation: Generation,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<IndexPart, DownloadError> {
|
||||
use futures::stream::StreamExt;
|
||||
|
||||
let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);
|
||||
|
||||
let index_part_bytes = download_retry_forever(
|
||||
|| async {
|
||||
// Cancellation: if is safe to cancel this future because we're just downloading into
|
||||
// a memory buffer, not touching local disk.
|
||||
let index_part_download =
|
||||
download_cancellable(cancel, storage.download(&remote_path)).await?;
|
||||
let download = storage.download(&remote_path, cancel).await?;
|
||||
|
||||
let mut index_part_bytes = Vec::new();
|
||||
let mut stream = std::pin::pin!(index_part_download.download_stream);
|
||||
while let Some(chunk) = stream.next().await {
|
||||
let chunk = chunk
|
||||
.with_context(|| format!("download index part at {remote_path:?}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
index_part_bytes.extend_from_slice(&chunk[..]);
|
||||
}
|
||||
Ok(index_part_bytes)
|
||||
let mut bytes = Vec::new();
|
||||
|
||||
let stream = download.download_stream;
|
||||
let mut stream = StreamReader::new(stream);
|
||||
|
||||
tokio::io::copy_buf(&mut stream, &mut bytes)
|
||||
.await
|
||||
.with_context(|| format!("download index part at {remote_path:?}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
Ok(bytes)
|
||||
},
|
||||
&format!("download {remote_path:?}"),
|
||||
cancel,
|
||||
@@ -373,7 +351,7 @@ pub(super) async fn download_index_part(
|
||||
let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
|
||||
|
||||
let indices = download_retry(
|
||||
|| async { storage.list_files(Some(&index_prefix), None).await },
|
||||
|| async { storage.list_files(Some(&index_prefix), None, cancel).await },
|
||||
"list index_part files",
|
||||
cancel,
|
||||
)
|
||||
@@ -446,11 +424,10 @@ pub(crate) async fn download_initdb_tar_zst(
|
||||
.with_context(|| format!("tempfile creation {temp_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let download = match download_cancellable(cancel, storage.download(&remote_path)).await
|
||||
{
|
||||
let download = match storage.download(&remote_path, cancel).await {
|
||||
Ok(dl) => dl,
|
||||
Err(DownloadError::NotFound) => {
|
||||
download_cancellable(cancel, storage.download(&remote_preserved_path)).await?
|
||||
storage.download(&remote_preserved_path, cancel).await?
|
||||
}
|
||||
Err(other) => Err(other)?,
|
||||
};
|
||||
@@ -460,6 +437,7 @@ pub(crate) async fn download_initdb_tar_zst(
|
||||
// TODO: this consumption of the response body should be subject to timeout + cancellation, but
|
||||
// not without thinking carefully about how to recover safely from cancelling a write to
|
||||
// local storage (e.g. by writing into a temp file as we do in download_layer)
|
||||
// FIXME: flip the weird error wrapping
|
||||
tokio::io::copy_buf(&mut download, &mut writer)
|
||||
.await
|
||||
.with_context(|| format!("download initdb.tar.zst at {remote_path:?}"))
|
||||
|
||||
@@ -16,7 +16,7 @@ use crate::{
|
||||
config::PageServerConf,
|
||||
tenant::remote_timeline_client::{
|
||||
index::IndexPart, remote_index_path, remote_initdb_archive_path,
|
||||
remote_initdb_preserved_archive_path, remote_path, upload_cancellable,
|
||||
remote_initdb_preserved_archive_path, remote_path,
|
||||
},
|
||||
};
|
||||
use remote_storage::{GenericRemoteStorage, TimeTravelError};
|
||||
@@ -49,16 +49,15 @@ pub(crate) async fn upload_index_part<'a>(
|
||||
let index_part_bytes = bytes::Bytes::from(index_part_bytes);
|
||||
|
||||
let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation);
|
||||
upload_cancellable(
|
||||
cancel,
|
||||
storage.upload_storage_object(
|
||||
storage
|
||||
.upload_storage_object(
|
||||
futures::stream::once(futures::future::ready(Ok(index_part_bytes))),
|
||||
index_part_size,
|
||||
&remote_path,
|
||||
),
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
|
||||
}
|
||||
|
||||
/// Attempts to upload given layer files.
|
||||
@@ -115,11 +114,10 @@ pub(super) async fn upload_timeline_layer<'a>(
|
||||
|
||||
let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE);
|
||||
|
||||
upload_cancellable(cancel, storage.upload(reader, fs_size, &storage_path, None))
|
||||
storage
|
||||
.upload(reader, fs_size, &storage_path, None, cancel)
|
||||
.await
|
||||
.with_context(|| format!("upload layer from local path '{source_path}'"))?;
|
||||
|
||||
Ok(())
|
||||
.with_context(|| format!("upload layer from local path '{source_path}'"))
|
||||
}
|
||||
|
||||
/// Uploads the given `initdb` data to the remote storage.
|
||||
@@ -139,12 +137,10 @@ pub(crate) async fn upload_initdb_dir(
|
||||
let file = tokio_util::io::ReaderStream::with_capacity(initdb_tar_zst, super::BUFFER_SIZE);
|
||||
|
||||
let remote_path = remote_initdb_archive_path(tenant_id, timeline_id);
|
||||
upload_cancellable(
|
||||
cancel,
|
||||
storage.upload_storage_object(file, size as usize, &remote_path),
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
|
||||
storage
|
||||
.upload_storage_object(file, size as usize, &remote_path, cancel)
|
||||
.await
|
||||
.with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
|
||||
}
|
||||
|
||||
pub(crate) async fn preserve_initdb_archive(
|
||||
@@ -155,7 +151,8 @@ pub(crate) async fn preserve_initdb_archive(
|
||||
) -> anyhow::Result<()> {
|
||||
let source_path = remote_initdb_archive_path(tenant_id, timeline_id);
|
||||
let dest_path = remote_initdb_preserved_archive_path(tenant_id, timeline_id);
|
||||
upload_cancellable(cancel, storage.copy_object(&source_path, &dest_path))
|
||||
storage
|
||||
.copy_object(&source_path, &dest_path, cancel)
|
||||
.await
|
||||
.with_context(|| format!("backing up initdb archive for '{tenant_id} / {timeline_id}'"))
|
||||
}
|
||||
|
||||
@@ -150,7 +150,7 @@ impl SecondaryTenant {
|
||||
generation: None,
|
||||
secondary_conf: Some(conf),
|
||||
shard_number: self.tenant_shard_id.shard_number.0,
|
||||
shard_count: self.tenant_shard_id.shard_count.0,
|
||||
shard_count: self.tenant_shard_id.shard_count.literal(),
|
||||
shard_stripe_size: self.shard_identity.stripe_size.0,
|
||||
tenant_conf: tenant_conf.into(),
|
||||
}
|
||||
|
||||
@@ -486,7 +486,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
let heatmap_path_bg = heatmap_path.clone();
|
||||
tokio::task::spawn_blocking(move || {
|
||||
tokio::runtime::Handle::current().block_on(async move {
|
||||
VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, &heatmap_bytes).await
|
||||
VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, heatmap_bytes).await
|
||||
})
|
||||
})
|
||||
.await
|
||||
@@ -523,12 +523,13 @@ impl<'a> TenantDownloader<'a> {
|
||||
tracing::debug!("Downloading heatmap for secondary tenant",);
|
||||
|
||||
let heatmap_path = remote_heatmap_path(tenant_shard_id);
|
||||
let cancel = &self.secondary_state.cancel;
|
||||
|
||||
let heatmap_bytes = backoff::retry(
|
||||
|| async {
|
||||
let download = self
|
||||
.remote_storage
|
||||
.download(&heatmap_path)
|
||||
.download(&heatmap_path, cancel)
|
||||
.await
|
||||
.map_err(UpdateError::from)?;
|
||||
let mut heatmap_bytes = Vec::new();
|
||||
@@ -540,7 +541,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"download heatmap",
|
||||
&self.secondary_state.cancel,
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| UpdateError::Cancelled)
|
||||
|
||||
@@ -21,18 +21,17 @@ use futures::Future;
|
||||
use md5;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use rand::Rng;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use remote_storage::{GenericRemoteStorage, TimeoutOrCancel};
|
||||
|
||||
use super::{
|
||||
heatmap::HeatMapTenant,
|
||||
scheduler::{self, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs},
|
||||
CommandRequest,
|
||||
CommandRequest, UploadCommand,
|
||||
};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{info_span, instrument, Instrument};
|
||||
use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop};
|
||||
|
||||
use super::{heatmap::HeatMapTenant, UploadCommand};
|
||||
|
||||
pub(super) async fn heatmap_uploader_task(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
@@ -417,10 +416,10 @@ async fn upload_tenant_heatmap(
|
||||
|| async {
|
||||
let bytes = futures::stream::once(futures::future::ready(Ok(bytes.clone())));
|
||||
remote_storage
|
||||
.upload_storage_object(bytes, size, &path)
|
||||
.upload_storage_object(bytes, size, &path, cancel)
|
||||
.await
|
||||
},
|
||||
|_| false,
|
||||
TimeoutOrCancel::caused_by_cancel,
|
||||
3,
|
||||
u32::MAX,
|
||||
"Uploading heatmap",
|
||||
|
||||
@@ -461,7 +461,8 @@ impl DeltaLayerWriterInner {
|
||||
file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))
|
||||
.await?;
|
||||
for buf in block_buf.blocks {
|
||||
file.write_all(buf.as_ref()).await?;
|
||||
let (_buf, res) = file.write_all(buf).await;
|
||||
res?;
|
||||
}
|
||||
assert!(self.lsn_range.start < self.lsn_range.end);
|
||||
// Fill in the summary on blk 0
|
||||
@@ -476,17 +477,12 @@ impl DeltaLayerWriterInner {
|
||||
index_root_blk,
|
||||
};
|
||||
|
||||
let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
|
||||
let mut buf = Vec::with_capacity(PAGE_SZ);
|
||||
// TODO: could use smallvec here but it's a pain with Slice<T>
|
||||
Summary::ser_into(&summary, &mut buf)?;
|
||||
if buf.spilled() {
|
||||
// This is bad as we only have one free block for the summary
|
||||
warn!(
|
||||
"Used more than one page size for summary buffer: {}",
|
||||
buf.len()
|
||||
);
|
||||
}
|
||||
file.seek(SeekFrom::Start(0)).await?;
|
||||
file.write_all(&buf).await?;
|
||||
let (_buf, res) = file.write_all(buf).await;
|
||||
res?;
|
||||
|
||||
let metadata = file
|
||||
.metadata()
|
||||
@@ -679,18 +675,12 @@ impl DeltaLayer {
|
||||
|
||||
let new_summary = rewrite(actual_summary);
|
||||
|
||||
let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
|
||||
let mut buf = Vec::with_capacity(PAGE_SZ);
|
||||
// TODO: could use smallvec here, but it's a pain with Slice<T>
|
||||
Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
|
||||
if buf.spilled() {
|
||||
// The code in DeltaLayerWriterInner just warn!()s for this.
|
||||
// It should probably error out as well.
|
||||
return Err(RewriteSummaryError::Other(anyhow::anyhow!(
|
||||
"Used more than one page size for summary buffer: {}",
|
||||
buf.len()
|
||||
)));
|
||||
}
|
||||
file.seek(SeekFrom::Start(0)).await?;
|
||||
file.write_all(&buf).await?;
|
||||
let (_buf, res) = file.write_all(buf).await;
|
||||
res?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -341,18 +341,12 @@ impl ImageLayer {
|
||||
|
||||
let new_summary = rewrite(actual_summary);
|
||||
|
||||
let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
|
||||
let mut buf = Vec::with_capacity(PAGE_SZ);
|
||||
// TODO: could use smallvec here but it's a pain with Slice<T>
|
||||
Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
|
||||
if buf.spilled() {
|
||||
// The code in ImageLayerWriterInner just warn!()s for this.
|
||||
// It should probably error out as well.
|
||||
return Err(RewriteSummaryError::Other(anyhow::anyhow!(
|
||||
"Used more than one page size for summary buffer: {}",
|
||||
buf.len()
|
||||
)));
|
||||
}
|
||||
file.seek(SeekFrom::Start(0)).await?;
|
||||
file.write_all(&buf).await?;
|
||||
let (_buf, res) = file.write_all(buf).await;
|
||||
res?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -555,7 +549,8 @@ impl ImageLayerWriterInner {
|
||||
.await?;
|
||||
let (index_root_blk, block_buf) = self.tree.finish()?;
|
||||
for buf in block_buf.blocks {
|
||||
file.write_all(buf.as_ref()).await?;
|
||||
let (_buf, res) = file.write_all(buf).await;
|
||||
res?;
|
||||
}
|
||||
|
||||
// Fill in the summary on blk 0
|
||||
@@ -570,17 +565,12 @@ impl ImageLayerWriterInner {
|
||||
index_root_blk,
|
||||
};
|
||||
|
||||
let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
|
||||
let mut buf = Vec::with_capacity(PAGE_SZ);
|
||||
// TODO: could use smallvec here but it's a pain with Slice<T>
|
||||
Summary::ser_into(&summary, &mut buf)?;
|
||||
if buf.spilled() {
|
||||
// This is bad as we only have one free block for the summary
|
||||
warn!(
|
||||
"Used more than one page size for summary buffer: {}",
|
||||
buf.len()
|
||||
);
|
||||
}
|
||||
file.seek(SeekFrom::Start(0)).await?;
|
||||
file.write_all(&buf).await?;
|
||||
let (_buf, res) = file.write_all(buf).await;
|
||||
res?;
|
||||
|
||||
let metadata = file
|
||||
.metadata()
|
||||
|
||||
@@ -14,6 +14,7 @@ use enumset::EnumSet;
|
||||
use fail::fail_point;
|
||||
use futures::stream::StreamExt;
|
||||
use itertools::Itertools;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::{
|
||||
keyspace::{key_range_size, KeySpaceAccum},
|
||||
models::{
|
||||
@@ -34,17 +35,22 @@ use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::sync::gate::Gate;
|
||||
|
||||
use std::collections::{BTreeMap, BinaryHeap, HashMap, HashSet};
|
||||
use std::ops::{Deref, Range};
|
||||
use std::pin::pin;
|
||||
use std::sync::atomic::Ordering as AtomicOrdering;
|
||||
use std::sync::{Arc, Mutex, RwLock, Weak};
|
||||
use std::time::{Duration, Instant, SystemTime};
|
||||
use std::{
|
||||
array,
|
||||
collections::{BTreeMap, BinaryHeap, HashMap, HashSet},
|
||||
sync::atomic::AtomicU64,
|
||||
};
|
||||
use std::{
|
||||
cmp::{max, min, Ordering},
|
||||
ops::ControlFlow,
|
||||
};
|
||||
|
||||
use crate::pgdatadir_mapping::DirectoryKind;
|
||||
use crate::tenant::timeline::logical_size::CurrentLogicalSize;
|
||||
use crate::tenant::{
|
||||
layer_map::{LayerMap, SearchResult},
|
||||
@@ -258,6 +264,8 @@ pub struct Timeline {
|
||||
// in `crate::page_service` writes these metrics.
|
||||
pub(crate) query_metrics: crate::metrics::SmgrQueryTimePerTimeline,
|
||||
|
||||
directory_metrics: [AtomicU64; DirectoryKind::KINDS_NUM],
|
||||
|
||||
/// Ensures layers aren't frozen by checkpointer between
|
||||
/// [`Timeline::get_layer_for_write`] and layer reads.
|
||||
/// Locked automatically by [`TimelineWriter`] and checkpointer.
|
||||
@@ -790,6 +798,10 @@ impl Timeline {
|
||||
self.metrics.resident_physical_size_get()
|
||||
}
|
||||
|
||||
pub(crate) fn get_directory_metrics(&self) -> [u64; DirectoryKind::KINDS_NUM] {
|
||||
array::from_fn(|idx| self.directory_metrics[idx].load(AtomicOrdering::Relaxed))
|
||||
}
|
||||
|
||||
///
|
||||
/// Wait until WAL has been received and processed up to this LSN.
|
||||
///
|
||||
@@ -1496,6 +1508,8 @@ impl Timeline {
|
||||
&timeline_id,
|
||||
),
|
||||
|
||||
directory_metrics: array::from_fn(|_| AtomicU64::new(0)),
|
||||
|
||||
flush_loop_state: Mutex::new(FlushLoopState::NotStarted),
|
||||
|
||||
layer_flush_start_tx,
|
||||
@@ -2264,6 +2278,29 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn update_directory_entries_count(&self, kind: DirectoryKind, count: u64) {
|
||||
self.directory_metrics[kind.offset()].store(count, AtomicOrdering::Relaxed);
|
||||
let aux_metric =
|
||||
self.directory_metrics[DirectoryKind::AuxFiles.offset()].load(AtomicOrdering::Relaxed);
|
||||
|
||||
let sum_of_entries = self
|
||||
.directory_metrics
|
||||
.iter()
|
||||
.map(|v| v.load(AtomicOrdering::Relaxed))
|
||||
.sum();
|
||||
// Set a high general threshold and a lower threshold for the auxiliary files,
|
||||
// as we can have large numbers of relations in the db directory.
|
||||
const SUM_THRESHOLD: u64 = 5000;
|
||||
const AUX_THRESHOLD: u64 = 1000;
|
||||
if sum_of_entries >= SUM_THRESHOLD || aux_metric >= AUX_THRESHOLD {
|
||||
self.metrics
|
||||
.directory_entries_count_gauge
|
||||
.set(sum_of_entries);
|
||||
} else if let Some(metric) = Lazy::get(&self.metrics.directory_entries_count_gauge) {
|
||||
metric.set(sum_of_entries);
|
||||
}
|
||||
}
|
||||
|
||||
async fn find_layer(&self, layer_file_name: &str) -> Option<Layer> {
|
||||
let guard = self.layers.read().await;
|
||||
for historic_layer in guard.layer_map().iter_historic_layers() {
|
||||
@@ -4812,7 +4849,7 @@ mod tests {
|
||||
TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap();
|
||||
|
||||
let ctx = any_context();
|
||||
let tenant = harness.try_load(&ctx).await.unwrap();
|
||||
let tenant = harness.do_try_load(&ctx).await.unwrap();
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
|
||||
.await
|
||||
|
||||
@@ -196,13 +196,13 @@ impl Timeline {
|
||||
ControlFlow::Continue(()) => (),
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[derive(Debug, Default)]
|
||||
struct EvictionStats {
|
||||
candidates: usize,
|
||||
evicted: usize,
|
||||
errors: usize,
|
||||
not_evictable: usize,
|
||||
#[allow(dead_code)]
|
||||
skipped_for_shutdown: usize,
|
||||
}
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ use once_cell::sync::OnceCell;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::fs::{self, File};
|
||||
use std::io::{Error, ErrorKind, Seek, SeekFrom};
|
||||
use tokio_epoll_uring::IoBufMut;
|
||||
use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice};
|
||||
|
||||
use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd};
|
||||
use std::os::unix::fs::FileExt;
|
||||
@@ -410,10 +410,10 @@ impl VirtualFile {
|
||||
/// step, the tmp path is renamed to the final path. As renames are
|
||||
/// atomic, a crash during the write operation will never leave behind a
|
||||
/// partially written file.
|
||||
pub async fn crashsafe_overwrite(
|
||||
pub async fn crashsafe_overwrite<B: BoundedBuf>(
|
||||
final_path: &Utf8Path,
|
||||
tmp_path: &Utf8Path,
|
||||
content: &[u8],
|
||||
content: B,
|
||||
) -> std::io::Result<()> {
|
||||
let Some(final_path_parent) = final_path.parent() else {
|
||||
return Err(std::io::Error::from_raw_os_error(
|
||||
@@ -430,7 +430,8 @@ impl VirtualFile {
|
||||
.create_new(true),
|
||||
)
|
||||
.await?;
|
||||
file.write_all(content).await?;
|
||||
let (_content, res) = file.write_all(content).await;
|
||||
res?;
|
||||
file.sync_all().await?;
|
||||
drop(file); // before the rename, that's important!
|
||||
// renames are atomic
|
||||
@@ -581,43 +582,69 @@ impl VirtualFile {
|
||||
}
|
||||
|
||||
// Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235
|
||||
pub async fn write_all_at(&self, mut buf: &[u8], mut offset: u64) -> Result<(), Error> {
|
||||
pub async fn write_all_at<B: BoundedBuf>(
|
||||
&self,
|
||||
buf: B,
|
||||
mut offset: u64,
|
||||
) -> (B::Buf, Result<(), Error>) {
|
||||
let buf_len = buf.bytes_init();
|
||||
if buf_len == 0 {
|
||||
return (Slice::into_inner(buf.slice_full()), Ok(()));
|
||||
}
|
||||
let mut buf = buf.slice(0..buf_len);
|
||||
while !buf.is_empty() {
|
||||
match self.write_at(buf, offset).await {
|
||||
// TODO: push `buf` further down
|
||||
match self.write_at(&buf, offset).await {
|
||||
Ok(0) => {
|
||||
return Err(Error::new(
|
||||
std::io::ErrorKind::WriteZero,
|
||||
"failed to write whole buffer",
|
||||
));
|
||||
return (
|
||||
Slice::into_inner(buf),
|
||||
Err(Error::new(
|
||||
std::io::ErrorKind::WriteZero,
|
||||
"failed to write whole buffer",
|
||||
)),
|
||||
);
|
||||
}
|
||||
Ok(n) => {
|
||||
buf = &buf[n..];
|
||||
buf = buf.slice(n..);
|
||||
offset += n as u64;
|
||||
}
|
||||
Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
|
||||
Err(e) => return Err(e),
|
||||
Err(e) => return (Slice::into_inner(buf), Err(e)),
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
(Slice::into_inner(buf), Ok(()))
|
||||
}
|
||||
|
||||
pub async fn write_all(&mut self, mut buf: &[u8]) -> Result<(), Error> {
|
||||
/// Writes `buf.slice(0..buf.bytes_init())`.
|
||||
/// Returns the IoBuf that is underlying the BoundedBuf `buf`.
|
||||
/// I.e., the returned value's `bytes_init()` method returns something different than the `bytes_init()` that was passed in.
|
||||
/// It's quite brittle and easy to mis-use, so, we return the size in the Ok() variant.
|
||||
pub async fn write_all<B: BoundedBuf>(&mut self, buf: B) -> (B::Buf, Result<usize, Error>) {
|
||||
let nbytes = buf.bytes_init();
|
||||
if nbytes == 0 {
|
||||
return (Slice::into_inner(buf.slice_full()), Ok(0));
|
||||
}
|
||||
let mut buf = buf.slice(0..nbytes);
|
||||
while !buf.is_empty() {
|
||||
match self.write(buf).await {
|
||||
// TODO: push `Slice` further down
|
||||
match self.write(&buf).await {
|
||||
Ok(0) => {
|
||||
return Err(Error::new(
|
||||
std::io::ErrorKind::WriteZero,
|
||||
"failed to write whole buffer",
|
||||
));
|
||||
return (
|
||||
Slice::into_inner(buf),
|
||||
Err(Error::new(
|
||||
std::io::ErrorKind::WriteZero,
|
||||
"failed to write whole buffer",
|
||||
)),
|
||||
);
|
||||
}
|
||||
Ok(n) => {
|
||||
buf = &buf[n..];
|
||||
buf = buf.slice(n..);
|
||||
}
|
||||
Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
|
||||
Err(e) => return Err(e),
|
||||
Err(e) => return (Slice::into_inner(buf), Err(e)),
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
(Slice::into_inner(buf), Ok(nbytes))
|
||||
}
|
||||
|
||||
async fn write(&mut self, buf: &[u8]) -> Result<usize, std::io::Error> {
|
||||
@@ -676,7 +703,6 @@ where
|
||||
F: FnMut(tokio_epoll_uring::Slice<B>, u64) -> Fut,
|
||||
Fut: std::future::Future<Output = (tokio_epoll_uring::Slice<B>, std::io::Result<usize>)>,
|
||||
{
|
||||
use tokio_epoll_uring::BoundedBuf;
|
||||
let mut buf: tokio_epoll_uring::Slice<B> = buf.slice_full(); // includes all the uninitialized memory
|
||||
while buf.bytes_total() != 0 {
|
||||
let res;
|
||||
@@ -1051,10 +1077,19 @@ mod tests {
|
||||
MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf),
|
||||
}
|
||||
}
|
||||
async fn write_all_at(&self, buf: &[u8], offset: u64) -> Result<(), Error> {
|
||||
async fn write_all_at<B: BoundedBuf>(&self, buf: B, offset: u64) -> Result<(), Error> {
|
||||
match self {
|
||||
MaybeVirtualFile::VirtualFile(file) => file.write_all_at(buf, offset).await,
|
||||
MaybeVirtualFile::File(file) => file.write_all_at(buf, offset),
|
||||
MaybeVirtualFile::VirtualFile(file) => {
|
||||
let (_buf, res) = file.write_all_at(buf, offset).await;
|
||||
res
|
||||
}
|
||||
MaybeVirtualFile::File(file) => {
|
||||
let buf_len = buf.bytes_init();
|
||||
if buf_len == 0 {
|
||||
return Ok(());
|
||||
}
|
||||
file.write_all_at(&buf.slice(0..buf_len), offset)
|
||||
}
|
||||
}
|
||||
}
|
||||
async fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
|
||||
@@ -1063,10 +1098,19 @@ mod tests {
|
||||
MaybeVirtualFile::File(file) => file.seek(pos),
|
||||
}
|
||||
}
|
||||
async fn write_all(&mut self, buf: &[u8]) -> Result<(), Error> {
|
||||
async fn write_all<B: BoundedBuf>(&mut self, buf: B) -> Result<(), Error> {
|
||||
match self {
|
||||
MaybeVirtualFile::VirtualFile(file) => file.write_all(buf).await,
|
||||
MaybeVirtualFile::File(file) => file.write_all(buf),
|
||||
MaybeVirtualFile::VirtualFile(file) => {
|
||||
let (_buf, res) = file.write_all(buf).await;
|
||||
res.map(|_| ())
|
||||
}
|
||||
MaybeVirtualFile::File(file) => {
|
||||
let buf_len = buf.bytes_init();
|
||||
if buf_len == 0 {
|
||||
return Ok(());
|
||||
}
|
||||
file.write_all(&buf.slice(0..buf_len))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1141,7 +1185,7 @@ mod tests {
|
||||
.to_owned(),
|
||||
)
|
||||
.await?;
|
||||
file_a.write_all(b"foobar").await?;
|
||||
file_a.write_all(b"foobar".to_vec()).await?;
|
||||
|
||||
// cannot read from a file opened in write-only mode
|
||||
let _ = file_a.read_string().await.unwrap_err();
|
||||
@@ -1150,7 +1194,7 @@ mod tests {
|
||||
let mut file_a = openfunc(path_a, OpenOptions::new().read(true).to_owned()).await?;
|
||||
|
||||
// cannot write to a file opened in read-only mode
|
||||
let _ = file_a.write_all(b"bar").await.unwrap_err();
|
||||
let _ = file_a.write_all(b"bar".to_vec()).await.unwrap_err();
|
||||
|
||||
// Try simple read
|
||||
assert_eq!("foobar", file_a.read_string().await?);
|
||||
@@ -1192,8 +1236,8 @@ mod tests {
|
||||
.to_owned(),
|
||||
)
|
||||
.await?;
|
||||
file_b.write_all_at(b"BAR", 3).await?;
|
||||
file_b.write_all_at(b"FOO", 0).await?;
|
||||
file_b.write_all_at(b"BAR".to_vec(), 3).await?;
|
||||
file_b.write_all_at(b"FOO".to_vec(), 0).await?;
|
||||
|
||||
assert_eq!(file_b.read_string_at(2, 3).await?, "OBA");
|
||||
|
||||
@@ -1293,7 +1337,7 @@ mod tests {
|
||||
let path = testdir.join("myfile");
|
||||
let tmp_path = testdir.join("myfile.tmp");
|
||||
|
||||
VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo")
|
||||
VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo".to_vec())
|
||||
.await
|
||||
.unwrap();
|
||||
let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
|
||||
@@ -1302,7 +1346,7 @@ mod tests {
|
||||
assert!(!tmp_path.exists());
|
||||
drop(file);
|
||||
|
||||
VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"bar")
|
||||
VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"bar".to_vec())
|
||||
.await
|
||||
.unwrap();
|
||||
let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
|
||||
@@ -1324,7 +1368,7 @@ mod tests {
|
||||
std::fs::write(&tmp_path, "some preexisting junk that should be removed").unwrap();
|
||||
assert!(tmp_path.exists());
|
||||
|
||||
VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo")
|
||||
VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo".to_vec())
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
|
||||
@@ -1695,22 +1695,22 @@ mod tests {
|
||||
let mut m = tline.begin_modification(Lsn(0x20));
|
||||
walingest.put_rel_creation(&mut m, TESTREL_A, &ctx).await?;
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 2"), &ctx)
|
||||
.await?;
|
||||
m.commit(&ctx).await?;
|
||||
let mut m = tline.begin_modification(Lsn(0x30));
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"), &ctx)
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 3"), &ctx)
|
||||
.await?;
|
||||
m.commit(&ctx).await?;
|
||||
let mut m = tline.begin_modification(Lsn(0x40));
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"), &ctx)
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1 at 4"), &ctx)
|
||||
.await?;
|
||||
m.commit(&ctx).await?;
|
||||
let mut m = tline.begin_modification(Lsn(0x50));
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"), &ctx)
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 2, test_img("foo blk 2 at 5"), &ctx)
|
||||
.await?;
|
||||
m.commit(&ctx).await?;
|
||||
|
||||
@@ -1751,46 +1751,46 @@ mod tests {
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), false, &ctx)
|
||||
.await?,
|
||||
TEST_IMG("foo blk 0 at 2")
|
||||
test_img("foo blk 0 at 2")
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), false, &ctx)
|
||||
.await?,
|
||||
TEST_IMG("foo blk 0 at 3")
|
||||
test_img("foo blk 0 at 3")
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), false, &ctx)
|
||||
.await?,
|
||||
TEST_IMG("foo blk 0 at 3")
|
||||
test_img("foo blk 0 at 3")
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), false, &ctx)
|
||||
.await?,
|
||||
TEST_IMG("foo blk 1 at 4")
|
||||
test_img("foo blk 1 at 4")
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), false, &ctx)
|
||||
.await?,
|
||||
TEST_IMG("foo blk 0 at 3")
|
||||
test_img("foo blk 0 at 3")
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), false, &ctx)
|
||||
.await?,
|
||||
TEST_IMG("foo blk 1 at 4")
|
||||
test_img("foo blk 1 at 4")
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx)
|
||||
.await?,
|
||||
TEST_IMG("foo blk 2 at 5")
|
||||
test_img("foo blk 2 at 5")
|
||||
);
|
||||
|
||||
// Truncate last block
|
||||
@@ -1812,13 +1812,13 @@ mod tests {
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), false, &ctx)
|
||||
.await?,
|
||||
TEST_IMG("foo blk 0 at 3")
|
||||
test_img("foo blk 0 at 3")
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), false, &ctx)
|
||||
.await?,
|
||||
TEST_IMG("foo blk 1 at 4")
|
||||
test_img("foo blk 1 at 4")
|
||||
);
|
||||
|
||||
// should still see the truncated block with older LSN
|
||||
@@ -1832,7 +1832,7 @@ mod tests {
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx)
|
||||
.await?,
|
||||
TEST_IMG("foo blk 2 at 5")
|
||||
test_img("foo blk 2 at 5")
|
||||
);
|
||||
|
||||
// Truncate to zero length
|
||||
@@ -1851,7 +1851,7 @@ mod tests {
|
||||
// Extend from 0 to 2 blocks, leaving a gap
|
||||
let mut m = tline.begin_modification(Lsn(0x70));
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"), &ctx)
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1"), &ctx)
|
||||
.await?;
|
||||
m.commit(&ctx).await?;
|
||||
assert_eq!(
|
||||
@@ -1870,13 +1870,13 @@ mod tests {
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), false, &ctx)
|
||||
.await?,
|
||||
TEST_IMG("foo blk 1")
|
||||
test_img("foo blk 1")
|
||||
);
|
||||
|
||||
// Extend a lot more, leaving a big gap that spans across segments
|
||||
let mut m = tline.begin_modification(Lsn(0x80));
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"), &ctx)
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 1500, test_img("foo blk 1500"), &ctx)
|
||||
.await?;
|
||||
m.commit(&ctx).await?;
|
||||
assert_eq!(
|
||||
@@ -1897,7 +1897,7 @@ mod tests {
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), false, &ctx)
|
||||
.await?,
|
||||
TEST_IMG("foo blk 1500")
|
||||
test_img("foo blk 1500")
|
||||
);
|
||||
|
||||
Ok(())
|
||||
@@ -1915,7 +1915,7 @@ mod tests {
|
||||
|
||||
let mut m = tline.begin_modification(Lsn(0x20));
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 2"), &ctx)
|
||||
.await?;
|
||||
m.commit(&ctx).await?;
|
||||
|
||||
@@ -1952,7 +1952,7 @@ mod tests {
|
||||
// Re-create it
|
||||
let mut m = tline.begin_modification(Lsn(0x40));
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"), &ctx)
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 4"), &ctx)
|
||||
.await?;
|
||||
m.commit(&ctx).await?;
|
||||
|
||||
@@ -1990,7 +1990,7 @@ mod tests {
|
||||
for blkno in 0..relsize {
|
||||
let data = format!("foo blk {} at {}", blkno, Lsn(0x20));
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
|
||||
.put_rel_page_image(&mut m, TESTREL_A, blkno, test_img(&data), &ctx)
|
||||
.await?;
|
||||
}
|
||||
m.commit(&ctx).await?;
|
||||
@@ -2028,7 +2028,7 @@ mod tests {
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), false, &ctx)
|
||||
.await?,
|
||||
TEST_IMG(&data)
|
||||
test_img(&data)
|
||||
);
|
||||
}
|
||||
|
||||
@@ -2055,7 +2055,7 @@ mod tests {
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), false, &ctx)
|
||||
.await?,
|
||||
TEST_IMG(&data)
|
||||
test_img(&data)
|
||||
);
|
||||
}
|
||||
|
||||
@@ -2073,7 +2073,7 @@ mod tests {
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), false, &ctx)
|
||||
.await?,
|
||||
TEST_IMG(&data)
|
||||
test_img(&data)
|
||||
);
|
||||
}
|
||||
|
||||
@@ -2084,7 +2084,7 @@ mod tests {
|
||||
for blkno in 0..relsize {
|
||||
let data = format!("foo blk {} at {}", blkno, lsn);
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
|
||||
.put_rel_page_image(&mut m, TESTREL_A, blkno, test_img(&data), &ctx)
|
||||
.await?;
|
||||
}
|
||||
m.commit(&ctx).await?;
|
||||
@@ -2109,7 +2109,7 @@ mod tests {
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), false, &ctx)
|
||||
.await?,
|
||||
TEST_IMG(&data)
|
||||
test_img(&data)
|
||||
);
|
||||
}
|
||||
|
||||
@@ -2130,7 +2130,7 @@ mod tests {
|
||||
for blknum in 0..RELSEG_SIZE + 1 {
|
||||
lsn += 0x10;
|
||||
let mut m = tline.begin_modification(Lsn(lsn));
|
||||
let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
|
||||
let img = test_img(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img, &ctx)
|
||||
.await?;
|
||||
|
||||
@@ -44,6 +44,11 @@ pub enum NeonWalRecord {
|
||||
moff: MultiXactOffset,
|
||||
members: Vec<MultiXactMember>,
|
||||
},
|
||||
/// Update the map of AUX files, either writing or dropping an entry
|
||||
AuxFile {
|
||||
file_path: String,
|
||||
content: Option<Bytes>,
|
||||
},
|
||||
}
|
||||
|
||||
impl NeonWalRecord {
|
||||
|
||||
@@ -22,7 +22,7 @@
|
||||
mod process;
|
||||
|
||||
/// Code to apply [`NeonWalRecord`]s.
|
||||
mod apply_neon;
|
||||
pub(crate) mod apply_neon;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::metrics::{
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
use crate::pgdatadir_mapping::AuxFilesDirectory;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::Context;
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use bytes::BytesMut;
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use pageserver_api::key::{key_to_rel_block, key_to_slru_block, Key};
|
||||
use pageserver_api::reltag::SlruKind;
|
||||
use postgres_ffi::pg_constants;
|
||||
@@ -12,6 +13,7 @@ use postgres_ffi::v14::nonrelfile_utils::{
|
||||
};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use tracing::*;
|
||||
use utils::bin_ser::BeSer;
|
||||
|
||||
/// Can this request be served by neon redo functions
|
||||
/// or we need to pass it to wal-redo postgres process?
|
||||
@@ -230,6 +232,72 @@ pub(crate) fn apply_in_neon(
|
||||
LittleEndian::write_u32(&mut page[memberoff..memberoff + 4], member.xid);
|
||||
}
|
||||
}
|
||||
NeonWalRecord::AuxFile { file_path, content } => {
|
||||
let mut dir = AuxFilesDirectory::des(page)?;
|
||||
dir.upsert(file_path.clone(), content.clone());
|
||||
|
||||
page.clear();
|
||||
let mut writer = page.writer();
|
||||
dir.ser_into(&mut writer)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::key::AUX_FILES_KEY;
|
||||
|
||||
use super::*;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use crate::{pgdatadir_mapping::AuxFilesDirectory, walrecord::NeonWalRecord};
|
||||
|
||||
/// Test [`apply_in_neon`]'s handling of NeonWalRecord::AuxFile
|
||||
#[test]
|
||||
fn apply_aux_file_deltas() -> anyhow::Result<()> {
|
||||
let base_dir = AuxFilesDirectory {
|
||||
files: HashMap::from([
|
||||
("two".to_string(), Bytes::from_static(b"content0")),
|
||||
("three".to_string(), Bytes::from_static(b"contentX")),
|
||||
]),
|
||||
};
|
||||
let base_image = AuxFilesDirectory::ser(&base_dir)?;
|
||||
|
||||
let deltas = vec![
|
||||
// Insert
|
||||
NeonWalRecord::AuxFile {
|
||||
file_path: "one".to_string(),
|
||||
content: Some(Bytes::from_static(b"content1")),
|
||||
},
|
||||
// Update
|
||||
NeonWalRecord::AuxFile {
|
||||
file_path: "two".to_string(),
|
||||
content: Some(Bytes::from_static(b"content99")),
|
||||
},
|
||||
// Delete
|
||||
NeonWalRecord::AuxFile {
|
||||
file_path: "three".to_string(),
|
||||
content: None,
|
||||
},
|
||||
];
|
||||
|
||||
let file_path = AUX_FILES_KEY;
|
||||
let mut page = BytesMut::from_iter(base_image);
|
||||
|
||||
for record in deltas {
|
||||
apply_in_neon(&record, file_path, &mut page)?;
|
||||
}
|
||||
|
||||
let reconstructed = AuxFilesDirectory::des(&page)?;
|
||||
let expect = HashMap::from([
|
||||
("one".to_string(), Bytes::from_static(b"content1")),
|
||||
("two".to_string(), Bytes::from_static(b"content99")),
|
||||
]);
|
||||
|
||||
assert_eq!(reconstructed.files, expect);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3079,14 +3079,6 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
|
||||
XLogRecGetBlockTag(record, block_id, &rinfo, &forknum, &blkno);
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Out of an abundance of caution, we always run redo on shared catalogs,
|
||||
* regardless of whether the block is stored in shared buffers. See also
|
||||
* this function's top comment.
|
||||
*/
|
||||
if (!OidIsValid(NInfoGetDbOid(rinfo)))
|
||||
return false;
|
||||
|
||||
CopyNRelFileInfoToBufTag(tag, rinfo);
|
||||
tag.forkNum = forknum;
|
||||
tag.blockNum = blkno;
|
||||
@@ -3100,17 +3092,28 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
|
||||
*/
|
||||
LWLockAcquire(partitionLock, LW_SHARED);
|
||||
|
||||
/* Try to find the relevant buffer */
|
||||
buffer = BufTableLookup(&tag, hash);
|
||||
|
||||
no_redo_needed = buffer < 0;
|
||||
/*
|
||||
* Out of an abundance of caution, we always run redo on shared catalogs,
|
||||
* regardless of whether the block is stored in shared buffers. See also
|
||||
* this function's top comment.
|
||||
*/
|
||||
if (!OidIsValid(NInfoGetDbOid(rinfo)))
|
||||
{
|
||||
no_redo_needed = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Try to find the relevant buffer */
|
||||
buffer = BufTableLookup(&tag, hash);
|
||||
|
||||
no_redo_needed = buffer < 0;
|
||||
}
|
||||
/* In both cases st lwlsn past this WAL record */
|
||||
SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno);
|
||||
|
||||
/*
|
||||
* we don't have the buffer in memory, update lwLsn past this record, also
|
||||
* evict page fro file cache
|
||||
* evict page from file cache
|
||||
*/
|
||||
if (no_redo_needed)
|
||||
lfc_evict(rinfo, forknum, blkno);
|
||||
|
||||
@@ -7,6 +7,24 @@ AS 'MODULE_PATHNAME', 'test_consume_xids'
|
||||
LANGUAGE C STRICT
|
||||
PARALLEL UNSAFE;
|
||||
|
||||
CREATE FUNCTION test_consume_cpu(seconds int)
|
||||
RETURNS VOID
|
||||
AS 'MODULE_PATHNAME', 'test_consume_cpu'
|
||||
LANGUAGE C STRICT
|
||||
PARALLEL UNSAFE;
|
||||
|
||||
CREATE FUNCTION test_consume_memory(megabytes int)
|
||||
RETURNS VOID
|
||||
AS 'MODULE_PATHNAME', 'test_consume_memory'
|
||||
LANGUAGE C STRICT
|
||||
PARALLEL UNSAFE;
|
||||
|
||||
CREATE FUNCTION test_release_memory(megabytes int DEFAULT NULL)
|
||||
RETURNS VOID
|
||||
AS 'MODULE_PATHNAME', 'test_release_memory'
|
||||
LANGUAGE C
|
||||
PARALLEL UNSAFE;
|
||||
|
||||
CREATE FUNCTION clear_buffer_cache()
|
||||
RETURNS VOID
|
||||
AS 'MODULE_PATHNAME', 'clear_buffer_cache'
|
||||
|
||||
@@ -3,3 +3,4 @@ comment = 'helpers for neon testing and debugging'
|
||||
default_version = '1.0'
|
||||
module_pathname = '$libdir/neon_test_utils'
|
||||
relocatable = true
|
||||
trusted = true
|
||||
|
||||
@@ -21,10 +21,12 @@
|
||||
#include "miscadmin.h"
|
||||
#include "storage/buf_internals.h"
|
||||
#include "storage/bufmgr.h"
|
||||
#include "storage/fd.h"
|
||||
#include "utils/builtins.h"
|
||||
#include "utils/pg_lsn.h"
|
||||
#include "utils/rel.h"
|
||||
#include "utils/varlena.h"
|
||||
#include "utils/wait_event.h"
|
||||
#include "../neon/pagestore_client.h"
|
||||
|
||||
PG_MODULE_MAGIC;
|
||||
@@ -32,6 +34,9 @@ PG_MODULE_MAGIC;
|
||||
extern void _PG_init(void);
|
||||
|
||||
PG_FUNCTION_INFO_V1(test_consume_xids);
|
||||
PG_FUNCTION_INFO_V1(test_consume_cpu);
|
||||
PG_FUNCTION_INFO_V1(test_consume_memory);
|
||||
PG_FUNCTION_INFO_V1(test_release_memory);
|
||||
PG_FUNCTION_INFO_V1(clear_buffer_cache);
|
||||
PG_FUNCTION_INFO_V1(get_raw_page_at_lsn);
|
||||
PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex);
|
||||
@@ -97,6 +102,119 @@ test_consume_xids(PG_FUNCTION_ARGS)
|
||||
PG_RETURN_VOID();
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* test_consume_cpu(seconds int). Keeps one CPU busy for the given number of seconds.
|
||||
*/
|
||||
Datum
|
||||
test_consume_cpu(PG_FUNCTION_ARGS)
|
||||
{
|
||||
int32 seconds = PG_GETARG_INT32(0);
|
||||
TimestampTz start;
|
||||
uint64 total_iterations = 0;
|
||||
|
||||
start = GetCurrentTimestamp();
|
||||
|
||||
for (;;)
|
||||
{
|
||||
TimestampTz elapsed;
|
||||
|
||||
elapsed = GetCurrentTimestamp() - start;
|
||||
if (elapsed > (TimestampTz) seconds * USECS_PER_SEC)
|
||||
break;
|
||||
|
||||
/* keep spinning */
|
||||
for (int i = 0; i < 1000000; i++)
|
||||
total_iterations++;
|
||||
elog(DEBUG2, "test_consume_cpu(): %lu iterations in total", total_iterations);
|
||||
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
}
|
||||
|
||||
PG_RETURN_VOID();
|
||||
}
|
||||
|
||||
static MemoryContext consume_cxt = NULL;
|
||||
static slist_head consumed_memory_chunks;
|
||||
static int64 num_memory_chunks;
|
||||
|
||||
/*
|
||||
* test_consume_memory(megabytes int).
|
||||
*
|
||||
* Consume given amount of memory. The allocation is made in TopMemoryContext,
|
||||
* so it outlives the function, until you call test_release_memory to
|
||||
* explicitly release it, or close the session.
|
||||
*/
|
||||
Datum
|
||||
test_consume_memory(PG_FUNCTION_ARGS)
|
||||
{
|
||||
int32 megabytes = PG_GETARG_INT32(0);
|
||||
|
||||
/*
|
||||
* Consume the memory in a new memory context, so that it's convenient to
|
||||
* release and to display it separately in a possible memory context dump.
|
||||
*/
|
||||
if (consume_cxt == NULL)
|
||||
consume_cxt = AllocSetContextCreate(TopMemoryContext,
|
||||
"test_consume_memory",
|
||||
ALLOCSET_DEFAULT_SIZES);
|
||||
|
||||
for (int32 i = 0; i < megabytes; i++)
|
||||
{
|
||||
char *p;
|
||||
|
||||
p = MemoryContextAllocZero(consume_cxt, 1024 * 1024);
|
||||
|
||||
/* touch the memory, so that it's really allocated by the kernel */
|
||||
for (int j = 0; j < 1024 * 1024; j += 1024)
|
||||
p[j] = j % 0xFF;
|
||||
|
||||
slist_push_head(&consumed_memory_chunks, (slist_node *) p);
|
||||
num_memory_chunks++;
|
||||
}
|
||||
|
||||
PG_RETURN_VOID();
|
||||
}
|
||||
|
||||
/*
|
||||
* test_release_memory(megabytes int). NULL releases all
|
||||
*/
|
||||
Datum
|
||||
test_release_memory(PG_FUNCTION_ARGS)
|
||||
{
|
||||
TimestampTz start;
|
||||
|
||||
if (PG_ARGISNULL(0))
|
||||
{
|
||||
if (consume_cxt)
|
||||
{
|
||||
MemoryContextDelete(consume_cxt);
|
||||
consume_cxt = NULL;
|
||||
num_memory_chunks = 0;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
int32 chunks_to_release = PG_GETARG_INT32(0);
|
||||
|
||||
if (chunks_to_release > num_memory_chunks)
|
||||
{
|
||||
elog(WARNING, "only %lu MB is consumed, releasing it all", num_memory_chunks);
|
||||
chunks_to_release = num_memory_chunks;
|
||||
}
|
||||
|
||||
for (int32 i = 0; i < chunks_to_release; i++)
|
||||
{
|
||||
slist_node *chunk = slist_pop_head_node(&consumed_memory_chunks);
|
||||
|
||||
pfree(chunk);
|
||||
num_memory_chunks--;
|
||||
}
|
||||
}
|
||||
|
||||
PG_RETURN_VOID();
|
||||
}
|
||||
|
||||
/*
|
||||
* Flush the buffer cache, evicting all pages that are not currently pinned.
|
||||
*/
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
use futures::future::Either;
|
||||
use proxy::auth;
|
||||
use proxy::auth::backend::MaybeOwned;
|
||||
use proxy::cancellation::CancelMap;
|
||||
use proxy::cancellation::CancellationHandler;
|
||||
use proxy::config::AuthenticationConfig;
|
||||
use proxy::config::CacheOptions;
|
||||
use proxy::config::HttpConfig;
|
||||
@@ -12,6 +14,7 @@ use proxy::rate_limiter::EndpointRateLimiter;
|
||||
use proxy::rate_limiter::RateBucketInfo;
|
||||
use proxy::rate_limiter::RateLimiterConfig;
|
||||
use proxy::redis::notifications;
|
||||
use proxy::redis::publisher::RedisPublisherClient;
|
||||
use proxy::serverless::GlobalConnPoolOptions;
|
||||
use proxy::usage_metrics;
|
||||
|
||||
@@ -22,6 +25,7 @@ use std::net::SocketAddr;
|
||||
use std::pin::pin;
|
||||
use std::sync::Arc;
|
||||
use tokio::net::TcpListener;
|
||||
use tokio::sync::Mutex;
|
||||
use tokio::task::JoinSet;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::info;
|
||||
@@ -129,6 +133,9 @@ struct ProxyCliArgs {
|
||||
/// Can be given multiple times for different bucket sizes.
|
||||
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
|
||||
endpoint_rps_limit: Vec<RateBucketInfo>,
|
||||
/// Redis rate limiter max number of requests per second.
|
||||
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
|
||||
redis_rps_limit: Vec<RateBucketInfo>,
|
||||
/// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`.
|
||||
#[clap(long, default_value_t = 100)]
|
||||
initial_limit: usize,
|
||||
@@ -225,6 +232,19 @@ async fn main() -> anyhow::Result<()> {
|
||||
let cancellation_token = CancellationToken::new();
|
||||
|
||||
let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit));
|
||||
let cancel_map = CancelMap::default();
|
||||
let redis_publisher = match &args.redis_notifications {
|
||||
Some(url) => Some(Arc::new(Mutex::new(RedisPublisherClient::new(
|
||||
url,
|
||||
args.region.clone(),
|
||||
&config.redis_rps_limit,
|
||||
)?))),
|
||||
None => None,
|
||||
};
|
||||
let cancellation_handler = Arc::new(CancellationHandler::new(
|
||||
cancel_map.clone(),
|
||||
redis_publisher,
|
||||
));
|
||||
|
||||
// client facing tasks. these will exit on error or on cancellation
|
||||
// cancellation returns Ok(())
|
||||
@@ -234,6 +254,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
proxy_listener,
|
||||
cancellation_token.clone(),
|
||||
endpoint_rate_limiter.clone(),
|
||||
cancellation_handler.clone(),
|
||||
));
|
||||
|
||||
// TODO: rename the argument to something like serverless.
|
||||
@@ -248,6 +269,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
serverless_listener,
|
||||
cancellation_token.clone(),
|
||||
endpoint_rate_limiter.clone(),
|
||||
cancellation_handler.clone(),
|
||||
));
|
||||
}
|
||||
|
||||
@@ -271,7 +293,12 @@ async fn main() -> anyhow::Result<()> {
|
||||
let cache = api.caches.project_info.clone();
|
||||
if let Some(url) = args.redis_notifications {
|
||||
info!("Starting redis notifications listener ({url})");
|
||||
maintenance_tasks.spawn(notifications::task_main(url.to_owned(), cache.clone()));
|
||||
maintenance_tasks.spawn(notifications::task_main(
|
||||
url.to_owned(),
|
||||
cache.clone(),
|
||||
cancel_map.clone(),
|
||||
args.region.clone(),
|
||||
));
|
||||
}
|
||||
maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
|
||||
}
|
||||
@@ -403,6 +430,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
|
||||
let mut endpoint_rps_limit = args.endpoint_rps_limit.clone();
|
||||
RateBucketInfo::validate(&mut endpoint_rps_limit)?;
|
||||
let mut redis_rps_limit = args.redis_rps_limit.clone();
|
||||
RateBucketInfo::validate(&mut redis_rps_limit)?;
|
||||
|
||||
let config = Box::leak(Box::new(ProxyConfig {
|
||||
tls_config,
|
||||
@@ -414,6 +443,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
require_client_ip: args.require_client_ip,
|
||||
disable_ip_check_for_http: args.disable_ip_check_for_http,
|
||||
endpoint_rps_limit,
|
||||
redis_rps_limit,
|
||||
handshake_timeout: args.handshake_timeout,
|
||||
// TODO: add this argument
|
||||
region: args.region.clone(),
|
||||
|
||||
@@ -1,16 +1,28 @@
|
||||
use async_trait::async_trait;
|
||||
use dashmap::DashMap;
|
||||
use pq_proto::CancelKeyData;
|
||||
use std::{net::SocketAddr, sync::Arc};
|
||||
use thiserror::Error;
|
||||
use tokio::net::TcpStream;
|
||||
use tokio::sync::Mutex;
|
||||
use tokio_postgres::{CancelToken, NoTls};
|
||||
use tracing::info;
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::error::ReportableError;
|
||||
use crate::{
|
||||
error::ReportableError, metrics::NUM_CANCELLATION_REQUESTS,
|
||||
redis::publisher::RedisPublisherClient,
|
||||
};
|
||||
|
||||
pub type CancelMap = Arc<DashMap<CancelKeyData, Option<CancelClosure>>>;
|
||||
|
||||
/// Enables serving `CancelRequest`s.
|
||||
#[derive(Default)]
|
||||
pub struct CancelMap(DashMap<CancelKeyData, Option<CancelClosure>>);
|
||||
///
|
||||
/// If there is a `RedisPublisherClient` available, it will be used to publish the cancellation key to other proxy instances.
|
||||
pub struct CancellationHandler {
|
||||
map: CancelMap,
|
||||
redis_client: Option<Arc<Mutex<RedisPublisherClient>>>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum CancelError {
|
||||
@@ -32,15 +44,43 @@ impl ReportableError for CancelError {
|
||||
}
|
||||
}
|
||||
|
||||
impl CancelMap {
|
||||
impl CancellationHandler {
|
||||
pub fn new(map: CancelMap, redis_client: Option<Arc<Mutex<RedisPublisherClient>>>) -> Self {
|
||||
Self { map, redis_client }
|
||||
}
|
||||
/// Cancel a running query for the corresponding connection.
|
||||
pub async fn cancel_session(&self, key: CancelKeyData) -> Result<(), CancelError> {
|
||||
pub async fn cancel_session(
|
||||
&self,
|
||||
key: CancelKeyData,
|
||||
session_id: Uuid,
|
||||
) -> Result<(), CancelError> {
|
||||
let from = "from_client";
|
||||
// NB: we should immediately release the lock after cloning the token.
|
||||
let Some(cancel_closure) = self.0.get(&key).and_then(|x| x.clone()) else {
|
||||
let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else {
|
||||
tracing::warn!("query cancellation key not found: {key}");
|
||||
if let Some(redis_client) = &self.redis_client {
|
||||
NUM_CANCELLATION_REQUESTS
|
||||
.with_label_values(&[from, "not_found"])
|
||||
.inc();
|
||||
info!("publishing cancellation key to Redis");
|
||||
match redis_client.lock().await.try_publish(key, session_id).await {
|
||||
Ok(()) => {
|
||||
info!("cancellation key successfuly published to Redis");
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::error!("failed to publish a message: {e}");
|
||||
return Err(CancelError::IO(std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
e.to_string(),
|
||||
)));
|
||||
}
|
||||
}
|
||||
}
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
NUM_CANCELLATION_REQUESTS
|
||||
.with_label_values(&[from, "found"])
|
||||
.inc();
|
||||
info!("cancelling query per user's request using key {key}");
|
||||
cancel_closure.try_cancel_query().await
|
||||
}
|
||||
@@ -57,7 +97,7 @@ impl CancelMap {
|
||||
|
||||
// Random key collisions are unlikely to happen here, but they're still possible,
|
||||
// which is why we have to take care not to rewrite an existing key.
|
||||
match self.0.entry(key) {
|
||||
match self.map.entry(key) {
|
||||
dashmap::mapref::entry::Entry::Occupied(_) => continue,
|
||||
dashmap::mapref::entry::Entry::Vacant(e) => {
|
||||
e.insert(None);
|
||||
@@ -69,18 +109,46 @@ impl CancelMap {
|
||||
info!("registered new query cancellation key {key}");
|
||||
Session {
|
||||
key,
|
||||
cancel_map: self,
|
||||
cancellation_handler: self,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
fn contains(&self, session: &Session) -> bool {
|
||||
self.0.contains_key(&session.key)
|
||||
self.map.contains_key(&session.key)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
fn is_empty(&self) -> bool {
|
||||
self.0.is_empty()
|
||||
self.map.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
pub trait NotificationsCancellationHandler {
|
||||
async fn cancel_session_no_publish(&self, key: CancelKeyData) -> Result<(), CancelError>;
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl NotificationsCancellationHandler for CancellationHandler {
|
||||
async fn cancel_session_no_publish(&self, key: CancelKeyData) -> Result<(), CancelError> {
|
||||
let from = "from_redis";
|
||||
let cancel_closure = self.map.get(&key).and_then(|x| x.clone());
|
||||
match cancel_closure {
|
||||
Some(cancel_closure) => {
|
||||
NUM_CANCELLATION_REQUESTS
|
||||
.with_label_values(&[from, "found"])
|
||||
.inc();
|
||||
cancel_closure.try_cancel_query().await
|
||||
}
|
||||
None => {
|
||||
NUM_CANCELLATION_REQUESTS
|
||||
.with_label_values(&[from, "not_found"])
|
||||
.inc();
|
||||
tracing::warn!("query cancellation key not found: {key}");
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -115,7 +183,7 @@ pub struct Session {
|
||||
/// The user-facing key identifying this session.
|
||||
key: CancelKeyData,
|
||||
/// The [`CancelMap`] this session belongs to.
|
||||
cancel_map: Arc<CancelMap>,
|
||||
cancellation_handler: Arc<CancellationHandler>,
|
||||
}
|
||||
|
||||
impl Session {
|
||||
@@ -123,7 +191,9 @@ impl Session {
|
||||
/// This enables query cancellation in `crate::proxy::prepare_client_connection`.
|
||||
pub fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData {
|
||||
info!("enabling query cancellation for this session");
|
||||
self.cancel_map.0.insert(self.key, Some(cancel_closure));
|
||||
self.cancellation_handler
|
||||
.map
|
||||
.insert(self.key, Some(cancel_closure));
|
||||
|
||||
self.key
|
||||
}
|
||||
@@ -131,7 +201,7 @@ impl Session {
|
||||
|
||||
impl Drop for Session {
|
||||
fn drop(&mut self) {
|
||||
self.cancel_map.0.remove(&self.key);
|
||||
self.cancellation_handler.map.remove(&self.key);
|
||||
info!("dropped query cancellation key {}", &self.key);
|
||||
}
|
||||
}
|
||||
@@ -142,13 +212,16 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn check_session_drop() -> anyhow::Result<()> {
|
||||
let cancel_map: Arc<CancelMap> = Default::default();
|
||||
let cancellation_handler = Arc::new(CancellationHandler {
|
||||
map: CancelMap::default(),
|
||||
redis_client: None,
|
||||
});
|
||||
|
||||
let session = cancel_map.clone().get_session();
|
||||
assert!(cancel_map.contains(&session));
|
||||
let session = cancellation_handler.clone().get_session();
|
||||
assert!(cancellation_handler.contains(&session));
|
||||
drop(session);
|
||||
// Check that the session has been dropped.
|
||||
assert!(cancel_map.is_empty());
|
||||
assert!(cancellation_handler.is_empty());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -21,6 +21,7 @@ pub struct ProxyConfig {
|
||||
pub require_client_ip: bool,
|
||||
pub disable_ip_check_for_http: bool,
|
||||
pub endpoint_rps_limit: Vec<RateBucketInfo>,
|
||||
pub redis_rps_limit: Vec<RateBucketInfo>,
|
||||
pub region: String,
|
||||
pub handshake_timeout: Duration,
|
||||
}
|
||||
|
||||
@@ -13,7 +13,7 @@ use parquet::{
|
||||
},
|
||||
record::RecordWriter,
|
||||
};
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig};
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig, TimeoutOrCancel};
|
||||
use tokio::{sync::mpsc, time};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, info, Span};
|
||||
@@ -314,20 +314,23 @@ async fn upload_parquet(
|
||||
let path = RemotePath::from_string(&format!(
|
||||
"{year:04}/{month:02}/{day:02}/{hour:02}/requests_{id}.parquet"
|
||||
))?;
|
||||
let cancel = CancellationToken::new();
|
||||
backoff::retry(
|
||||
|| async {
|
||||
let stream = futures::stream::once(futures::future::ready(Ok(data.clone())));
|
||||
storage.upload(stream, data.len(), &path, None).await
|
||||
storage
|
||||
.upload(stream, data.len(), &path, None, &cancel)
|
||||
.await
|
||||
},
|
||||
|_e| false,
|
||||
TimeoutOrCancel::caused_by_cancel,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_UPLOAD_MAX_RETRIES,
|
||||
"request_data_upload",
|
||||
// we don't want cancellation to interrupt here, so we make a dummy cancel token
|
||||
&CancellationToken::new(),
|
||||
&cancel,
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| anyhow::anyhow!("Cancelled"))
|
||||
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
|
||||
.and_then(|x| x)
|
||||
.context("request_data_upload")?;
|
||||
|
||||
@@ -413,7 +416,8 @@ mod tests {
|
||||
)
|
||||
.unwrap(),
|
||||
max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
|
||||
})
|
||||
}),
|
||||
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
|
||||
})
|
||||
);
|
||||
assert_eq!(parquet_upload.parquet_upload_row_group_size, 100);
|
||||
@@ -466,6 +470,7 @@ mod tests {
|
||||
) -> Vec<(u64, usize, i64)> {
|
||||
let remote_storage_config = RemoteStorageConfig {
|
||||
storage: RemoteStorageKind::LocalFs(tmpdir.to_path_buf()),
|
||||
timeout: std::time::Duration::from_secs(120),
|
||||
};
|
||||
let storage = GenericRemoteStorage::from_config(&remote_storage_config).unwrap();
|
||||
|
||||
|
||||
@@ -152,6 +152,15 @@ pub static NUM_OPEN_CLIENTS_IN_HTTP_POOL: Lazy<IntGauge> = Lazy::new(|| {
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub static NUM_CANCELLATION_REQUESTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"proxy_cancellation_requests_total",
|
||||
"Number of cancellation requests (per found/not_found).",
|
||||
&["source", "kind"],
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct LatencyTimer {
|
||||
// time since the stopwatch was started
|
||||
|
||||
@@ -10,7 +10,7 @@ pub mod wake_compute;
|
||||
|
||||
use crate::{
|
||||
auth,
|
||||
cancellation::{self, CancelMap},
|
||||
cancellation::{self, CancellationHandler},
|
||||
compute,
|
||||
config::{ProxyConfig, TlsConfig},
|
||||
context::RequestMonitoring,
|
||||
@@ -62,6 +62,7 @@ pub async fn task_main(
|
||||
listener: tokio::net::TcpListener,
|
||||
cancellation_token: CancellationToken,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
cancellation_handler: Arc<CancellationHandler>,
|
||||
) -> anyhow::Result<()> {
|
||||
scopeguard::defer! {
|
||||
info!("proxy has shut down");
|
||||
@@ -72,7 +73,6 @@ pub async fn task_main(
|
||||
socket2::SockRef::from(&listener).set_keepalive(true)?;
|
||||
|
||||
let connections = tokio_util::task::task_tracker::TaskTracker::new();
|
||||
let cancel_map = Arc::new(CancelMap::default());
|
||||
|
||||
while let Some(accept_result) =
|
||||
run_until_cancelled(listener.accept(), &cancellation_token).await
|
||||
@@ -80,7 +80,7 @@ pub async fn task_main(
|
||||
let (socket, peer_addr) = accept_result?;
|
||||
|
||||
let session_id = uuid::Uuid::new_v4();
|
||||
let cancel_map = Arc::clone(&cancel_map);
|
||||
let cancellation_handler = Arc::clone(&cancellation_handler);
|
||||
let endpoint_rate_limiter = endpoint_rate_limiter.clone();
|
||||
|
||||
let session_span = info_span!(
|
||||
@@ -113,7 +113,7 @@ pub async fn task_main(
|
||||
let res = handle_client(
|
||||
config,
|
||||
&mut ctx,
|
||||
cancel_map,
|
||||
cancellation_handler,
|
||||
socket,
|
||||
ClientMode::Tcp,
|
||||
endpoint_rate_limiter,
|
||||
@@ -227,7 +227,7 @@ impl ReportableError for ClientRequestError {
|
||||
pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
config: &'static ProxyConfig,
|
||||
ctx: &mut RequestMonitoring,
|
||||
cancel_map: Arc<CancelMap>,
|
||||
cancellation_handler: Arc<CancellationHandler>,
|
||||
stream: S,
|
||||
mode: ClientMode,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
@@ -253,8 +253,8 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? {
|
||||
HandshakeData::Startup(stream, params) => (stream, params),
|
||||
HandshakeData::Cancel(cancel_key_data) => {
|
||||
return Ok(cancel_map
|
||||
.cancel_session(cancel_key_data)
|
||||
return Ok(cancellation_handler
|
||||
.cancel_session(cancel_key_data, ctx.session_id)
|
||||
.await
|
||||
.map(|()| None)?)
|
||||
}
|
||||
@@ -315,7 +315,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
.or_else(|e| stream.throw_error(e))
|
||||
.await?;
|
||||
|
||||
let session = cancel_map.get_session();
|
||||
let session = cancellation_handler.get_session();
|
||||
prepare_client_connection(&node, &session, &mut stream).await?;
|
||||
|
||||
// Before proxy passing, forward to compute whatever data is left in the
|
||||
@@ -331,6 +331,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
compute: node,
|
||||
req: _request_gauge,
|
||||
conn: _client_gauge,
|
||||
cancel: session,
|
||||
}))
|
||||
}
|
||||
|
||||
|
||||
@@ -122,25 +122,24 @@ where
|
||||
|
||||
error!(error = ?err, "could not connect to compute node");
|
||||
|
||||
let node_info =
|
||||
if err.get_error_kind() == crate::error::ErrorKind::Postgres || !node_info.cached() {
|
||||
// If the error is Postgres, that means that we managed to connect to the compute node, but there was an error.
|
||||
// Do not need to retrieve a new node_info, just return the old one.
|
||||
if !err.should_retry(num_retries) {
|
||||
return Err(err.into());
|
||||
}
|
||||
node_info
|
||||
} else {
|
||||
// if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
|
||||
info!("compute node's state has likely changed; requesting a wake-up");
|
||||
ctx.latency_timer.cache_miss();
|
||||
let old_node_info = invalidate_cache(node_info);
|
||||
let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?;
|
||||
node_info.reuse_settings(old_node_info);
|
||||
let node_info = if !node_info.cached() {
|
||||
// If we just recieved this from cplane and dodn't get it from cache, we shouldn't retry.
|
||||
// Do not need to retrieve a new node_info, just return the old one.
|
||||
if !err.should_retry(num_retries) {
|
||||
return Err(err.into());
|
||||
}
|
||||
node_info
|
||||
} else {
|
||||
// if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
|
||||
info!("compute node's state has likely changed; requesting a wake-up");
|
||||
ctx.latency_timer.cache_miss();
|
||||
let old_node_info = invalidate_cache(node_info);
|
||||
let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?;
|
||||
node_info.reuse_settings(old_node_info);
|
||||
|
||||
mechanism.update_connect_config(&mut node_info.config);
|
||||
node_info
|
||||
};
|
||||
mechanism.update_connect_config(&mut node_info.config);
|
||||
node_info
|
||||
};
|
||||
|
||||
// now that we have a new node, try connect to it repeatedly.
|
||||
// this can error for a few reasons, for instance:
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use crate::{
|
||||
cancellation,
|
||||
compute::PostgresConnection,
|
||||
console::messages::MetricsAuxInfo,
|
||||
metrics::NUM_BYTES_PROXIED_COUNTER,
|
||||
@@ -57,6 +58,7 @@ pub struct ProxyPassthrough<S> {
|
||||
|
||||
pub req: IntCounterPairGuard,
|
||||
pub conn: IntCounterPairGuard,
|
||||
pub cancel: cancellation::Session,
|
||||
}
|
||||
|
||||
impl<S: AsyncRead + AsyncWrite + Unpin> ProxyPassthrough<S> {
|
||||
|
||||
@@ -375,8 +375,6 @@ enum ConnectAction {
|
||||
Connect,
|
||||
Retry,
|
||||
Fail,
|
||||
RetryPg,
|
||||
FailPg,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
@@ -466,14 +464,6 @@ impl ConnectMechanism for TestConnectMechanism {
|
||||
retryable: false,
|
||||
kind: ErrorKind::Compute,
|
||||
}),
|
||||
ConnectAction::FailPg => Err(TestConnectError {
|
||||
retryable: false,
|
||||
kind: ErrorKind::Postgres,
|
||||
}),
|
||||
ConnectAction::RetryPg => Err(TestConnectError {
|
||||
retryable: true,
|
||||
kind: ErrorKind::Postgres,
|
||||
}),
|
||||
x => panic!("expecting action {:?}, connect is called instead", x),
|
||||
}
|
||||
}
|
||||
@@ -572,32 +562,6 @@ async fn connect_to_compute_retry() {
|
||||
mechanism.verify();
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn connect_to_compute_retry_pg() {
|
||||
let _ = env_logger::try_init();
|
||||
use ConnectAction::*;
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let mechanism = TestConnectMechanism::new(vec![Wake, RetryPg, Connect]);
|
||||
let user_info = helper_create_connect_info(&mechanism);
|
||||
connect_to_compute(&mut ctx, &mechanism, &user_info, false)
|
||||
.await
|
||||
.unwrap();
|
||||
mechanism.verify();
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn connect_to_compute_fail_pg() {
|
||||
let _ = env_logger::try_init();
|
||||
use ConnectAction::*;
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let mechanism = TestConnectMechanism::new(vec![Wake, FailPg]);
|
||||
let user_info = helper_create_connect_info(&mechanism);
|
||||
connect_to_compute(&mut ctx, &mechanism, &user_info, false)
|
||||
.await
|
||||
.unwrap_err();
|
||||
mechanism.verify();
|
||||
}
|
||||
|
||||
/// Test that we don't retry if the error is not retryable.
|
||||
#[tokio::test]
|
||||
async fn connect_to_compute_non_retry_1() {
|
||||
|
||||
@@ -4,4 +4,4 @@ mod limiter;
|
||||
pub use aimd::Aimd;
|
||||
pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig};
|
||||
pub use limiter::Limiter;
|
||||
pub use limiter::{EndpointRateLimiter, RateBucketInfo};
|
||||
pub use limiter::{EndpointRateLimiter, RateBucketInfo, RedisRateLimiter};
|
||||
|
||||
@@ -22,6 +22,44 @@ use super::{
|
||||
RateLimiterConfig,
|
||||
};
|
||||
|
||||
pub struct RedisRateLimiter {
|
||||
data: Vec<RateBucket>,
|
||||
info: &'static [RateBucketInfo],
|
||||
}
|
||||
|
||||
impl RedisRateLimiter {
|
||||
pub fn new(info: &'static [RateBucketInfo]) -> Self {
|
||||
Self {
|
||||
data: vec![
|
||||
RateBucket {
|
||||
start: Instant::now(),
|
||||
count: 0,
|
||||
};
|
||||
info.len()
|
||||
],
|
||||
info,
|
||||
}
|
||||
}
|
||||
|
||||
/// Check that number of connections is below `max_rps` rps.
|
||||
pub fn check(&mut self) -> bool {
|
||||
let now = Instant::now();
|
||||
|
||||
let should_allow_request = self
|
||||
.data
|
||||
.iter_mut()
|
||||
.zip(self.info)
|
||||
.all(|(bucket, info)| bucket.should_allow_request(info, now));
|
||||
|
||||
if should_allow_request {
|
||||
// only increment the bucket counts if the request will actually be accepted
|
||||
self.data.iter_mut().for_each(RateBucket::inc);
|
||||
}
|
||||
|
||||
should_allow_request
|
||||
}
|
||||
}
|
||||
|
||||
// Simple per-endpoint rate limiter.
|
||||
//
|
||||
// Check that number of connections to the endpoint is below `max_rps` rps.
|
||||
|
||||
@@ -1 +1,2 @@
|
||||
pub mod notifications;
|
||||
pub mod publisher;
|
||||
|
||||
@@ -1,38 +1,44 @@
|
||||
use std::{convert::Infallible, sync::Arc};
|
||||
|
||||
use futures::StreamExt;
|
||||
use pq_proto::CancelKeyData;
|
||||
use redis::aio::PubSub;
|
||||
use serde::Deserialize;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::{
|
||||
cache::project_info::ProjectInfoCache,
|
||||
cancellation::{CancelMap, CancellationHandler, NotificationsCancellationHandler},
|
||||
intern::{ProjectIdInt, RoleNameInt},
|
||||
};
|
||||
|
||||
const CHANNEL_NAME: &str = "neondb-proxy-ws-updates";
|
||||
const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates";
|
||||
pub(crate) const PROXY_CHANNEL_NAME: &str = "neondb-proxy-to-proxy-updates";
|
||||
const RECONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20);
|
||||
const INVALIDATION_LAG: std::time::Duration = std::time::Duration::from_secs(20);
|
||||
|
||||
struct ConsoleRedisClient {
|
||||
struct RedisConsumerClient {
|
||||
client: redis::Client,
|
||||
}
|
||||
|
||||
impl ConsoleRedisClient {
|
||||
impl RedisConsumerClient {
|
||||
pub fn new(url: &str) -> anyhow::Result<Self> {
|
||||
let client = redis::Client::open(url)?;
|
||||
Ok(Self { client })
|
||||
}
|
||||
async fn try_connect(&self) -> anyhow::Result<PubSub> {
|
||||
let mut conn = self.client.get_async_connection().await?.into_pubsub();
|
||||
tracing::info!("subscribing to a channel `{CHANNEL_NAME}`");
|
||||
conn.subscribe(CHANNEL_NAME).await?;
|
||||
tracing::info!("subscribing to a channel `{CPLANE_CHANNEL_NAME}`");
|
||||
conn.subscribe(CPLANE_CHANNEL_NAME).await?;
|
||||
tracing::info!("subscribing to a channel `{PROXY_CHANNEL_NAME}`");
|
||||
conn.subscribe(PROXY_CHANNEL_NAME).await?;
|
||||
Ok(conn)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
|
||||
#[serde(tag = "topic", content = "data")]
|
||||
enum Notification {
|
||||
pub(crate) enum Notification {
|
||||
#[serde(
|
||||
rename = "/allowed_ips_updated",
|
||||
deserialize_with = "deserialize_json_string"
|
||||
@@ -45,16 +51,25 @@ enum Notification {
|
||||
deserialize_with = "deserialize_json_string"
|
||||
)]
|
||||
PasswordUpdate { password_update: PasswordUpdate },
|
||||
#[serde(rename = "/cancel_session")]
|
||||
Cancel(CancelSession),
|
||||
}
|
||||
#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
|
||||
struct AllowedIpsUpdate {
|
||||
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
|
||||
pub(crate) struct AllowedIpsUpdate {
|
||||
project_id: ProjectIdInt,
|
||||
}
|
||||
#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
|
||||
struct PasswordUpdate {
|
||||
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
|
||||
pub(crate) struct PasswordUpdate {
|
||||
project_id: ProjectIdInt,
|
||||
role_name: RoleNameInt,
|
||||
}
|
||||
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
|
||||
pub(crate) struct CancelSession {
|
||||
pub region_id: Option<String>,
|
||||
pub cancel_key_data: CancelKeyData,
|
||||
pub session_id: Uuid,
|
||||
}
|
||||
|
||||
fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result<T, D::Error>
|
||||
where
|
||||
T: for<'de2> serde::Deserialize<'de2>,
|
||||
@@ -64,6 +79,88 @@ where
|
||||
serde_json::from_str(&s).map_err(<D::Error as serde::de::Error>::custom)
|
||||
}
|
||||
|
||||
struct MessageHandler<
|
||||
C: ProjectInfoCache + Send + Sync + 'static,
|
||||
H: NotificationsCancellationHandler + Send + Sync + 'static,
|
||||
> {
|
||||
cache: Arc<C>,
|
||||
cancellation_handler: Arc<H>,
|
||||
region_id: String,
|
||||
}
|
||||
|
||||
impl<
|
||||
C: ProjectInfoCache + Send + Sync + 'static,
|
||||
H: NotificationsCancellationHandler + Send + Sync + 'static,
|
||||
> MessageHandler<C, H>
|
||||
{
|
||||
pub fn new(cache: Arc<C>, cancellation_handler: Arc<H>, region_id: String) -> Self {
|
||||
Self {
|
||||
cache,
|
||||
cancellation_handler,
|
||||
region_id,
|
||||
}
|
||||
}
|
||||
pub fn disable_ttl(&self) {
|
||||
self.cache.disable_ttl();
|
||||
}
|
||||
pub fn enable_ttl(&self) {
|
||||
self.cache.enable_ttl();
|
||||
}
|
||||
#[tracing::instrument(skip(self, msg), fields(session_id = tracing::field::Empty))]
|
||||
async fn handle_message(&self, msg: redis::Msg) -> anyhow::Result<()> {
|
||||
use Notification::*;
|
||||
let payload: String = msg.get_payload()?;
|
||||
tracing::debug!(?payload, "received a message payload");
|
||||
|
||||
let msg: Notification = match serde_json::from_str(&payload) {
|
||||
Ok(msg) => msg,
|
||||
Err(e) => {
|
||||
tracing::error!("broken message: {e}");
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
tracing::debug!(?msg, "received a message");
|
||||
match msg {
|
||||
Cancel(cancel_session) => {
|
||||
tracing::Span::current().record(
|
||||
"session_id",
|
||||
&tracing::field::display(cancel_session.session_id),
|
||||
);
|
||||
if let Some(cancel_region) = cancel_session.region_id {
|
||||
// If the message is not for this region, ignore it.
|
||||
if cancel_region != self.region_id {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
// This instance of cancellation_handler doesn't have a RedisPublisherClient so it can't publish the message.
|
||||
match self
|
||||
.cancellation_handler
|
||||
.cancel_session_no_publish(cancel_session.cancel_key_data)
|
||||
.await
|
||||
{
|
||||
Ok(()) => {}
|
||||
Err(e) => {
|
||||
tracing::error!("failed to cancel session: {e}");
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
invalidate_cache(self.cache.clone(), msg.clone());
|
||||
// It might happen that the invalid entry is on the way to be cached.
|
||||
// To make sure that the entry is invalidated, let's repeat the invalidation in INVALIDATION_LAG seconds.
|
||||
// TODO: include the version (or the timestamp) in the message and invalidate only if the entry is cached before the message.
|
||||
let cache = self.cache.clone();
|
||||
tokio::spawn(async move {
|
||||
tokio::time::sleep(INVALIDATION_LAG).await;
|
||||
invalidate_cache(cache, msg);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn invalidate_cache<C: ProjectInfoCache>(cache: Arc<C>, msg: Notification) {
|
||||
use Notification::*;
|
||||
match msg {
|
||||
@@ -74,50 +171,33 @@ fn invalidate_cache<C: ProjectInfoCache>(cache: Arc<C>, msg: Notification) {
|
||||
password_update.project_id,
|
||||
password_update.role_name,
|
||||
),
|
||||
Cancel(_) => unreachable!("cancel message should be handled separately"),
|
||||
}
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip(cache))]
|
||||
fn handle_message<C>(msg: redis::Msg, cache: Arc<C>) -> anyhow::Result<()>
|
||||
where
|
||||
C: ProjectInfoCache + Send + Sync + 'static,
|
||||
{
|
||||
let payload: String = msg.get_payload()?;
|
||||
tracing::debug!(?payload, "received a message payload");
|
||||
|
||||
let msg: Notification = match serde_json::from_str(&payload) {
|
||||
Ok(msg) => msg,
|
||||
Err(e) => {
|
||||
tracing::error!("broken message: {e}");
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
tracing::debug!(?msg, "received a message");
|
||||
invalidate_cache(cache.clone(), msg.clone());
|
||||
// It might happen that the invalid entry is on the way to be cached.
|
||||
// To make sure that the entry is invalidated, let's repeat the invalidation in INVALIDATION_LAG seconds.
|
||||
// TODO: include the version (or the timestamp) in the message and invalidate only if the entry is cached before the message.
|
||||
tokio::spawn(async move {
|
||||
tokio::time::sleep(INVALIDATION_LAG).await;
|
||||
invalidate_cache(cache, msg.clone());
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Handle console's invalidation messages.
|
||||
#[tracing::instrument(name = "console_notifications", skip_all)]
|
||||
pub async fn task_main<C>(url: String, cache: Arc<C>) -> anyhow::Result<Infallible>
|
||||
pub async fn task_main<C>(
|
||||
url: String,
|
||||
cache: Arc<C>,
|
||||
cancel_map: CancelMap,
|
||||
region_id: String,
|
||||
) -> anyhow::Result<Infallible>
|
||||
where
|
||||
C: ProjectInfoCache + Send + Sync + 'static,
|
||||
{
|
||||
cache.enable_ttl();
|
||||
let handler = MessageHandler::new(
|
||||
cache,
|
||||
Arc::new(CancellationHandler::new(cancel_map, None)),
|
||||
region_id,
|
||||
);
|
||||
|
||||
loop {
|
||||
let redis = ConsoleRedisClient::new(&url)?;
|
||||
let redis = RedisConsumerClient::new(&url)?;
|
||||
let conn = match redis.try_connect().await {
|
||||
Ok(conn) => {
|
||||
cache.disable_ttl();
|
||||
handler.disable_ttl();
|
||||
conn
|
||||
}
|
||||
Err(e) => {
|
||||
@@ -130,7 +210,7 @@ where
|
||||
};
|
||||
let mut stream = conn.into_on_message();
|
||||
while let Some(msg) = stream.next().await {
|
||||
match handle_message(msg, cache.clone()) {
|
||||
match handler.handle_message(msg).await {
|
||||
Ok(()) => {}
|
||||
Err(e) => {
|
||||
tracing::error!("failed to handle message: {e}, will try to reconnect");
|
||||
@@ -138,7 +218,7 @@ where
|
||||
}
|
||||
}
|
||||
}
|
||||
cache.enable_ttl();
|
||||
handler.enable_ttl();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -198,6 +278,33 @@ mod tests {
|
||||
}
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
#[test]
|
||||
fn parse_cancel_session() -> anyhow::Result<()> {
|
||||
let cancel_key_data = CancelKeyData {
|
||||
backend_pid: 42,
|
||||
cancel_key: 41,
|
||||
};
|
||||
let uuid = uuid::Uuid::new_v4();
|
||||
let msg = Notification::Cancel(CancelSession {
|
||||
cancel_key_data,
|
||||
region_id: None,
|
||||
session_id: uuid,
|
||||
});
|
||||
let text = serde_json::to_string(&msg)?;
|
||||
let result: Notification = serde_json::from_str(&text)?;
|
||||
assert_eq!(msg, result);
|
||||
|
||||
let msg = Notification::Cancel(CancelSession {
|
||||
cancel_key_data,
|
||||
region_id: Some("region".to_string()),
|
||||
session_id: uuid,
|
||||
});
|
||||
let text = serde_json::to_string(&msg)?;
|
||||
let result: Notification = serde_json::from_str(&text)?;
|
||||
assert_eq!(msg, result,);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
80
proxy/src/redis/publisher.rs
Normal file
80
proxy/src/redis/publisher.rs
Normal file
@@ -0,0 +1,80 @@
|
||||
use pq_proto::CancelKeyData;
|
||||
use redis::AsyncCommands;
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::rate_limiter::{RateBucketInfo, RedisRateLimiter};
|
||||
|
||||
use super::notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME};
|
||||
|
||||
pub struct RedisPublisherClient {
|
||||
client: redis::Client,
|
||||
publisher: Option<redis::aio::Connection>,
|
||||
region_id: String,
|
||||
limiter: RedisRateLimiter,
|
||||
}
|
||||
|
||||
impl RedisPublisherClient {
|
||||
pub fn new(
|
||||
url: &str,
|
||||
region_id: String,
|
||||
info: &'static [RateBucketInfo],
|
||||
) -> anyhow::Result<Self> {
|
||||
let client = redis::Client::open(url)?;
|
||||
Ok(Self {
|
||||
client,
|
||||
publisher: None,
|
||||
region_id,
|
||||
limiter: RedisRateLimiter::new(info),
|
||||
})
|
||||
}
|
||||
pub async fn try_publish(
|
||||
&mut self,
|
||||
cancel_key_data: CancelKeyData,
|
||||
session_id: Uuid,
|
||||
) -> anyhow::Result<()> {
|
||||
if !self.limiter.check() {
|
||||
tracing::info!("Rate limit exceeded. Skipping cancellation message");
|
||||
return Err(anyhow::anyhow!("Rate limit exceeded"));
|
||||
}
|
||||
match self.publish(cancel_key_data, session_id).await {
|
||||
Ok(()) => return Ok(()),
|
||||
Err(e) => {
|
||||
tracing::error!("failed to publish a message: {e}");
|
||||
self.publisher = None;
|
||||
}
|
||||
}
|
||||
tracing::info!("Publisher is disconnected. Reconnectiong...");
|
||||
self.try_connect().await?;
|
||||
self.publish(cancel_key_data, session_id).await
|
||||
}
|
||||
|
||||
async fn publish(
|
||||
&mut self,
|
||||
cancel_key_data: CancelKeyData,
|
||||
session_id: Uuid,
|
||||
) -> anyhow::Result<()> {
|
||||
let conn = self
|
||||
.publisher
|
||||
.as_mut()
|
||||
.ok_or_else(|| anyhow::anyhow!("not connected"))?;
|
||||
let payload = serde_json::to_string(&Notification::Cancel(CancelSession {
|
||||
region_id: Some(self.region_id.clone()),
|
||||
cancel_key_data,
|
||||
session_id,
|
||||
}))?;
|
||||
conn.publish(PROXY_CHANNEL_NAME, payload).await?;
|
||||
Ok(())
|
||||
}
|
||||
pub async fn try_connect(&mut self) -> anyhow::Result<()> {
|
||||
match self.client.get_async_connection().await {
|
||||
Ok(conn) => {
|
||||
self.publisher = Some(conn);
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::error!("failed to connect to redis: {e}");
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -24,7 +24,7 @@ use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE;
|
||||
use crate::protocol2::{ProxyProtocolAccept, WithClientIp};
|
||||
use crate::rate_limiter::EndpointRateLimiter;
|
||||
use crate::serverless::backend::PoolingBackend;
|
||||
use crate::{cancellation::CancelMap, config::ProxyConfig};
|
||||
use crate::{cancellation::CancellationHandler, config::ProxyConfig};
|
||||
use futures::StreamExt;
|
||||
use hyper::{
|
||||
server::{
|
||||
@@ -50,6 +50,7 @@ pub async fn task_main(
|
||||
ws_listener: TcpListener,
|
||||
cancellation_token: CancellationToken,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
cancellation_handler: Arc<CancellationHandler>,
|
||||
) -> anyhow::Result<()> {
|
||||
scopeguard::defer! {
|
||||
info!("websocket server has shut down");
|
||||
@@ -115,7 +116,7 @@ pub async fn task_main(
|
||||
let backend = backend.clone();
|
||||
let ws_connections = ws_connections.clone();
|
||||
let endpoint_rate_limiter = endpoint_rate_limiter.clone();
|
||||
|
||||
let cancellation_handler = cancellation_handler.clone();
|
||||
async move {
|
||||
let peer_addr = match client_addr {
|
||||
Some(addr) => addr,
|
||||
@@ -127,9 +128,9 @@ pub async fn task_main(
|
||||
let backend = backend.clone();
|
||||
let ws_connections = ws_connections.clone();
|
||||
let endpoint_rate_limiter = endpoint_rate_limiter.clone();
|
||||
let cancellation_handler = cancellation_handler.clone();
|
||||
|
||||
async move {
|
||||
let cancel_map = Arc::new(CancelMap::default());
|
||||
let session_id = uuid::Uuid::new_v4();
|
||||
|
||||
request_handler(
|
||||
@@ -137,7 +138,7 @@ pub async fn task_main(
|
||||
config,
|
||||
backend,
|
||||
ws_connections,
|
||||
cancel_map,
|
||||
cancellation_handler,
|
||||
session_id,
|
||||
peer_addr.ip(),
|
||||
endpoint_rate_limiter,
|
||||
@@ -205,7 +206,7 @@ async fn request_handler(
|
||||
config: &'static ProxyConfig,
|
||||
backend: Arc<PoolingBackend>,
|
||||
ws_connections: TaskTracker,
|
||||
cancel_map: Arc<CancelMap>,
|
||||
cancellation_handler: Arc<CancellationHandler>,
|
||||
session_id: uuid::Uuid,
|
||||
peer_addr: IpAddr,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
@@ -232,7 +233,7 @@ async fn request_handler(
|
||||
config,
|
||||
ctx,
|
||||
websocket,
|
||||
cancel_map,
|
||||
cancellation_handler,
|
||||
host,
|
||||
endpoint_rate_limiter,
|
||||
)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use crate::{
|
||||
cancellation::CancelMap,
|
||||
cancellation::CancellationHandler,
|
||||
config::ProxyConfig,
|
||||
context::RequestMonitoring,
|
||||
error::{io_error, ReportableError},
|
||||
@@ -133,7 +133,7 @@ pub async fn serve_websocket(
|
||||
config: &'static ProxyConfig,
|
||||
mut ctx: RequestMonitoring,
|
||||
websocket: HyperWebsocket,
|
||||
cancel_map: Arc<CancelMap>,
|
||||
cancellation_handler: Arc<CancellationHandler>,
|
||||
hostname: Option<String>,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
) -> anyhow::Result<()> {
|
||||
@@ -141,7 +141,7 @@ pub async fn serve_websocket(
|
||||
let res = handle_client(
|
||||
config,
|
||||
&mut ctx,
|
||||
cancel_map,
|
||||
cancellation_handler,
|
||||
WebSocketRw::new(websocket),
|
||||
ClientMode::Websockets { hostname },
|
||||
endpoint_rate_limiter,
|
||||
|
||||
@@ -1,11 +1,7 @@
|
||||
#![allow(unused)]
|
||||
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use hex::FromHex;
|
||||
use pageserver::tenant::Tenant;
|
||||
use reqwest::{header, Client, StatusCode, Url};
|
||||
use serde::Deserialize;
|
||||
use tokio::sync::Semaphore;
|
||||
@@ -290,7 +286,7 @@ impl CloudAdminApiClient {
|
||||
tokio::time::sleep(Duration::from_millis(500)).await;
|
||||
continue;
|
||||
}
|
||||
status => {
|
||||
_status => {
|
||||
return Err(Error::new(
|
||||
"List active projects".to_string(),
|
||||
ErrorKind::ResponseStatus(response.status()),
|
||||
|
||||
@@ -12,8 +12,12 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
(Scope::PageServerApi, _) => Err(AuthError(
|
||||
"PageServerApi scope makes no sense for Safekeeper".into(),
|
||||
(Scope::PageServerApi | Scope::GenerationsApi, _) => Err(AuthError(
|
||||
format!(
|
||||
"JWT scope '{:?}' is ineligible for Safekeeper auth",
|
||||
claims.scope
|
||||
)
|
||||
.into(),
|
||||
)),
|
||||
(Scope::SafekeeperData, _) => Ok(()),
|
||||
}
|
||||
|
||||
@@ -511,7 +511,11 @@ async fn backup_object(
|
||||
|
||||
let file = tokio_util::io::ReaderStream::with_capacity(file, BUFFER_SIZE);
|
||||
|
||||
storage.upload_storage_object(file, size, target_file).await
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
storage
|
||||
.upload_storage_object(file, size, target_file, &cancel)
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn read_object(
|
||||
@@ -526,8 +530,10 @@ pub async fn read_object(
|
||||
|
||||
info!("segment download about to start from remote path {file_path:?} at offset {offset}");
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let download = storage
|
||||
.download_storage_object(Some((offset, None)), file_path)
|
||||
.download_storage_object(Some((offset, None)), file_path, &cancel)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to open WAL segment download stream for remote path {file_path:?}")
|
||||
@@ -559,7 +565,8 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
|
||||
// Note: listing segments might take a long time if there are many of them.
|
||||
// We don't currently have http requests timeout cancellation, but if/once
|
||||
// we have listing should get streaming interface to make progress.
|
||||
let token = CancellationToken::new(); // not really used
|
||||
|
||||
let cancel = CancellationToken::new(); // not really used
|
||||
backoff::retry(
|
||||
|| async {
|
||||
// Do list-delete in batch_size batches to make progress even if there a lot of files.
|
||||
@@ -567,7 +574,7 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
|
||||
// I'm not sure deleting while iterating is expected in s3.
|
||||
loop {
|
||||
let files = storage
|
||||
.list_files(Some(&remote_path), Some(batch_size))
|
||||
.list_files(Some(&remote_path), Some(batch_size), &cancel)
|
||||
.await?;
|
||||
if files.is_empty() {
|
||||
return Ok(()); // done
|
||||
@@ -580,14 +587,15 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
|
||||
files.first().unwrap().object_name().unwrap_or(""),
|
||||
files.last().unwrap().object_name().unwrap_or("")
|
||||
);
|
||||
storage.delete_objects(&files).await?;
|
||||
storage.delete_objects(&files, &cancel).await?;
|
||||
}
|
||||
},
|
||||
// consider TimeoutOrCancel::caused_by_cancel when using cancellation
|
||||
|_| false,
|
||||
3,
|
||||
10,
|
||||
"executing WAL segments deletion batch",
|
||||
&token,
|
||||
&cancel,
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| anyhow::anyhow!("canceled"))
|
||||
@@ -617,7 +625,12 @@ pub async fn copy_s3_segments(
|
||||
|
||||
let remote_path = RemotePath::new(&relative_dst_path)?;
|
||||
|
||||
let files = storage.list_files(Some(&remote_path), None).await?;
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let files = storage
|
||||
.list_files(Some(&remote_path), None, &cancel)
|
||||
.await?;
|
||||
|
||||
let uploaded_segments = &files
|
||||
.iter()
|
||||
.filter_map(|file| file.object_name().map(ToOwned::to_owned))
|
||||
@@ -645,7 +658,7 @@ pub async fn copy_s3_segments(
|
||||
let from = RemotePath::new(&relative_src_path.join(&segment_name))?;
|
||||
let to = RemotePath::new(&relative_dst_path.join(&segment_name))?;
|
||||
|
||||
storage.copy_object(&from, &to).await?;
|
||||
storage.copy_object(&from, &to, &cancel).await?;
|
||||
}
|
||||
|
||||
info!(
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user