Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-23 13:20:37 +00:00)

Compare commits: proxy/remo… ... universal_… (81 commits)
Commit SHA1s:
c2a4f432ac 0806a6548e 89a285b33b c697b4533e 7e6252c3d5 d8735aa12a 1b97a3074c 5c836ee5b4 4687b2e597
13adc83fc3 52c2c69351 207919f5eb 218be9eb32 8198b865c3 baf395983f ce7efbe48a ef4a76c01e 1ca08cc523
4626d89eda 49c57c0b13 d3a97fdf88 763f5c0641 8173813584 cc2d00fea4 9ffccb55f1 3a6b99f03c d39fd66773
73d7a9bc6e 3a71cf38c1 25c66dc635 538373019a c58b22bacb 17aea78aa7 71f9d9e5a3 119b86480f fa1f87b268
db48f7e40d e157b16c24 94ad9204bb c8aed107c5 da128a509a 5993b2bedc 4ce7aa9ffe cbd04f5140 1037a8ddd9
6661f4fd44 b9f84b9609 459253879e 0fa85aa08e 039017cb4b 4dc644612b 6d17d6c775 4892a5c5b7 33cb1e9c0c
9559ef6f3b 64a4fb35c9 95ec42f2b8 ba9df27e78 ea3e1b51ec e3e739ee71 606caa0c5d 6a906c68c9 682dfb3a31
5263b39e2c a241c8b2a4 e71d8095b9 1497a42296 cd33089a66 416c14b353 df49a9b7aa 4ad0c8f960 e0b05ecafb
ca4d71a954 381f41e685 d005c77ea3 04776ade6c c3fe335eaf 3a00a5deb2 78fa2b13e5 7c076edeea 69528b7c30
@@ -1,7 +1,20 @@
name: 'Create Allure report'
description: 'Generate Allure report from uploaded by actions/allure-report-store tests results'

inputs:
store-test-results-into-db:
description: 'Whether to store test results into the database. TEST_RESULT_CONNSTR/TEST_RESULT_CONNSTR_NEW should be set'
type: boolean
required: false
default: false

outputs:
base-url:
description: 'Base URL for Allure report'
value: ${{ steps.generate-report.outputs.base-url }}
base-s3-url:
description: 'Base S3 URL for Allure report'
value: ${{ steps.generate-report.outputs.base-s3-url }}
report-url:
description: 'Allure report URL'
value: ${{ steps.generate-report.outputs.report-url }}
@@ -63,8 +76,8 @@ runs:
rm -f ${ALLURE_ZIP}
fi
env:
ALLURE_VERSION: 2.22.1
ALLURE_ZIP_SHA256: fdc7a62d94b14c5e0bf25198ae1feded6b005fdbed864b4d3cb4e5e901720b0b
ALLURE_VERSION: 2.23.1
ALLURE_ZIP_SHA256: 11141bfe727504b3fd80c0f9801eb317407fd0ac983ebb57e671f14bac4bcd86

# Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this
- name: Acquire lock
@@ -102,6 +115,11 @@ runs:
REPORT_PREFIX=reports/${BRANCH_OR_PR}
RAW_PREFIX=reports-raw/${BRANCH_OR_PR}/${GITHUB_RUN_ID}

BASE_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}
BASE_S3_URL=s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}
REPORT_URL=${BASE_URL}/index.html
REPORT_JSON_URL=${BASE_URL}/data/suites.json

# Get previously uploaded data for this run
ZSTD_NBTHREADS=0

@@ -110,10 +128,9 @@ runs:
# There's no previously uploaded data for this $GITHUB_RUN_ID
exit 0
fi
for S3_FILEPATH in ${S3_FILEPATHS}; do
time aws s3 cp --only-show-errors "s3://${BUCKET}/${S3_FILEPATH}" "${WORKDIR}"

archive=${WORKDIR}/$(basename $S3_FILEPATH)
time aws s3 cp --recursive --only-show-errors "s3://${BUCKET}/${RAW_PREFIX}/" "${WORKDIR}/"
for archive in $(find ${WORKDIR} -name "*.tar.zst"); do
mkdir -p ${archive%.tar.zst}
time tar -xf ${archive} -C ${archive%.tar.zst}
rm -f ${archive}
@@ -130,9 +147,10 @@ runs:

# Upload a history and the final report (in this particular order to not to have duplicated history in 2 places)
time aws s3 mv --recursive --only-show-errors "${WORKDIR}/report/history" "s3://${BUCKET}/${REPORT_PREFIX}/latest/history"
time aws s3 mv --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"

REPORT_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html
# Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work,
# and to keep files on the host to upload them to the database
time aws s3 cp --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"

# Generate redirect
cat <<EOF > ${WORKDIR}/index.html
@@ -144,8 +162,10 @@ runs:
EOF
time aws s3 cp --only-show-errors ${WORKDIR}/index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"

echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
echo "report-json-url=${REPORT_URL%/index.html}/data/suites.json" >> $GITHUB_OUTPUT
echo "base-url=${BASE_URL}" >> $GITHUB_OUTPUT
echo "base-s3-url=${BASE_S3_URL}" >> $GITHUB_OUTPUT
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
echo "report-json-url=${REPORT_JSON_URL}" >> $GITHUB_OUTPUT

echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY}

@@ -159,6 +179,41 @@ runs:
aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
fi

- name: Store Allure test stat in the DB
if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
shell: bash -euxo pipefail {0}
env:
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
REPORT_JSON_URL: ${{ steps.generate-report.outputs.report-json-url }}
run: |
export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR}

./scripts/pysync

poetry run python3 scripts/ingest_regress_test_result.py \
--revision ${COMMIT_SHA} \
--reference ${GITHUB_REF} \
--build-type unified \
--ingest ${WORKDIR}/report/data/suites.json

- name: Store Allure test stat in the DB (new)
if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
shell: bash -euxo pipefail {0}
env:
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
BASE_S3_URL: ${{ steps.generate-report.outputs.base-s3-url }}
run: |
export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR_NEW}

./scripts/pysync

poetry run python3 scripts/ingest_regress_test_result-new-format.py \
--reference ${GITHUB_REF} \
--revision ${COMMIT_SHA} \
--run-id ${GITHUB_RUN_ID} \
--run-attempt ${GITHUB_RUN_ATTEMPT} \
--test-cases-dir ${WORKDIR}/report/data/test-cases

- name: Cleanup
if: always()
shell: bash -euxo pipefail {0}
.github/actions/download/action.yml (vendored; 2 changed lines)
@@ -31,7 +31,7 @@ runs:
BUCKET=neon-github-public-dev
FILENAME=$(basename $ARCHIVE)

S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX%$GITHUB_RUN_ATTEMPT} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX%$GITHUB_RUN_ATTEMPT} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
if [ -z "${S3_KEY}" ]; then
if [ "${SKIP_IF_DOES_NOT_EXIST}" = "true" ]; then
echo 'SKIPPED=true' >> $GITHUB_OUTPUT
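The only change in this hunk is `.Contents[]` becoming `.Contents[]?`: when the `list-objects-v2` call returns no `Contents` array at all, the optional iterator makes jq emit nothing instead of aborting, so the empty-`S3_KEY` branch is reached cleanly. A minimal sketch of the difference (the empty JSON object is just a stand-in for an empty listing):

```bash
# Without '?', iterating a missing array is a hard error:
echo '{}' | jq -r '.Contents[].Key'    # jq error: Cannot iterate over null ...
# With '?', the missing array is skipped and the pipeline continues:
echo '{}' | jq -r '.Contents[]?.Key'   # prints nothing, exits 0
```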
.github/workflows/build_and_test.yml (vendored; 28 changed lines)
@@ -432,6 +432,11 @@ jobs:
if: ${{ !cancelled() }}
id: create-allure-report
uses: ./.github/actions/allure-report-generate
with:
store-test-results-into-db: true
env:
REGRESS_TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}

- uses: actions/github-script@v6
if: ${{ !cancelled() }}
@@ -452,25 +457,6 @@ jobs:
report,
})

- name: Store Allure test stat in the DB
if: ${{ !cancelled() && steps.create-allure-report.outputs.report-json-url }}
env:
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
REPORT_JSON_URL: ${{ steps.create-allure-report.outputs.report-json-url }}
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
run: |
./scripts/pysync

curl --fail --output suites.json "${REPORT_JSON_URL}"
export BUILD_TYPE=unified
export DATABASE_URL="$TEST_RESULT_CONNSTR"

poetry run python3 scripts/ingest_regress_test_result.py \
--revision ${COMMIT_SHA} \
--reference ${GITHUB_REF} \
--build-type ${BUILD_TYPE} \
--ingest suites.json

coverage-report:
runs-on: [ self-hosted, gen3, small ]
container:
@@ -794,7 +780,7 @@ jobs:
run:
shell: sh -eu {0}
env:
VM_BUILDER_VERSION: v0.13.1
VM_BUILDER_VERSION: v0.15.4

steps:
- name: Checkout
@@ -1067,7 +1053,7 @@ jobs:
OLD_PREFIX=artifacts/${GITHUB_RUN_ID}
FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst

S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
if [ -z "${S3_KEY}" ]; then
echo >&2 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist"
exit 1
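The ingestion step deleted from this workflow reappears inside the allure-report-generate action above, gated by the new store-test-results-into-db input. For debugging, the same ingestion can be reproduced by hand; a rough sketch, assuming a reachable results database, a previously downloaded suites.json, and that ./scripts/pysync installs the Python dependencies via poetry as the CI steps suggest (the connection string value is a placeholder):

```bash
export DATABASE_URL="postgres://user:password@host/db"   # placeholder, not a real connstr
./scripts/pysync                                         # assumed to install the Python deps
poetry run python3 scripts/ingest_regress_test_result.py \
  --revision "$(git rev-parse HEAD)" \
  --reference refs/heads/main \
  --build-type unified \
  --ingest suites.json
```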
62
Cargo.lock
generated
62
Cargo.lock
generated
@@ -740,6 +740,9 @@ name = "cc"
|
||||
version = "1.0.79"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
|
||||
dependencies = [
|
||||
"jobserver",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cexpr"
|
||||
@@ -907,12 +910,14 @@ dependencies = [
|
||||
"opentelemetry",
|
||||
"postgres",
|
||||
"regex",
|
||||
"remote_storage",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tar",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"toml_edit",
|
||||
"tracing",
|
||||
"tracing-opentelemetry",
|
||||
"tracing-subscriber",
|
||||
@@ -920,6 +925,7 @@ dependencies = [
|
||||
"url",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
"zstd",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -980,6 +986,7 @@ dependencies = [
|
||||
"tar",
|
||||
"thiserror",
|
||||
"toml",
|
||||
"tracing",
|
||||
"url",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
@@ -1972,6 +1979,15 @@ version = "1.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"
|
||||
|
||||
[[package]]
|
||||
name = "jobserver"
|
||||
version = "0.1.26"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "js-sys"
|
||||
version = "0.3.63"
|
||||
@@ -2654,6 +2670,16 @@ dependencies = [
|
||||
"windows-sys 0.45.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pbkdf2"
|
||||
version = "0.12.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31"
|
||||
dependencies = [
|
||||
"digest",
|
||||
"hmac",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "peeking_take_while"
|
||||
version = "0.1.2"
|
||||
@@ -3030,7 +3056,6 @@ dependencies = [
|
||||
"chrono",
|
||||
"clap",
|
||||
"consumption_metrics",
|
||||
"fallible-iterator",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hashbrown 0.13.2",
|
||||
@@ -3048,9 +3073,9 @@ dependencies = [
|
||||
"once_cell",
|
||||
"opentelemetry",
|
||||
"parking_lot 0.12.1",
|
||||
"pbkdf2",
|
||||
"pin-project-lite",
|
||||
"postgres-native-tls",
|
||||
"postgres-protocol",
|
||||
"postgres_backend",
|
||||
"pq_proto",
|
||||
"prometheus",
|
||||
@@ -3074,7 +3099,6 @@ dependencies = [
|
||||
"thiserror",
|
||||
"tls-listener",
|
||||
"tokio",
|
||||
"tokio-native-tls",
|
||||
"tokio-postgres",
|
||||
"tokio-postgres-rustls",
|
||||
"tokio-rustls 0.23.4",
|
||||
@@ -3229,6 +3253,7 @@ dependencies = [
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"pin-project-lite",
|
||||
"scopeguard",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tempfile",
|
||||
@@ -5288,6 +5313,7 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bytes",
|
||||
"cc",
|
||||
"chrono",
|
||||
"clap",
|
||||
"clap_builder",
|
||||
@@ -5388,3 +5414,33 @@ name = "zeroize"
|
||||
version = "1.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9"
|
||||
|
||||
[[package]]
|
||||
name = "zstd"
|
||||
version = "0.12.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c"
|
||||
dependencies = [
|
||||
"zstd-safe",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zstd-safe"
|
||||
version = "6.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"zstd-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zstd-sys"
|
||||
version = "2.0.8+zstd.1.5.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
"pkg-config",
|
||||
]
|
||||
|
||||
@@ -88,6 +88,7 @@ opentelemetry = "0.19.0"
opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.11.0"
parking_lot = "0.12"
pbkdf2 = "0.12.1"
pin-project-lite = "0.2"
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
prost = "0.11"
@@ -51,6 +51,7 @@ RUN set -e \
|
||||
--bin safekeeper \
|
||||
--bin storage_broker \
|
||||
--bin proxy \
|
||||
--bin neon_local \
|
||||
--locked --release \
|
||||
&& cachepot -s
|
||||
|
||||
@@ -76,6 +77,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin
|
||||
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
|
||||
|
||||
@@ -551,8 +551,8 @@ FROM build-deps AS pg-embedding-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.1.tar.gz -O pg_embedding.tar.gz && \
|
||||
echo "c4ae84eef36fa8ec5868f6e061f39812f19ee5ba3604d428d40935685c7be512 pg_embedding.tar.gz" | sha256sum --check && \
|
||||
RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.5.tar.gz -O pg_embedding.tar.gz && \
|
||||
echo "0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 pg_embedding.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
@@ -816,6 +816,7 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
|
||||
# libxml2, libxslt1.1 for xml2
|
||||
# libzstd1 for zstd
|
||||
# libboost*, libfreetype6, and zlib1g for rdkit
|
||||
# ca-certificates for communicating with s3 by compute_ctl
|
||||
RUN apt update && \
|
||||
apt install --no-install-recommends -y \
|
||||
gdb \
|
||||
@@ -839,7 +840,8 @@ RUN apt update && \
|
||||
libcurl4-openssl-dev \
|
||||
locales \
|
||||
procps \
|
||||
zlib1g && \
|
||||
zlib1g \
|
||||
ca-certificates && \
|
||||
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
|
||||
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
|
||||
|
||||
|
||||
README.md (11 changed lines)
@@ -29,13 +29,13 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati
```bash
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \
libcurl4-openssl-dev
libcurl4-openssl-dev openssl python-poetry
```
* On Fedora, these packages are needed:
```bash
dnf install flex bison readline-devel zlib-devel openssl-devel \
libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
protobuf-devel libcurl-devel
protobuf-devel libcurl-devel openssl poetry
```
* On Arch based systems, these packages are needed:
```bash
@@ -235,6 +235,13 @@ CARGO_BUILD_FLAGS="--features=testing" make
./scripts/pytest
```

By default, this runs both debug and release modes, and all supported postgres versions. When
testing locally, it is convenient to run just one set of permutations, like this:

```sh
DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest
```

## Documentation

[docs](/docs) Contains a top-level overview of all available markdown documentation.
@@ -32,3 +32,6 @@ url.workspace = true
compute_api.workspace = true
utils.workspace = true
workspace_hack.workspace = true
toml_edit.workspace = true
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
zstd = "0.12.4"
@@ -5,6 +5,8 @@
|
||||
//! - `compute_ctl` accepts cluster (compute node) specification as a JSON file.
|
||||
//! - Every start is a fresh start, so the data directory is removed and
|
||||
//! initialized again on each run.
|
||||
//! - If remote_extension_config is provided, it will be used to fetch extensions list
|
||||
//! and download `shared_preload_libraries` from the remote storage.
|
||||
//! - Next it will put configuration files into the `PGDATA` directory.
|
||||
//! - Sync safekeepers and get commit LSN.
|
||||
//! - Get `basebackup` from pageserver using the returned on the previous step LSN.
|
||||
@@ -27,7 +29,8 @@
|
||||
//! compute_ctl -D /var/db/postgres/compute \
|
||||
//! -C 'postgresql://cloud_admin@localhost/postgres' \
|
||||
//! -S /var/db/postgres/specs/current.json \
|
||||
//! -b /usr/local/bin/postgres
|
||||
//! -b /usr/local/bin/postgres \
|
||||
//! -r {"bucket": "neon-dev-extensions-eu-central-1", "region": "eu-central-1"}
|
||||
//! ```
|
||||
//!
|
||||
use std::collections::HashMap;
|
||||
@@ -35,7 +38,7 @@ use std::fs::File;
|
||||
use std::panic;
|
||||
use std::path::Path;
|
||||
use std::process::exit;
|
||||
use std::sync::{mpsc, Arc, Condvar, Mutex};
|
||||
use std::sync::{mpsc, Arc, Condvar, Mutex, OnceLock, RwLock};
|
||||
use std::{thread, time::Duration};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
@@ -48,22 +51,33 @@ use compute_api::responses::ComputeStatus;
|
||||
|
||||
use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
|
||||
use compute_tools::configurator::launch_configurator;
|
||||
use compute_tools::extension_server::{get_pg_version, init_remote_storage};
|
||||
use compute_tools::http::api::launch_http_server;
|
||||
use compute_tools::logger::*;
|
||||
use compute_tools::monitor::launch_monitor;
|
||||
use compute_tools::params::*;
|
||||
use compute_tools::spec::*;
|
||||
|
||||
const BUILD_TAG_DEFAULT: &str = "local";
|
||||
// this is an arbitrary build tag. Fine as a default / for testing purposes
|
||||
// in-case of not-set environment var
|
||||
const BUILD_TAG_DEFAULT: &str = "5670669815";
|
||||
|
||||
fn main() -> Result<()> {
|
||||
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
|
||||
|
||||
let build_tag = option_env!("BUILD_TAG").unwrap_or(BUILD_TAG_DEFAULT);
|
||||
|
||||
let build_tag = option_env!("BUILD_TAG")
|
||||
.unwrap_or(BUILD_TAG_DEFAULT)
|
||||
.to_string();
|
||||
info!("build_tag: {build_tag}");
|
||||
|
||||
let matches = cli().get_matches();
|
||||
let pgbin_default = String::from("postgres");
|
||||
let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
|
||||
|
||||
let remote_ext_config = matches.get_one::<String>("remote-ext-config");
|
||||
let ext_remote_storage = remote_ext_config.map(|x| {
|
||||
init_remote_storage(x).expect("cannot initialize remote extension storage from config")
|
||||
});
|
||||
|
||||
let http_port = *matches
|
||||
.get_one::<u16>("http-port")
|
||||
@@ -128,9 +142,6 @@ fn main() -> Result<()> {
|
||||
let compute_id = matches.get_one::<String>("compute-id");
|
||||
let control_plane_uri = matches.get_one::<String>("control-plane-uri");
|
||||
|
||||
// Try to use just 'postgres' if no path is provided
|
||||
let pgbin = matches.get_one::<String>("pgbin").unwrap();
|
||||
|
||||
let spec;
|
||||
let mut live_config_allowed = false;
|
||||
match spec_json {
|
||||
@@ -168,6 +179,7 @@ fn main() -> Result<()> {
|
||||
|
||||
let mut new_state = ComputeState::new();
|
||||
let spec_set;
|
||||
|
||||
if let Some(spec) = spec {
|
||||
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
|
||||
new_state.pspec = Some(pspec);
|
||||
@@ -179,27 +191,37 @@ fn main() -> Result<()> {
|
||||
connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
|
||||
pgdata: pgdata.to_string(),
|
||||
pgbin: pgbin.to_string(),
|
||||
pgversion: get_pg_version(pgbin),
|
||||
live_config_allowed,
|
||||
state: Mutex::new(new_state),
|
||||
state_changed: Condvar::new(),
|
||||
ext_remote_storage,
|
||||
ext_remote_paths: OnceLock::new(),
|
||||
ext_download_progress: RwLock::new(HashMap::new()),
|
||||
library_index: OnceLock::new(),
|
||||
build_tag,
|
||||
};
|
||||
let compute = Arc::new(compute_node);
|
||||
|
||||
// If this is a pooled VM, prewarm before starting HTTP server and becoming
|
||||
// available for binding. Prewarming helps postgres start quicker later,
|
||||
// because QEMU will already have it's memory allocated from the host, and
|
||||
// the necessary binaries will alreaady be cached.
|
||||
if !spec_set {
|
||||
compute.prewarm_postgres()?;
|
||||
}
|
||||
|
||||
// Launch http service first, so we were able to serve control-plane
|
||||
// requests, while configuration is still in progress.
|
||||
let _http_handle =
|
||||
launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");
|
||||
|
||||
let extension_server_port: u16 = http_port;
|
||||
|
||||
if !spec_set {
|
||||
// No spec provided, hang waiting for it.
|
||||
info!("no compute spec provided, waiting");
|
||||
|
||||
// TODO this can stall startups in the unlikely event that we bind
|
||||
// this compute node while it's busy prewarming. It's not too
|
||||
// bad because it's just 100ms and unlikely, but it's an
|
||||
// avoidable problem.
|
||||
compute.prewarm_postgres()?;
|
||||
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
while state.status != ComputeStatus::ConfigurationPending {
|
||||
state = compute.state_changed.wait(state).unwrap();
|
||||
@@ -236,7 +258,7 @@ fn main() -> Result<()> {
|
||||
// Start Postgres
|
||||
let mut delay_exit = false;
|
||||
let mut exit_code = None;
|
||||
let pg = match compute.start_compute() {
|
||||
let pg = match compute.start_compute(extension_server_port) {
|
||||
Ok(pg) => Some(pg),
|
||||
Err(err) => {
|
||||
error!("could not start the compute node: {:?}", err);
|
||||
@@ -365,6 +387,12 @@ fn cli() -> clap::Command {
|
||||
.long("control-plane-uri")
|
||||
.value_name("CONTROL_PLANE_API_BASE_URI"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("remote-ext-config")
|
||||
.short('r')
|
||||
.long("remote-ext-config")
|
||||
.value_name("REMOTE_EXT_CONFIG"),
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -1,16 +1,21 @@
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::io::BufRead;
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
use std::path::Path;
|
||||
use std::process::{Command, Stdio};
|
||||
use std::str::FromStr;
|
||||
use std::sync::{Condvar, Mutex};
|
||||
use std::sync::{Condvar, Mutex, OnceLock, RwLock};
|
||||
use std::time::Instant;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::{DateTime, Utc};
|
||||
use futures::future::join_all;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::StreamExt;
|
||||
use postgres::{Client, NoTls};
|
||||
use regex::Regex;
|
||||
use tokio;
|
||||
use tokio_postgres;
|
||||
use tracing::{error, info, instrument, warn};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -20,10 +25,12 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus};
|
||||
use compute_api::spec::{ComputeMode, ComputeSpec};
|
||||
use utils::measured_stream::MeasuredReader;
|
||||
|
||||
use crate::config;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
|
||||
|
||||
use crate::pg_helpers::*;
|
||||
use crate::spec::*;
|
||||
use crate::sync_sk::{check_if_synced, ping_safekeeper};
|
||||
use crate::{config, extension_server};
|
||||
|
||||
/// Compute node info shared across several `compute_ctl` threads.
|
||||
pub struct ComputeNode {
|
||||
@@ -31,6 +38,7 @@ pub struct ComputeNode {
|
||||
pub connstr: url::Url,
|
||||
pub pgdata: String,
|
||||
pub pgbin: String,
|
||||
pub pgversion: String,
|
||||
/// We should only allow live re- / configuration of the compute node if
|
||||
/// it uses 'pull model', i.e. it can go to control-plane and fetch
|
||||
/// the latest configuration. Otherwise, there could be a case:
|
||||
@@ -50,6 +58,24 @@ pub struct ComputeNode {
|
||||
pub state: Mutex<ComputeState>,
|
||||
/// `Condvar` to allow notifying waiters about state changes.
|
||||
pub state_changed: Condvar,
|
||||
/// the S3 bucket that we search for extensions in
|
||||
pub ext_remote_storage: Option<GenericRemoteStorage>,
|
||||
// (key: extension name, value: path to extension archive in remote storage)
|
||||
pub ext_remote_paths: OnceLock<HashMap<String, RemotePath>>,
|
||||
// (key: library name, value: name of extension containing this library)
|
||||
pub library_index: OnceLock<HashMap<String, String>>,
|
||||
// key: ext_archive_name, value: started download time, download_completed?
|
||||
pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
|
||||
pub build_tag: String,
|
||||
}
|
||||
|
||||
// store some metrics about download size that might impact startup time
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct RemoteExtensionMetrics {
|
||||
num_ext_downloaded: u64,
|
||||
largest_ext_size: u64,
|
||||
total_ext_download_size: u64,
|
||||
prep_extensions_ms: u64,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
@@ -260,7 +286,7 @@ impl ComputeNode {
|
||||
#[instrument(skip_all, fields(%lsn))]
|
||||
fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
|
||||
let spec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
let start_time = Utc::now();
|
||||
let start_time = Instant::now();
|
||||
|
||||
let mut config = postgres::Config::from_str(&spec.pageserver_connstr)?;
|
||||
|
||||
@@ -273,7 +299,10 @@ impl ComputeNode {
|
||||
info!("Storage auth token not set");
|
||||
}
|
||||
|
||||
// Connect to pageserver
|
||||
let mut client = config.connect(NoTls)?;
|
||||
let pageserver_connect_micros = start_time.elapsed().as_micros() as u64;
|
||||
|
||||
let basebackup_cmd = match lsn {
|
||||
// HACK We don't use compression on first start (Lsn(0)) because there's no API for it
|
||||
Lsn(0) => format!("basebackup {} {}", spec.tenant_id, spec.timeline_id),
|
||||
@@ -319,13 +348,10 @@ impl ComputeNode {
|
||||
};
|
||||
|
||||
// Report metrics
|
||||
self.state.lock().unwrap().metrics.basebackup_bytes =
|
||||
measured_reader.get_byte_count() as u64;
|
||||
self.state.lock().unwrap().metrics.basebackup_ms = Utc::now()
|
||||
.signed_duration_since(start_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64;
|
||||
let mut state = self.state.lock().unwrap();
|
||||
state.metrics.pageserver_connect_micros = pageserver_connect_micros;
|
||||
state.metrics.basebackup_bytes = measured_reader.get_byte_count() as u64;
|
||||
state.metrics.basebackup_ms = start_time.elapsed().as_millis() as u64;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -473,14 +499,22 @@ impl ComputeNode {
|
||||
/// Do all the preparations like PGDATA directory creation, configuration,
|
||||
/// safekeepers sync, basebackup, etc.
|
||||
#[instrument(skip_all)]
|
||||
pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
|
||||
pub fn prepare_pgdata(
|
||||
&self,
|
||||
compute_state: &ComputeState,
|
||||
extension_server_port: u16,
|
||||
) -> Result<()> {
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
let spec = &pspec.spec;
|
||||
let pgdata_path = Path::new(&self.pgdata);
|
||||
|
||||
// Remove/create an empty pgdata directory and put configuration there.
|
||||
self.create_pgdata()?;
|
||||
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &pspec.spec)?;
|
||||
config::write_postgres_conf(
|
||||
&pgdata_path.join("postgresql.conf"),
|
||||
&pspec.spec,
|
||||
Some(extension_server_port),
|
||||
)?;
|
||||
|
||||
// Syncing safekeepers is only safe with primary nodes: if a primary
|
||||
// is already connected it will be kicked out, so a secondary (standby)
|
||||
@@ -670,7 +704,7 @@ impl ComputeNode {
|
||||
|
||||
// Write new config
|
||||
let pgdata_path = Path::new(&self.pgdata);
|
||||
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec)?;
|
||||
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
|
||||
|
||||
let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
|
||||
self.pg_reload_conf(&mut client)?;
|
||||
@@ -700,7 +734,7 @@ impl ComputeNode {
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
pub fn start_compute(&self) -> Result<std::process::Child> {
|
||||
pub fn start_compute(&self, extension_server_port: u16) -> Result<std::process::Child> {
|
||||
let compute_state = self.state.lock().unwrap().clone();
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
info!(
|
||||
@@ -711,7 +745,31 @@ impl ComputeNode {
|
||||
pspec.timeline_id,
|
||||
);
|
||||
|
||||
self.prepare_pgdata(&compute_state)?;
|
||||
// This part is sync, because we need to download
|
||||
// remote shared_preload_libraries before postgres start (if any)
|
||||
{
|
||||
let library_load_start_time = Utc::now();
|
||||
let remote_ext_metrics = self.prepare_preload_libraries(&compute_state)?;
|
||||
|
||||
let library_load_time = Utc::now()
|
||||
.signed_duration_since(library_load_start_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64;
|
||||
let mut state = self.state.lock().unwrap();
|
||||
state.metrics.load_ext_ms = library_load_time;
|
||||
state.metrics.num_ext_downloaded = remote_ext_metrics.num_ext_downloaded;
|
||||
state.metrics.largest_ext_size = remote_ext_metrics.largest_ext_size;
|
||||
state.metrics.total_ext_download_size = remote_ext_metrics.total_ext_download_size;
|
||||
state.metrics.prep_extensions_ms = remote_ext_metrics.prep_extensions_ms;
|
||||
info!(
|
||||
"Loading shared_preload_libraries took {:?}ms",
|
||||
library_load_time
|
||||
);
|
||||
info!("{:?}", remote_ext_metrics);
|
||||
}
|
||||
|
||||
self.prepare_pgdata(&compute_state, extension_server_port)?;
|
||||
|
||||
let start_time = Utc::now();
|
||||
let pg = self.start_postgres(pspec.storage_auth_token.clone())?;
|
||||
@@ -859,4 +917,241 @@ LIMIT 100",
|
||||
"{{\"pg_stat_statements\": []}}".to_string()
|
||||
}
|
||||
}
|
||||
|
||||
// If remote extension storage is configured,
|
||||
// download extension control files
|
||||
pub async fn prepare_external_extensions(&self, compute_state: &ComputeState) -> Result<()> {
|
||||
if let Some(ref ext_remote_storage) = self.ext_remote_storage {
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
let spec = &pspec.spec;
|
||||
let custom_ext = spec.custom_extensions.clone().unwrap_or(Vec::new());
|
||||
info!("custom extensions: {:?}", &custom_ext);
|
||||
|
||||
let (ext_remote_paths, library_index) = extension_server::get_available_extensions(
|
||||
ext_remote_storage,
|
||||
&self.pgbin,
|
||||
&self.pgversion,
|
||||
&custom_ext,
|
||||
&self.build_tag,
|
||||
)
|
||||
.await?;
|
||||
self.ext_remote_paths
|
||||
.set(ext_remote_paths)
|
||||
.expect("this is the only time we set ext_remote_paths");
|
||||
self.library_index
|
||||
.set(library_index)
|
||||
.expect("this is the only time we set library_index");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// download an archive, unzip and place files in correct locations
|
||||
pub async fn download_extension(
|
||||
&self,
|
||||
ext_name: &str,
|
||||
is_library: bool,
|
||||
) -> Result<u64, DownloadError> {
|
||||
let remote_storage = self
|
||||
.ext_remote_storage
|
||||
.as_ref()
|
||||
.ok_or(DownloadError::BadInput(anyhow::anyhow!(
|
||||
"Remote extensions storage is not configured",
|
||||
)))?;
|
||||
|
||||
let mut real_ext_name = ext_name;
|
||||
if is_library {
|
||||
// sometimes library names might have a suffix like
|
||||
// library.so or library.so.3. We strip this off
|
||||
// because library_index is based on the name without the file extension
|
||||
let strip_lib_suffix = Regex::new(r"\.so.*").unwrap();
|
||||
let lib_raw_name = strip_lib_suffix.replace(real_ext_name, "").to_string();
|
||||
|
||||
real_ext_name = self
|
||||
.library_index
|
||||
.get()
|
||||
.expect("must have already downloaded the library_index")
|
||||
.get(&lib_raw_name)
|
||||
.ok_or(DownloadError::BadInput(anyhow::anyhow!(
|
||||
"library {} is not found",
|
||||
lib_raw_name
|
||||
)))?;
|
||||
}
|
||||
|
||||
let ext_path = &self
|
||||
.ext_remote_paths
|
||||
.get()
|
||||
.expect("error accessing ext_remote_paths")
|
||||
.get(real_ext_name)
|
||||
.ok_or(DownloadError::BadInput(anyhow::anyhow!(
|
||||
"real_ext_name {} is not found",
|
||||
real_ext_name
|
||||
)))?;
|
||||
|
||||
let ext_archive_name = ext_path.object_name().expect("bad path");
|
||||
|
||||
let mut first_try = false;
|
||||
if !self
|
||||
.ext_download_progress
|
||||
.read()
|
||||
.expect("lock err")
|
||||
.contains_key(ext_archive_name)
|
||||
{
|
||||
self.ext_download_progress
|
||||
.write()
|
||||
.expect("lock err")
|
||||
.insert(ext_archive_name.to_string(), (Utc::now(), false));
|
||||
first_try = true;
|
||||
}
|
||||
let (download_start, download_completed) =
|
||||
self.ext_download_progress.read().expect("lock err")[ext_archive_name];
|
||||
let start_time_delta = Utc::now()
|
||||
.signed_duration_since(download_start)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64;
|
||||
|
||||
// how long to wait for extension download if it was started by another process
|
||||
const HANG_TIMEOUT: u64 = 3000; // milliseconds
|
||||
|
||||
if download_completed {
|
||||
info!("extension already downloaded, skipping re-download");
|
||||
return Ok(0);
|
||||
} else if start_time_delta < HANG_TIMEOUT && !first_try {
|
||||
info!("download {ext_archive_name} already started by another process, hanging untill completion or timeout");
|
||||
let mut interval = tokio::time::interval(tokio::time::Duration::from_millis(500));
|
||||
loop {
|
||||
info!("waiting for download");
|
||||
interval.tick().await;
|
||||
let (_, download_completed_now) =
|
||||
self.ext_download_progress.read().expect("lock")[ext_archive_name];
|
||||
if download_completed_now {
|
||||
info!("download finished by whoever else downloaded it");
|
||||
return Ok(0);
|
||||
}
|
||||
}
|
||||
// NOTE: the above loop will get terminated
|
||||
// based on the timeout of the download function
|
||||
}
|
||||
|
||||
// if extension hasn't been downloaded before or the previous
|
||||
// attempt to download was at least HANG_TIMEOUT ms ago
|
||||
// then we try to download it here
|
||||
info!("downloading new extension {ext_archive_name}");
|
||||
|
||||
let download_size = extension_server::download_extension(
|
||||
real_ext_name,
|
||||
ext_path,
|
||||
remote_storage,
|
||||
&self.pgbin,
|
||||
)
|
||||
.await
|
||||
.map_err(DownloadError::Other);
|
||||
|
||||
self.ext_download_progress
|
||||
.write()
|
||||
.expect("bad lock")
|
||||
.insert(ext_archive_name.to_string(), (download_start, true));
|
||||
|
||||
download_size
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
pub async fn prepare_preload_libraries(
|
||||
&self,
|
||||
compute_state: &ComputeState,
|
||||
) -> Result<RemoteExtensionMetrics> {
|
||||
if self.ext_remote_storage.is_none() {
|
||||
return Ok(RemoteExtensionMetrics {
|
||||
num_ext_downloaded: 0,
|
||||
largest_ext_size: 0,
|
||||
total_ext_download_size: 0,
|
||||
prep_extensions_ms: 0,
|
||||
});
|
||||
}
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
let spec = &pspec.spec;
|
||||
|
||||
info!("parse shared_preload_libraries from spec.cluster.settings");
|
||||
let mut libs_vec = Vec::new();
|
||||
if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
|
||||
libs_vec = libs
|
||||
.split(&[',', '\'', ' '])
|
||||
.filter(|s| *s != "neon" && !s.is_empty())
|
||||
.map(str::to_string)
|
||||
.collect();
|
||||
}
|
||||
info!("parse shared_preload_libraries from provided postgresql.conf");
|
||||
// that is used in neon_local and python tests
|
||||
if let Some(conf) = &spec.cluster.postgresql_conf {
|
||||
let conf_lines = conf.split('\n').collect::<Vec<&str>>();
|
||||
let mut shared_preload_libraries_line = "";
|
||||
for line in conf_lines {
|
||||
if line.starts_with("shared_preload_libraries") {
|
||||
shared_preload_libraries_line = line;
|
||||
}
|
||||
}
|
||||
let mut preload_libs_vec = Vec::new();
|
||||
if let Some(libs) = shared_preload_libraries_line.split("='").nth(1) {
|
||||
preload_libs_vec = libs
|
||||
.split(&[',', '\'', ' '])
|
||||
.filter(|s| *s != "neon" && !s.is_empty())
|
||||
.map(str::to_string)
|
||||
.collect();
|
||||
}
|
||||
libs_vec.extend(preload_libs_vec);
|
||||
}
|
||||
|
||||
info!("Download ext_index.json, find the extension paths");
|
||||
let prep_ext_start_time = Utc::now();
|
||||
self.prepare_external_extensions(compute_state).await?;
|
||||
let prep_ext_time_delta = Utc::now()
|
||||
.signed_duration_since(prep_ext_start_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64;
|
||||
info!("Prepare extensions took {prep_ext_time_delta}ms");
|
||||
|
||||
// Don't try to download libraries that are not in the index.
|
||||
// Assume that they are already present locally.
|
||||
libs_vec.retain(|lib| {
|
||||
self.library_index
|
||||
.get()
|
||||
.expect("error accessing ext_remote_paths")
|
||||
.contains_key(lib)
|
||||
});
|
||||
|
||||
info!("Downloading to shared preload libraries: {:?}", &libs_vec);
|
||||
|
||||
let mut download_tasks = Vec::new();
|
||||
for library in &libs_vec {
|
||||
download_tasks.push(self.download_extension(library, true));
|
||||
}
|
||||
let results = join_all(download_tasks).await;
|
||||
|
||||
let mut remote_ext_metrics = RemoteExtensionMetrics {
|
||||
num_ext_downloaded: 0,
|
||||
largest_ext_size: 0,
|
||||
total_ext_download_size: 0,
|
||||
prep_extensions_ms: prep_ext_time_delta,
|
||||
};
|
||||
for result in results {
|
||||
let download_size = match result {
|
||||
Ok(res) => {
|
||||
remote_ext_metrics.num_ext_downloaded += 1;
|
||||
res
|
||||
}
|
||||
Err(err) => {
|
||||
// if we failed to download an extension, we don't want to fail the whole
|
||||
// process, but we do want to log the error
|
||||
error!("Failed to download extension: {}", err);
|
||||
0
|
||||
}
|
||||
};
|
||||
|
||||
remote_ext_metrics.largest_ext_size =
|
||||
std::cmp::max(remote_ext_metrics.largest_ext_size, download_size);
|
||||
remote_ext_metrics.total_ext_download_size += download_size;
|
||||
}
|
||||
Ok(remote_ext_metrics)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -33,7 +33,11 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
}

/// Create or completely rewrite configuration file specified by `path`
pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
pub fn write_postgres_conf(
path: &Path,
spec: &ComputeSpec,
extension_server_port: Option<u16>,
) -> Result<()> {
// File::create() destroys the file content if it exists.
let mut file = File::create(path)?;

@@ -87,5 +91,9 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
writeln!(file, "# Managed by compute_ctl: end")?;
}

if let Some(port) = extension_server_port {
writeln!(file, "neon.extension_server_port={}", port)?;
}

Ok(())
}
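With the new optional parameter, compute_ctl appends the extension server port to the generated postgresql.conf whenever it prepares PGDATA. A generated data directory would then contain a line like the one below; a quick way to check, where the port value is purely illustrative (it is whatever HTTP port compute_ctl was started with):

```bash
# Illustrative check against a generated config; 3080 is an example port only
grep '^neon.extension_server_port' "$PGDATA/postgresql.conf"
# neon.extension_server_port=3080
```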
281
compute_tools/src/extension_server.rs
Normal file
281
compute_tools/src/extension_server.rs
Normal file
@@ -0,0 +1,281 @@
|
||||
// Download extension files from the extension store
|
||||
// and put them in the right place in the postgres directory (share / lib)
|
||||
/*
|
||||
The layout of the S3 bucket is as follows:
|
||||
5615610098 // this is an extension build number
|
||||
├── v14
|
||||
│ ├── extensions
|
||||
│ │ ├── anon.tar.zst
|
||||
│ │ └── embedding.tar.zst
|
||||
│ └── ext_index.json
|
||||
└── v15
|
||||
├── extensions
|
||||
│ ├── anon.tar.zst
|
||||
│ └── embedding.tar.zst
|
||||
└── ext_index.json
|
||||
5615261079
|
||||
├── v14
|
||||
│ ├── extensions
|
||||
│ │ └── anon.tar.zst
|
||||
│ └── ext_index.json
|
||||
└── v15
|
||||
├── extensions
|
||||
│ └── anon.tar.zst
|
||||
└── ext_index.json
|
||||
5623261088
|
||||
├── v14
|
||||
│ ├── extensions
|
||||
│ │ └── embedding.tar.zst
|
||||
│ └── ext_index.json
|
||||
└── v15
|
||||
├── extensions
|
||||
│ └── embedding.tar.zst
|
||||
└── ext_index.json
|
||||
|
||||
Note that build number cannot be part of prefix because we might need extensions
|
||||
from other build numbers.
|
||||
|
||||
ext_index.json stores the control files and location of extension archives
|
||||
It also stores a list of public extensions and a library_index
|
||||
|
||||
We don't need to duplicate extension.tar.zst files.
|
||||
We only need to upload a new one if it is updated.
|
||||
(Although currently we just upload every time anyways, hopefully will change
|
||||
this sometime)
|
||||
|
||||
*access* is controlled by spec
|
||||
|
||||
More specifically, here is an example ext_index.json
|
||||
{
|
||||
"public_extensions": [
|
||||
"anon",
|
||||
"pg_buffercache"
|
||||
],
|
||||
"library_index": {
|
||||
"anon": "anon",
|
||||
"pg_buffercache": "pg_buffercache"
|
||||
},
|
||||
"extension_data": {
|
||||
"pg_buffercache": {
|
||||
"control_data": {
|
||||
"pg_buffercache.control": "# pg_buffercache extension \ncomment = 'examine the shared buffer cache' \ndefault_version = '1.3' \nmodule_pathname = '$libdir/pg_buffercache' \nrelocatable = true \ntrusted=true"
|
||||
},
|
||||
"archive_path": "5670669815/v14/extensions/pg_buffercache.tar.zst"
|
||||
},
|
||||
"anon": {
|
||||
"control_data": {
|
||||
"anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
|
||||
},
|
||||
"archive_path": "5670669815/v14/extensions/anon.tar.zst"
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
use anyhow::Context;
|
||||
use anyhow::{self, Result};
|
||||
use futures::future::join_all;
|
||||
use remote_storage::*;
|
||||
use serde_json;
|
||||
use std::collections::HashMap;
|
||||
use std::io::Read;
|
||||
use std::num::{NonZeroU32, NonZeroUsize};
|
||||
use std::path::Path;
|
||||
use std::str;
|
||||
use tar::Archive;
|
||||
use tokio::io::AsyncReadExt;
|
||||
use tracing::info;
|
||||
use tracing::log::warn;
|
||||
use zstd::stream::read::Decoder;
|
||||
|
||||
fn get_pg_config(argument: &str, pgbin: &str) -> String {
|
||||
// gives the result of `pg_config [argument]`
|
||||
// where argument is a flag like `--version` or `--sharedir`
|
||||
let pgconfig = pgbin
|
||||
.strip_suffix("postgres")
|
||||
.expect("bad pgbin")
|
||||
.to_owned()
|
||||
+ "/pg_config";
|
||||
let config_output = std::process::Command::new(pgconfig)
|
||||
.arg(argument)
|
||||
.output()
|
||||
.expect("pg_config error");
|
||||
std::str::from_utf8(&config_output.stdout)
|
||||
.expect("pg_config error")
|
||||
.trim()
|
||||
.to_string()
|
||||
}
|
||||
|
||||
pub fn get_pg_version(pgbin: &str) -> String {
|
||||
// pg_config --version returns a (platform specific) human readable string
|
||||
// such as "PostgreSQL 15.4". We parse this to v14/v15
|
||||
let human_version = get_pg_config("--version", pgbin);
|
||||
if human_version.contains("15") {
|
||||
return "v15".to_string();
|
||||
} else if human_version.contains("14") {
|
||||
return "v14".to_string();
|
||||
}
|
||||
panic!("Unsuported postgres version {human_version}");
|
||||
}
|
||||
|
||||
// download control files for enabled_extensions
|
||||
// return Hashmaps converting library names to extension names (library_index)
|
||||
// and specifying the remote path to the archive for each extension name
|
||||
pub async fn get_available_extensions(
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
pgbin: &str,
|
||||
pg_version: &str,
|
||||
custom_extensions: &[String],
|
||||
build_tag: &str,
|
||||
) -> Result<(HashMap<String, RemotePath>, HashMap<String, String>)> {
|
||||
let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
|
||||
let index_path = format!("{build_tag}/{pg_version}/ext_index.json");
|
||||
let index_path = RemotePath::new(Path::new(&index_path)).context("error forming path")?;
|
||||
info!("download ext_index.json from: {:?}", &index_path);
|
||||
|
||||
let mut download = remote_storage.download(&index_path).await?;
|
||||
let mut ext_idx_buffer = Vec::new();
|
||||
download
|
||||
.download_stream
|
||||
.read_to_end(&mut ext_idx_buffer)
|
||||
.await?;
|
||||
info!("ext_index downloaded");
|
||||
|
||||
#[derive(Debug, serde::Deserialize)]
|
||||
struct Index {
|
||||
public_extensions: Vec<String>,
|
||||
library_index: HashMap<String, String>,
|
||||
extension_data: HashMap<String, ExtensionData>,
|
||||
}
|
||||
|
||||
#[derive(Debug, serde::Deserialize)]
|
||||
struct ExtensionData {
|
||||
control_data: HashMap<String, String>,
|
||||
archive_path: String,
|
||||
}
|
||||
|
||||
let ext_index_full = serde_json::from_slice::<Index>(&ext_idx_buffer)?;
|
||||
let mut enabled_extensions = ext_index_full.public_extensions;
|
||||
enabled_extensions.extend_from_slice(custom_extensions);
|
||||
let mut library_index = ext_index_full.library_index;
|
||||
let all_extension_data = ext_index_full.extension_data;
|
||||
info!("library_index: {:?}", library_index);
|
||||
|
||||
info!("enabled_extensions: {:?}", enabled_extensions);
|
||||
let mut ext_remote_paths = HashMap::new();
|
||||
let mut file_create_tasks = Vec::new();
|
||||
for extension in enabled_extensions {
|
||||
let ext_data = &all_extension_data[&extension];
|
||||
for (control_file, control_contents) in &ext_data.control_data {
|
||||
let extension_name = control_file
|
||||
.strip_suffix(".control")
|
||||
.expect("control files must end in .control");
|
||||
let control_path = local_sharedir.join(control_file);
|
||||
if !control_path.exists() {
|
||||
ext_remote_paths.insert(
|
||||
extension_name.to_string(),
|
||||
RemotePath::from_string(&ext_data.archive_path)?,
|
||||
);
|
||||
info!("writing file {:?}{:?}", control_path, control_contents);
|
||||
file_create_tasks.push(tokio::fs::write(control_path, control_contents));
|
||||
} else {
|
||||
warn!("control file {:?} exists both locally and remotely. ignoring the remote version.", control_file);
|
||||
// also delete this from library index
|
||||
library_index.retain(|_, value| value != extension_name);
|
||||
}
|
||||
}
|
||||
}
|
||||
let results = join_all(file_create_tasks).await;
|
||||
for result in results {
|
||||
result?;
|
||||
}
|
||||
info!("ext_remote_paths {:?}", ext_remote_paths);
|
||||
Ok((ext_remote_paths, library_index))
|
||||
}
|
||||
|
||||
// download the archive for a given extension,
|
||||
// unzip it, and place files in the appropriate locations (share/lib)
|
||||
pub async fn download_extension(
|
||||
ext_name: &str,
|
||||
ext_path: &RemotePath,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
pgbin: &str,
|
||||
) -> Result<u64> {
|
||||
info!("Download extension {:?} from {:?}", ext_name, ext_path);
|
||||
let mut download = remote_storage.download(ext_path).await?;
|
||||
let mut download_buffer = Vec::new();
|
||||
download
|
||||
.download_stream
|
||||
.read_to_end(&mut download_buffer)
|
||||
.await?;
|
||||
let download_size = download_buffer.len() as u64;
|
||||
// it's unclear whether it is more performant to decompress into memory or not
|
||||
// TODO: decompressing into memory can be avoided
|
||||
let mut decoder = Decoder::new(download_buffer.as_slice())?;
|
||||
let mut decompress_buffer = Vec::new();
|
||||
decoder.read_to_end(&mut decompress_buffer)?;
|
||||
let mut archive = Archive::new(decompress_buffer.as_slice());
|
||||
let unzip_dest = pgbin
|
||||
.strip_suffix("/bin/postgres")
|
||||
.expect("bad pgbin")
|
||||
.to_string()
|
||||
+ "/download_extensions";
|
||||
archive.unpack(&unzip_dest)?;
|
||||
info!("Download + unzip {:?} completed successfully", &ext_path);
|
||||
|
||||
let sharedir_paths = (
|
||||
unzip_dest.to_string() + "/share/extension",
|
||||
Path::new(&get_pg_config("--sharedir", pgbin)).join("extension"),
|
||||
);
|
||||
let libdir_paths = (
|
||||
unzip_dest.to_string() + "/lib",
|
||||
Path::new(&get_pg_config("--pkglibdir", pgbin)).to_path_buf(),
|
||||
);
|
||||
// move contents of the libdir / sharedir in unzipped archive to the correct local paths
|
||||
for paths in [sharedir_paths, libdir_paths] {
|
||||
let (zip_dir, real_dir) = paths;
|
||||
info!("mv {zip_dir:?}/* {real_dir:?}");
|
||||
for file in std::fs::read_dir(zip_dir)? {
|
||||
let old_file = file?.path();
|
||||
let new_file =
|
||||
Path::new(&real_dir).join(old_file.file_name().context("error parsing file")?);
|
||||
info!("moving {old_file:?} to {new_file:?}");
|
||||
|
||||
// extension download failed: Directory not empty (os error 39)
|
||||
match std::fs::rename(old_file, new_file) {
|
||||
Ok(()) => info!("move succeeded"),
|
||||
Err(e) => {
|
||||
warn!("move failed, probably because the extension already exists: {e}")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
info!("done moving extension {ext_name}");
|
||||
Ok(download_size)
|
||||
}
|
||||
|
||||
// This function initializes the necessary structs to use remote storage
|
||||
pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRemoteStorage> {
|
||||
#[derive(Debug, serde::Deserialize)]
|
||||
struct RemoteExtJson {
|
||||
bucket: String,
|
||||
region: String,
|
||||
endpoint: Option<String>,
|
||||
prefix: Option<String>,
|
||||
}
|
||||
let remote_ext_json = serde_json::from_str::<RemoteExtJson>(remote_ext_config)?;
|
||||
|
||||
let config = S3Config {
|
||||
bucket_name: remote_ext_json.bucket,
|
||||
bucket_region: remote_ext_json.region,
|
||||
prefix_in_bucket: remote_ext_json.prefix,
|
||||
endpoint: remote_ext_json.endpoint,
|
||||
concurrency_limit: NonZeroUsize::new(100).expect("100 != 0"),
|
||||
max_keys_per_list_response: None,
|
||||
};
|
||||
let config = RemoteStorageConfig {
|
||||
max_concurrent_syncs: NonZeroUsize::new(100).expect("100 != 0"),
|
||||
max_sync_errors: NonZeroU32::new(100).expect("100 != 0"),
|
||||
storage: RemoteStorageKind::AwsS3(config),
|
||||
};
|
||||
GenericRemoteStorage::from_config(&config)
|
||||
}
|
||||
@@ -121,6 +121,46 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
|
||||
}
|
||||
}
|
||||
|
||||
// download extension files from S3 on demand
|
||||
(&Method::POST, route) if route.starts_with("/extension_server/") => {
|
||||
info!("serving {:?} POST request", route);
|
||||
info!("req.uri {:?}", req.uri());
|
||||
|
||||
let mut is_library = false;
|
||||
if let Some(params) = req.uri().query() {
|
||||
info!("serving {:?} POST request with params: {}", route, params);
|
||||
if params == "is_library=true" {
|
||||
is_library = true;
|
||||
} else {
|
||||
let mut resp = Response::new(Body::from("Wrong request parameters"));
|
||||
*resp.status_mut() = StatusCode::BAD_REQUEST;
|
||||
return resp;
|
||||
}
|
||||
}
|
||||
|
||||
let filename = route.split('/').last().unwrap().to_string();
|
||||
info!("serving /extension_server POST request, filename: {filename:?} is_library: {is_library}");
|
||||
|
||||
// don't even try to download extensions
|
||||
// if no remote storage is configured
|
||||
if compute.ext_remote_storage.is_none() {
|
||||
info!("no extensions remote storage configured");
|
||||
let mut resp = Response::new(Body::from("no remote storage configured"));
|
||||
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
|
||||
return resp;
|
||||
}
|
||||
|
||||
match compute.download_extension(&filename, is_library).await {
|
||||
Ok(_) => Response::new(Body::from("OK")),
|
||||
Err(e) => {
|
||||
error!("extension download failed: {}", e);
|
||||
let mut resp = Response::new(Body::from(e.to_string()));
|
||||
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
|
||||
resp
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Return the `404 Not Found` for any other routes.
|
||||
_ => {
|
||||
let mut not_found = Response::new(Body::from("404 Not Found"));
|
||||
|
||||
@@ -139,6 +139,34 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
/extension_server:
post:
tags:
- Extension
summary: Download extension from S3 to local folder.
description: ""
operationId: downloadExtension
responses:
200:
description: Extension downloaded
content:
text/plain:
schema:
type: string
description: Error text or 'OK' if download succeeded.
example: "OK"
400:
description: Request is invalid.
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
500:
description: Extension download request failed.
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"

components:
securitySchemes:
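The new OpenAPI entry documents the on-demand download route that the compute_ctl HTTP handler above serves. A request sketch against a local compute node; the host, port, and the extension/library names are assumptions used only for illustration:

```bash
# Ask compute_ctl to fetch an extension's files from remote extension storage.
curl -X POST "http://localhost:3080/extension_server/embedding"
# For a shared library rather than an extension, pass is_library=true:
curl -X POST "http://localhost:3080/extension_server/anon.so?is_library=true"
```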
@@ -9,6 +9,7 @@ pub mod http;
#[macro_use]
pub mod logger;
pub mod compute;
pub mod extension_server;
pub mod monitor;
pub mod params;
pub mod pg_helpers;
@@ -124,7 +124,7 @@ pub fn get_spec_from_control_plane(
pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
// File `postgresql.conf` is no longer included into `basebackup`, so just
// always write all config into it creating new file.
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec, None)?;

update_pg_hba(pgdata_path)?;

@@ -270,7 +270,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
}
RoleAction::Create => {
let mut query: String = format!(
"CREATE ROLE {} CREATEROLE CREATEDB IN ROLE neon_superuser",
"CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
name.pg_quote()
);
info!("role create query: '{}'", &query);
@@ -32,3 +32,4 @@ utils.workspace = true
|
||||
|
||||
compute_api.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
tracing.workspace = true
|
||||
|
||||
@@ -658,6 +658,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
|
||||
.get_one::<String>("endpoint_id")
|
||||
.ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;
|
||||
|
||||
let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");
|
||||
|
||||
// If --safekeepers argument is given, use only the listed safekeeper nodes.
|
||||
let safekeepers =
|
||||
if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
|
||||
@@ -699,7 +701,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
|
||||
_ => {}
|
||||
}
|
||||
println!("Starting existing endpoint {endpoint_id}...");
|
||||
endpoint.start(&auth_token, safekeepers)?;
|
||||
endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
|
||||
} else {
|
||||
let branch_name = sub_args
|
||||
.get_one::<String>("branch-name")
|
||||
@@ -743,7 +745,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
|
||||
pg_version,
|
||||
mode,
|
||||
)?;
|
||||
ep.start(&auth_token, safekeepers)?;
|
||||
ep.start(&auth_token, safekeepers, remote_ext_config)?;
|
||||
}
|
||||
}
|
||||
"stop" => {
|
||||
@@ -823,6 +825,16 @@ fn get_safekeeper(env: &local_env::LocalEnv, id: NodeId) -> Result<SafekeeperNod
|
||||
}
|
||||
}
|
||||
|
||||
// Get list of options to append to safekeeper command invocation.
|
||||
fn safekeeper_extra_opts(init_match: &ArgMatches) -> Vec<String> {
|
||||
init_match
|
||||
.get_many::<String>("safekeeper-extra-opt")
|
||||
.into_iter()
|
||||
.flatten()
|
||||
.map(|s| s.to_owned())
|
||||
.collect()
|
||||
}
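// Example (illustrative): `safekeeper start -e=--http-auth-public-key-path=foo`
// (the flag can be repeated) yields ["--http-auth-public-key-path=foo"], which
// is appended verbatim to the safekeeper invocation by SafekeeperNode::start.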
|
||||
|
||||
fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
let (sub_name, sub_args) = match sub_match.subcommand() {
|
||||
Some(safekeeper_command_data) => safekeeper_command_data,
|
||||
@@ -839,7 +851,9 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
|
||||
|
||||
match sub_name {
|
||||
"start" => {
|
||||
if let Err(e) = safekeeper.start() {
|
||||
let extra_opts = safekeeper_extra_opts(sub_args);
|
||||
|
||||
if let Err(e) = safekeeper.start(extra_opts) {
|
||||
eprintln!("safekeeper start failed: {}", e);
|
||||
exit(1);
|
||||
}
|
||||
@@ -864,7 +878,8 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if let Err(e) = safekeeper.start() {
|
||||
let extra_opts = safekeeper_extra_opts(sub_args);
|
||||
if let Err(e) = safekeeper.start(extra_opts) {
|
||||
eprintln!("safekeeper start failed: {}", e);
|
||||
exit(1);
|
||||
}
|
||||
@@ -891,7 +906,7 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow
|
||||
|
||||
for node in env.safekeepers.iter() {
|
||||
let safekeeper = SafekeeperNode::from_env(env, node);
|
||||
if let Err(e) = safekeeper.start() {
|
||||
if let Err(e) = safekeeper.start(vec![]) {
|
||||
eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
|
||||
try_stop_all(env, false);
|
||||
exit(1);
|
||||
@@ -954,6 +969,14 @@ fn cli() -> Command {
|
||||
|
||||
let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false);
|
||||
|
||||
let safekeeper_extra_opt_arg = Arg::new("safekeeper-extra-opt")
|
||||
.short('e')
|
||||
.long("safekeeper-extra-opt")
|
||||
.num_args(1)
|
||||
.action(ArgAction::Append)
|
||||
.help("Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo")
|
||||
.required(false);
|
||||
|
||||
let tenant_id_arg = Arg::new("tenant-id")
|
||||
.long("tenant-id")
|
||||
.help("Tenant id. Represented as a hexadecimal string 32 symbols length")
|
||||
@@ -1003,6 +1026,12 @@ fn cli() -> Command {
|
||||
.help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
|
||||
.required(false);
|
||||
|
||||
let remote_ext_config_args = Arg::new("remote-ext-config")
|
||||
.long("remote-ext-config")
|
||||
.num_args(1)
|
||||
.help("Configure the S3 bucket that we search for extensions in.")
|
||||
.required(false);
|
||||
|
||||
let lsn_arg = Arg::new("lsn")
|
||||
.long("lsn")
|
||||
.help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.")
|
||||
@@ -1116,6 +1145,7 @@ fn cli() -> Command {
|
||||
.subcommand(Command::new("start")
|
||||
.about("Start local safekeeper")
|
||||
.arg(safekeeper_id_arg.clone())
|
||||
.arg(safekeeper_extra_opt_arg.clone())
|
||||
)
|
||||
.subcommand(Command::new("stop")
|
||||
.about("Stop local safekeeper")
|
||||
@@ -1126,6 +1156,7 @@ fn cli() -> Command {
|
||||
.about("Restart local safekeeper")
|
||||
.arg(safekeeper_id_arg)
|
||||
.arg(stop_mode_arg.clone())
|
||||
.arg(safekeeper_extra_opt_arg)
|
||||
)
|
||||
)
|
||||
.subcommand(
|
||||
@@ -1161,6 +1192,7 @@ fn cli() -> Command {
|
||||
.arg(pg_version_arg)
|
||||
.arg(hot_standby_arg)
|
||||
.arg(safekeepers_arg)
|
||||
.arg(remote_ext_config_args)
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("stop")
|
||||
|
||||
@@ -313,7 +313,7 @@ impl Endpoint {
|
||||
|
||||
// TODO: use future host field from safekeeper spec
|
||||
// Pass the list of safekeepers to the replica so that it can connect to any of them,
|
||||
// whichever is availiable.
|
||||
// whichever is available.
|
||||
let sk_ports = self
|
||||
.env
|
||||
.safekeepers
|
||||
@@ -420,7 +420,12 @@ impl Endpoint {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn start(&self, auth_token: &Option<String>, safekeepers: Vec<NodeId>) -> Result<()> {
|
||||
pub fn start(
|
||||
&self,
|
||||
auth_token: &Option<String>,
|
||||
safekeepers: Vec<NodeId>,
|
||||
remote_ext_config: Option<&String>,
|
||||
) -> Result<()> {
|
||||
if self.status() == "running" {
|
||||
anyhow::bail!("The endpoint is already running");
|
||||
}
|
||||
@@ -488,6 +493,7 @@ impl Endpoint {
|
||||
pageserver_connstring: Some(pageserver_connstring),
|
||||
safekeeper_connstrings,
|
||||
storage_auth_token: auth_token.clone(),
|
||||
custom_extensions: Some(vec![]),
|
||||
};
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
|
||||
@@ -519,6 +525,11 @@ impl Endpoint {
|
||||
.stdin(std::process::Stdio::null())
|
||||
.stderr(logfile.try_clone()?)
|
||||
.stdout(logfile);
|
||||
|
||||
if let Some(remote_ext_config) = remote_ext_config {
|
||||
cmd.args(["--remote-ext-config", remote_ext_config]);
|
||||
}
|
||||
|
||||
let child = cmd.spawn()?;
|
||||
|
||||
// Write down the pid so we can wait for it when we want to stop
|
||||
|
||||
@@ -101,7 +101,7 @@ impl SafekeeperNode {
|
||||
self.datadir_path().join("safekeeper.pid")
|
||||
}
|
||||
|
||||
pub fn start(&self) -> anyhow::Result<Child> {
|
||||
pub fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
|
||||
print!(
|
||||
"Starting safekeeper at '{}' in '{}'",
|
||||
self.pg_connection_config.raw_address(),
|
||||
@@ -161,17 +161,28 @@ impl SafekeeperNode {
|
||||
|
||||
let key_path = self.env.base_data_dir.join("auth_public_key.pem");
|
||||
if self.conf.auth_enabled {
|
||||
let key_path_string = key_path
|
||||
.to_str()
|
||||
.with_context(|| {
|
||||
format!("Key path {key_path:?} cannot be represented as a unicode string")
|
||||
})?
|
||||
.to_owned();
|
||||
args.extend([
|
||||
"--auth-validation-public-key-path".to_owned(),
|
||||
key_path
|
||||
.to_str()
|
||||
.with_context(|| {
|
||||
format!("Key path {key_path:?} cannot be represented as a unicode string")
|
||||
})?
|
||||
.to_owned(),
|
||||
"--pg-auth-public-key-path".to_owned(),
|
||||
key_path_string.clone(),
|
||||
]);
|
||||
args.extend([
|
||||
"--pg-tenant-only-auth-public-key-path".to_owned(),
|
||||
key_path_string.clone(),
|
||||
]);
|
||||
args.extend([
|
||||
"--http-auth-public-key-path".to_owned(),
|
||||
key_path_string.clone(),
|
||||
]);
|
||||
}
|
||||
|
||||
args.extend(extra_opts);
|
||||
|
||||
background_process::start_process(
|
||||
&format!("safekeeper-{id}"),
|
||||
&datadir,
|
||||
|
||||
236
docs/rfcs/024-extension-loading.md
Normal file
@@ -0,0 +1,236 @@
# Supporting custom user Extensions (Dynamic Extension Loading)
Created 2023-05-03

## Motivation

There are many extensions in the PostgreSQL ecosystem, and not all of them are
of a quality that we can confidently support. Additionally, our current
extension inclusion mechanism has several problems because we build all
extensions into the primary Compute image: we rebuild the extensions every time
we build the compute image, regardless of whether a rebuild is actually needed,
and the inclusion of these extensions in the image adds a hard dependency on
all supported extensions - thus increasing the image size, and with it the time
it takes to download that image - increasing first start latency.

This RFC proposes a dynamic loading mechanism that solves most of these
problems.

## Summary

`compute_ctl` is made responsible for loading extensions on demand into
the container's file system for dynamically loaded extensions, and will also
make sure that the extensions in `shared_preload_libraries` are downloaded
before the compute node starts.

## Components

compute_ctl, PostgreSQL, neon (extension), Compute Host Node, Extension Store

## Requirements

Compute nodes with no extra extensions should not be negatively impacted by
the existence of support for many extensions.

Installing an extension into PostgreSQL should be easy.

Non-preloaded extensions shouldn't impact startup latency.

Uninstalled extensions shouldn't impact query latency.

A small latency penalty for dynamically loaded extensions is acceptable in
the first seconds of compute startup, but not in steady-state operations.

## Proposed implementation

### On-demand, JIT-loading of extensions

Before postgres starts, we download:
- control files for all extensions available to that compute node;
- all `shared_preload_libraries`.

After postgres is running, `compute_ctl` listens for requests to load files.
When PostgreSQL requests a file, `compute_ctl` downloads it (see the request
sketch below).

PostgreSQL requests files in the following cases:
- When loading a preload library set in `local_preload_libraries`
- When explicitly loading a library with `LOAD`
- When creating an extension with `CREATE EXTENSION` (download the SQL scripts, plus optional extension data files and library files)
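
To make the request flow concrete, here is a minimal, illustrative sketch of a
client hitting the `compute_ctl` extension route; this is not the actual neon
code, the port and the requested file name are assumptions, and the HTTP client
is hyper 0.14-style:

```
use hyper::{Body, Client, Method, Request};

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Hypothetical compute_ctl address; the real port comes from configuration.
    // `?is_library=true` marks a shared-library request, as handled by the
    // `/extension_server/` route in compute_ctl's HTTP server.
    let uri = "http://127.0.0.1:3080/extension_server/anon?is_library=true";
    let req = Request::builder()
        .method(Method::POST)
        .uri(uri)
        .body(Body::empty())?;

    let resp = Client::new().request(req).await?;
    // The route answers "OK" on success, or an error text with a 4xx/5xx status.
    println!("download status: {}", resp.status());
    Ok(())
}
```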

#### Summary

Pros:
- Startup is only as slow as it takes to load all (shared_)preload_libraries
- Supports BYO Extension

Cons:
- O(sizeof(extensions)) IO requirement for loading all extensions.

### Alternative solutions

1. Allow users to add their extensions to the base image

Pros:
- Easy to deploy

Cons:
- Doesn't scale - first start size is dependent on image size;
- All extensions are shared across all users: it doesn't allow users to
  bring their own restrictively licensed extensions

2. Bring Your Own compute image

Pros:
- Still easy to deploy
- User can bring their own patched version of PostgreSQL

Cons:
- First start latency is O(sizeof(extensions image))
- Warm instance pool for skipping pod schedule latency is not feasible with
  O(n) custom images
- Support channels are difficult to manage

3. Download all user extensions in bulk on compute start

Pros:
- Easy to deploy
- No startup latency issues for "clean" users.
- Warm instance pool for skipping pod schedule latency is possible

Cons:
- Downloading all extensions in advance takes a lot of time, so startup
  latency suffers

4. Store user's extensions in persistent storage

Pros:
- Easy to deploy
- No startup latency issues
- Warm instance pool for skipping pod schedule latency is possible

Cons:
- EC2 instances have only a limited number of attachments shared between EBS
  volumes, direct-attached NVMe drives, and ENIs.
- Compute instance migration isn't trivially solved for EBS mounts (e.g.
  the device is unavailable whilst moving the mount between instances).
- EBS can only mount on one instance at a time (except the expensive IO2
  device type).

5. Store user's extensions in a network drive

Pros:
- Easy to deploy
- Few startup latency issues
- Warm instance pool for skipping pod schedule latency is possible

Cons:
- We'd need networked drives, and a lot of them, which would store many
  duplicate extensions.
- **UNCHECKED:** Compute instance migration may not work nicely with
  networked IOs

### Idea extensions

The extension store does not have to be S3 directly, but could be a Node-local
caching service on top of S3. This would reduce the load on the network for
popular extensions.

## Extension Storage implementation

The layout of the S3 bucket is as follows:
```
5615610098 // this is an extension build number
├── v14
│   ├── extensions
│   │   ├── anon.tar.zst
│   │   └── embedding.tar.zst
│   └── ext_index.json
└── v15
    ├── extensions
    │   ├── anon.tar.zst
    │   └── embedding.tar.zst
    └── ext_index.json
5615261079
├── v14
│   ├── extensions
│   │   └── anon.tar.zst
│   └── ext_index.json
└── v15
    ├── extensions
    │   └── anon.tar.zst
    └── ext_index.json
5623261088
├── v14
│   ├── extensions
│   │   └── embedding.tar.zst
│   └── ext_index.json
└── v15
    ├── extensions
    │   └── embedding.tar.zst
    └── ext_index.json
```

Note that the build number cannot be part of the per-compute prefix because we
might need extensions from other build numbers.

`ext_index.json` stores the control files and the location of the extension
archives. It also stores a list of public extensions and a `library_index`.

We don't need to duplicate `extension.tar.zst` files; we only need to upload a
new one when the extension is updated. (Currently we just upload on every build
anyway; hopefully that will change at some point.)

*Access* is controlled by the compute spec.

More specifically, here is an example `ext_index.json`:
```
{
    "public_extensions": [
        "anon",
        "pg_buffercache"
    ],
    "library_index": {
        "anon": "anon",
        "pg_buffercache": "pg_buffercache"
        // for more complex extensions like postgis
        // we might have something like:
        // address_standardizer: postgis
        // postgis_tiger: postgis
    },
    "extension_data": {
        "pg_buffercache": {
            "control_data": {
                "pg_buffercache.control": "# pg_buffercache extension \ncomment = 'examine the shared buffer cache' \ndefault_version = '1.3' \nmodule_pathname = '$libdir/pg_buffercache' \nrelocatable = true \ntrusted=true"
            },
            "archive_path": "5670669815/v14/extensions/pg_buffercache.tar.zst"
        },
        "anon": {
            "control_data": {
                "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
            },
            "archive_path": "5670669815/v14/extensions/anon.tar.zst"
        }
    }
}
```
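
For illustration only, the index above could be modeled with serde types along
these lines; the field names mirror the example, but the actual types used by
`compute_ctl` may differ:

```
use std::collections::HashMap;

use serde::Deserialize;

#[derive(Deserialize)]
struct ExtIndex {
    /// Extensions every compute may download.
    public_extensions: Vec<String>,
    /// Maps a library name (as requested via LOAD or *_preload_libraries)
    /// to the extension that ships it, e.g. "address_standardizer" -> "postgis".
    library_index: HashMap<String, String>,
    /// Per-extension control file contents plus the archive to fetch on demand.
    extension_data: HashMap<String, ExtensionData>,
}

#[derive(Deserialize)]
struct ExtensionData {
    control_data: HashMap<String, String>,
    archive_path: String,
}

fn main() -> Result<(), serde_json::Error> {
    // The `//` comments in the example above are explanatory only; the real
    // index file is plain JSON.
    let raw = r#"{"public_extensions":[],"library_index":{},"extension_data":{}}"#;
    let index: ExtIndex = serde_json::from_str(raw)?;
    println!("{} public extensions", index.public_extensions.len());
    Ok(())
}
```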

### How to add a new extension to the Extension Storage?

Simply upload the build artifacts to the S3 bucket.
We will implement a CI step for this, splitting it off from the
compute-node-image build.

### How do we deal with extension versions and updates?

Currently, we rebuild extensions on every compute-node-image build and store them in the `<build-version>` prefix.
This is needed to ensure that `/share` and `/lib` files are in sync.

For extension updates, we rely on the PostgreSQL extension versioning mechanism (SQL update scripts) and on extension authors not breaking backwards compatibility within one major version of PostgreSQL.

### Alternatives

For extensions written in trusted languages we can also adopt the
`dbdev` PostgreSQL Package Manager, based on `pg_tle` by Supabase.
This would increase the number of supported extensions and decrease the amount of work required to support them.
@@ -68,14 +68,46 @@ where
|
||||
/// Response of the /metrics.json API
|
||||
#[derive(Clone, Debug, Default, Serialize)]
|
||||
pub struct ComputeMetrics {
|
||||
/// Time spent waiting in pool
|
||||
pub wait_for_spec_ms: u64,
|
||||
pub sync_safekeepers_ms: u64,
|
||||
|
||||
/// Time spent checking if safekeepers are synced
|
||||
pub sync_sk_check_ms: u64,
|
||||
|
||||
/// Time spent syncing safekeepers (walproposer.c).
|
||||
/// In most cases this should be zero.
|
||||
pub sync_safekeepers_ms: u64,
|
||||
|
||||
/// Time it took to establish a pg connection to the pageserver.
|
||||
/// This is two roundtrips, so it's a good proxy for compute-pageserver
|
||||
/// latency. The latency is usually 0.2ms, but it's not safe to assume
|
||||
/// that.
|
||||
pub pageserver_connect_micros: u64,
|
||||
|
||||
/// Time to get basebackup from pageserver and write it to disk.
|
||||
pub basebackup_ms: u64,
|
||||
|
||||
/// Compressed size of basebackup received.
|
||||
pub basebackup_bytes: u64,
|
||||
|
||||
/// Time spent starting postgres. This includes initialization of shared
|
||||
/// buffers, preloading extensions, and other pg operations.
|
||||
pub start_postgres_ms: u64,
|
||||
|
||||
/// Time spent applying pg catalog updates that were made in the console
|
||||
/// UI. This should be 0 when startup time matters, since cplane tries
|
||||
/// to do these updates eagerly, and passes the skip_pg_catalog_updates
|
||||
/// when it's safe to skip this step.
|
||||
pub config_ms: u64,
|
||||
|
||||
/// Total time, from when we receive the spec to when we're ready to take
|
||||
/// pg connections.
|
||||
pub total_startup_ms: u64,
|
||||
pub load_ext_ms: u64,
|
||||
pub num_ext_downloaded: u64,
|
||||
pub largest_ext_size: u64, // these are measured in bytes
|
||||
pub total_ext_download_size: u64,
|
||||
pub prep_extensions_ms: u64,
|
||||
}
|
||||
|
||||
/// Response of the `/computes/{compute_id}/spec` control-plane API.
|
||||
|
||||
@@ -60,6 +60,9 @@ pub struct ComputeSpec {
|
||||
/// If set, 'storage_auth_token' is used as the password to authenticate to
|
||||
/// the pageserver and safekeepers.
|
||||
pub storage_auth_token: Option<String>,
|
||||
|
||||
// list of prefixes to search for custom extensions in remote extension storage
|
||||
pub custom_extensions: Option<Vec<String>>,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
|
||||
@@ -57,7 +57,7 @@ pub struct Event<Extra> {
|
||||
pub extra: Extra,
|
||||
}
|
||||
|
||||
pub fn idempotency_key(node_id: String) -> String {
|
||||
pub fn idempotency_key(node_id: &str) -> String {
|
||||
format!(
|
||||
"{}-{}-{:04}",
|
||||
Utc::now(),
|
||||
@@ -71,6 +71,6 @@ pub const CHUNK_SIZE: usize = 1000;
|
||||
// Just a wrapper around a slice of events
|
||||
// to serialize it as `{"events" : [ ] }
|
||||
#[derive(serde::Serialize)]
|
||||
pub struct EventChunk<'a, T> {
|
||||
pub events: &'a [T],
|
||||
pub struct EventChunk<'a, T: Clone> {
|
||||
pub events: std::borrow::Cow<'a, [T]>,
|
||||
}
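// Note on the Cow change (illustrative, not part of the diff): Cow<'a, [T]>
// lets callers hand over either a borrowed slice (zero-copy, the common case)
// or an owned Vec when the events must outlive the borrow, e.g.
//   EventChunk { events: std::borrow::Cow::Borrowed(&events[..]) }
//   EventChunk { events: std::borrow::Cow::Owned(events) }
// The added `T: Clone` bound is what makes `[T]: ToOwned`, which Cow requires.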
|
||||
|
||||
@@ -145,6 +145,13 @@ pub const XLH_INSERT_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
|
||||
pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
|
||||
pub const XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED: u8 = (1 << 1) as u8;
|
||||
pub const XLH_DELETE_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
|
||||
pub const XLH_INSERT_STORE_CID: u8 = (1 << 7) as u8;
|
||||
pub const XLH_UPDATE_STORE_CID: u8 = (1 << 7) as u8;
|
||||
pub const XLH_DELETE_STORE_CID: u8 = (1 << 7) as u8;
|
||||
pub const XLH_LOCK_STORE_CID: u8 = (1 << 7) as u8;
|
||||
|
||||
pub const SIZE_OF_HEAP_LOCK: usize = 14;
|
||||
pub const SIZE_OF_HEAP_DELETE: usize = 14;
|
||||
|
||||
// From replication/message.h
|
||||
pub const XLOG_LOGICAL_MESSAGE: u8 = 0x00;
|
||||
|
||||
@@ -20,6 +20,7 @@ tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
|
||||
tokio-util.workspace = true
|
||||
toml_edit.workspace = true
|
||||
tracing.workspace = true
|
||||
scopeguard.workspace = true
|
||||
metrics.workspace = true
|
||||
utils.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
|
||||
@@ -65,6 +65,10 @@ impl RemotePath {
|
||||
Ok(Self(relative_path.to_path_buf()))
|
||||
}
|
||||
|
||||
pub fn from_string(relative_path: &str) -> anyhow::Result<Self> {
|
||||
Self::new(Path::new(relative_path))
|
||||
}
|
||||
|
||||
pub fn with_base(&self, base_path: &Path) -> PathBuf {
|
||||
base_path.join(&self.0)
|
||||
}
|
||||
@@ -190,6 +194,20 @@ pub enum GenericRemoteStorage {
|
||||
}
|
||||
|
||||
impl GenericRemoteStorage {
|
||||
// A function for listing all the files in a "directory"
|
||||
// Example:
|
||||
// list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
|
||||
pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.list_files(folder).await,
|
||||
Self::AwsS3(s) => s.list_files(folder).await,
|
||||
Self::Unreliable(s) => s.list_files(folder).await,
|
||||
}
|
||||
}
|
||||
|
||||
// lists common *prefixes*, if any, of files
|
||||
// Example:
|
||||
// list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
|
||||
pub async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
@@ -201,14 +219,6 @@ impl GenericRemoteStorage {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.list_files(folder).await,
|
||||
Self::AwsS3(s) => s.list_files(folder).await,
|
||||
Self::Unreliable(s) => s.list_files(folder).await,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn upload(
|
||||
&self,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
|
||||
@@ -10,6 +10,7 @@ use anyhow::Context;
|
||||
use aws_config::{
|
||||
environment::credentials::EnvironmentVariableCredentialsProvider,
|
||||
imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain,
|
||||
provider_config::ProviderConfig, web_identity_token::WebIdentityTokenCredentialsProvider,
|
||||
};
|
||||
use aws_credential_types::cache::CredentialsCache;
|
||||
use aws_sdk_s3::{
|
||||
@@ -22,6 +23,7 @@ use aws_sdk_s3::{
|
||||
};
|
||||
use aws_smithy_http::body::SdkBody;
|
||||
use hyper::Body;
|
||||
use scopeguard::ScopeGuard;
|
||||
use tokio::{
|
||||
io::{self, AsyncRead},
|
||||
sync::Semaphore,
|
||||
@@ -36,82 +38,9 @@ use crate::{
|
||||
|
||||
const MAX_DELETE_OBJECTS_REQUEST_SIZE: usize = 1000;
|
||||
|
||||
pub(super) mod metrics {
|
||||
use metrics::{register_int_counter_vec, IntCounterVec};
|
||||
use once_cell::sync::Lazy;
|
||||
pub(super) mod metrics;
|
||||
|
||||
static S3_REQUESTS_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"remote_storage_s3_requests_count",
|
||||
"Number of s3 requests of particular type",
|
||||
&["request_type"],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static S3_REQUESTS_FAIL_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"remote_storage_s3_failures_count",
|
||||
"Number of failed s3 requests of particular type",
|
||||
&["request_type"],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub fn inc_get_object() {
|
||||
S3_REQUESTS_COUNT.with_label_values(&["get_object"]).inc();
|
||||
}
|
||||
|
||||
pub fn inc_get_object_fail() {
|
||||
S3_REQUESTS_FAIL_COUNT
|
||||
.with_label_values(&["get_object"])
|
||||
.inc();
|
||||
}
|
||||
|
||||
pub fn inc_put_object() {
|
||||
S3_REQUESTS_COUNT.with_label_values(&["put_object"]).inc();
|
||||
}
|
||||
|
||||
pub fn inc_put_object_fail() {
|
||||
S3_REQUESTS_FAIL_COUNT
|
||||
.with_label_values(&["put_object"])
|
||||
.inc();
|
||||
}
|
||||
|
||||
pub fn inc_delete_object() {
|
||||
S3_REQUESTS_COUNT
|
||||
.with_label_values(&["delete_object"])
|
||||
.inc();
|
||||
}
|
||||
|
||||
pub fn inc_delete_objects(count: u64) {
|
||||
S3_REQUESTS_COUNT
|
||||
.with_label_values(&["delete_object"])
|
||||
.inc_by(count);
|
||||
}
|
||||
|
||||
pub fn inc_delete_object_fail() {
|
||||
S3_REQUESTS_FAIL_COUNT
|
||||
.with_label_values(&["delete_object"])
|
||||
.inc();
|
||||
}
|
||||
|
||||
pub fn inc_delete_objects_fail(count: u64) {
|
||||
S3_REQUESTS_FAIL_COUNT
|
||||
.with_label_values(&["delete_object"])
|
||||
.inc_by(count);
|
||||
}
|
||||
|
||||
pub fn inc_list_objects() {
|
||||
S3_REQUESTS_COUNT.with_label_values(&["list_objects"]).inc();
|
||||
}
|
||||
|
||||
pub fn inc_list_objects_fail() {
|
||||
S3_REQUESTS_FAIL_COUNT
|
||||
.with_label_values(&["list_objects"])
|
||||
.inc();
|
||||
}
|
||||
}
|
||||
use self::metrics::{AttemptOutcome, RequestKind};
|
||||
|
||||
/// AWS S3 storage.
|
||||
pub struct S3Bucket {
|
||||
@@ -139,18 +68,29 @@ impl S3Bucket {
|
||||
aws_config.bucket_name
|
||||
);
|
||||
|
||||
let region = Some(Region::new(aws_config.bucket_region.clone()));
|
||||
|
||||
let credentials_provider = {
|
||||
// uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
|
||||
CredentialsProviderChain::first_try(
|
||||
"env",
|
||||
EnvironmentVariableCredentialsProvider::new(),
|
||||
)
|
||||
// uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
|
||||
// needed to access remote extensions bucket
|
||||
.or_else("token", {
|
||||
let provider_conf = ProviderConfig::without_region().with_region(region.clone());
|
||||
|
||||
WebIdentityTokenCredentialsProvider::builder()
|
||||
.configure(&provider_conf)
|
||||
.build()
|
||||
})
|
||||
// uses imds v2
|
||||
.or_else("imds", ImdsCredentialsProvider::builder().build())
|
||||
};
|
||||
|
||||
let mut config_builder = Config::builder()
|
||||
.region(Region::new(aws_config.bucket_region.clone()))
|
||||
.region(region)
|
||||
.credentials_cache(CredentialsCache::lazy())
|
||||
.credentials_provider(credentials_provider);
|
||||
|
||||
@@ -213,16 +153,43 @@ impl S3Bucket {
|
||||
}
|
||||
}
|
||||
|
||||
async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
|
||||
async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
|
||||
let started_at = start_counting_cancelled_wait(kind);
|
||||
let permit = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.expect("semaphore is never closed");
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.wait_seconds
|
||||
.observe_elapsed(kind, started_at);
|
||||
|
||||
permit
|
||||
}
|
||||
|
||||
async fn owned_permit(&self, kind: RequestKind) -> tokio::sync::OwnedSemaphorePermit {
|
||||
let started_at = start_counting_cancelled_wait(kind);
|
||||
let permit = self
|
||||
.concurrency_limiter
|
||||
.clone()
|
||||
.acquire_owned()
|
||||
.await
|
||||
.context("Concurrency limiter semaphore got closed during S3 download")
|
||||
.map_err(DownloadError::Other)?;
|
||||
.expect("semaphore is never closed");
|
||||
|
||||
metrics::inc_get_object();
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.wait_seconds
|
||||
.observe_elapsed(kind, started_at);
|
||||
permit
|
||||
}
|
||||
|
||||
async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
|
||||
let kind = RequestKind::Get;
|
||||
let permit = self.owned_permit(kind).await;
|
||||
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let get_object = self
|
||||
.client
|
||||
@@ -233,26 +200,33 @@ impl S3Bucket {
|
||||
.send()
|
||||
.await;
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
|
||||
if get_object.is_err() {
|
||||
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Err,
|
||||
started_at,
|
||||
);
|
||||
}
|
||||
|
||||
match get_object {
|
||||
Ok(object_output) => {
|
||||
let metadata = object_output.metadata().cloned().map(StorageMetadata);
|
||||
Ok(Download {
|
||||
metadata,
|
||||
download_stream: Box::pin(io::BufReader::new(RatelimitedAsyncRead::new(
|
||||
permit,
|
||||
object_output.body.into_async_read(),
|
||||
download_stream: Box::pin(io::BufReader::new(TimedDownload::new(
|
||||
started_at,
|
||||
RatelimitedAsyncRead::new(permit, object_output.body.into_async_read()),
|
||||
))),
|
||||
})
|
||||
}
|
||||
Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
|
||||
Err(DownloadError::NotFound)
|
||||
}
|
||||
Err(e) => {
|
||||
metrics::inc_get_object_fail();
|
||||
Err(DownloadError::Other(anyhow::anyhow!(
|
||||
"Failed to download S3 object: {e}"
|
||||
)))
|
||||
}
|
||||
Err(e) => Err(DownloadError::Other(
|
||||
anyhow::Error::new(e).context("download s3 object"),
|
||||
)),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -283,6 +257,54 @@ impl<S: AsyncRead> AsyncRead for RatelimitedAsyncRead<S> {
|
||||
}
|
||||
}
|
||||
|
||||
pin_project_lite::pin_project! {
|
||||
/// Times and tracks the outcome of the request.
|
||||
struct TimedDownload<S> {
|
||||
started_at: std::time::Instant,
|
||||
outcome: metrics::AttemptOutcome,
|
||||
#[pin]
|
||||
inner: S
|
||||
}
|
||||
|
||||
impl<S> PinnedDrop for TimedDownload<S> {
|
||||
fn drop(mut this: Pin<&mut Self>) {
|
||||
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: AsyncRead> TimedDownload<S> {
|
||||
fn new(started_at: std::time::Instant, inner: S) -> Self {
|
||||
TimedDownload {
|
||||
started_at,
|
||||
outcome: metrics::AttemptOutcome::Cancelled,
|
||||
inner,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: AsyncRead> AsyncRead for TimedDownload<S> {
|
||||
fn poll_read(
|
||||
self: std::pin::Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
buf: &mut io::ReadBuf<'_>,
|
||||
) -> std::task::Poll<std::io::Result<()>> {
|
||||
let this = self.project();
|
||||
let before = buf.filled().len();
|
||||
let read = std::task::ready!(this.inner.poll_read(cx, buf));
|
||||
|
||||
let read_eof = buf.filled().len() == before;
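// An Ok(()) poll that adds no bytes is AsyncRead's EOF signal, so only then
// do we record the download as having completed successfully.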
|
||||
|
||||
match read {
|
||||
Ok(()) if read_eof => *this.outcome = AttemptOutcome::Ok,
|
||||
Ok(()) => { /* still in progress */ }
|
||||
Err(_) => *this.outcome = AttemptOutcome::Err,
|
||||
}
|
||||
|
||||
std::task::Poll::Ready(read)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for S3Bucket {
|
||||
/// See the doc for `RemoteStorage::list_prefixes`
|
||||
@@ -291,6 +313,8 @@ impl RemoteStorage for S3Bucket {
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
let kind = RequestKind::List;
|
||||
|
||||
// get the passed prefix or if it is not set use prefix_in_bucket value
|
||||
let list_prefix = prefix
|
||||
.map(|p| self.relative_path_to_s3_object(p))
|
||||
@@ -307,15 +331,10 @@ impl RemoteStorage for S3Bucket {
|
||||
let mut document_keys = Vec::new();
|
||||
|
||||
let mut continuation_token = None;
|
||||
loop {
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.context("Concurrency limiter semaphore got closed during S3 list")
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
metrics::inc_list_objects();
|
||||
loop {
|
||||
let _guard = self.permit(kind).await;
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let fetch_response = self
|
||||
.client
|
||||
@@ -327,12 +346,16 @@ impl RemoteStorage for S3Bucket {
|
||||
.set_max_keys(self.max_keys_per_list_response)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
metrics::inc_list_objects_fail();
|
||||
e
|
||||
})
|
||||
.context("Failed to list S3 prefixes")
|
||||
.map_err(DownloadError::Other)?;
|
||||
.map_err(DownloadError::Other);
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &fetch_response, started_at);
|
||||
|
||||
let fetch_response = fetch_response?;
|
||||
|
||||
document_keys.extend(
|
||||
fetch_response
|
||||
@@ -342,10 +365,10 @@ impl RemoteStorage for S3Bucket {
|
||||
.filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
|
||||
);
|
||||
|
||||
match fetch_response.next_continuation_token {
|
||||
Some(new_token) => continuation_token = Some(new_token),
|
||||
continuation_token = match fetch_response.next_continuation_token {
|
||||
Some(new_token) => Some(new_token),
|
||||
None => break,
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
Ok(document_keys)
|
||||
@@ -353,6 +376,8 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
/// See the doc for `RemoteStorage::list_files`
|
||||
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
let kind = RequestKind::List;
|
||||
|
||||
let folder_name = folder
|
||||
.map(|p| self.relative_path_to_s3_object(p))
|
||||
.or_else(|| self.prefix_in_bucket.clone());
|
||||
@@ -361,12 +386,8 @@ impl RemoteStorage for S3Bucket {
|
||||
let mut continuation_token = None;
|
||||
let mut all_files = vec![];
|
||||
loop {
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.context("Concurrency limiter semaphore got closed during S3 list_files")?;
|
||||
metrics::inc_list_objects();
|
||||
let _guard = self.permit(kind).await;
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let response = self
|
||||
.client
|
||||
@@ -377,11 +398,14 @@ impl RemoteStorage for S3Bucket {
|
||||
.set_max_keys(self.max_keys_per_list_response)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
metrics::inc_list_objects_fail();
|
||||
e
|
||||
})
|
||||
.context("Failed to list files in S3 bucket")?;
|
||||
.context("Failed to list files in S3 bucket");
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &response, started_at);
|
||||
|
||||
let response = response?;
|
||||
|
||||
for object in response.contents().unwrap_or_default() {
|
||||
let object_path = object.key().expect("response does not contain a key");
|
||||
@@ -403,18 +427,16 @@ impl RemoteStorage for S3Bucket {
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
) -> anyhow::Result<()> {
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.context("Concurrency limiter semaphore got closed during S3 upload")?;
|
||||
let kind = RequestKind::Put;
|
||||
let _guard = self.permit(kind).await;
|
||||
|
||||
metrics::inc_put_object();
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let body = Body::wrap_stream(ReaderStream::new(from));
|
||||
let bytes_stream = ByteStream::new(SdkBody::from(body));
|
||||
|
||||
self.client
|
||||
let res = self
|
||||
.client
|
||||
.put_object()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.key(self.relative_path_to_s3_object(to))
|
||||
@@ -422,11 +444,15 @@ impl RemoteStorage for S3Bucket {
|
||||
.content_length(from_size_bytes.try_into()?)
|
||||
.body(bytes_stream)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
metrics::inc_put_object_fail();
|
||||
e
|
||||
})?;
|
||||
.await;
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &res, started_at);
|
||||
|
||||
res?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -463,11 +489,8 @@ impl RemoteStorage for S3Bucket {
|
||||
.await
|
||||
}
|
||||
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.context("Concurrency limiter semaphore got closed during S3 delete")?;
|
||||
let kind = RequestKind::Delete;
|
||||
let _guard = self.permit(kind).await;
|
||||
|
||||
let mut delete_objects = Vec::with_capacity(paths.len());
|
||||
for path in paths {
|
||||
@@ -478,7 +501,7 @@ impl RemoteStorage for S3Bucket {
|
||||
}
|
||||
|
||||
for chunk in delete_objects.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) {
|
||||
metrics::inc_delete_objects(chunk.len() as u64);
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let resp = self
|
||||
.client
|
||||
@@ -488,10 +511,17 @@ impl RemoteStorage for S3Bucket {
|
||||
.send()
|
||||
.await;
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &resp, started_at);
|
||||
|
||||
match resp {
|
||||
Ok(resp) => {
|
||||
metrics::BUCKET_METRICS
|
||||
.deleted_objects_total
|
||||
.inc_by(chunk.len() as u64);
|
||||
if let Some(errors) = resp.errors {
|
||||
metrics::inc_delete_objects_fail(errors.len() as u64);
|
||||
return Err(anyhow::format_err!(
|
||||
"Failed to delete {} objects",
|
||||
errors.len()
|
||||
@@ -499,7 +529,6 @@ impl RemoteStorage for S3Bucket {
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
metrics::inc_delete_objects_fail(chunk.len() as u64);
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
@@ -508,28 +537,33 @@ impl RemoteStorage for S3Bucket {
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.context("Concurrency limiter semaphore got closed during S3 delete")?;
|
||||
|
||||
metrics::inc_delete_object();
|
||||
|
||||
self.client
|
||||
.delete_object()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.key(self.relative_path_to_s3_object(path))
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
metrics::inc_delete_object_fail();
|
||||
e
|
||||
})?;
|
||||
Ok(())
|
||||
let paths = std::array::from_ref(path);
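// std::array::from_ref (above) turns &RemotePath into &[RemotePath; 1], so
// the single-object delete below reuses the batched delete_objects path
// (and its metrics) instead of duplicating the request logic.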
|
||||
self.delete_objects(paths).await
|
||||
}
|
||||
}
|
||||
|
||||
/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
|
||||
fn start_counting_cancelled_wait(
|
||||
kind: RequestKind,
|
||||
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
|
||||
scopeguard::guard_on_success(std::time::Instant::now(), move |_| {
|
||||
metrics::BUCKET_METRICS.cancelled_waits.get(kind).inc()
|
||||
})
|
||||
}
|
||||
|
||||
/// On drop (cancellation) add time to [`metrics::BucketMetrics::req_seconds`].
|
||||
fn start_measuring_requests(
|
||||
kind: RequestKind,
|
||||
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
|
||||
scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| {
|
||||
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Cancelled,
|
||||
started_at,
|
||||
)
|
||||
})
|
||||
}
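// Both helpers above rely on scopeguard's "defuse" idiom: the closure only
// runs if the guard is dropped without ScopeGuard::into_inner(), which is
// exactly what happens when the surrounding future is cancelled mid-await.
// On the happy path the caller defuses the guard and records the elapsed
// time itself, together with the real request outcome.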
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::num::NonZeroUsize;
|
||||
|
||||
191
libs/remote_storage/src/s3_bucket/metrics.rs
Normal file
@@ -0,0 +1,191 @@
|
||||
use metrics::{
|
||||
register_histogram_vec, register_int_counter, register_int_counter_vec, Histogram, IntCounter,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
pub(super) static BUCKET_METRICS: Lazy<BucketMetrics> = Lazy::new(Default::default);
|
||||
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub(super) enum RequestKind {
|
||||
Get = 0,
|
||||
Put = 1,
|
||||
Delete = 2,
|
||||
List = 3,
|
||||
}
|
||||
|
||||
use RequestKind::*;
|
||||
|
||||
impl RequestKind {
|
||||
const fn as_str(&self) -> &'static str {
|
||||
match self {
|
||||
Get => "get_object",
|
||||
Put => "put_object",
|
||||
Delete => "delete_object",
|
||||
List => "list_objects",
|
||||
}
|
||||
}
|
||||
const fn as_index(&self) -> usize {
|
||||
*self as usize
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) struct RequestTyped<C>([C; 4]);
|
||||
|
||||
impl<C> RequestTyped<C> {
|
||||
pub(super) fn get(&self, kind: RequestKind) -> &C {
|
||||
&self.0[kind.as_index()]
|
||||
}
|
||||
|
||||
fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
|
||||
use RequestKind::*;
|
||||
let mut it = [Get, Put, Delete, List].into_iter();
|
||||
let arr = std::array::from_fn::<C, 4, _>(|index| {
|
||||
let next = it.next().unwrap();
|
||||
assert_eq!(index, next.as_index());
|
||||
f(next)
|
||||
});
|
||||
|
||||
if let Some(next) = it.next() {
|
||||
panic!("unexpected {next:?}");
|
||||
}
|
||||
|
||||
RequestTyped(arr)
|
||||
}
|
||||
}
|
||||
|
||||
impl RequestTyped<Histogram> {
|
||||
pub(super) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) {
|
||||
self.get(kind).observe(started_at.elapsed().as_secs_f64())
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) struct PassFailCancelledRequestTyped<C> {
|
||||
success: RequestTyped<C>,
|
||||
fail: RequestTyped<C>,
|
||||
cancelled: RequestTyped<C>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub(super) enum AttemptOutcome {
|
||||
Ok,
|
||||
Err,
|
||||
Cancelled,
|
||||
}
|
||||
|
||||
impl<T, E> From<&Result<T, E>> for AttemptOutcome {
|
||||
fn from(value: &Result<T, E>) -> Self {
|
||||
match value {
|
||||
Ok(_) => AttemptOutcome::Ok,
|
||||
Err(_) => AttemptOutcome::Err,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AttemptOutcome {
|
||||
pub(super) fn as_str(&self) -> &'static str {
|
||||
match self {
|
||||
AttemptOutcome::Ok => "ok",
|
||||
AttemptOutcome::Err => "err",
|
||||
AttemptOutcome::Cancelled => "cancelled",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<C> PassFailCancelledRequestTyped<C> {
|
||||
pub(super) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C {
|
||||
let target = match outcome {
|
||||
AttemptOutcome::Ok => &self.success,
|
||||
AttemptOutcome::Err => &self.fail,
|
||||
AttemptOutcome::Cancelled => &self.cancelled,
|
||||
};
|
||||
target.get(kind)
|
||||
}
|
||||
|
||||
fn build_with(mut f: impl FnMut(RequestKind, AttemptOutcome) -> C) -> Self {
|
||||
let success = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Ok));
|
||||
let fail = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Err));
|
||||
let cancelled = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Cancelled));
|
||||
|
||||
PassFailCancelledRequestTyped {
|
||||
success,
|
||||
fail,
|
||||
cancelled,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PassFailCancelledRequestTyped<Histogram> {
|
||||
pub(super) fn observe_elapsed(
|
||||
&self,
|
||||
kind: RequestKind,
|
||||
outcome: impl Into<AttemptOutcome>,
|
||||
started_at: std::time::Instant,
|
||||
) {
|
||||
self.get(kind, outcome.into())
|
||||
.observe(started_at.elapsed().as_secs_f64())
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) struct BucketMetrics {
|
||||
/// Full request duration until successful completion, error or cancellation.
|
||||
pub(super) req_seconds: PassFailCancelledRequestTyped<Histogram>,
|
||||
/// Total amount of seconds waited on queue.
|
||||
pub(super) wait_seconds: RequestTyped<Histogram>,
|
||||
|
||||
/// Track how many semaphore awaits were cancelled per request type.
|
||||
///
|
||||
/// This is in case cancellations are happening more than expected.
|
||||
pub(super) cancelled_waits: RequestTyped<IntCounter>,
|
||||
|
||||
/// Total amount of deleted objects in batches or single requests.
|
||||
pub(super) deleted_objects_total: IntCounter,
|
||||
}
|
||||
|
||||
impl Default for BucketMetrics {
|
||||
fn default() -> Self {
|
||||
let buckets = [0.01, 0.10, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0];
|
||||
|
||||
let req_seconds = register_histogram_vec!(
|
||||
"remote_storage_s3_request_seconds",
|
||||
"Seconds to complete a request",
|
||||
&["request_type", "result"],
|
||||
buckets.to_vec(),
|
||||
)
|
||||
.unwrap();
|
||||
let req_seconds = PassFailCancelledRequestTyped::build_with(|kind, outcome| {
|
||||
req_seconds.with_label_values(&[kind.as_str(), outcome.as_str()])
|
||||
});
|
||||
|
||||
let wait_seconds = register_histogram_vec!(
|
||||
"remote_storage_s3_wait_seconds",
|
||||
"Seconds rate limited",
|
||||
&["request_type"],
|
||||
buckets.to_vec(),
|
||||
)
|
||||
.unwrap();
|
||||
let wait_seconds =
|
||||
RequestTyped::build_with(|kind| wait_seconds.with_label_values(&[kind.as_str()]));
|
||||
|
||||
let cancelled_waits = register_int_counter_vec!(
|
||||
"remote_storage_s3_cancelled_waits_total",
|
||||
"Times a semaphore wait has been cancelled per request type",
|
||||
&["request_type"],
|
||||
)
|
||||
.unwrap();
|
||||
let cancelled_waits =
|
||||
RequestTyped::build_with(|kind| cancelled_waits.with_label_values(&[kind.as_str()]));
|
||||
|
||||
let deleted_objects_total = register_int_counter!(
|
||||
"remote_storage_s3_deleted_objects_total",
|
||||
"Amount of deleted objects in total",
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
Self {
|
||||
req_seconds,
|
||||
wait_seconds,
|
||||
cancelled_waits,
|
||||
deleted_objects_total,
|
||||
}
|
||||
}
|
||||
}
|
||||
188
libs/utils/src/backoff.rs
Normal file
@@ -0,0 +1,188 @@
|
||||
use std::fmt::{Debug, Display};
|
||||
|
||||
use futures::Future;
|
||||
|
||||
pub const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
|
||||
pub const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
|
||||
|
||||
pub async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
|
||||
let backoff_duration_seconds =
|
||||
exponential_backoff_duration_seconds(n, base_increment, max_seconds);
|
||||
if backoff_duration_seconds > 0.0 {
|
||||
tracing::info!(
|
||||
"Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
|
||||
);
|
||||
tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await;
|
||||
}
|
||||
}
|
||||
|
||||
pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
|
||||
if n == 0 {
|
||||
0.0
|
||||
} else {
|
||||
(1.0 + base_increment).powf(f64::from(n)).min(max_seconds)
|
||||
}
|
||||
}
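// Worked example with the defaults above (base_increment = 0.1, max = 3.0):
// n = 0 -> 0.0 s, n = 1 -> 1.1 s, n = 2 -> 1.21 s, n = 5 -> ~1.61 s,
// n = 10 -> ~2.59 s; from n = 12 onwards 1.1^n exceeds 3.0 and is capped at
// DEFAULT_MAX_BACKOFF_SECONDS.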
|
||||
|
||||
/// retries passed operation until one of the following conditions are met:
|
||||
/// Encountered error is considered as permanent (non-retryable)
|
||||
/// Retries have been exhausted.
|
||||
/// `is_permanent` closure should be used to provide distinction between permanent/non-permanent errors
|
||||
/// When attempts cross `warn_threshold` function starts to emit log warnings.
|
||||
/// `description` argument is added to log messages. Its value should identify what the `op` is doing.
|
||||
pub async fn retry<T, O, F, E>(
|
||||
mut op: O,
|
||||
is_permanent: impl Fn(&E) -> bool,
|
||||
warn_threshold: u32,
|
||||
max_retries: u32,
|
||||
description: &str,
|
||||
) -> Result<T, E>
|
||||
where
|
||||
// Not std::error::Error because anyhow::Error doesnt implement it.
|
||||
// For context see https://github.com/dtolnay/anyhow/issues/63
|
||||
E: Display + Debug,
|
||||
O: FnMut() -> F,
|
||||
F: Future<Output = Result<T, E>>,
|
||||
{
|
||||
let mut attempts = 0;
|
||||
loop {
|
||||
let result = op().await;
|
||||
match result {
|
||||
Ok(_) => {
|
||||
if attempts > 0 {
|
||||
tracing::info!("{description} succeeded after {attempts} retries");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// These are "permanent" errors that should not be retried.
|
||||
Err(ref e) if is_permanent(e) => {
|
||||
return result;
|
||||
}
|
||||
// Assume that any other failure might be transient, and the operation might
|
||||
// succeed if we just keep trying.
|
||||
Err(err) if attempts < warn_threshold => {
|
||||
tracing::info!("{description} failed, will retry (attempt {attempts}): {err:#}");
|
||||
}
|
||||
Err(err) if attempts < max_retries => {
|
||||
tracing::warn!("{description} failed, will retry (attempt {attempts}): {err:#}");
|
||||
}
|
||||
Err(ref err) => {
|
||||
// Operation failed `max_attempts` times. Time to give up.
|
||||
tracing::warn!(
|
||||
"{description} still failed after {attempts} retries, giving up: {err:?}"
|
||||
);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
// sleep and retry
|
||||
exponential_backoff(
|
||||
attempts,
|
||||
DEFAULT_BASE_BACKOFF_SECONDS,
|
||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
)
|
||||
.await;
|
||||
attempts += 1;
|
||||
}
|
||||
}
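// Typical call shape (sketch only; `upload_once` and the `is_retryable`
// predicate are placeholders): logs at info level for the first attempts,
// escalates to warn after `warn_threshold`, and gives up after `max_retries`:
//
//   let res = retry(|| async { upload_once().await },
//                   |e| !e.is_retryable(), 3, 10, "uploading a layer").await;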
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::io;
|
||||
|
||||
use tokio::sync::Mutex;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn backoff_defaults_produce_growing_backoff_sequence() {
|
||||
let mut current_backoff_value = None;
|
||||
|
||||
for i in 0..10_000 {
|
||||
let new_backoff_value = exponential_backoff_duration_seconds(
|
||||
i,
|
||||
DEFAULT_BASE_BACKOFF_SECONDS,
|
||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
);
|
||||
|
||||
if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) {
|
||||
assert!(
|
||||
old_backoff_value <= new_backoff_value,
|
||||
"{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}"
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
assert_eq!(
|
||||
current_backoff_value.expect("Should have produced backoff values to compare"),
|
||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
"Given big enough of retries, backoff should reach its allowed max value"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn retry_always_error() {
|
||||
let count = Mutex::new(0);
|
||||
let err_result = retry(
|
||||
|| async {
|
||||
*count.lock().await += 1;
|
||||
Result::<(), io::Error>::Err(io::Error::from(io::ErrorKind::Other))
|
||||
},
|
||||
|_e| false,
|
||||
1,
|
||||
1,
|
||||
"work",
|
||||
)
|
||||
.await;
|
||||
|
||||
assert!(err_result.is_err());
|
||||
|
||||
assert_eq!(*count.lock().await, 2);
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn retry_ok_after_err() {
|
||||
let count = Mutex::new(0);
|
||||
retry(
|
||||
|| async {
|
||||
let mut locked = count.lock().await;
|
||||
if *locked > 1 {
|
||||
Ok(())
|
||||
} else {
|
||||
*locked += 1;
|
||||
Err(io::Error::from(io::ErrorKind::Other))
|
||||
}
|
||||
},
|
||||
|_e| false,
|
||||
2,
|
||||
2,
|
||||
"work",
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn dont_retry_permanent_errors() {
|
||||
let count = Mutex::new(0);
|
||||
let _ = retry(
|
||||
|| async {
|
||||
let mut locked = count.lock().await;
|
||||
if *locked > 1 {
|
||||
Ok(())
|
||||
} else {
|
||||
*locked += 1;
|
||||
Err(io::Error::from(io::ErrorKind::Other))
|
||||
}
|
||||
},
|
||||
|_e| true,
|
||||
2,
|
||||
2,
|
||||
"work",
|
||||
)
|
||||
.await
|
||||
.unwrap_err();
|
||||
|
||||
assert_eq!(*count.lock().await, 1);
|
||||
}
|
||||
}
|
||||
@@ -111,6 +111,10 @@ pub fn fsync(path: &Path) -> io::Result<()> {
|
||||
.map_err(|e| io::Error::new(e.kind(), format!("Failed to fsync file {path:?}: {e}")))
|
||||
}
|
||||
|
||||
pub async fn fsync_async(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
|
||||
tokio::fs::File::open(path).await?.sync_all().await
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use tempfile::tempdir;
|
||||
|
||||
@@ -24,6 +24,20 @@ pub async fn is_directory_empty(path: impl AsRef<Path>) -> anyhow::Result<bool>
|
||||
Ok(dir.next_entry().await?.is_none())
|
||||
}
|
||||
|
||||
pub async fn list_dir(path: impl AsRef<Path>) -> anyhow::Result<Vec<String>> {
|
||||
let mut dir = tokio::fs::read_dir(&path)
|
||||
.await
|
||||
.context(format!("read_dir({})", path.as_ref().display()))?;
|
||||
|
||||
let mut content = vec![];
|
||||
while let Some(next) = dir.next_entry().await? {
|
||||
let file_name = next.file_name();
|
||||
content.push(file_name.to_string_lossy().to_string());
|
||||
}
|
||||
|
||||
Ok(content)
|
||||
}
|
||||
|
||||
pub fn ignore_not_found(e: io::Error) -> io::Result<()> {
|
||||
if e.kind() == io::ErrorKind::NotFound {
|
||||
Ok(())
|
||||
@@ -43,7 +57,7 @@ where
|
||||
mod test {
|
||||
use std::path::PathBuf;
|
||||
|
||||
use crate::fs_ext::is_directory_empty;
|
||||
use crate::fs_ext::{is_directory_empty, list_dir};
|
||||
|
||||
use super::ignore_absent_files;
|
||||
|
||||
@@ -109,4 +123,25 @@ mod test {
|
||||
|
||||
assert!(!file_path.exists());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn list_dir_works() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let dir_path = dir.path();
|
||||
|
||||
assert!(list_dir(dir_path).await.unwrap().is_empty());
|
||||
|
||||
let file_path: PathBuf = dir_path.join("testfile");
|
||||
let _ = std::fs::File::create(&file_path).unwrap();
|
||||
|
||||
assert_eq!(&list_dir(dir_path).await.unwrap(), &["testfile"]);
|
||||
|
||||
let another_dir_path: PathBuf = dir_path.join("testdir");
|
||||
std::fs::create_dir(another_dir_path).unwrap();
|
||||
|
||||
let expected = &["testdir", "testfile"];
|
||||
let mut actual = list_dir(dir_path).await.unwrap();
|
||||
actual.sort();
|
||||
assert_eq!(actual, expected);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
//! `utils` is intended to be a place to put code that is shared
|
||||
//! between other crates in this repository.
|
||||
|
||||
pub mod backoff;
|
||||
|
||||
/// `Lsn` type implements common tasks on Log Sequence Numbers
|
||||
pub mod lsn;
|
||||
/// SeqWait allows waiting for a future sequence number to arrive
|
||||
|
||||
@@ -23,6 +23,7 @@
|
||||
//! <https://grafana.com/tutorials/build-a-panel-plugin/>
|
||||
use anyhow::Result;
|
||||
use pageserver::repository::Key;
|
||||
use pageserver::METADATA_FILE_NAME;
|
||||
use std::cmp::Ordering;
|
||||
use std::io::{self, BufRead};
|
||||
use std::path::PathBuf;
|
||||
@@ -71,6 +72,10 @@ pub fn main() -> Result<()> {
|
||||
let line = PathBuf::from_str(&line).unwrap();
|
||||
let filename = line.file_name().unwrap();
|
||||
let filename = filename.to_str().unwrap();
|
||||
if filename == METADATA_FILE_NAME {
|
||||
// Don't try and parse "metadata" like a key-lsn range
|
||||
continue;
|
||||
}
|
||||
let range = parse_filename(filename);
|
||||
ranges.push(range);
|
||||
}
|
||||
|
||||
@@ -107,23 +107,25 @@ async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
|
||||
// min-heap (reserve space for one more element added before eviction)
|
||||
let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
|
||||
let mut prev_key: Option<Key> = None;
|
||||
tree_reader.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
|key, _value| {
|
||||
let curr = Key::from_slice(&key[..KEY_SIZE]);
|
||||
if let Some(prev) = prev_key {
|
||||
if curr.to_i128() - prev.to_i128() >= MIN_HOLE_LENGTH {
|
||||
heap.push(Hole(prev..curr));
|
||||
if heap.len() > max_holes {
|
||||
heap.pop(); // remove smallest hole
|
||||
tree_reader
|
||||
.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
|key, _value| {
|
||||
let curr = Key::from_slice(&key[..KEY_SIZE]);
|
||||
if let Some(prev) = prev_key {
|
||||
if curr.to_i128() - prev.to_i128() >= MIN_HOLE_LENGTH {
|
||||
heap.push(Hole(prev..curr));
|
||||
if heap.len() > max_holes {
|
||||
heap.pop(); // remove smallest hole
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
prev_key = Some(curr.next());
|
||||
true
|
||||
},
|
||||
)?;
|
||||
prev_key = Some(curr.next());
|
||||
true
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
let mut holes = heap.into_vec();
|
||||
holes.sort_by_key(|hole| hole.0.start);
|
||||
Ok(holes)
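The capped BinaryHeap above works as a bounded min-heap: every push is followed by a pop once the length exceeds max_holes, so the smallest hole is evicted and only the largest ones survive. A standalone sketch of that pattern, illustrative only and not part of this changeset (the real code relies on Hole's own Ord impl rather than Reverse):

use std::cmp::Reverse;
use std::collections::BinaryHeap;

/// Keep only the `max` largest values seen, evicting the smallest element
/// whenever the heap grows past `max`.
fn top_n(values: impl IntoIterator<Item = u64>, max: usize) -> Vec<u64> {
    // `Reverse` turns the default max-heap into a min-heap, so `pop`
    // discards the smallest retained value.
    let mut heap: BinaryHeap<Reverse<u64>> = BinaryHeap::with_capacity(max + 1);
    for v in values {
        heap.push(Reverse(v));
        if heap.len() > max {
            heap.pop(); // remove smallest
        }
    }
    heap.into_iter().map(|Reverse(v)| v).collect()
}

For example, top_n([5u64, 1, 9, 3, 7], 3) keeps 5, 7 and 9.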
@@ -59,18 +59,20 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
|
||||
);
|
||||
// TODO(chi): dedup w/ `delta_layer.rs` by exposing the API.
|
||||
let mut all = vec![];
|
||||
tree_reader.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
|key, value_offset| {
|
||||
let curr = Key::from_slice(&key[..KEY_SIZE]);
|
||||
all.push((curr, BlobRef(value_offset)));
|
||||
true
|
||||
},
|
||||
)?;
|
||||
let mut cursor = BlockCursor::new(&file);
|
||||
tree_reader
|
||||
.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
|key, value_offset| {
|
||||
let curr = Key::from_slice(&key[..KEY_SIZE]);
|
||||
all.push((curr, BlobRef(value_offset)));
|
||||
true
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
let cursor = BlockCursor::new(&file);
|
||||
for (k, v) in all {
|
||||
let value = cursor.read_blob(v.pos())?;
|
||||
let value = cursor.read_blob(v.pos()).await?;
|
||||
println!("key:{} value_len:{}", k, value.len());
|
||||
}
|
||||
// TODO(chi): special handling for last key?
|
||||
|
||||
@@ -9,8 +9,10 @@ use clap::{Arg, ArgAction, Command};
|
||||
use fail::FailScenario;
|
||||
use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
|
||||
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
|
||||
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
|
||||
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tokio::time::Instant;
|
||||
use tracing::*;
|
||||
|
||||
use metrics::set_build_info_metric;
|
||||
@@ -38,8 +40,6 @@ const PID_FILE_NAME: &str = "pageserver.pid";
|
||||
const FEATURES: &[&str] = &[
|
||||
#[cfg(feature = "testing")]
|
||||
"testing",
|
||||
#[cfg(feature = "fail/failpoints")]
|
||||
"fail/failpoints",
|
||||
];
|
||||
|
||||
fn version() -> String {
|
||||
@@ -226,6 +226,19 @@ fn start_pageserver(
|
||||
launch_ts: &'static LaunchTimestamp,
|
||||
conf: &'static PageServerConf,
|
||||
) -> anyhow::Result<()> {
|
||||
// Monotonic time for later calculating startup duration
|
||||
let started_startup_at = Instant::now();
|
||||
|
||||
let startup_checkpoint = move |phase: &str, human_phase: &str| {
|
||||
let elapsed = started_startup_at.elapsed();
|
||||
let secs = elapsed.as_secs_f64();
|
||||
STARTUP_DURATION.with_label_values(&[phase]).set(secs);
|
||||
info!(
|
||||
elapsed_ms = elapsed.as_millis(),
|
||||
"{human_phase} ({secs:.3}s since start)"
|
||||
)
|
||||
};
|
||||
|
||||
// Print version and launch timestamp to the log,
|
||||
// and expose them as prometheus metrics.
|
||||
// A changed version string indicates changed software.
|
||||
@@ -335,6 +348,11 @@ fn start_pageserver(
|
||||
// Set up remote storage client
|
||||
let remote_storage = create_remote_storage_client(conf)?;
|
||||
|
||||
// Up to this point no significant I/O has been done: this should have been fast. Record
|
||||
// duration prior to starting I/O intensive phase of startup.
|
||||
startup_checkpoint("initial", "Starting loading tenants");
|
||||
STARTUP_IS_LOADING.set(1);
|
||||
|
||||
// Startup staging or optimizing:
|
||||
//
|
||||
// We want to minimize downtime for `page_service` connections, and trying not to overload
|
||||
@@ -355,12 +373,11 @@ fn start_pageserver(
|
||||
let order = pageserver::InitializationOrder {
|
||||
initial_tenant_load: Some(init_done_tx),
|
||||
initial_logical_size_can_start: init_done_rx.clone(),
|
||||
initial_logical_size_attempt: init_logical_size_done_tx,
|
||||
initial_logical_size_attempt: Some(init_logical_size_done_tx),
|
||||
background_jobs_can_start: background_jobs_barrier.clone(),
|
||||
};
|
||||
|
||||
// Scan the local 'tenants/' directory and start loading the tenants
|
||||
let init_started_at = std::time::Instant::now();
|
||||
let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
|
||||
|
||||
BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
|
||||
@@ -378,18 +395,13 @@ fn start_pageserver(
|
||||
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed"));
|
||||
|
||||
init_done_rx.wait().await;
|
||||
startup_checkpoint("initial_tenant_load", "Initial load completed");
|
||||
STARTUP_IS_LOADING.set(0);
|
||||
|
||||
// initial logical sizes can now start, as they were waiting on init_done_rx.
|
||||
|
||||
scopeguard::ScopeGuard::into_inner(guard);
|
||||
|
||||
let init_done = std::time::Instant::now();
|
||||
let elapsed = init_done - init_started_at;
|
||||
|
||||
tracing::info!(
|
||||
elapsed_millis = elapsed.as_millis(),
|
||||
"Initial load completed"
|
||||
);
|
||||
|
||||
let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait());
|
||||
|
||||
let timeout = conf.background_task_maximum_delay;
|
||||
@@ -398,12 +410,7 @@ fn start_pageserver(
|
||||
|
||||
let init_sizes_done = match tokio::time::timeout(timeout, &mut init_sizes_done).await {
|
||||
Ok(_) => {
|
||||
let now = std::time::Instant::now();
|
||||
tracing::info!(
|
||||
from_init_done_millis = (now - init_done).as_millis(),
|
||||
from_init_millis = (now - init_started_at).as_millis(),
|
||||
"Initial logical sizes completed"
|
||||
);
|
||||
startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed");
|
||||
None
|
||||
}
|
||||
Err(_) => {
|
||||
@@ -419,6 +426,7 @@ fn start_pageserver(
|
||||
|
||||
// allow background jobs to start
|
||||
drop(background_jobs_can_start);
|
||||
startup_checkpoint("background_jobs_can_start", "Starting background jobs");
|
||||
|
||||
if let Some(init_sizes_done) = init_sizes_done {
|
||||
// ending up here is not a bug; at the latest logical sizes will be queried by
|
||||
@@ -428,14 +436,11 @@ fn start_pageserver(
|
||||
|
||||
scopeguard::ScopeGuard::into_inner(guard);
|
||||
|
||||
let now = std::time::Instant::now();
|
||||
tracing::info!(
|
||||
from_init_done_millis = (now - init_done).as_millis(),
|
||||
from_init_millis = (now - init_started_at).as_millis(),
|
||||
"Initial logical sizes completed after timeout (background jobs already started)"
|
||||
);
|
||||
startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed after timeout (background jobs already started)");
|
||||
|
||||
}
|
||||
|
||||
startup_checkpoint("complete", "Startup complete");
|
||||
};
|
||||
|
||||
async move {
|
||||
|
||||
@@ -31,7 +31,9 @@ use utils::{
|
||||
use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
|
||||
use crate::tenant::config::TenantConf;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
|
||||
use crate::tenant::{
|
||||
TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
|
||||
};
|
||||
use crate::{
|
||||
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
|
||||
TIMELINE_UNINIT_MARK_SUFFIX,
|
||||
@@ -613,6 +615,11 @@ impl PageServerConf {
|
||||
)
|
||||
}
|
||||
|
||||
pub fn tenant_deleted_mark_file_path(&self, tenant_id: &TenantId) -> PathBuf {
|
||||
self.tenant_path(tenant_id)
|
||||
.join(TENANT_DELETED_MARKER_FILE_NAME)
|
||||
}
|
||||
|
||||
pub fn traces_path(&self) -> PathBuf {
|
||||
self.workdir.join("traces")
|
||||
}
|
||||
|
||||
@@ -14,14 +14,16 @@ use reqwest::Url;
|
||||
use serde::Serialize;
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use std::collections::HashMap;
|
||||
use std::time::Duration;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, SystemTime};
|
||||
use tracing::*;
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Debug)]
|
||||
#[derive(Serialize, Debug, Clone, Copy)]
|
||||
struct Ids {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
tenant_id: TenantId,
|
||||
@@ -32,13 +34,13 @@ struct Ids {
|
||||
|
||||
/// Key that uniquely identifies the object, this metric describes.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
pub struct PageserverConsumptionMetricsKey {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: Option<TimelineId>,
|
||||
pub metric: &'static str,
|
||||
struct MetricsKey {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: Option<TimelineId>,
|
||||
metric: &'static str,
|
||||
}
|
||||
|
||||
impl PageserverConsumptionMetricsKey {
|
||||
impl MetricsKey {
|
||||
const fn absolute_values(self) -> AbsoluteValueFactory {
|
||||
AbsoluteValueFactory(self)
|
||||
}
|
||||
@@ -48,18 +50,17 @@ impl PageserverConsumptionMetricsKey {
|
||||
}
|
||||
|
||||
/// Helper type which each individual metric kind can return to produce only absolute values.
|
||||
struct AbsoluteValueFactory(PageserverConsumptionMetricsKey);
|
||||
struct AbsoluteValueFactory(MetricsKey);
|
||||
|
||||
impl AbsoluteValueFactory {
|
||||
fn now(self, val: u64) -> (PageserverConsumptionMetricsKey, (EventType, u64)) {
|
||||
fn at(self, time: DateTime<Utc>, val: u64) -> (MetricsKey, (EventType, u64)) {
|
||||
let key = self.0;
|
||||
let time = Utc::now();
|
||||
(key, (EventType::Absolute { time }, val))
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper type which each individual metric kind can return to produce only incremental values.
|
||||
struct IncrementalValueFactory(PageserverConsumptionMetricsKey);
|
||||
struct IncrementalValueFactory(MetricsKey);
|
||||
|
||||
impl IncrementalValueFactory {
|
||||
#[allow(clippy::wrong_self_convention)]
|
||||
@@ -68,7 +69,7 @@ impl IncrementalValueFactory {
|
||||
prev_end: DateTime<Utc>,
|
||||
up_to: DateTime<Utc>,
|
||||
val: u64,
|
||||
) -> (PageserverConsumptionMetricsKey, (EventType, u64)) {
|
||||
) -> (MetricsKey, (EventType, u64)) {
|
||||
let key = self.0;
|
||||
// cannot assert prev_end < up_to because these are realtime clock based
|
||||
(
|
||||
@@ -83,15 +84,18 @@ impl IncrementalValueFactory {
|
||||
)
|
||||
}
|
||||
|
||||
fn key(&self) -> &PageserverConsumptionMetricsKey {
|
||||
fn key(&self) -> &MetricsKey {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
// the static part of a PageserverConsumptionMetricsKey
|
||||
impl PageserverConsumptionMetricsKey {
|
||||
// the static part of a MetricsKey
|
||||
impl MetricsKey {
|
||||
/// Absolute value of [`Timeline::get_last_record_lsn`].
|
||||
///
|
||||
/// [`Timeline::get_last_record_lsn`]: crate::tenant::Timeline::get_last_record_lsn
|
||||
const fn written_size(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory {
|
||||
PageserverConsumptionMetricsKey {
|
||||
MetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: Some(timeline_id),
|
||||
metric: "written_size",
|
||||
@@ -99,25 +103,31 @@ impl PageserverConsumptionMetricsKey {
|
||||
.absolute_values()
|
||||
}
|
||||
|
||||
/// Values will be the difference of the latest written_size (last_record_lsn) to what we
|
||||
/// previously sent.
|
||||
/// Values will be the difference between the latest [`MetricsKey::written_size`] and what we
/// previously sent, covering the range from the end of the previously sent incremental range
/// up to the latest absolute measurement.
|
||||
const fn written_size_delta(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> IncrementalValueFactory {
|
||||
PageserverConsumptionMetricsKey {
|
||||
MetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: Some(timeline_id),
|
||||
metric: "written_size_bytes_delta",
|
||||
// the name is intentionally about data, not size, because that is what the
// downstream pipeline expects
|
||||
metric: "written_data_bytes_delta",
|
||||
}
|
||||
.incremental_values()
|
||||
}
|
||||
|
||||
/// Exact [`Timeline::get_current_logical_size`].
|
||||
///
|
||||
/// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
|
||||
const fn timeline_logical_size(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> AbsoluteValueFactory {
|
||||
PageserverConsumptionMetricsKey {
|
||||
MetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: Some(timeline_id),
|
||||
metric: "timeline_logical_size",
|
||||
@@ -125,8 +135,11 @@ impl PageserverConsumptionMetricsKey {
|
||||
.absolute_values()
|
||||
}
|
||||
|
||||
/// [`Tenant::remote_size`]
|
||||
///
|
||||
/// [`Tenant::remote_size`]: crate::tenant::Tenant::remote_size
|
||||
const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory {
|
||||
PageserverConsumptionMetricsKey {
|
||||
MetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: None,
|
||||
metric: "remote_storage_size",
|
||||
@@ -134,8 +147,11 @@ impl PageserverConsumptionMetricsKey {
|
||||
.absolute_values()
|
||||
}
|
||||
|
||||
/// Sum of [`Timeline::resident_physical_size`] for each `Tenant`.
|
||||
///
|
||||
/// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size
|
||||
const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory {
|
||||
PageserverConsumptionMetricsKey {
|
||||
MetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: None,
|
||||
metric: "resident_size",
|
||||
@@ -143,8 +159,11 @@ impl PageserverConsumptionMetricsKey {
|
||||
.absolute_values()
|
||||
}
|
||||
|
||||
/// [`Tenant::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
|
||||
///
|
||||
/// [`Tenant::cached_synthetic_size`]: crate::tenant::Tenant::cached_synthetic_size
|
||||
const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory {
|
||||
PageserverConsumptionMetricsKey {
|
||||
MetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: None,
|
||||
metric: "synthetic_storage_size",
|
||||
@@ -228,15 +247,15 @@ pub async fn collect_metrics(
|
||||
///
|
||||
/// TODO
|
||||
/// - refactor this function (chunking+sending part) to reuse it in proxy module;
|
||||
pub async fn collect_metrics_iteration(
|
||||
async fn collect_metrics_iteration(
|
||||
client: &reqwest::Client,
|
||||
cached_metrics: &mut HashMap<PageserverConsumptionMetricsKey, (EventType, u64)>,
|
||||
cached_metrics: &mut HashMap<MetricsKey, (EventType, u64)>,
|
||||
metric_collection_endpoint: &reqwest::Url,
|
||||
node_id: NodeId,
|
||||
ctx: &RequestContext,
|
||||
send_cached: bool,
|
||||
) {
|
||||
let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, (EventType, u64))> = Vec::new();
|
||||
let mut current_metrics: Vec<(MetricsKey, (EventType, u64))> = Vec::new();
|
||||
trace!(
|
||||
"starting collect_metrics_iteration. metric_collection_endpoint: {}",
|
||||
metric_collection_endpoint
|
||||
@@ -270,130 +289,48 @@ pub async fn collect_metrics_iteration(
|
||||
let mut tenant_resident_size = 0;
|
||||
|
||||
// iterate through list of timelines in tenant
|
||||
for timeline in tenant.list_timelines().iter() {
|
||||
for timeline in tenant.list_timelines() {
|
||||
// collect per-timeline metrics only for active timelines
|
||||
if timeline.is_active() {
|
||||
let timeline_written_size = u64::from(timeline.get_last_record_lsn());
|
||||
|
||||
let (key, written_size_now) =
|
||||
PageserverConsumptionMetricsKey::written_size(tenant_id, timeline.timeline_id)
|
||||
.now(timeline_written_size);
|
||||
let timeline_id = timeline.timeline_id;
|
||||
|
||||
// last_record_lsn can only go up, right now at least, TODO: #2592 or related
|
||||
// features might change this.
|
||||
|
||||
let written_size_delta_key = PageserverConsumptionMetricsKey::written_size_delta(
|
||||
tenant_id,
|
||||
timeline.timeline_id,
|
||||
);
|
||||
|
||||
// use this when available, because in a stream of incremental values, it will be
|
||||
// accurate where as when last_record_lsn stops moving, we will only cache the last
|
||||
// one of those.
|
||||
let last_stop_time =
|
||||
cached_metrics
|
||||
.get(written_size_delta_key.key())
|
||||
.map(|(until, _val)| {
|
||||
until
|
||||
.incremental_timerange()
|
||||
.expect("never create EventType::Absolute for written_size_delta")
|
||||
.end
|
||||
});
|
||||
|
||||
// by default, use the last sent written_size as the basis for
|
||||
// calculating the delta. if we don't yet have one, use the load time value.
|
||||
let prev = cached_metrics
|
||||
.get(&key)
|
||||
.map(|(prev_at, prev)| {
|
||||
// use the prev time from our last incremental update, or default to latest
|
||||
// absolute update on the first round.
|
||||
let prev_at = prev_at
|
||||
.absolute_time()
|
||||
.expect("never create EventType::Incremental for written_size");
|
||||
let prev_at = last_stop_time.unwrap_or(prev_at);
|
||||
(*prev_at, *prev)
|
||||
})
|
||||
.unwrap_or_else(|| {
|
||||
// if we don't have a previous point of comparison, compare to the load time
|
||||
// lsn.
|
||||
let (disk_consistent_lsn, loaded_at) = &timeline.loaded_at;
|
||||
(DateTime::from(*loaded_at), disk_consistent_lsn.0)
|
||||
});
|
||||
|
||||
// written_size_delta_bytes
|
||||
current_metrics.extend(
|
||||
if let Some(delta) = written_size_now.1.checked_sub(prev.1) {
|
||||
let up_to = written_size_now
|
||||
.0
|
||||
.absolute_time()
|
||||
.expect("never create EventType::Incremental for written_size");
|
||||
let key_value =
|
||||
written_size_delta_key.from_previous_up_to(prev.0, *up_to, delta);
|
||||
Some(key_value)
|
||||
} else {
|
||||
None
|
||||
},
|
||||
);
|
||||
|
||||
// written_size
|
||||
current_metrics.push((key, written_size_now));
|
||||
|
||||
let span = info_span!("collect_metrics_iteration", tenant_id = %timeline.tenant_id, timeline_id = %timeline.timeline_id);
|
||||
match span.in_scope(|| timeline.get_current_logical_size(ctx)) {
|
||||
// Only send timeline logical size when it is fully calculated.
|
||||
Ok((size, is_exact)) if is_exact => {
|
||||
current_metrics.push(
|
||||
PageserverConsumptionMetricsKey::timeline_logical_size(
|
||||
tenant_id,
|
||||
timeline.timeline_id,
|
||||
)
|
||||
.now(size),
|
||||
);
|
||||
}
|
||||
Ok((_, _)) => {}
|
||||
Err(err) => {
|
||||
error!(
|
||||
"failed to get current logical size for timeline {}: {err:?}",
|
||||
timeline.timeline_id
|
||||
);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
match TimelineSnapshot::collect(&timeline, ctx) {
|
||||
Ok(Some(snap)) => {
|
||||
snap.to_metrics(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
Utc::now(),
|
||||
&mut current_metrics,
|
||||
cached_metrics,
|
||||
);
|
||||
}
|
||||
Ok(None) => {}
|
||||
Err(e) => {
|
||||
error!(
|
||||
"failed to get metrics values for tenant {tenant_id} timeline {}: {e:#?}",
|
||||
timeline.timeline_id
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
let timeline_resident_size = timeline.get_resident_physical_size();
|
||||
tenant_resident_size += timeline_resident_size;
|
||||
tenant_resident_size += timeline.resident_physical_size();
|
||||
}
|
||||
|
||||
match tenant.get_remote_size().await {
|
||||
Ok(tenant_remote_size) => {
|
||||
current_metrics.push(
|
||||
PageserverConsumptionMetricsKey::remote_storage_size(tenant_id)
|
||||
.now(tenant_remote_size),
|
||||
);
|
||||
}
|
||||
Err(err) => {
|
||||
error!(
|
||||
"failed to get remote size for tenant {}: {err:?}",
|
||||
tenant_id
|
||||
);
|
||||
}
|
||||
}
|
||||
current_metrics
|
||||
.push(MetricsKey::remote_storage_size(tenant_id).at(Utc::now(), tenant.remote_size()));
|
||||
|
||||
current_metrics.push(
|
||||
PageserverConsumptionMetricsKey::resident_size(tenant_id).now(tenant_resident_size),
|
||||
);
|
||||
current_metrics
|
||||
.push(MetricsKey::resident_size(tenant_id).at(Utc::now(), tenant_resident_size));
|
||||
|
||||
// Note that this metric is calculated in a separate bgworker
// Here we only use the cached value, which may lag behind the latest real one
|
||||
let tenant_synthetic_size = tenant.get_cached_synthetic_size();
|
||||
let synthetic_size = tenant.cached_synthetic_size();
|
||||
|
||||
if tenant_synthetic_size != 0 {
|
||||
if synthetic_size != 0 {
|
||||
// only send non-zeroes because otherwise these show up as errors in logs
|
||||
current_metrics.push(
|
||||
PageserverConsumptionMetricsKey::synthetic_size(tenant_id)
|
||||
.now(tenant_synthetic_size),
|
||||
);
|
||||
current_metrics
|
||||
.push(MetricsKey::synthetic_size(tenant_id).at(Utc::now(), synthetic_size));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -425,6 +362,8 @@ pub async fn collect_metrics_iteration(
|
||||
|
||||
let mut chunk_to_send: Vec<Event<Ids>> = Vec::with_capacity(CHUNK_SIZE);
|
||||
|
||||
let node_id = node_id.to_string();
|
||||
|
||||
for chunk in chunks {
|
||||
chunk_to_send.clear();
|
||||
|
||||
@@ -432,7 +371,7 @@ pub async fn collect_metrics_iteration(
|
||||
chunk_to_send.extend(chunk.iter().map(|(curr_key, (when, curr_val))| Event {
|
||||
kind: *when,
|
||||
metric: curr_key.metric,
|
||||
idempotency_key: idempotency_key(node_id.to_string()),
|
||||
idempotency_key: idempotency_key(&node_id),
|
||||
value: *curr_val,
|
||||
extra: Ids {
|
||||
tenant_id: curr_key.tenant_id,
|
||||
@@ -440,17 +379,14 @@ pub async fn collect_metrics_iteration(
|
||||
},
|
||||
}));
|
||||
|
||||
let chunk_json = serde_json::value::to_raw_value(&EventChunk {
|
||||
events: &chunk_to_send,
|
||||
})
|
||||
.expect("PageserverConsumptionMetric should not fail serialization");
|
||||
|
||||
const MAX_RETRIES: u32 = 3;
|
||||
|
||||
for attempt in 0..MAX_RETRIES {
|
||||
let res = client
|
||||
.post(metric_collection_endpoint.clone())
|
||||
.json(&chunk_json)
|
||||
.json(&EventChunk {
|
||||
events: (&chunk_to_send).into(),
|
||||
})
|
||||
.send()
|
||||
.await;
|
||||
|
||||
@@ -486,6 +422,130 @@ pub async fn collect_metrics_iteration(
|
||||
}
|
||||
}
|
||||
|
||||
/// Internal type to make timeline metric production testable.
|
||||
///
|
||||
/// As this value type contains all of the information needed from a timeline to produce the
|
||||
/// metrics, it can easily be created with different values in test.
|
||||
struct TimelineSnapshot {
|
||||
loaded_at: (Lsn, SystemTime),
|
||||
last_record_lsn: Lsn,
|
||||
current_exact_logical_size: Option<u64>,
|
||||
}
|
||||
|
||||
impl TimelineSnapshot {
|
||||
/// Collect the metrics from an actual timeline.
|
||||
///
|
||||
/// Fails currently only when [`Timeline::get_current_logical_size`] fails.
|
||||
///
|
||||
/// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
|
||||
fn collect(
|
||||
t: &Arc<crate::tenant::Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Option<Self>> {
|
||||
use anyhow::Context;
|
||||
|
||||
if !t.is_active() {
|
||||
// no collection is needed for broken or stopping timelines; the caller will
// still keep the cached values though.
|
||||
Ok(None)
|
||||
} else {
|
||||
let loaded_at = t.loaded_at;
|
||||
let last_record_lsn = t.get_last_record_lsn();
|
||||
|
||||
let current_exact_logical_size = {
|
||||
let span = info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id);
|
||||
let res = span
|
||||
.in_scope(|| t.get_current_logical_size(ctx))
|
||||
.context("get_current_logical_size");
|
||||
match res? {
|
||||
// Only send timeline logical size when it is fully calculated.
|
||||
(size, is_exact) if is_exact => Some(size),
|
||||
(_, _) => None,
|
||||
}
|
||||
};
|
||||
|
||||
Ok(Some(TimelineSnapshot {
|
||||
loaded_at,
|
||||
last_record_lsn,
|
||||
current_exact_logical_size,
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
/// Produce the timeline consumption metrics into the `metrics` argument.
|
||||
fn to_metrics(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
now: DateTime<Utc>,
|
||||
metrics: &mut Vec<(MetricsKey, (EventType, u64))>,
|
||||
cache: &HashMap<MetricsKey, (EventType, u64)>,
|
||||
) {
|
||||
let timeline_written_size = u64::from(self.last_record_lsn);
|
||||
|
||||
let (key, written_size_now) =
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(now, timeline_written_size);
|
||||
|
||||
// last_record_lsn can only go up, right now at least, TODO: #2592 or related
|
||||
// features might change this.
|
||||
|
||||
let written_size_delta_key = MetricsKey::written_size_delta(tenant_id, timeline_id);
|
||||
|
||||
// use this when available, because in a stream of incremental values it will be
// accurate, whereas when last_record_lsn stops moving we will only cache the last
// one of those.
|
||||
let last_stop_time = cache
|
||||
.get(written_size_delta_key.key())
|
||||
.map(|(until, _val)| {
|
||||
until
|
||||
.incremental_timerange()
|
||||
.expect("never create EventType::Absolute for written_size_delta")
|
||||
.end
|
||||
});
|
||||
|
||||
// by default, use the last sent written_size as the basis for
|
||||
// calculating the delta. if we don't yet have one, use the load time value.
|
||||
let prev = cache
|
||||
.get(&key)
|
||||
.map(|(prev_at, prev)| {
|
||||
// use the prev time from our last incremental update, or default to latest
|
||||
// absolute update on the first round.
|
||||
let prev_at = prev_at
|
||||
.absolute_time()
|
||||
.expect("never create EventType::Incremental for written_size");
|
||||
let prev_at = last_stop_time.unwrap_or(prev_at);
|
||||
(*prev_at, *prev)
|
||||
})
|
||||
.unwrap_or_else(|| {
|
||||
// if we don't have a previous point of comparison, compare to the load time
|
||||
// lsn.
|
||||
let (disk_consistent_lsn, loaded_at) = &self.loaded_at;
|
||||
(DateTime::from(*loaded_at), disk_consistent_lsn.0)
|
||||
});
|
||||
|
||||
// written_size_bytes_delta
|
||||
metrics.extend(
|
||||
if let Some(delta) = written_size_now.1.checked_sub(prev.1) {
|
||||
let up_to = written_size_now
|
||||
.0
|
||||
.absolute_time()
|
||||
.expect("never create EventType::Incremental for written_size");
|
||||
let key_value = written_size_delta_key.from_previous_up_to(prev.0, *up_to, delta);
|
||||
Some(key_value)
|
||||
} else {
|
||||
None
|
||||
},
|
||||
);
|
||||
|
||||
// written_size
|
||||
metrics.push((key, written_size_now));
|
||||
|
||||
if let Some(size) = self.current_exact_logical_size {
|
||||
metrics.push(MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, size));
|
||||
}
|
||||
}
|
||||
}
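The checked_sub in to_metrics above is what guarantees a written_data_bytes_delta event is only produced when the absolute written size has not moved backwards; otherwise the delta is skipped and only the absolute value is sent. A minimal standalone illustration (the names here are invented for the sketch, not the crate's API):

/// Emit a delta sample only when the counter did not regress.
fn delta_event(prev: u64, now: u64) -> Option<u64> {
    now.checked_sub(prev) // None if `now < prev`
}

fn main() {
    assert_eq!(delta_event(0x1000, 0x2000), Some(0x1000)); // normal growth
    assert_eq!(delta_event(0x2000, 0x2000), Some(0));      // no writes since the last round
    assert_eq!(delta_event(0x2000, 0x1000), None);         // regression: skip the delta sample
}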
/// Calculate synthetic size for each active tenant
|
||||
pub async fn calculate_synthetic_size_worker(
|
||||
synthetic_size_calculation_interval: Duration,
|
||||
@@ -500,7 +560,7 @@ pub async fn calculate_synthetic_size_worker(
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
return Ok(());
|
||||
},
|
||||
tick_at = ticker.tick() => {
|
||||
tick_at = ticker.tick() => {
|
||||
|
||||
let tenants = match mgr::list_tenants().await {
|
||||
Ok(tenants) => tenants,
|
||||
@@ -536,3 +596,149 @@ pub async fn calculate_synthetic_size_worker(
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::HashMap;
|
||||
|
||||
use std::time::SystemTime;
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use crate::consumption_metrics::MetricsKey;
|
||||
|
||||
use super::TimelineSnapshot;
|
||||
use chrono::{DateTime, Utc};
|
||||
|
||||
#[test]
|
||||
fn startup_collected_timeline_metrics_before_advancing() {
|
||||
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::generate();
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
let cache = HashMap::new();
|
||||
|
||||
let initdb_lsn = Lsn(0x10000);
|
||||
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
|
||||
|
||||
let snap = TimelineSnapshot {
|
||||
loaded_at: (disk_consistent_lsn, SystemTime::now()),
|
||||
last_record_lsn: disk_consistent_lsn,
|
||||
current_exact_logical_size: Some(0x42000),
|
||||
};
|
||||
|
||||
let now = DateTime::<Utc>::from(SystemTime::now());
|
||||
|
||||
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
|
||||
|
||||
assert_eq!(
|
||||
metrics,
|
||||
&[
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
|
||||
snap.loaded_at.1.into(),
|
||||
now,
|
||||
0
|
||||
),
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
|
||||
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn startup_collected_timeline_metrics_second_round() {
|
||||
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::generate();
|
||||
|
||||
let [now, before, init] = time_backwards();
|
||||
|
||||
let now = DateTime::<Utc>::from(now);
|
||||
let before = DateTime::<Utc>::from(before);
|
||||
|
||||
let initdb_lsn = Lsn(0x10000);
|
||||
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
let cache = HashMap::from([
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0)
|
||||
]);
|
||||
|
||||
let snap = TimelineSnapshot {
|
||||
loaded_at: (disk_consistent_lsn, init),
|
||||
last_record_lsn: disk_consistent_lsn,
|
||||
current_exact_logical_size: Some(0x42000),
|
||||
};
|
||||
|
||||
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
|
||||
|
||||
assert_eq!(
|
||||
metrics,
|
||||
&[
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id)
|
||||
.from_previous_up_to(before, now, 0),
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
|
||||
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
|
||||
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::generate();
|
||||
|
||||
let [now, just_before, before, init] = time_backwards();
|
||||
|
||||
let now = DateTime::<Utc>::from(now);
|
||||
let just_before = DateTime::<Utc>::from(just_before);
|
||||
let before = DateTime::<Utc>::from(before);
|
||||
|
||||
let initdb_lsn = Lsn(0x10000);
|
||||
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
let cache = HashMap::from([
|
||||
// at t=before was the last time the last_record_lsn changed
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0),
|
||||
// end time of this event is used for the next ones
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
|
||||
before,
|
||||
just_before,
|
||||
0,
|
||||
),
|
||||
]);
|
||||
|
||||
let snap = TimelineSnapshot {
|
||||
loaded_at: (disk_consistent_lsn, init),
|
||||
last_record_lsn: disk_consistent_lsn,
|
||||
current_exact_logical_size: Some(0x42000),
|
||||
};
|
||||
|
||||
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
|
||||
|
||||
assert_eq!(
|
||||
metrics,
|
||||
&[
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
|
||||
just_before,
|
||||
now,
|
||||
0
|
||||
),
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
|
||||
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
fn time_backwards<const N: usize>() -> [std::time::SystemTime; N] {
|
||||
let mut times = [std::time::SystemTime::UNIX_EPOCH; N];
|
||||
times[0] = std::time::SystemTime::now();
|
||||
for behind in 1..N {
|
||||
times[behind] = times[0] - std::time::Duration::from_secs(behind as u64);
|
||||
}
|
||||
|
||||
times
|
||||
}
|
||||
}
|
||||
|
||||
@@ -85,6 +85,7 @@
|
||||
//! The solution is that all code paths are infected with precisely one
|
||||
//! [`RequestContext`] argument. Functions in the middle of the call chain
|
||||
//! only need to pass it on.
|
||||
|
||||
use crate::task_mgr::TaskKind;
|
||||
|
||||
// The main structure of this module, see module-level comment.
|
||||
@@ -92,6 +93,7 @@ use crate::task_mgr::TaskKind;
|
||||
pub struct RequestContext {
|
||||
task_kind: TaskKind,
|
||||
download_behavior: DownloadBehavior,
|
||||
access_stats_behavior: AccessStatsBehavior,
|
||||
}
|
||||
|
||||
/// Desired behavior if the operation requires an on-demand download
|
||||
@@ -109,6 +111,67 @@ pub enum DownloadBehavior {
|
||||
Error,
|
||||
}
|
||||
|
||||
/// Whether this request should update access times used in LRU eviction
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
|
||||
pub(crate) enum AccessStatsBehavior {
|
||||
/// Update access times: this request's access to data should be taken
|
||||
/// as a hint that the accessed layer is likely to be accessed again
|
||||
Update,
|
||||
|
||||
/// Do not update access times: this request is accessing the layer
|
||||
/// but does not want to indicate that the layer should be retained in cache,
|
||||
/// perhaps because the requestor is a compaction routine that will soon cover
|
||||
/// this layer with another.
|
||||
Skip,
|
||||
}
|
||||
|
||||
pub struct RequestContextBuilder {
|
||||
inner: RequestContext,
|
||||
}
|
||||
|
||||
impl RequestContextBuilder {
|
||||
/// A new builder with default settings
|
||||
pub fn new(task_kind: TaskKind) -> Self {
|
||||
Self {
|
||||
inner: RequestContext {
|
||||
task_kind,
|
||||
download_behavior: DownloadBehavior::Download,
|
||||
access_stats_behavior: AccessStatsBehavior::Update,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
pub fn extend(original: &RequestContext) -> Self {
|
||||
Self {
|
||||
// This is like a Copy, but avoid implementing Copy because ordinary users of
|
||||
// RequestContext should always move or ref it.
|
||||
inner: RequestContext {
|
||||
task_kind: original.task_kind,
|
||||
download_behavior: original.download_behavior,
|
||||
access_stats_behavior: original.access_stats_behavior,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// Configure the DownloadBehavior of the context: whether to
|
||||
/// download missing layers, and/or warn on the download.
|
||||
pub fn download_behavior(mut self, b: DownloadBehavior) -> Self {
|
||||
self.inner.download_behavior = b;
|
||||
self
|
||||
}
|
||||
|
||||
/// Configure the AccessStatsBehavior of the context: whether layer
|
||||
/// accesses should update the access time of the layer.
|
||||
pub(crate) fn access_stats_behavior(mut self, b: AccessStatsBehavior) -> Self {
|
||||
self.inner.access_stats_behavior = b;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn build(self) -> RequestContext {
|
||||
self.inner
|
||||
}
|
||||
}
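A minimal usage sketch of the builder introduced above, illustrative only and not part of this changeset (TaskKind::Compaction is assumed here, and access_stats_behavior is crate-private, so this would live inside the pageserver crate):

// Build a context for a compaction-like task that should not refresh the
// layer access times used by LRU eviction.
let ctx = RequestContextBuilder::new(TaskKind::Compaction)
    .download_behavior(DownloadBehavior::Download)
    .access_stats_behavior(AccessStatsBehavior::Skip)
    .build();
assert_eq!(ctx.task_kind(), TaskKind::Compaction);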
impl RequestContext {
|
||||
/// Create a new RequestContext that has no parent.
|
||||
///
|
||||
@@ -123,10 +186,9 @@ impl RequestContext {
|
||||
/// because someone explicitly canceled it.
|
||||
/// It has no parent, so it cannot inherit cancellation from there.
|
||||
pub fn new(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
|
||||
RequestContext {
|
||||
task_kind,
|
||||
download_behavior,
|
||||
}
|
||||
RequestContextBuilder::new(task_kind)
|
||||
.download_behavior(download_behavior)
|
||||
.build()
|
||||
}
|
||||
|
||||
/// Create a detached child context for a task that may outlive `self`.
|
||||
@@ -187,10 +249,7 @@ impl RequestContext {
|
||||
}
|
||||
|
||||
fn child_impl(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
|
||||
RequestContext {
|
||||
task_kind,
|
||||
download_behavior,
|
||||
}
|
||||
Self::new(task_kind, download_behavior)
|
||||
}
|
||||
|
||||
pub fn task_kind(&self) -> TaskKind {
|
||||
@@ -200,4 +259,8 @@ impl RequestContext {
|
||||
pub fn download_behavior(&self) -> DownloadBehavior {
|
||||
self.download_behavior
|
||||
}
|
||||
|
||||
pub(crate) fn access_stats_behavior(&self) -> AccessStatsBehavior {
|
||||
self.access_stats_behavior
|
||||
}
|
||||
}
|
||||
|
||||
@@ -304,17 +304,18 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
// Debug-log the list of candidates
|
||||
let now = SystemTime::now();
|
||||
for (i, (partition, candidate)) in candidates.iter().enumerate() {
|
||||
let desc = candidate.layer.layer_desc();
|
||||
debug!(
|
||||
"cand {}/{}: size={}, no_access_for={}us, partition={:?}, {}/{}/{}",
|
||||
i + 1,
|
||||
candidates.len(),
|
||||
candidate.layer.file_size(),
|
||||
desc.file_size,
|
||||
now.duration_since(candidate.last_activity_ts)
|
||||
.unwrap()
|
||||
.as_micros(),
|
||||
partition,
|
||||
candidate.layer.get_tenant_id(),
|
||||
candidate.layer.get_timeline_id(),
|
||||
desc.tenant_id,
|
||||
desc.timeline_id,
|
||||
candidate.layer,
|
||||
);
|
||||
}
|
||||
@@ -346,7 +347,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
warned = Some(usage_planned);
|
||||
}
|
||||
|
||||
usage_planned.add_available_bytes(candidate.layer.file_size());
|
||||
usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);
|
||||
|
||||
batched
|
||||
.entry(TimelineKey(candidate.timeline))
|
||||
@@ -389,15 +390,16 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
Ok(results) => {
|
||||
assert_eq!(results.len(), batch.len());
|
||||
for (result, layer) in results.into_iter().zip(batch.iter()) {
|
||||
let file_size = layer.layer_desc().file_size;
|
||||
match result {
|
||||
Some(Ok(())) => {
|
||||
usage_assumed.add_available_bytes(layer.file_size());
|
||||
usage_assumed.add_available_bytes(file_size);
|
||||
}
|
||||
Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
|
||||
unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers")
|
||||
}
|
||||
Some(Err(EvictionError::FileNotFound)) => {
|
||||
evictions_failed.file_sizes += layer.file_size();
|
||||
evictions_failed.file_sizes += file_size;
|
||||
evictions_failed.count += 1;
|
||||
}
|
||||
Some(Err(
|
||||
@@ -406,7 +408,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
)) => {
|
||||
let e = utils::error::report_compact_sources(&e);
|
||||
warn!(%layer, "failed to evict layer: {e}");
|
||||
evictions_failed.file_sizes += layer.file_size();
|
||||
evictions_failed.file_sizes += file_size;
|
||||
evictions_failed.count += 1;
|
||||
}
|
||||
None => {
|
||||
|
||||
@@ -93,6 +93,47 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
delete:
|
||||
description: |
|
||||
Attempts to delete the specified tenant. 500 and 409 errors should be retried until 404 is received.
404 means that the deletion finished successfully.
|
||||
responses:
|
||||
"400":
|
||||
description: Error when no tenant id found in path
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"404":
|
||||
description: Tenant not found
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/NotFoundError"
|
||||
"409":
|
||||
description: Deletion is already in progress, continue polling
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ConflictError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline:
|
||||
parameters:
|
||||
@@ -820,6 +861,7 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/config:
|
||||
put:
|
||||
description: |
|
||||
|
||||
@@ -187,7 +187,7 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
|
||||
format!("Cannot delete timeline which has child timelines: {children:?}")
|
||||
.into_boxed_str(),
|
||||
),
|
||||
a @ AlreadyInProgress => ApiError::Conflict(a.to_string()),
|
||||
a @ AlreadyInProgress(_) => ApiError::Conflict(a.to_string()),
|
||||
Other(e) => ApiError::InternalServerError(e),
|
||||
}
|
||||
}
|
||||
@@ -208,6 +208,19 @@ impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
|
||||
fn from(value: crate::tenant::delete::DeleteTenantError) -> Self {
|
||||
use crate::tenant::delete::DeleteTenantError::*;
|
||||
match value {
|
||||
Get(g) => ApiError::from(g),
|
||||
e @ AlreadyInProgress => ApiError::Conflict(e.to_string()),
|
||||
Timeline(t) => ApiError::from(t),
|
||||
Other(o) => ApiError::InternalServerError(o),
|
||||
e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function to construct a TimelineInfo struct for a timeline
|
||||
async fn build_timeline_info(
|
||||
timeline: &Arc<Timeline>,
|
||||
@@ -617,6 +630,23 @@ async fn tenant_status(
|
||||
json_response(StatusCode::OK, tenant_info)
|
||||
}
|
||||
|
||||
async fn tenant_delete_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
// TODO openapi spec
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_id)
|
||||
.instrument(info_span!("tenant_delete_handler", %tenant_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
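Given the polling contract described in the OpenAPI addition above (retry 500 and 409 until 404 is returned), a client-side sketch could look like the following; this is illustrative only, and the helper name, base URL handling and retry interval are assumptions rather than part of the changeset:

/// Poll tenant deletion until the pageserver reports 404 (deletion finished).
async fn wait_tenant_deleted(
    client: &reqwest::Client,
    base_url: &str,
    tenant_id: &str,
) -> anyhow::Result<()> {
    loop {
        let status = client
            .delete(format!("{base_url}/v1/tenant/{tenant_id}"))
            .send()
            .await?
            .status();
        match status.as_u16() {
            404 => return Ok(()),  // deletion finished
            202 | 409 | 500 => {
                // accepted / already in progress / retryable error: keep polling
                tokio::time::sleep(std::time::Duration::from_secs(1)).await;
            }
            other => anyhow::bail!("unexpected status {other}"),
        }
    }
}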
/// HTTP endpoint to query the current tenant_size of a tenant.
|
||||
///
|
||||
/// This is not used by consumption metrics under [`crate::consumption_metrics`], but can be used
|
||||
@@ -1345,6 +1375,9 @@ pub fn make_router(
|
||||
.get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
|
||||
.post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
|
||||
.get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
|
||||
.delete("/v1/tenant/:tenant_id", |r| {
|
||||
api_handler(r, tenant_delete_handler)
|
||||
})
|
||||
.get("/v1/tenant/:tenant_id/synthetic_size", |r| {
|
||||
api_handler(r, tenant_size_handler)
|
||||
})
|
||||
|
||||
@@ -7,7 +7,7 @@ pub mod disk_usage_eviction_task;
|
||||
pub mod http;
|
||||
pub mod import_datadir;
|
||||
pub mod keyspace;
|
||||
pub(crate) mod metrics;
|
||||
pub mod metrics;
|
||||
pub mod page_cache;
|
||||
pub mod page_service;
|
||||
pub mod pgdatadir_mapping;
|
||||
@@ -47,50 +47,54 @@ pub use crate::metrics::preinitialize_metrics;
|
||||
|
||||
#[tracing::instrument]
|
||||
pub async fn shutdown_pageserver(exit_code: i32) {
|
||||
use std::time::Duration;
|
||||
// Shut down the libpq endpoint task. This prevents new connections from
|
||||
// being accepted.
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None).await;
|
||||
timed(
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None),
|
||||
"shutdown LibpqEndpointListener",
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
.await;
|
||||
|
||||
// Shut down any page service tasks.
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None).await;
|
||||
timed(
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
|
||||
"shutdown PageRequestHandlers",
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
.await;
|
||||
|
||||
// Shut down all the tenants. This flushes everything to disk and kills
|
||||
// the checkpoint and GC tasks.
|
||||
tenant::mgr::shutdown_all_tenants().await;
|
||||
timed(
|
||||
tenant::mgr::shutdown_all_tenants(),
|
||||
"shutdown all tenants",
|
||||
Duration::from_secs(5),
|
||||
)
|
||||
.await;
|
||||
|
||||
// Shut down the HTTP endpoint last, so that you can still check the server's
|
||||
// status while it's shutting down.
|
||||
// FIXME: We should probably stop accepting commands like attach/detach earlier.
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None).await;
|
||||
timed(
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None),
|
||||
"shutdown http",
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
.await;
|
||||
|
||||
// There should be nothing left, but let's be sure
|
||||
task_mgr::shutdown_tasks(None, None, None).await;
|
||||
timed(
|
||||
task_mgr::shutdown_tasks(None, None, None),
|
||||
"shutdown leftovers",
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
.await;
|
||||
info!("Shut down successfully completed");
|
||||
std::process::exit(exit_code);
|
||||
}
|
||||
|
||||
const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
|
||||
const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
|
||||
|
||||
async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
|
||||
let backoff_duration_seconds =
|
||||
exponential_backoff_duration_seconds(n, base_increment, max_seconds);
|
||||
if backoff_duration_seconds > 0.0 {
|
||||
info!(
|
||||
"Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
|
||||
);
|
||||
tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await;
|
||||
}
|
||||
}
|
||||
|
||||
pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
|
||||
if n == 0 {
|
||||
0.0
|
||||
} else {
|
||||
(1.0 + base_increment).powf(f64::from(n)).min(max_seconds)
|
||||
}
|
||||
}
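With the defaults above (base increment 0.1, cap 3.0 seconds) the backoff grows geometrically and saturates around the 12th retry; a quick illustrative check, not part of this changeset:

fn main() {
    // Mirrors exponential_backoff_duration_seconds with the default constants.
    for n in [0u32, 1, 5, 12, 30] {
        let secs = if n == 0 {
            0.0
        } else {
            (1.0_f64 + 0.1).powf(f64::from(n)).min(3.0)
        };
        println!("attempt {n}: backoff {secs:.2}s");
    }
    // Prints approximately: 0.00, 1.10, 1.61, 3.00 (1.1^12 ≈ 3.14, capped), 3.00
}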
/// The name of the metadata file pageserver creates per timeline.
|
||||
/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>/metadata`.
|
||||
pub const METADATA_FILE_NAME: &str = "metadata";
|
||||
@@ -164,7 +168,7 @@ pub struct InitializationOrder {
|
||||
|
||||
/// Each timeline owns a clone of this to be consumed on the initial logical size calculation
|
||||
/// attempt. It is important to drop this once the attempt has completed.
|
||||
pub initial_logical_size_attempt: utils::completion::Completion,
|
||||
pub initial_logical_size_attempt: Option<utils::completion::Completion>,
|
||||
|
||||
/// Barrier for when we can start any background jobs.
|
||||
///
|
||||
@@ -172,33 +176,75 @@ pub struct InitializationOrder {
|
||||
pub background_jobs_can_start: utils::completion::Barrier,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod backoff_defaults_tests {
|
||||
use super::*;
|
||||
/// Time the future with a warning when it exceeds a threshold.
|
||||
async fn timed<Fut: std::future::Future>(
|
||||
fut: Fut,
|
||||
name: &str,
|
||||
warn_at: std::time::Duration,
|
||||
) -> <Fut as std::future::Future>::Output {
|
||||
let started = std::time::Instant::now();
|
||||
|
||||
#[test]
|
||||
fn backoff_defaults_produce_growing_backoff_sequence() {
|
||||
let mut current_backoff_value = None;
|
||||
let mut fut = std::pin::pin!(fut);
|
||||
|
||||
for i in 0..10_000 {
|
||||
let new_backoff_value = exponential_backoff_duration_seconds(
|
||||
i,
|
||||
DEFAULT_BASE_BACKOFF_SECONDS,
|
||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
match tokio::time::timeout(warn_at, &mut fut).await {
|
||||
Ok(ret) => {
|
||||
tracing::info!(
|
||||
task = name,
|
||||
elapsed_ms = started.elapsed().as_millis(),
|
||||
"completed"
|
||||
);
|
||||
ret
|
||||
}
|
||||
Err(_) => {
|
||||
tracing::info!(
|
||||
task = name,
|
||||
elapsed_ms = started.elapsed().as_millis(),
|
||||
"still waiting, taking longer than expected..."
|
||||
);
|
||||
|
||||
if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) {
|
||||
assert!(
|
||||
old_backoff_value <= new_backoff_value,
|
||||
"{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}"
|
||||
)
|
||||
}
|
||||
}
|
||||
let ret = fut.await;
|
||||
|
||||
assert_eq!(
|
||||
current_backoff_value.expect("Should have produced backoff values to compare"),
|
||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
"Given big enough of retries, backoff should reach its allowed max value"
|
||||
);
|
||||
// this has a global allowed_errors
|
||||
tracing::warn!(
|
||||
task = name,
|
||||
elapsed_ms = started.elapsed().as_millis(),
|
||||
"completed, took longer than expected"
|
||||
);
|
||||
|
||||
ret
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod timed_tests {
|
||||
use super::timed;
|
||||
use std::time::Duration;
|
||||
|
||||
#[tokio::test]
|
||||
async fn timed_completes_when_inner_future_completes() {
|
||||
// A future that completes on time should have its result returned
|
||||
let r1 = timed(
|
||||
async move {
|
||||
tokio::time::sleep(Duration::from_millis(10)).await;
|
||||
123
|
||||
},
|
||||
"test 1",
|
||||
Duration::from_millis(50),
|
||||
)
|
||||
.await;
|
||||
assert_eq!(r1, 123);
|
||||
|
||||
// A future that completes too slowly should also have its result returned
|
||||
let r1 = timed(
|
||||
async move {
|
||||
tokio::time::sleep(Duration::from_millis(50)).await;
|
||||
456
|
||||
},
|
||||
"test 1",
|
||||
Duration::from_millis(10),
|
||||
)
|
||||
.await;
|
||||
assert_eq!(r1, 456);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
use metrics::metric_vec_duration::DurationResultObserver;
|
||||
use metrics::{
|
||||
register_counter_vec, register_histogram, register_histogram_vec, register_int_counter,
|
||||
register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge,
|
||||
register_uint_gauge_vec, Counter, CounterVec, Histogram, HistogramVec, IntCounter,
|
||||
IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
|
||||
register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
|
||||
register_int_counter, register_int_counter_vec, register_int_gauge, register_int_gauge_vec,
|
||||
register_uint_gauge, register_uint_gauge_vec, Counter, CounterVec, GaugeVec, Histogram,
|
||||
HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
use strum::VariantNames;
|
||||
@@ -394,6 +394,35 @@ pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(||
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
/// How long did we take to start up? Broken down by labels to describe
|
||||
/// different phases of startup.
|
||||
pub static STARTUP_DURATION: Lazy<GaugeVec> = Lazy::new(|| {
|
||||
register_gauge_vec!(
|
||||
"pageserver_startup_duration_seconds",
|
||||
"Time taken by phases of pageserver startup, in seconds",
|
||||
&["phase"]
|
||||
)
|
||||
.expect("Failed to register pageserver_startup_duration_seconds metric")
|
||||
});
|
||||
|
||||
pub static STARTUP_IS_LOADING: Lazy<UIntGauge> = Lazy::new(|| {
|
||||
register_uint_gauge!(
|
||||
"pageserver_startup_is_loading",
|
||||
"1 while in initial startup load of tenants, 0 at other times"
|
||||
)
|
||||
.expect("Failed to register pageserver_startup_is_loading")
|
||||
});
|
||||
|
||||
/// How long did tenants take to go from construction to active state?
|
||||
pub(crate) static TENANT_ACTIVATION: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_tenant_activation_seconds",
|
||||
"Time taken by tenants to activate, in seconds",
|
||||
CRITICAL_OP_BUCKETS.into()
|
||||
)
|
||||
.expect("Failed to register pageserver_tenant_activation_seconds metric")
|
||||
});
|
||||
|
||||
/// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
|
||||
#[derive(Debug)]
|
||||
pub struct EvictionsWithLowResidenceDuration {
|
||||
|
||||
@@ -28,6 +28,7 @@ use std::cmp::min;
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::HashMap;
|
||||
use std::fmt::Debug;
|
||||
use std::fs;
|
||||
use std::fs::File;
|
||||
use std::fs::OpenOptions;
|
||||
@@ -46,8 +47,10 @@ use std::sync::{Mutex, RwLock};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use self::config::TenantConf;
|
||||
use self::delete::DeleteTenantFlow;
|
||||
use self::metadata::LoadMetadataError;
|
||||
use self::metadata::TimelineMetadata;
|
||||
use self::mgr::TenantsMap;
|
||||
use self::remote_timeline_client::RemoteTimelineClient;
|
||||
use self::timeline::uninit::TimelineUninitMark;
|
||||
use self::timeline::uninit::UninitializedTimeline;
|
||||
@@ -56,6 +59,7 @@ use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::import_datadir;
|
||||
use crate::is_uninit_mark;
|
||||
use crate::metrics::TENANT_ACTIVATION;
|
||||
use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC};
|
||||
use crate::repository::GcResult;
|
||||
use crate::task_mgr;
|
||||
@@ -105,6 +109,7 @@ macro_rules! pausable_failpoint {
|
||||
|
||||
pub mod blob_io;
|
||||
pub mod block_io;
|
||||
|
||||
pub mod disk_btree;
|
||||
pub(crate) mod ephemeral_file;
|
||||
pub mod layer_map;
|
||||
@@ -117,6 +122,7 @@ mod remote_timeline_client;
|
||||
pub mod storage_layer;
|
||||
|
||||
pub mod config;
|
||||
pub mod delete;
|
||||
pub mod mgr;
|
||||
pub mod tasks;
|
||||
pub mod upload_queue;
|
||||
@@ -144,6 +150,8 @@ pub const TIMELINES_SEGMENT_NAME: &str = "timelines";
|
||||
|
||||
pub const TENANT_ATTACHING_MARKER_FILENAME: &str = "attaching";
|
||||
|
||||
pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted";
|
||||
|
||||
///
|
||||
/// Tenant consists of multiple timelines. Keep them in a hash table.
|
||||
///
|
||||
@@ -182,6 +190,8 @@ pub struct Tenant {
|
||||
cached_synthetic_tenant_size: Arc<AtomicU64>,
|
||||
|
||||
eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
|
||||
|
||||
pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>,
|
||||
}
|
||||
|
||||
// We should not blindly overwrite local metadata with remote one.
|
||||
@@ -273,7 +283,7 @@ pub enum LoadLocalTimelineError {
|
||||
ResumeDeletion(#[source] anyhow::Error),
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
#[derive(thiserror::Error)]
|
||||
pub enum DeleteTimelineError {
|
||||
#[error("NotFound")]
|
||||
NotFound,
|
||||
@@ -282,17 +292,37 @@ pub enum DeleteTimelineError {
|
||||
HasChildren(Vec<TimelineId>),
|
||||
|
||||
#[error("Timeline deletion is already in progress")]
|
||||
AlreadyInProgress,
|
||||
AlreadyInProgress(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>),
|
||||
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
impl Debug for DeleteTimelineError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::NotFound => write!(f, "NotFound"),
|
||||
Self::HasChildren(c) => f.debug_tuple("HasChildren").field(c).finish(),
|
||||
Self::AlreadyInProgress(_) => f.debug_tuple("AlreadyInProgress").finish(),
|
||||
Self::Other(e) => f.debug_tuple("Other").field(e).finish(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub enum SetStoppingError {
|
||||
AlreadyStopping(completion::Barrier),
|
||||
Broken,
|
||||
}
|
||||
|
||||
impl Debug for SetStoppingError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::AlreadyStopping(_) => f.debug_tuple("AlreadyStopping").finish(),
|
||||
Self::Broken => write!(f, "Broken"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct RemoteStartupData {
|
||||
index_part: IndexPart,
|
||||
remote_metadata: TimelineMetadata,
|
||||
@@ -615,7 +645,7 @@ impl Tenant {
|
||||
// For every timeline, download the metadata file, scan the local directory,
|
||||
// and build a layer map that contains an entry for each remote and local
|
||||
// layer file.
|
||||
let sorted_timelines = tree_sort_timelines(timeline_ancestors)?;
|
||||
let sorted_timelines = tree_sort_timelines(timeline_ancestors, |m| m.ancestor_timeline())?;
|
||||
for (timeline_id, remote_metadata) in sorted_timelines {
|
||||
let (index_part, remote_client) = remote_index_and_client
|
||||
.remove(&timeline_id)
|
||||
@@ -644,20 +674,19 @@ impl Tenant {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// get size of all remote timelines
|
||||
/// Get the sum of all remote timeline sizes
|
||||
///
|
||||
/// This function relies on the index_part instead of listing the remote storage
|
||||
///
|
||||
pub async fn get_remote_size(&self) -> anyhow::Result<u64> {
|
||||
pub fn remote_size(&self) -> u64 {
|
||||
let mut size = 0;
|
||||
|
||||
for timeline in self.list_timelines().iter() {
|
||||
for timeline in self.list_timelines() {
|
||||
if let Some(remote_client) = &timeline.remote_client {
|
||||
size += remote_client.get_remote_physical_size();
|
||||
}
|
||||
}
|
||||
|
||||
Ok(size)
|
||||
size
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(timeline_id=%timeline_id))]
|
||||
@@ -740,12 +769,13 @@ impl Tenant {
|
||||
/// If the loading fails for some reason, the Tenant will go into Broken
|
||||
/// state.
|
||||
#[instrument(skip_all, fields(tenant_id=%tenant_id))]
|
||||
pub fn spawn_load(
|
||||
pub(crate) fn spawn_load(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
init_order: Option<InitializationOrder>,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
ctx: &RequestContext,
|
||||
) -> Arc<Tenant> {
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
@@ -765,7 +795,7 @@ impl Tenant {
|
||||
tenant_conf,
|
||||
wal_redo_manager,
|
||||
tenant_id,
|
||||
remote_storage,
|
||||
remote_storage.clone(),
|
||||
);
|
||||
let tenant = Arc::new(tenant);
|
||||
|
||||
@@ -781,27 +811,83 @@ impl Tenant {
|
||||
"initial tenant load",
|
||||
false,
|
||||
async move {
|
||||
let make_broken = |t: &Tenant, err: anyhow::Error| {
|
||||
error!("load failed, setting tenant state to Broken: {err:?}");
|
||||
t.state.send_modify(|state| {
|
||||
assert!(
|
||||
matches!(*state, TenantState::Loading | TenantState::Stopping { .. }),
|
||||
"the loading task owns the tenant state until activation is complete"
|
||||
);
|
||||
*state = TenantState::broken_from_reason(err.to_string());
|
||||
});
|
||||
};
|
||||
|
||||
let mut init_order = init_order;
|
||||
|
||||
// take the completion because initial tenant loading will complete when all of
|
||||
// these tasks complete.
|
||||
let _completion = init_order.as_mut().and_then(|x| x.initial_tenant_load.take());
|
||||
let _completion = init_order
|
||||
.as_mut()
|
||||
.and_then(|x| x.initial_tenant_load.take());
|
||||
|
||||
// Don't block pageserver startup on figuring out deletion status
|
||||
let pending_deletion = {
|
||||
match DeleteTenantFlow::should_resume_deletion(
|
||||
conf,
|
||||
remote_storage.as_ref(),
|
||||
&tenant_clone,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(should_resume_deletion) => should_resume_deletion,
|
||||
Err(err) => {
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(err));
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
info!("pending deletion {}", pending_deletion.is_some());
|
||||
|
||||
if let Some(deletion) = pending_deletion {
|
||||
// as we are no longer loading, signal completion by dropping
|
||||
// the completion while we resume deletion
|
||||
drop(_completion);
|
||||
// do not hold on to initial_logical_size_attempt, as it would prevent loading from proceeding without a timeout
|
||||
let _ = init_order
|
||||
.as_mut()
|
||||
.and_then(|x| x.initial_logical_size_attempt.take());
|
||||
|
||||
match DeleteTenantFlow::resume(
|
||||
deletion,
|
||||
&tenant_clone,
|
||||
init_order.as_ref(),
|
||||
tenants,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Err(err) => {
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(err));
|
||||
return Ok(());
|
||||
}
|
||||
Ok(()) => return Ok(()),
|
||||
}
|
||||
}
|
||||
|
||||
let background_jobs_can_start =
|
||||
init_order.as_ref().map(|x| &x.background_jobs_can_start);
|
||||
|
||||
match tenant_clone.load(init_order.as_ref(), &ctx).await {
|
||||
Ok(()) => {
|
||||
debug!("load finished, activating");
|
||||
let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start);
|
||||
debug!("load finished",);
|
||||
|
||||
tenant_clone.activate(broker_client, background_jobs_can_start, &ctx);
|
||||
}
|
||||
Err(err) => {
|
||||
error!("load failed, setting tenant state to Broken: {err:?}");
|
||||
tenant_clone.state.send_modify(|state| {
|
||||
assert_eq!(*state, TenantState::Loading, "the loading task owns the tenant state until activation is complete");
|
||||
*state = TenantState::broken_from_reason(err.to_string());
|
||||
});
|
||||
}
|
||||
Err(err) => make_broken(&tenant_clone, err),
|
||||
}
|
||||
Ok(())
|
||||
|
||||
Ok(())
|
||||
}
|
||||
.instrument({
|
||||
let span = tracing::info_span!(parent: None, "load", tenant_id=%tenant_id);
|
||||
@@ -877,6 +963,8 @@ impl Tenant {
|
||||
)
|
||||
})?;
|
||||
|
||||
info!("Found deletion mark for timeline {}", timeline_id);
|
||||
|
||||
match load_metadata(self.conf, &self.tenant_id, &timeline_id) {
|
||||
Ok(metadata) => {
|
||||
timelines_to_resume_deletion.push((timeline_id, Some(metadata)))
|
||||
@@ -966,9 +1054,11 @@ impl Tenant {
|
||||
|
||||
// Sort the array of timeline IDs into tree-order, so that parent comes before
|
||||
// all its children.
|
||||
tree_sort_timelines(timelines_to_load).map(|sorted_timelines| TenantDirectoryScan {
|
||||
sorted_timelines_to_load: sorted_timelines,
|
||||
timelines_to_resume_deletion,
|
||||
tree_sort_timelines(timelines_to_load, |m| m.ancestor_timeline()).map(|sorted_timelines| {
|
||||
TenantDirectoryScan {
|
||||
sorted_timelines_to_load: sorted_timelines,
|
||||
timelines_to_resume_deletion,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
@@ -1640,6 +1730,8 @@ impl Tenant {
|
||||
post_state = <&'static str>::from(&*current_state),
|
||||
"activation attempt finished"
|
||||
);
|
||||
|
||||
TENANT_ACTIVATION.observe(elapsed.as_secs_f64());
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -1680,7 +1772,7 @@ impl Tenant {
|
||||
// It's messed up.
|
||||
// we just ignore the failure to stop
|
||||
|
||||
match self.set_stopping(shutdown_progress).await {
|
||||
match self.set_stopping(shutdown_progress, false).await {
|
||||
Ok(()) => {}
|
||||
Err(SetStoppingError::Broken) => {
|
||||
// assume that this is acceptable
|
||||
@@ -1720,18 +1812,25 @@ impl Tenant {
|
||||
/// This function waits for the tenant to become active if it isn't already, before transitioning it into Stopping state.
|
||||
///
|
||||
/// This function is not cancel-safe!
|
||||
async fn set_stopping(&self, progress: completion::Barrier) -> Result<(), SetStoppingError> {
|
||||
///
|
||||
/// `allow_transition_from_loading` is needed for the special case of the loading task deleting the tenant.
|
||||
async fn set_stopping(
|
||||
&self,
|
||||
progress: completion::Barrier,
|
||||
allow_transition_from_loading: bool,
|
||||
) -> Result<(), SetStoppingError> {
|
||||
let mut rx = self.state.subscribe();
|
||||
|
||||
// cannot stop before we're done activating, so wait until we're done activating
|
||||
rx.wait_for(|state| match state {
|
||||
TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
|
||||
TenantState::Activating(_) | TenantState::Attaching => {
|
||||
info!(
|
||||
"waiting for {} to turn Active|Broken|Stopping",
|
||||
<&'static str>::from(state)
|
||||
);
|
||||
false
|
||||
}
|
||||
TenantState::Loading => allow_transition_from_loading,
|
||||
TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true,
|
||||
})
|
||||
.await
|
||||
@@ -1740,9 +1839,16 @@ impl Tenant {
|
||||
// we now know we're done activating, let's see whether this task is the winner to transition into Stopping
|
||||
let mut err = None;
|
||||
let stopping = self.state.send_if_modified(|current_state| match current_state {
|
||||
TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
|
||||
TenantState::Activating(_) | TenantState::Attaching => {
|
||||
unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
|
||||
}
|
||||
TenantState::Loading => {
|
||||
if !allow_transition_from_loading {
|
||||
unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
|
||||
};
|
||||
*current_state = TenantState::Stopping { progress };
|
||||
true
|
||||
}
|
||||
TenantState::Active => {
|
||||
// FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines
|
||||
// are created after the transition to Stopping. That's harmless, as the Timelines
|
||||
@@ -1811,6 +1917,10 @@ impl Tenant {
|
||||
.expect("cannot drop self.state while on a &self method");
|
||||
|
||||
// we now know we're done activating, let's see whether this task is the winner to transition into Broken
|
||||
self.set_broken_no_wait(reason)
|
||||
}
|
||||
|
||||
pub(crate) fn set_broken_no_wait(&self, reason: String) {
|
||||
self.state.send_modify(|current_state| {
|
||||
match *current_state {
|
||||
TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
|
||||
@@ -1876,22 +1986,28 @@ impl Tenant {
|
||||
/// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id),
|
||||
/// perform a topological sort, so that the parent of each timeline comes
|
||||
/// before the children.
|
||||
fn tree_sort_timelines(
|
||||
timelines: HashMap<TimelineId, TimelineMetadata>,
|
||||
) -> anyhow::Result<Vec<(TimelineId, TimelineMetadata)>> {
|
||||
/// E extracts the ancestor from T
|
||||
/// This allows `T` to vary: it can be `TimelineMetadata`, `Timeline` itself, etc.
|
||||
fn tree_sort_timelines<T, E>(
|
||||
timelines: HashMap<TimelineId, T>,
|
||||
extractor: E,
|
||||
) -> anyhow::Result<Vec<(TimelineId, T)>>
|
||||
where
|
||||
E: Fn(&T) -> Option<TimelineId>,
|
||||
{
|
||||
let mut result = Vec::with_capacity(timelines.len());
|
||||
|
||||
let mut now = Vec::with_capacity(timelines.len());
|
||||
// (ancestor, children)
|
||||
let mut later: HashMap<TimelineId, Vec<(TimelineId, TimelineMetadata)>> =
|
||||
let mut later: HashMap<TimelineId, Vec<(TimelineId, T)>> =
|
||||
HashMap::with_capacity(timelines.len());
|
||||
|
||||
for (timeline_id, metadata) in timelines {
|
||||
if let Some(ancestor_id) = metadata.ancestor_timeline() {
|
||||
for (timeline_id, value) in timelines {
|
||||
if let Some(ancestor_id) = extractor(&value) {
|
||||
let children = later.entry(ancestor_id).or_default();
|
||||
children.push((timeline_id, metadata));
|
||||
children.push((timeline_id, value));
|
||||
} else {
|
||||
now.push((timeline_id, metadata));
|
||||
now.push((timeline_id, value));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2060,7 +2176,7 @@ impl Tenant {
|
||||
remote_client,
|
||||
pg_version,
|
||||
initial_logical_size_can_start.cloned(),
|
||||
initial_logical_size_attempt.cloned(),
|
||||
initial_logical_size_attempt.cloned().flatten(),
|
||||
state,
|
||||
);
|
||||
|
||||
@@ -2144,6 +2260,7 @@ impl Tenant {
|
||||
cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
|
||||
cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
|
||||
eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
|
||||
delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2160,6 +2277,7 @@ impl Tenant {
|
||||
// FIXME If the config file is not found, assume that we're attaching
|
||||
// a detached tenant and config is passed via attach command.
|
||||
// https://github.com/neondatabase/neon/issues/1555
|
||||
// OR: we're loading after incomplete deletion that managed to remove config.
|
||||
if !target_config_path.exists() {
|
||||
info!("tenant config not found in {target_config_display}");
|
||||
return Ok(TenantConfOpt::default());
|
||||
@@ -2889,7 +3007,7 @@ impl Tenant {
|
||||
.set(size);
|
||||
}
|
||||
|
||||
pub fn get_cached_synthetic_size(&self) -> u64 {
|
||||
pub fn cached_synthetic_size(&self) -> u64 {
|
||||
self.cached_synthetic_tenant_size.load(Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -21,15 +21,15 @@ where
|
||||
R: BlockReader,
|
||||
{
|
||||
/// Read a blob into a new buffer.
|
||||
pub fn read_blob(&mut self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
|
||||
pub async fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
|
||||
let mut buf = Vec::new();
|
||||
self.read_blob_into_buf(offset, &mut buf)?;
|
||||
self.read_blob_into_buf(offset, &mut buf).await?;
|
||||
Ok(buf)
|
||||
}
|
||||
/// Read blob into the given buffer. Any previous contents in the buffer
|
||||
/// are overwritten.
|
||||
pub fn read_blob_into_buf(
|
||||
&mut self,
|
||||
pub async fn read_blob_into_buf(
|
||||
&self,
|
||||
offset: u64,
|
||||
dstbuf: &mut Vec<u8>,
|
||||
) -> Result<(), std::io::Error> {
|
||||
|
||||
@@ -2,8 +2,7 @@
|
||||
//! Low-level Block-oriented I/O functions
|
||||
//!
|
||||
|
||||
use crate::page_cache;
|
||||
use crate::page_cache::{ReadBufResult, PAGE_SZ};
|
||||
use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ};
|
||||
use bytes::Bytes;
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use std::os::unix::fs::FileExt;
|
||||
@@ -15,14 +14,12 @@ use std::sync::atomic::AtomicU64;
|
||||
/// There are currently two implementations: EphemeralFile, and FileBlockReader
|
||||
/// below.
|
||||
pub trait BlockReader {
|
||||
type BlockLease: Deref<Target = [u8; PAGE_SZ]> + 'static;
|
||||
|
||||
///
|
||||
/// Read a block. Returns a "lease" object that can be used to
|
||||
/// access to the contents of the page. (For the page cache, the
|
||||
/// lease object represents a lock on the buffer.)
|
||||
///
|
||||
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error>;
|
||||
fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error>;
|
||||
|
||||
///
|
||||
/// Create a new "cursor" for reading from this reader.
|
||||
@@ -41,13 +38,48 @@ impl<B> BlockReader for &B
|
||||
where
|
||||
B: BlockReader,
|
||||
{
|
||||
type BlockLease = B::BlockLease;
|
||||
|
||||
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
|
||||
fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
|
||||
(*self).read_blk(blknum)
|
||||
}
|
||||
}
|
||||
|
||||
/// A block accessible for reading
|
||||
///
|
||||
/// During builds with `#[cfg(test)]`, this is a proper enum
|
||||
/// with two variants to support testing code. During normal
|
||||
/// builds, it just has one variant and is thus a cheap newtype
|
||||
/// wrapper of [`PageReadGuard`]
|
||||
pub enum BlockLease {
|
||||
PageReadGuard(PageReadGuard<'static>),
|
||||
#[cfg(test)]
|
||||
Rc(std::rc::Rc<[u8; PAGE_SZ]>),
|
||||
}
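To illustrate why the non-test build stays a cheap wrapper, here is a small self-contained sketch of the same enum-plus-Deref pattern; Guard, Lease and the inline page array are stand-ins invented for the example, not the pageserver types.

use std::ops::Deref;
use std::rc::Rc;

const PAGE_SZ: usize = 8192;

// Stand-in for the page-cache read guard; the real PageReadGuard keeps the
// underlying page-cache slot pinned for as long as the lease is alive.
struct Guard([u8; PAGE_SZ]);

// Two variants, like BlockLease with #[cfg(test)] enabled; in non-test builds
// only the guard variant exists, so the wrapper adds no real cost.
enum Lease {
    Guard(Guard),
    #[allow(dead_code)]
    Owned(Rc<[u8; PAGE_SZ]>),
}

impl Deref for Lease {
    type Target = [u8; PAGE_SZ];
    fn deref(&self) -> &Self::Target {
        match self {
            Lease::Guard(g) => &g.0,
            Lease::Owned(rc) => rc.as_ref(),
        }
    }
}

fn main() {
    let lease = Lease::Guard(Guard([0u8; PAGE_SZ]));
    // Callers only ever deref to the raw page bytes, whatever the variant is.
    assert_eq!(lease.len(), PAGE_SZ);
    assert_eq!(lease[0], 0u8);
}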
|
||||
|
||||
impl From<PageReadGuard<'static>> for BlockLease {
|
||||
fn from(value: PageReadGuard<'static>) -> Self {
|
||||
BlockLease::PageReadGuard(value)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl From<std::rc::Rc<[u8; PAGE_SZ]>> for BlockLease {
|
||||
fn from(value: std::rc::Rc<[u8; PAGE_SZ]>) -> Self {
|
||||
BlockLease::Rc(value)
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for BlockLease {
|
||||
type Target = [u8; PAGE_SZ];
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
match self {
|
||||
BlockLease::PageReadGuard(v) => v.deref(),
|
||||
#[cfg(test)]
|
||||
BlockLease::Rc(v) => v.deref(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// A "cursor" for efficiently reading multiple pages from a BlockReader
|
||||
///
|
||||
@@ -80,7 +112,7 @@ where
|
||||
BlockCursor { reader }
|
||||
}
|
||||
|
||||
pub fn read_blk(&mut self, blknum: u32) -> Result<R::BlockLease, std::io::Error> {
|
||||
pub fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
|
||||
self.reader.read_blk(blknum)
|
||||
}
|
||||
}
|
||||
@@ -118,9 +150,7 @@ impl<F> BlockReader for FileBlockReader<F>
|
||||
where
|
||||
F: FileExt,
|
||||
{
|
||||
type BlockLease = page_cache::PageReadGuard<'static>;
|
||||
|
||||
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
|
||||
fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
|
||||
// Look up the right page
|
||||
let cache = page_cache::get();
|
||||
loop {
|
||||
@@ -132,7 +162,7 @@ where
|
||||
format!("Failed to read immutable buf: {e:#}"),
|
||||
)
|
||||
})? {
|
||||
ReadBufResult::Found(guard) => break Ok(guard),
|
||||
ReadBufResult::Found(guard) => break Ok(guard.into()),
|
||||
ReadBufResult::NotFound(mut write_guard) => {
|
||||
// Read the page from disk into the buffer
|
||||
self.fill_buffer(write_guard.deref_mut(), blknum)?;
|
||||
|
||||
pageserver/src/tenant/delete.rs (new file, 572 lines)
@@ -0,0 +1,572 @@
|
||||
use std::{
|
||||
path::{Path, PathBuf},
|
||||
sync::Arc,
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use pageserver_api::models::TenantState;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
|
||||
use tokio::sync::OwnedMutexGuard;
|
||||
use tracing::{error, info, instrument, warn, Instrument, Span};
|
||||
|
||||
use utils::{
|
||||
backoff, completion, crashsafe, fs_ext,
|
||||
id::{TenantId, TimelineId},
|
||||
};
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
context::RequestContext,
|
||||
task_mgr::{self, TaskKind},
|
||||
InitializationOrder,
|
||||
};
|
||||
|
||||
use super::{
|
||||
mgr::{GetTenantError, TenantsMap},
|
||||
remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
|
||||
span,
|
||||
timeline::delete::DeleteTimelineFlow,
|
||||
tree_sort_timelines, DeleteTimelineError, Tenant,
|
||||
};
|
||||
|
||||
const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u32 = 3;
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum DeleteTenantError {
|
||||
#[error("GetTenant {0}")]
|
||||
Get(#[from] GetTenantError),
|
||||
|
||||
#[error("Invalid state {0}. Expected Active or Broken")]
|
||||
InvalidState(TenantState),
|
||||
|
||||
#[error("Tenant deletion is already in progress")]
|
||||
AlreadyInProgress,
|
||||
|
||||
#[error("Timeline {0}")]
|
||||
Timeline(#[from] DeleteTimelineError),
|
||||
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
type DeletionGuard = tokio::sync::OwnedMutexGuard<DeleteTenantFlow>;
|
||||
|
||||
fn remote_tenant_delete_mark_path(
|
||||
conf: &PageServerConf,
|
||||
tenant_id: &TenantId,
|
||||
) -> anyhow::Result<RemotePath> {
|
||||
let tenant_remote_path = conf
|
||||
.tenant_path(tenant_id)
|
||||
.strip_prefix(&conf.workdir)
|
||||
.context("Failed to strip workdir prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.context("tenant path")?;
|
||||
Ok(tenant_remote_path.join(Path::new("deleted")))
|
||||
}
|
||||
|
||||
async fn create_remote_delete_mark(
|
||||
conf: &PageServerConf,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
tenant_id: &TenantId,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id)?;
|
||||
|
||||
let data: &[u8] = &[];
|
||||
backoff::retry(
|
||||
|| async {
|
||||
remote_storage
|
||||
.upload(data, 0, &remote_mark_path, None)
|
||||
.await
|
||||
},
|
||||
|_e| false,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"mark_upload",
|
||||
)
|
||||
.await
|
||||
.context("mark_upload")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_local_delete_mark(
|
||||
conf: &PageServerConf,
|
||||
tenant_id: &TenantId,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
let marker_path = conf.tenant_deleted_mark_file_path(tenant_id);
|
||||
|
||||
// Note: we're ok to replace existing file.
|
||||
let _ = std::fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create(true)
|
||||
.open(&marker_path)
|
||||
.with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
|
||||
|
||||
crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn schedule_ordered_timeline_deletions(
|
||||
tenant: &Arc<Tenant>,
|
||||
) -> Result<Vec<(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>, TimelineId)>, DeleteTenantError> {
|
||||
// Tenant is stopping at this point. We know it will be deleted.
|
||||
// No new timelines should be created.
|
||||
// Tree-sort timelines so that we delete from the leaves to the root.
|
||||
// NOTE: by calling clone we release the mutex which creates a possibility for a race: pending deletion
|
||||
// can complete and remove timeline from the map in between our call to clone
|
||||
// and `DeleteTimelineFlow::run`, so `run` won't find the timeline in the `timelines` map.
|
||||
// timelines.lock is currently synchronous, so we can't hold it across an await point.
|
||||
// So just ignore NotFound error if we get it from `run`.
|
||||
// Beware: in case it becomes async and we try to hold it here, `run` also locks it, which can create a deadlock.
|
||||
let timelines = tenant.timelines.lock().unwrap().clone();
|
||||
let sorted =
|
||||
tree_sort_timelines(timelines, |t| t.get_ancestor_timeline_id()).context("tree sort")?;
|
||||
|
||||
let mut already_running_deletions = vec![];
|
||||
|
||||
for (timeline_id, _) in sorted.into_iter().rev() {
|
||||
if let Err(e) = DeleteTimelineFlow::run(tenant, timeline_id, true).await {
|
||||
match e {
|
||||
DeleteTimelineError::NotFound => {
|
||||
// Timeline deletion finished after call to clone above but before call
|
||||
// to `DeleteTimelineFlow::run` and removed timeline from the map.
|
||||
continue;
|
||||
}
|
||||
DeleteTimelineError::AlreadyInProgress(guard) => {
|
||||
already_running_deletions.push((guard, timeline_id));
|
||||
continue;
|
||||
}
|
||||
e => return Err(DeleteTenantError::Timeline(e)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(already_running_deletions)
|
||||
}
|
||||
|
||||
async fn ensure_timelines_dir_empty(timelines_path: &Path) -> Result<(), DeleteTenantError> {
|
||||
// Assert timelines dir is empty.
|
||||
if !fs_ext::is_directory_empty(timelines_path).await? {
|
||||
// Display first 10 items in directory
|
||||
let list = &fs_ext::list_dir(timelines_path).await.context("list_dir")?[..10];
|
||||
return Err(DeleteTenantError::Other(anyhow::anyhow!(
|
||||
"Timelines directory is not empty after all timelines deletion: {list:?}"
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn remove_tenant_remote_delete_mark(
|
||||
conf: &PageServerConf,
|
||||
remote_storage: Option<&GenericRemoteStorage>,
|
||||
tenant_id: &TenantId,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
if let Some(remote_storage) = remote_storage {
|
||||
let path = remote_tenant_delete_mark_path(conf, tenant_id)?;
|
||||
backoff::retry(
|
||||
|| async { remote_storage.delete(&path).await },
|
||||
|_e| false,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"remove_tenant_remote_delete_mark",
|
||||
)
|
||||
.await
|
||||
.context("remove_tenant_remote_delete_mark")?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Cleanup fs traces: tenant config, timelines dir local delete mark, tenant dir
|
||||
async fn cleanup_remaining_fs_traces(
|
||||
conf: &PageServerConf,
|
||||
tenant_id: &TenantId,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
let rm = |p: PathBuf, is_dir: bool| async move {
|
||||
if is_dir {
|
||||
tokio::fs::remove_dir(&p).await
|
||||
} else {
|
||||
tokio::fs::remove_file(&p).await
|
||||
}
|
||||
.or_else(fs_ext::ignore_not_found)
|
||||
.with_context(|| {
|
||||
let to_display = p.display();
|
||||
format!("failed to delete {to_display}")
|
||||
})
|
||||
};
|
||||
|
||||
rm(conf.tenant_config_path(tenant_id), false).await?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-remove-timelines-dir"
|
||||
))?
|
||||
});
|
||||
|
||||
rm(conf.timelines_path(tenant_id), true).await?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-remove-deleted-mark"
|
||||
))?
|
||||
});
|
||||
|
||||
// Make sure previous deletions are ordered before mark removal.
|
||||
// Otherwise there is no guarantee that they reach the disk before mark deletion.
|
||||
// So it's possible for the mark to reach the disk first and for other deletions
|
||||
// to be reordered later and thus missed if a crash occurs.
|
||||
// Note that we don't need to sync after the mark file is removed
|
||||
// because we can tolerate the case when mark file reappears on startup.
|
||||
let tenant_path = &conf.tenant_path(tenant_id);
|
||||
if tenant_path.exists() {
|
||||
crashsafe::fsync_async(&conf.tenant_path(tenant_id))
|
||||
.await
|
||||
.context("fsync_pre_mark_remove")?;
|
||||
}
|
||||
|
||||
rm(conf.tenant_deleted_mark_file_path(tenant_id), false).await?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-remove-tenant-dir"
|
||||
))?
|
||||
});
|
||||
|
||||
rm(conf.tenant_path(tenant_id), true).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Orchestrates tenant shut down of all tasks, removes its in-memory structures,
|
||||
/// and deletes its data from both disk and s3.
|
||||
/// The sequence of steps:
|
||||
/// 1. Upload remote deletion mark.
|
||||
/// 2. Create local mark file.
|
||||
/// 3. Shutdown tasks
|
||||
/// 4. Run ordered timeline deletions
|
||||
/// 5. Wait for timeline deletion operations that were scheduled before tenant deletion was requested
|
||||
/// 6. Remove remote mark
|
||||
/// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark
|
||||
/// It is resumable from any step in case a crash/restart occurs.
|
||||
/// There are two entrypoints to the process:
|
||||
/// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler.
|
||||
/// 2. [`DeleteTenantFlow::resume`] is called during restarts when local or remote deletion marks are still there.
|
||||
/// Note that the only other place that messes around with the timeline delete mark is the `Tenant::spawn_load` function.
|
||||
#[derive(Default)]
|
||||
pub enum DeleteTenantFlow {
|
||||
#[default]
|
||||
NotStarted,
|
||||
InProgress,
|
||||
Finished,
|
||||
}
|
||||
|
||||
impl DeleteTenantFlow {
|
||||
// These steps are run in the context of management api request handler.
|
||||
// Long running steps are continued to run in the background.
|
||||
// NB: If this fails half-way through, and is retried, the retry will go through
|
||||
// all the same steps again. Make sure the code here is idempotent, and don't
|
||||
// error out if some of the shutdown tasks have already been completed!
|
||||
// NOTE: static needed for background part.
|
||||
// We assume that calling code sets up the span with tenant_id.
|
||||
#[instrument(skip_all)]
|
||||
pub(crate) async fn run(
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
let (tenant, mut guard) = Self::prepare(tenants, tenant_id).await?;
|
||||
|
||||
if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
|
||||
tenant.set_broken(format!("{e:#}")).await;
|
||||
return Err(e);
|
||||
}
|
||||
|
||||
Self::schedule_background(guard, conf, remote_storage, tenants, tenant);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Helper function needed so we can match once on the returned error and transition the tenant into Broken state.
|
||||
// This is needed because tenant.shutdown is not idempotent. If the tenant state is set to Stopping, another call to tenant.shutdown
|
||||
// will result in an error, but here we need to be able to retry shutdown when tenant deletion is retried.
|
||||
// So the solution is to set tenant state to broken.
|
||||
async fn run_inner(
|
||||
guard: &mut OwnedMutexGuard<Self>,
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: Option<&GenericRemoteStorage>,
|
||||
tenant: &Tenant,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
guard.mark_in_progress()?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-create-remote-mark", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-create-remote-mark"
|
||||
))?
|
||||
});
|
||||
|
||||
// IDEA: implement detach as delete without remote storage. Then they would use the same lock (deletion_progress), so they won't contend.
|
||||
// Though that sounds scary; maybe use a different mark name?
|
||||
// Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
|
||||
if let Some(remote_storage) = &remote_storage {
|
||||
create_remote_delete_mark(conf, remote_storage, &tenant.tenant_id)
|
||||
.await
|
||||
.context("remote_mark")?
|
||||
}
|
||||
|
||||
fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-create-local-mark"
|
||||
))?
|
||||
});
|
||||
|
||||
create_local_delete_mark(conf, &tenant.tenant_id)
|
||||
.await
|
||||
.context("local delete mark")?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-background", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-background"
|
||||
))?
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn mark_in_progress(&mut self) -> anyhow::Result<()> {
|
||||
match self {
|
||||
Self::Finished => anyhow::bail!("Bug. Is in finished state"),
|
||||
Self::InProgress { .. } => { /* We're in a retry */ }
|
||||
Self::NotStarted => { /* Fresh start */ }
|
||||
}
|
||||
|
||||
*self = Self::InProgress;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn should_resume_deletion(
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: Option<&GenericRemoteStorage>,
|
||||
tenant: &Tenant,
|
||||
) -> Result<Option<DeletionGuard>, DeleteTenantError> {
|
||||
let acquire = |t: &Tenant| {
|
||||
Some(
|
||||
Arc::clone(&t.delete_progress)
|
||||
.try_lock_owned()
|
||||
.expect("we're the only owner during init"),
|
||||
)
|
||||
};
|
||||
|
||||
let tenant_id = tenant.tenant_id;
|
||||
// Check the local mark first; if it's there, there is no need to go to S3 to check whether the remote one exists.
|
||||
if conf.tenant_deleted_mark_file_path(&tenant_id).exists() {
|
||||
return Ok(acquire(tenant));
|
||||
}
|
||||
|
||||
let remote_storage = match remote_storage {
|
||||
Some(remote_storage) => remote_storage,
|
||||
None => return Ok(None),
|
||||
};
|
||||
|
||||
// If remote storage is there we rely on it
|
||||
let remote_mark_path = remote_tenant_delete_mark_path(conf, &tenant_id)?;
|
||||
|
||||
let result = backoff::retry(
|
||||
|| async { remote_storage.download(&remote_mark_path).await },
|
||||
|e| matches!(e, DownloadError::NotFound),
|
||||
SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
|
||||
SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
|
||||
"fetch_tenant_deletion_mark",
|
||||
)
|
||||
.await;
|
||||
|
||||
match result {
|
||||
Ok(_) => Ok(acquire(tenant)),
|
||||
Err(DownloadError::NotFound) => Ok(None),
|
||||
Err(e) => Err(anyhow::anyhow!(e)).context("should_resume_deletion")?,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn resume(
|
||||
guard: DeletionGuard,
|
||||
tenant: &Arc<Tenant>,
|
||||
init_order: Option<&InitializationOrder>,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
let (_, progress) = completion::channel();
|
||||
|
||||
tenant
|
||||
.set_stopping(progress, true)
|
||||
.await
|
||||
.expect("cant be stopping or broken");
|
||||
|
||||
// Do not consume valuable resources during the load phase, continue deletion once init phase is complete.
|
||||
let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start);
|
||||
if let Some(background) = background_jobs_can_start {
|
||||
info!("waiting for backgound jobs barrier");
|
||||
background.clone().wait().await;
|
||||
info!("ready for backgound jobs barrier");
|
||||
}
|
||||
|
||||
// Tenant may not be loadable if we fail late in cleanup_remaining_fs_traces (e.g. when removing the timelines dir)
|
||||
let timelines_path = tenant.conf.timelines_path(&tenant.tenant_id);
|
||||
if timelines_path.exists() {
|
||||
tenant.load(init_order, ctx).await.context("load")?;
|
||||
}
|
||||
|
||||
Self::background(
|
||||
guard,
|
||||
tenant.conf,
|
||||
tenant.remote_storage.clone(),
|
||||
tenants,
|
||||
tenant,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn prepare(
|
||||
tenants: &tokio::sync::RwLock<TenantsMap>,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<(Arc<Tenant>, tokio::sync::OwnedMutexGuard<Self>), DeleteTenantError> {
|
||||
let m = tenants.read().await;
|
||||
|
||||
let tenant = m
|
||||
.get(&tenant_id)
|
||||
.ok_or(GetTenantError::NotFound(tenant_id))?;
|
||||
|
||||
// FIXME: unsure about active only. Our init jobs may not be cancellable properly,
|
||||
// so at least for now allow deletions only for active tenants. TODO recheck
|
||||
// Broken and Stopping are needed for retries.
|
||||
if !matches!(
|
||||
tenant.current_state(),
|
||||
TenantState::Active | TenantState::Broken { .. }
|
||||
) {
|
||||
return Err(DeleteTenantError::InvalidState(tenant.current_state()));
|
||||
}
|
||||
|
||||
let guard = Arc::clone(&tenant.delete_progress)
|
||||
.try_lock_owned()
|
||||
.map_err(|_| DeleteTenantError::AlreadyInProgress)?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-shutdown", |_| {
|
||||
Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))?
|
||||
});
|
||||
|
||||
// make pageserver shutdown not wait for our completion
|
||||
let (_, progress) = completion::channel();
|
||||
|
||||
// It would be good to only set stopping here and continue shutdown in the background, but shutdown is not idempotent.
|
||||
// i.e. it is an error to do:
|
||||
// tenant.set_stopping
|
||||
// tenant.shutdown
|
||||
// It's also bad that we're holding tenants.read here.
|
||||
// TODO relax set_stopping to be idempotent?
|
||||
if tenant.shutdown(progress, false).await.is_err() {
|
||||
return Err(DeleteTenantError::Other(anyhow::anyhow!(
|
||||
"tenant shutdown is already in progress"
|
||||
)));
|
||||
}
|
||||
|
||||
Ok((Arc::clone(tenant), guard))
|
||||
}
|
||||
|
||||
fn schedule_background(
|
||||
guard: OwnedMutexGuard<Self>,
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
tenant: Arc<Tenant>,
|
||||
) {
|
||||
let tenant_id = tenant.tenant_id;
|
||||
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::TimelineDeletionWorker,
|
||||
Some(tenant_id),
|
||||
None,
|
||||
"tenant_delete",
|
||||
false,
|
||||
async move {
|
||||
if let Err(err) =
|
||||
Self::background(guard, conf, remote_storage, tenants, &tenant).await
|
||||
{
|
||||
error!("Error: {err:#}");
|
||||
tenant.set_broken(format!("{err:#}")).await;
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
.instrument({
|
||||
let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_id);
|
||||
span.follows_from(Span::current());
|
||||
span
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
async fn background(
|
||||
mut guard: OwnedMutexGuard<Self>,
|
||||
conf: &PageServerConf,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
tenant: &Arc<Tenant>,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
// Tree sort timelines, schedule delete for them. Mention retries from the console side.
|
||||
// Note that if deletion fails we don't mark timelines as broken;
|
||||
// the whole tenant will become broken, as per the `Self::schedule_background` logic.
|
||||
let already_running_timeline_deletions = schedule_ordered_timeline_deletions(tenant)
|
||||
.await
|
||||
.context("schedule_ordered_timeline_deletions")?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-polling-ongoing-deletions", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-polling-ongoing-deletions"
|
||||
))?
|
||||
});
|
||||
|
||||
// Wait for deletions that were already running at the moment when tenant deletion was requested.
|
||||
// When we can lock the deletion guard, it means that the corresponding timeline deletion has finished.
|
||||
for (guard, timeline_id) in already_running_timeline_deletions {
|
||||
let flow = guard.lock().await;
|
||||
if !flow.is_finished() {
|
||||
return Err(DeleteTenantError::Other(anyhow::anyhow!(
|
||||
"already running timeline deletion failed: {timeline_id}"
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
let timelines_path = conf.timelines_path(&tenant.tenant_id);
|
||||
// May not exist if we fail in cleanup_remaining_fs_traces after removing it
|
||||
if timelines_path.exists() {
|
||||
// sanity check to guard against layout changes
|
||||
ensure_timelines_dir_empty(&timelines_path)
|
||||
.await
|
||||
.context("timelines dir not empty")?;
|
||||
}
|
||||
|
||||
remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_id).await?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-cleanup-remaining-fs-traces"
|
||||
))?
|
||||
});
|
||||
|
||||
cleanup_remaining_fs_traces(conf, &tenant.tenant_id)
|
||||
.await
|
||||
.context("cleanup_remaining_fs_traces")?;
|
||||
|
||||
let mut locked = tenants.write().await;
|
||||
if locked.remove(&tenant.tenant_id).is_none() {
|
||||
warn!("Tenant got removed from tenants map during deletion");
|
||||
};
|
||||
|
||||
*guard = Self::Finished;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -20,6 +20,7 @@
|
||||
//!
|
||||
use byteorder::{ReadBytesExt, BE};
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
use either::Either;
|
||||
use hex;
|
||||
use std::{cmp::Ordering, io, result};
|
||||
use thiserror::Error;
|
||||
@@ -230,14 +231,15 @@ where
|
||||
///
|
||||
/// Read the value for given key. Returns the value, or None if it doesn't exist.
|
||||
///
|
||||
pub fn get(&self, search_key: &[u8; L]) -> Result<Option<u64>> {
|
||||
pub async fn get(&self, search_key: &[u8; L]) -> Result<Option<u64>> {
|
||||
let mut result: Option<u64> = None;
|
||||
self.visit(search_key, VisitDirection::Forwards, |key, value| {
|
||||
if key == search_key {
|
||||
result = Some(value);
|
||||
}
|
||||
false
|
||||
})?;
|
||||
})
|
||||
.await?;
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
@@ -246,7 +248,7 @@ where
|
||||
/// will be called for every key >= 'search_key' (or <= 'search_key', if scanning
|
||||
/// backwards)
|
||||
///
|
||||
pub fn visit<V>(
|
||||
pub async fn visit<V>(
|
||||
&self,
|
||||
search_key: &[u8; L],
|
||||
dir: VisitDirection,
|
||||
@@ -255,117 +257,77 @@ where
|
||||
where
|
||||
V: FnMut(&[u8], u64) -> bool,
|
||||
{
|
||||
self.search_recurse(self.root_blk, search_key, dir, &mut visitor)
|
||||
}
|
||||
let mut stack = Vec::new();
|
||||
stack.push((self.root_blk, None));
|
||||
while let Some((node_blknum, opt_iter)) = stack.pop() {
|
||||
// Locate the node.
|
||||
let node_buf = self.reader.read_blk(self.start_blk + node_blknum)?;
|
||||
|
||||
fn search_recurse<V>(
|
||||
&self,
|
||||
node_blknum: u32,
|
||||
search_key: &[u8; L],
|
||||
dir: VisitDirection,
|
||||
visitor: &mut V,
|
||||
) -> Result<bool>
|
||||
where
|
||||
V: FnMut(&[u8], u64) -> bool,
|
||||
{
|
||||
// Locate the node.
|
||||
let blk = self.reader.read_blk(self.start_blk + node_blknum)?;
|
||||
let node = OnDiskNode::deparse(node_buf.as_ref())?;
|
||||
let prefix_len = node.prefix_len as usize;
|
||||
let suffix_len = node.suffix_len as usize;
|
||||
|
||||
// Search all entries on this node
|
||||
self.search_node(blk.as_ref(), search_key, dir, visitor)
|
||||
}
|
||||
assert!(node.num_children > 0);
|
||||
|
||||
fn search_node<V>(
|
||||
&self,
|
||||
node_buf: &[u8],
|
||||
search_key: &[u8; L],
|
||||
dir: VisitDirection,
|
||||
visitor: &mut V,
|
||||
) -> Result<bool>
|
||||
where
|
||||
V: FnMut(&[u8], u64) -> bool,
|
||||
{
|
||||
let node = OnDiskNode::deparse(node_buf)?;
|
||||
let prefix_len = node.prefix_len as usize;
|
||||
let suffix_len = node.suffix_len as usize;
|
||||
let mut keybuf = Vec::new();
|
||||
keybuf.extend(node.prefix);
|
||||
keybuf.resize(prefix_len + suffix_len, 0);
|
||||
|
||||
assert!(node.num_children > 0);
|
||||
|
||||
let mut keybuf = Vec::new();
|
||||
keybuf.extend(node.prefix);
|
||||
keybuf.resize(prefix_len + suffix_len, 0);
|
||||
|
||||
if dir == VisitDirection::Forwards {
|
||||
// Locate the first match
|
||||
let mut idx = match node.binary_search(search_key, keybuf.as_mut_slice()) {
|
||||
Ok(idx) => idx,
|
||||
Err(idx) => {
|
||||
if node.level == 0 {
|
||||
// Imagine that the node contains the following keys:
|
||||
//
|
||||
// 1
|
||||
// 3 <-- idx
|
||||
// 5
|
||||
//
|
||||
// If the search key is '2' and there is exact match,
|
||||
// the binary search would return the index of key
|
||||
// '3'. That's cool, '3' is the first key to return.
|
||||
let mut iter = if let Some(iter) = opt_iter {
|
||||
iter
|
||||
} else if dir == VisitDirection::Forwards {
|
||||
// Locate the first match
|
||||
let idx = match node.binary_search(search_key, keybuf.as_mut_slice()) {
|
||||
Ok(idx) => idx,
|
||||
Err(idx) => {
|
||||
if node.level == 0 {
|
||||
// Imagine that the node contains the following keys:
|
||||
//
|
||||
// 1
|
||||
// 3 <-- idx
|
||||
// 5
|
||||
//
|
||||
// If the search key is '2' and there is exact match,
|
||||
// the binary search would return the index of key
|
||||
// '3'. That's cool, '3' is the first key to return.
|
||||
idx
|
||||
} else {
|
||||
// This is an internal page, so each key represents a lower
|
||||
// bound for what's in the child page. If there is no exact
|
||||
// match, we have to return the *previous* entry.
|
||||
//
|
||||
// 1 <-- return this
|
||||
// 3 <-- idx
|
||||
// 5
|
||||
idx.saturating_sub(1)
|
||||
}
|
||||
}
|
||||
};
|
||||
Either::Left(idx..node.num_children.into())
|
||||
} else {
|
||||
let idx = match node.binary_search(search_key, keybuf.as_mut_slice()) {
|
||||
Ok(idx) => {
|
||||
// Exact match. That's the first entry to return, and walk
|
||||
// backwards from there.
|
||||
idx
|
||||
} else {
|
||||
// This is an internal page, so each key represents a lower
|
||||
// bound for what's in the child page. If there is no exact
|
||||
// match, we have to return the *previous* entry.
|
||||
//
|
||||
// 1 <-- return this
|
||||
// 3 <-- idx
|
||||
// 5
|
||||
idx.saturating_sub(1)
|
||||
}
|
||||
}
|
||||
};
|
||||
// idx points to the first match now. Keep going from there
|
||||
let mut key_off = idx * suffix_len;
|
||||
while idx < node.num_children as usize {
|
||||
let suffix = &node.keys[key_off..key_off + suffix_len];
|
||||
keybuf[prefix_len..].copy_from_slice(suffix);
|
||||
let value = node.value(idx);
|
||||
#[allow(clippy::collapsible_if)]
|
||||
if node.level == 0 {
|
||||
// leaf
|
||||
if !visitor(&keybuf, value.to_u64()) {
|
||||
return Ok(false);
|
||||
Err(idx) => {
|
||||
// No exact match. The binary search returned the index of the
|
||||
// first key that's > search_key. Back off by one, and walk
|
||||
// backwards from there.
|
||||
if let Some(idx) = idx.checked_sub(1) {
|
||||
idx
|
||||
} else {
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
#[allow(clippy::collapsible_if)]
|
||||
if !self.search_recurse(value.to_blknum(), search_key, dir, visitor)? {
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
idx += 1;
|
||||
key_off += suffix_len;
|
||||
}
|
||||
} else {
|
||||
let mut idx = match node.binary_search(search_key, keybuf.as_mut_slice()) {
|
||||
Ok(idx) => {
|
||||
// Exact match. That's the first entry to return, and walk
|
||||
// backwards from there. (The loop below starts from 'idx -
|
||||
// 1', so add one here to compensate.)
|
||||
idx + 1
|
||||
}
|
||||
Err(idx) => {
|
||||
// No exact match. The binary search returned the index of the
|
||||
// first key that's > search_key. Back off by one, and walk
|
||||
// backwards from there. (The loop below starts from idx - 1,
|
||||
// so we don't need to subtract one here)
|
||||
idx
|
||||
}
|
||||
};
|
||||
Either::Right((0..=idx).rev())
|
||||
};
|
||||
|
||||
// idx points to the first match + 1 now. Keep going from there.
|
||||
let mut key_off = idx * suffix_len;
|
||||
while idx > 0 {
|
||||
idx -= 1;
|
||||
key_off -= suffix_len;
|
||||
// idx points to the first match now. Keep going from there
|
||||
while let Some(idx) = iter.next() {
|
||||
let key_off = idx * suffix_len;
|
||||
let suffix = &node.keys[key_off..key_off + suffix_len];
|
||||
keybuf[prefix_len..].copy_from_slice(suffix);
|
||||
let value = node.value(idx);
|
||||
@@ -376,12 +338,8 @@ where
|
||||
return Ok(false);
|
||||
}
|
||||
} else {
|
||||
#[allow(clippy::collapsible_if)]
|
||||
if !self.search_recurse(value.to_blknum(), search_key, dir, visitor)? {
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
if idx == 0 {
|
||||
stack.push((node_blknum, Some(iter)));
|
||||
stack.push((value.to_blknum(), None));
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -727,6 +685,7 @@ impl<const L: usize> BuildNode<L> {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tenant::block_io::BlockLease;
|
||||
use rand::Rng;
|
||||
use std::collections::BTreeMap;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
@@ -741,12 +700,10 @@ mod tests {
|
||||
}
|
||||
}
|
||||
impl BlockReader for TestDisk {
|
||||
type BlockLease = std::rc::Rc<[u8; PAGE_SZ]>;
|
||||
|
||||
fn read_blk(&self, blknum: u32) -> io::Result<Self::BlockLease> {
|
||||
fn read_blk(&self, blknum: u32) -> io::Result<BlockLease> {
|
||||
let mut buf = [0u8; PAGE_SZ];
|
||||
buf.copy_from_slice(&self.blocks[blknum as usize]);
|
||||
Ok(std::rc::Rc::new(buf))
|
||||
Ok(std::rc::Rc::new(buf).into())
|
||||
}
|
||||
}
|
||||
impl BlockWriter for &mut TestDisk {
|
||||
@@ -782,12 +739,12 @@ mod tests {
|
||||
|
||||
// Test the `get` function on all the keys.
|
||||
for (key, val) in all_data.iter() {
|
||||
assert_eq!(reader.get(key)?, Some(*val));
|
||||
assert_eq!(reader.get(key).await?, Some(*val));
|
||||
}
|
||||
// And on some keys that don't exist
|
||||
assert_eq!(reader.get(b"aaaaaa")?, None);
|
||||
assert_eq!(reader.get(b"zzzzzz")?, None);
|
||||
assert_eq!(reader.get(b"xaaabx")?, None);
|
||||
assert_eq!(reader.get(b"aaaaaa").await?, None);
|
||||
assert_eq!(reader.get(b"zzzzzz").await?, None);
|
||||
assert_eq!(reader.get(b"xaaabx").await?, None);
|
||||
|
||||
// Test search with `visit` function
|
||||
let search_key = b"xabaaa";
|
||||
@@ -798,10 +755,12 @@ mod tests {
|
||||
.collect();
|
||||
|
||||
let mut data = Vec::new();
|
||||
reader.visit(search_key, VisitDirection::Forwards, |key, value| {
|
||||
data.push((key.to_vec(), value));
|
||||
true
|
||||
})?;
|
||||
reader
|
||||
.visit(search_key, VisitDirection::Forwards, |key, value| {
|
||||
data.push((key.to_vec(), value));
|
||||
true
|
||||
})
|
||||
.await?;
|
||||
assert_eq!(data, expected);
|
||||
|
||||
// Test a backwards scan
|
||||
@@ -812,16 +771,20 @@ mod tests {
|
||||
.collect();
|
||||
expected.reverse();
|
||||
let mut data = Vec::new();
|
||||
reader.visit(search_key, VisitDirection::Backwards, |key, value| {
|
||||
data.push((key.to_vec(), value));
|
||||
true
|
||||
})?;
|
||||
reader
|
||||
.visit(search_key, VisitDirection::Backwards, |key, value| {
|
||||
data.push((key.to_vec(), value));
|
||||
true
|
||||
})
|
||||
.await?;
|
||||
assert_eq!(data, expected);
|
||||
|
||||
// Backward scan where nothing matches
|
||||
reader.visit(b"aaaaaa", VisitDirection::Backwards, |key, value| {
|
||||
panic!("found unexpected key {}: {}", hex::encode(key), value);
|
||||
})?;
|
||||
reader
|
||||
.visit(b"aaaaaa", VisitDirection::Backwards, |key, value| {
|
||||
panic!("found unexpected key {}: {}", hex::encode(key), value);
|
||||
})
|
||||
.await?;
|
||||
|
||||
// Full scan
|
||||
let expected: Vec<(Vec<u8>, u64)> = all_data
|
||||
@@ -829,10 +792,12 @@ mod tests {
|
||||
.map(|(key, value)| (key.to_vec(), *value))
|
||||
.collect();
|
||||
let mut data = Vec::new();
|
||||
reader.visit(&[0u8; 6], VisitDirection::Forwards, |key, value| {
|
||||
data.push((key.to_vec(), value));
|
||||
true
|
||||
})?;
|
||||
reader
|
||||
.visit(&[0u8; 6], VisitDirection::Forwards, |key, value| {
|
||||
data.push((key.to_vec(), value));
|
||||
true
|
||||
})
|
||||
.await?;
|
||||
assert_eq!(data, expected);
|
||||
|
||||
Ok(())
|
||||
@@ -880,13 +845,15 @@ mod tests {
|
||||
for search_key_int in 0..(NUM_KEYS * 2 + 10) {
|
||||
let search_key = u64::to_be_bytes(search_key_int);
|
||||
assert_eq!(
|
||||
reader.get(&search_key)?,
|
||||
reader.get(&search_key).await?,
|
||||
all_data.get(&search_key_int).cloned()
|
||||
);
|
||||
|
||||
// Test a forward scan starting with this key
|
||||
result.lock().unwrap().clear();
|
||||
reader.visit(&search_key, VisitDirection::Forwards, take_ten)?;
|
||||
reader
|
||||
.visit(&search_key, VisitDirection::Forwards, take_ten)
|
||||
.await?;
|
||||
let expected = all_data
|
||||
.range(search_key_int..)
|
||||
.take(10)
|
||||
@@ -896,7 +863,9 @@ mod tests {
|
||||
|
||||
// And a backwards scan
|
||||
result.lock().unwrap().clear();
|
||||
reader.visit(&search_key, VisitDirection::Backwards, take_ten)?;
|
||||
reader
|
||||
.visit(&search_key, VisitDirection::Backwards, take_ten)
|
||||
.await?;
|
||||
let expected = all_data
|
||||
.range(..=search_key_int)
|
||||
.rev()
|
||||
@@ -910,7 +879,9 @@ mod tests {
|
||||
let search_key = u64::to_be_bytes(0);
|
||||
limit.store(usize::MAX, Ordering::Relaxed);
|
||||
result.lock().unwrap().clear();
|
||||
reader.visit(&search_key, VisitDirection::Forwards, take_ten)?;
|
||||
reader
|
||||
.visit(&search_key, VisitDirection::Forwards, take_ten)
|
||||
.await?;
|
||||
let expected = all_data
|
||||
.iter()
|
||||
.map(|(&key, &val)| (key, val))
|
||||
@@ -921,7 +892,9 @@ mod tests {
|
||||
let search_key = u64::to_be_bytes(u64::MAX);
|
||||
limit.store(usize::MAX, Ordering::Relaxed);
|
||||
result.lock().unwrap().clear();
|
||||
reader.visit(&search_key, VisitDirection::Backwards, take_ten)?;
|
||||
reader
|
||||
.visit(&search_key, VisitDirection::Backwards, take_ten)
|
||||
.await?;
|
||||
let expected = all_data
|
||||
.iter()
|
||||
.rev()
|
||||
@@ -932,8 +905,8 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn random_data() -> Result<()> {
|
||||
#[tokio::test]
|
||||
async fn random_data() -> Result<()> {
|
||||
// Generate random keys with exponential distribution, to
|
||||
// exercise the prefix compression
|
||||
const NUM_KEYS: usize = 100000;
|
||||
@@ -960,19 +933,23 @@ mod tests {
|
||||
// Test get() operation on all the keys
|
||||
for (&key, &val) in all_data.iter() {
|
||||
let search_key = u128::to_be_bytes(key);
|
||||
assert_eq!(reader.get(&search_key)?, Some(val));
|
||||
assert_eq!(reader.get(&search_key).await?, Some(val));
|
||||
}
|
||||
|
||||
// Test get() operations on random keys, most of which will not exist
|
||||
for _ in 0..100000 {
|
||||
let key_int = rand::thread_rng().gen::<u128>();
|
||||
let search_key = u128::to_be_bytes(key_int);
|
||||
assert!(reader.get(&search_key)? == all_data.get(&key_int).cloned());
|
||||
assert!(reader.get(&search_key).await? == all_data.get(&key_int).cloned());
|
||||
}
|
||||
|
||||
// Test boundary cases
|
||||
assert!(reader.get(&u128::to_be_bytes(u128::MIN))? == all_data.get(&u128::MIN).cloned());
|
||||
assert!(reader.get(&u128::to_be_bytes(u128::MAX))? == all_data.get(&u128::MAX).cloned());
|
||||
assert!(
|
||||
reader.get(&u128::to_be_bytes(u128::MIN)).await? == all_data.get(&u128::MIN).cloned()
|
||||
);
|
||||
assert!(
|
||||
reader.get(&u128::to_be_bytes(u128::MAX)).await? == all_data.get(&u128::MAX).cloned()
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1014,15 +991,17 @@ mod tests {
|
||||
|
||||
// Test get() operation on all the keys
|
||||
for (key, val) in disk_btree_test_data::TEST_DATA {
|
||||
assert_eq!(reader.get(&key)?, Some(val));
|
||||
assert_eq!(reader.get(&key).await?, Some(val));
|
||||
}
|
||||
|
||||
// Test full scan
|
||||
let mut count = 0;
|
||||
reader.visit(&[0u8; 26], VisitDirection::Forwards, |_key, _value| {
|
||||
count += 1;
|
||||
true
|
||||
})?;
|
||||
reader
|
||||
.visit(&[0u8; 26], VisitDirection::Forwards, |_key, _value| {
|
||||
count += 1;
|
||||
true
|
||||
})
|
||||
.await?;
|
||||
assert_eq!(count, disk_btree_test_data::TEST_DATA.len());
|
||||
|
||||
reader.dump().await?;
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
use crate::config::PageServerConf;
|
||||
use crate::page_cache::{self, ReadBufResult, WriteBufResult, PAGE_SZ};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::BlockReader;
|
||||
use crate::tenant::block_io::{BlockLease, BlockReader};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use once_cell::sync::Lazy;
|
||||
use std::cmp::min;
|
||||
@@ -266,11 +266,17 @@ impl Drop for EphemeralFile {
|
||||
// unlink the file
|
||||
let res = std::fs::remove_file(&self.file.path);
|
||||
if let Err(e) = res {
|
||||
warn!(
|
||||
"could not remove ephemeral file '{}': {}",
|
||||
self.file.path.display(),
|
||||
e
|
||||
);
|
||||
if e.kind() != std::io::ErrorKind::NotFound {
|
||||
// never log NotFound errors, since we cannot do anything about them; on detach
|
||||
// the tenant directory is already gone.
|
||||
//
|
||||
// not found files might also be related to https://github.com/neondatabase/neon/issues/2442
|
||||
error!(
|
||||
"could not remove ephemeral file '{}': {}",
|
||||
self.file.path.display(),
|
||||
e
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -297,9 +303,7 @@ pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), io::Error>
|
||||
}
|
||||
|
||||
impl BlockReader for EphemeralFile {
|
||||
type BlockLease = page_cache::PageReadGuard<'static>;
|
||||
|
||||
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, io::Error> {
|
||||
fn read_blk(&self, blknum: u32) -> Result<BlockLease, io::Error> {
|
||||
// Look up the right page
|
||||
let cache = page_cache::get();
|
||||
loop {
|
||||
@@ -307,7 +311,7 @@ impl BlockReader for EphemeralFile {
|
||||
.read_ephemeral_buf(self.file_id, blknum)
|
||||
.map_err(|e| to_io_error(e, "Failed to read ephemeral buf"))?
|
||||
{
|
||||
ReadBufResult::Found(guard) => return Ok(guard),
|
||||
ReadBufResult::Found(guard) => return Ok(guard.into()),
|
||||
ReadBufResult::NotFound(mut write_guard) => {
|
||||
// Read the page from disk into the buffer
|
||||
self.fill_buffer(write_guard.deref_mut(), blknum)?;
|
||||
@@ -395,17 +399,26 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ephemeral_blobs() -> Result<(), io::Error> {
|
||||
#[tokio::test]
|
||||
async fn test_ephemeral_blobs() -> Result<(), io::Error> {
|
||||
let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?;
|
||||
|
||||
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?;
|
||||
|
||||
let pos_foo = file.write_blob(b"foo")?;
|
||||
assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice());
|
||||
assert_eq!(
|
||||
b"foo",
|
||||
file.block_cursor().read_blob(pos_foo).await?.as_slice()
|
||||
);
|
||||
let pos_bar = file.write_blob(b"bar")?;
|
||||
assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice());
|
||||
assert_eq!(b"bar", file.block_cursor().read_blob(pos_bar)?.as_slice());
|
||||
assert_eq!(
|
||||
b"foo",
|
||||
file.block_cursor().read_blob(pos_foo).await?.as_slice()
|
||||
);
|
||||
assert_eq!(
|
||||
b"bar",
|
||||
file.block_cursor().read_blob(pos_bar).await?.as_slice()
|
||||
);
|
||||
|
||||
let mut blobs = Vec::new();
|
||||
for i in 0..10000 {
|
||||
@@ -420,9 +433,9 @@ mod tests {
|
||||
blobs.push((pos, data));
|
||||
}
|
||||
|
||||
let mut cursor = BlockCursor::new(&file);
|
||||
let cursor = BlockCursor::new(&file);
|
||||
for (pos, expected) in blobs {
|
||||
let actual = cursor.read_blob(pos)?;
|
||||
let actual = cursor.read_blob(pos).await?;
|
||||
assert_eq!(actual, expected);
|
||||
}
|
||||
|
||||
@@ -431,7 +444,7 @@ mod tests {
|
||||
large_data.resize(20000, 0);
|
||||
thread_rng().fill_bytes(&mut large_data);
|
||||
let pos_large = file.write_blob(&large_data)?;
|
||||
let result = file.block_cursor().read_blob(pos_large)?;
|
||||
let result = file.block_cursor().read_blob(pos_large).await?;
|
||||
assert_eq!(result, large_data);
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -121,7 +121,7 @@ impl BatchedUpdates<'_> {
///
/// This should be called when the corresponding file on disk has been deleted.
///
pub fn remove_historic(&mut self, layer_desc: PersistentLayerDesc) {
pub fn remove_historic(&mut self, layer_desc: &PersistentLayerDesc) {
self.layer_map.remove_historic_noflush(layer_desc)
}

@@ -253,11 +253,11 @@ impl LayerMap {
///
/// Helper function for BatchedUpdates::remove_historic
///
pub fn remove_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
pub fn remove_historic_noflush(&mut self, layer_desc: &PersistentLayerDesc) {
self.historic
.remove(historic_layer_coverage::LayerKey::from(&layer_desc));
.remove(historic_layer_coverage::LayerKey::from(layer_desc));
let layer_key = layer_desc.key();
if Self::is_l0(&layer_desc) {
if Self::is_l0(layer_desc) {
let len_before = self.l0_delta_layers.len();
let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
l0_delta_layers.retain(|other| other.key() != layer_key);
@@ -766,8 +766,7 @@ mod tests {
expected_in_counts
);

map.batch_update()
.remove_historic(downloaded.layer_desc().clone());
map.batch_update().remove_historic(downloaded.layer_desc());
assert_eq!(count_layer_in(&map, downloaded.layer_desc()), (0, 0));
}

@@ -20,17 +20,19 @@ use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::task_mgr::{self, TaskKind};
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::delete::DeleteTenantFlow;
|
||||
use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState};
|
||||
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};
|
||||
|
||||
use utils::fs_ext::PathExt;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::delete::DeleteTenantError;
|
||||
use super::timeline::delete::DeleteTimelineFlow;
|
||||
|
||||
/// The tenants known to the pageserver.
|
||||
/// The enum variants are used to distinguish the different states that the pageserver can be in.
|
||||
enum TenantsMap {
|
||||
pub(crate) enum TenantsMap {
|
||||
/// [`init_tenant_mgr`] is not done yet.
|
||||
Initializing,
|
||||
/// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded.
|
||||
@@ -42,13 +44,13 @@ enum TenantsMap {
|
||||
}
|
||||
|
||||
impl TenantsMap {
|
||||
fn get(&self, tenant_id: &TenantId) -> Option<&Arc<Tenant>> {
|
||||
pub(crate) fn get(&self, tenant_id: &TenantId) -> Option<&Arc<Tenant>> {
|
||||
match self {
|
||||
TenantsMap::Initializing => None,
|
||||
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.get(tenant_id),
|
||||
}
|
||||
}
|
||||
fn remove(&mut self, tenant_id: &TenantId) -> Option<Arc<Tenant>> {
|
||||
pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> Option<Arc<Tenant>> {
|
||||
match self {
|
||||
TenantsMap::Initializing => None,
|
||||
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.remove(tenant_id),
|
||||
@@ -97,7 +99,9 @@ pub async fn init_tenant_mgr(
);
}
} else {
// This case happens if we crash during attach before creating the attach marker file
// This case happens if we:
// * crash during attach before creating the attach marker file
// * crash during tenant delete before removing tenant directory
let is_empty = tenant_dir_path.is_empty_dir().with_context(|| {
format!("Failed to check whether {tenant_dir_path:?} is an empty dir")
})?;
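Note on the check above: is_empty_dir comes from the utils::fs_ext::PathExt extension trait imported earlier in this file; its body is not part of this diff. A minimal sketch of what such a helper could look like (the free-function form and error handling here are assumptions, not the pageserver's actual implementation):

    use std::io;
    use std::path::Path;

    /// Hypothetical stand-in for `PathExt::is_empty_dir`: true if `path` is a
    /// directory containing no entries at all.
    fn is_empty_dir(path: &Path) -> io::Result<bool> {
        let mut entries = std::fs::read_dir(path)?;
        Ok(entries.next().is_none())
    }
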
@@ -124,6 +128,7 @@ pub async fn init_tenant_mgr(
|
||||
broker_client.clone(),
|
||||
remote_storage.clone(),
|
||||
Some(init_order.clone()),
|
||||
&TENANTS,
|
||||
&ctx,
|
||||
) {
|
||||
Ok(tenant) => {
|
||||
@@ -154,12 +159,13 @@ pub async fn init_tenant_mgr(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn schedule_local_tenant_processing(
|
||||
pub(crate) fn schedule_local_tenant_processing(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_path: &Path,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
init_order: Option<InitializationOrder>,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Tenant>> {
|
||||
anyhow::ensure!(
|
||||
@@ -219,6 +225,7 @@ pub fn schedule_local_tenant_processing(
|
||||
broker_client,
|
||||
remote_storage,
|
||||
init_order,
|
||||
tenants,
|
||||
ctx,
|
||||
)
|
||||
};
|
||||
@@ -266,71 +273,77 @@ async fn shutdown_all_tenants0(tenants: &tokio::sync::RwLock<TenantsMap>) {
|
||||
}
|
||||
};
|
||||
|
||||
let started_at = std::time::Instant::now();
|
||||
let mut join_set = JoinSet::new();
|
||||
for (tenant_id, tenant) in tenants_to_shut_down {
|
||||
join_set.spawn(
|
||||
async move {
|
||||
// ordering shouldn't matter for this, either we store true right away or never
|
||||
let ordering = std::sync::atomic::Ordering::Relaxed;
|
||||
let joined_other = std::sync::atomic::AtomicBool::new(false);
|
||||
let freeze_and_flush = true;
|
||||
|
||||
let mut shutdown = std::pin::pin!(async {
|
||||
let freeze_and_flush = true;
|
||||
|
||||
let res = {
|
||||
let (_guard, shutdown_progress) = completion::channel();
|
||||
tenant.shutdown(shutdown_progress, freeze_and_flush).await
|
||||
};
|
||||
|
||||
if let Err(other_progress) = res {
|
||||
// join the another shutdown in progress
|
||||
joined_other.store(true, ordering);
|
||||
other_progress.wait().await;
|
||||
}
|
||||
});
|
||||
|
||||
// in practice we might not have a lot time to go, since systemd is going to
|
||||
// SIGKILL us at 10s, but we can try. delete tenant might take a while, so put out
|
||||
// a warning.
|
||||
let warning = std::time::Duration::from_secs(5);
|
||||
let mut warning = std::pin::pin!(tokio::time::sleep(warning));
|
||||
|
||||
tokio::select! {
|
||||
_ = &mut shutdown => {},
|
||||
_ = &mut warning => {
|
||||
let joined_other = joined_other.load(ordering);
|
||||
warn!(%joined_other, "waiting for the shutdown to complete");
|
||||
shutdown.await;
|
||||
}
|
||||
let res = {
|
||||
let (_guard, shutdown_progress) = completion::channel();
|
||||
tenant.shutdown(shutdown_progress, freeze_and_flush).await
|
||||
};
|
||||
|
||||
if let Err(other_progress) = res {
|
||||
// join the another shutdown in progress
|
||||
other_progress.wait().await;
|
||||
}
|
||||
|
||||
// we cannot afford per tenant logging here, because if s3 is degraded, we are
|
||||
// going to log too many lines
|
||||
|
||||
debug!("tenant successfully stopped");
|
||||
}
|
||||
.instrument(info_span!("shutdown", %tenant_id)),
|
||||
);
|
||||
}
|
||||
|
||||
let total = join_set.len();
let mut panicked = 0;
let mut buffering = true;
const BUFFER_FOR: std::time::Duration = std::time::Duration::from_millis(500);
let mut buffered = std::pin::pin!(tokio::time::sleep(BUFFER_FOR));

while let Some(res) = join_set.join_next().await {
match res {
Ok(()) => {}
Err(join_error) if join_error.is_cancelled() => {
unreachable!("we are not cancelling any of the futures");
}
Err(join_error) if join_error.is_panic() => {
// cannot really do anything, as this panic is likely a bug
panicked += 1;
}
Err(join_error) => {
warn!("unknown kind of JoinError: {join_error}");
while !join_set.is_empty() {
tokio::select! {
Some(joined) = join_set.join_next() => {
match joined {
Ok(()) => {}
Err(join_error) if join_error.is_cancelled() => {
unreachable!("we are not cancelling any of the futures");
}
Err(join_error) if join_error.is_panic() => {
// cannot really do anything, as this panic is likely a bug
panicked += 1;
}
Err(join_error) => {
warn!("unknown kind of JoinError: {join_error}");
}
}
if !buffering {
// buffer so that every 500ms since the first update (or starting) we'll log
// how far away we are; this is because we will get SIGKILL'd at 10s, and we
// are not able to log *then*.
buffering = true;
buffered.as_mut().reset(tokio::time::Instant::now() + BUFFER_FOR);
}
},
_ = &mut buffered, if buffering => {
buffering = false;
info!(remaining = join_set.len(), total, elapsed_ms = started_at.elapsed().as_millis(), "waiting for tenants to shutdown");
}
}
}

if panicked > 0 {
warn!(panicked, "observed panicks while shutting down tenants");
warn!(
panicked,
total, "observed panicks while shutting down tenants"
);
}

// caller will log how long we took
}

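Note: the rewritten loop above is a reusable pattern: drain a JoinSet while emitting a progress line at most every 500ms, so that something useful lands in the logs if the process is SIGKILLed before the drain completes. A condensed sketch of the same pattern outside the tenant-shutdown context (logging macros replaced with eprintln!; everything else mirrors the code above):

    use std::time::Duration;
    use tokio::task::JoinSet;

    /// Wait for every task in `join_set`, logging progress at most every 500ms.
    async fn drain_with_progress(mut join_set: JoinSet<()>) {
        const BUFFER_FOR: Duration = Duration::from_millis(500);
        let total = join_set.len();
        // While `buffering` is true the progress timer is armed; it is re-armed
        // after each progress line only once another task has finished.
        let mut buffering = true;
        let mut buffered = std::pin::pin!(tokio::time::sleep(BUFFER_FOR));

        while !join_set.is_empty() {
            tokio::select! {
                Some(res) = join_set.join_next() => {
                    if let Err(join_error) = res {
                        eprintln!("task failed: {join_error}");
                    }
                    if !buffering {
                        // next progress line comes 500ms from now at the earliest
                        buffering = true;
                        buffered.as_mut().reset(tokio::time::Instant::now() + BUFFER_FOR);
                    }
                }
                _ = &mut buffered, if buffering => {
                    buffering = false;
                    eprintln!("still waiting: {} of {total} tasks remaining", join_set.len());
                }
            }
        }
    }
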
pub async fn create_tenant(
|
||||
@@ -350,7 +363,7 @@ pub async fn create_tenant(
|
||||
// See https://github.com/neondatabase/neon/issues/4233
|
||||
|
||||
let created_tenant =
|
||||
schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, ctx)?;
|
||||
schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, &TENANTS, ctx)?;
|
||||
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
|
||||
// See https://github.com/neondatabase/neon/issues/4233
|
||||
|
||||
@@ -411,6 +424,14 @@ pub async fn get_tenant(
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn delete_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
DeleteTenantFlow::run(conf, remote_storage, &TENANTS, tenant_id).await
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum DeleteTimelineError {
|
||||
#[error("Tenant {0}")]
|
||||
@@ -426,7 +447,7 @@ pub async fn delete_timeline(
|
||||
_ctx: &RequestContext,
|
||||
) -> Result<(), DeleteTimelineError> {
|
||||
let tenant = get_tenant(tenant_id, true).await?;
|
||||
DeleteTimelineFlow::run(&tenant, timeline_id).await?;
|
||||
DeleteTimelineFlow::run(&tenant, timeline_id, false).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -501,7 +522,7 @@ pub async fn load_tenant(
|
||||
.with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?;
|
||||
}
|
||||
|
||||
let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, None, ctx)
|
||||
let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, None, &TENANTS, ctx)
|
||||
.with_context(|| {
|
||||
format!("Failed to schedule tenant processing in path {tenant_path:?}")
|
||||
})?;
|
||||
@@ -582,7 +603,7 @@ pub async fn attach_tenant(
|
||||
.context("check for attach marker file existence")?;
|
||||
anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");
|
||||
|
||||
let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, ctx)?;
|
||||
let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, &TENANTS, ctx)?;
|
||||
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
|
||||
// See https://github.com/neondatabase/neon/issues/4233
|
||||
|
||||
|
||||
@@ -211,6 +211,9 @@ use chrono::{NaiveDateTime, Utc};
|
||||
// re-export these
|
||||
pub use download::{is_temp_download_file, list_remote_timelines};
|
||||
use scopeguard::ScopeGuard;
|
||||
use utils::backoff::{
|
||||
self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
};
|
||||
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::path::Path;
|
||||
@@ -241,7 +244,6 @@ use crate::{
|
||||
tenant::upload_queue::{
|
||||
UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
|
||||
},
|
||||
{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS},
|
||||
};
|
||||
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -256,12 +258,12 @@ use super::upload_queue::SetDeletedFlagProgress;
|
||||
// But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN
|
||||
// level instead, as repeated failures can mean a more serious problem. If it
|
||||
// fails more than FAILED_DOWNLOAD_RETRIES times, we give up
|
||||
const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3;
|
||||
const FAILED_DOWNLOAD_RETRIES: u32 = 10;
|
||||
pub(crate) const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3;
|
||||
pub(crate) const FAILED_REMOTE_OP_RETRIES: u32 = 10;
|
||||
|
||||
// Similarly log failed uploads and deletions at WARN level, after this many
|
||||
// retries. Uploads and deletions are retried forever, though.
|
||||
const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
|
||||
pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
|
||||
|
||||
pub enum MaybeDeletedIndexPart {
|
||||
IndexPart(IndexPart),
|
||||
@@ -752,12 +754,24 @@ impl RemoteTimelineClient {

pausable_failpoint!("persist_deleted_index_part");

upload::upload_index_part(
self.conf,
&self.storage_impl,
&self.tenant_id,
&self.timeline_id,
&index_part_with_deleted_at,
backoff::retry(
|| async {
upload::upload_index_part(
self.conf,
&self.storage_impl,
&self.tenant_id,
&self.timeline_id,
&index_part_with_deleted_at,
)
.await
},
|_e| false,
1,
// have just a couple of attempts
// when executed as part of timeline deletion this happens in context of api call
// when executed as part of tenant deletion this happens in the background
2,
"persist_index_part_with_deleted_flag",
)
.await?;

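Note: utils::backoff::retry as used throughout this change takes five arguments: the async operation, a predicate that marks an error as permanent (never retried), the attempt count after which failures are logged at WARN, the maximum number of attempts, and a description for logging. It resolves to the operation's own Result. A rough usage sketch against that signature (the surrounding wrapper function is a placeholder, not code from this change):

    use anyhow::Context;
    use remote_storage::{GenericRemoteStorage, RemotePath};
    use utils::backoff;

    /// Placeholder wrapper: delete one remote object, retrying transient failures.
    async fn delete_with_retries(
        storage: &GenericRemoteStorage,
        path: &RemotePath,
    ) -> anyhow::Result<()> {
        backoff::retry(
            || async { storage.delete(path).await },
            |_e| false, // treat every error as transient
            3,          // WARN after this many failed attempts
            10,         // give up after this many attempts
            "delete remote object",
        )
        .await
        .context("delete remote object")
    }
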
@@ -834,10 +848,19 @@ impl RemoteTimelineClient {
|
||||
let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id);
|
||||
let timeline_storage_path = self.conf.remote_path(&timeline_path)?;
|
||||
|
||||
let remaining = self
|
||||
.storage_impl
|
||||
.list_prefixes(Some(&timeline_storage_path))
|
||||
.await?;
|
||||
let remaining = backoff::retry(
|
||||
|| async {
|
||||
self.storage_impl
|
||||
.list_prefixes(Some(&timeline_storage_path))
|
||||
.await
|
||||
},
|
||||
|_e| false,
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"list_prefixes",
|
||||
)
|
||||
.await
|
||||
.context("list prefixes")?;
|
||||
|
||||
let remaining: Vec<RemotePath> = remaining
|
||||
.into_iter()
|
||||
@@ -852,7 +875,15 @@ impl RemoteTimelineClient {
|
||||
.collect();
|
||||
|
||||
if !remaining.is_empty() {
|
||||
self.storage_impl.delete_objects(&remaining).await?;
|
||||
backoff::retry(
|
||||
|| async { self.storage_impl.delete_objects(&remaining).await },
|
||||
|_e| false,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"delete_objects",
|
||||
)
|
||||
.await
|
||||
.context("delete_objects")?;
|
||||
}
|
||||
|
||||
fail::fail_point!("timeline-delete-before-index-delete", |_| {
|
||||
@@ -864,7 +895,16 @@ impl RemoteTimelineClient {
|
||||
let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
|
||||
|
||||
debug!("deleting index part");
|
||||
self.storage_impl.delete(&index_file_path).await?;
|
||||
|
||||
backoff::retry(
|
||||
|| async { self.storage_impl.delete(&index_file_path).await },
|
||||
|_e| false,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"delete_index",
|
||||
)
|
||||
.await
|
||||
.context("delete_index")?;
|
||||
|
||||
fail::fail_point!("timeline-delete-after-index-delete", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
|
||||
@@ -11,23 +11,17 @@ use std::time::Duration;
|
||||
use anyhow::{anyhow, Context};
|
||||
use tokio::fs;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
|
||||
use tracing::{info, warn};
|
||||
use utils::{backoff, crashsafe};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::index::{IndexPart, LayerFileMetadata};
|
||||
use super::{FAILED_DOWNLOAD_RETRIES, FAILED_DOWNLOAD_WARN_THRESHOLD};
|
||||
|
||||
async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
|
||||
fs::File::open(path).await?.sync_all().await
|
||||
}
|
||||
use super::{FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES};
|
||||
|
||||
static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);
|
||||
|
||||
@@ -152,7 +146,7 @@ pub async fn download_layer_file<'a>(
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
fsync_path(&local_path)
|
||||
crashsafe::fsync_async(&local_path)
|
||||
.await
|
||||
.with_context(|| format!("Could not fsync layer file {}", local_path.display(),))
|
||||
.map_err(DownloadError::Other)?;
|
||||
@@ -268,7 +262,6 @@ pub(super) async fn download_index_part(
|
||||
Ok(index_part)
|
||||
}
|
||||
|
||||
///
|
||||
/// Helper function to handle retries for a download operation.
|
||||
///
|
||||
/// Remote operations can fail due to rate limits (IAM, S3), spurious network
|
||||
@@ -276,47 +269,17 @@ pub(super) async fn download_index_part(
|
||||
/// with backoff.
|
||||
///
|
||||
/// (See similar logic for uploads in `perform_upload_task`)
|
||||
async fn download_retry<T, O, F>(mut op: O, description: &str) -> Result<T, DownloadError>
|
||||
async fn download_retry<T, O, F>(op: O, description: &str) -> Result<T, DownloadError>
|
||||
where
|
||||
O: FnMut() -> F,
|
||||
F: Future<Output = Result<T, DownloadError>>,
|
||||
{
|
||||
let mut attempts = 0;
|
||||
loop {
|
||||
let result = op().await;
|
||||
match result {
|
||||
Ok(_) => {
|
||||
if attempts > 0 {
|
||||
info!("{description} succeeded after {attempts} retries");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// These are "permanent" errors that should not be retried.
|
||||
Err(DownloadError::BadInput(_)) | Err(DownloadError::NotFound) => {
|
||||
return result;
|
||||
}
|
||||
// Assume that any other failure might be transient, and the operation might
|
||||
// succeed if we just keep trying.
|
||||
Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_WARN_THRESHOLD => {
|
||||
info!("{description} failed, will retry (attempt {attempts}): {err:#}");
|
||||
}
|
||||
Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_RETRIES => {
|
||||
warn!("{description} failed, will retry (attempt {attempts}): {err:#}");
|
||||
}
|
||||
Err(DownloadError::Other(ref err)) => {
|
||||
// Operation failed FAILED_DOWNLOAD_RETRIES times. Time to give up.
|
||||
warn!("{description} still failed after {attempts} retries, giving up: {err:?}");
|
||||
return result;
|
||||
}
|
||||
}
|
||||
// sleep and retry
|
||||
exponential_backoff(
|
||||
attempts,
|
||||
DEFAULT_BASE_BACKOFF_SECONDS,
|
||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
)
|
||||
.await;
|
||||
attempts += 1;
|
||||
}
|
||||
backoff::retry(
|
||||
op,
|
||||
|e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound),
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
description,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -8,8 +8,8 @@ mod layer_desc;
|
||||
mod remote_layer;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::context::{AccessStatsBehavior, RequestContext};
|
||||
use crate::repository::Key;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::Result;
|
||||
@@ -34,7 +34,7 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
pub use delta_layer::{DeltaLayer, DeltaLayerWriter};
|
||||
pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
|
||||
pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
|
||||
pub use image_layer::{ImageLayer, ImageLayerWriter};
|
||||
pub use inmemory_layer::InMemoryLayer;
|
||||
@@ -241,10 +241,14 @@ impl LayerAccessStats {
|
||||
});
|
||||
}
|
||||
|
||||
fn record_access(&self, access_kind: LayerAccessKind, task_kind: TaskKind) {
|
||||
fn record_access(&self, access_kind: LayerAccessKind, ctx: &RequestContext) {
|
||||
if ctx.access_stats_behavior() == AccessStatsBehavior::Skip {
|
||||
return;
|
||||
}
|
||||
|
||||
let this_access = LayerAccessStatFullDetails {
|
||||
when: SystemTime::now(),
|
||||
task_kind,
|
||||
task_kind: ctx.task_kind(),
|
||||
access_kind,
|
||||
};
|
||||
|
||||
@@ -252,7 +256,7 @@ impl LayerAccessStats {
|
||||
locked.iter_mut().for_each(|inner| {
|
||||
inner.first_access.get_or_insert(this_access);
|
||||
inner.count_by_access_kind[access_kind] += 1;
|
||||
inner.task_kind_flag |= task_kind;
|
||||
inner.task_kind_flag |= ctx.task_kind();
|
||||
inner.last_accesses.write(this_access);
|
||||
})
|
||||
}
|
||||
@@ -381,12 +385,6 @@ pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
|
||||
async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
|
||||
}
|
||||
|
||||
/// Returned by [`PersistentLayer::iter`]
|
||||
pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i + Send>;
|
||||
|
||||
/// Returned by [`PersistentLayer::key_iter`]
|
||||
pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i + Send>;
|
||||
|
||||
/// Get a layer descriptor from a layer.
|
||||
pub trait AsLayerDesc {
|
||||
/// Get the layer descriptor.
|
||||
@@ -407,16 +405,6 @@ pub trait AsLayerDesc {
|
||||
/// An image layer is a snapshot of all the data in a key-range, at a single
|
||||
/// LSN.
|
||||
pub trait PersistentLayer: Layer + AsLayerDesc {
|
||||
/// Identify the tenant this layer belongs to
|
||||
fn get_tenant_id(&self) -> TenantId {
|
||||
self.layer_desc().tenant_id
|
||||
}
|
||||
|
||||
/// Identify the timeline this layer belongs to
|
||||
fn get_timeline_id(&self) -> TimelineId {
|
||||
self.layer_desc().timeline_id
|
||||
}
|
||||
|
||||
/// File name used for this layer, both in the pageserver's local filesystem
|
||||
/// state as well as in the remote storage.
|
||||
fn filename(&self) -> LayerFileName {
|
||||
@@ -427,15 +415,6 @@ pub trait PersistentLayer: Layer + AsLayerDesc {
|
||||
// `None` for `RemoteLayer`.
|
||||
fn local_path(&self) -> Option<PathBuf>;
|
||||
|
||||
/// Iterate through all keys and values stored in the layer
|
||||
fn iter(&self, ctx: &RequestContext) -> Result<LayerIter<'_>>;
|
||||
|
||||
/// Iterate through all keys stored in the layer. Returns key, lsn and value size
|
||||
/// It is used only for compaction and so is currently implemented only for DeltaLayer
|
||||
fn key_iter(&self, _ctx: &RequestContext) -> Result<LayerKeyIter<'_>> {
|
||||
panic!("Not implemented")
|
||||
}
|
||||
|
||||
/// Permanently remove this layer from disk.
|
||||
fn delete_resident_layer_file(&self) -> Result<()>;
|
||||
|
||||
@@ -451,14 +430,6 @@ pub trait PersistentLayer: Layer + AsLayerDesc {
|
||||
false
|
||||
}
|
||||
|
||||
/// Returns None if the layer file size is not known.
|
||||
///
|
||||
/// Should not change over the lifetime of the layer object because
|
||||
/// current_physical_size is computed as the som of this value.
|
||||
fn file_size(&self) -> u64 {
|
||||
self.layer_desc().file_size
|
||||
}
|
||||
|
||||
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo;
|
||||
|
||||
fn access_stats(&self) -> &LayerAccessStats;
|
||||
|
||||
@@ -29,10 +29,10 @@
|
||||
//!
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache::{PageReadGuard, PAGE_SZ};
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::repository::{Key, Value, KEY_SIZE};
|
||||
use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader};
|
||||
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::storage_layer::{
|
||||
PersistentLayer, ValueReconstructResult, ValueReconstructState,
|
||||
@@ -41,7 +41,6 @@ use crate::virtual_file::VirtualFile;
|
||||
use crate::{walrecord, TEMP_FILE_SUFFIX};
|
||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use once_cell::sync::OnceCell;
|
||||
use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -52,6 +51,8 @@ use std::ops::Range;
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Arc;
|
||||
use tokio::runtime::Handle;
|
||||
use tokio::sync::OnceCell;
|
||||
use tracing::*;
|
||||
|
||||
use utils::{
|
||||
@@ -61,8 +62,8 @@ use utils::{
|
||||
};
|
||||
|
||||
use super::{
|
||||
AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerIter,
|
||||
LayerKeyIter, PathOrConf, PersistentLayerDesc,
|
||||
AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, PathOrConf,
|
||||
PersistentLayerDesc,
|
||||
};
|
||||
|
||||
///
|
||||
@@ -90,14 +91,30 @@ pub struct Summary {
|
||||
|
||||
impl From<&DeltaLayer> for Summary {
|
||||
fn from(layer: &DeltaLayer) -> Self {
|
||||
Self::expected(
|
||||
layer.desc.tenant_id,
|
||||
layer.desc.timeline_id,
|
||||
layer.desc.key_range.clone(),
|
||||
layer.desc.lsn_range.clone(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl Summary {
|
||||
pub(super) fn expected(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
keys: Range<Key>,
|
||||
lsns: Range<Lsn>,
|
||||
) -> Self {
|
||||
Self {
|
||||
magic: DELTA_FILE_MAGIC,
|
||||
format_version: STORAGE_FORMAT_VERSION,
|
||||
|
||||
tenant_id: layer.desc.tenant_id,
|
||||
timeline_id: layer.desc.timeline_id,
|
||||
key_range: layer.desc.key_range.clone(),
|
||||
lsn_range: layer.desc.lsn_range.clone(),
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
key_range: keys,
|
||||
lsn_range: lsns,
|
||||
|
||||
index_start_blk: 0,
|
||||
index_root_blk: 0,
|
||||
@@ -108,12 +125,10 @@ impl From<&DeltaLayer> for Summary {
// Flag indicating that this version initialize the page
const WILL_INIT: u64 = 1;

///
/// Struct representing reference to BLOB in layers. Reference contains BLOB
/// offset, and for WAL records it also contains `will_init` flag. The flag
/// helps to determine the range of records that needs to be applied, without
/// reading/deserializing records themselves.
///
#[derive(Debug, Serialize, Deserialize, Copy, Clone)]
pub struct BlobRef(pub u64);

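Note: per the doc comment above, the single u64 inside BlobRef carries both the blob offset and (for WAL records) the will_init flag; the actual bit layout lives in the impl BlobRef block, of which this hunk only shows the header. A hedged sketch of one way such packing can work (the low-bit layout below is an assumption for illustration, not necessarily the pageserver's exact encoding):

    /// Illustrative packing: lowest bit = will_init, remaining 63 bits = offset.
    #[derive(Debug, Copy, Clone)]
    pub struct PackedBlobRef(pub u64);

    impl PackedBlobRef {
        pub fn new(pos: u64, will_init: bool) -> Self {
            let mut raw = pos << 1;
            if will_init {
                raw |= 1;
            }
            PackedBlobRef(raw)
        }

        pub fn will_init(&self) -> bool {
            self.0 & 1 != 0
        }

        pub fn pos(&self) -> u64 {
            self.0 >> 1
        }
    }
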
@@ -138,10 +153,8 @@ impl BlobRef {
|
||||
pub const DELTA_KEY_SIZE: usize = KEY_SIZE + 8;
|
||||
struct DeltaKey([u8; DELTA_KEY_SIZE]);
|
||||
|
||||
///
|
||||
/// This is the key of the B-tree index stored in the delta layer. It consists
|
||||
/// of the serialized representation of a Key and LSN.
|
||||
///
|
||||
impl DeltaKey {
|
||||
fn from_slice(buf: &[u8]) -> Self {
|
||||
let mut bytes: [u8; DELTA_KEY_SIZE] = [0u8; DELTA_KEY_SIZE];
|
||||
@@ -189,7 +202,7 @@ pub struct DeltaLayer {
|
||||
|
||||
access_stats: LayerAccessStats,
|
||||
|
||||
inner: OnceCell<DeltaLayerInner>,
|
||||
inner: OnceCell<Arc<DeltaLayerInner>>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for DeltaLayer {
|
||||
@@ -214,6 +227,12 @@ pub struct DeltaLayerInner {
|
||||
file: FileBlockReader<VirtualFile>,
|
||||
}
|
||||
|
||||
impl AsRef<DeltaLayerInner> for DeltaLayerInner {
|
||||
fn as_ref(&self) -> &DeltaLayerInner {
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for DeltaLayerInner {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("DeltaLayerInner")
|
||||
@@ -242,7 +261,7 @@ impl Layer for DeltaLayer {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let inner = self.load(LayerAccessKind::Dump, ctx)?;
|
||||
let inner = self.load(LayerAccessKind::Dump, ctx).await?;
|
||||
|
||||
println!(
|
||||
"index_start_blk: {}, root {}",
|
||||
@@ -258,11 +277,12 @@ impl Layer for DeltaLayer {
|
||||
|
||||
tree_reader.dump().await?;
|
||||
|
||||
let mut cursor = file.block_cursor();
|
||||
let cursor = file.block_cursor();
|
||||
|
||||
// A subroutine to dump a single blob
|
||||
let mut dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
|
||||
let buf = cursor.read_blob(blob_ref.pos())?;
|
||||
let dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
|
||||
// TODO this is not ideal, but on the other hand we are in dumping code...
|
||||
let buf = Handle::current().block_on(cursor.read_blob(blob_ref.pos()))?;
|
||||
let val = Value::des(&buf)?;
|
||||
let desc = match val {
|
||||
Value::Image(img) => {
|
||||
@@ -281,22 +301,24 @@ impl Layer for DeltaLayer {
|
||||
Ok(desc)
|
||||
};
|
||||
|
||||
tree_reader.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
|delta_key, val| {
|
||||
let blob_ref = BlobRef(val);
|
||||
let key = DeltaKey::extract_key_from_buf(delta_key);
|
||||
let lsn = DeltaKey::extract_lsn_from_buf(delta_key);
|
||||
tree_reader
|
||||
.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
|delta_key, val| {
|
||||
let blob_ref = BlobRef(val);
|
||||
let key = DeltaKey::extract_key_from_buf(delta_key);
|
||||
let lsn = DeltaKey::extract_lsn_from_buf(delta_key);
|
||||
|
||||
let desc = match dump_blob(blob_ref) {
|
||||
Ok(desc) => desc,
|
||||
Err(err) => format!("ERROR: {}", err),
|
||||
};
|
||||
println!(" key {} at {}: {}", key, lsn, desc);
|
||||
true
|
||||
},
|
||||
)?;
|
||||
let desc = match dump_blob(blob_ref) {
|
||||
Ok(desc) => desc,
|
||||
Err(err) => format!("ERROR: {}", err),
|
||||
};
|
||||
println!(" key {} at {}: {}", key, lsn, desc);
|
||||
true
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -309,82 +331,15 @@ impl Layer for DeltaLayer {
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
ensure!(lsn_range.start >= self.desc.lsn_range.start);
|
||||
let mut need_image = true;
|
||||
|
||||
ensure!(self.desc.key_range.contains(&key));
|
||||
|
||||
{
|
||||
// Open the file and lock the metadata in memory
|
||||
let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
|
||||
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
let file = &inner.file;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
inner.index_start_blk,
|
||||
inner.index_root_blk,
|
||||
file,
|
||||
);
|
||||
let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));
|
||||
|
||||
let mut offsets: Vec<(Lsn, u64)> = Vec::new();
|
||||
|
||||
tree_reader.visit(&search_key.0, VisitDirection::Backwards, |key, value| {
|
||||
let blob_ref = BlobRef(value);
|
||||
if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
|
||||
return false;
|
||||
}
|
||||
let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
|
||||
if entry_lsn < lsn_range.start {
|
||||
return false;
|
||||
}
|
||||
offsets.push((entry_lsn, blob_ref.pos()));
|
||||
|
||||
!blob_ref.will_init()
|
||||
})?;
|
||||
|
||||
// Ok, 'offsets' now contains the offsets of all the entries we need to read
|
||||
let mut cursor = file.block_cursor();
|
||||
let mut buf = Vec::new();
|
||||
for (entry_lsn, pos) in offsets {
|
||||
cursor.read_blob_into_buf(pos, &mut buf).with_context(|| {
|
||||
format!(
|
||||
"Failed to read blob from virtual file {}",
|
||||
file.file.path.display()
|
||||
)
|
||||
})?;
|
||||
let val = Value::des(&buf).with_context(|| {
|
||||
format!(
|
||||
"Failed to deserialize file blob from virtual file {}",
|
||||
file.file.path.display()
|
||||
)
|
||||
})?;
|
||||
match val {
|
||||
Value::Image(img) => {
|
||||
reconstruct_state.img = Some((entry_lsn, img));
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let will_init = rec.will_init();
|
||||
reconstruct_state.records.push((entry_lsn, rec));
|
||||
if will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// release metadata lock and close the file
|
||||
}
|
||||
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know.
|
||||
if need_image {
|
||||
Ok(ValueReconstructResult::Continue)
|
||||
} else {
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
}
|
||||
let inner = self
|
||||
.load(LayerAccessKind::GetValueReconstructData, ctx)
|
||||
.await?;
|
||||
inner
|
||||
.get_value_reconstruct_data(key, lsn_range, reconstruct_state)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
@@ -424,23 +379,6 @@ impl PersistentLayer for DeltaLayer {
|
||||
Some(self.path())
|
||||
}
|
||||
|
||||
fn iter(&self, ctx: &RequestContext) -> Result<LayerIter<'_>> {
|
||||
let inner = self
|
||||
.load(LayerAccessKind::KeyIter, ctx)
|
||||
.context("load delta layer")?;
|
||||
Ok(match DeltaValueIter::new(inner) {
|
||||
Ok(iter) => Box::new(iter),
|
||||
Err(err) => Box::new(std::iter::once(Err(err))),
|
||||
})
|
||||
}
|
||||
|
||||
fn key_iter(&self, ctx: &RequestContext) -> Result<LayerKeyIter<'_>> {
|
||||
let inner = self.load(LayerAccessKind::KeyIter, ctx)?;
|
||||
Ok(Box::new(
|
||||
DeltaKeyIter::new(inner).context("Layer index is corrupted")?,
|
||||
))
|
||||
}
|
||||
|
||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
@@ -510,55 +448,43 @@ impl DeltaLayer {
/// Open the underlying file and read the metadata into memory, if it's
/// not loaded already.
///
fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&DeltaLayerInner> {
self.access_stats
.record_access(access_kind, ctx.task_kind());
async fn load(
&self,
access_kind: LayerAccessKind,
ctx: &RequestContext,
) -> Result<&Arc<DeltaLayerInner>> {
self.access_stats.record_access(access_kind, ctx);
// Quick exit if already loaded
self.inner
.get_or_try_init(|| self.load_inner())
.await
.with_context(|| format!("Failed to load delta layer {}", self.path().display()))
}

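Note: swapping once_cell::sync::OnceCell for tokio::sync::OnceCell is what allows load_inner to become async: get_or_try_init takes a closure returning a future and runs it at most once, with concurrent callers awaiting the same initialization. A standalone sketch of the pattern (the struct and the file read are placeholders):

    use tokio::sync::OnceCell;

    struct LazyLoaded {
        inner: OnceCell<String>,
    }

    impl LazyLoaded {
        fn new() -> Self {
            Self { inner: OnceCell::new() }
        }

        /// First caller performs the async initialization; later (and concurrent)
        /// callers get the cached value.
        async fn get(&self) -> anyhow::Result<&String> {
            self.inner
                .get_or_try_init(|| async {
                    tokio::fs::read_to_string("layer-file.txt")
                        .await
                        .map_err(anyhow::Error::new)
                })
                .await
        }
    }
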
fn load_inner(&self) -> Result<DeltaLayerInner> {
|
||||
async fn load_inner(&self) -> Result<Arc<DeltaLayerInner>> {
|
||||
let path = self.path();
|
||||
|
||||
let file = VirtualFile::open(&path)
|
||||
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
|
||||
let file = FileBlockReader::new(file);
|
||||
let summary = match &self.path_or_conf {
|
||||
PathOrConf::Conf(_) => Some(Summary::from(self)),
|
||||
PathOrConf::Path(_) => None,
|
||||
};
|
||||
|
||||
let summary_blk = file.read_blk(0)?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
||||
let loaded = DeltaLayerInner::load(&path, summary)?;
|
||||
|
||||
match &self.path_or_conf {
|
||||
PathOrConf::Conf(_) => {
|
||||
let mut expected_summary = Summary::from(self);
|
||||
expected_summary.index_start_blk = actual_summary.index_start_blk;
|
||||
expected_summary.index_root_blk = actual_summary.index_root_blk;
|
||||
if actual_summary != expected_summary {
|
||||
bail!("in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary);
|
||||
}
|
||||
}
|
||||
PathOrConf::Path(path) => {
|
||||
let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
|
||||
let expected_filename = self.filename().file_name();
|
||||
if let PathOrConf::Path(ref path) = self.path_or_conf {
|
||||
// not production code
|
||||
|
||||
if actual_filename != expected_filename {
|
||||
println!(
|
||||
"warning: filename does not match what is expected from in-file summary"
|
||||
);
|
||||
println!("actual: {:?}", actual_filename);
|
||||
println!("expected: {:?}", expected_filename);
|
||||
}
|
||||
let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
|
||||
let expected_filename = self.filename().file_name();
|
||||
|
||||
if actual_filename != expected_filename {
|
||||
println!("warning: filename does not match what is expected from in-file summary");
|
||||
println!("actual: {:?}", actual_filename);
|
||||
println!("expected: {:?}", expected_filename);
|
||||
}
|
||||
}
|
||||
|
||||
debug!("loaded from {}", &path.display());
|
||||
|
||||
Ok(DeltaLayerInner {
|
||||
file,
|
||||
index_start_blk: actual_summary.index_start_blk,
|
||||
index_root_blk: actual_summary.index_root_blk,
|
||||
})
|
||||
Ok(Arc::new(loaded))
|
||||
}
|
||||
|
||||
/// Create a DeltaLayer struct representing an existing file on disk.
|
||||
@@ -580,7 +506,7 @@ impl DeltaLayer {
|
||||
file_size,
|
||||
),
|
||||
access_stats,
|
||||
inner: once_cell::sync::OnceCell::new(),
|
||||
inner: OnceCell::new(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -607,7 +533,7 @@ impl DeltaLayer {
|
||||
metadata.len(),
|
||||
),
|
||||
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
|
||||
inner: once_cell::sync::OnceCell::new(),
|
||||
inner: OnceCell::new(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -623,6 +549,33 @@ impl DeltaLayer {
|
||||
&self.layer_name(),
|
||||
)
|
||||
}
|
||||
|
||||
/// Obtains all keys and value references stored in the layer
|
||||
///
|
||||
/// The value can be obtained via the [`ValueRef::load`] function.
|
||||
pub async fn load_val_refs(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Vec<(Key, Lsn, ValueRef<Arc<DeltaLayerInner>>)>> {
|
||||
let inner = self
|
||||
.load(LayerAccessKind::Iter, ctx)
|
||||
.await
|
||||
.context("load delta layer")?;
|
||||
DeltaLayerInner::load_val_refs(inner)
|
||||
.await
|
||||
.context("Layer index is corrupted")
|
||||
}
|
||||
|
||||
/// Loads all keys stored in the layer. Returns key, lsn and value size.
|
||||
pub async fn load_keys(&self, ctx: &RequestContext) -> Result<Vec<(Key, Lsn, u64)>> {
|
||||
let inner = self
|
||||
.load(LayerAccessKind::KeyIter, ctx)
|
||||
.await
|
||||
.context("load delta layer keys")?;
|
||||
DeltaLayerInner::load_keys(inner)
|
||||
.await
|
||||
.context("Layer index is corrupted")
|
||||
}
|
||||
}
|
||||
|
||||
/// A builder object for constructing a new delta layer.
|
||||
@@ -771,7 +724,7 @@ impl DeltaLayerWriterInner {
|
||||
metadata.len(),
|
||||
),
|
||||
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
|
||||
inner: once_cell::sync::OnceCell::new(),
|
||||
inner: OnceCell::new(),
|
||||
};
|
||||
|
||||
// fsync the file
|
||||
@@ -893,168 +846,202 @@ impl Drop for DeltaLayerWriter {
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Iterator over all key-value pairse stored in a delta layer
|
||||
///
|
||||
/// FIXME: This creates a Vector to hold the offsets of all key value pairs.
|
||||
/// That takes up quite a lot of memory. Should do this in a more streaming
|
||||
/// fashion.
|
||||
///
|
||||
struct DeltaValueIter<'a> {
|
||||
all_offsets: Vec<(DeltaKey, BlobRef)>,
|
||||
next_idx: usize,
|
||||
reader: BlockCursor<Adapter<'a>>,
|
||||
}
|
||||
impl DeltaLayerInner {
|
||||
pub(super) fn load(path: &std::path::Path, summary: Option<Summary>) -> anyhow::Result<Self> {
|
||||
let file = VirtualFile::open(path)
|
||||
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
|
||||
let file = FileBlockReader::new(file);
|
||||
|
||||
struct Adapter<'a>(&'a DeltaLayerInner);
|
||||
let summary_blk = file.read_blk(0)?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
||||
|
||||
impl<'a> BlockReader for Adapter<'a> {
|
||||
type BlockLease = PageReadGuard<'static>;
|
||||
if let Some(mut expected_summary) = summary {
|
||||
// production code path
|
||||
expected_summary.index_start_blk = actual_summary.index_start_blk;
|
||||
expected_summary.index_root_blk = actual_summary.index_root_blk;
|
||||
if actual_summary != expected_summary {
|
||||
bail!(
|
||||
"in-file summary does not match expected summary. actual = {:?} expected = {:?}",
|
||||
actual_summary,
|
||||
expected_summary
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
|
||||
self.0.file.read_blk(blknum)
|
||||
Ok(DeltaLayerInner {
|
||||
file,
|
||||
index_start_blk: actual_summary.index_start_blk,
|
||||
index_root_blk: actual_summary.index_root_blk,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for DeltaValueIter<'a> {
|
||||
type Item = Result<(Key, Lsn, Value)>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.next_res().transpose()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> DeltaValueIter<'a> {
|
||||
fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
|
||||
let file = &inner.file;
|
||||
pub(super) async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
let mut need_image = true;
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
let file = &self.file;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
inner.index_start_blk,
|
||||
inner.index_root_blk,
|
||||
self.index_start_blk,
|
||||
self.index_root_blk,
|
||||
file,
|
||||
);
|
||||
let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));
|
||||
|
||||
let mut all_offsets: Vec<(DeltaKey, BlobRef)> = Vec::new();
|
||||
tree_reader.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
|key, value| {
|
||||
all_offsets.push((DeltaKey::from_slice(key), BlobRef(value)));
|
||||
true
|
||||
},
|
||||
)?;
|
||||
let mut offsets: Vec<(Lsn, u64)> = Vec::new();
|
||||
|
||||
let iter = DeltaValueIter {
|
||||
all_offsets,
|
||||
next_idx: 0,
|
||||
reader: BlockCursor::new(Adapter(inner)),
|
||||
};
|
||||
tree_reader
|
||||
.visit(&search_key.0, VisitDirection::Backwards, |key, value| {
|
||||
let blob_ref = BlobRef(value);
|
||||
if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
|
||||
return false;
|
||||
}
|
||||
let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
|
||||
if entry_lsn < lsn_range.start {
|
||||
return false;
|
||||
}
|
||||
offsets.push((entry_lsn, blob_ref.pos()));
|
||||
|
||||
Ok(iter)
|
||||
}
|
||||
!blob_ref.will_init()
|
||||
})
|
||||
.await?;
|
||||
|
||||
fn next_res(&mut self) -> Result<Option<(Key, Lsn, Value)>> {
|
||||
if self.next_idx < self.all_offsets.len() {
|
||||
let (delta_key, blob_ref) = &self.all_offsets[self.next_idx];
|
||||
|
||||
let key = delta_key.key();
|
||||
let lsn = delta_key.lsn();
|
||||
|
||||
let buf = self.reader.read_blob(blob_ref.pos())?;
|
||||
let val = Value::des(&buf)?;
|
||||
self.next_idx += 1;
|
||||
Ok(Some((key, lsn, val)))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
}
|
||||
///
|
||||
/// Iterator over all keys stored in a delta layer
|
||||
///
|
||||
/// FIXME: This creates a Vector to hold all keys.
|
||||
/// That takes up quite a lot of memory. Should do this in a more streaming
|
||||
/// fashion.
|
||||
///
|
||||
struct DeltaKeyIter {
|
||||
all_keys: Vec<(DeltaKey, u64)>,
|
||||
next_idx: usize,
|
||||
}
|
||||
|
||||
impl Iterator for DeltaKeyIter {
|
||||
type Item = (Key, Lsn, u64);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
if self.next_idx < self.all_keys.len() {
|
||||
let (delta_key, size) = &self.all_keys[self.next_idx];
|
||||
|
||||
let key = delta_key.key();
|
||||
let lsn = delta_key.lsn();
|
||||
|
||||
self.next_idx += 1;
|
||||
Some((key, lsn, *size))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> DeltaKeyIter {
|
||||
fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
|
||||
let file = &inner.file;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
inner.index_start_blk,
|
||||
inner.index_root_blk,
|
||||
file,
|
||||
);
|
||||
|
||||
let mut all_keys: Vec<(DeltaKey, u64)> = Vec::new();
|
||||
tree_reader.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
|key, value| {
|
||||
let delta_key = DeltaKey::from_slice(key);
|
||||
let pos = BlobRef(value).pos();
|
||||
if let Some(last) = all_keys.last_mut() {
|
||||
if last.0.key() == delta_key.key() {
|
||||
return true;
|
||||
} else {
|
||||
// subtract offset of new key BLOB and first blob of this key
|
||||
// to get total size if values associated with this key
|
||||
let first_pos = last.1;
|
||||
last.1 = pos - first_pos;
|
||||
// Ok, 'offsets' now contains the offsets of all the entries we need to read
|
||||
let cursor = file.block_cursor();
|
||||
let mut buf = Vec::new();
|
||||
for (entry_lsn, pos) in offsets {
|
||||
cursor
|
||||
.read_blob_into_buf(pos, &mut buf)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to read blob from virtual file {}",
|
||||
file.file.path.display()
|
||||
)
|
||||
})?;
|
||||
let val = Value::des(&buf).with_context(|| {
|
||||
format!(
|
||||
"Failed to deserialize file blob from virtual file {}",
|
||||
file.file.path.display()
|
||||
)
|
||||
})?;
|
||||
match val {
|
||||
Value::Image(img) => {
|
||||
reconstruct_state.img = Some((entry_lsn, img));
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let will_init = rec.will_init();
|
||||
reconstruct_state.records.push((entry_lsn, rec));
|
||||
if will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
all_keys.push((delta_key, pos));
|
||||
true
|
||||
},
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know.
|
||||
if need_image {
|
||||
Ok(ValueReconstructResult::Continue)
|
||||
} else {
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) async fn load_val_refs<T: AsRef<DeltaLayerInner> + Clone>(
|
||||
this: &T,
|
||||
) -> Result<Vec<(Key, Lsn, ValueRef<T>)>> {
|
||||
let dl = this.as_ref();
|
||||
let file = &dl.file;
|
||||
let tree_reader =
|
||||
DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(dl.index_start_blk, dl.index_root_blk, file);
|
||||
|
||||
let mut all_offsets = Vec::<(Key, Lsn, ValueRef<T>)>::new();
|
||||
tree_reader
|
||||
.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
|key, value| {
|
||||
let delta_key = DeltaKey::from_slice(key);
|
||||
let val_ref = ValueRef {
|
||||
blob_ref: BlobRef(value),
|
||||
reader: BlockCursor::new(Adapter(this.clone())),
|
||||
};
|
||||
all_offsets.push((delta_key.key(), delta_key.lsn(), val_ref));
|
||||
true
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(all_offsets)
|
||||
}
|
||||
|
||||
pub(super) async fn load_keys(&self) -> Result<Vec<(Key, Lsn, u64)>> {
|
||||
let file = &self.file;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
self.index_start_blk,
|
||||
self.index_root_blk,
|
||||
file,
|
||||
);
|
||||
|
||||
let mut all_keys: Vec<(Key, Lsn, u64)> = Vec::new();
|
||||
tree_reader
|
||||
.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
|key, value| {
|
||||
let delta_key = DeltaKey::from_slice(key);
|
||||
let pos = BlobRef(value).pos();
|
||||
if let Some(last) = all_keys.last_mut() {
|
||||
if last.0 == delta_key.key() {
|
||||
return true;
|
||||
} else {
|
||||
// subtract offset of new key BLOB and first blob of this key
|
||||
// to get total size if values associated with this key
|
||||
let first_pos = last.2;
|
||||
last.2 = pos - first_pos;
|
||||
}
|
||||
}
|
||||
all_keys.push((delta_key.key(), delta_key.lsn(), pos));
|
||||
true
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
if let Some(last) = all_keys.last_mut() {
|
||||
// Last key occupies all space till end of layer
|
||||
last.1 = std::fs::metadata(&file.file.path)?.len() - last.1;
|
||||
last.2 = std::fs::metadata(&file.file.path)?.len() - last.2;
|
||||
}
|
||||
let iter = DeltaKeyIter {
|
||||
all_keys,
|
||||
next_idx: 0,
|
||||
};
|
||||
|
||||
Ok(iter)
|
||||
Ok(all_keys)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod test {
use super::DeltaKeyIter;
use super::DeltaLayer;
use super::DeltaValueIter;
/// Reference to an on-disk value
pub struct ValueRef<T: AsRef<DeltaLayerInner>> {
blob_ref: BlobRef,
reader: BlockCursor<Adapter<T>>,
}

// We will soon need the iters to be send in the compaction code.
// Cf https://github.com/neondatabase/neon/pull/4462#issuecomment-1587398883
// Cf https://github.com/neondatabase/neon/issues/4471
#[test]
fn is_send() {
fn assert_send<T: Send>() {}
assert_send::<DeltaLayer>();
assert_send::<DeltaValueIter>();
assert_send::<DeltaKeyIter>();
impl<T: AsRef<DeltaLayerInner>> ValueRef<T> {
/// Loads the value from disk
pub async fn load(&self) -> Result<Value> {
// theoretically we *could* record an access time for each, but it does not really matter
let buf = self.reader.read_blob(self.blob_ref.pos()).await?;
let val = Value::des(&buf)?;
Ok(val)
}
}

struct Adapter<T: AsRef<DeltaLayerInner>>(T);

impl<T: AsRef<DeltaLayerInner>> BlockReader for Adapter<T> {
fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
self.0.as_ref().file.read_blk(blknum)
}
}

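Note: the load_val_refs / ValueRef pair lets a caller walk the whole delta-layer index once and defer reading each blob until it is actually needed. A rough usage sketch using only the APIs shown in this diff (the wrapper function itself is hypothetical):

    /// Hypothetical caller: touch every value of a delta layer in key/LSN order,
    /// loading each blob lazily through ValueRef::load.
    async fn scan_delta_values(layer: &DeltaLayer, ctx: &RequestContext) -> anyhow::Result<usize> {
        // One index pass collects (Key, Lsn, ValueRef) without reading any blobs yet.
        let val_refs = layer.load_val_refs(ctx).await?;

        let mut loaded = 0;
        for (_key, _lsn, val_ref) in val_refs {
            // Each load() reads exactly one blob through the layer's block reader.
            let _value = val_ref.load().await?;
            loaded += 1;
        }
        Ok(loaded)
    }
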
@@ -38,7 +38,6 @@ use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use bytes::Bytes;
|
||||
use hex;
|
||||
use once_cell::sync::OnceCell;
|
||||
use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -48,6 +47,7 @@ use std::io::{Seek, SeekFrom};
|
||||
use std::ops::Range;
|
||||
use std::os::unix::prelude::FileExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
use tokio::sync::OnceCell;
|
||||
use tracing::*;
|
||||
|
||||
use utils::{
|
||||
@@ -57,9 +57,7 @@ use utils::{
|
||||
};
|
||||
|
||||
use super::filename::ImageFileName;
|
||||
use super::{
|
||||
AsLayerDesc, Layer, LayerAccessStatsReset, LayerIter, PathOrConf, PersistentLayerDesc,
|
||||
};
|
||||
use super::{AsLayerDesc, Layer, LayerAccessStatsReset, PathOrConf, PersistentLayerDesc};
|
||||
|
||||
///
|
||||
/// Header stored in the beginning of the file
|
||||
@@ -68,7 +66,7 @@ use super::{
|
||||
/// the 'index' starts at the block indicated by 'index_start_blk'
|
||||
///
|
||||
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
|
||||
struct Summary {
|
||||
pub(super) struct Summary {
|
||||
/// Magic value to identify this as a neon image file. Always IMAGE_FILE_MAGIC.
|
||||
magic: u16,
|
||||
format_version: u16,
|
||||
@@ -87,13 +85,29 @@ struct Summary {
|
||||
|
||||
impl From<&ImageLayer> for Summary {
|
||||
fn from(layer: &ImageLayer) -> Self {
|
||||
Self::expected(
|
||||
layer.desc.tenant_id,
|
||||
layer.desc.timeline_id,
|
||||
layer.desc.key_range.clone(),
|
||||
layer.lsn,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl Summary {
|
||||
pub(super) fn expected(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
key_range: Range<Key>,
|
||||
lsn: Lsn,
|
||||
) -> Self {
|
||||
Self {
|
||||
magic: IMAGE_FILE_MAGIC,
|
||||
format_version: STORAGE_FORMAT_VERSION,
|
||||
tenant_id: layer.desc.tenant_id,
|
||||
timeline_id: layer.desc.timeline_id,
|
||||
key_range: layer.desc.key_range.clone(),
|
||||
lsn: layer.lsn,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
key_range,
|
||||
lsn,
|
||||
|
||||
index_start_blk: 0,
|
||||
index_root_blk: 0,
|
||||
@@ -138,6 +152,8 @@ pub struct ImageLayerInner {
|
||||
index_start_blk: u32,
|
||||
index_root_blk: u32,
|
||||
|
||||
lsn: Lsn,
|
||||
|
||||
/// Reader object for reading blocks from the file.
|
||||
file: FileBlockReader<VirtualFile>,
|
||||
}
|
||||
@@ -170,17 +186,19 @@ impl Layer for ImageLayer {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let inner = self.load(LayerAccessKind::Dump, ctx)?;
|
||||
let inner = self.load(LayerAccessKind::Dump, ctx).await?;
|
||||
let file = &inner.file;
|
||||
let tree_reader =
|
||||
DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file);
|
||||
|
||||
tree_reader.dump().await?;
|
||||
|
||||
tree_reader.visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
|
||||
println!("key: {} offset {}", hex::encode(key), value);
|
||||
true
|
||||
})?;
|
||||
tree_reader
|
||||
.visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
|
||||
println!("key: {} offset {}", hex::encode(key), value);
|
||||
true
|
||||
})
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -197,28 +215,14 @@ impl Layer for ImageLayer {
|
||||
assert!(lsn_range.start >= self.lsn);
|
||||
assert!(lsn_range.end >= self.lsn);
|
||||
|
||||
let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
|
||||
|
||||
let file = &inner.file;
|
||||
let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);
|
||||
|
||||
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||
key.write_to_byte_slice(&mut keybuf);
|
||||
if let Some(offset) = tree_reader.get(&keybuf)? {
|
||||
let blob = file.block_cursor().read_blob(offset).with_context(|| {
|
||||
format!(
|
||||
"failed to read value from data file {} at offset {}",
|
||||
self.path().display(),
|
||||
offset
|
||||
)
|
||||
})?;
|
||||
let value = Bytes::from(blob);
|
||||
|
||||
reconstruct_state.img = Some((self.lsn, value));
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
} else {
|
||||
Ok(ValueReconstructResult::Missing)
|
||||
}
|
||||
let inner = self
|
||||
.load(LayerAccessKind::GetValueReconstructData, ctx)
|
||||
.await?;
|
||||
inner
|
||||
.get_value_reconstruct_data(key, reconstruct_state)
|
||||
.await
|
||||
// FIXME: makes no sense to dump paths
|
||||
.with_context(|| format!("read {}", self.path().display()))
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
@@ -255,10 +259,6 @@ impl PersistentLayer for ImageLayer {
|
||||
Some(self.path())
|
||||
}
|
||||
|
||||
fn iter(&self, _ctx: &RequestContext) -> Result<LayerIter<'_>> {
|
||||
unimplemented!();
|
||||
}
|
||||
|
||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
@@ -318,58 +318,41 @@ impl ImageLayer {
|
||||
/// Open the underlying file and read the metadata into memory, if it's
|
||||
/// not loaded already.
|
||||
///
|
||||
fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&ImageLayerInner> {
|
||||
self.access_stats
|
||||
.record_access(access_kind, ctx.task_kind());
|
||||
loop {
|
||||
if let Some(inner) = self.inner.get() {
|
||||
return Ok(inner);
|
||||
}
|
||||
self.inner
|
||||
.get_or_try_init(|| self.load_inner())
|
||||
.with_context(|| format!("Failed to load image layer {}", self.path().display()))?;
|
||||
}
|
||||
async fn load(
|
||||
&self,
|
||||
access_kind: LayerAccessKind,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<&ImageLayerInner> {
|
||||
self.access_stats.record_access(access_kind, ctx);
|
||||
self.inner
|
||||
.get_or_try_init(|| self.load_inner())
|
||||
.await
|
||||
.with_context(|| format!("Failed to load image layer {}", self.path().display()))
|
||||
}
|
||||
|
||||
fn load_inner(&self) -> Result<ImageLayerInner> {
|
||||
async fn load_inner(&self) -> Result<ImageLayerInner> {
|
||||
let path = self.path();
|
||||
|
||||
// Open the file if it's not open already.
|
||||
let file = VirtualFile::open(&path)
|
||||
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
|
||||
let file = FileBlockReader::new(file);
|
||||
let summary_blk = file.read_blk(0)?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
||||
let expected_summary = match &self.path_or_conf {
|
||||
PathOrConf::Conf(_) => Some(Summary::from(self)),
|
||||
PathOrConf::Path(_) => None,
|
||||
};
|
||||
|
||||
match &self.path_or_conf {
|
||||
PathOrConf::Conf(_) => {
|
||||
let mut expected_summary = Summary::from(self);
|
||||
expected_summary.index_start_blk = actual_summary.index_start_blk;
|
||||
expected_summary.index_root_blk = actual_summary.index_root_blk;
|
||||
let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary)?;
|
||||
|
||||
if actual_summary != expected_summary {
|
||||
bail!("in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary);
|
||||
}
|
||||
}
|
||||
PathOrConf::Path(path) => {
|
||||
let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
|
||||
let expected_filename = self.filename().file_name();
|
||||
if let PathOrConf::Path(ref path) = self.path_or_conf {
|
||||
// not production code
|
||||
let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
|
||||
let expected_filename = self.filename().file_name();
|
||||
|
||||
if actual_filename != expected_filename {
|
||||
println!(
|
||||
"warning: filename does not match what is expected from in-file summary"
|
||||
);
|
||||
println!("actual: {:?}", actual_filename);
|
||||
println!("expected: {:?}", expected_filename);
|
||||
}
|
||||
if actual_filename != expected_filename {
|
||||
println!("warning: filename does not match what is expected from in-file summary");
|
||||
println!("actual: {:?}", actual_filename);
|
||||
println!("expected: {:?}", expected_filename);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(ImageLayerInner {
|
||||
index_start_blk: actual_summary.index_start_blk,
|
||||
index_root_blk: actual_summary.index_root_blk,
|
||||
file,
|
||||
})
|
||||
Ok(loaded)
|
||||
}
|
||||
|
||||
/// Create an ImageLayer struct representing an existing file on disk
|
||||
@@ -439,6 +422,66 @@ impl ImageLayer {
|
||||
}
|
||||
}
|
||||
|
||||
impl ImageLayerInner {
|
||||
pub(super) fn load(
|
||||
path: &std::path::Path,
|
||||
lsn: Lsn,
|
||||
summary: Option<Summary>,
|
||||
) -> anyhow::Result<Self> {
|
||||
let file = VirtualFile::open(path)
|
||||
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
|
||||
let file = FileBlockReader::new(file);
|
||||
let summary_blk = file.read_blk(0)?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
||||
|
||||
if let Some(mut expected_summary) = summary {
|
||||
// production code path
|
||||
expected_summary.index_start_blk = actual_summary.index_start_blk;
|
||||
expected_summary.index_root_blk = actual_summary.index_root_blk;
|
||||
|
||||
if actual_summary != expected_summary {
|
||||
bail!(
|
||||
"in-file summary does not match expected summary. actual = {:?} expected = {:?}",
|
||||
actual_summary,
|
||||
expected_summary
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(ImageLayerInner {
|
||||
index_start_blk: actual_summary.index_start_blk,
|
||||
index_root_blk: actual_summary.index_root_blk,
|
||||
lsn,
|
||||
file,
|
||||
})
|
||||
}
|
||||
|
||||
pub(super) async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
let file = &self.file;
|
||||
let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file);
|
||||
|
||||
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||
key.write_to_byte_slice(&mut keybuf);
|
||||
if let Some(offset) = tree_reader.get(&keybuf).await? {
|
||||
let blob = file
|
||||
.block_cursor()
|
||||
.read_blob(offset)
|
||||
.await
|
||||
.with_context(|| format!("failed to read value from offset {}", offset))?;
|
||||
let value = Bytes::from(blob);
|
||||
|
||||
reconstruct_state.img = Some((self.lsn, value));
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
} else {
|
||||
Ok(ValueReconstructResult::Missing)
|
||||
}
|
||||
}
|
||||
}
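To make the control flow of `get_value_reconstruct_data` above easier to follow in isolation, here is a simplified, in-memory analogue. It is only a sketch: a `BTreeMap` stands in for the on-disk B-tree index, a `Vec` of blobs stands in for the blob reads, and the result and state types are reduced to the fields used here.

use std::collections::BTreeMap;

use bytes::Bytes;

#[derive(Debug, PartialEq)]
enum ValueReconstructResult {
    Complete,
    Missing,
}

#[derive(Default)]
struct ValueReconstructState {
    img: Option<(u64 /* lsn */, Bytes)>,
}

struct ImageLayerInner {
    lsn: u64,
    index: BTreeMap<[u8; 8], u64>, // key -> offset into `blobs`
    blobs: Vec<Vec<u8>>,
}

impl ImageLayerInner {
    fn get_value_reconstruct_data(
        &self,
        key: [u8; 8],
        state: &mut ValueReconstructState,
    ) -> ValueReconstructResult {
        match self.index.get(&key) {
            Some(&offset) => {
                // "read_blob": fetch the serialized image at the recorded offset
                let blob = self.blobs[offset as usize].clone();
                state.img = Some((self.lsn, Bytes::from(blob)));
                ValueReconstructResult::Complete
            }
            None => ValueReconstructResult::Missing,
        }
    }
}

fn main() {
    let layer = ImageLayerInner {
        lsn: 42,
        index: BTreeMap::from([([1u8; 8], 0)]),
        blobs: vec![b"page image".to_vec()],
    };
    let mut state = ValueReconstructState::default();
    assert_eq!(
        layer.get_value_reconstruct_data([1u8; 8], &mut state),
        ValueReconstructResult::Complete
    );
    assert!(state.img.is_some());
}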
|
||||
|
||||
/// A builder object for constructing a new image layer.
|
||||
///
|
||||
/// Usage:
|
||||
|
||||
@@ -16,6 +16,7 @@ use anyhow::{ensure, Result};
|
||||
use pageserver_api::models::InMemoryLayerInfo;
|
||||
use std::cell::RefCell;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::OnceLock;
|
||||
use tracing::*;
|
||||
use utils::{
|
||||
bin_ser::BeSer,
|
||||
@@ -27,7 +28,7 @@ use utils::{
|
||||
// while being able to use std::fmt::Write's methods
use std::fmt::Write as _;
use std::ops::Range;
use std::sync::RwLock;
use tokio::sync::RwLock;

use super::{DeltaLayer, DeltaLayerWriter, Layer};
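The import swap above (std::sync::RwLock replaced by tokio::sync::RwLock) matters because the guards of an async-aware lock can be held across `.await` points without tying up a runtime worker thread. A minimal sketch of the pattern, unrelated to the layer code itself:

use std::sync::Arc;
use tokio::sync::RwLock;

#[tokio::main]
async fn main() {
    let shared = Arc::new(RwLock::new(0u64));

    let writer = {
        let shared = Arc::clone(&shared);
        tokio::spawn(async move {
            // The write guard is held across an await; with std::sync::RwLock
            // this would block (and possibly deadlock) an executor thread.
            let mut guard = shared.write().await;
            tokio::time::sleep(std::time::Duration::from_millis(10)).await;
            *guard += 1;
        })
    };

    writer.await.unwrap();
    assert_eq!(*shared.read().await, 1);
}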
|
||||
|
||||
@@ -42,14 +43,16 @@ pub struct InMemoryLayer {
    tenant_id: TenantId,
    timeline_id: TimelineId,

    ///
    /// This layer contains all the changes from 'start_lsn'. The
    /// start is inclusive.
    ///
    start_lsn: Lsn,

    /// The above fields never change. The parts that do change are in 'inner',
    /// and protected by mutex.
    /// Frozen layers have an exclusive end LSN.
    /// Writes are only allowed when this is `None`.
    end_lsn: OnceLock<Lsn>,

    /// The above fields never change, except for `end_lsn`, which is only set once.
    /// All other changing parts are in `inner`, and protected by a mutex.
    inner: RwLock<InMemoryLayerInner>,
}
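The `end_lsn: OnceLock<Lsn>` field above captures the freeze semantics described in the comments: it starts unset (the layer is writable) and can be set exactly once when the layer is frozen. A standalone sketch of that set-once behaviour, using a plain `u64` in place of `Lsn`:

use std::sync::OnceLock;

struct Layer {
    end_lsn: OnceLock<u64>,
}

impl Layer {
    fn is_writable(&self) -> bool {
        self.end_lsn.get().is_none()
    }

    fn freeze(&self, end_lsn: u64) {
        // set() fails if a value was already written; freezing twice is a bug.
        self.end_lsn.set(end_lsn).expect("end_lsn set only once");
    }

    fn end_lsn_or_max(&self) -> u64 {
        self.end_lsn.get().copied().unwrap_or(u64::MAX)
    }
}

fn main() {
    let layer = Layer { end_lsn: OnceLock::new() };
    assert!(layer.is_writable());
    assert_eq!(layer.end_lsn_or_max(), u64::MAX);

    layer.freeze(100);
    assert!(!layer.is_writable());
    assert_eq!(layer.end_lsn_or_max(), 100);
}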
|
||||
|
||||
@@ -57,21 +60,16 @@ impl std::fmt::Debug for InMemoryLayer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("InMemoryLayer")
|
||||
.field("start_lsn", &self.start_lsn)
|
||||
.field("end_lsn", &self.end_lsn)
|
||||
.field("inner", &self.inner)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct InMemoryLayerInner {
|
||||
/// Frozen layers have an exclusive end LSN.
|
||||
/// Writes are only allowed when this is None
|
||||
end_lsn: Option<Lsn>,
|
||||
|
||||
///
|
||||
/// All versions of all pages in the layer are kept here. Indexed
|
||||
/// by block number and LSN. The value is an offset into the
|
||||
/// ephemeral file where the page version is stored.
|
||||
///
|
||||
index: HashMap<Key, VecMap<Lsn, u64>>,
|
||||
|
||||
/// The values are stored in a serialized format in this file.
|
||||
@@ -82,15 +80,7 @@ pub struct InMemoryLayerInner {
|
||||
|
||||
impl std::fmt::Debug for InMemoryLayerInner {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("InMemoryLayerInner")
|
||||
.field("end_lsn", &self.end_lsn)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl InMemoryLayerInner {
|
||||
fn assert_writeable(&self) {
|
||||
assert!(self.end_lsn.is_none());
|
||||
f.debug_struct("InMemoryLayerInner").finish()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -101,13 +91,21 @@ impl InMemoryLayer {
|
||||
|
||||
pub fn info(&self) -> InMemoryLayerInfo {
|
||||
let lsn_start = self.start_lsn;
|
||||
let lsn_end = self.inner.read().unwrap().end_lsn;
|
||||
|
||||
match lsn_end {
|
||||
Some(lsn_end) => InMemoryLayerInfo::Frozen { lsn_start, lsn_end },
|
||||
None => InMemoryLayerInfo::Open { lsn_start },
|
||||
if let Some(&lsn_end) = self.end_lsn.get() {
|
||||
InMemoryLayerInfo::Frozen { lsn_start, lsn_end }
|
||||
} else {
|
||||
InMemoryLayerInfo::Open { lsn_start }
|
||||
}
|
||||
}
|
||||
|
||||
fn assert_writable(&self) {
|
||||
assert!(self.end_lsn.get().is_none());
|
||||
}
|
||||
|
||||
fn end_lsn_or_max(&self) -> Lsn {
|
||||
self.end_lsn.get().copied().unwrap_or(Lsn::MAX)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
@@ -117,14 +115,7 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let end_lsn = if let Some(end_lsn) = inner.end_lsn {
|
||||
end_lsn
|
||||
} else {
|
||||
Lsn(u64::MAX)
|
||||
};
|
||||
self.start_lsn..end_lsn
|
||||
self.start_lsn..self.end_lsn_or_max()
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
@@ -134,13 +125,9 @@ impl Layer for InMemoryLayer {
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
let inner = self.inner.read().unwrap();
|
||||
let inner = self.inner.read().await;
|
||||
|
||||
let end_str = inner
|
||||
.end_lsn
|
||||
.as_ref()
|
||||
.map(Lsn::to_string)
|
||||
.unwrap_or_default();
|
||||
let end_str = self.end_lsn_or_max();
|
||||
|
||||
println!(
|
||||
"----- in-memory layer for tli {} LSNs {}-{} ----",
|
||||
@@ -151,12 +138,12 @@ impl Layer for InMemoryLayer {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut cursor = inner.file.block_cursor();
|
||||
let cursor = inner.file.block_cursor();
|
||||
let mut buf = Vec::new();
|
||||
for (key, vec_map) in inner.index.iter() {
|
||||
for (lsn, pos) in vec_map.as_slice() {
|
||||
let mut desc = String::new();
|
||||
cursor.read_blob_into_buf(*pos, &mut buf)?;
|
||||
cursor.read_blob_into_buf(*pos, &mut buf).await?;
|
||||
let val = Value::des(&buf);
|
||||
match val {
|
||||
Ok(Value::Image(img)) => {
|
||||
@@ -194,15 +181,15 @@ impl Layer for InMemoryLayer {
|
||||
ensure!(lsn_range.start >= self.start_lsn);
|
||||
let mut need_image = true;
|
||||
|
||||
let inner = self.inner.read().unwrap();
|
||||
let inner = self.inner.read().await;
|
||||
|
||||
let mut reader = inner.file.block_cursor();
|
||||
let reader = inner.file.block_cursor();
|
||||
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
if let Some(vec_map) = inner.index.get(&key) {
|
||||
let slice = vec_map.slice_range(lsn_range);
|
||||
for (entry_lsn, pos) in slice.iter().rev() {
|
||||
let buf = reader.read_blob(*pos)?;
|
||||
let buf = reader.read_blob(*pos).await?;
|
||||
let value = Value::des(&buf)?;
|
||||
match value {
|
||||
Value::Image(img) => {
|
||||
@@ -236,9 +223,7 @@ impl Layer for InMemoryLayer {
|
||||
|
||||
impl std::fmt::Display for InMemoryLayer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX));
|
||||
let end_lsn = self.end_lsn_or_max();
|
||||
write!(f, "inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0)
|
||||
}
|
||||
}
|
||||
@@ -247,8 +232,8 @@ impl InMemoryLayer {
|
||||
///
|
||||
/// Get layer size on the disk
|
||||
///
|
||||
pub fn size(&self) -> Result<u64> {
|
||||
let inner = self.inner.read().unwrap();
|
||||
pub async fn size(&self) -> Result<u64> {
|
||||
let inner = self.inner.read().await;
|
||||
Ok(inner.file.size)
|
||||
}
|
||||
|
||||
@@ -270,8 +255,8 @@ impl InMemoryLayer {
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
start_lsn,
|
||||
end_lsn: OnceLock::new(),
|
||||
inner: RwLock::new(InMemoryLayerInner {
|
||||
end_lsn: None,
|
||||
index: HashMap::new(),
|
||||
file,
|
||||
}),
|
||||
@@ -282,10 +267,10 @@ impl InMemoryLayer {
|
||||
|
||||
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
|
||||
/// Adds the page version to the in-memory tree
|
||||
pub fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
|
||||
pub async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
|
||||
trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
inner.assert_writeable();
|
||||
let mut inner = self.inner.write().await;
|
||||
self.assert_writable();
|
||||
|
||||
let off = {
|
||||
SER_BUFFER.with(|x| -> Result<_> {
|
||||
@@ -316,11 +301,11 @@ impl InMemoryLayer {
|
||||
/// Make the layer non-writeable. Only call once.
|
||||
/// Records the end_lsn for non-dropped layers.
|
||||
/// `end_lsn` is exclusive
|
||||
pub fn freeze(&self, end_lsn: Lsn) {
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
pub async fn freeze(&self, end_lsn: Lsn) {
|
||||
let inner = self.inner.write().await;
|
||||
|
||||
assert!(self.start_lsn < end_lsn);
|
||||
inner.end_lsn = Some(end_lsn);
|
||||
self.end_lsn.set(end_lsn).expect("end_lsn set only once");
|
||||
|
||||
for vec_map in inner.index.values() {
|
||||
for (lsn, _pos) in vec_map.as_slice() {
|
||||
@@ -332,7 +317,7 @@ impl InMemoryLayer {
|
||||
/// Write this frozen in-memory layer to disk.
|
||||
///
|
||||
/// Returns a new delta layer with all the same data as this in-memory layer
|
||||
pub fn write_to_disk(&self) -> Result<DeltaLayer> {
|
||||
pub async fn write_to_disk(&self) -> Result<DeltaLayer> {
|
||||
// Grab the lock in read-mode. We hold it over the I/O, but because this
|
||||
// layer is not writeable anymore, no one should be trying to acquire the
|
||||
// write lock on it, so we shouldn't block anyone. There's one exception
|
||||
@@ -342,19 +327,21 @@ impl InMemoryLayer {
|
||||
// lock, it will see that it's not writeable anymore and retry, but it
|
||||
// would have to wait until we release it. That race condition is very
|
||||
// rare though, so we just accept the potential latency hit for now.
|
||||
let inner = self.inner.read().unwrap();
|
||||
let inner = self.inner.read().await;
|
||||
|
||||
let end_lsn = *self.end_lsn.get().unwrap();
|
||||
|
||||
let mut delta_layer_writer = DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
Key::MIN,
|
||||
self.start_lsn..inner.end_lsn.unwrap(),
|
||||
self.start_lsn..end_lsn,
|
||||
)?;
|
||||
|
||||
let mut buf = Vec::new();
|
||||
|
||||
let mut cursor = inner.file.block_cursor();
|
||||
let cursor = inner.file.block_cursor();
|
||||
|
||||
let mut keys: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
|
||||
keys.sort_by_key(|k| k.0);
|
||||
@@ -363,7 +350,7 @@ impl InMemoryLayer {
|
||||
let key = **key;
|
||||
// Write all page versions
|
||||
for (lsn, pos) in vec_map.as_slice() {
|
||||
cursor.read_blob_into_buf(*pos, &mut buf)?;
|
||||
cursor.read_blob_into_buf(*pos, &mut buf).await?;
|
||||
let will_init = Value::des(&buf)?.will_init();
|
||||
delta_layer_writer.put_value_bytes(key, *lsn, &buf, will_init)?;
|
||||
}
|
||||
|
||||
@@ -20,8 +20,8 @@ use utils::{
|
||||
|
||||
use super::filename::{DeltaFileName, ImageFileName};
|
||||
use super::{
|
||||
AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter,
|
||||
LayerKeyIter, LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
|
||||
AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset,
|
||||
LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
|
||||
};
|
||||
|
||||
/// RemoteLayer is a not yet downloaded [`ImageLayer`] or
|
||||
@@ -129,14 +129,6 @@ impl PersistentLayer for RemoteLayer {
|
||||
None
|
||||
}
|
||||
|
||||
fn iter(&self, _ctx: &RequestContext) -> Result<LayerIter<'_>> {
|
||||
bail!("cannot iterate a remote layer");
|
||||
}
|
||||
|
||||
fn key_iter(&self, _ctx: &RequestContext) -> Result<LayerKeyIter<'_>> {
|
||||
bail!("cannot iterate a remote layer");
|
||||
}
|
||||
|
||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
bail!("remote layer has no layer file");
|
||||
}
|
||||
|
||||
@@ -73,17 +73,13 @@ pub fn start_background_loops(
|
||||
///
|
||||
async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
let wait_duration = Duration::from_secs(2);
|
||||
info!("starting");
|
||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||
async {
|
||||
let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
|
||||
let mut first = true;
|
||||
loop {
|
||||
trace!("waking up");
|
||||
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => {
|
||||
info!("received cancellation request");
|
||||
return;
|
||||
},
|
||||
tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
|
||||
@@ -126,15 +122,12 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
.await
|
||||
.is_ok()
|
||||
{
|
||||
info!("received cancellation request during idling");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
.await;
|
||||
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
|
||||
|
||||
trace!("compaction loop stopped.");
|
||||
}
|
||||
|
||||
///
|
||||
@@ -142,7 +135,6 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
///
|
||||
async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
let wait_duration = Duration::from_secs(2);
|
||||
info!("starting");
|
||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||
async {
|
||||
// GC might require downloading, to find the cutoff LSN that corresponds to the
|
||||
@@ -151,11 +143,8 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
|
||||
let mut first = true;
|
||||
loop {
|
||||
trace!("waking up");
|
||||
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => {
|
||||
info!("received cancellation request");
|
||||
return;
|
||||
},
|
||||
tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
|
||||
@@ -200,14 +189,12 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
.await
|
||||
.is_ok()
|
||||
{
|
||||
info!("received cancellation request during idling");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
.await;
|
||||
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
|
||||
trace!("GC loop stopped.");
|
||||
}
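Both the compaction and GC loops above follow the same shape: a `tokio::select!` between a `CancellationToken` and the periodic work, so the task exits promptly on shutdown instead of sleeping through it. A reduced sketch of that shape (the `TENANT_TASK_EVENTS` metrics and tenant-state handling are omitted):

use std::time::Duration;
use tokio_util::sync::CancellationToken;

async fn periodic_loop(cancel: CancellationToken) {
    let wait_duration = Duration::from_secs(2);
    loop {
        tokio::select! {
            _ = cancel.cancelled() => {
                println!("received cancellation request");
                return;
            }
            _ = tokio::time::sleep(wait_duration) => {
                // stand-in for one compaction/GC iteration
                println!("doing one iteration of work");
            }
        }
    }
}

#[tokio::main]
async fn main() {
    let cancel = CancellationToken::new();
    let task = tokio::spawn(periodic_loop(cancel.clone()));

    tokio::time::sleep(Duration::from_millis(10)).await;
    cancel.cancel();
    task.await.unwrap();
}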
|
||||
|
||||
async fn wait_for_active_tenant(tenant: &Arc<Tenant>) -> ControlFlow<()> {
|
||||
@@ -232,7 +219,6 @@ async fn wait_for_active_tenant(tenant: &Arc<Tenant>) -> ControlFlow<()> {
|
||||
}
|
||||
}
|
||||
Err(_sender_dropped_error) => {
|
||||
info!("Tenant dropped the state updates sender, quitting waiting for tenant and the task loop");
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,6 +19,7 @@ use pageserver_api::models::{
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use serde_with::serde_as;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use tokio::runtime::Handle;
|
||||
use tokio::sync::{oneshot, watch, TryAcquireError};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
@@ -34,7 +35,9 @@ use std::sync::atomic::Ordering as AtomicOrdering;
|
||||
use std::sync::{Arc, Mutex, RwLock, Weak};
|
||||
use std::time::{Duration, Instant, SystemTime};
|
||||
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::context::{
|
||||
AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
|
||||
};
|
||||
use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata};
|
||||
use crate::tenant::storage_layer::{
|
||||
DeltaFileName, DeltaLayerWriter, ImageFileName, ImageLayerWriter, InMemoryLayer,
|
||||
@@ -528,7 +531,7 @@ impl Timeline {
|
||||
size
|
||||
}
|
||||
|
||||
pub fn get_resident_physical_size(&self) -> u64 {
|
||||
pub fn resident_physical_size(&self) -> u64 {
|
||||
self.metrics.resident_physical_size_gauge.get()
|
||||
}
|
||||
|
||||
@@ -697,6 +700,9 @@ impl Timeline {
|
||||
Err(CompactionError::DownloadRequired(rls)) => {
|
||||
anyhow::bail!("Compaction requires downloading multiple times (last was {} layers), possibly battling against eviction", rls.len())
|
||||
}
|
||||
Err(CompactionError::ShuttingDown) => {
|
||||
return Ok(());
|
||||
}
|
||||
Err(CompactionError::Other(e)) => {
|
||||
return Err(e);
|
||||
}
|
||||
@@ -778,7 +784,8 @@ impl Timeline {
|
||||
let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await);
|
||||
// Is the timeline being deleted?
|
||||
if self.is_stopping() {
|
||||
return Err(anyhow::anyhow!("timeline is Stopping").into());
|
||||
trace!("Dropping out of compaction on timeline shutdown");
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
}
|
||||
|
||||
let target_file_size = self.get_checkpoint_distance();
|
||||
@@ -794,10 +801,15 @@ impl Timeline {
|
||||
.await
|
||||
{
|
||||
Ok((partitioning, lsn)) => {
|
||||
// Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
|
||||
let image_ctx = RequestContextBuilder::extend(ctx)
|
||||
.access_stats_behavior(AccessStatsBehavior::Skip)
|
||||
.build();
|
||||
|
||||
// 2. Create new image layers for partitions that have been modified
|
||||
// "enough".
|
||||
let layer_paths_to_upload = self
|
||||
.create_image_layers(&partitioning, lsn, false, ctx)
|
||||
.create_image_layers(&partitioning, lsn, false, &image_ctx)
|
||||
.await
|
||||
.map_err(anyhow::Error::from)?;
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
@@ -870,7 +882,7 @@ impl Timeline {
|
||||
let Some(open_layer) = layers.open_layer.as_ref() else {
|
||||
return Ok(());
|
||||
};
|
||||
open_layer.size()?
|
||||
open_layer.size().await?
|
||||
};
|
||||
let last_freeze_at = self.last_freeze_at.load();
|
||||
let last_freeze_ts = *(self.last_freeze_ts.read().unwrap());
|
||||
@@ -914,7 +926,7 @@ impl Timeline {
|
||||
pub fn set_state(&self, new_state: TimelineState) {
|
||||
match (self.current_state(), new_state) {
|
||||
(equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => {
|
||||
warn!("Ignoring new state, equal to the existing one: {equal_state_2:?}");
|
||||
info!("Ignoring new state, equal to the existing one: {equal_state_2:?}");
|
||||
}
|
||||
(st, TimelineState::Loading) => {
|
||||
error!("ignoring transition from {st:?} into Loading state");
|
||||
@@ -1155,7 +1167,7 @@ impl Timeline {
|
||||
return Err(EvictionError::CannotEvictRemoteLayer);
|
||||
}
|
||||
|
||||
let layer_file_size = local_layer.file_size();
|
||||
let layer_file_size = local_layer.layer_desc().file_size;
|
||||
|
||||
let local_layer_mtime = local_layer
|
||||
.local_path()
|
||||
@@ -1585,7 +1597,6 @@ impl Timeline {
|
||||
///
|
||||
pub(super) async fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
|
||||
let mut guard = self.layers.write().await;
|
||||
let mut num_layers = 0;
|
||||
|
||||
let timer = self.metrics.load_layer_map_histo.start_timer();
|
||||
|
||||
@@ -1603,12 +1614,12 @@ impl Timeline {
|
||||
let fname = direntry.file_name();
|
||||
let fname = fname.to_string_lossy();
|
||||
|
||||
if let Some(imgfilename) = ImageFileName::parse_str(&fname) {
|
||||
if let Some(filename) = ImageFileName::parse_str(&fname) {
|
||||
// create an ImageLayer struct for each image file.
|
||||
if imgfilename.lsn > disk_consistent_lsn {
|
||||
if filename.lsn > disk_consistent_lsn {
|
||||
info!(
|
||||
"found future image layer {} on timeline {} disk_consistent_lsn is {}",
|
||||
imgfilename, self.timeline_id, disk_consistent_lsn
|
||||
filename, self.timeline_id, disk_consistent_lsn
|
||||
);
|
||||
|
||||
rename_to_backup(&direntry_path)?;
|
||||
@@ -1616,31 +1627,31 @@ impl Timeline {
|
||||
}
|
||||
|
||||
let file_size = direntry_path.metadata()?.len();
|
||||
let stats =
|
||||
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident);
|
||||
|
||||
let layer = ImageLayer::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
&imgfilename,
|
||||
&filename,
|
||||
file_size,
|
||||
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident),
|
||||
stats,
|
||||
);
|
||||
|
||||
trace!("found layer {}", layer.path().display());
|
||||
total_physical_size += file_size;
|
||||
loaded_layers.push(Arc::new(layer));
|
||||
num_layers += 1;
|
||||
} else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) {
|
||||
} else if let Some(filename) = DeltaFileName::parse_str(&fname) {
|
||||
// Create a DeltaLayer struct for each delta file.
|
||||
// The end-LSN is exclusive, while disk_consistent_lsn is
|
||||
// inclusive. For example, if disk_consistent_lsn is 100, it is
|
||||
// OK for a delta layer to have end LSN 101, but if the end LSN
|
||||
// is 102, then it might not have been fully flushed to disk
|
||||
// before crash.
|
||||
if deltafilename.lsn_range.end > disk_consistent_lsn + 1 {
|
||||
if filename.lsn_range.end > disk_consistent_lsn + 1 {
|
||||
info!(
|
||||
"found future delta layer {} on timeline {} disk_consistent_lsn is {}",
|
||||
deltafilename, self.timeline_id, disk_consistent_lsn
|
||||
filename, self.timeline_id, disk_consistent_lsn
|
||||
);
|
||||
|
||||
rename_to_backup(&direntry_path)?;
|
||||
@@ -1648,20 +1659,20 @@ impl Timeline {
|
||||
}
|
||||
|
||||
let file_size = direntry_path.metadata()?.len();
|
||||
let stats =
|
||||
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident);
|
||||
|
||||
let layer = DeltaLayer::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
&deltafilename,
|
||||
&filename,
|
||||
file_size,
|
||||
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident),
|
||||
stats,
|
||||
);
|
||||
|
||||
trace!("found layer {}", layer.path().display());
|
||||
total_physical_size += file_size;
|
||||
loaded_layers.push(Arc::new(layer));
|
||||
num_layers += 1;
|
||||
} else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
|
||||
// ignore these
|
||||
} else if remote_timeline_client::is_temp_download_file(&direntry_path) {
|
||||
@@ -1686,6 +1697,7 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
let num_layers = loaded_layers.len();
|
||||
guard.initialize_local_layers(loaded_layers, Lsn(disk_consistent_lsn.0) + 1);
|
||||
|
||||
info!(
|
||||
@@ -1786,13 +1798,15 @@ impl Timeline {
|
||||
);
|
||||
continue;
|
||||
}
|
||||
let stats =
|
||||
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted);
|
||||
|
||||
let remote_layer = RemoteLayer::new_img(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
imgfilename,
|
||||
&remote_layer_metadata,
|
||||
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted),
|
||||
stats,
|
||||
);
|
||||
let remote_layer = Arc::new(remote_layer);
|
||||
added_remote_layers.push(remote_layer);
|
||||
@@ -1811,12 +1825,15 @@ impl Timeline {
|
||||
);
|
||||
continue;
|
||||
}
|
||||
let stats =
|
||||
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted);
|
||||
|
||||
let remote_layer = RemoteLayer::new_delta(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
deltafilename,
|
||||
&remote_layer_metadata,
|
||||
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted),
|
||||
stats,
|
||||
);
|
||||
let remote_layer = Arc::new(remote_layer);
|
||||
added_remote_layers.push(remote_layer);
|
||||
@@ -2264,15 +2281,16 @@ trait TraversalLayerExt {
|
||||
|
||||
impl TraversalLayerExt for Arc<dyn PersistentLayer> {
|
||||
fn traversal_id(&self) -> TraversalId {
|
||||
let timeline_id = self.layer_desc().timeline_id;
|
||||
match self.local_path() {
|
||||
Some(local_path) => {
|
||||
debug_assert!(local_path.to_str().unwrap().contains(&format!("{}", self.get_timeline_id())),
|
||||
debug_assert!(local_path.to_str().unwrap().contains(&format!("{}", timeline_id)),
|
||||
"need timeline ID to uniquely identify the layer when traversal crosses ancestor boundary",
|
||||
);
|
||||
format!("{}", local_path.display())
|
||||
}
|
||||
None => {
|
||||
format!("remote {}/{self}", self.get_timeline_id())
|
||||
format!("remote {}/{self}", timeline_id)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2636,7 +2654,7 @@ impl Timeline {
|
||||
async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> anyhow::Result<()> {
|
||||
//info!("PUT: key {} at {}", key, lsn);
|
||||
let layer = self.get_layer_for_write(lsn).await?;
|
||||
layer.put_value(key, lsn, val)?;
|
||||
layer.put_value(key, lsn, val).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -2662,7 +2680,9 @@ impl Timeline {
|
||||
Some(self.write_lock.lock().await)
|
||||
};
|
||||
let mut guard = self.layers.write().await;
|
||||
guard.try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at);
|
||||
guard
|
||||
.try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at)
|
||||
.await;
|
||||
}
|
||||
|
||||
/// Layer flusher task's main loop.
|
||||
@@ -2808,7 +2828,10 @@ impl Timeline {
|
||||
// We will remove frozen layer and add delta layer in one atomic operation later.
|
||||
let layer = self.create_delta_layer(&frozen_layer).await?;
|
||||
(
|
||||
HashMap::from([(layer.filename(), LayerFileMetadata::new(layer.file_size()))]),
|
||||
HashMap::from([(
|
||||
layer.filename(),
|
||||
LayerFileMetadata::new(layer.layer_desc().file_size),
|
||||
)]),
|
||||
Some(layer),
|
||||
)
|
||||
};
|
||||
@@ -2828,7 +2851,7 @@ impl Timeline {
|
||||
);
|
||||
|
||||
// update metrics
|
||||
let sz = l.file_size();
|
||||
let sz = l.layer_desc().file_size;
|
||||
self.metrics.resident_physical_size_gauge.add(sz);
|
||||
self.metrics.num_persistent_files_created.inc_by(1);
|
||||
self.metrics.persistent_bytes_written.inc_by(sz);
|
||||
@@ -2941,7 +2964,11 @@ impl Timeline {
|
||||
let frozen_layer = Arc::clone(frozen_layer);
|
||||
move || {
|
||||
// Write it out
|
||||
let new_delta = frozen_layer.write_to_disk()?;
|
||||
// Keep this inside `spawn_blocking` and `Handle::current`
|
||||
// as long as the write path is still sync and the read impl
|
||||
// is still not fully async. Otherwise executor threads would
|
||||
// be blocked.
|
||||
let new_delta = Handle::current().block_on(frozen_layer.write_to_disk())?;
|
||||
let new_delta_path = new_delta.path();
|
||||
|
||||
// Sync it to disk.
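The "Keep this inside spawn_blocking and Handle::current" comment above describes bridging a still-synchronous write path into async code: the blocking work stays on `spawn_blocking`, and the one async call inside it is driven with a runtime handle. A minimal sketch of that bridge, with the frozen-layer write replaced by a trivial async function:

use tokio::runtime::Handle;

async fn write_to_disk() -> anyhow::Result<u64> {
    // stand-in for the async frozen-layer write; returns bytes written
    Ok(42)
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // Capture the runtime handle on the async side...
    let handle = Handle::current();

    let written = tokio::task::spawn_blocking(move || {
        // ...and use it to drive the async call from the blocking thread.
        // Calling block_on on an async worker thread would panic or starve
        // the runtime, which is why this stays inside spawn_blocking.
        handle.block_on(write_to_disk())
    })
    .await??;

    println!("wrote {written} bytes");
    Ok(())
}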
|
||||
@@ -3235,6 +3262,8 @@ enum CompactionError {
|
||||
/// This should not happen repeatedly, but will be retried once by top-level
|
||||
/// `Timeline::compact`.
|
||||
DownloadRequired(Vec<Arc<RemoteLayer>>),
|
||||
/// The timeline or pageserver is shutting down
|
||||
ShuttingDown,
|
||||
/// Compaction cannot be done right now; page reconstruction and so on.
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
@@ -3445,14 +3474,14 @@ impl Timeline {
|
||||
// "gaps" in the sequence of level 0 files should only happen in case
|
||||
// of a crash, partial download from cloud storage, or something like
|
||||
// that, so it's not a big deal in practice.
|
||||
level0_deltas.sort_by_key(|l| l.get_lsn_range().start);
|
||||
level0_deltas.sort_by_key(|l| l.layer_desc().lsn_range.start);
|
||||
let mut level0_deltas_iter = level0_deltas.iter();
|
||||
|
||||
let first_level0_delta = level0_deltas_iter.next().unwrap();
|
||||
let mut prev_lsn_end = first_level0_delta.get_lsn_range().end;
|
||||
let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end;
|
||||
let mut deltas_to_compact = vec![Arc::clone(first_level0_delta)];
|
||||
for l in level0_deltas_iter {
|
||||
let lsn_range = l.get_lsn_range();
|
||||
let lsn_range = &l.layer_desc().lsn_range;
|
||||
|
||||
if lsn_range.start != prev_lsn_end {
|
||||
break;
|
||||
@@ -3461,8 +3490,13 @@ impl Timeline {
|
||||
prev_lsn_end = lsn_range.end;
|
||||
}
|
||||
let lsn_range = Range {
|
||||
start: deltas_to_compact.first().unwrap().get_lsn_range().start,
|
||||
end: deltas_to_compact.last().unwrap().get_lsn_range().end,
|
||||
start: deltas_to_compact
|
||||
.first()
|
||||
.unwrap()
|
||||
.layer_desc()
|
||||
.lsn_range
|
||||
.start,
|
||||
end: deltas_to_compact.last().unwrap().layer_desc().lsn_range.end,
|
||||
};
|
||||
|
||||
let remotes = deltas_to_compact
|
||||
@@ -3512,10 +3546,30 @@ impl Timeline {
|
||||
// min-heap (reserve space for one more element added before eviction)
|
||||
let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
|
||||
let mut prev: Option<Key> = None;
|
||||
for (next_key, _next_lsn, _size) in itertools::process_results(
|
||||
deltas_to_compact.iter().map(|l| l.key_iter(ctx)),
|
||||
|iter_iter| iter_iter.kmerge_by(|a, b| a.0 < b.0),
|
||||
)? {
|
||||
|
||||
let mut all_value_refs = Vec::new();
|
||||
let mut all_keys = Vec::new();
|
||||
|
||||
for l in deltas_to_compact.iter() {
|
||||
// TODO: replace this with an await once we fully go async
|
||||
let delta = l.clone().downcast_delta_layer().expect("delta layer");
|
||||
Handle::current().block_on(async {
|
||||
all_value_refs.extend(delta.load_val_refs(ctx).await?);
|
||||
all_keys.extend(delta.load_keys(ctx).await?);
|
||||
anyhow::Ok(())
|
||||
})?;
|
||||
}
|
||||
|
||||
// The current stdlib sorting implementation is designed in a way where it is
|
||||
// particularly fast where the slice is made up of sorted sub-ranges.
|
||||
all_value_refs.sort_by_key(|(key, lsn, _value_ref)| (*key, *lsn));
|
||||
|
||||
// The current stdlib sorting implementation is designed in a way where it is
|
||||
// particularly fast where the slice is made up of sorted sub-ranges.
|
||||
all_keys.sort_by_key(|(key, lsn, _size)| (*key, *lsn));
|
||||
|
||||
for (next_key, _next_lsn, _size) in all_keys.iter() {
|
||||
let next_key = *next_key;
|
||||
if let Some(prev_key) = prev {
|
||||
// just first fast filter
|
||||
if next_key.to_i128() - prev_key.to_i128() >= min_hole_range {
|
||||
@@ -3548,34 +3602,10 @@ impl Timeline {
|
||||
|
||||
// This iterator walks through all key-value pairs from all the layers
|
||||
// we're compacting, in key, LSN order.
|
||||
let all_values_iter = itertools::process_results(
|
||||
deltas_to_compact.iter().map(|l| l.iter(ctx)),
|
||||
|iter_iter| {
|
||||
iter_iter.kmerge_by(|a, b| {
|
||||
if let Ok((a_key, a_lsn, _)) = a {
|
||||
if let Ok((b_key, b_lsn, _)) = b {
|
||||
(a_key, a_lsn) < (b_key, b_lsn)
|
||||
} else {
|
||||
false
|
||||
}
|
||||
} else {
|
||||
true
|
||||
}
|
||||
})
|
||||
},
|
||||
)?;
|
||||
let all_values_iter = all_value_refs.into_iter();
|
||||
|
||||
// This iterator walks through all keys and is needed to calculate size used by each key
|
||||
let mut all_keys_iter = itertools::process_results(
|
||||
deltas_to_compact.iter().map(|l| l.key_iter(ctx)),
|
||||
|iter_iter| {
|
||||
iter_iter.kmerge_by(|a, b| {
|
||||
let (a_key, a_lsn, _) = a;
|
||||
let (b_key, b_lsn, _) = b;
|
||||
(a_key, a_lsn) < (b_key, b_lsn)
|
||||
})
|
||||
},
|
||||
)?;
|
||||
let mut all_keys_iter = all_keys.into_iter();
|
||||
|
||||
stats.prepare_iterators_micros = stats.read_lock_drop_micros.till_now();
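The hole-detection code in the hunks above keeps only the largest gaps between consecutive keys, using a bounded min-heap: push every candidate, then evict the smallest once the heap exceeds `max_holes`. A standalone sketch of that selection step, with plain `u64` gap sizes standing in for the `Hole` structs:

use std::cmp::Reverse;
use std::collections::BinaryHeap;

/// Keep only the `max_holes` largest gaps seen so far.
/// `Reverse` turns the max-heap into a min-heap, so the smallest retained
/// gap sits at the top and is the one evicted when capacity is exceeded.
fn largest_gaps(gaps: impl IntoIterator<Item = u64>, max_holes: usize) -> Vec<u64> {
    // reserve space for one more element added before eviction
    let mut heap: BinaryHeap<Reverse<u64>> = BinaryHeap::with_capacity(max_holes + 1);
    for gap in gaps {
        heap.push(Reverse(gap));
        if heap.len() > max_holes {
            heap.pop(); // drops the smallest gap currently kept
        }
    }
    heap.into_iter().map(|Reverse(g)| g).collect()
}

fn main() {
    let mut top = largest_gaps([5, 1, 9, 3, 7], 2);
    top.sort_unstable();
    assert_eq!(top, vec![7, 9]);
}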
|
||||
|
||||
@@ -3629,98 +3659,105 @@ impl Timeline {
|
||||
let mut key_values_total_size = 0u64;
|
||||
let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key
|
||||
let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
|
||||
for x in all_values_iter {
|
||||
let (key, lsn, value) = x?;
|
||||
let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
|
||||
// We need to check key boundaries once we reach next key or end of layer with the same key
|
||||
if !same_key || lsn == dup_end_lsn {
|
||||
let mut next_key_size = 0u64;
|
||||
let is_dup_layer = dup_end_lsn.is_valid();
|
||||
dup_start_lsn = Lsn::INVALID;
|
||||
if !same_key {
|
||||
dup_end_lsn = Lsn::INVALID;
|
||||
|
||||
// TODO remove this block_on wrapper once we fully go async
|
||||
Handle::current().block_on(async {
|
||||
for (key, lsn, value_ref) in all_values_iter {
|
||||
let value = value_ref.load().await?;
|
||||
let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
|
||||
// We need to check key boundaries once we reach next key or end of layer with the same key
|
||||
if !same_key || lsn == dup_end_lsn {
|
||||
let mut next_key_size = 0u64;
|
||||
let is_dup_layer = dup_end_lsn.is_valid();
|
||||
dup_start_lsn = Lsn::INVALID;
|
||||
if !same_key {
|
||||
dup_end_lsn = Lsn::INVALID;
|
||||
}
|
||||
// Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size
|
||||
for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
|
||||
next_key_size = next_size;
|
||||
if key != next_key {
|
||||
if dup_end_lsn.is_valid() {
|
||||
// We are writing a segment with duplicates:
|
||||
// place all remaining values of this key in separate segment
|
||||
dup_start_lsn = dup_end_lsn; // new segments starts where old stops
|
||||
dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range
|
||||
}
|
||||
break;
|
||||
}
|
||||
key_values_total_size += next_size;
|
||||
// Check if it is time to split segment: if total keys size is larger than target file size.
|
||||
// We need to avoid generation of empty segments if next_size > target_file_size.
|
||||
if key_values_total_size > target_file_size && lsn != next_lsn {
|
||||
// Split key between multiple layers: such layer can contain only single key
|
||||
dup_start_lsn = if dup_end_lsn.is_valid() {
|
||||
dup_end_lsn // new segment with duplicates starts where old one stops
|
||||
} else {
|
||||
lsn // start with the first LSN for this key
|
||||
};
|
||||
dup_end_lsn = next_lsn; // upper LSN boundary is exclusive
|
||||
break;
|
||||
}
|
||||
}
|
||||
// handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set.
|
||||
if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() {
|
||||
dup_start_lsn = dup_end_lsn;
|
||||
dup_end_lsn = lsn_range.end;
|
||||
}
|
||||
if writer.is_some() {
|
||||
let written_size = writer.as_mut().unwrap().size();
|
||||
let contains_hole =
|
||||
next_hole < holes.len() && key >= holes[next_hole].key_range.end;
|
||||
// check if key cause layer overflow or contains hole...
|
||||
if is_dup_layer
|
||||
|| dup_end_lsn.is_valid()
|
||||
|| written_size + key_values_total_size > target_file_size
|
||||
|| contains_hole
|
||||
{
|
||||
// ... if so, flush previous layer and prepare to write new one
|
||||
new_layers.push(Arc::new(
|
||||
writer.take().unwrap().finish(prev_key.unwrap().next())?,
|
||||
));
|
||||
writer = None;
|
||||
|
||||
if contains_hole {
|
||||
// skip hole
|
||||
next_hole += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Remember size of key value because at next iteration we will access next item
|
||||
key_values_total_size = next_key_size;
|
||||
}
|
||||
// Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size
|
||||
for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
|
||||
next_key_size = next_size;
|
||||
if key != next_key {
|
||||
if writer.is_none() {
|
||||
// Create writer if not initialized yet
|
||||
writer = Some(DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
key,
|
||||
if dup_end_lsn.is_valid() {
|
||||
// We are writing a segment with duplicates:
|
||||
// place all remaining values of this key in separate segment
|
||||
dup_start_lsn = dup_end_lsn; // new segments starts where old stops
|
||||
dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range
|
||||
}
|
||||
break;
|
||||
}
|
||||
key_values_total_size += next_size;
|
||||
// Check if it is time to split segment: if total keys size is larger than target file size.
|
||||
// We need to avoid generation of empty segments if next_size > target_file_size.
|
||||
if key_values_total_size > target_file_size && lsn != next_lsn {
|
||||
// Split key between multiple layers: such layer can contain only single key
|
||||
dup_start_lsn = if dup_end_lsn.is_valid() {
|
||||
dup_end_lsn // new segment with duplicates starts where old one stops
|
||||
// this is a layer containing slice of values of the same key
|
||||
debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
|
||||
dup_start_lsn..dup_end_lsn
|
||||
} else {
|
||||
lsn // start with the first LSN for this key
|
||||
};
|
||||
dup_end_lsn = next_lsn; // upper LSN boundary is exclusive
|
||||
break;
|
||||
}
|
||||
debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
|
||||
lsn_range.clone()
|
||||
},
|
||||
)?);
|
||||
}
|
||||
// handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set.
|
||||
if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() {
|
||||
dup_start_lsn = dup_end_lsn;
|
||||
dup_end_lsn = lsn_range.end;
|
||||
}
|
||||
if writer.is_some() {
|
||||
let written_size = writer.as_mut().unwrap().size();
|
||||
let contains_hole =
|
||||
next_hole < holes.len() && key >= holes[next_hole].key_range.end;
|
||||
// check if key cause layer overflow or contains hole...
|
||||
if is_dup_layer
|
||||
|| dup_end_lsn.is_valid()
|
||||
|| written_size + key_values_total_size > target_file_size
|
||||
|| contains_hole
|
||||
{
|
||||
// ... if so, flush previous layer and prepare to write new one
|
||||
new_layers.push(Arc::new(
|
||||
writer.take().unwrap().finish(prev_key.unwrap().next())?,
|
||||
));
|
||||
writer = None;
|
||||
|
||||
if contains_hole {
|
||||
// skip hole
|
||||
next_hole += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Remember size of key value because at next iteration we will access next item
|
||||
key_values_total_size = next_key_size;
|
||||
fail_point!("delta-layer-writer-fail-before-finish", |_| {
|
||||
Result::<_>::Err(anyhow::anyhow!(
|
||||
"failpoint delta-layer-writer-fail-before-finish"
|
||||
))
|
||||
});
|
||||
|
||||
writer.as_mut().unwrap().put_value(key, lsn, value)?;
|
||||
prev_key = Some(key);
|
||||
}
|
||||
if writer.is_none() {
|
||||
// Create writer if not initialized yet
|
||||
writer = Some(DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
key,
|
||||
if dup_end_lsn.is_valid() {
|
||||
// this is a layer containing slice of values of the same key
|
||||
debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
|
||||
dup_start_lsn..dup_end_lsn
|
||||
} else {
|
||||
debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
|
||||
lsn_range.clone()
|
||||
},
|
||||
)?);
|
||||
}
|
||||
|
||||
fail_point!("delta-layer-writer-fail-before-finish", |_| {
|
||||
Err(anyhow::anyhow!("failpoint delta-layer-writer-fail-before-finish").into())
|
||||
});
|
||||
|
||||
writer.as_mut().unwrap().put_value(key, lsn, value)?;
|
||||
prev_key = Some(key);
|
||||
}
|
||||
Ok(())
|
||||
})?;
|
||||
if let Some(writer) = writer {
|
||||
new_layers.push(Arc::new(writer.finish(prev_key.unwrap().next())?));
|
||||
}
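The rewritten loop above is hard to read in diff form; its core job is to walk key/LSN-sorted values and cut them into delta layers of roughly `target_file_size`, with extra handling ("dup" layers) when a single key alone exceeds the target. A much-simplified sketch of just the size-based chunking, which keeps each key within one chunk and ignores the dup-layer splitting and hole skipping handled by the real code:

/// One record: (key, lsn, serialized size in bytes).
type Entry = (u64, u64, u64);

/// Cut a (key, lsn)-sorted stream into chunks of at most `target_size` bytes,
/// never splitting the values of a single key across chunks.
fn cut_into_layers(entries: &[Entry], target_size: u64) -> Vec<Vec<Entry>> {
    let mut layers = Vec::new();
    let mut current: Vec<Entry> = Vec::new();
    let mut current_size = 0u64;

    let mut i = 0;
    while i < entries.len() {
        // Collect all values of the same key and their total size.
        let key = entries[i].0;
        let mut j = i;
        let mut key_size = 0u64;
        while j < entries.len() && entries[j].0 == key {
            key_size += entries[j].2;
            j += 1;
        }

        // Flush the current layer if this key would overflow it.
        if !current.is_empty() && current_size + key_size > target_size {
            layers.push(std::mem::take(&mut current));
            current_size = 0;
        }

        current.extend_from_slice(&entries[i..j]);
        current_size += key_size;
        i = j;
    }
    if !current.is_empty() {
        layers.push(current);
    }
    layers
}

fn main() {
    let entries = [(1, 10, 40), (1, 20, 40), (2, 10, 50), (3, 10, 30)];
    let layers = cut_into_layers(&entries, 100);
    // key 1 (80 bytes) fits alone; adding key 2 (50) would overflow, so it starts a new layer.
    assert_eq!(layers.len(), 2);
    assert_eq!(layers[0].len(), 2);
    assert_eq!(layers[1].len(), 2);
}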
|
||||
@@ -4642,7 +4679,7 @@ impl std::fmt::Debug for LocalLayerInfoForDiskUsageEviction {
|
||||
|
||||
impl LocalLayerInfoForDiskUsageEviction {
|
||||
pub fn file_size(&self) -> u64 {
|
||||
self.layer.file_size()
|
||||
self.layer.layer_desc().file_size
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -219,27 +219,13 @@ async fn delete_local_layer_files(
|
||||
}
|
||||
};
|
||||
|
||||
let r = if metadata.is_dir() {
|
||||
// There shouldnt be any directories inside timeline dir as of current layout.
|
||||
if metadata.is_dir() {
|
||||
warn!(path=%entry.path().display(), "unexpected directory under timeline dir");
|
||||
tokio::fs::remove_dir(entry.path()).await
|
||||
} else {
|
||||
tokio::fs::remove_file(entry.path()).await
|
||||
};
|
||||
|
||||
if let Err(e) = r {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
warn!(
|
||||
timeline_dir=?local_timeline_directory,
|
||||
path=?entry.path().display(),
|
||||
"got not found err while removing timeline dir, proceeding anyway"
|
||||
);
|
||||
continue;
|
||||
}
|
||||
anyhow::bail!(anyhow::anyhow!(
|
||||
"Failed to remove: {}. Error: {e}",
|
||||
entry.path().display()
|
||||
));
|
||||
}
|
||||
.with_context(|| format!("Failed to remove: {}", entry.path().display()))?;
|
||||
}
|
||||
|
||||
info!("finished deleting layer files, releasing layer_removal_cs.lock()");
|
||||
@@ -293,6 +279,17 @@ async fn cleanup_remaining_timeline_fs_traces(
|
||||
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm-dir"))?
|
||||
});
|
||||
|
||||
// Make sure previous deletions are ordered before mark removal.
// Otherwise there is no guarantee that they reach the disk before mark deletion.
// So it's possible for the mark to reach disk first and for the other deletions
// to be reordered later and thus missed if a crash occurs.
// Note that we don't need to sync after the mark file is removed
// because we can tolerate the case when the mark file reappears on startup.
|
||||
let timeline_path = conf.timelines_path(&tenant_id);
|
||||
crashsafe::fsync_async(timeline_path)
|
||||
.await
|
||||
.context("fsync_pre_mark_remove")?;
|
||||
|
||||
// Remove delete mark
|
||||
tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id))
|
||||
.await
|
||||
@@ -359,10 +356,11 @@ impl DeleteTimelineFlow {
|
||||
// NB: If this fails half-way through, and is retried, the retry will go through
|
||||
// all the same steps again. Make sure the code here is idempotent, and don't
|
||||
// error out if some of the shutdown tasks have already been completed!
|
||||
#[instrument(skip_all, fields(tenant_id=%tenant.tenant_id, %timeline_id))]
|
||||
#[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_id))]
|
||||
pub async fn run(
|
||||
tenant: &Arc<Tenant>,
|
||||
timeline_id: TimelineId,
|
||||
inplace: bool,
|
||||
) -> Result<(), DeleteTimelineError> {
|
||||
let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?;
|
||||
|
||||
@@ -380,7 +378,11 @@ impl DeleteTimelineFlow {
|
||||
))?
|
||||
});
|
||||
|
||||
Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
|
||||
if inplace {
|
||||
Self::background(guard, tenant.conf, tenant, &timeline).await?
|
||||
} else {
|
||||
Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -398,6 +400,8 @@ impl DeleteTimelineFlow {
|
||||
}
|
||||
|
||||
/// Shortcut to create Timeline in stopping state and spawn deletion task.
|
||||
/// See corresponding parts of [`crate::tenant::delete::DeleteTenantFlow`]
|
||||
#[instrument(skip_all, fields(%timeline_id))]
|
||||
pub async fn resume_deletion(
|
||||
tenant: Arc<Tenant>,
|
||||
timeline_id: TimelineId,
|
||||
@@ -444,11 +448,15 @@ impl DeleteTimelineFlow {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(%timeline_id))]
|
||||
pub async fn cleanup_remaining_timeline_fs_traces(
|
||||
tenant: &Tenant,
|
||||
timeline_id: TimelineId,
|
||||
) -> anyhow::Result<()> {
|
||||
cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await
|
||||
let r =
|
||||
cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await;
|
||||
info!("Done");
|
||||
r
|
||||
}
|
||||
|
||||
fn prepare(
|
||||
@@ -494,11 +502,17 @@ impl DeleteTimelineFlow {
|
||||
// At the end of the operation we're holding the guard and need to lock timelines map
|
||||
// to remove the timeline from it.
|
||||
// As always, if two locks are taken in different orders this can result in a deadlock.
|
||||
let delete_lock_guard = DeletionGuard(
|
||||
Arc::clone(&timeline.delete_progress)
|
||||
.try_lock_owned()
|
||||
.map_err(|_| DeleteTimelineError::AlreadyInProgress)?,
|
||||
);
|
||||
|
||||
let delete_progress = Arc::clone(&timeline.delete_progress);
|
||||
let delete_lock_guard = match delete_progress.try_lock_owned() {
|
||||
Ok(guard) => DeletionGuard(guard),
|
||||
Err(_) => {
|
||||
// Unfortunately if lock fails arc is consumed.
|
||||
return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone(
|
||||
&timeline.delete_progress,
|
||||
)));
|
||||
}
|
||||
};
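The rearranged code above works around an API detail of `tokio::sync::Mutex`: `try_lock_owned` takes the `Arc` by value, so on failure the clone is consumed and a separately held `Arc` is needed to report which lock is busy. A small sketch of that behaviour:

use std::sync::Arc;
use tokio::sync::Mutex;

#[tokio::main]
async fn main() {
    let progress = Arc::new(Mutex::new(String::from("idle")));

    // Hold the lock to simulate a deletion already in progress.
    let _held = Arc::clone(&progress).try_lock_owned().expect("first lock succeeds");

    // try_lock_owned takes the Arc by value, so clone before trying;
    // on failure the clone is gone and only the error remains.
    match Arc::clone(&progress).try_lock_owned() {
        Ok(_guard) => unreachable!("lock is already held"),
        Err(_) => {
            // The original `progress` Arc is still available to hand to the caller,
            // e.g. to report "deletion already in progress" with the shared state.
            println!("already in progress; strong_count = {}", Arc::strong_count(&progress));
        }
    }
}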
|
||||
|
||||
timeline.set_state(TimelineState::Stopping);
|
||||
|
||||
@@ -553,10 +567,14 @@ impl DeleteTimelineFlow {
|
||||
|
||||
remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;
|
||||
|
||||
*guard.0 = Self::Finished;
|
||||
*guard = Self::Finished;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn is_finished(&self) -> bool {
|
||||
matches!(self, Self::Finished)
|
||||
}
|
||||
}
|
||||
|
||||
struct DeletionGuard(OwnedMutexGuard<DeleteTimelineFlow>);
|
||||
|
||||
@@ -78,9 +78,6 @@ impl Timeline {
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))]
|
||||
async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) {
|
||||
scopeguard::defer! {
|
||||
info!("eviction task finishing");
|
||||
}
|
||||
use crate::tenant::tasks::random_init_delay;
|
||||
{
|
||||
let policy = self.get_eviction_policy();
|
||||
@@ -308,8 +305,13 @@ impl Timeline {
|
||||
ctx: &RequestContext,
|
||||
) -> ControlFlow<()> {
|
||||
let mut state = self.eviction_task_timeline_state.lock().await;
|
||||
|
||||
// Only do the imitate_layer accesses approximately as often as the threshold. A little
|
||||
// more frequently, to avoid this period racing with the threshold/period-th eviction iteration.
|
||||
let inter_imitate_period = p.threshold.checked_sub(p.period).unwrap_or(p.threshold);
|
||||
|
||||
match state.last_layer_access_imitation {
|
||||
Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ }
|
||||
Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
|
||||
_ => {
|
||||
self.imitate_timeline_cached_layer_accesses(cancel, ctx)
|
||||
.await;
|
||||
@@ -332,7 +334,7 @@ impl Timeline {
|
||||
};
|
||||
let mut state = tenant.eviction_task_tenant_state.lock().await;
|
||||
match state.last_layer_access_imitation {
|
||||
Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ }
|
||||
Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
|
||||
_ => {
|
||||
self.imitate_synthetic_size_calculation_worker(&tenant, ctx, cancel)
|
||||
.await;
|
||||
|
||||
@@ -120,10 +120,9 @@ impl LayerManager {
|
||||
|
||||
ensure!(
|
||||
lsn > last_record_lsn,
|
||||
"cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})\n{}",
|
||||
"cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})",
|
||||
lsn,
|
||||
last_record_lsn,
|
||||
std::backtrace::Backtrace::force_capture(),
|
||||
);
|
||||
|
||||
// Do we have a layer open for writing already?
|
||||
@@ -164,7 +163,7 @@ impl LayerManager {
|
||||
}
|
||||
|
||||
/// Called from `freeze_inmem_layer`, returns true if successfully frozen.
|
||||
pub fn try_freeze_in_memory_layer(
|
||||
pub async fn try_freeze_in_memory_layer(
|
||||
&mut self,
|
||||
Lsn(last_record_lsn): Lsn,
|
||||
last_freeze_at: &AtomicLsn,
|
||||
@@ -174,7 +173,7 @@ impl LayerManager {
|
||||
if let Some(open_layer) = &self.layer_map.open_layer {
|
||||
let open_layer_rc = Arc::clone(open_layer);
|
||||
// Does this layer need freezing?
|
||||
open_layer.freeze(end_lsn);
|
||||
open_layer.freeze(end_lsn).await;
|
||||
|
||||
// The layer is no longer open, update the layer map to reflect this.
|
||||
// We will replace it with on-disk historics below.
|
||||
@@ -278,7 +277,7 @@ impl LayerManager {
|
||||
updates: &mut BatchedUpdates<'_>,
|
||||
mapping: &mut LayerFileManager,
|
||||
) {
|
||||
updates.remove_historic(layer.layer_desc().clone());
|
||||
updates.remove_historic(layer.layer_desc());
|
||||
mapping.remove(layer);
|
||||
}
|
||||
|
||||
@@ -292,10 +291,10 @@ impl LayerManager {
|
||||
metrics: &TimelineMetrics,
|
||||
mapping: &mut LayerFileManager,
|
||||
) -> anyhow::Result<()> {
|
||||
let desc = layer.layer_desc();
|
||||
if !layer.is_remote_layer() {
|
||||
layer.delete_resident_layer_file()?;
|
||||
let layer_file_size = layer.file_size();
|
||||
metrics.resident_physical_size_gauge.sub(layer_file_size);
|
||||
metrics.resident_physical_size_gauge.sub(desc.file_size);
|
||||
}
|
||||
|
||||
// TODO Removing from the bottom of the layer map is expensive.
|
||||
@@ -303,7 +302,7 @@ impl LayerManager {
|
||||
// won't be needed for page reconstruction for this timeline,
|
||||
// and mark what we can't delete yet as deleted from the layer
|
||||
// map index without actually rebuilding the index.
|
||||
updates.remove_historic(layer.layer_desc().clone());
|
||||
updates.remove_historic(desc);
|
||||
mapping.remove(layer);
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -31,14 +31,19 @@ use storage_broker::Streaming;
|
||||
use tokio::select;
|
||||
use tracing::*;
|
||||
|
||||
use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
|
||||
use postgres_connection::{parse_host_port, PgConnectionConfig};
|
||||
use utils::backoff::{
|
||||
exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
};
|
||||
use utils::{
|
||||
id::{NodeId, TenantTimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use super::{walreceiver_connection::WalConnectionStatus, TaskEvent, TaskHandle};
|
||||
use super::{
|
||||
walreceiver_connection::WalConnectionStatus, walreceiver_connection::WalReceiverError,
|
||||
TaskEvent, TaskHandle,
|
||||
};
|
||||
|
||||
/// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker.
|
||||
/// Based on the updates, decides whether to start, keep or stop a WAL receiver task.
|
||||
@@ -419,13 +424,19 @@ impl ConnectionManagerState {
|
||||
match res {
|
||||
Ok(()) => Ok(()),
|
||||
Err(e) => {
|
||||
use super::walreceiver_connection::ExpectedError;
|
||||
if e.is_expected() {
|
||||
info!("walreceiver connection handling ended: {e:#}");
|
||||
Ok(())
|
||||
} else {
|
||||
// give out an error to have task_mgr give it a really verbose logging
|
||||
Err(e).context("walreceiver connection handling failure")
|
||||
match e {
|
||||
WalReceiverError::SuccessfulCompletion(msg) => {
|
||||
info!("walreceiver connection handling ended with success: {msg}");
|
||||
Ok(())
|
||||
}
|
||||
WalReceiverError::ExpectedSafekeeperError(e) => {
|
||||
info!("walreceiver connection handling ended: {e}");
|
||||
Ok(())
|
||||
}
|
||||
WalReceiverError::Other(e) => {
|
||||
// give out an error to have task_mgr give it a really verbose logging
|
||||
Err(e).context("walreceiver connection handling failure")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,14 +8,14 @@ use std::{
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use anyhow::{anyhow, Context};
|
||||
use bytes::BytesMut;
|
||||
use chrono::{NaiveDateTime, Utc};
|
||||
use fail::fail_point;
|
||||
use futures::StreamExt;
|
||||
use postgres::{error::SqlState, SimpleQueryMessage, SimpleQueryRow};
|
||||
use postgres_ffi::v14::xlog_utils::normalize_lsn;
|
||||
use postgres_ffi::WAL_SEGMENT_SIZE;
|
||||
use postgres_ffi::{v14::xlog_utils::normalize_lsn, waldecoder::WalDecodeError};
|
||||
use postgres_protocol::message::backend::ReplicationMessage;
|
||||
use postgres_types::PgLsn;
|
||||
use tokio::{select, sync::watch, time};
|
||||
@@ -60,6 +60,50 @@ pub(super) struct WalConnectionStatus {
|
||||
pub node: NodeId,
|
||||
}
|
||||
|
||||
pub(super) enum WalReceiverError {
    /// An error of a type that does not indicate an issue, e.g. a connection closing
    ExpectedSafekeeperError(postgres::Error),
    /// An "error" message that carries a SUCCESSFUL_COMPLETION status code. Carries
    /// the message part of the original postgres error
    SuccessfulCompletion(String),
    /// Generic error
    Other(anyhow::Error),
}
|
||||
|
||||
impl From<tokio_postgres::Error> for WalReceiverError {
|
||||
fn from(err: tokio_postgres::Error) -> Self {
|
||||
if let Some(dberror) = err.as_db_error().filter(|db_error| {
|
||||
db_error.code() == &SqlState::SUCCESSFUL_COMPLETION
|
||||
&& db_error.message().contains("ending streaming")
|
||||
}) {
|
||||
// Strip the outer DbError, which carries a misleading "error" severity
|
||||
Self::SuccessfulCompletion(dberror.message().to_string())
|
||||
} else if err.is_closed()
|
||||
|| err
|
||||
.source()
|
||||
.and_then(|source| source.downcast_ref::<std::io::Error>())
|
||||
.map(is_expected_io_error)
|
||||
.unwrap_or(false)
|
||||
{
|
||||
Self::ExpectedSafekeeperError(err)
|
||||
} else {
|
||||
Self::Other(anyhow::Error::new(err))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<anyhow::Error> for WalReceiverError {
|
||||
fn from(err: anyhow::Error) -> Self {
|
||||
Self::Other(err)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<WalDecodeError> for WalReceiverError {
|
||||
fn from(err: WalDecodeError) -> Self {
|
||||
Self::Other(anyhow::Error::new(err))
|
||||
}
|
||||
}
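With the `From` impls above in place, fallible calls inside `handle_walreceiver_connection` can keep using `?` and still produce the three-way classification; the conversion chooses the variant, and the caller chooses the log level. A dependency-free sketch of that caller-side shape, with a hypothetical `connect()` step and `String` standing in for `postgres::Error` (the real classification of `tokio_postgres::Error` lives in the `From` impl shown above):

use anyhow::anyhow;

enum WalReceiverError {
    ExpectedSafekeeperError(String),
    SuccessfulCompletion(String),
    Other(anyhow::Error),
}

impl From<anyhow::Error> for WalReceiverError {
    fn from(err: anyhow::Error) -> Self {
        Self::Other(err)
    }
}

// Hypothetical connection step: the error converts via `From` on the way out.
fn connect(ok: bool) -> Result<(), WalReceiverError> {
    if !ok {
        return Err(anyhow!("no previous WAL position").into());
    }
    Ok(())
}

fn handle(ok: bool) {
    match connect(ok) {
        Ok(()) => println!("connection handling ended"),
        Err(WalReceiverError::SuccessfulCompletion(msg)) => {
            println!("ended with success: {msg}")
        }
        Err(WalReceiverError::ExpectedSafekeeperError(e)) => println!("ended: {e}"),
        Err(WalReceiverError::Other(e)) => eprintln!("connection handling failure: {e:#}"),
    }
}

fn main() {
    handle(true);
    handle(false);
}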
|
||||
|
||||
/// Open a connection to the given safekeeper and receive WAL, sending back progress
|
||||
/// messages as we go.
|
||||
pub(super) async fn handle_walreceiver_connection(
|
||||
@@ -70,7 +114,7 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
connect_timeout: Duration,
|
||||
ctx: RequestContext,
|
||||
node: NodeId,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalReceiverError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
WALRECEIVER_STARTED_CONNECTIONS.inc();
|
||||
@@ -130,11 +174,15 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
connection_result = connection => match connection_result {
|
||||
Ok(()) => debug!("Walreceiver db connection closed"),
|
||||
Err(connection_error) => {
|
||||
if connection_error.is_expected() {
|
||||
// silence, because most likely we've already exited the outer call
|
||||
// with a similar error.
|
||||
} else {
|
||||
warn!("Connection aborted: {connection_error:#}")
|
||||
match WalReceiverError::from(connection_error) {
|
||||
WalReceiverError::ExpectedSafekeeperError(_) => {
|
||||
// silence, because most likely we've already exited the outer call
|
||||
// with a similar error.
|
||||
},
|
||||
WalReceiverError::SuccessfulCompletion(_) => {}
|
||||
WalReceiverError::Other(err) => {
|
||||
warn!("Connection aborted: {err:#}")
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
@@ -180,7 +228,7 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
let mut startpoint = last_rec_lsn;
|
||||
|
||||
if startpoint == Lsn(0) {
|
||||
bail!("No previous WAL position");
|
||||
return Err(WalReceiverError::Other(anyhow!("No previous WAL position")));
|
||||
}
|
||||
|
||||
// There might be some padding after the last full record, skip it.
|
||||
@@ -262,7 +310,9 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
// It is important to deal with the aligned records as lsn in getPage@LSN is
|
||||
// aligned and can be several bytes bigger. Without this alignment we are
|
||||
// at risk of hitting a deadlock.
|
||||
ensure!(lsn.is_aligned());
|
||||
if !lsn.is_aligned() {
|
||||
return Err(WalReceiverError::Other(anyhow!("LSN not aligned")));
|
||||
}
|
||||
|
||||
walingest
|
||||
.ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx)
|
||||
@@ -419,51 +469,3 @@ async fn identify_system(client: &mut Client) -> anyhow::Result<IdentifySystem>
|
||||
Err(IdentifyError.into())
|
||||
}
|
||||
}
|
||||
|
||||
/// Trait to avoid reporting walreceiver-specific expected ("normal"/"ok") errors.
|
||||
pub(super) trait ExpectedError {
|
||||
/// Test if this error is an ok error.
|
||||
///
|
||||
/// We don't want to report connectivity problems as real errors towards connection manager because
|
||||
/// 1. they happen frequently enough to make server logs hard to read and
|
||||
/// 2. the connection manager can retry another safekeeper.
|
||||
///
|
||||
/// If this function returns `true`, it's such an error.
|
||||
/// The caller should log it at info level and then report to connection manager that we're done handling this connection.
|
||||
/// Connection manager will then handle reconnections.
|
||||
///
|
||||
/// If this function returns `false`, the error should be propagated and the connection manager
|
||||
/// will log the error at ERROR level.
|
||||
fn is_expected(&self) -> bool;
|
||||
}
|
||||
|
||||
impl ExpectedError for postgres::Error {
|
||||
fn is_expected(&self) -> bool {
|
||||
self.is_closed()
|
||||
|| self
|
||||
.source()
|
||||
.and_then(|source| source.downcast_ref::<std::io::Error>())
|
||||
.map(is_expected_io_error)
|
||||
.unwrap_or(false)
|
||||
|| self
|
||||
.as_db_error()
|
||||
.filter(|db_error| {
|
||||
db_error.code() == &SqlState::SUCCESSFUL_COMPLETION
|
||||
&& db_error.message().contains("ending streaming")
|
||||
})
|
||||
.is_some()
|
||||
}
|
||||
}
|
||||
|
||||
impl ExpectedError for anyhow::Error {
|
||||
fn is_expected(&self) -> bool {
|
||||
let head = self.downcast_ref::<postgres::Error>();
|
||||
|
||||
let tail = self
|
||||
.chain()
|
||||
.filter_map(|e| e.downcast_ref::<postgres::Error>());
|
||||
|
||||
// check if self or any of the chained/sourced errors are expected
|
||||
head.into_iter().chain(tail).any(|e| e.is_expected())
|
||||
}
|
||||
}
|
||||
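For context, the `ExpectedError` trait removed by this hunk was consumed the way the old branch of the connection handler (shown earlier in this diff) did; a condensed sketch, assuming the trait is in scope:

// Old-style classification: walk the anyhow error chain for an expected postgres error.
fn report_connection_error(connection_error: anyhow::Error) {
    if connection_error.is_expected() {
        // silence: the outer call most likely already exited with a similar error
    } else {
        tracing::warn!("Connection aborted: {connection_error:#}");
    }
}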
|
||||
@@ -53,6 +53,9 @@ pub struct VirtualFile {
|
||||
pub path: PathBuf,
|
||||
open_options: OpenOptions,
|
||||
|
||||
// These are strings because we only use them for metrics, and those expect strings.
|
||||
// It makes no sense for us to constantly turn the `TimelineId` and `TenantId` into
|
||||
// strings.
|
||||
tenant_id: String,
|
||||
timeline_id: String,
|
||||
}
|
||||
|
||||
@@ -450,15 +450,6 @@ impl<'a> WalIngest<'a> {
|
||||
let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
|
||||
if info == pg_constants::XLOG_HEAP2_MULTI_INSERT {
|
||||
let xlrec = XlHeapMultiInsert::decode(buf);
|
||||
|
||||
let offset_array_len = if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 {
|
||||
// the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
|
||||
0
|
||||
} else {
|
||||
std::mem::size_of::<u16>() * xlrec.ntuples as usize
|
||||
};
|
||||
assert_eq!(offset_array_len, buf.remaining());
|
||||
|
||||
if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
|
||||
@@ -270,13 +270,67 @@ pub struct XlHeapDelete {
|
||||
|
||||
impl XlHeapDelete {
|
||||
pub fn decode(buf: &mut Bytes) -> XlHeapDelete {
|
||||
let neon_format = buf.remaining() == pg_constants::SIZE_OF_HEAP_DELETE;
|
||||
let xmax = buf.get_u32_le();
|
||||
let offnum = buf.get_u16_le();
|
||||
let _padding;
|
||||
let t_cid;
|
||||
if neon_format {
|
||||
_padding = buf.get_u16_le();
|
||||
t_cid = buf.get_u32_le();
|
||||
} else {
|
||||
_padding = 0;
|
||||
t_cid = 0;
|
||||
}
|
||||
let infobits_set = buf.get_u8();
|
||||
let flags = buf.get_u8();
|
||||
assert!(((flags & pg_constants::XLH_DELETE_STORE_CID) == 0) ^ neon_format);
|
||||
XlHeapDelete {
|
||||
xmax: buf.get_u32_le(),
|
||||
offnum: buf.get_u16_le(),
|
||||
_padding: buf.get_u16_le(),
|
||||
t_cid: buf.get_u32_le(),
|
||||
infobits_set: buf.get_u8(),
|
||||
flags: buf.get_u8(),
|
||||
xmax,
|
||||
offnum,
|
||||
_padding,
|
||||
t_cid,
|
||||
infobits_set,
|
||||
flags,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XlHeapLock {
|
||||
pub locking_xid: TransactionId,
|
||||
pub offnum: OffsetNumber,
|
||||
pub _padding: u16,
|
||||
pub t_cid: u32,
|
||||
pub infobits_set: u8,
|
||||
pub flags: u8,
|
||||
}
|
||||
|
||||
impl XlHeapLock {
|
||||
pub fn decode(buf: &mut Bytes) -> XlHeapLock {
|
||||
let neon_format = buf.remaining() == pg_constants::SIZE_OF_HEAP_LOCK;
|
||||
let locking_xid = buf.get_u32_le();
|
||||
let offnum = buf.get_u16_le();
|
||||
let _padding;
|
||||
let t_cid;
|
||||
if neon_format {
|
||||
_padding = buf.get_u16_le();
|
||||
t_cid = buf.get_u32_le();
|
||||
} else {
|
||||
_padding = 0;
|
||||
t_cid = 0;
|
||||
}
|
||||
let infobits_set = buf.get_u8();
|
||||
let flags = buf.get_u8();
|
||||
assert!(((flags & pg_constants::XLH_LOCK_STORE_CID) == 0) ^ neon_format);
|
||||
XlHeapLock {
|
||||
locking_xid,
|
||||
offnum,
|
||||
_padding,
|
||||
t_cid,
|
||||
infobits_set,
|
||||
flags,
|
||||
}
|
||||
}
|
||||
}
|
||||
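The decoders above tell the Neon record layout apart from the vanilla one purely by how many bytes remain in the record: the Neon variant adds a 2-byte padding field and a 4-byte `t_cid`. A worked example of that length check with the sizes spelled out (the constant values are assumptions; the real code compares against `pg_constants::SIZE_OF_HEAP_LOCK`):

// Length-based format detection, mirroring `buf.remaining() == SIZE_OF_HEAP_LOCK`.
fn is_neon_heap_lock(remaining: usize) -> bool {
    const VANILLA: usize = 4 + 2 + 1 + 1;       // xid + offnum + infobits + flags = 8
    const NEON: usize = 4 + 2 + 2 + 4 + 1 + 1;  // adds padding(2) and t_cid(4)    = 14
    debug_assert!(remaining == VANILLA || remaining == NEON);
    remaining == NEON
}

The `assert!` on the `XLH_*_STORE_CID` flag in the decoders then cross-checks that the flag and the detected length agree.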
@@ -295,12 +349,21 @@ pub struct XlHeapUpdate {
|
||||
|
||||
impl XlHeapUpdate {
|
||||
pub fn decode(buf: &mut Bytes) -> XlHeapUpdate {
|
||||
let old_xmax = buf.get_u32_le();
|
||||
let old_offnum = buf.get_u16_le();
|
||||
let old_infobits_set = buf.get_u8();
|
||||
let flags = buf.get_u8();
|
||||
let t_cid = if (flags & pg_constants::XLH_UPDATE_STORE_CID) != 0 {
|
||||
buf.get_u32()
|
||||
} else {
|
||||
0
|
||||
};
|
||||
XlHeapUpdate {
|
||||
old_xmax: buf.get_u32_le(),
|
||||
old_offnum: buf.get_u16_le(),
|
||||
old_infobits_set: buf.get_u8(),
|
||||
flags: buf.get_u8(),
|
||||
t_cid: buf.get_u32(),
|
||||
old_xmax,
|
||||
old_offnum,
|
||||
old_infobits_set,
|
||||
flags,
|
||||
t_cid,
|
||||
new_xmax: buf.get_u32_le(),
|
||||
new_offnum: buf.get_u16_le(),
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
MODULE_big = neon
|
||||
OBJS = \
|
||||
$(WIN32RES) \
|
||||
extension_server.o \
|
||||
file_cache.o \
|
||||
libpagestore.o \
|
||||
libpqwalproposer.o \
|
||||
|
||||
pgxn/neon/extension_server.c (new file, 103 lines)
@@ -0,0 +1,103 @@
|
||||
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* extension_server.c
|
||||
* Request compute_ctl to download extension files.
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* contrib/neon/extension_server.c
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
#include "tcop/pquery.h"
|
||||
#include "tcop/utility.h"
|
||||
#include "access/xact.h"
|
||||
#include "utils/hsearch.h"
|
||||
#include "utils/memutils.h"
|
||||
#include "commands/defrem.h"
|
||||
#include "miscadmin.h"
|
||||
#include "utils/acl.h"
|
||||
#include "fmgr.h"
|
||||
#include "utils/guc.h"
|
||||
#include "port.h"
|
||||
#include "fmgr.h"
|
||||
|
||||
#include <curl/curl.h>
|
||||
|
||||
static int extension_server_port = 0;
|
||||
|
||||
static download_extension_file_hook_type prev_download_extension_file_hook = NULL;
|
||||
|
||||
// to download all SQL (and data) files for an extension:
|
||||
// curl -X POST http://localhost:8080/extension_server/postgis
|
||||
// it covers two possible extension files layouts:
|
||||
// 1. extension_name--version--platform.sql
|
||||
// 2. extension_name/extension_name--version.sql
|
||||
// extension_name/extra_files.csv
|
||||
//
|
||||
// to download specific library file:
|
||||
// curl -X POST http://localhost:8080/extension_server/postgis-3.so?is_library=true
|
||||
static bool
|
||||
neon_download_extension_file_http(const char *filename, bool is_library)
|
||||
{
|
||||
CURL *curl;
|
||||
CURLcode res;
|
||||
char *compute_ctl_url;
|
||||
char *postdata;
|
||||
bool ret = false;
|
||||
|
||||
if ((curl = curl_easy_init()) == NULL)
|
||||
{
|
||||
elog(ERROR, "Failed to initialize curl handle");
|
||||
}
|
||||
|
||||
compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
|
||||
extension_server_port, filename, is_library ? "?is_library=true" : "");
|
||||
|
||||
elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url);
|
||||
|
||||
curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST");
|
||||
curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url);
|
||||
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */);
|
||||
|
||||
if (curl)
|
||||
{
|
||||
/* Perform the request, res will get the return code */
|
||||
res = curl_easy_perform(curl);
|
||||
/* Check for errors */
|
||||
if (res == CURLE_OK)
|
||||
{
|
||||
ret = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Don't error here because postgres will try to find the file
|
||||
// and will fail with some proper error message if it's not found.
|
||||
elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res));
|
||||
}
|
||||
|
||||
/* always cleanup */
|
||||
curl_easy_cleanup(curl);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void pg_init_extension_server()
|
||||
{
|
||||
// Port to connect to compute_ctl on localhost
|
||||
// to request extension files.
|
||||
DefineCustomIntVariable("neon.extension_server_port",
|
||||
"connection string to the compute_ctl",
|
||||
NULL,
|
||||
&extension_server_port,
|
||||
0, 0, INT_MAX,
|
||||
PGC_POSTMASTER,
|
||||
0, /* no flags required */
|
||||
NULL, NULL, NULL);
|
||||
|
||||
// set download_extension_file_hook
|
||||
prev_download_extension_file_hook = download_extension_file_hook;
|
||||
download_extension_file_hook = neon_download_extension_file_http;
|
||||
}
|
||||
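The hook above always sends a body-less POST to compute_ctl and treats any non-success as a soft failure. For illustration only, the same request expressed from Rust with reqwest; the port, filename, and use of reqwest are assumptions, the patch itself uses libcurl as shown:

// Sketch of the request shape: POST http://localhost:<port>/extension_server/<file>[?is_library=true]
async fn request_extension_file(port: u16, filename: &str, is_library: bool) -> reqwest::Result<()> {
    let url = format!(
        "http://localhost:{port}/extension_server/{filename}{}",
        if is_library { "?is_library=true" } else { "" },
    );
    reqwest::Client::new().post(&url).send().await?.error_for_status()?;
    Ok(())
}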
@@ -172,7 +172,7 @@ lfc_change_limit_hook(int newval, void *extra)
|
||||
{
|
||||
lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
|
||||
if (lfc_desc < 0) {
|
||||
elog(LOG, "Failed to open file cache %s: %m", lfc_path);
|
||||
elog(WARNING, "Failed to open file cache %s: %m, disabling file cache", lfc_path);
|
||||
lfc_size_limit = 0; /* disable file cache */
|
||||
return;
|
||||
}
|
||||
@@ -557,7 +557,7 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
Assert(victim->access_count == 0);
|
||||
entry->offset = victim->offset; /* grab victim's chunk */
|
||||
hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL);
|
||||
elog(LOG, "Swap file cache page");
|
||||
elog(DEBUG2, "Swap file cache page");
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -574,7 +574,7 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
{
|
||||
lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
|
||||
if (lfc_desc < 0) {
|
||||
elog(LOG, "Failed to open file cache %s: %m", lfc_path);
|
||||
elog(WARNING, "Failed to open file cache %s: %m, disabling file cache", lfc_path);
|
||||
lfc_size_limit = 0; /* disable file cache */
|
||||
}
|
||||
}
|
||||
@@ -583,7 +583,7 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
|
||||
if (rc != BLCKSZ)
|
||||
{
|
||||
elog(INFO, "Failed to write file cache: %m");
|
||||
elog(WARNING, "Failed to write file cache: %m, disabling file cache");
|
||||
lfc_size_limit = 0; /* disable file cache */
|
||||
}
|
||||
}
|
||||
|
||||
@@ -35,8 +35,11 @@ _PG_init(void)
|
||||
{
|
||||
pg_init_libpagestore();
|
||||
pg_init_walproposer();
|
||||
|
||||
InitControlPlaneConnector();
|
||||
|
||||
pg_init_extension_server();
|
||||
|
||||
// Important: This must happen after other parts of the extension
|
||||
// are loaded, otherwise any settings to GUCs that were set before
|
||||
// the extension was loaded will be removed.
|
||||
|
||||
@@ -21,6 +21,8 @@ extern char *neon_tenant;
|
||||
extern void pg_init_libpagestore(void);
|
||||
extern void pg_init_walproposer(void);
|
||||
|
||||
extern void pg_init_extension_server(void);
|
||||
|
||||
/*
|
||||
* Returns true if we shouldn't do REDO on that block in record indicated by
|
||||
* block_id; false otherwise.
|
||||
|
||||
@@ -37,68 +37,14 @@ static XLogSegNo walpropSegNo = 0;
|
||||
|
||||
/* START cloned file-local variables and functions from walsender.c */
|
||||
|
||||
/*
|
||||
* xlogreader used for replication. Note that a WAL sender doing physical
|
||||
* replication does not need xlogreader to read WAL, but it needs one to
|
||||
* keep a state of its work.
|
||||
*/
|
||||
static XLogReaderState *xlogreader = NULL;
|
||||
|
||||
/*
|
||||
* These variables keep track of the state of the timeline we're currently
|
||||
* sending. sendTimeLine identifies the timeline. If sendTimeLineIsHistoric,
|
||||
* the timeline is not the latest timeline on this server, and the server's
|
||||
* history forked off from that timeline at sendTimeLineValidUpto.
|
||||
*/
|
||||
static TimeLineID sendTimeLine = 0;
|
||||
static TimeLineID sendTimeLineNextTLI = 0;
|
||||
static bool sendTimeLineIsHistoric = false;
|
||||
static XLogRecPtr sendTimeLineValidUpto = InvalidXLogRecPtr;
|
||||
|
||||
/*
|
||||
* Timestamp of last ProcessRepliesIfAny() that saw a reply from the
|
||||
* standby. Set to 0 if wal_sender_timeout doesn't need to be active.
|
||||
*/
|
||||
static TimestampTz last_reply_timestamp = 0;
|
||||
|
||||
/* Have we sent a heartbeat message asking for reply, since last reply? */
|
||||
static bool waiting_for_ping_response = false;
|
||||
|
||||
static bool streamingDoneSending;
|
||||
static bool streamingDoneReceiving;
|
||||
|
||||
/* Are we there yet? */
|
||||
static bool WalSndCaughtUp = false;
|
||||
|
||||
/* Flags set by signal handlers for later service in main loop */
|
||||
static volatile sig_atomic_t got_STOPPING = false;
|
||||
|
||||
/*
|
||||
* How far have we sent WAL already? This is also advertised in
|
||||
* MyWalSnd->sentPtr. (Actually, this is the next WAL location to send.)
|
||||
*/
|
||||
static XLogRecPtr sentPtr = InvalidXLogRecPtr;
|
||||
|
||||
/*
|
||||
* This is set while we are streaming. When not set
|
||||
* PROCSIG_WALSND_INIT_STOPPING signal will be handled like SIGTERM. When set,
|
||||
* the main loop is responsible for checking got_STOPPING and terminating when
|
||||
* it's set (after streaming any remaining WAL).
|
||||
*/
|
||||
static volatile sig_atomic_t replication_active = false;
|
||||
|
||||
typedef void (*WalSndSendDataCallback) (void);
|
||||
static void WalSndLoop(WalSndSendDataCallback send_data);
|
||||
static void XLogSendPhysical(void);
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
static XLogRecPtr GetStandbyFlushRecPtr(TimeLineID *tli);
|
||||
#else
|
||||
static XLogRecPtr GetStandbyFlushRecPtr(void);
|
||||
#endif
|
||||
|
||||
static void WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo,
|
||||
TimeLineID *tli_p);
|
||||
|
||||
static void WalSndLoop(void);
|
||||
static void XLogBroadcastWalProposer(void);
|
||||
/* END cloned file-level variables and functions from walsender.c */
|
||||
|
||||
int
|
||||
@@ -506,7 +452,7 @@ XLogWalPropClose(XLogRecPtr recptr)
|
||||
/* START of cloned functions from walsender.c */
|
||||
|
||||
/*
|
||||
* Handle START_REPLICATION command.
|
||||
* Subscribe for new WAL and stream it in the loop to safekeepers.
|
||||
*
|
||||
* At the moment, this never returns, but an ereport(ERROR) will take us back
|
||||
* to the main loop.
|
||||
@@ -524,18 +470,6 @@ StartProposerReplication(StartReplicationCmd *cmd)
|
||||
errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION")));
|
||||
#endif
|
||||
|
||||
/* create xlogreader for physical replication */
|
||||
xlogreader =
|
||||
XLogReaderAllocate(wal_segment_size, NULL,
|
||||
XL_ROUTINE(.segment_open = WalSndSegmentOpen,
|
||||
.segment_close = wal_segment_close),
|
||||
NULL);
|
||||
|
||||
if (!xlogreader)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_OUT_OF_MEMORY),
|
||||
errmsg("out of memory")));
|
||||
|
||||
/*
|
||||
* We assume here that we're logging enough information in the WAL for
|
||||
* log-shipping, since this is checked in PostmasterMain().
|
||||
@@ -569,341 +503,61 @@ StartProposerReplication(StartReplicationCmd *cmd)
|
||||
* we keep this code around to lighten the load for when we need it.
|
||||
*/
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
if (am_cascading_walsender)
|
||||
{
|
||||
/* this also updates ThisTimeLineID */
|
||||
FlushPtr = GetStandbyFlushRecPtr(&currTLI);
|
||||
}
|
||||
else
|
||||
FlushPtr = GetFlushRecPtr(&currTLI);
|
||||
FlushPtr = GetFlushRecPtr(&currTLI);
|
||||
#else
|
||||
if (am_cascading_walsender)
|
||||
{
|
||||
/* this also updates ThisTimeLineID */
|
||||
FlushPtr = GetStandbyFlushRecPtr();
|
||||
}
|
||||
else
|
||||
FlushPtr = GetFlushRecPtr();
|
||||
|
||||
FlushPtr = GetFlushRecPtr();
|
||||
currTLI = ThisTimeLineID;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* When we first start replication the standby will be behind the
|
||||
* primary. For some applications, for example synchronous
|
||||
* replication, it is important to have a clear state for this initial
|
||||
* catchup mode, so we can trigger actions when we change streaming
|
||||
* state later. We may stay in this state for a long time, which is
|
||||
* exactly why we want to be able to monitor whether or not we are
|
||||
* still here.
|
||||
*/
|
||||
WalSndSetState(WALSNDSTATE_CATCHUP);
|
||||
|
||||
if (cmd->timeline != 0)
|
||||
/*
|
||||
* Don't allow a request to stream from a future point in WAL that
|
||||
* hasn't been flushed to disk in this server yet.
|
||||
*/
|
||||
if (FlushPtr < cmd->startpoint)
|
||||
{
|
||||
XLogRecPtr switchpoint;
|
||||
|
||||
sendTimeLine = cmd->timeline;
|
||||
if (sendTimeLine == currTLI)
|
||||
{
|
||||
sendTimeLineIsHistoric = false;
|
||||
sendTimeLineValidUpto = InvalidXLogRecPtr;
|
||||
}
|
||||
else
|
||||
{
|
||||
List *timeLineHistory;
|
||||
|
||||
sendTimeLineIsHistoric = true;
|
||||
|
||||
/*
|
||||
* Check that the timeline the client requested exists, and the
|
||||
* requested start location is on that timeline.
|
||||
*/
|
||||
timeLineHistory = readTimeLineHistory(currTLI);
|
||||
switchpoint = tliSwitchPoint(cmd->timeline, timeLineHistory,
|
||||
&sendTimeLineNextTLI);
|
||||
list_free_deep(timeLineHistory);
|
||||
|
||||
/*
|
||||
* Found the requested timeline in the history. Check that
|
||||
* requested startpoint is on that timeline in our history.
|
||||
*
|
||||
* This is quite loose on purpose. We only check that we didn't
|
||||
* fork off the requested timeline before the switchpoint. We
|
||||
* don't check that we switched *to* it before the requested
|
||||
* starting point. This is because the client can legitimately
|
||||
* request to start replication from the beginning of the WAL
|
||||
* segment that contains switchpoint, but on the new timeline, so
|
||||
* that it doesn't end up with a partial segment. If you ask for
|
||||
* too old a starting point, you'll get an error later when we
|
||||
* fail to find the requested WAL segment in pg_wal.
|
||||
*
|
||||
* XXX: we could be more strict here and only allow a startpoint
|
||||
* that's older than the switchpoint, if it's still in the same
|
||||
* WAL segment.
|
||||
*/
|
||||
if (!XLogRecPtrIsInvalid(switchpoint) &&
|
||||
switchpoint < cmd->startpoint)
|
||||
{
|
||||
ereport(ERROR,
|
||||
(errmsg("requested starting point %X/%X on timeline %u is not in this server's history",
|
||||
LSN_FORMAT_ARGS(cmd->startpoint),
|
||||
cmd->timeline),
|
||||
errdetail("This server's history forked from timeline %u at %X/%X.",
|
||||
cmd->timeline,
|
||||
LSN_FORMAT_ARGS(switchpoint))));
|
||||
}
|
||||
sendTimeLineValidUpto = switchpoint;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
sendTimeLine = currTLI;
|
||||
sendTimeLineValidUpto = InvalidXLogRecPtr;
|
||||
sendTimeLineIsHistoric = false;
|
||||
ereport(ERROR,
|
||||
(errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X",
|
||||
LSN_FORMAT_ARGS(cmd->startpoint),
|
||||
LSN_FORMAT_ARGS(FlushPtr))));
|
||||
}
|
||||
|
||||
streamingDoneSending = streamingDoneReceiving = false;
|
||||
/* Start streaming from the requested point */
|
||||
sentPtr = cmd->startpoint;
|
||||
|
||||
/* If there is nothing to stream, don't even enter COPY mode */
|
||||
if (!sendTimeLineIsHistoric || cmd->startpoint < sendTimeLineValidUpto)
|
||||
{
|
||||
/*
|
||||
* When we first start replication the standby will be behind the
|
||||
* primary. For some applications, for example synchronous
|
||||
* replication, it is important to have a clear state for this initial
|
||||
* catchup mode, so we can trigger actions when we change streaming
|
||||
* state later. We may stay in this state for a long time, which is
|
||||
* exactly why we want to be able to monitor whether or not we are
|
||||
* still here.
|
||||
*/
|
||||
WalSndSetState(WALSNDSTATE_CATCHUP);
|
||||
/* Initialize shared memory status, too */
|
||||
SpinLockAcquire(&MyWalSnd->mutex);
|
||||
MyWalSnd->sentPtr = sentPtr;
|
||||
SpinLockRelease(&MyWalSnd->mutex);
|
||||
|
||||
/*
|
||||
* Don't allow a request to stream from a future point in WAL that
|
||||
* hasn't been flushed to disk in this server yet.
|
||||
*/
|
||||
if (FlushPtr < cmd->startpoint)
|
||||
{
|
||||
ereport(ERROR,
|
||||
(errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X",
|
||||
LSN_FORMAT_ARGS(cmd->startpoint),
|
||||
LSN_FORMAT_ARGS(FlushPtr))));
|
||||
}
|
||||
SyncRepInitConfig();
|
||||
|
||||
/* Start streaming from the requested point */
|
||||
sentPtr = cmd->startpoint;
|
||||
/* Infinite send loop, never returns */
|
||||
WalSndLoop();
|
||||
|
||||
/* Initialize shared memory status, too */
|
||||
SpinLockAcquire(&MyWalSnd->mutex);
|
||||
MyWalSnd->sentPtr = sentPtr;
|
||||
SpinLockRelease(&MyWalSnd->mutex);
|
||||
|
||||
SyncRepInitConfig();
|
||||
|
||||
/* Main loop of walsender */
|
||||
replication_active = true;
|
||||
|
||||
WalSndLoop(XLogSendPhysical);
|
||||
|
||||
replication_active = false;
|
||||
if (got_STOPPING)
|
||||
proc_exit(0);
|
||||
WalSndSetState(WALSNDSTATE_STARTUP);
|
||||
|
||||
Assert(streamingDoneSending && streamingDoneReceiving);
|
||||
}
|
||||
WalSndSetState(WALSNDSTATE_STARTUP);
|
||||
|
||||
if (cmd->slotname)
|
||||
ReplicationSlotRelease();
|
||||
|
||||
/*
|
||||
* Copy is finished now. Send a single-row result set indicating the next
|
||||
* timeline.
|
||||
*/
|
||||
if (sendTimeLineIsHistoric)
|
||||
{
|
||||
char startpos_str[8 + 1 + 8 + 1];
|
||||
DestReceiver *dest;
|
||||
TupOutputState *tstate;
|
||||
TupleDesc tupdesc;
|
||||
Datum values[2];
|
||||
bool nulls[2];
|
||||
|
||||
snprintf(startpos_str, sizeof(startpos_str), "%X/%X",
|
||||
LSN_FORMAT_ARGS(sendTimeLineValidUpto));
|
||||
|
||||
dest = CreateDestReceiver(DestRemoteSimple);
|
||||
MemSet(nulls, false, sizeof(nulls));
|
||||
|
||||
/*
|
||||
* Need a tuple descriptor representing two columns. int8 may seem
|
||||
* like a surprising data type for this, but in theory int4 would not
|
||||
* be wide enough for this, as TimeLineID is unsigned.
|
||||
*/
|
||||
tupdesc = CreateTemplateTupleDesc(2);
|
||||
TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 1, "next_tli",
|
||||
INT8OID, -1, 0);
|
||||
TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 2, "next_tli_startpos",
|
||||
TEXTOID, -1, 0);
|
||||
|
||||
/* prepare for projection of tuple */
|
||||
tstate = begin_tup_output_tupdesc(dest, tupdesc, &TTSOpsVirtual);
|
||||
|
||||
values[0] = Int64GetDatum((int64) sendTimeLineNextTLI);
|
||||
values[1] = CStringGetTextDatum(startpos_str);
|
||||
|
||||
/* send it to dest */
|
||||
do_tup_output(tstate, values, nulls);
|
||||
|
||||
end_tup_output(tstate);
|
||||
}
|
||||
|
||||
/* Send CommandComplete message */
|
||||
EndReplicationCommand("START_STREAMING");
|
||||
}
|
||||
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
static XLogRecPtr
|
||||
GetStandbyFlushRecPtr(TimeLineID *tli)
|
||||
{
|
||||
XLogRecPtr replayPtr;
|
||||
TimeLineID replayTLI;
|
||||
XLogRecPtr receivePtr;
|
||||
TimeLineID receiveTLI;
|
||||
XLogRecPtr result;
|
||||
|
||||
/*
|
||||
* We can safely send what's already been replayed. Also, if walreceiver
|
||||
* is streaming WAL from the same timeline, we can send anything that it
|
||||
* has streamed, but hasn't been replayed yet.
|
||||
*/
|
||||
|
||||
receivePtr = GetWalRcvFlushRecPtr(NULL, &receiveTLI);
|
||||
replayPtr = GetXLogReplayRecPtr(&replayTLI);
|
||||
|
||||
*tli = replayTLI;
|
||||
|
||||
result = replayPtr;
|
||||
if (receiveTLI == replayTLI && receivePtr > replayPtr)
|
||||
result = receivePtr;
|
||||
|
||||
return result;
|
||||
}
|
||||
#else
|
||||
/*
|
||||
* Returns the latest point in WAL that has been safely flushed to disk, and
|
||||
* can be sent to the standby. This should only be called when in recovery,
|
||||
* ie. we're streaming to a cascaded standby.
|
||||
*
|
||||
* As a side-effect, ThisTimeLineID is updated to the TLI of the last
|
||||
* replayed WAL record.
|
||||
* Main loop that waits for LSN updates and calls the walproposer.
|
||||
* Synchronous replication sets latch in WalSndWakeup at walsender.c
|
||||
*/
|
||||
static XLogRecPtr
|
||||
GetStandbyFlushRecPtr(void)
|
||||
{
|
||||
XLogRecPtr replayPtr;
|
||||
TimeLineID replayTLI;
|
||||
XLogRecPtr receivePtr;
|
||||
TimeLineID receiveTLI;
|
||||
XLogRecPtr result;
|
||||
|
||||
/*
|
||||
* We can safely send what's already been replayed. Also, if walreceiver
|
||||
* is streaming WAL from the same timeline, we can send anything that it
|
||||
* has streamed, but hasn't been replayed yet.
|
||||
*/
|
||||
|
||||
receivePtr = GetWalRcvFlushRecPtr(NULL, &receiveTLI);
|
||||
replayPtr = GetXLogReplayRecPtr(&replayTLI);
|
||||
|
||||
ThisTimeLineID = replayTLI;
|
||||
|
||||
result = replayPtr;
|
||||
if (receiveTLI == ThisTimeLineID && receivePtr > replayPtr)
|
||||
result = receivePtr;
|
||||
|
||||
return result;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/* XLogReaderRoutine->segment_open callback */
|
||||
static void
|
||||
WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo,
|
||||
TimeLineID *tli_p)
|
||||
WalSndLoop(void)
|
||||
{
|
||||
char path[MAXPGPATH];
|
||||
|
||||
/*-------
|
||||
* When reading from a historic timeline, and there is a timeline switch
|
||||
* within this segment, read from the WAL segment belonging to the new
|
||||
* timeline.
|
||||
*
|
||||
* For example, imagine that this server is currently on timeline 5, and
|
||||
* we're streaming timeline 4. The switch from timeline 4 to 5 happened at
|
||||
* 0/13002088. In pg_wal, we have these files:
|
||||
*
|
||||
* ...
|
||||
* 000000040000000000000012
|
||||
* 000000040000000000000013
|
||||
* 000000050000000000000013
|
||||
* 000000050000000000000014
|
||||
* ...
|
||||
*
|
||||
* In this situation, when requested to send the WAL from segment 0x13, on
|
||||
* timeline 4, we read the WAL from file 000000050000000000000013. Archive
|
||||
* recovery prefers files from newer timelines, so if the segment was
|
||||
* restored from the archive on this server, the file belonging to the old
|
||||
* timeline, 000000040000000000000013, might not exist. Their contents are
|
||||
* equal up to the switchpoint, because at a timeline switch, the used
|
||||
* portion of the old segment is copied to the new file. -------
|
||||
*/
|
||||
*tli_p = sendTimeLine;
|
||||
if (sendTimeLineIsHistoric)
|
||||
{
|
||||
XLogSegNo endSegNo;
|
||||
|
||||
XLByteToSeg(sendTimeLineValidUpto, endSegNo, state->segcxt.ws_segsize);
|
||||
if (nextSegNo == endSegNo)
|
||||
*tli_p = sendTimeLineNextTLI;
|
||||
}
|
||||
|
||||
XLogFilePath(path, *tli_p, nextSegNo, state->segcxt.ws_segsize);
|
||||
state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY);
|
||||
if (state->seg.ws_file >= 0)
|
||||
return;
|
||||
|
||||
/*
|
||||
* If the file is not found, assume it's because the standby asked for a
|
||||
* too old WAL segment that has already been removed or recycled.
|
||||
*/
|
||||
if (errno == ENOENT)
|
||||
{
|
||||
char xlogfname[MAXFNAMELEN];
|
||||
int save_errno = errno;
|
||||
|
||||
XLogFileName(xlogfname, *tli_p, nextSegNo, wal_segment_size);
|
||||
errno = save_errno;
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("requested WAL segment %s has already been removed",
|
||||
xlogfname)));
|
||||
}
|
||||
else
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not open file \"%s\": %m",
|
||||
path)));
|
||||
}
|
||||
|
||||
|
||||
/* Main loop of walsender process that streams the WAL over Copy messages. */
|
||||
static void
|
||||
WalSndLoop(WalSndSendDataCallback send_data)
|
||||
{
|
||||
/*
|
||||
* Initialize the last reply timestamp. That enables timeout processing
|
||||
* from hereon.
|
||||
*/
|
||||
last_reply_timestamp = GetCurrentTimestamp();
|
||||
waiting_for_ping_response = false;
|
||||
|
||||
/*
|
||||
* Loop until we reach the end of this timeline or the client requests to
|
||||
* stop streaming.
|
||||
*/
|
||||
for (;;)
|
||||
{
|
||||
/* Clear any already-pending wakeups */
|
||||
@@ -911,153 +565,41 @@ WalSndLoop(WalSndSendDataCallback send_data)
|
||||
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
/* Process any requests or signals received recently */
|
||||
if (ConfigReloadPending)
|
||||
{
|
||||
ConfigReloadPending = false;
|
||||
ProcessConfigFile(PGC_SIGHUP);
|
||||
SyncRepInitConfig();
|
||||
}
|
||||
XLogBroadcastWalProposer();
|
||||
|
||||
/* always true */
|
||||
if (am_wal_proposer)
|
||||
{
|
||||
send_data();
|
||||
if (WalSndCaughtUp)
|
||||
{
|
||||
if (MyWalSnd->state == WALSNDSTATE_CATCHUP)
|
||||
WalSndSetState(WALSNDSTATE_STREAMING);
|
||||
WalProposerPoll();
|
||||
WalSndCaughtUp = false;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if (MyWalSnd->state == WALSNDSTATE_CATCHUP)
|
||||
WalSndSetState(WALSNDSTATE_STREAMING);
|
||||
WalProposerPoll();
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Send out the WAL in its normal physical/stored form.
|
||||
*
|
||||
* Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk,
|
||||
* but not yet sent to the client, and buffer it in the libpq output
|
||||
* buffer.
|
||||
*
|
||||
* If there is no unsent WAL remaining, WalSndCaughtUp is set to true,
|
||||
* otherwise WalSndCaughtUp is set to false.
|
||||
* Notify walproposer about the new WAL position.
|
||||
*/
|
||||
static void
|
||||
XLogSendPhysical(void)
|
||||
XLogBroadcastWalProposer(void)
|
||||
{
|
||||
XLogRecPtr SendRqstPtr;
|
||||
XLogRecPtr startptr;
|
||||
XLogRecPtr endptr;
|
||||
Size nbytes PG_USED_FOR_ASSERTS_ONLY;
|
||||
TimeLineID currTLI;
|
||||
|
||||
/* If requested switch the WAL sender to the stopping state. */
|
||||
if (got_STOPPING)
|
||||
WalSndSetState(WALSNDSTATE_STOPPING);
|
||||
/* Start from the last sent position */
|
||||
startptr = sentPtr;
|
||||
|
||||
if (streamingDoneSending)
|
||||
{
|
||||
WalSndCaughtUp = true;
|
||||
return;
|
||||
}
|
||||
|
||||
/* Figure out how far we can safely send the WAL. */
|
||||
if (sendTimeLineIsHistoric)
|
||||
{
|
||||
/*
|
||||
* Streaming an old timeline that's in this server's history, but is
|
||||
* not the one we're currently inserting or replaying. It can be
|
||||
* streamed up to the point where we switched off that timeline.
|
||||
*/
|
||||
SendRqstPtr = sendTimeLineValidUpto;
|
||||
}
|
||||
else if (am_cascading_walsender)
|
||||
{
|
||||
/*
|
||||
* Streaming the latest timeline on a standby.
|
||||
*
|
||||
* Attempt to send all WAL that has already been replayed, so that we
|
||||
* know it's valid. If we're receiving WAL through streaming
|
||||
* replication, it's also OK to send any WAL that has been received
|
||||
* but not replayed.
|
||||
*
|
||||
* The timeline we're recovering from can change, or we can be
|
||||
* promoted. In either case, the current timeline becomes historic. We
|
||||
* need to detect that so that we don't try to stream past the point
|
||||
* where we switched to another timeline. We check for promotion or
|
||||
* timeline switch after calculating FlushPtr, to avoid a race
|
||||
* condition: if the timeline becomes historic just after we checked
|
||||
* that it was still current, it's still be OK to stream it up to the
|
||||
* FlushPtr that was calculated before it became historic.
|
||||
*/
|
||||
bool becameHistoric = false;
|
||||
/*
|
||||
* Streaming the current timeline on a primary.
|
||||
*
|
||||
* Attempt to send all data that's already been written out and
|
||||
* fsync'd to disk. We cannot go further than what's been written out
|
||||
* given the current implementation of WALRead(). And in any case
|
||||
* it's unsafe to send WAL that is not securely down to disk on the
|
||||
* primary: if the primary subsequently crashes and restarts, standbys
|
||||
* must not have applied any WAL that got lost on the primary.
|
||||
*/
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
SendRqstPtr = GetStandbyFlushRecPtr(&currTLI);
|
||||
endptr = GetFlushRecPtr(NULL);
|
||||
#else
|
||||
SendRqstPtr = GetStandbyFlushRecPtr();
|
||||
currTLI = ThisTimeLineID;
|
||||
endptr = GetFlushRecPtr();
|
||||
#endif
|
||||
if (!RecoveryInProgress())
|
||||
{
|
||||
/*
|
||||
* We have been promoted. RecoveryInProgress() updated
|
||||
* ThisTimeLineID to the new current timeline.
|
||||
*/
|
||||
am_cascading_walsender = false;
|
||||
becameHistoric = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Still a cascading standby. But is the timeline we're sending
|
||||
* still the one recovery is recovering from? currTLI was updated
|
||||
* by the GetStandbyFlushRecPtr() call above.
|
||||
*/
|
||||
if (sendTimeLine != currTLI)
|
||||
becameHistoric = true;
|
||||
}
|
||||
|
||||
if (becameHistoric)
|
||||
{
|
||||
/*
|
||||
* The timeline we were sending has become historic. Read the
|
||||
* timeline history file of the new timeline to see where exactly
|
||||
* we forked off from the timeline we were sending.
|
||||
*/
|
||||
List *history;
|
||||
|
||||
history = readTimeLineHistory(currTLI);
|
||||
sendTimeLineValidUpto = tliSwitchPoint(sendTimeLine, history, &sendTimeLineNextTLI);
|
||||
|
||||
Assert(sendTimeLine < sendTimeLineNextTLI);
|
||||
list_free_deep(history);
|
||||
|
||||
sendTimeLineIsHistoric = true;
|
||||
|
||||
SendRqstPtr = sendTimeLineValidUpto;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Streaming the current timeline on a primary.
|
||||
*
|
||||
* Attempt to send all data that's already been written out and
|
||||
* fsync'd to disk. We cannot go further than what's been written out
|
||||
* given the current implementation of WALRead(). And in any case
|
||||
* it's unsafe to send WAL that is not securely down to disk on the
|
||||
* primary: if the primary subsequently crashes and restarts, standbys
|
||||
* must not have applied any WAL that got lost on the primary.
|
||||
*/
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
SendRqstPtr = GetFlushRecPtr(NULL);
|
||||
#else
|
||||
SendRqstPtr = GetFlushRecPtr();
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* Record the current system time as an approximation of the time at which
|
||||
@@ -1083,91 +625,14 @@ XLogSendPhysical(void)
|
||||
* that arbitrary LSN is eventually reported as written, flushed and
|
||||
* applied, so that it can measure the elapsed time.
|
||||
*/
|
||||
LagTrackerWrite(SendRqstPtr, GetCurrentTimestamp());
|
||||
|
||||
/*
|
||||
* If this is a historic timeline and we've reached the point where we
|
||||
* forked to the next timeline, stop streaming.
|
||||
*
|
||||
* Note: We might already have sent WAL > sendTimeLineValidUpto. The
|
||||
* startup process will normally replay all WAL that has been received
|
||||
* from the primary, before promoting, but if the WAL streaming is
|
||||
* terminated at a WAL page boundary, the valid portion of the timeline
|
||||
* might end in the middle of a WAL record. We might've already sent the
|
||||
* first half of that partial WAL record to the cascading standby, so that
|
||||
* sentPtr > sendTimeLineValidUpto. That's OK; the cascading standby can't
|
||||
* replay the partial WAL record either, so it can still follow our
|
||||
* timeline switch.
|
||||
*/
|
||||
if (sendTimeLineIsHistoric && sendTimeLineValidUpto <= sentPtr)
|
||||
{
|
||||
/* close the current file. */
|
||||
if (xlogreader->seg.ws_file >= 0)
|
||||
wal_segment_close(xlogreader);
|
||||
|
||||
/* Send CopyDone */
|
||||
pq_putmessage_noblock('c', NULL, 0);
|
||||
streamingDoneSending = true;
|
||||
|
||||
WalSndCaughtUp = true;
|
||||
|
||||
elog(DEBUG1, "walsender reached end of timeline at %X/%X (sent up to %X/%X)",
|
||||
LSN_FORMAT_ARGS(sendTimeLineValidUpto),
|
||||
LSN_FORMAT_ARGS(sentPtr));
|
||||
return;
|
||||
}
|
||||
LagTrackerWrite(endptr, GetCurrentTimestamp());
|
||||
|
||||
/* Do we have any work to do? */
|
||||
Assert(sentPtr <= SendRqstPtr);
|
||||
if (SendRqstPtr <= sentPtr)
|
||||
{
|
||||
WalSndCaughtUp = true;
|
||||
Assert(startptr <= endptr);
|
||||
if (endptr <= startptr)
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Figure out how much to send in one message. If there's no more than
|
||||
* MAX_SEND_SIZE bytes to send, send everything. Otherwise send
|
||||
* MAX_SEND_SIZE bytes, but round back to logfile or page boundary.
|
||||
*
|
||||
* The rounding is not only for performance reasons. Walreceiver relies on
|
||||
* the fact that we never split a WAL record across two messages. Since a
|
||||
* long WAL record is split at page boundary into continuation records,
|
||||
* page boundary is always a safe cut-off point. We also assume that
|
||||
* SendRqstPtr never points to the middle of a WAL record.
|
||||
*/
|
||||
startptr = sentPtr;
|
||||
endptr = startptr;
|
||||
endptr += MAX_SEND_SIZE;
|
||||
|
||||
/* if we went beyond SendRqstPtr, back off */
|
||||
if (SendRqstPtr <= endptr)
|
||||
{
|
||||
endptr = SendRqstPtr;
|
||||
if (sendTimeLineIsHistoric)
|
||||
WalSndCaughtUp = false;
|
||||
else
|
||||
WalSndCaughtUp = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* round down to page boundary. */
|
||||
endptr -= (endptr % XLOG_BLCKSZ);
|
||||
WalSndCaughtUp = false;
|
||||
}
|
||||
|
||||
nbytes = endptr - startptr;
|
||||
Assert(nbytes <= MAX_SEND_SIZE);
|
||||
|
||||
/* always true */
|
||||
if (am_wal_proposer)
|
||||
{
|
||||
WalProposerBroadcast(startptr, endptr);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* code removed for brevity */
|
||||
}
|
||||
WalProposerBroadcast(startptr, endptr);
|
||||
sentPtr = endptr;
|
||||
|
||||
/* Update shared memory status */
|
||||
|
||||
poetry.lock (generated, 48 lines changed)
@@ -887,34 +887,34 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "cryptography"
|
||||
version = "41.0.2"
|
||||
version = "41.0.3"
|
||||
description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "cryptography-41.0.2-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:01f1d9e537f9a15b037d5d9ee442b8c22e3ae11ce65ea1f3316a41c78756b711"},
|
||||
{file = "cryptography-41.0.2-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:079347de771f9282fbfe0e0236c716686950c19dee1b76240ab09ce1624d76d7"},
|
||||
{file = "cryptography-41.0.2-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:439c3cc4c0d42fa999b83ded80a9a1fb54d53c58d6e59234cfe97f241e6c781d"},
|
||||
{file = "cryptography-41.0.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f14ad275364c8b4e525d018f6716537ae7b6d369c094805cae45300847e0894f"},
|
||||
{file = "cryptography-41.0.2-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:84609ade00a6ec59a89729e87a503c6e36af98ddcd566d5f3be52e29ba993182"},
|
||||
{file = "cryptography-41.0.2-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:49c3222bb8f8e800aead2e376cbef687bc9e3cb9b58b29a261210456a7783d83"},
|
||||
{file = "cryptography-41.0.2-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:d73f419a56d74fef257955f51b18d046f3506270a5fd2ac5febbfa259d6c0fa5"},
|
||||
{file = "cryptography-41.0.2-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:2a034bf7d9ca894720f2ec1d8b7b5832d7e363571828037f9e0c4f18c1b58a58"},
|
||||
{file = "cryptography-41.0.2-cp37-abi3-win32.whl", hash = "sha256:d124682c7a23c9764e54ca9ab5b308b14b18eba02722b8659fb238546de83a76"},
|
||||
{file = "cryptography-41.0.2-cp37-abi3-win_amd64.whl", hash = "sha256:9c3fe6534d59d071ee82081ca3d71eed3210f76ebd0361798c74abc2bcf347d4"},
|
||||
{file = "cryptography-41.0.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a719399b99377b218dac6cf547b6ec54e6ef20207b6165126a280b0ce97e0d2a"},
|
||||
{file = "cryptography-41.0.2-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:182be4171f9332b6741ee818ec27daff9fb00349f706629f5cbf417bd50e66fd"},
|
||||
{file = "cryptography-41.0.2-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:7a9a3bced53b7f09da251685224d6a260c3cb291768f54954e28f03ef14e3766"},
|
||||
{file = "cryptography-41.0.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:f0dc40e6f7aa37af01aba07277d3d64d5a03dc66d682097541ec4da03cc140ee"},
|
||||
{file = "cryptography-41.0.2-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:674b669d5daa64206c38e507808aae49904c988fa0a71c935e7006a3e1e83831"},
|
||||
{file = "cryptography-41.0.2-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:7af244b012711a26196450d34f483357e42aeddb04128885d95a69bd8b14b69b"},
|
||||
{file = "cryptography-41.0.2-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:9b6d717393dbae53d4e52684ef4f022444fc1cce3c48c38cb74fca29e1f08eaa"},
|
||||
{file = "cryptography-41.0.2-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:192255f539d7a89f2102d07d7375b1e0a81f7478925b3bc2e0549ebf739dae0e"},
|
||||
{file = "cryptography-41.0.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f772610fe364372de33d76edcd313636a25684edb94cee53fd790195f5989d14"},
|
||||
{file = "cryptography-41.0.2-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:b332cba64d99a70c1e0836902720887fb4529ea49ea7f5462cf6640e095e11d2"},
|
||||
{file = "cryptography-41.0.2-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:9a6673c1828db6270b76b22cc696f40cde9043eb90373da5c2f8f2158957f42f"},
|
||||
{file = "cryptography-41.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:342f3767e25876751e14f8459ad85e77e660537ca0a066e10e75df9c9e9099f0"},
|
||||
{file = "cryptography-41.0.2.tar.gz", hash = "sha256:7d230bf856164de164ecb615ccc14c7fc6de6906ddd5b491f3af90d3514c925c"},
|
||||
{file = "cryptography-41.0.3-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:652627a055cb52a84f8c448185922241dd5217443ca194d5739b44612c5e6507"},
|
||||
{file = "cryptography-41.0.3-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:8f09daa483aedea50d249ef98ed500569841d6498aa9c9f4b0531b9964658922"},
|
||||
{file = "cryptography-41.0.3-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4fd871184321100fb400d759ad0cddddf284c4b696568204d281c902fc7b0d81"},
|
||||
{file = "cryptography-41.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84537453d57f55a50a5b6835622ee405816999a7113267739a1b4581f83535bd"},
|
||||
{file = "cryptography-41.0.3-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:3fb248989b6363906827284cd20cca63bb1a757e0a2864d4c1682a985e3dca47"},
|
||||
{file = "cryptography-41.0.3-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:42cb413e01a5d36da9929baa9d70ca90d90b969269e5a12d39c1e0d475010116"},
|
||||
{file = "cryptography-41.0.3-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:aeb57c421b34af8f9fe830e1955bf493a86a7996cc1338fe41b30047d16e962c"},
|
||||
{file = "cryptography-41.0.3-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6af1c6387c531cd364b72c28daa29232162010d952ceb7e5ca8e2827526aceae"},
|
||||
{file = "cryptography-41.0.3-cp37-abi3-win32.whl", hash = "sha256:0d09fb5356f975974dbcb595ad2d178305e5050656affb7890a1583f5e02a306"},
|
||||
{file = "cryptography-41.0.3-cp37-abi3-win_amd64.whl", hash = "sha256:a983e441a00a9d57a4d7c91b3116a37ae602907a7618b882c8013b5762e80574"},
|
||||
{file = "cryptography-41.0.3-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5259cb659aa43005eb55a0e4ff2c825ca111a0da1814202c64d28a985d33b087"},
|
||||
{file = "cryptography-41.0.3-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:67e120e9a577c64fe1f611e53b30b3e69744e5910ff3b6e97e935aeb96005858"},
|
||||
{file = "cryptography-41.0.3-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:7efe8041897fe7a50863e51b77789b657a133c75c3b094e51b5e4b5cec7bf906"},
|
||||
{file = "cryptography-41.0.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ce785cf81a7bdade534297ef9e490ddff800d956625020ab2ec2780a556c313e"},
|
||||
{file = "cryptography-41.0.3-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:57a51b89f954f216a81c9d057bf1a24e2f36e764a1ca9a501a6964eb4a6800dd"},
|
||||
{file = "cryptography-41.0.3-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:4c2f0d35703d61002a2bbdcf15548ebb701cfdd83cdc12471d2bae80878a4207"},
|
||||
{file = "cryptography-41.0.3-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:23c2d778cf829f7d0ae180600b17e9fceea3c2ef8b31a99e3c694cbbf3a24b84"},
|
||||
{file = "cryptography-41.0.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:95dd7f261bb76948b52a5330ba5202b91a26fbac13ad0e9fc8a3ac04752058c7"},
|
||||
{file = "cryptography-41.0.3-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:41d7aa7cdfded09b3d73a47f429c298e80796c8e825ddfadc84c8a7f12df212d"},
|
||||
{file = "cryptography-41.0.3-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d0d651aa754ef58d75cec6edfbd21259d93810b73f6ec246436a21b7841908de"},
|
||||
{file = "cryptography-41.0.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:ab8de0d091acbf778f74286f4989cf3d1528336af1b59f3e5d2ebca8b5fe49e1"},
|
||||
{file = "cryptography-41.0.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a74fbcdb2a0d46fe00504f571a2a540532f4c188e6ccf26f1f178480117b33c4"},
|
||||
{file = "cryptography-41.0.3.tar.gz", hash = "sha256:6d192741113ef5e30d89dcb5b956ef4e1578f304708701b8b73d38e3e1461f34"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
|
||||
@@ -29,9 +29,9 @@ metrics.workspace = true
|
||||
once_cell.workspace = true
|
||||
opentelemetry.workspace = true
|
||||
parking_lot.workspace = true
|
||||
pbkdf2.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
postgres_backend.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
pq_proto.workspace = true
|
||||
prometheus.workspace = true
|
||||
rand.workspace = true
|
||||
@@ -65,13 +65,10 @@ webpki-roots.workspace = true
|
||||
x509-parser.workspace = true
|
||||
native-tls.workspace = true
|
||||
postgres-native-tls.workspace = true
|
||||
tokio-native-tls = "0.3.1"
|
||||
|
||||
workspace_hack.workspace = true
|
||||
tokio-util.workspace = true
|
||||
|
||||
fallible-iterator = "0.2.0"
|
||||
|
||||
[dev-dependencies]
|
||||
rcgen.workspace = true
|
||||
rstest.workspace = true
|
||||
|
||||
@@ -5,12 +5,12 @@ use crate::{
|
||||
auth::{self, AuthFlow, ClientCredentials},
|
||||
compute,
|
||||
console::{self, AuthInfo, CachedNodeInfo, ConsoleReqExtra},
|
||||
proxy::handle_try_wake,
|
||||
proxy::{handle_try_wake, retry_after},
|
||||
sasl, scram,
|
||||
stream::PqStream,
|
||||
};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::info;
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
pub(super) async fn authenticate(
|
||||
api: &impl console::Api,
|
||||
@@ -55,11 +55,20 @@ pub(super) async fn authenticate(
|
||||
let mut num_retries = 0;
|
||||
let mut node = loop {
|
||||
let wake_res = api.wake_compute(extra, creds).await;
|
||||
match handle_try_wake(wake_res, num_retries)? {
|
||||
ControlFlow::Continue(_) => num_retries += 1,
|
||||
ControlFlow::Break(n) => break n,
|
||||
match handle_try_wake(wake_res, num_retries) {
|
||||
Err(e) => {
|
||||
error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
|
||||
return Err(e.into());
|
||||
}
|
||||
Ok(ControlFlow::Continue(e)) => {
|
||||
warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
|
||||
}
|
||||
Ok(ControlFlow::Break(n)) => break n,
|
||||
}
|
||||
info!(num_retries, "retrying wake compute");
|
||||
|
||||
let wait_duration = retry_after(num_retries);
|
||||
num_retries += 1;
|
||||
tokio::time::sleep(wait_duration).await;
|
||||
};
|
||||
if let Some(keys) = scram_keys {
|
||||
use tokio_postgres::config::AuthKeys;
|
||||
|
||||
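The loop above sleeps for `retry_after(num_retries)` between wake attempts. That helper is imported from `proxy` and its policy is not part of this excerpt, so the following exponential-backoff sketch is only an assumption about its shape, not the actual implementation:

use std::time::Duration;

// Assumed shape: base delay doubled per retry, capped so the wait stays bounded.
fn retry_after(num_retries: u32) -> Duration {
    let base_ms: u64 = 100;            // assumed base delay
    let exponent = num_retries.min(6); // assumed cap
    Duration::from_millis(base_ms << exponent)
}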
@@ -230,7 +230,8 @@ pub struct PostgresConnection {
|
||||
}
|
||||
|
||||
impl ConnCfg {
|
||||
async fn do_connect(
|
||||
/// Connect to a corresponding compute node.
|
||||
pub async fn connect(
|
||||
&self,
|
||||
allow_self_signed_compute: bool,
|
||||
timeout: Duration,
|
||||
@@ -270,20 +271,6 @@ impl ConnCfg {
|
||||
|
||||
Ok(connection)
|
||||
}
|
||||
|
||||
/// Connect to a corresponding compute node.
|
||||
pub async fn connect(
|
||||
&self,
|
||||
allow_self_signed_compute: bool,
|
||||
timeout: Duration,
|
||||
) -> Result<PostgresConnection, ConnectionError> {
|
||||
self.do_connect(allow_self_signed_compute, timeout)
|
||||
.inspect_err(|err| {
|
||||
// Immediately log the error we have at our disposal.
|
||||
error!("couldn't connect to compute node: {err}");
|
||||
})
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
/// Retrieve `options` from a startup message, dropping all proxy-specific flags.
|
||||
|
||||
@@ -8,6 +8,7 @@ use super::{
|
||||
use crate::{auth::ClientCredentials, compute, http, scram};
|
||||
use async_trait::async_trait;
|
||||
use futures::TryFutureExt;
|
||||
use tokio::time::Instant;
|
||||
use tokio_postgres::config::SslMode;
|
||||
use tracing::{error, info, info_span, warn, Instrument};
|
||||
|
||||
@@ -47,7 +48,9 @@ impl Api {
|
||||
.build()?;
|
||||
|
||||
info!(url = request.url().as_str(), "sending http request");
|
||||
let start = Instant::now();
|
||||
let response = self.endpoint.execute(request).await?;
|
||||
info!(duration = ?start.elapsed(), "received http response");
|
||||
let body = match parse_body::<GetRoleSecret>(response).await {
|
||||
Ok(body) => body,
|
||||
// Error 404 is special: it's ok not to have a secret.
|
||||
@@ -88,7 +91,9 @@ impl Api {
|
||||
.build()?;
|
||||
|
||||
info!(url = request.url().as_str(), "sending http request");
|
||||
let start = Instant::now();
|
||||
let response = self.endpoint.execute(request).await?;
|
||||
info!(duration = ?start.elapsed(), "received http response");
|
||||
let body = parse_body::<WakeCompute>(response).await?;
|
||||
|
||||
// Unfortunately, ownership won't let us use `Option::ok_or` here.
|
||||
|
||||
@@ -7,11 +7,14 @@ pub mod server;
|
||||
pub mod sql_over_http;
|
||||
pub mod websocket;
|
||||
|
||||
use std::time::Duration;
|
||||
use std::{sync::Arc, time::Duration};
|
||||
|
||||
use futures::FutureExt;
|
||||
pub use reqwest::{Request, Response, StatusCode};
|
||||
pub use reqwest_middleware::{ClientWithMiddleware, Error};
|
||||
pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
|
||||
use tokio::time::Instant;
|
||||
use tracing::trace;
|
||||
|
||||
use crate::url::ApiUrl;
|
||||
use reqwest_middleware::RequestBuilder;
|
||||
@@ -20,13 +23,21 @@ use reqwest_middleware::RequestBuilder;
|
||||
/// because it takes care of observability (OpenTelemetry).
|
||||
/// We deliberately don't want to replace this with a public static.
|
||||
pub fn new_client() -> ClientWithMiddleware {
|
||||
reqwest_middleware::ClientBuilder::new(reqwest::Client::new())
|
||||
let client = reqwest::ClientBuilder::new()
|
||||
.dns_resolver(Arc::new(GaiResolver::default()))
|
||||
.connection_verbose(true)
|
||||
.build()
|
||||
.expect("Failed to create http client");
|
||||
|
||||
reqwest_middleware::ClientBuilder::new(client)
|
||||
.with(reqwest_tracing::TracingMiddleware::default())
|
||||
.build()
|
||||
}
|
||||
|
||||
pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware {
|
||||
let timeout_client = reqwest::ClientBuilder::new()
|
||||
.dns_resolver(Arc::new(GaiResolver::default()))
|
||||
.connection_verbose(true)
|
||||
.timeout(default_timout)
|
||||
.build()
|
||||
.expect("Failed to create http client with timeout");
|
||||
@@ -39,6 +50,10 @@ pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware
|
||||
// As per docs, "This middleware always errors when given requests with streaming bodies".
|
||||
// That's all right because we only use this client to send `serde_json::RawValue`, which
|
||||
// is not a stream.
|
||||
//
|
||||
// ex-maintainer note:
|
||||
// this limitation can be fixed if streaming is necessary.
|
||||
// retries will still not be performed, but it won't error immediately
|
||||
.with(RetryTransientMiddleware::new_with_policy(retry_policy))
|
||||
.build()
|
||||
}
|
||||
@@ -81,6 +96,37 @@ impl Endpoint {
|
||||
}
|
||||
}
|
||||
|
||||
/// https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html
|
||||
use hyper::{
|
||||
client::connect::dns::{GaiResolver as HyperGaiResolver, Name},
|
||||
service::Service,
|
||||
};
|
||||
use reqwest::dns::{Addrs, Resolve, Resolving};
|
||||
#[derive(Debug)]
|
||||
pub struct GaiResolver(HyperGaiResolver);
|
||||
|
||||
impl Default for GaiResolver {
|
||||
fn default() -> Self {
|
||||
Self(HyperGaiResolver::new())
|
||||
}
|
||||
}
|
||||
|
||||
impl Resolve for GaiResolver {
|
||||
fn resolve(&self, name: Name) -> Resolving {
|
||||
let this = &mut self.0.clone();
|
||||
let start = Instant::now();
|
||||
Box::pin(
|
||||
Service::<Name>::call(this, name.clone()).map(move |result| {
|
||||
let resolve_duration = start.elapsed();
|
||||
trace!(duration = ?resolve_duration, addr = %name, "resolve host complete");
|
||||
result
|
||||
.map(|addrs| -> Addrs { Box::new(addrs) })
|
||||
.map_err(|err| -> Box<dyn std::error::Error + Send + Sync> { Box::new(err) })
|
||||
}),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
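The resolver only matters once it is plugged into reqwest, which is what `new_client` and `new_client_with_timeout` above do; a condensed usage sketch of that wiring:

// Build a client whose DNS lookups go through GaiResolver and get per-lookup trace timing.
let client = reqwest::ClientBuilder::new()
    .dns_resolver(std::sync::Arc::new(GaiResolver::default()))
    .connection_verbose(true)
    .build()
    .expect("Failed to create http client");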
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
@@ -6,7 +6,7 @@ use std::fmt;
use std::{collections::HashMap, sync::Arc};
use tokio::time;

use crate::{auth, console, pg_client};
use crate::{auth, console};
use crate::{compute, config};

use super::sql_over_http::MAX_RESPONSE_SIZE;
@@ -41,10 +41,8 @@ impl fmt::Display for ConnInfo {
    }
}

type PgConn =
    pg_client::connection::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>;
struct ConnPoolEntry {
    conn: PgConn,
    conn: tokio_postgres::Client,
    _last_access: std::time::Instant,
}

@@ -80,8 +78,12 @@ impl GlobalConnPool {
        })
    }

    pub async fn get(&self, conn_info: &ConnInfo, force_new: bool) -> anyhow::Result<PgConn> {
        let mut client: Option<PgConn> = None;
    pub async fn get(
        &self,
        conn_info: &ConnInfo,
        force_new: bool,
    ) -> anyhow::Result<tokio_postgres::Client> {
        let mut client: Option<tokio_postgres::Client> = None;

        if !force_new {
            let pool = self.get_endpoint_pool(&conn_info.hostname).await;
@@ -112,7 +114,11 @@ impl GlobalConnPool {
        }
    }

    pub async fn put(&self, conn_info: &ConnInfo, client: PgConn) -> anyhow::Result<()> {
    pub async fn put(
        &self,
        conn_info: &ConnInfo,
        client: tokio_postgres::Client,
    ) -> anyhow::Result<()> {
        let pool = self.get_endpoint_pool(&conn_info.hostname).await;

        // return connection to the pool
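
// A minimal sketch of how the pool API above is meant to be used, assuming a `pool:
// GlobalConnPool` and a `conn_info: ConnInfo` are already in scope (both names come from the
// surrounding hunks; the query text is only illustrative):
//
//     let client = pool.get(&conn_info, /* force_new */ false).await?;
//     let rows = client.query("SELECT 1", &[]).await?;
//     pool.put(&conn_info, client).await?; // return the connection for later reuse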
@@ -185,7 +191,7 @@ struct TokioMechanism<'a> {
|
||||
|
||||
#[async_trait]
|
||||
impl ConnectMechanism for TokioMechanism<'_> {
|
||||
type Connection = PgConn;
|
||||
type Connection = tokio_postgres::Client;
|
||||
type ConnectError = tokio_postgres::Error;
|
||||
type Error = anyhow::Error;
|
||||
|
||||
@@ -207,7 +213,7 @@ impl ConnectMechanism for TokioMechanism<'_> {
|
||||
async fn connect_to_compute(
|
||||
config: &config::ProxyConfig,
|
||||
conn_info: &ConnInfo,
|
||||
) -> anyhow::Result<PgConn> {
|
||||
) -> anyhow::Result<tokio_postgres::Client> {
|
||||
let tls = config.tls_config.as_ref();
|
||||
let common_names = tls.and_then(|tls| tls.common_names.clone());
|
||||
|
||||
@@ -245,7 +251,7 @@ async fn connect_to_compute_once(
|
||||
node_info: &console::CachedNodeInfo,
|
||||
conn_info: &ConnInfo,
|
||||
timeout: time::Duration,
|
||||
) -> Result<PgConn, tokio_postgres::Error> {
|
||||
) -> Result<tokio_postgres::Client, tokio_postgres::Error> {
|
||||
let mut config = (*node_info.config).clone();
|
||||
|
||||
let (client, connection) = config
|
||||
@@ -257,13 +263,11 @@ async fn connect_to_compute_once(
|
||||
.connect(tokio_postgres::NoTls)
|
||||
.await?;
|
||||
|
||||
let stream = connection.stream.into_inner();
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = connection.await {
|
||||
error!("connection error: {}", e);
|
||||
}
|
||||
});
|
||||
|
||||
// tokio::spawn(async move {
|
||||
// if let Err(e) = connection.await {
|
||||
// error!("connection error: {}", e);
|
||||
// }
|
||||
// });
|
||||
|
||||
Ok(PgConn::new(stream))
|
||||
Ok(client)
|
||||
}
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
use std::io::ErrorKind;
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::bail;
|
||||
use bytes::BufMut;
|
||||
use fallible_iterator::FallibleIterator;
|
||||
use futures::pin_mut;
|
||||
use futures::StreamExt;
|
||||
use hashbrown::HashMap;
|
||||
@@ -11,28 +8,16 @@ use hyper::body::HttpBody;
|
||||
use hyper::http::HeaderName;
|
||||
use hyper::http::HeaderValue;
|
||||
use hyper::{Body, HeaderMap, Request};
|
||||
use postgres_protocol::message::backend::DataRowBody;
|
||||
use postgres_protocol::message::backend::ReadyForQueryBody;
|
||||
use serde_json::json;
|
||||
use serde_json::Map;
|
||||
use serde_json::Value;
|
||||
use tokio::io::AsyncRead;
|
||||
use tokio::io::AsyncWrite;
|
||||
use tokio_postgres::types::Kind;
|
||||
use tokio_postgres::types::Type;
|
||||
use tokio_postgres::GenericClient;
|
||||
use tokio_postgres::IsolationLevel;
|
||||
use tokio_postgres::Row;
|
||||
use tokio_postgres::RowStream;
|
||||
use tokio_postgres::Statement;
|
||||
use url::Url;
|
||||
|
||||
use crate::pg_client;
|
||||
use crate::pg_client::codec::FrontendMessage;
|
||||
use crate::pg_client::connection;
|
||||
use crate::pg_client::connection::RequestMessages;
|
||||
use crate::pg_client::prepare::TypeinfoPreparedQueries;
|
||||
|
||||
use super::conn_pool::ConnInfo;
|
||||
use super::conn_pool::GlobalConnPool;
|
||||
|
||||
@@ -42,21 +27,26 @@ struct QueryData {
    params: Vec<serde_json::Value>,
}

#[derive(serde::Deserialize)]
struct BatchQueryData {
    queries: Vec<QueryData>,
}

#[derive(serde::Deserialize)]
#[serde(untagged)]
enum Payload {
    Single(QueryData),
    Batch(Vec<QueryData>),
    Batch(BatchQueryData),
}

pub const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB
pub const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MB
const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB

static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
static ALLOW_POOL: HeaderName = HeaderName::from_static("neon-pool-opt-in");
static TXN_ISOLATION_LEVEL: HeaderName = HeaderName::from_static("neon-batch-isolation-level");
static TXN_READ_ONLY: HeaderName = HeaderName::from_static("neon-batch-read-only");
static TXN_DEFERRABLE: HeaderName = HeaderName::from_static("neon-batch-deferrable");

static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true");

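// A sketch of the request shape these definitions imply (the endpoint path and authentication
// are outside this hunk and left unspecified). The JSON body deserializes into `Payload`:
// either a single `QueryData` or, after this change, a `BatchQueryData` wrapper; the `neon-*`
// headers above opt into raw text output, array mode, pooling, and batch transaction options.
//
//     // single statement
//     { "query": "SELECT $1::int", "params": ["1"] }
//
//     // batch of statements, run in one transaction; pair with headers such as
//     // `neon-batch-read-only: true` (isolation level values are matched further down in handle())
//     { "queries": [ { "query": "SELECT 1", "params": [] },
//                    { "query": "SELECT 2", "params": [] } ] }
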
@@ -203,9 +193,9 @@ pub async fn handle(
    let array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE);

    // Allow connection pooling only if explicitly requested
    let allow_pool = headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);
    let allow_pool = false;

    // isolation level and read only
    // isolation level, read only and deferrable

    let txn_isolation_level_raw = headers.get(&TXN_ISOLATION_LEVEL).cloned();
    let txn_isolation_level = match txn_isolation_level_raw {
@@ -219,8 +209,8 @@ pub async fn handle(
        None => None,
    };

    let txn_read_only_raw = headers.get(&TXN_READ_ONLY).cloned();
    let txn_read_only = txn_read_only_raw.as_ref() == Some(&HEADER_VALUE_TRUE);
    let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE);
    let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE);

    let request_content_length = match request.body().size_hint().upper() {
        Some(v) => v,
@@ -229,7 +219,7 @@ pub async fn handle(

    if request_content_length > MAX_REQUEST_SIZE {
        return Err(anyhow::anyhow!(
            "request is too large (max {MAX_REQUEST_SIZE} bytes)"
            "request is too large (max is {MAX_REQUEST_SIZE} bytes)"
        ));
    }

@@ -245,48 +235,54 @@ pub async fn handle(
|
||||
// Now execute the query and return the result
|
||||
//
|
||||
let result = match payload {
|
||||
Payload::Single(query) => query_raw_txt_as_json(&mut client, query, raw_output, array_mode)
|
||||
Payload::Single(query) => query_to_json(&client, query, raw_output, array_mode)
|
||||
.await
|
||||
.map(|x| (x, HashMap::default())),
|
||||
Payload::Batch(queries) => {
|
||||
Payload::Batch(batch_query) => {
|
||||
let mut results = Vec::new();
|
||||
|
||||
client
|
||||
.start_tx(txn_isolation_level, Some(txn_read_only))
|
||||
.await?;
|
||||
|
||||
for query in queries {
|
||||
let result =
|
||||
query_raw_txt_as_json(&mut client, query, raw_output, array_mode).await;
|
||||
let mut builder = client.build_transaction();
|
||||
if let Some(isolation_level) = txn_isolation_level {
|
||||
builder = builder.isolation_level(isolation_level);
|
||||
}
|
||||
if txn_read_only {
|
||||
builder = builder.read_only(true);
|
||||
}
|
||||
if txn_deferrable {
|
||||
builder = builder.deferrable(true);
|
||||
}
|
||||
let transaction = builder.start().await?;
|
||||
for query in batch_query.queries {
|
||||
let result = query_to_json(&transaction, query, raw_output, array_mode).await;
|
||||
match result {
|
||||
// TODO: check this tag to see if the client has executed a commit during the non-interactive transactions...
|
||||
Ok((r, _ready_tag)) => results.push(r),
|
||||
Ok(r) => results.push(r),
|
||||
Err(e) => {
|
||||
let tag = client.rollback().await?;
|
||||
if allow_pool && tag.status() == b'I' {
|
||||
// return connection to the pool
|
||||
tokio::task::spawn(async move {
|
||||
let _ = conn_pool.put(&conn_info, client).await;
|
||||
});
|
||||
}
|
||||
transaction.rollback().await?;
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
let ready_tag = client.commit().await?;
|
||||
transaction.commit().await?;
|
||||
let mut headers = HashMap::default();
|
||||
headers.insert(
|
||||
TXN_READ_ONLY.clone(),
|
||||
HeaderValue::try_from(txn_read_only.to_string())?,
|
||||
);
|
||||
if let Some(txn_isolation_level_raw) = txn_isolation_level_raw {
|
||||
headers.insert(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level_raw);
|
||||
if txn_read_only {
|
||||
headers.insert(
|
||||
TXN_READ_ONLY.clone(),
|
||||
HeaderValue::try_from(txn_read_only.to_string())?,
|
||||
);
|
||||
}
|
||||
Ok(((json!({ "results": results }), ready_tag), headers))
|
||||
if txn_deferrable {
|
||||
headers.insert(
|
||||
TXN_DEFERRABLE.clone(),
|
||||
HeaderValue::try_from(txn_deferrable.to_string())?,
|
||||
);
|
||||
}
|
||||
if let Some(txn_isolation_level) = txn_isolation_level_raw {
|
||||
headers.insert(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
|
||||
}
|
||||
Ok((json!({ "results": results }), headers))
|
||||
}
|
||||
};
|
||||
|
||||
if allow_pool && ready_tag.status() == b'I' {
|
||||
if allow_pool {
|
||||
// return connection to the pool
|
||||
tokio::task::spawn(async move {
|
||||
let _ = conn_pool.put(&conn_info, client).await;
|
||||
@@ -312,13 +308,15 @@ async fn query_to_json<T: GenericClient>(
    // big.
    pin_mut!(row_stream);
    let mut rows: Vec<tokio_postgres::Row> = Vec::new();
    let mut curret_size = 0;
    let mut current_size = 0;
    while let Some(row) = row_stream.next().await {
        let row = row?;
        curret_size += row.body_len();
        current_size += row.body_len();
        rows.push(row);
        if curret_size > MAX_RESPONSE_SIZE {
            return Err(anyhow::anyhow!("response too large"));
        if current_size > MAX_RESPONSE_SIZE {
            return Err(anyhow::anyhow!(
                "response is too large (max is {MAX_RESPONSE_SIZE} bytes)"
            ));
        }
    }

@@ -371,99 +369,6 @@ async fn query_to_json<T: GenericClient>(
|
||||
}))
|
||||
}
|
||||
|
||||
async fn query_raw_txt_as_json<'a, St, T>(
|
||||
conn: &mut connection::Connection<St, T>,
|
||||
data: QueryData,
|
||||
raw_output: bool,
|
||||
array_mode: bool,
|
||||
) -> anyhow::Result<(Value, ReadyForQueryBody)>
|
||||
where
|
||||
St: AsyncRead + AsyncWrite + Unpin + Send,
|
||||
T: AsyncRead + AsyncWrite + Unpin + Send,
|
||||
{
|
||||
let params = json_to_pg_text(data.params)?;
|
||||
let params = params.into_iter();
|
||||
|
||||
let stmt_name = conn.statement_name();
|
||||
let row_description = conn.prepare(&stmt_name, &data.query).await?;
|
||||
|
||||
let mut fields = vec![];
|
||||
let mut columns = vec![];
|
||||
let mut it = row_description.fields();
|
||||
while let Some(field) = it.next().map_err(pg_client::error::Error::parse)? {
|
||||
fields.push(json!({
|
||||
"name": Value::String(field.name().to_owned()),
|
||||
"dataTypeID": Value::Number(field.type_oid().into()),
|
||||
"tableID": field.table_oid(),
|
||||
"columnID": field.column_id(),
|
||||
"dataTypeSize": field.type_size(),
|
||||
"dataTypeModifier": field.type_modifier(),
|
||||
"format": "text",
|
||||
}));
|
||||
|
||||
let type_ = match Type::from_oid(field.type_oid()) {
|
||||
Some(t) => t,
|
||||
None => TypeinfoPreparedQueries::get_type(conn, field.type_oid()).await?,
|
||||
};
|
||||
|
||||
columns.push(Column {
|
||||
name: field.name().to_string(),
|
||||
type_,
|
||||
});
|
||||
}
|
||||
|
||||
conn.execute("", &stmt_name, params)?;
|
||||
conn.sync().await?;
|
||||
|
||||
let mut rows = vec![];
|
||||
|
||||
let mut row_stream = conn.stream_query_results().await?;
|
||||
|
||||
let mut curret_size = 0;
|
||||
while let Some(row) = row_stream.next().await.transpose()? {
|
||||
// let row = row.map_err(Error::db)?;
|
||||
|
||||
curret_size += row.buffer().len();
|
||||
if curret_size > MAX_RESPONSE_SIZE {
|
||||
return Err(anyhow::anyhow!("response too large"));
|
||||
}
|
||||
|
||||
rows.push(pg_text_row_to_json2(&row, &columns, raw_output, array_mode).unwrap());
|
||||
}
|
||||
|
||||
let command_tag = row_stream.tag();
|
||||
let command_tag = command_tag.tag()?;
|
||||
let mut command_tag_split = command_tag.split(' ');
|
||||
let command_tag_name = command_tag_split.next().unwrap_or_default();
|
||||
let command_tag_count = if command_tag_name == "INSERT" {
|
||||
// INSERT returns OID first and then number of rows
|
||||
command_tag_split.nth(1)
|
||||
} else {
|
||||
// other commands return number of rows (if any)
|
||||
command_tag_split.next()
|
||||
}
|
||||
.and_then(|s| s.parse::<i64>().ok());
|
||||
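// For illustration: a command tag of "INSERT 0 5" yields a rowCount of 5, "UPDATE 3" yields 3,
// and a tag with no trailing count (e.g. "BEGIN") yields None (standard Postgres tag format;
// the concrete values here are examples, not taken from the diff).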
|
||||
let ready_tag = conn.wait_for_ready().await?;
|
||||
|
||||
// resulting JSON format is based on the format of node-postgres result
|
||||
Ok((
|
||||
json!({
|
||||
"command": command_tag_name,
|
||||
"rowCount": command_tag_count,
|
||||
"rows": rows,
|
||||
"fields": fields,
|
||||
"rowAsArray": array_mode,
|
||||
}),
|
||||
ready_tag,
|
||||
))
|
||||
}
|
||||
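// For reference, the "node-postgres result" shape assembled above (illustrative values only,
// shown as produced with raw text output enabled):
//
//     {
//       "command": "SELECT",
//       "rowCount": 1,
//       "rows": [ { "one": "1" } ],
//       "fields": [ { "name": "one", "dataTypeID": 23, "tableID": 0, "columnID": 0,
//                     "dataTypeSize": 4, "dataTypeModifier": -1, "format": "text" } ],
//       "rowAsArray": false
//     }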
|
||||
struct Column {
|
||||
name: String,
|
||||
type_: Type,
|
||||
}
|
||||
|
||||
//
|
||||
// Convert postgres row with text-encoded values to JSON object
|
||||
//
|
||||
@@ -483,7 +388,7 @@ pub fn pg_text_row_to_json(
|
||||
} else {
|
||||
pg_text_to_json(pg_value, column.type_())?
|
||||
};
|
||||
Ok((name, json_value))
|
||||
Ok((name.to_string(), json_value))
|
||||
});
|
||||
|
||||
if array_mode {
|
||||
@@ -493,55 +398,7 @@ pub fn pg_text_row_to_json(
|
||||
.collect::<Result<Vec<Value>, anyhow::Error>>()?;
|
||||
Ok(Value::Array(arr))
|
||||
} else {
|
||||
let obj = iter
|
||||
.map(|r| r.map(|(key, val)| (key.to_owned(), val)))
|
||||
.collect::<Result<Map<String, Value>, anyhow::Error>>()?;
|
||||
Ok(Value::Object(obj))
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Convert postgres row with text-encoded values to JSON object
|
||||
//
|
||||
fn pg_text_row_to_json2(
|
||||
row: &DataRowBody,
|
||||
columns: &[Column],
|
||||
raw_output: bool,
|
||||
array_mode: bool,
|
||||
) -> Result<Value, anyhow::Error> {
|
||||
let ranges: Vec<Option<std::ops::Range<usize>>> = row.ranges().collect()?;
|
||||
let iter = std::iter::zip(ranges, columns)
|
||||
.enumerate()
|
||||
.map(|(i, (range, column))| {
|
||||
let name = &column.name;
|
||||
let pg_value = range
|
||||
.map(|r| {
|
||||
std::str::from_utf8(&row.buffer()[r])
|
||||
.map_err(|e| pg_client::error::Error::from_sql(e.into(), i))
|
||||
})
|
||||
.transpose()?;
|
||||
// let pg_value = row.as_text(i)?;
|
||||
let json_value = if raw_output {
|
||||
match pg_value {
|
||||
Some(v) => Value::String(v.to_string()),
|
||||
None => Value::Null,
|
||||
}
|
||||
} else {
|
||||
pg_text_to_json(pg_value, &column.type_)?
|
||||
};
|
||||
Ok((name, json_value))
|
||||
});
|
||||
|
||||
if array_mode {
|
||||
// drop keys and aggregate into array
|
||||
let arr = iter
|
||||
.map(|r| r.map(|(_key, val)| val))
|
||||
.collect::<Result<Vec<Value>, anyhow::Error>>()?;
|
||||
Ok(Value::Array(arr))
|
||||
} else {
|
||||
let obj = iter
|
||||
.map(|r| r.map(|(key, val)| (key.to_owned(), val)))
|
||||
.collect::<Result<Map<String, Value>, anyhow::Error>>()?;
|
||||
let obj = iter.collect::<Result<Map<String, Value>, anyhow::Error>>()?;
|
||||
Ok(Value::Object(obj))
|
||||
}
|
||||
}
|
||||
@@ -552,16 +409,16 @@ fn pg_text_row_to_json2(
pub fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result<Value, anyhow::Error> {
    if let Some(val) = pg_value {
        if let Kind::Array(elem_type) = pg_type.kind() {
            return pg_array_parse(val, &elem_type);
            return pg_array_parse(val, elem_type);
        }

        match pg_type {
            &Type::BOOL => Ok(Value::Bool(val == "t")),
            &Type::INT2 | &Type::INT4 => {
        match *pg_type {
            Type::BOOL => Ok(Value::Bool(val == "t")),
            Type::INT2 | Type::INT4 => {
                let val = val.parse::<i32>()?;
                Ok(Value::Number(serde_json::Number::from(val)))
            }
            &Type::FLOAT4 | &Type::FLOAT8 => {
            Type::FLOAT4 | Type::FLOAT8 => {
                let fval = val.parse::<f64>()?;
                let num = serde_json::Number::from_f64(fval);
                if let Some(num) = num {
@@ -573,7 +430,7 @@ pub fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result<Value,
                    Ok(Value::String(val.to_string()))
                }
            }
            &Type::JSON | &Type::JSONB => Ok(serde_json::from_str(val)?),
            Type::JSON | Type::JSONB => Ok(serde_json::from_str(val)?),
            _ => Ok(Value::String(val.to_string())),
        }
    } else {

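// A few concrete conversions implied by the match above (illustrative inputs; the right-hand
// side is the JSON value the text-encoded Postgres value maps to):
//
//     pg_text_to_json(Some("t"), &Type::BOOL)          -> json!(true)
//     pg_text_to_json(Some("42"), &Type::INT4)         -> json!(42)
//     pg_text_to_json(Some("2.5"), &Type::FLOAT8)      -> json!(2.5)
//     pg_text_to_json(Some("{\"a\":1}"), &Type::JSONB) -> json!({"a": 1})
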
@@ -187,12 +187,16 @@ async fn ws_handler(
|
||||
let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
|
||||
.map_err(|e| ApiError::BadRequest(e.into()))?;
|
||||
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = serve_websocket(websocket, config, &cancel_map, session_id, host).await
|
||||
{
|
||||
error!(session_id = ?session_id, "error in websocket connection: {e:?}");
|
||||
tokio::spawn(
|
||||
async move {
|
||||
if let Err(e) =
|
||||
serve_websocket(websocket, config, &cancel_map, session_id, host).await
|
||||
{
|
||||
error!(session_id = ?session_id, "error in websocket connection: {e:#}");
|
||||
}
|
||||
}
|
||||
});
|
||||
.in_current_span(),
|
||||
);
|
||||
|
||||
// Return the response so the spawned future can continue.
|
||||
Ok(response)
|
||||
@@ -217,6 +221,10 @@ async fn ws_handler(
|
||||
},
|
||||
None => Value::Null,
|
||||
};
|
||||
error!(
|
||||
?code,
|
||||
"sql-over-http per-client task finished with an error: {e:#}"
|
||||
);
|
||||
(
|
||||
json!({ "message": message, "code": code }),
|
||||
HashMap::default(),
|
||||
|
||||
@@ -22,7 +22,6 @@ pub mod scram;
|
||||
pub mod stream;
|
||||
pub mod url;
|
||||
pub mod waiters;
|
||||
pub mod pg_client;
|
||||
|
||||
/// Handle unix signals appropriately.
|
||||
pub async fn handle_signals(token: CancellationToken) -> anyhow::Result<Infallible> {
|
||||
|
||||
@@ -11,7 +11,6 @@ const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
|
||||
|
||||
const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
|
||||
|
||||
///
|
||||
/// Key that uniquely identifies the object this metric describes.
|
||||
/// Currently, endpoint_id is enough, but this may change later,
|
||||
/// so keep it in a named struct.
|
||||
@@ -19,8 +18,7 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
|
||||
/// Both the proxy and the ingestion endpoint will live in the same region (or cell)
|
||||
/// so while the project-id is unique across regions the whole pipeline will work correctly
|
||||
/// because we enrich the event with project_id in the control-plane endpoint.
|
||||
///
|
||||
#[derive(Eq, Hash, PartialEq, Serialize, Debug)]
|
||||
#[derive(Eq, Hash, PartialEq, Serialize, Debug, Clone)]
|
||||
pub struct Ids {
|
||||
pub endpoint_id: String,
|
||||
pub branch_id: String,
|
||||
@@ -149,7 +147,7 @@ async fn collect_metrics_iteration(
|
||||
stop_time: *curr_time,
|
||||
},
|
||||
metric: PROXY_IO_BYTES_PER_CLIENT,
|
||||
idempotency_key: idempotency_key(hostname.to_owned()),
|
||||
idempotency_key: idempotency_key(hostname),
|
||||
value,
|
||||
extra: Ids {
|
||||
endpoint_id: curr_key.endpoint_id.clone(),
|
||||
@@ -167,12 +165,11 @@ async fn collect_metrics_iteration(
|
||||
// Send metrics.
|
||||
// Split into chunks of 1000 metrics to avoid exceeding the max request size
|
||||
for chunk in metrics_to_send.chunks(CHUNK_SIZE) {
|
||||
let chunk_json = serde_json::value::to_raw_value(&EventChunk { events: chunk })
|
||||
.expect("ProxyConsumptionMetric should not fail serialization");
|
||||
|
||||
let res = client
|
||||
.post(metric_collection_endpoint.clone())
|
||||
.json(&chunk_json)
|
||||
.json(&EventChunk {
|
||||
events: chunk.into(),
|
||||
})
|
||||
.send()
|
||||
.await;
|
||||
|
||||
|
||||
@@ -1,43 +0,0 @@
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use fallible_iterator::FallibleIterator;
|
||||
use postgres_protocol::message::backend::{self, Message};
|
||||
use std::io;
|
||||
use tokio_util::codec::{Decoder, Encoder};
|
||||
|
||||
pub struct FrontendMessage(pub Bytes);
|
||||
pub struct BackendMessages(pub BytesMut);
|
||||
|
||||
impl BackendMessages {
|
||||
pub fn empty() -> BackendMessages {
|
||||
BackendMessages(BytesMut::new())
|
||||
}
|
||||
}
|
||||
|
||||
impl FallibleIterator for BackendMessages {
|
||||
type Item = backend::Message;
|
||||
type Error = io::Error;
|
||||
|
||||
fn next(&mut self) -> io::Result<Option<backend::Message>> {
|
||||
backend::Message::parse(&mut self.0)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PostgresCodec;
|
||||
|
||||
impl Encoder<FrontendMessage> for PostgresCodec {
|
||||
type Error = io::Error;
|
||||
|
||||
fn encode(&mut self, item: FrontendMessage, dst: &mut BytesMut) -> io::Result<()> {
|
||||
dst.extend_from_slice(&item.0);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Decoder for PostgresCodec {
|
||||
type Item = Message;
|
||||
type Error = io::Error;
|
||||
|
||||
fn decode(&mut self, src: &mut BytesMut) -> Result<Option<Message>, io::Error> {
|
||||
Message::parse(src)
|
||||
}
|
||||
}
|
||||
@@ -1,369 +0,0 @@
|
||||
use super::codec::{BackendMessages, FrontendMessage, PostgresCodec};
|
||||
use super::error::Error;
|
||||
use super::prepare::TypeinfoPreparedQueries;
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use futures::channel::mpsc;
|
||||
use futures::{Sink, StreamExt};
|
||||
use futures::{SinkExt, Stream};
|
||||
use hashbrown::HashMap;
|
||||
use postgres_protocol::message::backend::{
|
||||
BackendKeyDataBody, CommandCompleteBody, DataRowBody, ErrorResponseBody, Message,
|
||||
ReadyForQueryBody, RowDescriptionBody,
|
||||
};
|
||||
use postgres_protocol::message::frontend;
|
||||
use postgres_protocol::Oid;
|
||||
use std::collections::VecDeque;
|
||||
use std::future::poll_fn;
|
||||
use std::pin::Pin;
|
||||
use std::task::{ready, Context, Poll};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio_postgres::maybe_tls_stream::MaybeTlsStream;
|
||||
use tokio_postgres::types::Type;
|
||||
use tokio_postgres::IsolationLevel;
|
||||
use tokio_util::codec::Framed;
|
||||
|
||||
pub enum RequestMessages {
|
||||
Single(FrontendMessage),
|
||||
}
|
||||
|
||||
pub struct Request {
|
||||
pub messages: RequestMessages,
|
||||
pub sender: mpsc::Sender<BackendMessages>,
|
||||
}
|
||||
|
||||
pub struct Response {
|
||||
sender: mpsc::Sender<BackendMessages>,
|
||||
}
|
||||
|
||||
/// A connection to a PostgreSQL database.
|
||||
pub struct RawConnection<S, T> {
|
||||
stream: Framed<MaybeTlsStream<S, T>, PostgresCodec>,
|
||||
pending_responses: VecDeque<Message>,
|
||||
pub buf: BytesMut,
|
||||
}
|
||||
|
||||
impl<S: AsyncRead + AsyncWrite + Unpin, T: AsyncRead + AsyncWrite + Unpin> RawConnection<S, T> {
|
||||
pub fn new(
|
||||
stream: Framed<MaybeTlsStream<S, T>, PostgresCodec>,
|
||||
buf: BytesMut,
|
||||
) -> RawConnection<S, T> {
|
||||
RawConnection {
|
||||
stream,
|
||||
pending_responses: VecDeque::new(),
|
||||
buf,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn send(&mut self) -> Result<(), Error> {
|
||||
poll_fn(|cx| self.poll_send(cx)).await?;
|
||||
let request = FrontendMessage(self.buf.split().freeze());
|
||||
self.stream.start_send_unpin(request).map_err(Error::io)?;
|
||||
poll_fn(|cx| self.poll_flush(cx)).await
|
||||
}
|
||||
|
||||
pub async fn next_message(&mut self) -> Result<Message, Error> {
|
||||
match self.pending_responses.pop_front() {
|
||||
Some(message) => Ok(message),
|
||||
None => poll_fn(|cx| self.poll_read(cx)).await,
|
||||
}
|
||||
}
|
||||
|
||||
fn poll_read(&mut self, cx: &mut Context<'_>) -> Poll<Result<Message, Error>> {
|
||||
let message = match ready!(self.stream.poll_next_unpin(cx)?) {
|
||||
Some(message) => message,
|
||||
None => return Poll::Ready(Err(Error::closed())),
|
||||
};
|
||||
Poll::Ready(Ok(message))
|
||||
}
|
||||
|
||||
fn poll_shutdown(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), Error>> {
|
||||
Pin::new(&mut self.stream).poll_close(cx).map_err(Error::io)
|
||||
}
|
||||
|
||||
fn poll_send(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), Error>> {
|
||||
if let Poll::Ready(msg) = self.poll_read(cx)? {
|
||||
self.pending_responses.push_back(msg);
|
||||
};
|
||||
self.stream.poll_ready_unpin(cx).map_err(Error::io)
|
||||
}
|
||||
|
||||
fn poll_flush(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), Error>> {
|
||||
if let Poll::Ready(msg) = self.poll_read(cx)? {
|
||||
self.pending_responses.push_back(msg);
|
||||
};
|
||||
self.stream.poll_flush_unpin(cx).map_err(Error::io)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Connection<S, T> {
|
||||
stmt_counter: usize,
|
||||
pub typeinfo: Option<TypeinfoPreparedQueries>,
|
||||
pub typecache: HashMap<Oid, Type>,
|
||||
pub raw: RawConnection<S, T>,
|
||||
// key: BackendKeyDataBody,
|
||||
}
|
||||
|
||||
impl<S: AsyncRead + AsyncWrite + Unpin, T: AsyncRead + AsyncWrite + Unpin> Connection<S, T> {
|
||||
pub fn new(stream: MaybeTlsStream<S, T>) -> Connection<S, T> {
|
||||
Connection {
|
||||
stmt_counter: 0,
|
||||
typeinfo: None,
|
||||
typecache: HashMap::new(),
|
||||
raw: RawConnection::new(Framed::new(stream, PostgresCodec), BytesMut::new()),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn start_tx(
|
||||
&mut self,
|
||||
isolation_level: Option<IsolationLevel>,
|
||||
read_only: Option<bool>,
|
||||
) -> Result<ReadyForQueryBody, Error> {
|
||||
let mut query = "START TRANSACTION".to_string();
|
||||
let mut first = true;
|
||||
|
||||
if let Some(level) = isolation_level {
|
||||
first = false;
|
||||
|
||||
query.push_str(" ISOLATION LEVEL ");
|
||||
let level = match level {
|
||||
IsolationLevel::ReadUncommitted => "READ UNCOMMITTED",
|
||||
IsolationLevel::ReadCommitted => "READ COMMITTED",
|
||||
IsolationLevel::RepeatableRead => "REPEATABLE READ",
|
||||
IsolationLevel::Serializable => "SERIALIZABLE",
|
||||
_ => return Err(Error::unexpected_message()),
|
||||
};
|
||||
query.push_str(level);
|
||||
}
|
||||
|
||||
if let Some(read_only) = read_only {
|
||||
if !first {
|
||||
query.push(',');
|
||||
}
|
||||
first = false;
|
||||
|
||||
let s = if read_only {
|
||||
" READ ONLY"
|
||||
} else {
|
||||
" READ WRITE"
|
||||
};
|
||||
query.push_str(s);
|
||||
}
|
||||
|
||||
self.execute_simple(&query).await
|
||||
}
|
||||
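// For illustration only: the query string assembled above ends up as plain SQL such as
//   START TRANSACTION
//   START TRANSACTION ISOLATION LEVEL SERIALIZABLE
//   START TRANSACTION ISOLATION LEVEL REPEATABLE READ, READ ONLY
// depending on which options were supplied.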
|
||||
pub async fn rollback(&mut self) -> Result<ReadyForQueryBody, Error> {
|
||||
self.execute_simple("ROLLBACK").await
|
||||
}
|
||||
|
||||
pub async fn commit(&mut self) -> Result<ReadyForQueryBody, Error> {
|
||||
self.execute_simple("COMMIT").await
|
||||
}
|
||||
|
||||
// pub async fn auth_sasl_scram<'a, I>(
|
||||
// mut raw: RawConnection<S, T>,
|
||||
// params: I,
|
||||
// password: &[u8],
|
||||
// ) -> Result<Self, Error>
|
||||
// where
|
||||
// I: IntoIterator<Item = (&'a str, &'a str)>,
|
||||
// {
|
||||
// // send a startup message
|
||||
// frontend::startup_message(params, &mut raw.buf).unwrap();
|
||||
// raw.send().await?;
|
||||
|
||||
// // expect sasl authentication message
|
||||
// let Message::AuthenticationSasl(body) = raw.next_message().await? else { return Err(Error::expecting("sasl authentication")) };
|
||||
// // expect support for SCRAM_SHA_256
|
||||
// if body
|
||||
// .mechanisms()
|
||||
// .find(|&x| Ok(x == authentication::sasl::SCRAM_SHA_256))?
|
||||
// .is_none()
|
||||
// {
|
||||
// return Err(Error::expecting("SCRAM-SHA-256 auth"));
|
||||
// }
|
||||
|
||||
// // initiate SCRAM_SHA_256 authentication without channel binding
|
||||
// let auth = authentication::sasl::ChannelBinding::unrequested();
|
||||
// let mut scram = authentication::sasl::ScramSha256::new(password, auth);
|
||||
|
||||
// frontend::sasl_initial_response(
|
||||
// authentication::sasl::SCRAM_SHA_256,
|
||||
// scram.message(),
|
||||
// &mut raw.buf,
|
||||
// )
|
||||
// .unwrap();
|
||||
// raw.send().await?;
|
||||
|
||||
// // expect sasl continue
|
||||
// let Message::AuthenticationSaslContinue(b) = raw.next_message().await? else { return Err(Error::expecting("auth continue")) };
|
||||
// scram.update(b.data()).unwrap();
|
||||
|
||||
// // continue sasl
|
||||
// frontend::sasl_response(scram.message(), &mut raw.buf).unwrap();
|
||||
// raw.send().await?;
|
||||
|
||||
// // expect sasl final
|
||||
// let Message::AuthenticationSaslFinal(b) = raw.next_message().await? else { return Err(Error::expecting("auth final")) };
|
||||
// scram.finish(b.data()).unwrap();
|
||||
|
||||
// // expect auth ok
|
||||
// let Message::AuthenticationOk = raw.next_message().await? else { return Err(Error::expecting("auth ok")) };
|
||||
|
||||
// // expect connection accepted
|
||||
// let key = loop {
|
||||
// match raw.next_message().await? {
|
||||
// Message::BackendKeyData(key) => break key,
|
||||
// Message::ParameterStatus(_) => {}
|
||||
// _ => return Err(Error::expecting("backend ready")),
|
||||
// }
|
||||
// };
|
||||
|
||||
// let Message::ReadyForQuery(b) = raw.next_message().await? else { return Err(Error::expecting("ready for query")) };
|
||||
// // assert_eq!(b.status(), b'I');
|
||||
|
||||
// Ok(Self { raw, key })
|
||||
// }
|
||||
|
||||
// pub fn prepare_and_execute(
|
||||
// &mut self,
|
||||
// portal: &str,
|
||||
// name: &str,
|
||||
// query: &str,
|
||||
// params: impl IntoIterator<Item = Option<impl AsRef<str>>>,
|
||||
// ) -> std::io::Result<()> {
|
||||
// self.prepare(name, query)?;
|
||||
// self.execute(portal, name, params)
|
||||
// }
|
||||
|
||||
pub fn statement_name(&mut self) -> String {
|
||||
self.stmt_counter += 1;
|
||||
format!("s{}", self.stmt_counter)
|
||||
}
|
||||
|
||||
async fn execute_simple(&mut self, query: &str) -> Result<ReadyForQueryBody, Error> {
|
||||
frontend::query(query, &mut self.raw.buf)?;
|
||||
self.raw.send().await?;
|
||||
|
||||
loop {
|
||||
match self.raw.next_message().await? {
|
||||
Message::ReadyForQuery(q) => return Ok(q),
|
||||
Message::CommandComplete(_)
|
||||
| Message::EmptyQueryResponse
|
||||
| Message::RowDescription(_)
|
||||
| Message::DataRow(_) => {}
|
||||
_ => return Err(Error::unexpected_message()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn prepare(&mut self, name: &str, query: &str) -> Result<RowDescriptionBody, Error> {
|
||||
frontend::parse(name, query, std::iter::empty(), &mut self.raw.buf)?;
|
||||
frontend::describe(b'S', name, &mut self.raw.buf)?;
|
||||
self.sync().await?;
|
||||
self.wait_for_prepare().await
|
||||
}
|
||||
|
||||
pub fn execute(
|
||||
&mut self,
|
||||
portal: &str,
|
||||
name: &str,
|
||||
params: impl IntoIterator<Item = Option<impl AsRef<str>>>,
|
||||
) -> std::io::Result<()> {
|
||||
frontend::bind(
|
||||
portal,
|
||||
name,
|
||||
std::iter::empty(), // all parameters use the default format (text)
|
||||
params,
|
||||
|param, buf| match param {
|
||||
Some(param) => {
|
||||
buf.put_slice(param.as_ref().as_bytes());
|
||||
Ok(postgres_protocol::IsNull::No)
|
||||
}
|
||||
None => Ok(postgres_protocol::IsNull::Yes),
|
||||
},
|
||||
Some(0), // all text
|
||||
&mut self.raw.buf,
|
||||
)
|
||||
.map_err(|e| match e {
|
||||
frontend::BindError::Conversion(e) => std::io::Error::new(std::io::ErrorKind::Other, e),
|
||||
frontend::BindError::Serialization(io) => io,
|
||||
})?;
|
||||
frontend::execute(portal, 0, &mut self.raw.buf)
|
||||
}
|
||||
|
||||
pub async fn sync(&mut self) -> Result<(), Error> {
|
||||
frontend::sync(&mut self.raw.buf);
|
||||
self.raw.send().await
|
||||
}
|
||||
|
||||
pub async fn wait_for_prepare(&mut self) -> Result<RowDescriptionBody, Error> {
|
||||
let Message::ParseComplete = self.raw.next_message().await? else { return Err(Error::expecting("parse")) };
|
||||
let Message::ParameterDescription(_) = self.raw.next_message().await? else { return Err(Error::expecting("param description")) };
|
||||
let Message::RowDescription(desc) = self.raw.next_message().await? else { return Err(Error::expecting("row description")) };
|
||||
|
||||
self.wait_for_ready().await?;
|
||||
|
||||
Ok(desc)
|
||||
}
|
||||
|
||||
pub async fn stream_query_results(&mut self) -> Result<RowStream<'_, S, T>, Error> {
|
||||
// let Message::ParseComplete = self.raw.next_message().await? else { return Err(Error::expecting("parse")) };
|
||||
let Message::BindComplete = self.raw.next_message().await? else { return Err(Error::expecting("bind")) };
|
||||
Ok(RowStream::Stream(&mut self.raw))
|
||||
}
|
||||
|
||||
pub async fn wait_for_ready(&mut self) -> Result<ReadyForQueryBody, Error> {
|
||||
loop {
|
||||
match self.raw.next_message().await.unwrap() {
|
||||
Message::ReadyForQuery(b) => break Ok(b),
|
||||
_ => continue,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub enum RowStream<'a, S, T> {
|
||||
Stream(&'a mut RawConnection<S, T>),
|
||||
Complete(Option<CommandCompleteBody>),
|
||||
}
|
||||
impl<S, T> Unpin for RowStream<'_, S, T> {}
|
||||
|
||||
impl<S: AsyncRead + AsyncWrite + Unpin, T: AsyncRead + AsyncWrite + Unpin> Stream
|
||||
for RowStream<'_, S, T>
|
||||
{
|
||||
    // this is horrible - the first result is for transport/protocol errors,
    // the second result is for sql errors.
|
||||
type Item = Result<Result<DataRowBody, ErrorResponseBody>, Error>;
|
||||
|
||||
fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
|
||||
match &mut *self {
|
||||
RowStream::Stream(raw) => match ready!(raw.poll_read(cx)?) {
|
||||
Message::DataRow(row) => Poll::Ready(Some(Ok(Ok(row)))),
|
||||
Message::CommandComplete(tag) => {
|
||||
*self = Self::Complete(Some(tag));
|
||||
Poll::Ready(None)
|
||||
}
|
||||
Message::EmptyQueryResponse | Message::PortalSuspended => {
|
||||
*self = Self::Complete(None);
|
||||
Poll::Ready(None)
|
||||
}
|
||||
Message::ErrorResponse(error) => {
|
||||
*self = Self::Complete(None);
|
||||
Poll::Ready(Some(Ok(Err(error))))
|
||||
}
|
||||
_ => Poll::Ready(Some(Err(Error::expecting("command completion")))),
|
||||
},
|
||||
RowStream::Complete(_) => Poll::Ready(None),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<S, T> RowStream<'_, S, T> {
|
||||
pub fn tag(self) -> Option<CommandCompleteBody> {
|
||||
match self {
|
||||
RowStream::Stream(_) => panic!("should not get tag unless row stream is exhausted"),
|
||||
RowStream::Complete(tag) => tag,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,447 +0,0 @@
|
||||
use std::{error, fmt, io};
|
||||
|
||||
use fallible_iterator::FallibleIterator;
|
||||
use postgres_protocol::message::backend::{ErrorFields, ErrorResponseBody};
|
||||
use tokio_native_tls::native_tls;
|
||||
use tokio_postgres::error::{ErrorPosition, SqlState};
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
enum Kind {
|
||||
Io,
|
||||
Tls,
|
||||
UnexpectedMessage,
|
||||
FromSql(usize),
|
||||
Closed,
|
||||
Db,
|
||||
Parse,
|
||||
Encode,
|
||||
}
|
||||
|
||||
struct ErrorInner {
|
||||
kind: Kind,
|
||||
cause: Option<Box<dyn error::Error + Sync + Send>>,
|
||||
}
|
||||
|
||||
/// An error communicating with the Postgres server.
|
||||
pub struct Error(ErrorInner);
|
||||
|
||||
impl fmt::Debug for Error {
|
||||
fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
fmt.debug_struct("Error")
|
||||
.field("kind", &self.0.kind)
|
||||
.field("cause", &self.0.cause)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for Error {
|
||||
fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match &self.0.kind {
|
||||
Kind::Io => fmt.write_str("error communicating with the server")?,
|
||||
Kind::Tls => fmt.write_str("error establishing tls")?,
|
||||
Kind::UnexpectedMessage => fmt.write_str("unexpected message from server")?,
|
||||
Kind::FromSql(idx) => write!(fmt, "error deserializing column {}", idx)?,
|
||||
Kind::Closed => fmt.write_str("connection closed")?,
|
||||
Kind::Db => fmt.write_str("db error")?,
|
||||
Kind::Parse => fmt.write_str("error parsing response from server")?,
|
||||
Kind::Encode => fmt.write_str("error encoding message to server")?,
|
||||
};
|
||||
if let Some(ref cause) = self.0.cause {
|
||||
write!(fmt, ": {}", cause)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl error::Error for Error {
|
||||
fn source(&self) -> Option<&(dyn error::Error + 'static)> {
|
||||
self.0.cause.as_ref().map(|e| &**e as _)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<io::Error> for Error {
|
||||
fn from(value: io::Error) -> Self {
|
||||
Self::io(value)
|
||||
}
|
||||
}
|
||||
|
||||
impl Error {
|
||||
/// Consumes the error, returning its cause.
|
||||
pub fn into_source(self) -> Option<Box<dyn error::Error + Sync + Send>> {
|
||||
self.0.cause
|
||||
}
|
||||
|
||||
/// Returns the source of this error if it was a `DbError`.
|
||||
///
|
||||
/// This is a simple convenience method.
|
||||
pub fn as_db_error(&self) -> Option<&DbError> {
|
||||
error::Error::source(self).and_then(|e| e.downcast_ref::<DbError>())
|
||||
}
|
||||
|
||||
/// Determines if the error was associated with closed connection.
|
||||
pub fn is_closed(&self) -> bool {
|
||||
self.0.kind == Kind::Closed
|
||||
}
|
||||
|
||||
/// Returns the SQLSTATE error code associated with the error.
|
||||
///
|
||||
/// This is a convenience method that downcasts the cause to a `DbError` and returns its code.
|
||||
pub fn code(&self) -> Option<&SqlState> {
|
||||
self.as_db_error().map(DbError::code)
|
||||
}
|
||||
|
||||
fn new(kind: Kind, cause: Option<Box<dyn error::Error + Sync + Send>>) -> Error {
|
||||
Error(ErrorInner { kind, cause })
|
||||
}
|
||||
|
||||
#[allow(clippy::needless_pass_by_value)]
|
||||
pub(crate) fn db(error: ErrorResponseBody) -> Error {
|
||||
match DbError::parse(&mut error.fields()) {
|
||||
Ok(e) => Error::new(Kind::Db, Some(Box::new(e))),
|
||||
Err(e) => Error::new(Kind::Parse, Some(Box::new(e))),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn from_sql(e: Box<dyn error::Error + Sync + Send>, idx: usize) -> Error {
|
||||
Error::new(Kind::FromSql(idx), Some(e))
|
||||
}
|
||||
|
||||
pub(crate) fn closed() -> Error {
|
||||
Error::new(Kind::Closed, None)
|
||||
}
|
||||
|
||||
pub(crate) fn unexpected_message() -> Error {
|
||||
Error::new(Kind::UnexpectedMessage, None)
|
||||
}
|
||||
|
||||
pub(crate) fn expecting(expected: &str) -> Error {
|
||||
Error::new(Kind::UnexpectedMessage, Some(expected.into()))
|
||||
}
|
||||
|
||||
pub(crate) fn parse(e: io::Error) -> Error {
|
||||
Error::new(Kind::Parse, Some(Box::new(e)))
|
||||
}
|
||||
|
||||
pub(crate) fn encode(e: io::Error) -> Error {
|
||||
Error::new(Kind::Encode, Some(Box::new(e)))
|
||||
}
|
||||
|
||||
pub(crate) fn io(e: io::Error) -> Error {
|
||||
Error::new(Kind::Io, Some(Box::new(e)))
|
||||
}
|
||||
|
||||
pub(crate) fn tls(e: native_tls::Error) -> Error {
|
||||
Error::new(Kind::Tls, Some(Box::new(e)))
|
||||
}
|
||||
}
|
||||
|
||||
/// The severity of a Postgres error or notice.
|
||||
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
|
||||
pub enum Severity {
|
||||
/// PANIC
|
||||
Panic,
|
||||
/// FATAL
|
||||
Fatal,
|
||||
/// ERROR
|
||||
Error,
|
||||
/// WARNING
|
||||
Warning,
|
||||
/// NOTICE
|
||||
Notice,
|
||||
/// DEBUG
|
||||
Debug,
|
||||
/// INFO
|
||||
Info,
|
||||
/// LOG
|
||||
Log,
|
||||
}
|
||||
|
||||
impl fmt::Display for Severity {
|
||||
fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let s = match *self {
|
||||
Severity::Panic => "PANIC",
|
||||
Severity::Fatal => "FATAL",
|
||||
Severity::Error => "ERROR",
|
||||
Severity::Warning => "WARNING",
|
||||
Severity::Notice => "NOTICE",
|
||||
Severity::Debug => "DEBUG",
|
||||
Severity::Info => "INFO",
|
||||
Severity::Log => "LOG",
|
||||
};
|
||||
fmt.write_str(s)
|
||||
}
|
||||
}
|
||||
|
||||
impl Severity {
|
||||
fn from_str(s: &str) -> Option<Severity> {
|
||||
match s {
|
||||
"PANIC" => Some(Severity::Panic),
|
||||
"FATAL" => Some(Severity::Fatal),
|
||||
"ERROR" => Some(Severity::Error),
|
||||
"WARNING" => Some(Severity::Warning),
|
||||
"NOTICE" => Some(Severity::Notice),
|
||||
"DEBUG" => Some(Severity::Debug),
|
||||
"INFO" => Some(Severity::Info),
|
||||
"LOG" => Some(Severity::Log),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A Postgres error or notice.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct DbError {
|
||||
severity: String,
|
||||
parsed_severity: Option<Severity>,
|
||||
code: SqlState,
|
||||
message: String,
|
||||
detail: Option<String>,
|
||||
hint: Option<String>,
|
||||
position: Option<ErrorPosition>,
|
||||
where_: Option<String>,
|
||||
schema: Option<String>,
|
||||
table: Option<String>,
|
||||
column: Option<String>,
|
||||
datatype: Option<String>,
|
||||
constraint: Option<String>,
|
||||
file: Option<String>,
|
||||
line: Option<u32>,
|
||||
routine: Option<String>,
|
||||
}
|
||||
|
||||
impl DbError {
|
||||
pub(crate) fn parse(fields: &mut ErrorFields<'_>) -> io::Result<DbError> {
|
||||
let mut severity = None;
|
||||
let mut parsed_severity = None;
|
||||
let mut code = None;
|
||||
let mut message = None;
|
||||
let mut detail = None;
|
||||
let mut hint = None;
|
||||
let mut normal_position = None;
|
||||
let mut internal_position = None;
|
||||
let mut internal_query = None;
|
||||
let mut where_ = None;
|
||||
let mut schema = None;
|
||||
let mut table = None;
|
||||
let mut column = None;
|
||||
let mut datatype = None;
|
||||
let mut constraint = None;
|
||||
let mut file = None;
|
||||
let mut line = None;
|
||||
let mut routine = None;
|
||||
|
||||
while let Some(field) = fields.next()? {
|
||||
match field.type_() {
|
||||
b'S' => severity = Some(field.value().to_owned()),
|
||||
b'C' => code = Some(SqlState::from_code(field.value())),
|
||||
b'M' => message = Some(field.value().to_owned()),
|
||||
b'D' => detail = Some(field.value().to_owned()),
|
||||
b'H' => hint = Some(field.value().to_owned()),
|
||||
b'P' => {
|
||||
normal_position = Some(field.value().parse::<u32>().map_err(|_| {
|
||||
io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"`P` field did not contain an integer",
|
||||
)
|
||||
})?);
|
||||
}
|
||||
b'p' => {
|
||||
internal_position = Some(field.value().parse::<u32>().map_err(|_| {
|
||||
io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"`p` field did not contain an integer",
|
||||
)
|
||||
})?);
|
||||
}
|
||||
b'q' => internal_query = Some(field.value().to_owned()),
|
||||
b'W' => where_ = Some(field.value().to_owned()),
|
||||
b's' => schema = Some(field.value().to_owned()),
|
||||
b't' => table = Some(field.value().to_owned()),
|
||||
b'c' => column = Some(field.value().to_owned()),
|
||||
b'd' => datatype = Some(field.value().to_owned()),
|
||||
b'n' => constraint = Some(field.value().to_owned()),
|
||||
b'F' => file = Some(field.value().to_owned()),
|
||||
b'L' => {
|
||||
line = Some(field.value().parse::<u32>().map_err(|_| {
|
||||
io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"`L` field did not contain an integer",
|
||||
)
|
||||
})?);
|
||||
}
|
||||
b'R' => routine = Some(field.value().to_owned()),
|
||||
b'V' => {
|
||||
parsed_severity = Some(Severity::from_str(field.value()).ok_or_else(|| {
|
||||
io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"`V` field contained an invalid value",
|
||||
)
|
||||
})?);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(DbError {
|
||||
severity: severity
|
||||
.ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "`S` field missing"))?,
|
||||
parsed_severity,
|
||||
code: code
|
||||
.ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "`C` field missing"))?,
|
||||
message: message
|
||||
.ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "`M` field missing"))?,
|
||||
detail,
|
||||
hint,
|
||||
position: match normal_position {
|
||||
Some(position) => Some(ErrorPosition::Original(position)),
|
||||
None => match internal_position {
|
||||
Some(position) => Some(ErrorPosition::Internal {
|
||||
position,
|
||||
query: internal_query.ok_or_else(|| {
|
||||
io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"`q` field missing but `p` field present",
|
||||
)
|
||||
})?,
|
||||
}),
|
||||
None => None,
|
||||
},
|
||||
},
|
||||
where_,
|
||||
schema,
|
||||
table,
|
||||
column,
|
||||
datatype,
|
||||
constraint,
|
||||
file,
|
||||
line,
|
||||
routine,
|
||||
})
|
||||
}
|
||||
|
||||
/// The field contents are ERROR, FATAL, or PANIC (in an error message),
|
||||
/// or WARNING, NOTICE, DEBUG, INFO, or LOG (in a notice message), or a
|
||||
/// localized translation of one of these.
|
||||
pub fn severity(&self) -> &str {
|
||||
&self.severity
|
||||
}
|
||||
|
||||
/// A parsed, nonlocalized version of `severity`. (PostgreSQL 9.6+)
|
||||
pub fn parsed_severity(&self) -> Option<Severity> {
|
||||
self.parsed_severity
|
||||
}
|
||||
|
||||
/// The SQLSTATE code for the error.
|
||||
pub fn code(&self) -> &SqlState {
|
||||
&self.code
|
||||
}
|
||||
|
||||
/// The primary human-readable error message.
|
||||
///
|
||||
/// This should be accurate but terse (typically one line).
|
||||
pub fn message(&self) -> &str {
|
||||
&self.message
|
||||
}
|
||||
|
||||
/// An optional secondary error message carrying more detail about the
|
||||
/// problem.
|
||||
///
|
||||
/// Might run to multiple lines.
|
||||
pub fn detail(&self) -> Option<&str> {
|
||||
self.detail.as_deref()
|
||||
}
|
||||
|
||||
/// An optional suggestion what to do about the problem.
|
||||
///
|
||||
/// This is intended to differ from `detail` in that it offers advice
|
||||
/// (potentially inappropriate) rather than hard facts. Might run to
|
||||
/// multiple lines.
|
||||
pub fn hint(&self) -> Option<&str> {
|
||||
self.hint.as_deref()
|
||||
}
|
||||
|
||||
/// An optional error cursor position into either the original query string
|
||||
/// or an internally generated query.
|
||||
pub fn position(&self) -> Option<&ErrorPosition> {
|
||||
self.position.as_ref()
|
||||
}
|
||||
|
||||
/// An indication of the context in which the error occurred.
|
||||
///
|
||||
/// Presently this includes a call stack traceback of active procedural
|
||||
/// language functions and internally-generated queries. The trace is one
|
||||
/// entry per line, most recent first.
|
||||
pub fn where_(&self) -> Option<&str> {
|
||||
self.where_.as_deref()
|
||||
}
|
||||
|
||||
/// If the error was associated with a specific database object, the name
|
||||
/// of the schema containing that object, if any. (PostgreSQL 9.3+)
|
||||
pub fn schema(&self) -> Option<&str> {
|
||||
self.schema.as_deref()
|
||||
}
|
||||
|
||||
/// If the error was associated with a specific table, the name of the
|
||||
/// table. (Refer to the schema name field for the name of the table's
|
||||
/// schema.) (PostgreSQL 9.3+)
|
||||
pub fn table(&self) -> Option<&str> {
|
||||
self.table.as_deref()
|
||||
}
|
||||
|
||||
/// If the error was associated with a specific table column, the name of
|
||||
/// the column.
|
||||
///
|
||||
/// (Refer to the schema and table name fields to identify the table.)
|
||||
/// (PostgreSQL 9.3+)
|
||||
pub fn column(&self) -> Option<&str> {
|
||||
self.column.as_deref()
|
||||
}
|
||||
|
||||
/// If the error was associated with a specific data type, the name of the
|
||||
/// data type. (Refer to the schema name field for the name of the data
|
||||
/// type's schema.) (PostgreSQL 9.3+)
|
||||
pub fn datatype(&self) -> Option<&str> {
|
||||
self.datatype.as_deref()
|
||||
}
|
||||
|
||||
/// If the error was associated with a specific constraint, the name of the
|
||||
/// constraint.
|
||||
///
|
||||
/// Refer to fields listed above for the associated table or domain.
|
||||
/// (For this purpose, indexes are treated as constraints, even if they
|
||||
/// weren't created with constraint syntax.) (PostgreSQL 9.3+)
|
||||
pub fn constraint(&self) -> Option<&str> {
|
||||
self.constraint.as_deref()
|
||||
}
|
||||
|
||||
/// The file name of the source-code location where the error was reported.
|
||||
pub fn file(&self) -> Option<&str> {
|
||||
self.file.as_deref()
|
||||
}
|
||||
|
||||
/// The line number of the source-code location where the error was
|
||||
/// reported.
|
||||
pub fn line(&self) -> Option<u32> {
|
||||
self.line
|
||||
}
|
||||
|
||||
/// The name of the source-code routine reporting the error.
|
||||
pub fn routine(&self) -> Option<&str> {
|
||||
self.routine.as_deref()
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for DbError {
|
||||
fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(fmt, "{}: {}", self.severity, self.message)?;
|
||||
if let Some(detail) = &self.detail {
|
||||
write!(fmt, "\nDETAIL: {}", detail)?;
|
||||
}
|
||||
if let Some(hint) = &self.hint {
|
||||
write!(fmt, "\nHINT: {}", hint)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl error::Error for DbError {}
|
||||
@@ -1,5 +0,0 @@
|
||||
|
||||
pub mod codec;
|
||||
pub mod connection;
|
||||
pub mod error;
|
||||
pub mod prepare;
|
||||
@@ -1,293 +0,0 @@
|
||||
use fallible_iterator::FallibleIterator;
|
||||
use futures::StreamExt;
|
||||
use postgres_protocol::message::backend::{DataRowRanges, Message};
|
||||
use postgres_protocol::message::frontend;
|
||||
use std::future::Future;
|
||||
use std::pin::Pin;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio_postgres::types::{Field, Kind, Oid, ToSql, Type};
|
||||
|
||||
use super::connection::Connection;
|
||||
use super::error::Error;
|
||||
|
||||
const TYPEINFO_QUERY: &str = "\
|
||||
SELECT t.typname, t.typtype, t.typelem, r.rngsubtype, t.typbasetype, n.nspname, t.typrelid
|
||||
FROM pg_catalog.pg_type t
|
||||
LEFT OUTER JOIN pg_catalog.pg_range r ON r.rngtypid = t.oid
|
||||
INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid
|
||||
WHERE t.oid = $1
|
||||
";
|
||||
|
||||
const TYPEINFO_ENUM_QUERY: &str = "\
|
||||
SELECT enumlabel
|
||||
FROM pg_catalog.pg_enum
|
||||
WHERE enumtypid = $1
|
||||
ORDER BY enumsortorder
|
||||
";
|
||||
|
||||
const TYPEINFO_COMPOSITE_QUERY: &str = "\
|
||||
SELECT attname, atttypid
|
||||
FROM pg_catalog.pg_attribute
|
||||
WHERE attrelid = $1
|
||||
AND NOT attisdropped
|
||||
AND attnum > 0
|
||||
ORDER BY attnum
|
||||
";
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct TypeinfoPreparedQueries {
|
||||
query: String,
|
||||
enum_query: String,
|
||||
composite_query: String,
|
||||
}
|
||||
|
||||
fn map_is_null(x: tokio_postgres::types::IsNull) -> postgres_protocol::IsNull {
|
||||
match x {
|
||||
tokio_postgres::types::IsNull::Yes => postgres_protocol::IsNull::Yes,
|
||||
tokio_postgres::types::IsNull::No => postgres_protocol::IsNull::No,
|
||||
}
|
||||
}
|
||||
|
||||
fn read_column<'a, T: tokio_postgres::types::FromSql<'a>>(
|
||||
buffer: &'a [u8],
|
||||
type_: &Type,
|
||||
ranges: &mut DataRowRanges<'a>,
|
||||
) -> Result<T, Error> {
|
||||
let range = ranges.next()?;
|
||||
match range {
|
||||
Some(range) => T::from_sql_nullable(type_, range.map(|r| &buffer[r])),
|
||||
None => T::from_sql_null(type_),
|
||||
}
|
||||
.map_err(|e| Error::from_sql(e, 0))
|
||||
}
|
||||
|
||||
impl TypeinfoPreparedQueries {
|
||||
pub async fn new<
|
||||
S: AsyncRead + AsyncWrite + Unpin + Send,
|
||||
T: AsyncRead + AsyncWrite + Unpin + Send,
|
||||
>(
|
||||
c: &mut Connection<S, T>,
|
||||
) -> Result<Self, Error> {
|
||||
if let Some(ti) = &c.typeinfo {
|
||||
return Ok(ti.clone());
|
||||
}
|
||||
|
||||
let query = c.statement_name();
|
||||
let enum_query = c.statement_name();
|
||||
let composite_query = c.statement_name();
|
||||
|
||||
frontend::parse(&query, TYPEINFO_QUERY, [Type::OID.oid()], &mut c.raw.buf)?;
|
||||
frontend::parse(
|
||||
&enum_query,
|
||||
TYPEINFO_ENUM_QUERY,
|
||||
[Type::OID.oid()],
|
||||
&mut c.raw.buf,
|
||||
)?;
|
||||
c.sync().await?;
|
||||
frontend::parse(
|
||||
&composite_query,
|
||||
TYPEINFO_COMPOSITE_QUERY,
|
||||
[Type::OID.oid()],
|
||||
&mut c.raw.buf,
|
||||
)?;
|
||||
c.sync().await?;
|
||||
|
||||
let Message::ParseComplete = c.raw.next_message().await? else { return Err(Error::expecting("parse")) };
|
||||
let Message::ParseComplete = c.raw.next_message().await? else { return Err(Error::expecting("parse")) };
|
||||
let Message::ParseComplete = c.raw.next_message().await? else { return Err(Error::expecting("parse")) };
|
||||
c.wait_for_ready().await?;
|
||||
|
||||
Ok(c.typeinfo
|
||||
.insert(TypeinfoPreparedQueries {
|
||||
query,
|
||||
enum_query,
|
||||
composite_query,
|
||||
})
|
||||
.clone())
|
||||
}
|
||||
|
||||
fn get_type_rec<
|
||||
S: AsyncRead + AsyncWrite + Unpin + Send,
|
||||
T: AsyncRead + AsyncWrite + Unpin + Send,
|
||||
>(
|
||||
c: &mut Connection<S, T>,
|
||||
oid: Oid,
|
||||
) -> Pin<Box<dyn Future<Output = Result<Type, Error>> + Send + '_>> {
|
||||
Box::pin(Self::get_type(c, oid))
|
||||
}
|
||||
|
||||
pub async fn get_type<
|
||||
S: AsyncRead + AsyncWrite + Unpin + Send,
|
||||
T: AsyncRead + AsyncWrite + Unpin + Send,
|
||||
>(
|
||||
c: &mut Connection<S, T>,
|
||||
oid: Oid,
|
||||
) -> Result<Type, Error> {
|
||||
if let Some(type_) = Type::from_oid(oid) {
|
||||
return Ok(type_);
|
||||
}
|
||||
|
||||
if let Some(type_) = c.typecache.get(&oid) {
|
||||
return Ok(type_.clone());
|
||||
}
|
||||
|
||||
let queries = Self::new(c).await?;
|
||||
|
||||
        frontend::bind(
            "",
            &queries.query,
            [1], // the only parameter is in binary format
            [oid],
            |param, buf| param.to_sql(&Type::OID, buf).map(map_is_null),
            Some(1), // binary return type
            &mut c.raw.buf,
        )
        .map_err(|e| match e {
            frontend::BindError::Conversion(e) => std::io::Error::new(std::io::ErrorKind::Other, e),
            frontend::BindError::Serialization(io) => io,
        })?;
        frontend::execute("", 0, &mut c.raw.buf)?;

        c.sync().await?;

        let mut stream = c.stream_query_results().await?;

        let Some(row) = stream.next().await.transpose()? else {
            todo!()
        };

        let row = row.map_err(Error::db)?;
        let b = row.buffer();
        let mut ranges = row.ranges();

        let name: String = read_column(b, &Type::NAME, &mut ranges)?;
        let type_: i8 = read_column(b, &Type::CHAR, &mut ranges)?;
        let elem_oid: Oid = read_column(b, &Type::OID, &mut ranges)?;
        let rngsubtype: Option<Oid> = read_column(b, &Type::OID, &mut ranges)?;
        let basetype: Oid = read_column(b, &Type::OID, &mut ranges)?;
        let schema: String = read_column(b, &Type::NAME, &mut ranges)?;
        let relid: Oid = read_column(b, &Type::OID, &mut ranges)?;

        {
            // should be none
            let None = stream.next().await.transpose()? else {
                todo!()
            };
            drop(stream);
        }

        let kind = if type_ == b'e' as i8 {
            let variants = Self::get_enum_variants(c, oid).await?;
            Kind::Enum(variants)
        } else if type_ == b'p' as i8 {
            Kind::Pseudo
        } else if basetype != 0 {
            let type_ = Self::get_type_rec(c, basetype).await?;
            Kind::Domain(type_)
        } else if elem_oid != 0 {
            let type_ = Self::get_type_rec(c, elem_oid).await?;
            Kind::Array(type_)
        } else if relid != 0 {
            let fields = Self::get_composite_fields(c, relid).await?;
            Kind::Composite(fields)
        } else if let Some(rngsubtype) = rngsubtype {
            let type_ = Self::get_type_rec(c, rngsubtype).await?;
            Kind::Range(type_)
        } else {
            Kind::Simple
        };

        let type_ = Type::new(name, oid, kind, schema);
        c.typecache.insert(oid, type_.clone());

        Ok(type_)
    }

    async fn get_enum_variants<
        S: AsyncRead + AsyncWrite + Unpin + Send,
        T: AsyncRead + AsyncWrite + Unpin + Send,
    >(
        c: &mut Connection<S, T>,
        oid: Oid,
    ) -> Result<Vec<String>, Error> {
        let queries = Self::new(c).await?;

        frontend::bind(
            "",
            &queries.enum_query,
            [1], // the only parameter is in binary format
            [oid],
            |param, buf| param.to_sql(&Type::OID, buf).map(map_is_null),
            Some(1), // binary return type
            &mut c.raw.buf,
        )
        .map_err(|e| match e {
            frontend::BindError::Conversion(e) => std::io::Error::new(std::io::ErrorKind::Other, e),
            frontend::BindError::Serialization(io) => io,
        })?;
        frontend::execute("", 0, &mut c.raw.buf)?;

        c.sync().await?;

        let mut stream = c.stream_query_results().await?;
        let mut variants = Vec::new();
        while let Some(row) = stream.next().await.transpose()? {
            let row = row.map_err(Error::db)?;

            let variant: String = read_column(row.buffer(), &Type::NAME, &mut row.ranges())?;
            variants.push(variant);
        }

        c.wait_for_ready().await?;

        Ok(variants)
    }

    async fn get_composite_fields<
        S: AsyncRead + AsyncWrite + Unpin + Send,
        T: AsyncRead + AsyncWrite + Unpin + Send,
    >(
        c: &mut Connection<S, T>,
        oid: Oid,
    ) -> Result<Vec<Field>, Error> {
        let queries = Self::new(c).await?;

        frontend::bind(
            "",
            &queries.composite_query,
            [1], // the only parameter is in binary format
            [oid],
            |param, buf| param.to_sql(&Type::OID, buf).map(map_is_null),
            Some(1), // binary return type
            &mut c.raw.buf,
        )
        .map_err(|e| match e {
            frontend::BindError::Conversion(e) => std::io::Error::new(std::io::ErrorKind::Other, e),
            frontend::BindError::Serialization(io) => io,
        })?;
        frontend::execute("", 0, &mut c.raw.buf)?;

        c.sync().await?;

        let mut stream = c.stream_query_results().await?;
        let mut fields = Vec::new();
        while let Some(row) = stream.next().await.transpose()? {
            let row = row.map_err(Error::db)?;

            let mut ranges = row.ranges();
            let name: String = read_column(row.buffer(), &Type::NAME, &mut ranges)?;
            let oid: Oid = read_column(row.buffer(), &Type::OID, &mut ranges)?;
            fields.push((name, oid));
        }

        c.wait_for_ready().await?;

        let mut output_fields = Vec::with_capacity(fields.len());
        for (name, oid) in fields {
            let type_ = Self::get_type_rec(c, oid).await?;
            output_fields.push(Field::new(name, type_))
        }

        Ok(output_fields)
    }
}
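The helpers above all follow the same extended-query pattern: Bind the prepared typeinfo statement with the OID as a binary parameter, Execute, Sync, then stream the rows back. The interesting part is how the fetched pg_type columns map onto `Kind`. A minimal, self-contained sketch of that precedence, not part of the diff; `resolve_kind` and the string labels are illustrative only, and `Oid` is `u32` as in the postgres wire protocol:

    type Oid = u32;

    // Mirrors the if-chain above: enum and pseudo types are decided by `typtype`
    // alone; otherwise a non-zero base type, element OID or relation OID marks a
    // domain, array or composite; a range subtype marks a range; the rest is simple.
    fn resolve_kind(
        typtype: u8,
        basetype: Oid,
        elem_oid: Oid,
        relid: Oid,
        rngsubtype: Option<Oid>,
    ) -> &'static str {
        if typtype == b'e' {
            "enum"
        } else if typtype == b'p' {
            "pseudo"
        } else if basetype != 0 {
            "domain"
        } else if elem_oid != 0 {
            "array"
        } else if relid != 0 {
            "composite"
        } else if rngsubtype.is_some() {
            "range"
        } else {
            "simple"
        }
    }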
@@ -23,7 +23,7 @@ use tokio::{
    time,
};
use tokio_util::sync::CancellationToken;
use tracing::{error, info, warn};
use tracing::{error, info, info_span, warn, Instrument};
use utils::measured_stream::MeasuredStream;

/// Number of times we should retry the `/proxy_wake_compute` http request.
@@ -101,21 +101,20 @@ pub async fn task_main(
        tokio::select! {
            accept_result = listener.accept() => {
                let (socket, peer_addr) = accept_result?;
                info!("accepted postgres client connection from {peer_addr}");

                let session_id = uuid::Uuid::new_v4();
                let cancel_map = Arc::clone(&cancel_map);
                connections.spawn(
                    async move {
                        info!("spawned a task for {peer_addr}");
                        info!("accepted postgres client connection");

                        socket
                            .set_nodelay(true)
                            .context("failed to set socket option")?;

                        handle_client(config, &cancel_map, session_id, socket, ClientMode::Tcp)
                            .await
                        handle_client(config, &cancel_map, session_id, socket, ClientMode::Tcp).await
                    }
                    .instrument(info_span!("handle_client", ?session_id, %peer_addr))
                    .unwrap_or_else(move |e| {
                        // Acknowledge that the task has finished with an error.
                        error!(?session_id, "per-client task finished with an error: {e:#}");
@@ -183,7 +182,6 @@ impl ClientMode {
    }
}

#[tracing::instrument(fields(session_id = ?session_id), skip_all)]
pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    config: &'static ProxyConfig,
    cancel_map: &CancelMap,
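The hunks above move per-connection context out of the `#[tracing::instrument]` attribute on `handle_client` and into an explicit `info_span!` attached to the spawned future. A minimal sketch of the same pattern in isolation, assuming the `tokio`, `tracing` and `uuid` crates this file already uses:

    use tracing::{info, info_span, Instrument};

    async fn accept_loop_sketch() {
        let session_id = uuid::Uuid::new_v4();
        let peer_addr = "127.0.0.1:5432";
        tokio::spawn(
            async move {
                // Every event emitted inside the task inherits the span fields,
                // so the message itself no longer needs to repeat them.
                info!("accepted postgres client connection");
            }
            // `?session_id` records the field with Debug, `%peer_addr` with Display.
            .instrument(info_span!("handle_client", ?session_id, %peer_addr)),
        );
    }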
@@ -425,11 +423,17 @@ where
            auth::BackendType::Test(x) => x.wake_compute(),
        };

        match handle_try_wake(wake_res, num_retries)? {
        match handle_try_wake(wake_res, num_retries) {
            Err(e) => {
                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
                return Err(e.into());
            }
            // failed to wake up but we can continue to retry
            ControlFlow::Continue(_) => {}
            Ok(ControlFlow::Continue(e)) => {
                warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
            }
            // successfully woke up a compute node and can break the wakeup loop
            ControlFlow::Break(mut node_info) => {
            Ok(ControlFlow::Break(mut node_info)) => {
                node_info.config.reuse_password(&config);
                mechanism.update_connect_config(&mut node_info.config);
                break node_info;
@@ -440,7 +444,6 @@ where
        num_retries += 1;

        time::sleep(wait_duration).await;
        info!(num_retries, "retrying wake compute");
    };

    // now that we have a new node, try connect to it repeatedly.
@@ -451,10 +454,12 @@ where
        match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
            Ok(res) => return Ok(res),
            Err(e) => {
                error!(error = ?e, "could not connect to compute node");
                if !e.should_retry(num_retries) {
                let retriable = e.should_retry(num_retries);
                if !retriable {
                    error!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
                    return Err(e.into());
                }
                warn!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
            }
        }

@@ -462,7 +467,6 @@ where
        num_retries += 1;

        time::sleep(wait_duration).await;
        info!(num_retries, "retrying connect_once");
    }
}

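Both retry loops above now share one shape: an explicit match instead of `?`, so that non-retriable failures log `retriable = false` and return, while retriable ones log `retriable = true` and fall through to the backoff sleep. A compact, self-contained sketch of that shape; `try_wake` and its return type are illustrative stand-ins, not the real helpers:

    use std::ops::ControlFlow;
    use std::time::Duration;

    // Stand-in for handle_try_wake: Err = fatal, Continue = retriable failure,
    // Break = success carrying the value the loop was waiting for.
    fn try_wake(num_retries: u32) -> Result<ControlFlow<&'static str, ()>, &'static str> {
        if num_retries < 2 {
            Ok(ControlFlow::Continue(()))
        } else {
            Ok(ControlFlow::Break("node-info"))
        }
    }

    async fn wake_with_retries() -> Result<&'static str, &'static str> {
        let mut num_retries = 0;
        let node = loop {
            match try_wake(num_retries) {
                Err(e) => return Err(e),                    // not retriable: bail out
                Ok(ControlFlow::Continue(())) => {}         // retriable: back off, try again
                Ok(ControlFlow::Break(node)) => break node, // success
            }
            num_retries += 1;
            tokio::time::sleep(Duration::from_millis(100)).await;
        };
        Ok(node)
    }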
@@ -541,7 +545,7 @@ impl ShouldRetry for compute::ConnectionError {
    }
}

fn retry_after(num_retries: u32) -> time::Duration {
pub fn retry_after(num_retries: u32) -> time::Duration {
    // 1.5 seems to be an ok growth factor heuristic
    BASE_RETRY_WAIT_DURATION.mul_f64(1.5_f64.powi(num_retries as i32))
}

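Making `retry_after` public lets other callers reuse the same backoff curve. With the 1.5 growth factor, the n-th wait is `BASE_RETRY_WAIT_DURATION * 1.5^n`; a quick sketch of the multiplier sequence (the base duration itself is defined elsewhere in this file and not shown in this diff):

    fn backoff_multipliers(n: u32) -> Vec<f64> {
        (0..n).map(|i| 1.5_f64.powi(i as i32)).collect()
    }

    // backoff_multipliers(5) == [1.0, 1.5, 2.25, 3.375, 5.0625]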
@@ -99,8 +99,9 @@ struct Scram(scram::ServerSecret);

impl Scram {
    fn new(password: &str) -> anyhow::Result<Self> {
        let secret =
            scram::ServerSecret::build(password).context("failed to generate scram secret")?;
        let salt = rand::random::<[u8; 16]>();
        let secret = scram::ServerSecret::build(password, &salt, 256)
            .context("failed to generate scram secret")?;
        Ok(Scram(secret))
    }

@@ -12,6 +12,9 @@ mod messages;
mod secret;
mod signature;

#[cfg(any(test, doc))]
mod password;

pub use exchange::Exchange;
pub use key::ScramKey;
pub use secret::ServerSecret;
@@ -54,21 +57,27 @@ fn sha256<'a>(parts: impl IntoIterator<Item = &'a [u8]>) -> [u8; 32] {

#[cfg(test)]
mod tests {
    use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256};

    use crate::sasl::{Mechanism, Step};

    use super::{Exchange, ServerSecret};
    use super::{password::SaltedPassword, Exchange, ServerSecret};

    #[test]
    fn snapshot() {
    fn happy_path() {
        let iterations = 4096;
        let salt = "QSXCR+Q6sek8bf92";
        let stored_key = "FO+9jBb3MUukt6jJnzjPZOWc5ow/Pu6JtPyju0aqaE8=";
        let server_key = "qxJ1SbmSAi5EcS0J5Ck/cKAm/+Ixa+Kwp63f4OHDgzo=";
        let secret = format!("SCRAM-SHA-256${iterations}:{salt}${stored_key}:{server_key}",);
        let secret = ServerSecret::parse(&secret).unwrap();
        let salt_base64 = "QSXCR+Q6sek8bf92";
        let pw = SaltedPassword::new(
            b"pencil",
            base64::decode(salt_base64).unwrap().as_slice(),
            iterations,
        );

        let secret = ServerSecret {
            iterations,
            salt_base64: salt_base64.to_owned(),
            stored_key: pw.client_key().sha256(),
            server_key: pw.server_key(),
            doomed: false,
        };
        const NONCE: [u8; 18] = [
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        ];
@@ -106,40 +115,4 @@ mod tests {
            ]
        );
    }

    fn run_round_trip_test(client_password: &str) {
        let secret = ServerSecret::build("pencil").unwrap();
        let mut exchange = Exchange::new(&secret, rand::random, None);

        let mut client =
            ScramSha256::new(client_password.as_bytes(), ChannelBinding::unsupported());

        let client_first = std::str::from_utf8(client.message()).unwrap();
        exchange = match exchange.exchange(client_first).unwrap() {
            Step::Continue(exchange, message) => {
                client.update(message.as_bytes()).unwrap();
                exchange
            }
            Step::Success(_, _) => panic!("expected continue, got success"),
            Step::Failure(f) => panic!("{f}"),
        };

        let client_final = std::str::from_utf8(client.message()).unwrap();
        match exchange.exchange(client_final).unwrap() {
            Step::Success(_, message) => client.finish(message.as_bytes()).unwrap(),
            Step::Continue(_, _) => panic!("expected success, got continue"),
            Step::Failure(f) => panic!("{f}"),
        };
    }

    #[test]
    fn round_trip() {
        run_round_trip_test("pencil")
    }

    #[test]
    #[should_panic(expected = "password doesn't match")]
    fn failure() {
        run_round_trip_test("eraser")
    }
}

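For reference while reading the `happy_path` test and the round-trip exchange above: a SCRAM-SHA-256 authentication (RFC 5802) is four messages, sketched here with placeholder values only. The server-first message is where the salt and iteration count stored in `ServerSecret` reach the client.

    // Placeholder shapes only; real messages carry base64 data and random nonces.
    const CLIENT_FIRST: &str = "n,,n=,r=<client-nonce>";
    const SERVER_FIRST: &str = "r=<client-nonce><server-nonce>,s=<base64-salt>,i=4096";
    const CLIENT_FINAL: &str = "c=biws,r=<client-nonce><server-nonce>,p=<base64-client-proof>";
    const SERVER_FINAL: &str = "v=<base64-server-signature>";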
@@ -3,7 +3,7 @@
/// Faithfully taken from PostgreSQL.
pub const SCRAM_KEY_LEN: usize = 32;

/// One of the keys derived from the user's password.
/// One of the keys derived from the [password](super::password::SaltedPassword).
/// We use the same structure for all keys, i.e.
/// `ClientKey`, `StoredKey`, and `ServerKey`.
#[derive(Default, PartialEq, Eq)]

proxy/src/scram/password.rs (new file, 74 lines)
@@ -0,0 +1,74 @@
//! Password hashing routines.

use super::key::ScramKey;

pub const SALTED_PASSWORD_LEN: usize = 32;

/// Salted hashed password is essential for [key](super::key) derivation.
#[repr(transparent)]
pub struct SaltedPassword {
    bytes: [u8; SALTED_PASSWORD_LEN],
}

impl SaltedPassword {
    /// See `scram-common.c : scram_SaltedPassword` for details.
    /// Further reading: <https://datatracker.ietf.org/doc/html/rfc2898> (see `PBKDF2`).
    pub fn new(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword {
        pbkdf2::pbkdf2_hmac_array::<sha2::Sha256, 32>(password, salt, iterations).into()
    }

    /// Derive `ClientKey` from a salted hashed password.
    pub fn client_key(&self) -> ScramKey {
        super::hmac_sha256(&self.bytes, [b"Client Key".as_ref()]).into()
    }

    /// Derive `ServerKey` from a salted hashed password.
    pub fn server_key(&self) -> ScramKey {
        super::hmac_sha256(&self.bytes, [b"Server Key".as_ref()]).into()
    }
}

impl From<[u8; SALTED_PASSWORD_LEN]> for SaltedPassword {
    #[inline(always)]
    fn from(bytes: [u8; SALTED_PASSWORD_LEN]) -> Self {
        Self { bytes }
    }
}

#[cfg(test)]
mod tests {
    use super::SaltedPassword;

    fn legacy_pbkdf2_impl(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword {
        let one = 1_u32.to_be_bytes(); // magic

        let mut current = super::super::hmac_sha256(password, [salt, &one]);
        let mut result = current;
        for _ in 1..iterations {
            current = super::super::hmac_sha256(password, [current.as_ref()]);
            // TODO: result = current.zip(result).map(|(x, y)| x ^ y), issue #80094
            for (i, x) in current.iter().enumerate() {
                result[i] ^= x;
            }
        }

        result.into()
    }

    #[test]
    fn pbkdf2() {
        let password = "a-very-secure-password";
        let salt = "such-a-random-salt";
        let iterations = 4096;
        let output = [
            203, 18, 206, 81, 4, 154, 193, 100, 147, 41, 211, 217, 177, 203, 69, 210, 194, 211,
            101, 1, 248, 156, 96, 0, 8, 223, 30, 87, 158, 41, 20, 42,
        ];

        let actual = SaltedPassword::new(password.as_bytes(), salt.as_bytes(), iterations);
        let expected = legacy_pbkdf2_impl(password.as_bytes(), salt.as_bytes(), iterations);

        assert_eq!(actual.bytes, output);
        assert_eq!(actual.bytes, expected.bytes);
    }
}

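The derivation chain this new file implements follows RFC 5802: SaltedPassword = PBKDF2-HMAC-SHA-256(password, salt, i), ClientKey = HMAC(SaltedPassword, "Client Key"), ServerKey = HMAC(SaltedPassword, "Server Key"), and the verifier only needs to keep StoredKey = SHA-256(ClientKey) plus ServerKey. A small usage sketch, assuming the crate-internal `SaltedPassword` and `ScramKey` types shown above; the helper name is hypothetical:

    use super::password::SaltedPassword;

    // Hypothetical sketch: the two values a SCRAM verifier actually has to store.
    fn derive_verifier_parts(password: &[u8], salt: &[u8], iterations: u32) {
        let salted = SaltedPassword::new(password, salt, iterations);
        let _stored_key = salted.client_key().sha256(); // StoredKey = SHA-256(ClientKey)
        let _server_key = salted.server_key();          // ServerKey = HMAC(SaltedPassword, "Server Key")
    }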
@@ -3,7 +3,7 @@
use super::base64_decode_array;
use super::key::ScramKey;

/// Server secret is produced from user's password,
/// Server secret is produced from [password](super::password::SaltedPassword)
/// and is used throughout the authentication process.
pub struct ServerSecret {
    /// Number of iterations for `PBKDF2` function.
@@ -58,10 +58,21 @@ impl ServerSecret {
    /// Build a new server secret from the prerequisites.
    /// XXX: We only use this function in tests.
    #[cfg(test)]
    pub fn build(password: &str) -> Option<Self> {
        Self::parse(&postgres_protocol::password::scram_sha_256(
            password.as_bytes(),
        ))
    pub fn build(password: &str, salt: &[u8], iterations: u32) -> Option<Self> {
        // TODO: implement proper password normalization required by the RFC
        if !password.is_ascii() {
            return None;
        }

        let password = super::password::SaltedPassword::new(password.as_bytes(), salt, iterations);

        Some(Self {
            iterations,
            salt_base64: base64::encode(salt),
            stored_key: password.client_key().sha256(),
            server_key: password.server_key(),
            doomed: false,
        })
    }
}

@@ -91,4 +102,20 @@ mod tests {
        assert_eq!(base64::encode(parsed.stored_key), stored_key);
        assert_eq!(base64::encode(parsed.server_key), server_key);
    }

    #[test]
    fn build_scram_secret() {
        let salt = b"salt";
        let secret = ServerSecret::build("password", salt, 4096).unwrap();
        assert_eq!(secret.iterations, 4096);
        assert_eq!(secret.salt_base64, base64::encode(salt));
        assert_eq!(
            base64::encode(secret.stored_key.as_ref()),
            "lF4cRm/Jky763CN4HtxdHnjV4Q8AWTNlKvGmEFFU8IQ="
        );
        assert_eq!(
            base64::encode(secret.server_key.as_ref()),
            "ub8OgRsftnk2ccDMOt7ffHXNcikRkQkq1lh4xaAqrSw="
        );
    }
}

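The keys asserted in `build_scram_secret` are the pieces that end up in the textual verifier `ServerSecret::parse` accepts, the same layout used by the `happy_path`/`snapshot` test earlier in this diff. A tiny sketch of assembling one from its parts; the values are placeholders from the tests, not real secrets:

    // Format: SCRAM-SHA-256$<iterations>:<base64 salt>$<base64 StoredKey>:<base64 ServerKey>
    fn mock_scram_verifier(
        iterations: u32,
        salt_b64: &str,
        stored_key_b64: &str,
        server_key_b64: &str,
    ) -> String {
        format!("SCRAM-SHA-256${iterations}:{salt_b64}${stored_key_b64}:{server_key_b64}")
    }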
@@ -15,6 +15,7 @@ use toml_edit::Document;
use std::fs::{self, File};
use std::io::{ErrorKind, Write};
use std::path::{Path, PathBuf};
use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use storage_broker::Uri;
@@ -79,6 +80,10 @@ struct Args {
    /// Listen http endpoint for management and metrics in the form host:port.
    #[arg(long, default_value = DEFAULT_HTTP_LISTEN_ADDR)]
    listen_http: String,
    /// Advertised endpoint for receiving/sending WAL in the form host:port. If not
    /// specified, listen_pg is used to advertise instead.
    #[arg(long, default_value = None)]
    advertise_pg: Option<String>,
    /// Availability zone of the safekeeper.
    #[arg(long)]
    availability_zone: Option<String>,
@@ -118,9 +123,24 @@ struct Args {
    /// WAL backup horizon.
    #[arg(long)]
    disable_wal_backup: bool,
    /// Path to a .pem public key which is used to check JWT tokens.
    #[arg(long)]
    auth_validation_public_key_path: Option<PathBuf>,
    /// If given, enables auth on incoming connections to WAL service endpoint
    /// (--listen-pg). Value specifies path to a .pem public key used for
    /// validations of JWT tokens. Empty string is allowed and means disabling
    /// auth.
    #[arg(long, verbatim_doc_comment, value_parser = opt_pathbuf_parser)]
    pg_auth_public_key_path: Option<PathBuf>,
    /// If given, enables auth on incoming connections to tenant only WAL
    /// service endpoint (--listen-pg-tenant-only). Value specifies path to a
    /// .pem public key used for validations of JWT tokens. Empty string is
    /// allowed and means disabling auth.
    #[arg(long, verbatim_doc_comment, value_parser = opt_pathbuf_parser)]
    pg_tenant_only_auth_public_key_path: Option<PathBuf>,
    /// If given, enables auth on incoming connections to http management
    /// service endpoint (--listen-http). Value specifies path to a .pem public
    /// key used for validations of JWT tokens. Empty string is allowed and
    /// means disabling auth.
    #[arg(long, verbatim_doc_comment, value_parser = opt_pathbuf_parser)]
    http_auth_public_key_path: Option<PathBuf>,
    /// Format for logging, either 'plain' or 'json'.
    #[arg(long, default_value = "plain")]
    log_format: String,
@@ -130,9 +150,39 @@ struct Args {
    current_thread_runtime: bool,
}

// Like PathBufValueParser, but allows empty string.
fn opt_pathbuf_parser(s: &str) -> Result<PathBuf, String> {
    Ok(PathBuf::from_str(s).unwrap())
}

#[tokio::main(flavor = "current_thread")]
async fn main() -> anyhow::Result<()> {
    let args = Args::parse();
    // We want to allow multiple occurences of the same arg (taking the last) so
    // that neon_local could generate command with defaults + overrides without
    // getting 'argument cannot be used multiple times' error. This seems to be
    // impossible with pure Derive API, so convert struct to Command, modify it,
    // parse arguments, and then fill the struct back.
    let cmd = <Args as clap::CommandFactory>::command().args_override_self(true);
    let mut matches = cmd.get_matches();
    let mut args = <Args as clap::FromArgMatches>::from_arg_matches_mut(&mut matches)?;

    // I failed to modify opt_pathbuf_parser to return Option<PathBuf> in
    // reasonable time, so turn empty string into option post factum.
    if let Some(pb) = &args.pg_auth_public_key_path {
        if pb.as_os_str().is_empty() {
            args.pg_auth_public_key_path = None;
        }
    }
    if let Some(pb) = &args.pg_tenant_only_auth_public_key_path {
        if pb.as_os_str().is_empty() {
            args.pg_tenant_only_auth_public_key_path = None;
        }
    }
    if let Some(pb) = &args.http_auth_public_key_path {
        if pb.as_os_str().is_empty() {
            args.http_auth_public_key_path = None;
        }
    }

    if let Some(addr) = args.dump_control_file {
        let state = control_file::FileStorage::load_control_file(addr)?;
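The three empty-string checks above share one shape; as a possible follow-up, not part of this diff, the normalization could be a single helper applied to each field after parsing:

    use std::path::PathBuf;

    // Hypothetical helper: treat an empty --*-auth-public-key-path value as "auth disabled".
    fn none_if_empty(path: Option<PathBuf>) -> Option<PathBuf> {
        path.filter(|p| !p.as_os_str().is_empty())
    }

    // e.g. args.pg_auth_public_key_path = none_if_empty(args.pg_auth_public_key_path);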
@@ -166,13 +216,40 @@ async fn main() -> anyhow::Result<()> {
        return Ok(());
    }

    let auth = match args.auth_validation_public_key_path.as_ref() {
    let pg_auth = match args.pg_auth_public_key_path.as_ref() {
        None => {
            info!("auth is disabled");
            info!("pg auth is disabled");
            None
        }
        Some(path) => {
            info!("loading JWT auth key from {}", path.display());
            info!("loading pg auth JWT key from {}", path.display());
            Some(Arc::new(
                JwtAuth::from_key_path(path).context("failed to load the auth key")?,
            ))
        }
    };
    let pg_tenant_only_auth = match args.pg_tenant_only_auth_public_key_path.as_ref() {
        None => {
            info!("pg tenant only auth is disabled");
            None
        }
        Some(path) => {
            info!(
                "loading pg tenant only auth JWT key from {}",
                path.display()
            );
            Some(Arc::new(
                JwtAuth::from_key_path(path).context("failed to load the auth key")?,
            ))
        }
    };
    let http_auth = match args.http_auth_public_key_path.as_ref() {
        None => {
            info!("http auth is disabled");
            None
        }
        Some(path) => {
            info!("loading http auth JWT key from {}", path.display());
            Some(Arc::new(
                JwtAuth::from_key_path(path).context("failed to load the auth key")?,
            ))
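The three `match` blocks above differ only in the flag they read and the label they log. A sketch of how they could be folded into one helper; the function itself is hypothetical, but it reuses `JwtAuth::from_key_path`, `info!`, `Arc` and `Context` exactly as the diff already does, and assumes those are in scope:

    fn load_optional_auth(label: &str, path: Option<&PathBuf>) -> anyhow::Result<Option<Arc<JwtAuth>>> {
        match path {
            None => {
                info!("{label} auth is disabled");
                Ok(None)
            }
            Some(path) => {
                info!("loading {label} auth JWT key from {}", path.display());
                Ok(Some(Arc::new(
                    JwtAuth::from_key_path(path).context("failed to load the auth key")?,
                )))
            }
        }
    }

    // e.g. let pg_auth = load_optional_auth("pg", args.pg_auth_public_key_path.as_ref())?;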
@@ -185,6 +262,7 @@ async fn main() -> anyhow::Result<()> {
        listen_pg_addr: args.listen_pg,
        listen_pg_addr_tenant_only: args.listen_pg_tenant_only,
        listen_http_addr: args.listen_http,
        advertise_pg_addr: args.advertise_pg,
        availability_zone: args.availability_zone,
        no_sync: args.no_sync,
        broker_endpoint: args.broker_endpoint,
@@ -194,7 +272,9 @@ async fn main() -> anyhow::Result<()> {
        max_offloader_lag_bytes: args.max_offloader_lag,
        wal_backup_enabled: !args.disable_wal_backup,
        backup_parallel_jobs: args.wal_backup_parallel_jobs,
        auth,
        pg_auth,
        pg_tenant_only_auth,
        http_auth,
        current_thread_runtime: args.current_thread_runtime,
    };

@@ -283,7 +363,7 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
        .spawn(wal_service::task_main(
            conf_,
            pg_listener,
            Some(Scope::SafekeeperData),
            Scope::SafekeeperData,
        ))
        // wrap with task name for error reporting
        .map(|res| ("WAL service main".to_owned(), res));
@@ -297,7 +377,7 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
        .spawn(wal_service::task_main(
            conf_,
            pg_listener_tenant_only,
            Some(Scope::Tenant),
            Scope::Tenant,
        ))
        // wrap with task name for error reporting
        .map(|res| ("WAL service tenant only main".to_owned(), res));

Some files were not shown because too many files have changed in this diff.