mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-15 12:10:37 +00:00
Compare commits
2 Commits
faster-ci
...
joonas/syn
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
49e72c8564 | ||
|
|
42524a19b8 |
7
.github/actionlint.yml
vendored
7
.github/actionlint.yml
vendored
@@ -7,13 +7,6 @@ self-hosted-runner:
|
||||
- small-arm64
|
||||
- us-east-2
|
||||
config-variables:
|
||||
- AZURE_DEV_CLIENT_ID
|
||||
- AZURE_DEV_REGISTRY_NAME
|
||||
- AZURE_DEV_SUBSCRIPTION_ID
|
||||
- AZURE_PROD_CLIENT_ID
|
||||
- AZURE_PROD_REGISTRY_NAME
|
||||
- AZURE_PROD_SUBSCRIPTION_ID
|
||||
- AZURE_TENANT_ID
|
||||
- BENCHMARK_PROJECT_ID_PUB
|
||||
- BENCHMARK_PROJECT_ID_SUB
|
||||
- REMOTE_STORAGE_AZURE_CONTAINER
|
||||
|
||||
56
.github/workflows/_push-to-acr.yml
vendored
56
.github/workflows/_push-to-acr.yml
vendored
@@ -1,56 +0,0 @@
|
||||
name: Push images to ACR
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
client_id:
|
||||
description: Client ID of Azure managed identity or Entra app
|
||||
required: true
|
||||
type: string
|
||||
image_tag:
|
||||
description: Tag for the container image
|
||||
required: true
|
||||
type: string
|
||||
images:
|
||||
description: Images to push
|
||||
required: true
|
||||
type: string
|
||||
registry_name:
|
||||
description: Name of the container registry
|
||||
required: true
|
||||
type: string
|
||||
subscription_id:
|
||||
description: Azure subscription ID
|
||||
required: true
|
||||
type: string
|
||||
tenant_id:
|
||||
description: Azure tenant ID
|
||||
required: true
|
||||
type: string
|
||||
|
||||
jobs:
|
||||
push-to-acr:
|
||||
runs-on: ubuntu-22.04
|
||||
permissions:
|
||||
contents: read # This is required for actions/checkout
|
||||
id-token: write # This is required for Azure Login to work.
|
||||
|
||||
steps:
|
||||
- name: Azure login
|
||||
uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1
|
||||
with:
|
||||
client-id: ${{ inputs.client_id }}
|
||||
subscription-id: ${{ inputs.subscription_id }}
|
||||
tenant-id: ${{ inputs.tenant_id }}
|
||||
|
||||
- name: Login to ACR
|
||||
run: |
|
||||
az acr login --name=${{ inputs.registry_name }}
|
||||
|
||||
- name: Copy docker images to ACR ${{ inputs.registry_name }}
|
||||
run: |
|
||||
images='${{ inputs.images }}'
|
||||
for image in ${images}; do
|
||||
docker buildx imagetools create \
|
||||
-t ${{ inputs.registry_name }}.azurecr.io/neondatabase/${image}:${{ inputs.image_tag }} \
|
||||
neondatabase/${image}:${{ inputs.image_tag }}
|
||||
done
|
||||
57
.github/workflows/build_and_test.yml
vendored
57
.github/workflows/build_and_test.yml
vendored
@@ -271,6 +271,10 @@ jobs:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: sync(1)
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: sync
|
||||
|
||||
- name: Pytest benchmarks
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
@@ -794,6 +798,9 @@ jobs:
|
||||
docker compose -f ./docker-compose/docker-compose.yml down
|
||||
|
||||
promote-images:
|
||||
permissions:
|
||||
contents: read # This is required for actions/checkout
|
||||
id-token: write # This is required for Azure Login to work.
|
||||
needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
@@ -820,6 +827,28 @@ jobs:
|
||||
neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
|
||||
done
|
||||
|
||||
- name: Azure login
|
||||
if: github.ref_name == 'main'
|
||||
uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1
|
||||
with:
|
||||
client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
|
||||
tenant-id: ${{ secrets.AZURE_TENANT_ID }}
|
||||
subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
|
||||
|
||||
- name: Login to ACR
|
||||
if: github.ref_name == 'main'
|
||||
run: |
|
||||
az acr login --name=neoneastus2
|
||||
|
||||
- name: Copy docker images to ACR-dev
|
||||
if: github.ref_name == 'main'
|
||||
run: |
|
||||
for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do
|
||||
docker buildx imagetools create \
|
||||
-t neoneastus2.azurecr.io/neondatabase/${image}:${{ needs.tag.outputs.build-tag }} \
|
||||
neondatabase/${image}:${{ needs.tag.outputs.build-tag }}
|
||||
done
|
||||
|
||||
- name: Add latest tag to images
|
||||
if: github.ref_name == 'main'
|
||||
run: |
|
||||
@@ -857,30 +886,6 @@ jobs:
|
||||
369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }}
|
||||
done
|
||||
|
||||
push-to-acr-dev:
|
||||
if: github.ref_name == 'main'
|
||||
needs: [ tag, promote-images ]
|
||||
uses: ./.github/workflows/_push-to-acr.yml
|
||||
with:
|
||||
client_id: ${{ vars.AZURE_DEV_CLIENT_ID }}
|
||||
image_tag: ${{ needs.tag.outputs.build-tag }}
|
||||
images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16
|
||||
registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
|
||||
subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
|
||||
tenant_id: ${{ vars.AZURE_TENANT_ID }}
|
||||
|
||||
push-to-acr-prod:
|
||||
if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
|
||||
needs: [ tag, promote-images ]
|
||||
uses: ./.github/workflows/_push-to-acr.yml
|
||||
with:
|
||||
client_id: ${{ vars.AZURE_PROD_CLIENT_ID }}
|
||||
image_tag: ${{ needs.tag.outputs.build-tag }}
|
||||
images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16
|
||||
registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
|
||||
subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
|
||||
tenant_id: ${{ vars.AZURE_TENANT_ID }}
|
||||
|
||||
trigger-custom-extensions-build-and-wait:
|
||||
needs: [ check-permissions, tag ]
|
||||
runs-on: ubuntu-22.04
|
||||
@@ -956,8 +961,8 @@ jobs:
|
||||
exit 1
|
||||
|
||||
deploy:
|
||||
needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ]
|
||||
if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy') && !failure() && !cancelled()
|
||||
needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
|
||||
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
|
||||
|
||||
runs-on: [ self-hosted, small ]
|
||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
|
||||
|
||||
34
.github/workflows/label-for-external-users.yml
vendored
34
.github/workflows/label-for-external-users.yml
vendored
@@ -7,11 +7,6 @@ on:
|
||||
pull_request_target:
|
||||
types:
|
||||
- opened
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
github-actor:
|
||||
description: 'GitHub username. If empty, the username of the current user will be used'
|
||||
required: false
|
||||
|
||||
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
|
||||
permissions: {}
|
||||
@@ -31,31 +26,12 @@ jobs:
|
||||
id: check-user
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
ACTOR: ${{ inputs.github-actor || github.actor }}
|
||||
run: |
|
||||
expected_error="User does not exist or is not a member of the organization"
|
||||
output_file=output.txt
|
||||
|
||||
for i in $(seq 1 10); do
|
||||
if gh api "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${ACTOR}" \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" > ${output_file}; then
|
||||
|
||||
is_member=true
|
||||
break
|
||||
elif grep -q "${expected_error}" ${output_file}; then
|
||||
is_member=false
|
||||
break
|
||||
elif [ $i -eq 10 ]; then
|
||||
title="Failed to get memmbership status for ${ACTOR}"
|
||||
message="The latest GitHub API error message: '$(cat ${output_file})'"
|
||||
echo "::error file=.github/workflows/label-for-external-users.yml,title=${title}::${message}"
|
||||
|
||||
exit 1
|
||||
fi
|
||||
|
||||
sleep 1
|
||||
done
|
||||
if gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${GITHUB_ACTOR}"; then
|
||||
is_member=true
|
||||
else
|
||||
is_member=false
|
||||
fi
|
||||
|
||||
echo "is-member=${is_member}" | tee -a ${GITHUB_OUTPUT}
|
||||
|
||||
|
||||
115
Cargo.lock
generated
115
Cargo.lock
generated
@@ -1189,9 +1189,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "comfy-table"
|
||||
version = "7.1.1"
|
||||
version = "6.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b34115915337defe99b2aff5c2ce6771e5fbc4079f4b506301f5cf394c8452f7"
|
||||
checksum = "6e7b787b0dc42e8111badfdbe4c3059158ccb2db8780352fa1b01e8ccf45cc4d"
|
||||
dependencies = [
|
||||
"crossterm",
|
||||
"strum",
|
||||
@@ -1246,7 +1246,7 @@ dependencies = [
|
||||
"tokio-postgres",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
"toml_edit 0.19.10",
|
||||
"tracing",
|
||||
"tracing-opentelemetry",
|
||||
"tracing-subscriber",
|
||||
@@ -1360,8 +1360,8 @@ dependencies = [
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-util",
|
||||
"toml",
|
||||
"toml_edit",
|
||||
"toml 0.7.4",
|
||||
"toml_edit 0.19.10",
|
||||
"tracing",
|
||||
"url",
|
||||
"utils",
|
||||
@@ -1485,22 +1485,25 @@ checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"
|
||||
|
||||
[[package]]
|
||||
name = "crossterm"
|
||||
version = "0.27.0"
|
||||
version = "0.25.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f476fe445d41c9e991fd07515a6f463074b782242ccf4a5b7b1d1012e70824df"
|
||||
checksum = "e64e6c0fbe2c17357405f7c758c1ef960fce08bdfb2c03d88d2a18d7e09c4b67"
|
||||
dependencies = [
|
||||
"bitflags 2.4.1",
|
||||
"bitflags 1.3.2",
|
||||
"crossterm_winapi",
|
||||
"libc",
|
||||
"mio",
|
||||
"parking_lot 0.12.1",
|
||||
"signal-hook",
|
||||
"signal-hook-mio",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossterm_winapi"
|
||||
version = "0.9.1"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b"
|
||||
checksum = "2ae1b35a484aa10e07fe0638d02301c5ad24de82d310ccbd2f3693da5f09bf1c"
|
||||
dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
@@ -3141,7 +3144,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fd01039851e82f8799046eabbb354056283fb265c8ec0996af940f4e85a380ff"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"toml",
|
||||
"toml 0.8.14",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3657,7 +3660,7 @@ dependencies = [
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
"toml_edit 0.19.10",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
@@ -3744,7 +3747,7 @@ dependencies = [
|
||||
"tokio-stream",
|
||||
"tokio-tar",
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
"toml_edit 0.19.10",
|
||||
"tracing",
|
||||
"twox-hash",
|
||||
"url",
|
||||
@@ -3907,9 +3910,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "parquet"
|
||||
version = "53.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f0fbf928021131daaa57d334ca8e3904fe9ae22f73c56244fc7db9b04eedc3d8"
|
||||
version = "51.0.0"
|
||||
source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"bytes",
|
||||
@@ -3928,9 +3930,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "parquet_derive"
|
||||
version = "53.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "86e9fcfae007533a06b580429a3f7e07cb833ec8aa37c041c16563e7918f057e"
|
||||
version = "51.0.0"
|
||||
source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
|
||||
dependencies = [
|
||||
"parquet",
|
||||
"proc-macro2",
|
||||
@@ -4120,7 +4121,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres"
|
||||
version = "0.19.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -4133,7 +4134,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-protocol"
|
||||
version = "0.6.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
dependencies = [
|
||||
"base64 0.20.0",
|
||||
"byteorder",
|
||||
@@ -4152,7 +4153,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-types"
|
||||
version = "0.2.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -4811,7 +4812,7 @@ dependencies = [
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
"toml_edit 0.19.10",
|
||||
"tracing",
|
||||
"utils",
|
||||
]
|
||||
@@ -5321,7 +5322,7 @@ dependencies = [
|
||||
"tokio-stream",
|
||||
"tokio-tar",
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
"toml_edit 0.19.10",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
"url",
|
||||
@@ -5730,6 +5731,17 @@ dependencies = [
|
||||
"signal-hook-registry",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "signal-hook-mio"
|
||||
version = "0.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "29ad2e15f37ec9a6cc544097b78a1ec90001e9f71b81338ca39f430adaca99af"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"mio",
|
||||
"signal-hook",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "signal-hook-registry"
|
||||
version = "1.4.1"
|
||||
@@ -6042,21 +6054,21 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
|
||||
|
||||
[[package]]
|
||||
name = "strum"
|
||||
version = "0.26.3"
|
||||
version = "0.24.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06"
|
||||
checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f"
|
||||
|
||||
[[package]]
|
||||
name = "strum_macros"
|
||||
version = "0.26.4"
|
||||
version = "0.24.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
|
||||
checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59"
|
||||
dependencies = [
|
||||
"heck 0.5.0",
|
||||
"heck 0.4.1",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"rustversion",
|
||||
"syn 2.0.52",
|
||||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6397,7 +6409,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.7"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
@@ -6508,6 +6520,18 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml"
|
||||
version = "0.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d6135d499e69981f9ff0ef2167955a5333c35e36f6937d382974566b3d5b94ec"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"serde_spanned",
|
||||
"toml_datetime",
|
||||
"toml_edit 0.19.10",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml"
|
||||
version = "0.8.14"
|
||||
@@ -6517,7 +6541,7 @@ dependencies = [
|
||||
"serde",
|
||||
"serde_spanned",
|
||||
"toml_datetime",
|
||||
"toml_edit",
|
||||
"toml_edit 0.22.14",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6529,6 +6553,19 @@ dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml_edit"
|
||||
version = "0.19.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2380d56e8670370eee6566b0bfd4265f65b3f432e8c6d85623f728d4fa31f739"
|
||||
dependencies = [
|
||||
"indexmap 1.9.3",
|
||||
"serde",
|
||||
"serde_spanned",
|
||||
"toml_datetime",
|
||||
"winnow 0.4.6",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml_edit"
|
||||
version = "0.22.14"
|
||||
@@ -6539,7 +6576,7 @@ dependencies = [
|
||||
"serde",
|
||||
"serde_spanned",
|
||||
"toml_datetime",
|
||||
"winnow",
|
||||
"winnow 0.6.13",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6952,7 +6989,7 @@ dependencies = [
|
||||
"tokio-stream",
|
||||
"tokio-tar",
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
"toml_edit 0.19.10",
|
||||
"tracing",
|
||||
"tracing-error",
|
||||
"tracing-subscriber",
|
||||
@@ -7498,6 +7535,15 @@ version = "0.52.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8"
|
||||
|
||||
[[package]]
|
||||
name = "winnow"
|
||||
version = "0.4.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "61de7bac303dc551fe038e2b3cef0f571087a47571ea6e79a87692ac99b99699"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winnow"
|
||||
version = "0.6.13"
|
||||
@@ -7605,7 +7651,6 @@ dependencies = [
|
||||
"tokio",
|
||||
"tokio-rustls 0.24.0",
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
"tonic",
|
||||
"tower",
|
||||
"tracing",
|
||||
|
||||
39
Cargo.toml
39
Cargo.toml
@@ -73,7 +73,7 @@ camino = "1.1.6"
|
||||
cfg-if = "1.0.0"
|
||||
chrono = { version = "0.4", default-features = false, features = ["clock"] }
|
||||
clap = { version = "4.0", features = ["derive"] }
|
||||
comfy-table = "7.1"
|
||||
comfy-table = "6.1"
|
||||
const_format = "0.2"
|
||||
crc32c = "0.6"
|
||||
crossbeam-deque = "0.8.5"
|
||||
@@ -123,8 +123,8 @@ opentelemetry = "0.20.0"
|
||||
opentelemetry-otlp = { version = "0.13.0", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||
opentelemetry-semantic-conventions = "0.12.0"
|
||||
parking_lot = "0.12"
|
||||
parquet = { version = "53", default-features = false, features = ["zstd"] }
|
||||
parquet_derive = "53"
|
||||
parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
|
||||
parquet_derive = "51.0.0"
|
||||
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
|
||||
pin-project-lite = "0.2"
|
||||
procfs = "0.16"
|
||||
@@ -158,8 +158,8 @@ signal-hook = "0.3"
|
||||
smallvec = "1.11"
|
||||
smol_str = { version = "0.2.0", features = ["serde"] }
|
||||
socket2 = "0.5"
|
||||
strum = "0.26"
|
||||
strum_macros = "0.26"
|
||||
strum = "0.24"
|
||||
strum_macros = "0.24"
|
||||
"subtle" = "2.5.0"
|
||||
svg_fmt = "0.4.3"
|
||||
sync_wrapper = "0.1.2"
|
||||
@@ -177,8 +177,8 @@ tokio-rustls = "0.25"
|
||||
tokio-stream = "0.1"
|
||||
tokio-tar = "0.3"
|
||||
tokio-util = { version = "0.7.10", features = ["io", "rt"] }
|
||||
toml = "0.8"
|
||||
toml_edit = "0.22"
|
||||
toml = "0.7"
|
||||
toml_edit = "0.19"
|
||||
tonic = {version = "0.9", features = ["tls", "tls-roots"]}
|
||||
tower-service = "0.3.2"
|
||||
tracing = "0.1"
|
||||
@@ -201,21 +201,10 @@ env_logger = "0.10"
|
||||
log = "0.4"
|
||||
|
||||
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
|
||||
|
||||
# We want to use the 'neon' branch for these, but there's currently one
|
||||
# incompatible change on the branch. See:
|
||||
#
|
||||
# - PR #8076 which contained changes that depended on the new changes in
|
||||
# the rust-postgres crate, and
|
||||
# - PR #8654 which reverted those changes and made the code in proxy incompatible
|
||||
# with the tip of the 'neon' branch again.
|
||||
#
|
||||
# When those proxy changes are re-applied (see PR #8747), we can switch using
|
||||
# the tip of the 'neon' branch again.
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
|
||||
## Local libraries
|
||||
compute_api = { version = "0.1", path = "./libs/compute_api/" }
|
||||
@@ -252,7 +241,11 @@ tonic-build = "0.9"
|
||||
[patch.crates-io]
|
||||
|
||||
# Needed to get `tokio-postgres-rustls` to depend on our fork.
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
|
||||
# bug fixes for UUID
|
||||
parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" }
|
||||
parquet_derive = { git = "https://github.com/apache/arrow-rs", branch = "master" }
|
||||
|
||||
################# Binary contents sections
|
||||
|
||||
|
||||
@@ -207,7 +207,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
|
||||
export PATH="$HOME/.cargo/bin:$PATH" && \
|
||||
. "$HOME/.cargo/env" && \
|
||||
cargo --version && rustup --version && \
|
||||
rustup component add llvm-tools rustfmt clippy && \
|
||||
rustup component add llvm-tools-preview rustfmt clippy && \
|
||||
cargo install rustfilt --version ${RUSTFILT_VERSION} && \
|
||||
cargo install cargo-hakari --version ${CARGO_HAKARI_VERSION} && \
|
||||
cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \
|
||||
|
||||
@@ -22,10 +22,9 @@ use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
|
||||
|
||||
const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds
|
||||
|
||||
/// Escape a string for including it in a SQL literal.
|
||||
///
|
||||
/// Wrapping the result with `E'{}'` or `'{}'` is not required,
|
||||
/// as it returns a ready-to-use SQL string literal, e.g. `'db'''` or `E'db\\'`.
|
||||
/// Escape a string for including it in a SQL literal. Wrapping the result
|
||||
/// with `E'{}'` or `'{}'` is not required, as it returns a ready-to-use
|
||||
/// SQL string literal, e.g. `'db'''` or `E'db\\'`.
|
||||
/// See <https://github.com/postgres/postgres/blob/da98d005cdbcd45af563d0c4ac86d0e9772cd15f/src/backend/utils/adt/quote.c#L47>
|
||||
/// for the original implementation.
|
||||
pub fn escape_literal(s: &str) -> String {
|
||||
|
||||
@@ -75,14 +75,14 @@ impl PageServerNode {
|
||||
}
|
||||
}
|
||||
|
||||
fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::DocumentMut {
|
||||
toml_edit::DocumentMut::from_str(&format!("id={node_id}")).unwrap()
|
||||
fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::Document {
|
||||
toml_edit::Document::from_str(&format!("id={node_id}")).unwrap()
|
||||
}
|
||||
|
||||
fn pageserver_init_make_toml(
|
||||
&self,
|
||||
conf: NeonLocalInitPageserverConf,
|
||||
) -> anyhow::Result<toml_edit::DocumentMut> {
|
||||
) -> anyhow::Result<toml_edit::Document> {
|
||||
assert_eq!(&PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully");
|
||||
|
||||
// TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656)
|
||||
@@ -137,9 +137,9 @@ impl PageServerNode {
|
||||
|
||||
// Turn `overrides` into a toml document.
|
||||
// TODO: above code is legacy code, it should be refactored to use toml_edit directly.
|
||||
let mut config_toml = toml_edit::DocumentMut::new();
|
||||
let mut config_toml = toml_edit::Document::new();
|
||||
for fragment_str in overrides {
|
||||
let fragment = toml_edit::DocumentMut::from_str(&fragment_str)
|
||||
let fragment = toml_edit::Document::from_str(&fragment_str)
|
||||
.expect("all fragments in `overrides` are valid toml documents, this function controls that");
|
||||
for (key, item) in fragment.iter() {
|
||||
config_toml.insert(key, item.clone());
|
||||
|
||||
@@ -4,8 +4,8 @@ use std::{str::FromStr, time::Duration};
|
||||
use clap::{Parser, Subcommand};
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, ShardSchedulingPolicy,
|
||||
TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
|
||||
NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest,
|
||||
TenantDescribeResponse, TenantPolicyRequest,
|
||||
},
|
||||
models::{
|
||||
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
|
||||
@@ -80,10 +80,7 @@ enum Command {
|
||||
/// List nodes known to the storage controller
|
||||
Nodes {},
|
||||
/// List tenants known to the storage controller
|
||||
Tenants {
|
||||
/// If this field is set, it will list the tenants on a specific node
|
||||
node_id: Option<NodeId>,
|
||||
},
|
||||
Tenants {},
|
||||
/// Create a new tenant in the storage controller, and by extension on pageservers.
|
||||
TenantCreate {
|
||||
#[arg(long)]
|
||||
@@ -406,41 +403,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
Command::Tenants {
|
||||
node_id: Some(node_id),
|
||||
} => {
|
||||
let describe_response = storcon_client
|
||||
.dispatch::<(), NodeShardResponse>(
|
||||
Method::GET,
|
||||
format!("control/v1/node/{node_id}/shards"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
let shards = describe_response.shards;
|
||||
let mut table = comfy_table::Table::new();
|
||||
table.set_header([
|
||||
"Shard",
|
||||
"Intended Primary/Secondary",
|
||||
"Observed Primary/Secondary",
|
||||
]);
|
||||
for shard in shards {
|
||||
table.add_row([
|
||||
format!("{}", shard.tenant_shard_id),
|
||||
match shard.is_intended_secondary {
|
||||
None => "".to_string(),
|
||||
Some(true) => "Secondary".to_string(),
|
||||
Some(false) => "Primary".to_string(),
|
||||
},
|
||||
match shard.is_observed_secondary {
|
||||
None => "".to_string(),
|
||||
Some(true) => "Secondary".to_string(),
|
||||
Some(false) => "Primary".to_string(),
|
||||
},
|
||||
]);
|
||||
}
|
||||
println!("{table}");
|
||||
}
|
||||
Command::Tenants { node_id: None } => {
|
||||
Command::Tenants {} => {
|
||||
let mut resp = storcon_client
|
||||
.dispatch::<(), Vec<TenantDescribeResponse>>(
|
||||
Method::GET,
|
||||
|
||||
@@ -68,7 +68,6 @@ macro_rules! register_uint_gauge {
|
||||
static INTERNAL_REGISTRY: Lazy<Registry> = Lazy::new(Registry::new);
|
||||
|
||||
/// Register a collector in the internal registry. MUST be called before the first call to `gather()`.
|
||||
///
|
||||
/// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector
|
||||
/// while holding the lock.
|
||||
pub fn register_internal(c: Box<dyn Collector>) -> prometheus::Result<()> {
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::collections::HashSet;
|
||||
use std::str::FromStr;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
@@ -74,17 +74,6 @@ pub struct TenantPolicyRequest {
|
||||
pub scheduling: Option<ShardSchedulingPolicy>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ShardsPreferredAzsRequest {
|
||||
#[serde(flatten)]
|
||||
pub preferred_az_ids: HashMap<TenantShardId, String>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ShardsPreferredAzsResponse {
|
||||
pub updated: Vec<TenantShardId>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantLocateResponseShard {
|
||||
pub shard_id: TenantShardId,
|
||||
@@ -112,21 +101,6 @@ pub struct TenantDescribeResponse {
|
||||
pub config: TenantConfig,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct NodeShardResponse {
|
||||
pub node_id: NodeId,
|
||||
pub shards: Vec<NodeShard>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct NodeShard {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
/// Whether the shard is observed secondary on a specific node. True = yes, False = no, None = not on this node.
|
||||
pub is_observed_secondary: Option<bool>,
|
||||
/// Whether the shard is intended to be a secondary on a specific node. True = yes, False = no, None = not on this node.
|
||||
pub is_intended_secondary: Option<bool>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct NodeDescribeResponse {
|
||||
pub id: NodeId,
|
||||
@@ -158,12 +132,8 @@ pub struct TenantDescribeResponseShard {
|
||||
pub is_splitting: bool,
|
||||
|
||||
pub scheduling_policy: ShardSchedulingPolicy,
|
||||
|
||||
pub preferred_az_id: Option<String>,
|
||||
}
|
||||
|
||||
/// Migration request for a given tenant shard to a given node.
|
||||
///
|
||||
/// Explicitly migrating a particular shard is a low level operation
|
||||
/// TODO: higher level "Reschedule tenant" operation where the request
|
||||
/// specifies some constraints, e.g. asking it to get off particular node(s)
|
||||
|
||||
@@ -263,6 +263,15 @@ impl Key {
|
||||
field5: u8::MAX,
|
||||
field6: u32::MAX,
|
||||
};
|
||||
/// A key slightly smaller than [`Key::MAX`] for use in layer key ranges to avoid them to be confused with L0 layers
|
||||
pub const NON_L0_MAX: Key = Key {
|
||||
field1: u8::MAX,
|
||||
field2: u32::MAX,
|
||||
field3: u32::MAX,
|
||||
field4: u32::MAX,
|
||||
field5: u8::MAX,
|
||||
field6: u32::MAX - 1,
|
||||
};
|
||||
|
||||
pub fn from_hex(s: &str) -> Result<Self> {
|
||||
if s.len() != 36 {
|
||||
|
||||
@@ -62,7 +62,7 @@ use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
serde::Serialize,
|
||||
serde::Deserialize,
|
||||
strum_macros::Display,
|
||||
strum_macros::VariantNames,
|
||||
strum_macros::EnumVariantNames,
|
||||
strum_macros::AsRefStr,
|
||||
strum_macros::IntoStaticStr,
|
||||
)]
|
||||
@@ -305,10 +305,8 @@ pub struct TenantConfig {
|
||||
pub lsn_lease_length_for_ts: Option<String>,
|
||||
}
|
||||
|
||||
/// The policy for the aux file storage.
|
||||
///
|
||||
/// It can be switched through `switch_aux_file_policy` tenant config.
|
||||
/// When the first aux file written, the policy will be persisted in the
|
||||
/// The policy for the aux file storage. It can be switched through `switch_aux_file_policy`
|
||||
/// tenant config. When the first aux file written, the policy will be persisted in the
|
||||
/// `index_part.json` file and has a limited migration path.
|
||||
///
|
||||
/// Currently, we only allow the following migration path:
|
||||
@@ -898,9 +896,7 @@ pub struct WalRedoManagerStatus {
|
||||
pub process: Option<WalRedoManagerProcessStatus>,
|
||||
}
|
||||
|
||||
/// The progress of a secondary tenant.
|
||||
///
|
||||
/// It is mostly useful when doing a long running download: e.g. initiating
|
||||
/// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
|
||||
/// a download job, timing out while waiting for it to run, and then inspecting this status to understand
|
||||
/// what's happening.
|
||||
#[derive(Default, Debug, Serialize, Deserialize, Clone)]
|
||||
|
||||
@@ -69,10 +69,8 @@ impl QueryError {
|
||||
}
|
||||
|
||||
/// Returns true if the given error is a normal consequence of a network issue,
|
||||
/// or the client closing the connection.
|
||||
///
|
||||
/// These errors can happen during normal operations,
|
||||
/// and don't indicate a bug in our code.
|
||||
/// or the client closing the connection. These errors can happen during normal
|
||||
/// operations, and don't indicate a bug in our code.
|
||||
pub fn is_expected_io_error(e: &io::Error) -> bool {
|
||||
use io::ErrorKind::*;
|
||||
matches!(
|
||||
|
||||
@@ -7,7 +7,6 @@ use std::fmt;
|
||||
use url::Host;
|
||||
|
||||
/// Parses a string of format either `host:port` or `host` into a corresponding pair.
|
||||
///
|
||||
/// The `host` part should be a correct `url::Host`, while `port` (if present) should be
|
||||
/// a valid decimal u16 of digits only.
|
||||
pub fn parse_host_port<S: AsRef<str>>(host_port: S) -> Result<(Host, Option<u16>), anyhow::Error> {
|
||||
|
||||
@@ -121,7 +121,6 @@ fn main() -> anyhow::Result<()> {
|
||||
.allowlist_type("XLogPageHeaderData")
|
||||
.allowlist_type("XLogLongPageHeaderData")
|
||||
.allowlist_var("XLOG_PAGE_MAGIC")
|
||||
.allowlist_var("PG_MAJORVERSION_NUM")
|
||||
.allowlist_var("PG_CONTROL_FILE_SIZE")
|
||||
.allowlist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC")
|
||||
.allowlist_type("PageHeaderData")
|
||||
|
||||
@@ -44,9 +44,6 @@ macro_rules! postgres_ffi {
|
||||
// Re-export some symbols from bindings
|
||||
pub use bindings::DBState_DB_SHUTDOWNED;
|
||||
pub use bindings::{CheckPoint, ControlFileData, XLogRecord};
|
||||
|
||||
pub const ZERO_CHECKPOINT: bytes::Bytes =
|
||||
bytes::Bytes::from_static(&[0u8; xlog_utils::SIZEOF_CHECKPOINT]);
|
||||
}
|
||||
};
|
||||
}
|
||||
@@ -109,107 +106,6 @@ macro_rules! dispatch_pgversion {
|
||||
};
|
||||
}
|
||||
|
||||
#[macro_export]
|
||||
macro_rules! enum_pgversion_dispatch {
|
||||
($name:expr, $typ:ident, $bind:ident, $code:block) => {
|
||||
enum_pgversion_dispatch!(
|
||||
name = $name,
|
||||
bind = $bind,
|
||||
typ = $typ,
|
||||
code = $code,
|
||||
pgversions = [
|
||||
V14 : v14,
|
||||
V15 : v15,
|
||||
V16 : v16,
|
||||
]
|
||||
)
|
||||
};
|
||||
(name = $name:expr,
|
||||
bind = $bind:ident,
|
||||
typ = $typ:ident,
|
||||
code = $code:block,
|
||||
pgversions = [$($variant:ident : $md:ident),+ $(,)?]) => {
|
||||
match $name {
|
||||
$(
|
||||
self::$typ::$variant($bind) => {
|
||||
use $crate::$md as pgv;
|
||||
$code
|
||||
}
|
||||
),+,
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#[macro_export]
|
||||
macro_rules! enum_pgversion {
|
||||
{$name:ident, pgv :: $t:ident} => {
|
||||
enum_pgversion!{
|
||||
name = $name,
|
||||
typ = $t,
|
||||
pgversions = [
|
||||
V14 : v14,
|
||||
V15 : v15,
|
||||
V16 : v16,
|
||||
]
|
||||
}
|
||||
};
|
||||
{$name:ident, pgv :: $p:ident :: $t:ident} => {
|
||||
enum_pgversion!{
|
||||
name = $name,
|
||||
path = $p,
|
||||
typ = $t,
|
||||
pgversions = [
|
||||
V14 : v14,
|
||||
V15 : v15,
|
||||
V16 : v16,
|
||||
]
|
||||
}
|
||||
};
|
||||
{name = $name:ident,
|
||||
typ = $t:ident,
|
||||
pgversions = [$($variant:ident : $md:ident),+ $(,)?]} => {
|
||||
pub enum $name {
|
||||
$($variant ( $crate::$md::$t )),+
|
||||
}
|
||||
impl self::$name {
|
||||
pub fn pg_version(&self) -> u32 {
|
||||
enum_pgversion_dispatch!(self, $name, _ign, {
|
||||
pgv::bindings::PG_MAJORVERSION_NUM
|
||||
})
|
||||
}
|
||||
}
|
||||
$(
|
||||
impl Into<self::$name> for $crate::$md::$t {
|
||||
fn into(self) -> self::$name {
|
||||
self::$name::$variant (self)
|
||||
}
|
||||
}
|
||||
)+
|
||||
};
|
||||
{name = $name:ident,
|
||||
path = $p:ident,
|
||||
typ = $t:ident,
|
||||
pgversions = [$($variant:ident : $md:ident),+ $(,)?]} => {
|
||||
pub enum $name {
|
||||
$($variant ($crate::$md::$p::$t)),+
|
||||
}
|
||||
impl $name {
|
||||
pub fn pg_version(&self) -> u32 {
|
||||
enum_pgversion_dispatch!(self, $name, _ign, {
|
||||
pgv::bindings::PG_MAJORVERSION_NUM
|
||||
})
|
||||
}
|
||||
}
|
||||
$(
|
||||
impl Into<$name> for $crate::$md::$p::$t {
|
||||
fn into(self) -> $name {
|
||||
$name::$variant (self)
|
||||
}
|
||||
}
|
||||
)+
|
||||
};
|
||||
}
|
||||
|
||||
pub mod pg_constants;
|
||||
pub mod relfile_utils;
|
||||
|
||||
|
||||
@@ -185,7 +185,7 @@ mod tests {
|
||||
use super::*;
|
||||
|
||||
fn parse(input: &str) -> anyhow::Result<RemoteStorageConfig> {
|
||||
let toml = input.parse::<toml_edit::DocumentMut>().unwrap();
|
||||
let toml = input.parse::<toml_edit::Document>().unwrap();
|
||||
RemoteStorageConfig::from_toml(toml.as_item())
|
||||
}
|
||||
|
||||
|
||||
@@ -45,8 +45,6 @@ pub use azure_core::Etag;
|
||||
|
||||
pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};
|
||||
|
||||
/// Default concurrency limit for S3 operations
|
||||
///
|
||||
/// Currently, sync happens with AWS S3, that has two limits on requests per second:
|
||||
/// ~200 RPS for IAM services
|
||||
/// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
|
||||
@@ -302,9 +300,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
) -> Result<(), TimeTravelError>;
|
||||
}
|
||||
|
||||
/// Data part of an ongoing [`Download`].
|
||||
///
|
||||
/// `DownloadStream` is sensitive to the timeout and cancellation used with the original
|
||||
/// DownloadStream is sensitive to the timeout and cancellation used with the original
|
||||
/// [`RemoteStorage::download`] request. The type yields `std::io::Result<Bytes>` to be compatible
|
||||
/// with `tokio::io::copy_buf`.
|
||||
// This has 'static because safekeepers do not use cancellation tokens (yet)
|
||||
|
||||
@@ -60,16 +60,3 @@ pub struct TimelineCopyRequest {
|
||||
pub target_timeline_id: TimelineId,
|
||||
pub until_lsn: Lsn,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct TimelineTermBumpRequest {
|
||||
/// bump to
|
||||
pub term: Option<u64>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct TimelineTermBumpResponse {
|
||||
// before the request
|
||||
pub previous_term: u64,
|
||||
pub current_term: u64,
|
||||
}
|
||||
|
||||
@@ -5,10 +5,9 @@
|
||||
mod calculation;
|
||||
pub mod svg;
|
||||
|
||||
/// StorageModel is the input to the synthetic size calculation.
|
||||
///
|
||||
/// It represents a tree of timelines, with just the information that's needed
|
||||
/// for the calculation. This doesn't track timeline names or where each timeline
|
||||
/// StorageModel is the input to the synthetic size calculation. It represents
|
||||
/// a tree of timelines, with just the information that's needed for the
|
||||
/// calculation. This doesn't track timeline names or where each timeline
|
||||
/// begins and ends, for example. Instead, it consists of "points of interest"
|
||||
/// on the timelines. A point of interest could be the timeline start or end point,
|
||||
/// the oldest point on a timeline that needs to be retained because of PITR
|
||||
|
||||
@@ -5,10 +5,8 @@ use std::{
|
||||
|
||||
use metrics::IntCounter;
|
||||
|
||||
/// Circuit breakers are for operations that are expensive and fallible.
|
||||
///
|
||||
/// If a circuit breaker fails repeatedly, we will stop attempting it for some
|
||||
/// period of time, to avoid denial-of-service from retries, and
|
||||
/// Circuit breakers are for operations that are expensive and fallible: if they fail repeatedly,
|
||||
/// we will stop attempting them for some period of time, to avoid denial-of-service from retries, and
|
||||
/// to mitigate the log spam from repeated failures.
|
||||
pub struct CircuitBreaker {
|
||||
/// An identifier that enables us to log useful errors when a circuit is broken
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
use std::os::fd::AsRawFd;
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
fs::{self, File},
|
||||
@@ -204,27 +203,6 @@ pub fn overwrite(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Syncs the filesystem for the given file descriptor.
|
||||
#[cfg_attr(target_os = "macos", allow(unused_variables))]
|
||||
pub fn syncfs(fd: impl AsRawFd) -> anyhow::Result<()> {
|
||||
// Linux guarantees durability for syncfs.
|
||||
// POSIX doesn't have syncfs, and further does not actually guarantee durability of sync().
|
||||
#[cfg(target_os = "linux")]
|
||||
{
|
||||
use anyhow::Context;
|
||||
nix::unistd::syncfs(fd.as_raw_fd()).context("syncfs")?;
|
||||
}
|
||||
#[cfg(target_os = "macos")]
|
||||
{
|
||||
// macOS is not a production platform for Neon, don't even bother.
|
||||
}
|
||||
#[cfg(not(any(target_os = "linux", target_os = "macos")))]
|
||||
{
|
||||
compile_error!("Unsupported OS");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
|
||||
@@ -249,10 +249,8 @@ macro_rules! id_newtype {
|
||||
};
|
||||
}
|
||||
|
||||
/// Neon timeline ID.
|
||||
///
|
||||
/// They are different from PostgreSQL timeline
|
||||
/// IDs, but serve a similar purpose: they differentiate
|
||||
/// Neon timeline IDs are different from PostgreSQL timeline
|
||||
/// IDs. They serve a similar purpose though: they differentiate
|
||||
/// between different "histories" of the same cluster. However,
|
||||
/// PostgreSQL timeline IDs are a bit cumbersome, because they are only
|
||||
/// 32-bits wide, and they must be in ascending order in any given
|
||||
|
||||
@@ -100,9 +100,7 @@ pub enum LockFileRead {
|
||||
}
|
||||
|
||||
/// Open & try to lock the lock file at the given `path`, returning a [handle][`LockFileRead`] to
|
||||
/// inspect its content.
|
||||
///
|
||||
/// It is not an `Err(...)` if the file does not exist or is already locked.
|
||||
/// inspect its content. It is not an `Err(...)` if the file does not exist or is already locked.
|
||||
/// Check the [`LockFileRead`] variants for details.
|
||||
pub fn read_and_hold_lock_file(path: &Utf8Path) -> anyhow::Result<LockFileRead> {
|
||||
let res = fs::OpenOptions::new().read(true).open(path);
|
||||
|
||||
@@ -3,9 +3,11 @@ use std::str::FromStr;
|
||||
use anyhow::Context;
|
||||
use metrics::{IntCounter, IntCounterVec};
|
||||
use once_cell::sync::Lazy;
|
||||
use strum_macros::{EnumString, VariantNames};
|
||||
use strum_macros::{EnumString, EnumVariantNames};
|
||||
|
||||
#[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)]
|
||||
#[derive(
|
||||
EnumString, strum_macros::Display, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy,
|
||||
)]
|
||||
#[strum(serialize_all = "snake_case")]
|
||||
pub enum LogFormat {
|
||||
Plain,
|
||||
@@ -188,7 +190,7 @@ impl Drop for TracingPanicHookGuard {
|
||||
}
|
||||
|
||||
/// Named symbol for our panic hook, which logs the panic.
|
||||
fn tracing_panic_hook(info: &std::panic::PanicHookInfo) {
|
||||
fn tracing_panic_hook(info: &std::panic::PanicInfo) {
|
||||
// following rust 1.66.1 std implementation:
|
||||
// https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288
|
||||
let location = info.location();
|
||||
|
||||
@@ -8,7 +8,6 @@ use tracing::{trace, warn};
|
||||
use crate::lsn::Lsn;
|
||||
|
||||
/// Feedback pageserver sends to safekeeper and safekeeper resends to compute.
|
||||
///
|
||||
/// Serialized in custom flexible key/value format. In replication protocol, it
|
||||
/// is marked with NEON_STATUS_UPDATE_TAG_BYTE to differentiate from postgres
|
||||
/// Standby status update / Hot standby feedback messages.
|
||||
|
||||
@@ -65,8 +65,6 @@ impl<T> Poison<T> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Armed pointer to a [`Poison`].
|
||||
///
|
||||
/// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state.
|
||||
/// Once modifications are done, use [`Self::disarm`].
|
||||
/// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned
|
||||
|
||||
@@ -13,11 +13,10 @@ pub struct ShardNumber(pub u8);
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
||||
pub struct ShardCount(pub u8);
|
||||
|
||||
/// Combination of ShardNumber and ShardCount.
|
||||
///
|
||||
/// For use within the context of a particular tenant, when we need to know which shard we're
|
||||
/// dealing with, but do not need to know the full ShardIdentity (because we won't be doing
|
||||
/// any page->shard mapping), and do not need to know the fully qualified TenantShardId.
|
||||
/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant,
|
||||
/// when we need to know which shard we're dealing with, but do not need to know the full
|
||||
/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
|
||||
/// the fully qualified TenantShardId.
|
||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||
pub struct ShardIndex {
|
||||
pub shard_number: ShardNumber,
|
||||
|
||||
@@ -49,11 +49,12 @@ use std::sync::{RwLock, RwLockWriteGuard};
|
||||
|
||||
use tokio::sync::watch;
|
||||
|
||||
/// Rcu allows multiple readers to read and hold onto a value without blocking
|
||||
/// (for very long).
|
||||
///
|
||||
/// Storing to the Rcu updates the value, making new readers immediately see
|
||||
/// the new value, but it also waits for all current readers to finish.
|
||||
/// Rcu allows multiple readers to read and hold onto a value without blocking
|
||||
/// (for very long). Storing to the Rcu updates the value, making new readers
|
||||
/// immediately see the new value, but it also waits for all current readers to
|
||||
/// finish.
|
||||
///
|
||||
pub struct Rcu<V> {
|
||||
inner: RwLock<RcuInner<V>>,
|
||||
}
|
||||
|
||||
@@ -5,9 +5,7 @@ use std::sync::{
|
||||
use tokio::sync::Semaphore;
|
||||
|
||||
/// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of
|
||||
/// `SemaphorePermit`.
|
||||
///
|
||||
/// Allows use of `take` which does not require holding an outer mutex guard
|
||||
/// `SemaphorePermit`, allowing use of `take` which does not require holding an outer mutex guard
|
||||
/// for the duration of initialization.
|
||||
///
|
||||
/// Has no unsafe, builds upon [`tokio::sync::Semaphore`] and [`std::sync::Mutex`].
|
||||
|
||||
@@ -10,7 +10,7 @@ pub fn deserialize_item<T>(item: &toml_edit::Item) -> Result<T, Error>
|
||||
where
|
||||
T: serde::de::DeserializeOwned,
|
||||
{
|
||||
let document: toml_edit::DocumentMut = match item {
|
||||
let document: toml_edit::Document = match item {
|
||||
toml_edit::Item::Table(toml) => toml.clone().into(),
|
||||
toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => {
|
||||
toml.clone().into_table().into()
|
||||
|
||||
@@ -7,7 +7,6 @@ pub enum VecMapOrdering {
|
||||
}
|
||||
|
||||
/// Ordered map datastructure implemented in a Vec.
|
||||
///
|
||||
/// Append only - can only add keys that are larger than the
|
||||
/// current max key.
|
||||
/// Ordering can be adjusted using [`VecMapOrdering`]
|
||||
|
||||
@@ -6,10 +6,9 @@ pub enum YieldingLoopError {
|
||||
Cancelled,
|
||||
}
|
||||
|
||||
/// Helper for long synchronous loops, e.g. over all tenants in the system.
|
||||
///
|
||||
/// Periodically yields to avoid blocking the executor, and after resuming
|
||||
/// checks the provided cancellation token to drop out promptly on shutdown.
|
||||
/// Helper for long synchronous loops, e.g. over all tenants in the system. Periodically
|
||||
/// yields to avoid blocking the executor, and after resuming checks the provided
|
||||
/// cancellation token to drop out promptly on shutdown.
|
||||
#[inline(always)]
|
||||
pub async fn yielding_loop<I, T, F>(
|
||||
interval: usize,
|
||||
|
||||
@@ -1,20 +1,2 @@
|
||||
pub mod mgmt_api;
|
||||
pub mod page_service;
|
||||
|
||||
/// For timeline_block_unblock_gc, distinguish the two different operations. This could be a bool.
|
||||
// If file structure is per-kind not per-feature then where to put this?
|
||||
#[derive(Clone, Copy)]
|
||||
pub enum BlockUnblock {
|
||||
Block,
|
||||
Unblock,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for BlockUnblock {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let s = match self {
|
||||
BlockUnblock::Block => "block",
|
||||
BlockUnblock::Unblock => "unblock",
|
||||
};
|
||||
f.write_str(s)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,8 +12,6 @@ use utils::{
|
||||
|
||||
pub use reqwest::Body as ReqwestBody;
|
||||
|
||||
use crate::BlockUnblock;
|
||||
|
||||
pub mod util;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
@@ -456,20 +454,6 @@ impl Client {
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn timeline_block_unblock_gc(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
dir: BlockUnblock,
|
||||
) -> Result<()> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/{dir}_gc",
|
||||
self.mgmt_api_endpoint,
|
||||
);
|
||||
|
||||
self.request(Method::POST, &uri, ()).await.map(|_| ())
|
||||
}
|
||||
|
||||
pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{}/reset",
|
||||
|
||||
@@ -174,7 +174,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
println!("specified prefix '{}' failed validation", cmd.prefix);
|
||||
return Ok(());
|
||||
};
|
||||
let toml_document = toml_edit::DocumentMut::from_str(&cmd.config_toml_str)?;
|
||||
let toml_document = toml_edit::Document::from_str(&cmd.config_toml_str)?;
|
||||
let toml_item = toml_document
|
||||
.get("remote_storage")
|
||||
.expect("need remote_storage");
|
||||
|
||||
@@ -37,7 +37,6 @@ use pageserver::{
|
||||
virtual_file,
|
||||
};
|
||||
use postgres_backend::AuthType;
|
||||
use utils::crashsafe::syncfs;
|
||||
use utils::failpoint_support;
|
||||
use utils::logging::TracingErrorLayerEnablement;
|
||||
use utils::{
|
||||
@@ -156,7 +155,23 @@ fn main() -> anyhow::Result<()> {
|
||||
};
|
||||
|
||||
let started = Instant::now();
|
||||
syncfs(dirfd)?;
|
||||
// Linux guarantees durability for syncfs.
|
||||
// POSIX doesn't have syncfs, and further does not actually guarantee durability of sync().
|
||||
#[cfg(target_os = "linux")]
|
||||
{
|
||||
use std::os::fd::AsRawFd;
|
||||
nix::unistd::syncfs(dirfd.as_raw_fd()).context("syncfs")?;
|
||||
}
|
||||
#[cfg(target_os = "macos")]
|
||||
{
|
||||
// macOS is not a production platform for Neon, don't even bother.
|
||||
drop(dirfd);
|
||||
}
|
||||
#[cfg(not(any(target_os = "linux", target_os = "macos")))]
|
||||
{
|
||||
compile_error!("Unsupported OS");
|
||||
}
|
||||
|
||||
let elapsed = started.elapsed();
|
||||
info!(
|
||||
elapsed_ms = elapsed.as_millis(),
|
||||
|
||||
@@ -180,8 +180,6 @@ pub struct PageServerConf {
|
||||
pub io_buffer_alignment: usize,
|
||||
}
|
||||
|
||||
/// Token for authentication to safekeepers
|
||||
///
|
||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||
/// and/or serialized at a whim, while the token is secret. Currently this token is the
|
||||
/// same for accessing all tenants/timelines, but may become per-tenant/per-timeline in
|
||||
|
||||
@@ -1,9 +1,7 @@
|
||||
//! Defines [`RequestContext`].
|
||||
//!
|
||||
//! It is a structure that we use throughout the pageserver to propagate
|
||||
//! high-level context from places that _originate_ activity down to the
|
||||
//! shared code paths at the heart of the pageserver. It's inspired by
|
||||
//! Golang's `context.Context`.
|
||||
//! This module defines `RequestContext`, a structure that we use throughout
|
||||
//! the pageserver to propagate high-level context from places
|
||||
//! that _originate_ activity down to the shared code paths at the
|
||||
//! heart of the pageserver. It's inspired by Golang's `context.Context`.
|
||||
//!
|
||||
//! For example, in `Timeline::get(page_nr, lsn)` we need to answer the following questions:
|
||||
//! 1. What high-level activity ([`TaskKind`]) needs this page?
|
||||
|
||||
@@ -9,7 +9,7 @@ use metrics::{
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use strum::{EnumCount, VariantNames};
|
||||
use strum_macros::{IntoStaticStr, VariantNames};
|
||||
use strum_macros::{EnumVariantNames, IntoStaticStr};
|
||||
use tracing::warn;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
@@ -27,7 +27,7 @@ const CRITICAL_OP_BUCKETS: &[f64] = &[
|
||||
];
|
||||
|
||||
// Metrics collected on operations on the storage repository.
|
||||
#[derive(Debug, VariantNames, IntoStaticStr)]
|
||||
#[derive(Debug, EnumVariantNames, IntoStaticStr)]
|
||||
#[strum(serialize_all = "kebab_case")]
|
||||
pub(crate) enum StorageTimeOperation {
|
||||
#[strum(serialize = "layer flush")]
|
||||
|
||||
@@ -1021,10 +1021,9 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// DatadirModification represents an operation to ingest an atomic set of
|
||||
/// updates to the repository.
|
||||
///
|
||||
/// It is created by the 'begin_record' function. It is called for each WAL
|
||||
/// record, so that all the modifications by a one WAL record appear atomic.
|
||||
/// updates to the repository. It is created by the 'begin_record'
|
||||
/// function. It is called for each WAL record, so that all the modifications
|
||||
/// by a one WAL record appear atomic.
|
||||
pub struct DatadirModification<'a> {
|
||||
/// The timeline this modification applies to. You can access this to
|
||||
/// read the state, but note that any pending updates are *not* reflected
|
||||
@@ -2049,7 +2048,6 @@ impl<'a> DatadirModification<'a> {
|
||||
|
||||
/// This struct facilitates accessing either a committed key from the timeline at a
|
||||
/// specific LSN, or the latest uncommitted key from a pending modification.
|
||||
///
|
||||
/// During WAL ingestion, the records from multiple LSNs may be batched in the same
|
||||
/// modification before being flushed to the timeline. Hence, the routines in WalIngest
|
||||
/// need to look up the keys in the modification first before looking them up in the
|
||||
|
||||
@@ -1,9 +1,8 @@
|
||||
//! Timeline repository implementation that keeps old data in layer files, and
|
||||
//! the recent changes in ephemeral files.
|
||||
//!
|
||||
//! See tenant/*_layer.rs files. The functions here are responsible for locating
|
||||
//! the correct layer for the get/put call, walking back the timeline branching
|
||||
//! history as needed.
|
||||
//! Timeline repository implementation that keeps old data in files on disk, and
|
||||
//! the recent changes in memory. See tenant/*_layer.rs files.
|
||||
//! The functions here are responsible for locating the correct layer for the
|
||||
//! get/put call, walking back the timeline branching history as needed.
|
||||
//!
|
||||
//! The files are stored in the .neon/tenants/<tenant_id>/timelines/<timeline_id>
|
||||
//! directory. See docs/pageserver-storage.md for how the files are managed.
|
||||
@@ -7091,13 +7090,13 @@ mod tests {
|
||||
vec![
|
||||
// Image layer at GC horizon
|
||||
PersistentLayerKey {
|
||||
key_range: Key::MIN..Key::MAX,
|
||||
key_range: Key::MIN..Key::NON_L0_MAX,
|
||||
lsn_range: Lsn(0x30)..Lsn(0x31),
|
||||
is_delta: false
|
||||
},
|
||||
// The delta layer below the horizon
|
||||
// The delta layer covers the full range (with the layer key hack to avoid being recognized as L0)
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(3)..get_key(4),
|
||||
key_range: Key::MIN..Key::NON_L0_MAX,
|
||||
lsn_range: Lsn(0x30)..Lsn(0x48),
|
||||
is_delta: true
|
||||
},
|
||||
|
||||
@@ -452,8 +452,7 @@ impl TryFrom<toml_edit::Item> for TenantConfOpt {
|
||||
.map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
|
||||
}
|
||||
toml_edit::Item::Table(table) => {
|
||||
let deserializer =
|
||||
toml_edit::de::Deserializer::from(toml_edit::DocumentMut::from(table));
|
||||
let deserializer = toml_edit::de::Deserializer::new(table.into());
|
||||
return serde_path_to_error::deserialize(deserializer)
|
||||
.map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
|
||||
}
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
//! Describes the legacy now hopefully no longer modified per-timeline metadata.
|
||||
//!
|
||||
//! It is stored in `index_part.json` managed by [`remote_timeline_client`]. For many tenants and
|
||||
//! their timelines, this struct and its original serialization format is still needed because
|
||||
//! they were written a long time ago.
|
||||
//! Describes the legacy now hopefully no longer modified per-timeline metadata stored in
|
||||
//! `index_part.json` managed by [`remote_timeline_client`]. For many tenants and their timelines,
|
||||
//! this struct and it's original serialization format is still needed because they were written a
|
||||
//! long time ago.
|
||||
//!
|
||||
//! Instead of changing and adding versioning to this, just change [`IndexPart`] with soft json
|
||||
//! versioning.
|
||||
|
||||
@@ -282,10 +282,9 @@ impl BackgroundPurges {
|
||||
static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
|
||||
Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing));
|
||||
|
||||
/// Responsible for storing and mutating the collection of all tenants
|
||||
/// that this pageserver has state for.
|
||||
///
|
||||
/// Every Tenant and SecondaryTenant instance lives inside the TenantManager.
|
||||
/// The TenantManager is responsible for storing and mutating the collection of all tenants
|
||||
/// that this pageserver process has state for. Every Tenant and SecondaryTenant instance
|
||||
/// lives inside the TenantManager.
|
||||
///
|
||||
/// The most important role of the TenantManager is to prevent conflicts: e.g. trying to attach
|
||||
/// the same tenant twice concurrently, or trying to configure the same tenant into secondary
|
||||
@@ -2347,9 +2346,8 @@ pub enum TenantMapError {
|
||||
ShuttingDown,
|
||||
}
|
||||
|
||||
/// Guards a particular tenant_id's content in the TenantsMap.
|
||||
///
|
||||
/// While this structure exists, the TenantsMap will contain a [`TenantSlot::InProgress`]
|
||||
/// Guards a particular tenant_id's content in the TenantsMap. While this
|
||||
/// structure exists, the TenantsMap will contain a [`TenantSlot::InProgress`]
|
||||
/// for this tenant, which acts as a marker for any operations targeting
|
||||
/// this tenant to retry later, or wait for the InProgress state to end.
|
||||
///
|
||||
|
||||
@@ -2184,8 +2184,6 @@ pub fn remote_timeline_path(
|
||||
remote_timelines_path(tenant_shard_id).join(Utf8Path::new(&timeline_id.to_string()))
|
||||
}
|
||||
|
||||
/// Obtains the path of the given Layer in the remote
|
||||
///
|
||||
/// Note that the shard component of a remote layer path is _not_ always the same
|
||||
/// as in the TenantShardId of the caller: tenants may reference layers from a different
|
||||
/// ShardIndex. Use the ShardIndex from the layer's metadata.
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
//! In-memory index to track the tenant files on the remote storage.
|
||||
//!
|
||||
//! Able to restore itself from the storage index parts, that are located in every timeline's remote directory and contain all data about
|
||||
//! remote timeline layers and its metadata.
|
||||
|
||||
|
||||
@@ -434,11 +434,10 @@ impl ReadableLayer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Layers contain a hint indicating whether they are likely to be used for reads.
|
||||
///
|
||||
/// This is a hint rather than an authoritative value, so that we do not have to update it synchronously
|
||||
/// when changing the visibility of layers (for example when creating a branch that makes some previously
|
||||
/// covered layers visible). It should be used for cache management but not for correctness-critical checks.
|
||||
/// Layers contain a hint indicating whether they are likely to be used for reads. This is a hint rather
|
||||
/// than an authoritative value, so that we do not have to update it synchronously when changing the visibility
|
||||
/// of layers (for example when creating a branch that makes some previously covered layers visible). It should
|
||||
/// be used for cache management but not for correctness-critical checks.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum LayerVisibilityHint {
|
||||
/// A Visible layer might be read while serving a read, because there is not an image layer between it
|
||||
|
||||
@@ -136,11 +136,10 @@ impl Summary {
|
||||
// Flag indicating that this version initialize the page
|
||||
const WILL_INIT: u64 = 1;
|
||||
|
||||
/// Struct representing reference to BLOB in layers.
|
||||
///
|
||||
/// Reference contains BLOB offset, and for WAL records it also contains
|
||||
/// `will_init` flag. The flag helps to determine the range of records
|
||||
/// that needs to be applied, without reading/deserializing records themselves.
|
||||
/// Struct representing reference to BLOB in layers. Reference contains BLOB
|
||||
/// offset, and for WAL records it also contains `will_init` flag. The flag
|
||||
/// helps to determine the range of records that needs to be applied, without
|
||||
/// reading/deserializing records themselves.
|
||||
#[derive(Debug, Serialize, Deserialize, Copy, Clone)]
|
||||
pub struct BlobRef(pub u64);
|
||||
|
||||
|
||||
@@ -1,9 +1,7 @@
|
||||
//! An ImageLayer represents an image or a snapshot of a key-range at
|
||||
//! one particular LSN.
|
||||
//!
|
||||
//! It contains an image of all key-value pairs in its key-range. Any key
|
||||
//! that falls into the image layer's range but does not exist in the layer,
|
||||
//! does not exist.
|
||||
//! one particular LSN. It contains an image of all key-value pairs
|
||||
//! in its key-range. Any key that falls into the image layer's range
|
||||
//! but does not exist in the layer, does not exist.
|
||||
//!
|
||||
//! An image layer is stored in a file on disk. The file is stored in
|
||||
//! timelines/<timeline_id> directory. Currently, there are no
|
||||
|
||||
@@ -12,10 +12,8 @@ use serde::{Deserialize, Serialize};
|
||||
#[cfg(test)]
|
||||
use utils::id::TenantId;
|
||||
|
||||
/// A unique identifier of a persistent layer.
|
||||
///
|
||||
/// This is different from `LayerDescriptor`, which is only used in the benchmarks.
|
||||
/// This struct contains all necessary information to find the image / delta layer. It also provides
|
||||
/// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the
|
||||
/// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides
|
||||
/// a unified way to generate layer information like file name.
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Hash)]
|
||||
pub struct PersistentLayerDesc {
|
||||
|
||||
@@ -217,9 +217,8 @@ impl fmt::Display for ImageLayerName {
|
||||
}
|
||||
}
|
||||
|
||||
/// LayerName is the logical identity of a layer within a LayerMap at a moment in time.
|
||||
///
|
||||
/// The LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations
|
||||
/// LayerName is the logical identity of a layer within a LayerMap at a moment in time. The
|
||||
/// LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations
|
||||
/// over time (e.g. across shard splits or compression). The physical filenames of layers in local
|
||||
/// storage and object names in remote storage consist of the LayerName plus some extra qualifiers
|
||||
/// that uniquely identify the physical incarnation of a layer (see [crate::tenant::remote_timeline_client::remote_layer_path])
|
||||
|
||||
@@ -226,11 +226,9 @@ impl<'a> IteratorWrapper<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
/// A merge iterator over delta/image layer iterators.
|
||||
///
|
||||
/// When duplicated records are found, the iterator will not perform any
|
||||
/// deduplication, and the caller should handle these situation. By saying
|
||||
/// duplicated records, there are many possibilities:
|
||||
/// A merge iterator over delta/image layer iterators. When duplicated records are
|
||||
/// found, the iterator will not perform any deduplication, and the caller should handle
|
||||
/// these situation. By saying duplicated records, there are many possibilities:
|
||||
///
|
||||
/// * Two same delta at the same LSN.
|
||||
/// * Two same image at the same LSN.
|
||||
|
||||
@@ -34,10 +34,9 @@ impl SplitWriterResult {
|
||||
}
|
||||
}
|
||||
|
||||
/// An image writer that takes images and produces multiple image layers.
|
||||
///
|
||||
/// The interface does not guarantee atomicity (i.e., if the image layer generation
|
||||
/// fails, there might be leftover files to be cleaned up)
|
||||
/// An image writer that takes images and produces multiple image layers. The interface does not
|
||||
/// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files
|
||||
/// to be cleaned up)
|
||||
#[must_use]
|
||||
pub struct SplitImageLayerWriter {
|
||||
inner: ImageLayerWriter,
|
||||
@@ -188,23 +187,22 @@ impl SplitImageLayerWriter {
|
||||
.await
|
||||
}
|
||||
|
||||
/// This function will be deprecated with #8841.
|
||||
/// When split writer fails, the caller should call this function and handle partially generated layers.
|
||||
pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, ImageLayerWriter)> {
|
||||
Ok((self.generated_layers, self.inner))
|
||||
}
|
||||
}
|
||||
|
||||
/// A delta writer that takes key-lsn-values and produces multiple delta layers.
|
||||
///
|
||||
/// The interface does not guarantee atomicity (i.e., if the delta layer generation fails,
|
||||
/// there might be leftover files to be cleaned up).
|
||||
/// A delta writer that takes key-lsn-values and produces multiple delta layers. The interface does not
|
||||
/// guarantee atomicity (i.e., if the delta layer generation fails, there might be leftover files
|
||||
/// to be cleaned up).
|
||||
///
|
||||
/// Note that if updates of a single key exceed the target size limit, all of the updates will be batched
|
||||
/// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm
|
||||
/// will split them into multiple files based on size.
|
||||
#[must_use]
|
||||
pub struct SplitDeltaLayerWriter {
|
||||
inner: Option<(Key, DeltaLayerWriter)>,
|
||||
inner: DeltaLayerWriter,
|
||||
target_layer_size: u64,
|
||||
generated_layers: Vec<SplitWriterResult>,
|
||||
conf: &'static PageServerConf,
|
||||
@@ -212,6 +210,7 @@ pub struct SplitDeltaLayerWriter {
|
||||
tenant_shard_id: TenantShardId,
|
||||
lsn_range: Range<Lsn>,
|
||||
last_key_written: Key,
|
||||
start_key: Key,
|
||||
}
|
||||
|
||||
impl SplitDeltaLayerWriter {
|
||||
@@ -219,18 +218,29 @@ impl SplitDeltaLayerWriter {
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
start_key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
target_layer_size: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Self> {
|
||||
Ok(Self {
|
||||
target_layer_size,
|
||||
inner: None,
|
||||
inner: DeltaLayerWriter::new(
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
start_key,
|
||||
lsn_range.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
generated_layers: Vec::new(),
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
lsn_range,
|
||||
last_key_written: Key::MIN,
|
||||
start_key,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -253,26 +263,9 @@ impl SplitDeltaLayerWriter {
|
||||
//
|
||||
// Also, keep all updates of a single key in a single file. TODO: split them using the legacy compaction
|
||||
// strategy. https://github.com/neondatabase/neon/issues/8837
|
||||
|
||||
if self.inner.is_none() {
|
||||
self.inner = Some((
|
||||
key,
|
||||
DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
key,
|
||||
self.lsn_range.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
));
|
||||
}
|
||||
let (_, inner) = self.inner.as_mut().unwrap();
|
||||
|
||||
let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */;
|
||||
if inner.num_keys() >= 1
|
||||
&& inner.estimated_size() + addition_size_estimation >= self.target_layer_size
|
||||
if self.inner.num_keys() >= 1
|
||||
&& self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
|
||||
{
|
||||
if key != self.last_key_written {
|
||||
let next_delta_writer = DeltaLayerWriter::new(
|
||||
@@ -284,13 +277,13 @@ impl SplitDeltaLayerWriter {
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
let (start_key, prev_delta_writer) =
|
||||
std::mem::replace(&mut self.inner, Some((key, next_delta_writer))).unwrap();
|
||||
let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer);
|
||||
let layer_key = PersistentLayerKey {
|
||||
key_range: start_key..key,
|
||||
key_range: self.start_key..key,
|
||||
lsn_range: self.lsn_range.clone(),
|
||||
is_delta: true,
|
||||
};
|
||||
self.start_key = key;
|
||||
if discard(&layer_key).await {
|
||||
drop(prev_delta_writer);
|
||||
self.generated_layers
|
||||
@@ -301,18 +294,17 @@ impl SplitDeltaLayerWriter {
|
||||
self.generated_layers
|
||||
.push(SplitWriterResult::Produced(delta_layer));
|
||||
}
|
||||
} else if inner.estimated_size() >= S3_UPLOAD_LIMIT {
|
||||
} else if self.inner.estimated_size() >= S3_UPLOAD_LIMIT {
|
||||
// We have to produce a very large file b/c a key is updated too often.
|
||||
anyhow::bail!(
|
||||
"a single key is updated too often: key={}, estimated_size={}, and the layer file cannot be produced",
|
||||
key,
|
||||
inner.estimated_size()
|
||||
self.inner.estimated_size()
|
||||
);
|
||||
}
|
||||
}
|
||||
self.last_key_written = key;
|
||||
let (_, inner) = self.inner.as_mut().unwrap();
|
||||
inner.put_value(key, lsn, val, ctx).await
|
||||
self.inner.put_value(key, lsn, val, ctx).await
|
||||
}
|
||||
|
||||
pub async fn put_value(
|
||||
@@ -331,6 +323,7 @@ impl SplitDeltaLayerWriter {
|
||||
self,
|
||||
tline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
end_key: Key,
|
||||
discard: D,
|
||||
) -> anyhow::Result<Vec<SplitWriterResult>>
|
||||
where
|
||||
@@ -342,15 +335,11 @@ impl SplitDeltaLayerWriter {
|
||||
inner,
|
||||
..
|
||||
} = self;
|
||||
let Some((start_key, inner)) = inner else {
|
||||
return Ok(generated_layers);
|
||||
};
|
||||
if inner.num_keys() == 0 {
|
||||
return Ok(generated_layers);
|
||||
}
|
||||
let end_key = self.last_key_written.next();
|
||||
let layer_key = PersistentLayerKey {
|
||||
key_range: start_key..end_key,
|
||||
key_range: self.start_key..end_key,
|
||||
lsn_range: self.lsn_range.clone(),
|
||||
is_delta: true,
|
||||
};
|
||||
@@ -369,14 +358,15 @@ impl SplitDeltaLayerWriter {
|
||||
self,
|
||||
tline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
end_key: Key,
|
||||
) -> anyhow::Result<Vec<SplitWriterResult>> {
|
||||
self.finish_with_discard_fn(tline, ctx, |_| async { false })
|
||||
self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false })
|
||||
.await
|
||||
}
|
||||
|
||||
/// This function will be deprecated with #8841.
|
||||
pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, Option<DeltaLayerWriter>)> {
|
||||
Ok((self.generated_layers, self.inner.map(|x| x.1)))
|
||||
/// When split writer fails, the caller should call this function and handle partially generated layers.
|
||||
pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, DeltaLayerWriter)> {
|
||||
Ok((self.generated_layers, self.inner))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -440,8 +430,10 @@ mod tests {
|
||||
tenant.conf,
|
||||
tline.timeline_id,
|
||||
tenant.tenant_shard_id,
|
||||
get_key(0),
|
||||
Lsn(0x18)..Lsn(0x20),
|
||||
4 * 1024 * 1024,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -466,22 +458,11 @@ mod tests {
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let layers = delta_writer.finish(&tline, &ctx).await.unwrap();
|
||||
let layers = delta_writer
|
||||
.finish(&tline, &ctx, get_key(10))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(layers.len(), 1);
|
||||
assert_eq!(
|
||||
layers
|
||||
.into_iter()
|
||||
.next()
|
||||
.unwrap()
|
||||
.into_resident_layer()
|
||||
.layer_desc()
|
||||
.key(),
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(0)..get_key(1),
|
||||
lsn_range: Lsn(0x18)..Lsn(0x20),
|
||||
is_delta: true
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -518,8 +499,10 @@ mod tests {
|
||||
tenant.conf,
|
||||
tline.timeline_id,
|
||||
tenant.tenant_shard_id,
|
||||
get_key(0),
|
||||
Lsn(0x18)..Lsn(0x20),
|
||||
4 * 1024 * 1024,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -548,7 +531,10 @@ mod tests {
|
||||
.finish(&tline, &ctx, get_key(N as u32))
|
||||
.await
|
||||
.unwrap();
|
||||
let delta_layers = delta_writer.finish(&tline, &ctx).await.unwrap();
|
||||
let delta_layers = delta_writer
|
||||
.finish(&tline, &ctx, get_key(N as u32))
|
||||
.await
|
||||
.unwrap();
|
||||
if discard {
|
||||
for layer in image_layers {
|
||||
layer.into_discarded_layer();
|
||||
@@ -567,14 +553,6 @@ mod tests {
|
||||
.collect_vec();
|
||||
assert_eq!(image_layers.len(), N / 512 + 1);
|
||||
assert_eq!(delta_layers.len(), N / 512 + 1);
|
||||
assert_eq!(
|
||||
delta_layers.first().unwrap().layer_desc().key_range.start,
|
||||
get_key(0)
|
||||
);
|
||||
assert_eq!(
|
||||
delta_layers.last().unwrap().layer_desc().key_range.end,
|
||||
get_key(N as u32)
|
||||
);
|
||||
for idx in 0..image_layers.len() {
|
||||
assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN);
|
||||
assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX);
|
||||
@@ -622,8 +600,10 @@ mod tests {
|
||||
tenant.conf,
|
||||
tline.timeline_id,
|
||||
tenant.tenant_shard_id,
|
||||
get_key(0),
|
||||
Lsn(0x18)..Lsn(0x20),
|
||||
4 * 1024,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -662,35 +642,11 @@ mod tests {
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let layers = delta_writer.finish(&tline, &ctx).await.unwrap();
|
||||
let layers = delta_writer
|
||||
.finish(&tline, &ctx, get_key(10))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(layers.len(), 2);
|
||||
let mut layers_iter = layers.into_iter();
|
||||
assert_eq!(
|
||||
layers_iter
|
||||
.next()
|
||||
.unwrap()
|
||||
.into_resident_layer()
|
||||
.layer_desc()
|
||||
.key(),
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(0)..get_key(1),
|
||||
lsn_range: Lsn(0x18)..Lsn(0x20),
|
||||
is_delta: true
|
||||
}
|
||||
);
|
||||
assert_eq!(
|
||||
layers_iter
|
||||
.next()
|
||||
.unwrap()
|
||||
.into_resident_layer()
|
||||
.layer_desc()
|
||||
.key(),
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(1)..get_key(2),
|
||||
lsn_range: Lsn(0x18)..Lsn(0x20),
|
||||
is_delta: true
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -710,8 +666,10 @@ mod tests {
|
||||
tenant.conf,
|
||||
tline.timeline_id,
|
||||
tenant.tenant_shard_id,
|
||||
get_key(0),
|
||||
Lsn(0x10)..Lsn(N as u64 * 16 + 0x10),
|
||||
4 * 1024 * 1024,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -729,20 +687,10 @@ mod tests {
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
let delta_layers = delta_writer.finish(&tline, &ctx).await.unwrap();
|
||||
let delta_layers = delta_writer
|
||||
.finish(&tline, &ctx, get_key(N as u32))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(delta_layers.len(), 1);
|
||||
let delta_layer = delta_layers
|
||||
.into_iter()
|
||||
.next()
|
||||
.unwrap()
|
||||
.into_resident_layer();
|
||||
assert_eq!(
|
||||
delta_layer.layer_desc().key(),
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(0)..get_key(1),
|
||||
lsn_range: Lsn(0x10)..Lsn(N as u64 * 16 + 0x10),
|
||||
is_delta: true
|
||||
}
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1809,6 +1809,7 @@ impl Timeline {
|
||||
.unwrap();
|
||||
// We don't want any of the produced layers to cover the full key range (i.e., MIN..MAX) b/c it will then be recognized
|
||||
// as an L0 layer.
|
||||
let hack_end_key = Key::NON_L0_MAX;
|
||||
let mut delta_layers = Vec::new();
|
||||
let mut image_layers = Vec::new();
|
||||
let mut downloaded_layers = Vec::new();
|
||||
@@ -1854,8 +1855,10 @@ impl Timeline {
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
Key::MIN,
|
||||
lowest_retain_lsn..end_lsn,
|
||||
self.get_compaction_target_size(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -1962,7 +1965,7 @@ impl Timeline {
|
||||
let produced_image_layers = if let Some(writer) = image_layer_writer {
|
||||
if !dry_run {
|
||||
writer
|
||||
.finish_with_discard_fn(self, ctx, Key::MAX, discard)
|
||||
.finish_with_discard_fn(self, ctx, hack_end_key, discard)
|
||||
.await?
|
||||
} else {
|
||||
let (layers, _) = writer.take()?;
|
||||
@@ -1975,7 +1978,7 @@ impl Timeline {
|
||||
|
||||
let produced_delta_layers = if !dry_run {
|
||||
delta_layer_writer
|
||||
.finish_with_discard_fn(self, ctx, discard)
|
||||
.finish_with_discard_fn(self, ctx, hack_end_key, discard)
|
||||
.await?
|
||||
} else {
|
||||
let (layers, _) = delta_layer_writer.take()?;
|
||||
|
||||
@@ -593,10 +593,8 @@ impl<'a> VectoredBlobReader<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`].
|
||||
///
|
||||
/// It provides a streaming API for getting read blobs. It returns a batch when
|
||||
/// `handle` gets called and when the current key would just exceed the read_size and
|
||||
/// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. It provides a streaming API for
|
||||
/// getting read blobs. It returns a batch when `handle` gets called and when the current key would just exceed the read_size and
|
||||
/// max_cnt constraints.
|
||||
pub struct StreamingVectoredReadPlanner {
|
||||
read_builder: Option<VectoredReadBuilder>,
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
//! VirtualFile is like a normal File, but it's not bound directly to
|
||||
//! a file descriptor.
|
||||
//!
|
||||
//! Instead, the file is opened when it's read from,
|
||||
//! VirtualFile is like a normal File, but it's not bound directly to
|
||||
//! a file descriptor. Instead, the file is opened when it's read from,
|
||||
//! and if too many files are open globally in the system, least-recently
|
||||
//! used ones are closed.
|
||||
//!
|
||||
|
||||
@@ -25,7 +25,9 @@ use std::time::Duration;
|
||||
use std::time::SystemTime;
|
||||
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz};
|
||||
use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
|
||||
use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
|
||||
use postgres_ffi::TimestampTz;
|
||||
use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
|
||||
|
||||
use anyhow::{bail, Context, Result};
|
||||
@@ -46,31 +48,16 @@ use pageserver_api::key::rel_block_to_key;
|
||||
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment;
|
||||
use postgres_ffi::v14::xlog_utils::*;
|
||||
use postgres_ffi::v14::CheckPoint;
|
||||
use postgres_ffi::TransactionId;
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use utils::bin_ser::SerializeError;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
enum_pgversion! {CheckPoint, pgv::CheckPoint}
|
||||
|
||||
impl CheckPoint {
|
||||
fn encode(&self) -> Result<Bytes, SerializeError> {
|
||||
enum_pgversion_dispatch!(self, CheckPoint, cp, { cp.encode() })
|
||||
}
|
||||
|
||||
fn update_next_xid(&mut self, xid: u32) -> bool {
|
||||
enum_pgversion_dispatch!(self, CheckPoint, cp, { cp.update_next_xid(xid) })
|
||||
}
|
||||
|
||||
pub fn update_next_multixid(&mut self, multi_xid: u32, multi_offset: u32) -> bool {
|
||||
enum_pgversion_dispatch!(self, CheckPoint, cp, {
|
||||
cp.update_next_multixid(multi_xid, multi_offset)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub struct WalIngest {
|
||||
shard: ShardIdentity,
|
||||
pg_version: u32,
|
||||
checkpoint: CheckPoint,
|
||||
checkpoint_modified: bool,
|
||||
warn_ingest_lag: WarnIngestLag,
|
||||
@@ -91,16 +78,12 @@ impl WalIngest {
|
||||
// Fetch the latest checkpoint into memory, so that we can compare with it
|
||||
// quickly in `ingest_record` and update it when it changes.
|
||||
let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?;
|
||||
let pgversion = timeline.pg_version;
|
||||
|
||||
let checkpoint = dispatch_pgversion!(pgversion, {
|
||||
let checkpoint = pgv::CheckPoint::decode(&checkpoint_bytes)?;
|
||||
trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);
|
||||
<pgv::CheckPoint as Into<CheckPoint>>::into(checkpoint)
|
||||
});
|
||||
let checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
|
||||
trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);
|
||||
|
||||
Ok(WalIngest {
|
||||
shard: *timeline.get_shard_identity(),
|
||||
pg_version: timeline.pg_version,
|
||||
checkpoint,
|
||||
checkpoint_modified: false,
|
||||
warn_ingest_lag: WarnIngestLag {
|
||||
@@ -134,7 +117,7 @@ impl WalIngest {
|
||||
|
||||
modification.set_lsn(lsn)?;
|
||||
|
||||
if decoded.is_dbase_create_copy(pg_version) {
|
||||
if decoded.is_dbase_create_copy(self.pg_version) {
|
||||
// Records of this type should always be preceded by a commit(), as they
|
||||
// rely on reading data pages back from the Timeline.
|
||||
assert!(!modification.has_dirty_data_pages());
|
||||
@@ -354,67 +337,70 @@ impl WalIngest {
|
||||
pg_constants::RM_XLOG_ID => {
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
|
||||
enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
|
||||
if info == pg_constants::XLOG_NEXTOID {
|
||||
let next_oid = buf.get_u32_le();
|
||||
if cp.nextOid != next_oid {
|
||||
cp.nextOid = next_oid;
|
||||
self.checkpoint_modified = true;
|
||||
}
|
||||
} else if info == pg_constants::XLOG_CHECKPOINT_ONLINE
|
||||
|| info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
|
||||
{
|
||||
let mut checkpoint_bytes = [0u8; pgv::xlog_utils::SIZEOF_CHECKPOINT];
|
||||
buf.copy_to_slice(&mut checkpoint_bytes);
|
||||
let xlog_checkpoint = pgv::CheckPoint::decode(&checkpoint_bytes)?;
|
||||
trace!(
|
||||
"xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}",
|
||||
xlog_checkpoint.oldestXid,
|
||||
cp.oldestXid
|
||||
);
|
||||
if (cp.oldestXid.wrapping_sub(xlog_checkpoint.oldestXid) as i32) < 0 {
|
||||
cp.oldestXid = xlog_checkpoint.oldestXid;
|
||||
}
|
||||
trace!(
|
||||
"xlog_checkpoint.oldestActiveXid={}, checkpoint.oldestActiveXid={}",
|
||||
xlog_checkpoint.oldestActiveXid,
|
||||
cp.oldestActiveXid
|
||||
);
|
||||
|
||||
// A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`,
|
||||
// because at shutdown, all in-progress transactions will implicitly
|
||||
// end. Postgres startup code knows that, and allows hot standby to start
|
||||
// immediately from a shutdown checkpoint.
|
||||
//
|
||||
// In Neon, Postgres hot standby startup always behaves as if starting from
|
||||
// an online checkpoint. It needs a valid `oldestActiveXid` value, so
|
||||
// instead of overwriting self.checkpoint.oldestActiveXid with
|
||||
// InvalidTransactionid from the checkpoint WAL record, update it to a
|
||||
// proper value, knowing that there are no in-progress transactions at this
|
||||
// point, except for prepared transactions.
|
||||
//
|
||||
// See also the neon code changes in the InitWalRecovery() function.
|
||||
if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID
|
||||
&& info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
|
||||
{
|
||||
let mut oldest_active_xid = cp.nextXid.value as u32;
|
||||
for xid in modification.tline.list_twophase_files(lsn, ctx).await? {
|
||||
if (xid.wrapping_sub(oldest_active_xid) as i32) < 0 {
|
||||
oldest_active_xid = xid;
|
||||
}
|
||||
}
|
||||
cp.oldestActiveXid = oldest_active_xid;
|
||||
} else {
|
||||
cp.oldestActiveXid = xlog_checkpoint.oldestActiveXid;
|
||||
}
|
||||
|
||||
// Write a new checkpoint key-value pair on every checkpoint record, even
|
||||
// if nothing really changed. Not strictly required, but it seems nice to
|
||||
// have some trace of the checkpoint records in the layer files at the same
|
||||
// LSNs.
|
||||
if info == pg_constants::XLOG_NEXTOID {
|
||||
let next_oid = buf.get_u32_le();
|
||||
if self.checkpoint.nextOid != next_oid {
|
||||
self.checkpoint.nextOid = next_oid;
|
||||
self.checkpoint_modified = true;
|
||||
}
|
||||
});
|
||||
} else if info == pg_constants::XLOG_CHECKPOINT_ONLINE
|
||||
|| info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
|
||||
{
|
||||
let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT];
|
||||
buf.copy_to_slice(&mut checkpoint_bytes);
|
||||
let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
|
||||
trace!(
|
||||
"xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}",
|
||||
xlog_checkpoint.oldestXid,
|
||||
self.checkpoint.oldestXid
|
||||
);
|
||||
if (self
|
||||
.checkpoint
|
||||
.oldestXid
|
||||
.wrapping_sub(xlog_checkpoint.oldestXid) as i32)
|
||||
< 0
|
||||
{
|
||||
self.checkpoint.oldestXid = xlog_checkpoint.oldestXid;
|
||||
}
|
||||
trace!(
|
||||
"xlog_checkpoint.oldestActiveXid={}, checkpoint.oldestActiveXid={}",
|
||||
xlog_checkpoint.oldestActiveXid,
|
||||
self.checkpoint.oldestActiveXid
|
||||
);
|
||||
|
||||
// A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`,
|
||||
// because at shutdown, all in-progress transactions will implicitly
|
||||
// end. Postgres startup code knows that, and allows hot standby to start
|
||||
// immediately from a shutdown checkpoint.
|
||||
//
|
||||
// In Neon, Postgres hot standby startup always behaves as if starting from
|
||||
// an online checkpoint. It needs a valid `oldestActiveXid` value, so
|
||||
// instead of overwriting self.checkpoint.oldestActiveXid with
|
||||
// InvalidTransactionid from the checkpoint WAL record, update it to a
|
||||
// proper value, knowing that there are no in-progress transactions at this
|
||||
// point, except for prepared transactions.
|
||||
//
|
||||
// See also the neon code changes in the InitWalRecovery() function.
|
||||
if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID
|
||||
&& info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
|
||||
{
|
||||
let mut oldest_active_xid = self.checkpoint.nextXid.value as u32;
|
||||
for xid in modification.tline.list_twophase_files(lsn, ctx).await? {
|
||||
if (xid.wrapping_sub(oldest_active_xid) as i32) < 0 {
|
||||
oldest_active_xid = xid;
|
||||
}
|
||||
}
|
||||
self.checkpoint.oldestActiveXid = oldest_active_xid;
|
||||
} else {
|
||||
self.checkpoint.oldestActiveXid = xlog_checkpoint.oldestActiveXid;
|
||||
}
|
||||
|
||||
// Write a new checkpoint key-value pair on every checkpoint record, even
|
||||
// if nothing really changed. Not strictly required, but it seems nice to
|
||||
// have some trace of the checkpoint records in the layer files at the same
|
||||
// LSNs.
|
||||
self.checkpoint_modified = true;
|
||||
}
|
||||
}
|
||||
pg_constants::RM_LOGICALMSG_ID => {
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
@@ -438,11 +424,7 @@ impl WalIngest {
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_RUNNING_XACTS {
|
||||
let xlrec = crate::walrecord::XlRunningXacts::decode(&mut buf);
|
||||
|
||||
enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
|
||||
cp.oldestActiveXid = xlrec.oldest_running_xid;
|
||||
});
|
||||
|
||||
self.checkpoint.oldestActiveXid = xlrec.oldest_running_xid;
|
||||
self.checkpoint_modified = true;
|
||||
}
|
||||
}
|
||||
@@ -557,7 +539,7 @@ impl WalIngest {
|
||||
&& blk.has_image
|
||||
&& decoded.xl_rmid == pg_constants::RM_XLOG_ID
|
||||
&& (decoded.xl_info == pg_constants::XLOG_FPI
|
||||
|| decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
|
||||
|| decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
|
||||
// compression of WAL is not yet supported: fall back to storing the original WAL record
|
||||
&& !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version)
|
||||
// do not materialize null pages because them most likely be soon replaced with real data
|
||||
@@ -1260,17 +1242,12 @@ impl WalIngest {
|
||||
fn warn_on_ingest_lag(
|
||||
&mut self,
|
||||
conf: &crate::config::PageServerConf,
|
||||
wal_timestamp: TimestampTz,
|
||||
wal_timestmap: TimestampTz,
|
||||
) {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
let now = SystemTime::now();
|
||||
let rate_limits = &mut self.warn_ingest_lag;
|
||||
|
||||
let ts = enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, _cp, {
|
||||
pgv::xlog_utils::try_from_pg_timestamp(wal_timestamp)
|
||||
});
|
||||
|
||||
match ts {
|
||||
match try_from_pg_timestamp(wal_timestmap) {
|
||||
Ok(ts) => {
|
||||
match now.duration_since(ts) {
|
||||
Ok(lag) => {
|
||||
@@ -1280,7 +1257,7 @@ impl WalIngest {
|
||||
warn!(%rate_limit_stats, %lag, "ingesting record with timestamp lagging more than wait_lsn_timeout");
|
||||
})
|
||||
}
|
||||
}
|
||||
},
|
||||
Err(e) => {
|
||||
let delta_t = e.duration();
|
||||
// determined by prod victoriametrics query: 1000 * (timestamp(node_time_seconds{neon_service="pageserver"}) - node_time_seconds)
|
||||
@@ -1294,6 +1271,7 @@ impl WalIngest {
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
Err(error) => {
|
||||
rate_limits.timestamp_invalid_msg_ratelimit.call2(|rate_limit_stats| {
|
||||
@@ -1401,17 +1379,14 @@ impl WalIngest {
|
||||
// truncated, but a checkpoint record with the updated values isn't written until
|
||||
// later. In Neon, a server can start at any LSN, not just on a checkpoint record,
|
||||
// so we keep the oldestXid and oldestXidDB up-to-date.
|
||||
enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
|
||||
cp.oldestXid = xlrec.oldest_xid;
|
||||
cp.oldestXidDB = xlrec.oldest_xid_db;
|
||||
});
|
||||
self.checkpoint.oldestXid = xlrec.oldest_xid;
|
||||
self.checkpoint.oldestXidDB = xlrec.oldest_xid_db;
|
||||
self.checkpoint_modified = true;
|
||||
|
||||
// TODO Treat AdvanceOldestClogXid() or write a comment why we don't need it
|
||||
|
||||
let latest_page_number =
|
||||
enum_pgversion_dispatch!(self.checkpoint, CheckPoint, cp, { cp.nextXid.value }) as u32
|
||||
/ pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
self.checkpoint.nextXid.value as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
|
||||
// Now delete all segments containing pages between xlrec.pageno
|
||||
// and latest_page_number.
|
||||
@@ -1419,9 +1394,7 @@ impl WalIngest {
|
||||
// First, make an important safety check:
|
||||
// the current endpoint page must not be eligible for removal.
|
||||
// See SimpleLruTruncate() in slru.c
|
||||
if dispatch_pgversion!(modification.tline.pg_version, {
|
||||
pgv::nonrelfile_utils::clogpage_precedes(latest_page_number, xlrec.pageno)
|
||||
}) {
|
||||
if clogpage_precedes(latest_page_number, xlrec.pageno) {
|
||||
info!("could not truncate directory pg_xact apparent wraparound");
|
||||
return Ok(());
|
||||
}
|
||||
@@ -1438,12 +1411,7 @@ impl WalIngest {
|
||||
.await?
|
||||
{
|
||||
let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
|
||||
let may_delete = dispatch_pgversion!(modification.tline.pg_version, {
|
||||
pgv::nonrelfile_utils::slru_may_delete_clogsegment(segpage, xlrec.pageno)
|
||||
});
|
||||
|
||||
if may_delete {
|
||||
if slru_may_delete_clogsegment(segpage, xlrec.pageno) {
|
||||
modification
|
||||
.drop_slru_segment(SlruKind::Clog, segno, ctx)
|
||||
.await?;
|
||||
@@ -1562,23 +1530,14 @@ impl WalIngest {
|
||||
xlrec: &XlMultiXactTruncate,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
let (maxsegment, startsegment, endsegment) =
|
||||
enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
|
||||
cp.oldestMulti = xlrec.end_trunc_off;
|
||||
cp.oldestMultiDB = xlrec.oldest_multi_db;
|
||||
let maxsegment: i32 = pgv::nonrelfile_utils::mx_offset_to_member_segment(
|
||||
pg_constants::MAX_MULTIXACT_OFFSET,
|
||||
);
|
||||
let startsegment: i32 =
|
||||
pgv::nonrelfile_utils::mx_offset_to_member_segment(xlrec.start_trunc_memb);
|
||||
let endsegment: i32 =
|
||||
pgv::nonrelfile_utils::mx_offset_to_member_segment(xlrec.end_trunc_memb);
|
||||
(maxsegment, startsegment, endsegment)
|
||||
});
|
||||
|
||||
self.checkpoint.oldestMulti = xlrec.end_trunc_off;
|
||||
self.checkpoint.oldestMultiDB = xlrec.oldest_multi_db;
|
||||
self.checkpoint_modified = true;
|
||||
|
||||
// PerformMembersTruncation
|
||||
let maxsegment: i32 = mx_offset_to_member_segment(pg_constants::MAX_MULTIXACT_OFFSET);
|
||||
let startsegment: i32 = mx_offset_to_member_segment(xlrec.start_trunc_memb);
|
||||
let endsegment: i32 = mx_offset_to_member_segment(xlrec.end_trunc_memb);
|
||||
let mut segment: i32 = startsegment;
|
||||
|
||||
// Delete all the segments except the last one. The last segment can still
|
||||
@@ -1852,23 +1811,11 @@ mod tests {
|
||||
// TODO
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_zeroed_checkpoint_decodes_correctly() -> Result<()> {
|
||||
for i in 14..=16 {
|
||||
dispatch_pgversion!(i, {
|
||||
pgv::CheckPoint::decode(&pgv::ZERO_CHECKPOINT)?;
|
||||
});
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);
|
||||
|
||||
async fn init_walingest_test(tline: &Timeline, ctx: &RequestContext) -> Result<WalIngest> {
|
||||
let mut m = tline.begin_modification(Lsn(0x10));
|
||||
m.put_checkpoint(dispatch_pgversion!(
|
||||
tline.pg_version,
|
||||
pgv::ZERO_CHECKPOINT.clone()
|
||||
))?;
|
||||
m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
|
||||
m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file
|
||||
m.commit(ctx).await?;
|
||||
let walingest = WalIngest::new(tline, Lsn(0x10), ctx).await?;
|
||||
|
||||
@@ -43,12 +43,13 @@ use utils::lsn::Lsn;
|
||||
use utils::sync::gate::GateError;
|
||||
use utils::sync::heavier_once_cell;
|
||||
|
||||
/// The real implementation that uses a Postgres process to
|
||||
/// perform WAL replay.
|
||||
///
|
||||
/// Only one thread can use the process at a time, that is controlled by the
|
||||
/// Mutex. In the future, we might want to launch a pool of processes to allow
|
||||
/// concurrent replay of multiple records.
|
||||
/// This is the real implementation that uses a Postgres process to
|
||||
/// perform WAL replay. Only one thread can use the process at a time,
|
||||
/// that is controlled by the Mutex. In the future, we might want to
|
||||
/// launch a pool of processes to allow concurrent replay of multiple
|
||||
/// records.
|
||||
///
|
||||
pub struct PostgresRedoManager {
|
||||
tenant_shard_id: TenantShardId,
|
||||
conf: &'static PageServerConf,
|
||||
|
||||
@@ -1038,12 +1038,9 @@ DetermineEpochStartLsn(WalProposer *wp)
|
||||
if (SkipXLogPageHeader(wp, wp->propEpochStartLsn) != wp->api.get_redo_start_lsn(wp))
|
||||
{
|
||||
/*
|
||||
* However, allow to proceed if last_log_term on the node which gave
|
||||
* the highest vote (i.e. point where we are going to start writing)
|
||||
* actually had been won by me; plain restart of walproposer not
|
||||
* intervened by concurrent compute which wrote WAL is ok.
|
||||
*
|
||||
* This avoids compute crash after manual term_bump.
|
||||
* However, allow to proceed if previously elected leader was me;
|
||||
* plain restart of walproposer not intervened by concurrent
|
||||
* compute (who could generate WAL) is ok.
|
||||
*/
|
||||
if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term ==
|
||||
pg_atomic_read_u64(&walprop_shared->mineLastElectedTerm))))
|
||||
@@ -1445,17 +1442,12 @@ RecvAppendResponses(Safekeeper *sk)
|
||||
if (sk->appendResponse.term > wp->propTerm)
|
||||
{
|
||||
/*
|
||||
*
|
||||
* Term has changed to higher one, probably another compute is
|
||||
* running. If this is the case we could PANIC as well because
|
||||
* likely it inserted some data and our basebackup is unsuitable
|
||||
* anymore. However, we also bump term manually (term_bump endpoint)
|
||||
* on safekeepers for migration purposes, in this case we do want
|
||||
* compute to stay alive. So restart walproposer with FATAL instead
|
||||
* of panicking; if basebackup is spoiled next election will notice
|
||||
* this.
|
||||
* Another compute with higher term is running. Panic to restart
|
||||
* PG as we likely need to retake basebackup. However, don't dump
|
||||
* core as this is kinda expected scenario.
|
||||
*/
|
||||
wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us",
|
||||
disable_core_dump();
|
||||
wp_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us",
|
||||
sk->host, sk->port,
|
||||
sk->appendResponse.term, wp->propTerm);
|
||||
}
|
||||
|
||||
2
proxy/src/cache/timed_lru.rs
vendored
2
proxy/src/cache/timed_lru.rs
vendored
@@ -16,7 +16,7 @@ use tracing::debug;
|
||||
// On the other hand, `hashlink` has good download stats and appears to be maintained.
|
||||
use hashlink::{linked_hash_map::RawEntryMut, LruCache};
|
||||
|
||||
use super::{common::Cached, timed_lru, Cache};
|
||||
use super::{common::Cached, *};
|
||||
|
||||
/// An implementation of timed LRU cache with fixed capacity.
|
||||
/// Key properties:
|
||||
|
||||
@@ -598,15 +598,15 @@ mod tests {
|
||||
assert_eq!(
|
||||
file_stats,
|
||||
[
|
||||
(1312632, 3, 6000),
|
||||
(1312621, 3, 6000),
|
||||
(1312680, 3, 6000),
|
||||
(1312637, 3, 6000),
|
||||
(1312773, 3, 6000),
|
||||
(1312610, 3, 6000),
|
||||
(1312404, 3, 6000),
|
||||
(1312639, 3, 6000),
|
||||
(437848, 1, 2000)
|
||||
(1315874, 3, 6000),
|
||||
(1315867, 3, 6000),
|
||||
(1315927, 3, 6000),
|
||||
(1315884, 3, 6000),
|
||||
(1316014, 3, 6000),
|
||||
(1315856, 3, 6000),
|
||||
(1315648, 3, 6000),
|
||||
(1315884, 3, 6000),
|
||||
(438913, 1, 2000)
|
||||
]
|
||||
);
|
||||
|
||||
@@ -638,11 +638,11 @@ mod tests {
|
||||
assert_eq!(
|
||||
file_stats,
|
||||
[
|
||||
(1203465, 5, 10000),
|
||||
(1203189, 5, 10000),
|
||||
(1203490, 5, 10000),
|
||||
(1203475, 5, 10000),
|
||||
(1203729, 5, 10000)
|
||||
(1208861, 5, 10000),
|
||||
(1208592, 5, 10000),
|
||||
(1208885, 5, 10000),
|
||||
(1208873, 5, 10000),
|
||||
(1209128, 5, 10000)
|
||||
]
|
||||
);
|
||||
|
||||
@@ -667,15 +667,15 @@ mod tests {
|
||||
assert_eq!(
|
||||
file_stats,
|
||||
[
|
||||
(1312632, 3, 6000),
|
||||
(1312621, 3, 6000),
|
||||
(1312680, 3, 6000),
|
||||
(1312637, 3, 6000),
|
||||
(1312773, 3, 6000),
|
||||
(1312610, 3, 6000),
|
||||
(1312404, 3, 6000),
|
||||
(1312639, 3, 6000),
|
||||
(437848, 1, 2000)
|
||||
(1315874, 3, 6000),
|
||||
(1315867, 3, 6000),
|
||||
(1315927, 3, 6000),
|
||||
(1315884, 3, 6000),
|
||||
(1316014, 3, 6000),
|
||||
(1315856, 3, 6000),
|
||||
(1315648, 3, 6000),
|
||||
(1315884, 3, 6000),
|
||||
(438913, 1, 2000)
|
||||
]
|
||||
);
|
||||
|
||||
@@ -712,7 +712,7 @@ mod tests {
|
||||
// files are smaller than the size threshold, but they took too long to fill so were flushed early
|
||||
assert_eq!(
|
||||
file_stats,
|
||||
[(657696, 2, 3001), (657410, 2, 3000), (657206, 2, 2999)]
|
||||
[(659836, 2, 3001), (659550, 2, 3000), (659346, 2, 2999)]
|
||||
);
|
||||
|
||||
tmpdir.close().unwrap();
|
||||
|
||||
@@ -44,14 +44,16 @@
|
||||
clippy::items_after_statements,
|
||||
)]
|
||||
// List of temporarily allowed lints.
|
||||
// TODO: Switch to expect() once stable with 1.81.
|
||||
// TODO: fix code and reduce list or move to permanent list above.
|
||||
#![expect(
|
||||
#![allow(
|
||||
clippy::cargo_common_metadata,
|
||||
clippy::cast_possible_truncation,
|
||||
clippy::cast_possible_wrap,
|
||||
clippy::cast_precision_loss,
|
||||
clippy::cast_sign_loss,
|
||||
clippy::doc_markdown,
|
||||
clippy::implicit_hasher,
|
||||
clippy::inline_always,
|
||||
clippy::match_same_arms,
|
||||
clippy::match_wild_err_arm,
|
||||
@@ -59,28 +61,21 @@
|
||||
clippy::missing_panics_doc,
|
||||
clippy::module_name_repetitions,
|
||||
clippy::needless_pass_by_value,
|
||||
clippy::needless_raw_string_hashes,
|
||||
clippy::redundant_closure_for_method_calls,
|
||||
clippy::return_self_not_must_use,
|
||||
clippy::similar_names,
|
||||
clippy::single_match_else,
|
||||
clippy::struct_excessive_bools,
|
||||
clippy::struct_field_names,
|
||||
clippy::too_many_lines,
|
||||
clippy::unused_self
|
||||
)]
|
||||
#![cfg_attr(
|
||||
any(test, feature = "testing"),
|
||||
allow(
|
||||
clippy::needless_raw_string_hashes,
|
||||
clippy::unreadable_literal,
|
||||
clippy::unused_async,
|
||||
)
|
||||
clippy::unreadable_literal,
|
||||
clippy::unused_async,
|
||||
clippy::unused_self,
|
||||
clippy::wildcard_imports
|
||||
)]
|
||||
// List of temporarily allowed lints to unblock beta/nightly.
|
||||
#![allow(
|
||||
unknown_lints,
|
||||
// TODO: 1.82: Add `use<T>` where necessary and remove from this list.
|
||||
impl_trait_overcaptures,
|
||||
)]
|
||||
#![allow(unknown_lints, clippy::manual_inspect)]
|
||||
|
||||
use std::{convert::Infallible, future::Future};
|
||||
|
||||
|
||||
@@ -217,7 +217,6 @@ impl sasl::Mechanism for Exchange<'_> {
|
||||
self.state = ExchangeState::SaltSent(sent);
|
||||
Ok(Step::Continue(self, msg))
|
||||
}
|
||||
#[allow(unreachable_patterns)] // TODO: 1.82: simply drop this match
|
||||
Step::Success(x, _) => match x {},
|
||||
Step::Failure(msg) => Ok(Step::Failure(msg)),
|
||||
}
|
||||
@@ -225,7 +224,6 @@ impl sasl::Mechanism for Exchange<'_> {
|
||||
ExchangeState::SaltSent(sent) => {
|
||||
match sent.transition(self.secret, &self.tls_server_end_point, input)? {
|
||||
Step::Success(keys, msg) => Ok(Step::Success(keys, msg)),
|
||||
#[allow(unreachable_patterns)] // TODO: 1.82: simply drop this match
|
||||
Step::Continue(x, _) => match x {},
|
||||
Step::Failure(msg) => Ok(Step::Failure(msg)),
|
||||
}
|
||||
|
||||
@@ -745,20 +745,22 @@ impl BatchQueryData {
|
||||
builder = builder.deferrable(true);
|
||||
}
|
||||
|
||||
let transaction = builder.start().await.inspect_err(|_| {
|
||||
let transaction = builder.start().await.map_err(|e| {
|
||||
// if we cannot start a transaction, we should return immediately
|
||||
// and not return to the pool. connection is clearly broken
|
||||
discard.discard();
|
||||
e
|
||||
})?;
|
||||
|
||||
let json_output =
|
||||
match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await {
|
||||
Ok(json_output) => {
|
||||
info!("commit");
|
||||
let status = transaction.commit().await.inspect_err(|_| {
|
||||
let status = transaction.commit().await.map_err(|e| {
|
||||
// if we cannot commit - for now don't return connection to pool
|
||||
// TODO: get a query status from the error
|
||||
discard.discard();
|
||||
e
|
||||
})?;
|
||||
discard.check_idle(status);
|
||||
json_output
|
||||
@@ -774,10 +776,11 @@ impl BatchQueryData {
|
||||
}
|
||||
Err(err) => {
|
||||
info!("rollback");
|
||||
let status = transaction.rollback().await.inspect_err(|_| {
|
||||
let status = transaction.rollback().await.map_err(|e| {
|
||||
// if we cannot rollback - for now don't return connection to pool
|
||||
// TODO: get a query status from the error
|
||||
discard.discard();
|
||||
e
|
||||
})?;
|
||||
discard.check_idle(status);
|
||||
return Err(err);
|
||||
|
||||
@@ -14,7 +14,6 @@ use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};
|
||||
use tokio_rustls::server::TlsStream;
|
||||
|
||||
/// Stream wrapper which implements libpq's protocol.
|
||||
///
|
||||
/// NOTE: This object deliberately doesn't implement [`AsyncRead`]
|
||||
/// or [`AsyncWrite`] to prevent subtle errors (e.g. trying
|
||||
/// to pass random malformed bytes through the connection).
|
||||
|
||||
@@ -3,5 +3,5 @@ channel = "1.81.0"
|
||||
profile = "default"
|
||||
# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
|
||||
# https://rust-lang.github.io/rustup/concepts/profiles.html
|
||||
# but we also need `llvm-tools` for coverage data merges on CI
|
||||
components = ["llvm-tools", "rustfmt", "clippy"]
|
||||
# but we also need `llvm-tools-preview` for coverage data merges on CI
|
||||
components = ["llvm-tools-preview", "rustfmt", "clippy"]
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
use utils::auth::{AuthError, Claims, Scope};
|
||||
use utils::id::TenantId;
|
||||
|
||||
/// If tenant_id is provided, allow if token (claims) is for this tenant or
|
||||
/// whole safekeeper scope (SafekeeperData). Else, allow only if token is
|
||||
/// SafekeeperData.
|
||||
pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<(), AuthError> {
|
||||
match (&claims.scope, tenant_id) {
|
||||
(Scope::Tenant, None) => Err(AuthError(
|
||||
|
||||
@@ -19,7 +19,7 @@ use std::fs::{self, File};
|
||||
use std::io::{ErrorKind, Write};
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
use std::time::Duration;
|
||||
use storage_broker::Uri;
|
||||
|
||||
use tracing::*;
|
||||
@@ -261,15 +261,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
// Change into the data directory.
|
||||
std::env::set_current_dir(&workdir)?;
|
||||
|
||||
// Prevent running multiple safekeepers on the same directory
|
||||
let lock_file_path = workdir.join(PID_FILE_NAME);
|
||||
let lock_file =
|
||||
pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
|
||||
info!("claimed pid file at {lock_file_path:?}");
|
||||
// ensure that the lock file is held even if the main thread of the process panics
|
||||
// we need to release the lock file only when the current process is gone
|
||||
std::mem::forget(lock_file);
|
||||
|
||||
// Set or read our ID.
|
||||
let id = set_id(&workdir, args.id.map(NodeId))?;
|
||||
if args.init {
|
||||
@@ -373,15 +364,15 @@ async fn main() -> anyhow::Result<()> {
|
||||
type JoinTaskRes = Result<anyhow::Result<()>, JoinError>;
|
||||
|
||||
async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
// fsync the datadir to make sure we have a consistent state on disk.
|
||||
let dfd = File::open(&conf.workdir).context("open datadir for syncfs")?;
|
||||
let started = Instant::now();
|
||||
utils::crashsafe::syncfs(dfd)?;
|
||||
let elapsed = started.elapsed();
|
||||
info!(
|
||||
elapsed_ms = elapsed.as_millis(),
|
||||
"syncfs data directory done"
|
||||
);
|
||||
// Prevent running multiple safekeepers on the same directory
|
||||
let lock_file_path = conf.workdir.join(PID_FILE_NAME);
|
||||
let lock_file =
|
||||
pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
|
||||
info!("claimed pid file at {lock_file_path:?}");
|
||||
|
||||
// ensure that the lock file is held even if the main thread of the process panics
|
||||
// we need to release the lock file only when the current process is gone
|
||||
std::mem::forget(lock_file);
|
||||
|
||||
info!("starting safekeeper WAL service on {}", conf.listen_pg_addr);
|
||||
let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
|
||||
|
||||
@@ -18,8 +18,8 @@ use utils::http::endpoint::{prometheus_metrics_handler, request_span, ChannelWri
|
||||
use utils::http::request::parse_query_param;
|
||||
|
||||
use postgres_ffi::WAL_SEGMENT_SIZE;
|
||||
use safekeeper_api::models::TimelineCreateRequest;
|
||||
use safekeeper_api::models::{SkTimelineInfo, TimelineCopyRequest};
|
||||
use safekeeper_api::models::{TimelineCreateRequest, TimelineTermBumpRequest};
|
||||
use utils::{
|
||||
auth::SwappableJwtAuth,
|
||||
http::{
|
||||
@@ -408,28 +408,6 @@ async fn timeline_backup_partial_reset(request: Request<Body>) -> Result<Respons
|
||||
json_response(StatusCode::OK, response)
|
||||
}
|
||||
|
||||
/// Make term at least as high as one in request. If one in request is None,
|
||||
/// increment current one.
|
||||
async fn timeline_term_bump_handler(
|
||||
mut request: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let ttid = TenantTimelineId::new(
|
||||
parse_request_param(&request, "tenant_id")?,
|
||||
parse_request_param(&request, "timeline_id")?,
|
||||
);
|
||||
check_permission(&request, Some(ttid.tenant_id))?;
|
||||
|
||||
let request_data: TimelineTermBumpRequest = json_request(&mut request).await?;
|
||||
|
||||
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
|
||||
let response = tli
|
||||
.term_bump(request_data.term)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, response)
|
||||
}
|
||||
|
||||
/// Used only in tests to hand craft required data.
|
||||
async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let ttid = TenantTimelineId::new(
|
||||
@@ -652,10 +630,6 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/backup_partial_reset",
|
||||
|r| request_span(r, timeline_backup_partial_reset),
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/term_bump",
|
||||
|r| request_span(r, timeline_term_bump_handler),
|
||||
)
|
||||
.post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| {
|
||||
request_span(r, record_safekeeper_info)
|
||||
})
|
||||
|
||||
@@ -484,7 +484,6 @@ pub async fn validate_temp_timeline(
|
||||
}
|
||||
|
||||
/// Move timeline from a temp directory to the main storage, and load it to the global map.
|
||||
///
|
||||
/// This operation is done under a lock to prevent bugs if several concurrent requests are
|
||||
/// trying to load the same timeline. Note that it doesn't guard against creating the
|
||||
/// timeline with the same ttid, but no one should be doing this anyway.
|
||||
|
||||
@@ -448,10 +448,8 @@ async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1);
|
||||
|
||||
/// Encapsulates a task which takes messages from msg_rx, processes and pushes
|
||||
/// replies to reply_tx.
|
||||
///
|
||||
/// Reading from socket and writing to disk in parallel is beneficial for
|
||||
/// performance, this struct provides the writing to disk part.
|
||||
/// replies to reply_tx; reading from socket and writing to disk in parallel is
|
||||
/// beneficial for performance, this struct provides writing to disk part.
|
||||
pub struct WalAcceptor {
|
||||
tli: WalResidentTimeline,
|
||||
msg_rx: Receiver<ProposerAcceptorMessage>,
|
||||
|
||||
@@ -938,9 +938,8 @@ where
|
||||
}
|
||||
|
||||
trace!(
|
||||
"processed AppendRequest of len {}, begin_lsn={}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, flushed={:?}",
|
||||
"processed AppendRequest of len {}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, flushed={:?}",
|
||||
msg.wal_data.len(),
|
||||
msg.h.begin_lsn,
|
||||
msg.h.end_lsn,
|
||||
msg.h.commit_lsn,
|
||||
msg.h.truncate_lsn,
|
||||
|
||||
@@ -1,10 +1,9 @@
|
||||
//! Defines per timeline data stored persistently (SafeKeeperPersistentState)
|
||||
//! and its wrapper with in memory layer (SafekeeperState).
|
||||
|
||||
use std::{cmp::max, ops::Deref};
|
||||
use std::ops::Deref;
|
||||
|
||||
use anyhow::Result;
|
||||
use safekeeper_api::models::TimelineTermBumpResponse;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::{
|
||||
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
|
||||
@@ -13,7 +12,7 @@ use utils::{
|
||||
|
||||
use crate::{
|
||||
control_file,
|
||||
safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, Term, TermHistory},
|
||||
safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, TermHistory},
|
||||
wal_backup_partial::{self},
|
||||
};
|
||||
|
||||
@@ -148,11 +147,9 @@ pub struct TimelineMemState {
|
||||
pub proposer_uuid: PgUuid,
|
||||
}
|
||||
|
||||
/// Safekeeper persistent state plus in memory layer.
|
||||
///
|
||||
/// Allows us to avoid frequent fsyncs when we update fields like commit_lsn
|
||||
/// which don't need immediate persistence. Provides transactional like API
|
||||
/// to atomically update the state.
|
||||
/// Safekeeper persistent state plus in memory layer, to avoid frequent fsyncs
|
||||
/// when we update fields like commit_lsn which don't need immediate
|
||||
/// persistence. Provides transactional like API to atomically update the state.
|
||||
///
|
||||
/// Implements Deref into *persistent* part.
|
||||
pub struct TimelineState<CTRL: control_file::Storage> {
|
||||
@@ -212,27 +209,6 @@ where
|
||||
let s = self.start_change();
|
||||
self.finish_change(&s).await
|
||||
}
|
||||
|
||||
/// Make term at least as high as `to`. If `to` is None, increment the current one. This
|
||||
/// is not in safekeeper.rs because we want to be able to do it even if
|
||||
/// timeline is offloaded.
|
||||
pub async fn term_bump(&mut self, to: Option<Term>) -> Result<TimelineTermBumpResponse> {
|
||||
let before = self.acceptor_state.term;
|
||||
let mut state = self.start_change();
|
||||
let new = match to {
|
||||
Some(to) => max(state.acceptor_state.term, to),
|
||||
None => state.acceptor_state.term + 1,
|
||||
};
|
||||
if new > state.acceptor_state.term {
|
||||
state.acceptor_state.term = new;
|
||||
self.finish_change(&state).await?;
|
||||
}
|
||||
let after = self.acceptor_state.term;
|
||||
Ok(TimelineTermBumpResponse {
|
||||
previous_term: before,
|
||||
current_term: after,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<CTRL> Deref for TimelineState<CTRL>
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use camino::Utf8PathBuf;
|
||||
use remote_storage::RemotePath;
|
||||
use safekeeper_api::models::TimelineTermBumpResponse;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::fs::{self};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -170,7 +169,6 @@ impl<'a> Drop for WriteGuardSharedState<'a> {
|
||||
}
|
||||
|
||||
/// This structure is stored in shared state and represents the state of the timeline.
|
||||
///
|
||||
/// Usually it holds SafeKeeper, but it also supports offloaded timeline state. In this
|
||||
/// case, SafeKeeper is not available (because WAL is not present on disk) and all
|
||||
/// operations can be done only with control file.
|
||||
@@ -216,10 +214,6 @@ impl StateSK {
|
||||
.get_last_log_term(self.flush_lsn())
|
||||
}
|
||||
|
||||
pub async fn term_bump(&mut self, to: Option<Term>) -> Result<TimelineTermBumpResponse> {
|
||||
self.state_mut().term_bump(to).await
|
||||
}
|
||||
|
||||
/// Close open WAL files to release FDs.
|
||||
fn close_wal_store(&mut self) {
|
||||
if let StateSK::Loaded(sk) = self {
|
||||
@@ -859,11 +853,6 @@ impl Timeline {
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
pub async fn term_bump(self: &Arc<Self>, to: Option<Term>) -> Result<TimelineTermBumpResponse> {
|
||||
let mut state = self.write_shared_state().await;
|
||||
state.sk.term_bump(to).await
|
||||
}
|
||||
|
||||
/// Get the timeline guard for reading/writing WAL files.
|
||||
/// If WAL files are not present on disk (evicted), they will be automatically
|
||||
/// downloaded from remote storage. This is done in the manager task, which is
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
//! Code related to evicting WAL files to remote storage.
|
||||
//!
|
||||
//! The actual upload is done by the partial WAL backup code. This file has
|
||||
//! code to delete and re-download WAL files, cross-validate with partial WAL
|
||||
//! backup if local file is still present.
|
||||
//! Code related to evicting WAL files to remote storage. The actual upload is done by the
|
||||
//! partial WAL backup code. This file has code to delete and re-download WAL files,
|
||||
//! cross-validate with partial WAL backup if local file is still present.
|
||||
|
||||
use anyhow::Context;
|
||||
use camino::Utf8PathBuf;
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
//! Timeline residence guard
|
||||
//!
|
||||
//! It is needed to ensure that WAL segments are present on disk,
|
||||
//! Timeline residence guard is needed to ensure that WAL segments are present on disk,
|
||||
//! as long as the code is holding the guard. This file implements guard logic, to issue
|
||||
//! and drop guards, and to notify the manager when the guard is dropped.
|
||||
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
//! The timeline manager task is responsible for managing the timeline's background tasks.
|
||||
//!
|
||||
//! It is spawned alongside each timeline and exits when the timeline is deleted.
|
||||
//! It watches for changes in the timeline state and decides when to spawn or kill background tasks.
|
||||
//! It also can manage some reactive state, like should the timeline be active for broker pushes or not.
|
||||
|
||||
@@ -60,8 +60,7 @@ impl TimelinesSet {
|
||||
}
|
||||
}
|
||||
|
||||
/// Guard is used to add or remove timelines from the set.
|
||||
///
|
||||
/// Guard is used to add or remove timeline from the set.
|
||||
/// If the timeline is present in the set, it will be removed from it on drop.
|
||||
/// Note: do not use more than one guard for the same timeline, it caches the presence state.
|
||||
/// It is designed to be used in the manager task only.
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
//! Safekeeper timeline has a background task which is subscribed to `commit_lsn`
|
||||
//! and `flush_lsn` updates.
|
||||
//!
|
||||
//! After the partial segment was updated (`flush_lsn` was changed), the segment
|
||||
//! will be uploaded to S3 within the configured `partial_backup_timeout`.
|
||||
//! and `flush_lsn` updates. After the partial segment was updated (`flush_lsn`
|
||||
//! was changed), the segment will be uploaded to S3 in about 15 minutes.
|
||||
//!
|
||||
//! The filename format for partial segments is
|
||||
//! `Segment_Term_Flush_Commit_skNN.partial`, where:
|
||||
|
||||
@@ -17,7 +17,6 @@ use crate::SafeKeeperConf;
|
||||
use postgres_backend::{AuthType, PostgresBackend};
|
||||
|
||||
/// Accept incoming TCP connections and spawn them into a background thread.
|
||||
///
|
||||
/// allowed_auth_scope is either SafekeeperData (wide JWT tokens giving access
|
||||
/// to any tenant are allowed) or Tenant (only tokens giving access to specific
|
||||
/// tenant are allowed). Doesn't matter if auth is disabled in conf.
|
||||
|
||||
@@ -98,19 +98,7 @@ pub struct PhysicalStorage {
|
||||
/// It can also be ahead of record_lsn, if it happens to be in the middle of a WAL record.
|
||||
write_lsn: Lsn,
|
||||
|
||||
/// The LSN of the last WAL record written to disk. Still can be not fully
|
||||
/// flushed.
|
||||
///
|
||||
/// Note: Normally it (and flush_record_lsn) is <= write_lsn, but after xlog
|
||||
/// switch ingest the reverse is true because we don't bump write_lsn up to
|
||||
/// the next segment: WAL stream from the compute doesn't have the gap and
|
||||
/// for simplicity / as a sanity check we disallow any non-sequential
|
||||
/// writes, so write zeros as is.
|
||||
///
|
||||
/// Similar effect is in theory possible due to LSN alignment: if record
|
||||
/// ends at *2, decoder will report end lsn as *8 even though we haven't
|
||||
/// written these zeros yet. In practice compute likely never sends
|
||||
/// non-aligned chunks of data.
|
||||
/// The LSN of the last WAL record written to disk. Still can be not fully flushed.
|
||||
write_record_lsn: Lsn,
|
||||
|
||||
/// The LSN of the last WAL record flushed to disk.
|
||||
@@ -179,7 +167,8 @@ impl PhysicalStorage {
|
||||
)
|
||||
};
|
||||
|
||||
// note: this assumes we fsync'ed whole datadir on start.
|
||||
// TODO: do we really know that write_lsn is fully flushed to disk?
|
||||
// If not, maybe it's better to call fsync() here to be sure?
|
||||
let flush_lsn = write_lsn;
|
||||
|
||||
debug!(
|
||||
@@ -451,12 +440,11 @@ impl Storage for PhysicalStorage {
|
||||
.with_label_values(&["truncate_wal"])
|
||||
.start_timer();
|
||||
|
||||
// Streaming must not create a hole, so truncate cannot be called on
|
||||
// non-written lsn.
|
||||
if self.write_record_lsn != Lsn(0) && end_pos > self.write_record_lsn {
|
||||
// Streaming must not create a hole, so truncate cannot be called on non-written lsn
|
||||
if self.write_lsn != Lsn(0) && end_pos > self.write_lsn {
|
||||
bail!(
|
||||
"truncate_wal called on non-written WAL, write_record_lsn={}, end_pos={}",
|
||||
self.write_record_lsn,
|
||||
"truncate_wal called on non-written WAL, write_lsn={}, end_pos={}",
|
||||
self.write_lsn,
|
||||
end_pos
|
||||
);
|
||||
}
|
||||
|
||||
@@ -134,7 +134,7 @@ class LLVM:
|
||||
# Show a user-friendly warning
|
||||
raise Exception(' '.join([
|
||||
f"It appears that you don't have `{name}` installed.",
|
||||
"Please execute `rustup component add llvm-tools`,",
|
||||
"Please execute `rustup component add llvm-tools-preview`,",
|
||||
"or install it via your package manager of choice.",
|
||||
"LLVM tools should be the same version as LLVM in `rustc --version --verbose`.",
|
||||
]))
|
||||
@@ -518,7 +518,7 @@ def main() -> None:
|
||||
example = f"""
|
||||
prerequisites:
|
||||
# alternatively, install a system package for `llvm-tools`
|
||||
rustup component add llvm-tools
|
||||
rustup component add llvm-tools-preview
|
||||
|
||||
self-contained example:
|
||||
{app} run make
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
ALTER TABLE tenant_shards DROP preferred_az_id;
|
||||
@@ -1 +0,0 @@
|
||||
ALTER TABLE tenant_shards ADD preferred_az_id VARCHAR;
|
||||
@@ -14,14 +14,14 @@ use metrics::{BuildInfo, NeonMetrics};
|
||||
use pageserver_api::controller_api::{
|
||||
MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse,
|
||||
MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse,
|
||||
ShardsPreferredAzsRequest, TenantCreateRequest,
|
||||
TenantCreateRequest,
|
||||
};
|
||||
use pageserver_api::models::{
|
||||
TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
|
||||
TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest,
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::{mgmt_api, BlockUnblock};
|
||||
use pageserver_client::mgmt_api;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -369,23 +369,6 @@ async fn handle_tenant_timeline_detach_ancestor(
|
||||
json_response(StatusCode::OK, res)
|
||||
}
|
||||
|
||||
async fn handle_tenant_timeline_block_unblock_gc(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
dir: BlockUnblock,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
|
||||
let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
|
||||
|
||||
service
|
||||
.tenant_timeline_block_unblock_gc(tenant_id, timeline_id, dir)
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn handle_tenant_timeline_passthrough(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
@@ -556,17 +539,6 @@ async fn handle_node_status(req: Request<Body>) -> Result<Response<Body>, ApiErr
|
||||
json_response(StatusCode::OK, node_status)
|
||||
}
|
||||
|
||||
async fn handle_node_shards(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let state = get_state(&req);
|
||||
let node_id: NodeId = parse_request_param(&req, "node_id")?;
|
||||
|
||||
let node_status = state.service.get_node_shards(node_id).await?;
|
||||
|
||||
json_response(StatusCode::OK, node_status)
|
||||
}
|
||||
|
||||
async fn handle_get_leader(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
@@ -716,18 +688,6 @@ async fn handle_tenant_update_policy(mut req: Request<Body>) -> Result<Response<
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_update_preferred_azs(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let azs_req = json_request::<ShardsPreferredAzsRequest>(&mut req).await?;
|
||||
let state = get_state(&req);
|
||||
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
state.service.update_shards_preferred_azs(azs_req).await?,
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_step_down(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
@@ -1137,13 +1097,6 @@ pub fn make_router(
|
||||
.get("/control/v1/node/:node_id", |r| {
|
||||
named_request_span(r, handle_node_status, RequestName("control_v1_node_status"))
|
||||
})
|
||||
.get("/control/v1/node/:node_id/shards", |r| {
|
||||
named_request_span(
|
||||
r,
|
||||
handle_node_shards,
|
||||
RequestName("control_v1_node_describe"),
|
||||
)
|
||||
})
|
||||
.get("/control/v1/leader", |r| {
|
||||
named_request_span(r, handle_get_leader, RequestName("control_v1_get_leader"))
|
||||
})
|
||||
@@ -1221,13 +1174,6 @@ pub fn make_router(
|
||||
RequestName("control_v1_tenant_policy"),
|
||||
)
|
||||
})
|
||||
.put("/control/v1/preferred_azs", |r| {
|
||||
named_request_span(
|
||||
r,
|
||||
handle_update_preferred_azs,
|
||||
RequestName("control_v1_preferred_azs"),
|
||||
)
|
||||
})
|
||||
.put("/control/v1/step_down", |r| {
|
||||
named_request_span(r, handle_step_down, RequestName("control_v1_step_down"))
|
||||
})
|
||||
@@ -1309,26 +1255,6 @@ pub fn make_router(
|
||||
)
|
||||
},
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/block_gc",
|
||||
|r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
|s, r| handle_tenant_timeline_block_unblock_gc(s, r, BlockUnblock::Block),
|
||||
RequestName("v1_tenant_timeline_block_unblock_gc"),
|
||||
)
|
||||
},
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/unblock_gc",
|
||||
|r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
|s, r| handle_tenant_timeline_block_unblock_gc(s, r, BlockUnblock::Unblock),
|
||||
RequestName("v1_tenant_timeline_block_unblock_gc"),
|
||||
)
|
||||
},
|
||||
)
|
||||
// Tenant detail GET passthrough to shard zero:
|
||||
.get("/v1/tenant/:tenant_id", |r| {
|
||||
tenant_service_handler(
|
||||
|
||||
@@ -7,10 +7,7 @@ use pageserver_api::{
|
||||
},
|
||||
shard::TenantShardId,
|
||||
};
|
||||
use pageserver_client::{
|
||||
mgmt_api::{Client, Result},
|
||||
BlockUnblock,
|
||||
};
|
||||
use pageserver_client::mgmt_api::{Client, Result};
|
||||
use reqwest::StatusCode;
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
|
||||
@@ -261,24 +258,6 @@ impl PageserverClient {
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) async fn timeline_block_unblock_gc(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
dir: BlockUnblock,
|
||||
) -> Result<()> {
|
||||
// measuring these makes no sense because we synchronize with the gc loop and remote
|
||||
// storage on block_gc so there should be huge outliers
|
||||
measured_request!(
|
||||
"timeline_block_unblock_gc",
|
||||
crate::metrics::Method::Post,
|
||||
&self.node_id_label,
|
||||
self.inner
|
||||
.timeline_block_unblock_gc(tenant_shard_id, timeline_id, dir)
|
||||
.await
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) async fn get_utilization(&self) -> Result<PageserverUtilization> {
|
||||
measured_request!(
|
||||
"utilization",
|
||||
|
||||
@@ -105,7 +105,6 @@ pub(crate) enum DatabaseOperation {
|
||||
ListMetadataHealthOutdated,
|
||||
GetLeader,
|
||||
UpdateLeader,
|
||||
SetPreferredAzs,
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
@@ -665,33 +664,6 @@ impl Persistence {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn set_tenant_shard_preferred_azs(
|
||||
&self,
|
||||
preferred_azs: Vec<(TenantShardId, String)>,
|
||||
) -> DatabaseResult<Vec<(TenantShardId, String)>> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
|
||||
self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| {
|
||||
let mut shards_updated = Vec::default();
|
||||
|
||||
for (tenant_shard_id, preferred_az) in preferred_azs.iter() {
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
|
||||
.set(preferred_az_id.eq(preferred_az))
|
||||
.execute(conn)?;
|
||||
|
||||
if updated == 1 {
|
||||
shards_updated.push((*tenant_shard_id, preferred_az.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(shards_updated)
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
self.with_measured_conn(DatabaseOperation::Detach, move |conn| {
|
||||
@@ -1078,11 +1050,6 @@ pub(crate) struct TenantShardPersistence {
|
||||
pub(crate) config: String,
|
||||
#[serde(default)]
|
||||
pub(crate) scheduling_policy: String,
|
||||
|
||||
// Hint that we should attempt to schedule this tenant shard in the given
|
||||
// availability zone in order to minimise the chances of cross-AZ communication
|
||||
// with compute.
|
||||
pub(crate) preferred_az_id: Option<String>,
|
||||
}
|
||||
|
||||
impl TenantShardPersistence {
|
||||
|
||||
@@ -41,7 +41,6 @@ diesel::table! {
|
||||
splitting -> Int2,
|
||||
config -> Text,
|
||||
scheduling_policy -> Varchar,
|
||||
preferred_az_id -> Nullable<Varchar>,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -25,7 +25,7 @@ use crate::{
|
||||
ShardGenerationState, TenantFilter,
|
||||
},
|
||||
reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder},
|
||||
scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode},
|
||||
scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
|
||||
tenant_shard::{
|
||||
MigrateAttachment, ReconcileNeeded, ReconcilerStatus, ScheduleOptimization,
|
||||
ScheduleOptimizationAction,
|
||||
@@ -41,8 +41,7 @@ use itertools::Itertools;
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest,
|
||||
NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, ShardSchedulingPolicy,
|
||||
ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, TenantCreateRequest,
|
||||
NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, TenantCreateRequest,
|
||||
TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
|
||||
TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest,
|
||||
TenantShardMigrateRequest, TenantShardMigrateResponse,
|
||||
@@ -69,7 +68,7 @@ use pageserver_api::{
|
||||
ValidateResponse, ValidateResponseTenant,
|
||||
},
|
||||
};
|
||||
use pageserver_client::{mgmt_api, BlockUnblock};
|
||||
use pageserver_client::mgmt_api;
|
||||
use tokio::sync::mpsc::error::TrySendError;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::{
|
||||
@@ -117,9 +116,7 @@ pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
|
||||
pub const MAX_OFFLINE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30);
|
||||
|
||||
/// How long a node may be unresponsive to heartbeats during start up before we declare it
|
||||
/// offline.
|
||||
///
|
||||
/// This is much more lenient than [`MAX_OFFLINE_INTERVAL_DEFAULT`] since the pageserver's
|
||||
/// offline. This is much more lenient than [`MAX_OFFLINE_INTERVAL_DEFAULT`] since the pageserver's
|
||||
/// handling of the re-attach response may take a long time and blocks heartbeats from
|
||||
/// being handled on the pageserver side.
|
||||
pub const MAX_WARMING_UP_INTERVAL_DEFAULT: Duration = Duration::from_secs(300);
|
||||
@@ -142,7 +139,6 @@ enum TenantOperations {
|
||||
AttachHook,
|
||||
TimelineArchivalConfig,
|
||||
TimelineDetachAncestor,
|
||||
TimelineGcBlockUnblock,
|
||||
}
|
||||
|
||||
#[derive(Clone, strum_macros::Display)]
|
||||
@@ -357,12 +353,6 @@ impl From<DatabaseError> for ApiError {
|
||||
}
|
||||
}
|
||||
|
||||
enum InitialShardScheduleOutcome {
|
||||
Scheduled(TenantCreateResponseShard),
|
||||
NotScheduled,
|
||||
ShardScheduleError(ScheduleError),
|
||||
}
|
||||
|
||||
pub struct Service {
|
||||
inner: Arc<std::sync::RwLock<ServiceState>>,
|
||||
config: Config,
|
||||
@@ -452,7 +442,7 @@ struct ShardSplitParams {
|
||||
// When preparing for a shard split, we may either choose to proceed with the split,
|
||||
// or find that the work is already done and return NoOp.
|
||||
enum ShardSplitAction {
|
||||
Split(Box<ShardSplitParams>),
|
||||
Split(ShardSplitParams),
|
||||
NoOp(TenantShardSplitResponse),
|
||||
}
|
||||
|
||||
@@ -1462,7 +1452,6 @@ impl Service {
|
||||
splitting: SplitState::default(),
|
||||
scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
|
||||
.unwrap(),
|
||||
preferred_az_id: None,
|
||||
};
|
||||
|
||||
match self.persistence.insert_tenant_shards(vec![tsp]).await {
|
||||
@@ -2034,7 +2023,6 @@ impl Service {
|
||||
splitting: SplitState::default(),
|
||||
scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
|
||||
.unwrap(),
|
||||
preferred_az_id: None,
|
||||
})
|
||||
.collect();
|
||||
|
||||
@@ -2058,87 +2046,99 @@ impl Service {
|
||||
};
|
||||
|
||||
let mut schedule_context = ScheduleContext::default();
|
||||
let mut schedule_error = None;
|
||||
let mut response_shards = Vec::new();
|
||||
for tenant_shard_id in create_ids {
|
||||
tracing::info!("Creating shard {tenant_shard_id}...");
|
||||
|
||||
let outcome = self
|
||||
.do_initial_shard_scheduling(
|
||||
tenant_shard_id,
|
||||
initial_generation,
|
||||
&create_req.shard_parameters,
|
||||
create_req.config.clone(),
|
||||
placement_policy.clone(),
|
||||
&mut schedule_context,
|
||||
)
|
||||
.await;
|
||||
|
||||
match outcome {
|
||||
InitialShardScheduleOutcome::Scheduled(resp) => response_shards.push(resp),
|
||||
InitialShardScheduleOutcome::NotScheduled => {}
|
||||
InitialShardScheduleOutcome::ShardScheduleError(err) => {
|
||||
schedule_error = Some(err);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let preferred_azs = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
response_shards
|
||||
.iter()
|
||||
.filter_map(|resp| {
|
||||
let az_id = locked
|
||||
.nodes
|
||||
.get(&resp.node_id)
|
||||
.map(|n| n.get_availability_zone_id().to_string())?;
|
||||
|
||||
Some((resp.shard_id, az_id))
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
// Note that we persist the preferred AZ for the new shards separately.
|
||||
// In theory, we could "peek" the scheduler to determine where the shard will
|
||||
// land, but the subsequent "real" call into the scheduler might select a different
|
||||
// node. Hence, we do this awkward update to keep things consistent.
|
||||
let updated = self
|
||||
.persistence
|
||||
.set_tenant_shard_preferred_azs(preferred_azs)
|
||||
.await
|
||||
.map_err(|err| {
|
||||
ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Failed to persist preferred az ids: {err}"
|
||||
))
|
||||
})?;
|
||||
|
||||
{
|
||||
let (waiters, response_shards) = {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
for (tid, az_id) in updated {
|
||||
if let Some(shard) = locked.tenants.get_mut(&tid) {
|
||||
shard.set_preferred_az(az_id);
|
||||
}
|
||||
let (nodes, tenants, scheduler) = locked.parts_mut();
|
||||
|
||||
let mut response_shards = Vec::new();
|
||||
let mut schcedule_error = None;
|
||||
|
||||
for tenant_shard_id in create_ids {
|
||||
tracing::info!("Creating shard {tenant_shard_id}...");
|
||||
|
||||
use std::collections::btree_map::Entry;
|
||||
match tenants.entry(tenant_shard_id) {
|
||||
Entry::Occupied(mut entry) => {
|
||||
tracing::info!(
|
||||
"Tenant shard {tenant_shard_id} already exists while creating"
|
||||
);
|
||||
|
||||
// TODO: schedule() should take an anti-affinity expression that pushes
|
||||
// attached and secondary locations (independently) away from those
|
||||
// pageservers also holding a shard for this tenant.
|
||||
|
||||
entry
|
||||
.get_mut()
|
||||
.schedule(scheduler, &mut schedule_context)
|
||||
.map_err(|e| {
|
||||
ApiError::Conflict(format!(
|
||||
"Failed to schedule shard {tenant_shard_id}: {e}"
|
||||
))
|
||||
})?;
|
||||
|
||||
if let Some(node_id) = entry.get().intent.get_attached() {
|
||||
let generation = entry
|
||||
.get()
|
||||
.generation
|
||||
.expect("Generation is set when in attached mode");
|
||||
response_shards.push(TenantCreateResponseShard {
|
||||
shard_id: tenant_shard_id,
|
||||
node_id: *node_id,
|
||||
generation: generation.into().unwrap(),
|
||||
});
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
Entry::Vacant(entry) => {
|
||||
let state = entry.insert(TenantShard::new(
|
||||
tenant_shard_id,
|
||||
ShardIdentity::from_params(
|
||||
tenant_shard_id.shard_number,
|
||||
&create_req.shard_parameters,
|
||||
),
|
||||
placement_policy.clone(),
|
||||
));
|
||||
|
||||
state.generation = initial_generation;
|
||||
state.config = create_req.config.clone();
|
||||
if let Err(e) = state.schedule(scheduler, &mut schedule_context) {
|
||||
schcedule_error = Some(e);
|
||||
}
|
||||
|
||||
// Only include shards in result if we are attaching: the purpose
|
||||
// of the response is to tell the caller where the shards are attached.
|
||||
if let Some(node_id) = state.intent.get_attached() {
|
||||
let generation = state
|
||||
.generation
|
||||
.expect("Generation is set when in attached mode");
|
||||
response_shards.push(TenantCreateResponseShard {
|
||||
shard_id: tenant_shard_id,
|
||||
node_id: *node_id,
|
||||
generation: generation.into().unwrap(),
|
||||
});
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
// If we failed to schedule shards, then they are still created in the controller,
|
||||
// but we return an error to the requester to avoid a silent failure when someone
|
||||
// tries to e.g. create a tenant whose placement policy requires more nodes than
|
||||
// are present in the system. We do this here rather than in the above loop, to
|
||||
// avoid situations where we only create a subset of shards in the tenant.
|
||||
if let Some(e) = schedule_error {
|
||||
return Err(ApiError::Conflict(format!(
|
||||
"Failed to schedule shard(s): {e}"
|
||||
)));
|
||||
}
|
||||
// If we failed to schedule shards, then they are still created in the controller,
|
||||
// but we return an error to the requester to avoid a silent failure when someone
|
||||
// tries to e.g. create a tenant whose placement policy requires more nodes than
|
||||
// are present in the system. We do this here rather than in the above loop, to
|
||||
// avoid situations where we only create a subset of shards in the tenant.
|
||||
if let Some(e) = schcedule_error {
|
||||
return Err(ApiError::Conflict(format!(
|
||||
"Failed to schedule shard(s): {e}"
|
||||
)));
|
||||
}
|
||||
|
||||
let waiters = {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let (nodes, tenants, _scheduler) = locked.parts_mut();
|
||||
tenants
|
||||
let waiters = tenants
|
||||
.range_mut(TenantShardId::tenant_range(tenant_id))
|
||||
.filter_map(|(_shard_id, shard)| self.maybe_reconcile_shard(shard, nodes))
|
||||
.collect::<Vec<_>>()
|
||||
.collect::<Vec<_>>();
|
||||
(waiters, response_shards)
|
||||
};
|
||||
|
||||
Ok((
|
||||
@@ -2149,78 +2149,6 @@ impl Service {
|
||||
))
|
||||
}
|
||||
|
||||
/// Helper for tenant creation that does the scheduling for an individual shard. Covers both the
|
||||
/// case of a new tenant and a pre-existing one.
|
||||
async fn do_initial_shard_scheduling(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
initial_generation: Option<Generation>,
|
||||
shard_params: &ShardParameters,
|
||||
config: TenantConfig,
|
||||
placement_policy: PlacementPolicy,
|
||||
schedule_context: &mut ScheduleContext,
|
||||
) -> InitialShardScheduleOutcome {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let (_nodes, tenants, scheduler) = locked.parts_mut();
|
||||
|
||||
use std::collections::btree_map::Entry;
|
||||
match tenants.entry(tenant_shard_id) {
|
||||
Entry::Occupied(mut entry) => {
|
||||
tracing::info!("Tenant shard {tenant_shard_id} already exists while creating");
|
||||
|
||||
// TODO: schedule() should take an anti-affinity expression that pushes
|
||||
// attached and secondary locations (independently) away from those
|
||||
// pageservers also holding a shard for this tenant.
|
||||
|
||||
if let Err(err) = entry.get_mut().schedule(scheduler, schedule_context) {
|
||||
return InitialShardScheduleOutcome::ShardScheduleError(err);
|
||||
}
|
||||
|
||||
if let Some(node_id) = entry.get().intent.get_attached() {
|
||||
let generation = entry
|
||||
.get()
|
||||
.generation
|
||||
.expect("Generation is set when in attached mode");
|
||||
InitialShardScheduleOutcome::Scheduled(TenantCreateResponseShard {
|
||||
shard_id: tenant_shard_id,
|
||||
node_id: *node_id,
|
||||
generation: generation.into().unwrap(),
|
||||
})
|
||||
} else {
|
||||
InitialShardScheduleOutcome::NotScheduled
|
||||
}
|
||||
}
|
||||
Entry::Vacant(entry) => {
|
||||
let state = entry.insert(TenantShard::new(
|
||||
tenant_shard_id,
|
||||
ShardIdentity::from_params(tenant_shard_id.shard_number, shard_params),
|
||||
placement_policy,
|
||||
));
|
||||
|
||||
state.generation = initial_generation;
|
||||
state.config = config;
|
||||
if let Err(e) = state.schedule(scheduler, schedule_context) {
|
||||
return InitialShardScheduleOutcome::ShardScheduleError(e);
|
||||
}
|
||||
|
||||
// Only include shards in result if we are attaching: the purpose
|
||||
// of the response is to tell the caller where the shards are attached.
|
||||
if let Some(node_id) = state.intent.get_attached() {
|
||||
let generation = state
|
||||
.generation
|
||||
.expect("Generation is set when in attached mode");
|
||||
InitialShardScheduleOutcome::Scheduled(TenantCreateResponseShard {
|
||||
shard_id: tenant_shard_id,
|
||||
node_id: *node_id,
|
||||
generation: generation.into().unwrap(),
|
||||
})
|
||||
} else {
|
||||
InitialShardScheduleOutcome::NotScheduled
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper for functions that reconcile a number of shards, and would like to do a timeout-bounded
|
||||
/// wait for reconciliation to complete before responding.
|
||||
async fn await_waiters(
|
||||
@@ -3198,57 +3126,6 @@ impl Service {
|
||||
}).await?
|
||||
}
|
||||
|
||||
pub(crate) async fn tenant_timeline_block_unblock_gc(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
dir: BlockUnblock,
|
||||
) -> Result<(), ApiError> {
|
||||
let _tenant_lock = trace_shared_lock(
|
||||
&self.tenant_op_locks,
|
||||
tenant_id,
|
||||
TenantOperations::TimelineGcBlockUnblock,
|
||||
)
|
||||
.await;
|
||||
|
||||
self.tenant_remote_mutation(tenant_id, move |targets| async move {
|
||||
if targets.is_empty() {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Tenant not found").into(),
|
||||
));
|
||||
}
|
||||
|
||||
async fn do_one(
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
node: Node,
|
||||
jwt: Option<String>,
|
||||
dir: BlockUnblock,
|
||||
) -> Result<(), ApiError> {
|
||||
let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
|
||||
|
||||
client
|
||||
.timeline_block_unblock_gc(tenant_shard_id, timeline_id, dir)
|
||||
.await
|
||||
.map_err(|e| passthrough_api_error(&node, e))
|
||||
}
|
||||
|
||||
// no shard needs to go first/last; the operation should be idempotent
|
||||
self.tenant_for_shards(targets, |tenant_shard_id, node| {
|
||||
futures::FutureExt::boxed(do_one(
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
node,
|
||||
self.config.jwt_token.clone(),
|
||||
dir,
|
||||
))
|
||||
})
|
||||
.await
|
||||
})
|
||||
.await??;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation.
|
||||
///
|
||||
/// On success, the returned vector contains exactly the same number of elements as the input `locations`.
|
||||
@@ -3634,7 +3511,6 @@ impl Service {
|
||||
is_pending_compute_notification: shard.pending_compute_notification,
|
||||
is_splitting: matches!(shard.splitting, SplitState::Splitting),
|
||||
scheduling_policy: *shard.get_scheduling_policy(),
|
||||
preferred_az_id: shard.preferred_az().map(ToString::to_string),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -4238,7 +4114,7 @@ impl Service {
|
||||
let policy = policy.unwrap();
|
||||
let config = config.unwrap();
|
||||
|
||||
Ok(ShardSplitAction::Split(Box::new(ShardSplitParams {
|
||||
Ok(ShardSplitAction::Split(ShardSplitParams {
|
||||
old_shard_count,
|
||||
new_shard_count: ShardCount::new(split_req.new_shard_count),
|
||||
new_stripe_size: split_req.new_stripe_size,
|
||||
@@ -4246,13 +4122,13 @@ impl Service {
|
||||
policy,
|
||||
config,
|
||||
shard_ident,
|
||||
})))
|
||||
}))
|
||||
}
|
||||
|
||||
async fn do_tenant_shard_split(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
params: Box<ShardSplitParams>,
|
||||
params: ShardSplitParams,
|
||||
) -> Result<(TenantShardSplitResponse, Vec<ReconcilerWaiter>), ApiError> {
|
||||
// FIXME: we have dropped self.inner lock, and not yet written anything to the database: another
|
||||
// request could occur here, deleting or mutating the tenant. begin_shard_split checks that the
|
||||
@@ -4268,7 +4144,7 @@ impl Service {
|
||||
policy,
|
||||
config,
|
||||
shard_ident,
|
||||
} = *params;
|
||||
} = params;
|
||||
|
||||
// Drop any secondary locations: pageservers do not support splitting these, and in any case the
|
||||
// end-state for a split tenant will usually be to have secondary locations on different nodes.
|
||||
@@ -4338,10 +4214,9 @@ impl Service {
|
||||
config: serde_json::to_string(&config).unwrap(),
|
||||
splitting: SplitState::Splitting,
|
||||
|
||||
// Scheduling policies and preferred AZ do not carry through to children
|
||||
// Scheduling policies do not carry through to children
|
||||
scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
|
||||
.unwrap(),
|
||||
preferred_az_id: None,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -4461,47 +4336,6 @@ impl Service {
|
||||
let (response, child_locations, waiters) =
|
||||
self.tenant_shard_split_commit_inmem(tenant_id, new_shard_count, new_stripe_size);
|
||||
|
||||
// Now that we have scheduled the child shards, attempt to set their preferred AZ
|
||||
// to that of the pageserver they've been attached on.
|
||||
let preferred_azs = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
child_locations
|
||||
.iter()
|
||||
.filter_map(|(tid, node_id, _stripe_size)| {
|
||||
let az_id = locked
|
||||
.nodes
|
||||
.get(node_id)
|
||||
.map(|n| n.get_availability_zone_id().to_string())?;
|
||||
|
||||
Some((*tid, az_id))
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
let updated = self
|
||||
.persistence
|
||||
.set_tenant_shard_preferred_azs(preferred_azs)
|
||||
.await
|
||||
.map_err(|err| {
|
||||
ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Failed to persist preferred az ids: {err}"
|
||||
))
|
||||
});
|
||||
|
||||
match updated {
|
||||
Ok(updated) => {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
for (tid, az_id) in updated {
|
||||
if let Some(shard) = locked.tenants.get_mut(&tid) {
|
||||
shard.set_preferred_az(az_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
tracing::warn!("Failed to persist preferred AZs after split: {err}");
|
||||
}
|
||||
}
|
||||
|
||||
// Send compute notifications for all the new shards
|
||||
let mut failed_notifications = Vec::new();
|
||||
for (child_id, child_ps, stripe_size) in child_locations {
|
||||
@@ -4976,45 +4810,6 @@ impl Service {
|
||||
))
|
||||
}
|
||||
|
||||
pub(crate) async fn get_node_shards(
|
||||
&self,
|
||||
node_id: NodeId,
|
||||
) -> Result<NodeShardResponse, ApiError> {
|
||||
let locked = self.inner.read().unwrap();
|
||||
let mut shards = Vec::new();
|
||||
for (tid, tenant) in locked.tenants.iter() {
|
||||
let is_intended_secondary = match (
|
||||
tenant.intent.get_attached() == &Some(node_id),
|
||||
tenant.intent.get_secondary().contains(&node_id),
|
||||
) {
|
||||
(true, true) => {
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"{} attached as primary+secondary on the same node",
|
||||
tid
|
||||
)))
|
||||
}
|
||||
(true, false) => Some(false),
|
||||
(false, true) => Some(true),
|
||||
(false, false) => None,
|
||||
};
|
||||
let is_observed_secondary = if let Some(ObservedStateLocation { conf: Some(conf) }) =
|
||||
tenant.observed.locations.get(&node_id)
|
||||
{
|
||||
Some(conf.secondary_conf.is_some())
|
||||
} else {
|
||||
None
|
||||
};
|
||||
if is_intended_secondary.is_some() || is_observed_secondary.is_some() {
|
||||
shards.push(NodeShard {
|
||||
tenant_shard_id: *tid,
|
||||
is_intended_secondary,
|
||||
is_observed_secondary,
|
||||
});
|
||||
}
|
||||
}
|
||||
Ok(NodeShardResponse { node_id, shards })
|
||||
}
|
||||
|
||||
pub(crate) async fn get_leader(&self) -> DatabaseResult<Option<ControllerPersistence>> {
|
||||
self.persistence.get_leader().await
|
||||
}
|
||||
@@ -6702,35 +6497,4 @@ impl Service {
|
||||
) -> Result<(), DatabaseError> {
|
||||
self.persistence.safekeeper_upsert(record).await
|
||||
}
|
||||
|
||||
pub(crate) async fn update_shards_preferred_azs(
|
||||
&self,
|
||||
req: ShardsPreferredAzsRequest,
|
||||
) -> Result<ShardsPreferredAzsResponse, ApiError> {
|
||||
let preferred_azs = req.preferred_az_ids.into_iter().collect::<Vec<_>>();
|
||||
let updated = self
|
||||
.persistence
|
||||
.set_tenant_shard_preferred_azs(preferred_azs)
|
||||
.await
|
||||
.map_err(|err| {
|
||||
ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Failed to persist preferred AZs: {err}"
|
||||
))
|
||||
})?;
|
||||
|
||||
let mut updated_in_mem_and_db = Vec::default();
|
||||
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
for (tid, az_id) in updated {
|
||||
let shard = locked.tenants.get_mut(&tid);
|
||||
if let Some(shard) = shard {
|
||||
shard.set_preferred_az(az_id);
|
||||
updated_in_mem_and_db.push(tid);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(ShardsPreferredAzsResponse {
|
||||
updated: updated_in_mem_and_db,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -140,10 +140,6 @@ pub(crate) struct TenantShard {
|
||||
// Support/debug tool: if something is going wrong or flapping with scheduling, this may
|
||||
// be set to a non-active state to avoid making changes while the issue is fixed.
|
||||
scheduling_policy: ShardSchedulingPolicy,
|
||||
|
||||
// We should attempt to schedule this shard in the provided AZ to
|
||||
// decrease chances of cross-AZ compute.
|
||||
preferred_az_id: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Default, Clone, Debug, Serialize)]
|
||||
@@ -467,7 +463,6 @@ impl TenantShard {
|
||||
last_error: Arc::default(),
|
||||
pending_compute_notification: false,
|
||||
scheduling_policy: ShardSchedulingPolicy::default(),
|
||||
preferred_az_id: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1302,7 +1297,6 @@ impl TenantShard {
|
||||
pending_compute_notification: false,
|
||||
delayed_reconcile: false,
|
||||
scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(),
|
||||
preferred_az_id: tsp.preferred_az_id,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -1318,17 +1312,8 @@ impl TenantShard {
|
||||
config: serde_json::to_string(&self.config).unwrap(),
|
||||
splitting: SplitState::default(),
|
||||
scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(),
|
||||
preferred_az_id: self.preferred_az_id.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn preferred_az(&self) -> Option<&str> {
|
||||
self.preferred_az_id.as_deref()
|
||||
}
|
||||
|
||||
pub(crate) fn set_preferred_az(&mut self, preferred_az_id: String) {
|
||||
self.preferred_az_id = Some(preferred_az_id);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
//! Functionality for finding and purging garbage, as in "garbage collection".
|
||||
//!
|
||||
//! Garbage means S3 objects which are either not referenced by any metadata,
|
||||
//! or are referenced by a control plane tenant/timeline in a deleted state.
|
||||
//! Functionality for finding and purging garbage, as in "garbage collection". Garbage means
|
||||
//! S3 objects which are either not referenced by any metadata, or are referenced by a
|
||||
//! control plane tenant/timeline in a deleted state.
|
||||
|
||||
use std::{
|
||||
collections::{HashMap, HashSet},
|
||||
|
||||
@@ -74,9 +74,7 @@ pub async fn stream_tenant_shards<'a>(
|
||||
}
|
||||
|
||||
/// Given a `TenantShardId`, output a stream of the timelines within that tenant, discovered
|
||||
/// using a listing.
|
||||
///
|
||||
/// The listing is done before the stream is built, so that this
|
||||
/// using a listing. The listing is done before the stream is built, so that this
|
||||
/// function can be used to generate concurrency on a stream using buffer_unordered.
|
||||
pub async fn stream_tenant_timelines<'a>(
|
||||
remote_client: &'a GenericRemoteStorage,
|
||||
|
||||
@@ -440,10 +440,9 @@ async fn gc_ancestor(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Physical garbage collection: removing unused S3 objects.
|
||||
///
|
||||
/// This is distinct from the garbage collection done inside the pageserver, which operates at a higher level
|
||||
/// (keys, layers). This type of garbage collection is about removing:
|
||||
/// Physical garbage collection: removing unused S3 objects. This is distinct from the garbage collection
|
||||
/// done inside the pageserver, which operates at a higher level (keys, layers). This type of garbage collection
|
||||
/// is about removing:
|
||||
/// - Objects that were uploaded but never referenced in the remote index (e.g. because of a shutdown between
|
||||
/// uploading a layer and uploading an index)
|
||||
/// - Index objects from historic generations
|
||||
|
||||
@@ -18,7 +18,8 @@ Prerequisites:
|
||||
|
||||
Regression tests are in the 'regress' directory. They can be run in
|
||||
parallel to minimize total runtime. Most regression tests set up their
|
||||
environment with its own pageservers and safekeepers.
|
||||
environment with its own pageservers and safekeepers (but see
|
||||
`TEST_SHARED_FIXTURES`).
|
||||
|
||||
'pg_clients' contains tests for connecting with various client
|
||||
libraries. Each client test uses a Dockerfile that pulls an image that
|
||||
@@ -73,6 +74,7 @@ This is used to construct full path to the postgres binaries.
|
||||
Format is 2-digit major version number, e.g. `DEFAULT_PG_VERSION=16`
|
||||
`TEST_OUTPUT`: Set the directory where test state and test output files
|
||||
should go.
|
||||
`TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests.
|
||||
`RUST_LOG`: logging configuration to pass into Neon CLI
|
||||
|
||||
Useful parameters and commands:
|
||||
@@ -257,8 +259,11 @@ compute Postgres nodes. The connections between them can be configured to use JW
|
||||
authentication tokens, and some other configuration options can be tweaked too.
|
||||
|
||||
The easiest way to get access to a Neon Environment is by using the `neon_simple_env`
|
||||
fixture. For convenience, there is a branch called `main` in environments created with
|
||||
'neon_simple_env', ready to be used in the test.
|
||||
fixture. The 'simple' env may be shared across multiple tests, so don't shut down the nodes
|
||||
or make other destructive changes in that environment. Also don't assume that
|
||||
there are no tenants or branches or data in the cluster. For convenience, there is a
|
||||
branch called `empty`, though. The convention is to create a test-specific branch of
|
||||
that and load any test data there, instead of the 'main' branch.
|
||||
|
||||
For more complicated cases, you can build a custom Neon Environment, with the `neon_env`
|
||||
fixture:
|
||||
|
||||
@@ -10,8 +10,4 @@ pytest_plugins = (
|
||||
"fixtures.compare_fixtures",
|
||||
"fixtures.slow",
|
||||
"fixtures.flaky",
|
||||
"fixtures.shared_fixtures",
|
||||
"fixtures.function.neon_storage",
|
||||
"fixtures.session.neon_storage",
|
||||
"fixtures.session.s3",
|
||||
)
|
||||
|
||||
@@ -140,14 +140,6 @@ class TenantId(Id):
|
||||
return self.id.hex()
|
||||
|
||||
|
||||
class NodeId(Id):
|
||||
def __repr__(self) -> str:
|
||||
return f'`NodeId("{self.id.hex()}")'
|
||||
|
||||
def __str__(self) -> str:
|
||||
return self.id.hex()
|
||||
|
||||
|
||||
class TimelineId(Id):
|
||||
def __repr__(self) -> str:
|
||||
return f'TimelineId("{self.id.hex()}")'
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user