mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-22 15:41:15 +00:00
Compare commits
31 Commits
vlad/safek
...
yuchen/fix
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2279d0259d | ||
|
|
2fcac0e66b | ||
|
|
ecde8d7632 | ||
|
|
af8238ae52 | ||
|
|
ab47804d00 | ||
|
|
ecca62a45d | ||
|
|
34a4eb6f2a | ||
|
|
b6bc954c5d | ||
|
|
30680d1f32 | ||
|
|
f561cbe1c7 | ||
|
|
3525d2e381 | ||
|
|
17c002b660 | ||
|
|
aa9112efce | ||
|
|
027889b06c | ||
|
|
79929bb1b6 | ||
|
|
9132d80aa3 | ||
|
|
82e3f0ecba | ||
|
|
75aa19aa2d | ||
|
|
a8d9939ea9 | ||
|
|
f18aa04b90 | ||
|
|
01265b7bc6 | ||
|
|
f54f0e8e2d | ||
|
|
d6aa26a533 | ||
|
|
e1d0b73824 | ||
|
|
011c0a175f | ||
|
|
2a95a51a0d | ||
|
|
11fc1a4c12 | ||
|
|
93123f2623 | ||
|
|
1d3559d4bc | ||
|
|
73bdc9a2d0 | ||
|
|
d182ff294c |
36
.github/actions/set-docker-config-dir/action.yml
vendored
36
.github/actions/set-docker-config-dir/action.yml
vendored
@@ -1,36 +0,0 @@
|
||||
name: "Set custom docker config directory"
|
||||
description: "Create a directory for docker config and set DOCKER_CONFIG"
|
||||
|
||||
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
|
||||
runs:
|
||||
using: "composite"
|
||||
steps:
|
||||
- name: Show warning on GitHub-hosted runners
|
||||
if: runner.environment == 'github-hosted'
|
||||
shell: bash -euo pipefail {0}
|
||||
run: |
|
||||
# Using the following environment variables to find a path to the workflow file
|
||||
# ${GITHUB_WORKFLOW_REF} - octocat/hello-world/.github/workflows/my-workflow.yml@refs/heads/my_branch
|
||||
# ${GITHUB_REPOSITORY} - octocat/hello-world
|
||||
# ${GITHUB_REF} - refs/heads/my_branch
|
||||
# From https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/variables
|
||||
|
||||
filename_with_ref=${GITHUB_WORKFLOW_REF#"$GITHUB_REPOSITORY/"}
|
||||
filename=${filename_with_ref%"@$GITHUB_REF"}
|
||||
|
||||
# https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#setting-a-warning-message
|
||||
title='Unnecessary usage of `.github/actions/set-docker-config-dir`'
|
||||
message='No need to use `.github/actions/set-docker-config-dir` action on GitHub-hosted runners'
|
||||
echo "::warning file=${filename},title=${title}::${message}"
|
||||
|
||||
- uses: pyTooling/Actions/with-post-step@74afc5a42a17a046c90c68cb5cfa627e5c6c5b6b # v1.0.7
|
||||
env:
|
||||
DOCKER_CONFIG: .docker-custom-${{ github.run_id }}-${{ github.run_attempt }}
|
||||
with:
|
||||
main: |
|
||||
mkdir -p "${DOCKER_CONFIG}"
|
||||
echo DOCKER_CONFIG=${DOCKER_CONFIG} | tee -a $GITHUB_ENV
|
||||
post: |
|
||||
if [ -d "${DOCKER_CONFIG}" ]; then
|
||||
rm -r "${DOCKER_CONFIG}"
|
||||
fi
|
||||
37
.github/workflows/_check-codestyle-python.yml
vendored
Normal file
37
.github/workflows/_check-codestyle-python.yml
vendored
Normal file
@@ -0,0 +1,37 @@
|
||||
name: Check Codestyle Python
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
build-tools-image:
|
||||
description: 'build-tools image'
|
||||
required: true
|
||||
type: string
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
jobs:
|
||||
check-codestyle-python:
|
||||
runs-on: [ self-hosted, small ]
|
||||
container:
|
||||
image: ${{ inputs.build-tools-image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}
|
||||
|
||||
- run: ./scripts/pysync
|
||||
|
||||
- run: poetry run ruff check .
|
||||
- run: poetry run ruff format --check .
|
||||
- run: poetry run mypy .
|
||||
@@ -64,7 +64,7 @@ jobs:
|
||||
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: ./.github/actions/set-docker-config-dir
|
||||
- uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
|
||||
- uses: docker/setup-buildx-action@v3
|
||||
with:
|
||||
cache-binary: false
|
||||
|
||||
42
.github/workflows/build_and_test.yml
vendored
42
.github/workflows/build_and_test.yml
vendored
@@ -90,35 +90,10 @@ jobs:
|
||||
|
||||
check-codestyle-python:
|
||||
needs: [ check-permissions, build-build-tools-image ]
|
||||
runs-on: [ self-hosted, small ]
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Cache poetry deps
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}
|
||||
|
||||
- name: Install Python deps
|
||||
run: ./scripts/pysync
|
||||
|
||||
- name: Run `ruff check` to ensure code format
|
||||
run: poetry run ruff check .
|
||||
|
||||
- name: Run `ruff format` to ensure code format
|
||||
run: poetry run ruff format --check .
|
||||
|
||||
- name: Run mypy to check types
|
||||
run: poetry run mypy .
|
||||
uses: ./.github/workflows/_check-codestyle-python.yml
|
||||
with:
|
||||
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
|
||||
secrets: inherit
|
||||
|
||||
check-codestyle-jsonnet:
|
||||
needs: [ check-permissions, build-build-tools-image ]
|
||||
@@ -141,6 +116,7 @@ jobs:
|
||||
# Check that the vendor/postgres-* submodules point to the
|
||||
# corresponding REL_*_STABLE_neon branches.
|
||||
check-submodules:
|
||||
needs: [ check-permissions ]
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -552,7 +528,7 @@ jobs:
|
||||
with:
|
||||
submodules: true
|
||||
|
||||
- uses: ./.github/actions/set-docker-config-dir
|
||||
- uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
|
||||
- uses: docker/setup-buildx-action@v3
|
||||
with:
|
||||
cache-binary: false
|
||||
@@ -643,7 +619,7 @@ jobs:
|
||||
with:
|
||||
submodules: true
|
||||
|
||||
- uses: ./.github/actions/set-docker-config-dir
|
||||
- uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
|
||||
- uses: docker/setup-buildx-action@v3
|
||||
with:
|
||||
cache-binary: false
|
||||
@@ -824,7 +800,7 @@ jobs:
|
||||
curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
|
||||
chmod +x vm-builder
|
||||
|
||||
- uses: ./.github/actions/set-docker-config-dir
|
||||
- uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
@@ -860,7 +836,7 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: ./.github/actions/set-docker-config-dir
|
||||
- uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
|
||||
94
.github/workflows/pre-merge-checks.yml
vendored
Normal file
94
.github/workflows/pre-merge-checks.yml
vendored
Normal file
@@ -0,0 +1,94 @@
|
||||
name: Pre-merge checks
|
||||
|
||||
on:
|
||||
merge_group:
|
||||
branches:
|
||||
- main
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
get-changed-files:
|
||||
runs-on: ubuntu-22.04
|
||||
outputs:
|
||||
python-changed: ${{ steps.python-src.outputs.any_changed }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4
|
||||
id: python-src
|
||||
with:
|
||||
files: |
|
||||
.github/workflows/pre-merge-checks.yml
|
||||
**/**.py
|
||||
poetry.lock
|
||||
pyproject.toml
|
||||
|
||||
- name: PRINT ALL CHANGED FILES FOR DEBUG PURPOSES
|
||||
env:
|
||||
PYTHON_CHANGED_FILES: ${{ steps.python-src.outputs.all_changed_files }}
|
||||
run: |
|
||||
echo "${PYTHON_CHANGED_FILES}"
|
||||
|
||||
check-build-tools-image:
|
||||
if: needs.get-changed-files.outputs.python-changed == 'true'
|
||||
needs: [ get-changed-files ]
|
||||
uses: ./.github/workflows/check-build-tools-image.yml
|
||||
|
||||
build-build-tools-image:
|
||||
needs: [ check-build-tools-image ]
|
||||
uses: ./.github/workflows/build-build-tools-image.yml
|
||||
with:
|
||||
image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
|
||||
secrets: inherit
|
||||
|
||||
check-codestyle-python:
|
||||
if: needs.get-changed-files.outputs.python-changed == 'true'
|
||||
needs: [ get-changed-files, build-build-tools-image ]
|
||||
uses: ./.github/workflows/_check-codestyle-python.yml
|
||||
with:
|
||||
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
|
||||
secrets: inherit
|
||||
|
||||
# To get items from the merge queue merged into main we need to satisfy "Status checks that are required".
|
||||
# Currently we require 2 jobs (checks with exact name):
|
||||
# - conclusion
|
||||
# - neon-cloud-e2e
|
||||
conclusion:
|
||||
if: always()
|
||||
permissions:
|
||||
statuses: write # for `github.repos.createCommitStatus(...)`
|
||||
needs:
|
||||
- get-changed-files
|
||||
- check-codestyle-python
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- name: Create fake `neon-cloud-e2e` check
|
||||
uses: actions/github-script@v7
|
||||
with:
|
||||
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
|
||||
retries: 5
|
||||
script: |
|
||||
const { repo, owner } = context.repo;
|
||||
const targetUrl = `${context.serverUrl}/${owner}/${repo}/actions/runs/${context.runId}`;
|
||||
|
||||
await github.rest.repos.createCommitStatus({
|
||||
owner: owner,
|
||||
repo: repo,
|
||||
sha: context.sha,
|
||||
context: `neon-cloud-e2e`,
|
||||
state: `success`,
|
||||
target_url: targetUrl,
|
||||
description: `fake check for merge queue`,
|
||||
});
|
||||
|
||||
- name: Fail the job if any of the dependencies do not succeed or skipped
|
||||
run: exit 1
|
||||
if: |
|
||||
(contains(needs.check-codestyle-python.result, 'skipped') && needs.get-changed-files.outputs.python-changed == 'true')
|
||||
|| contains(needs.*.result, 'failure')
|
||||
|| contains(needs.*.result, 'cancelled')
|
||||
1
.github/workflows/report-workflow-stats.yml
vendored
1
.github/workflows/report-workflow-stats.yml
vendored
@@ -23,6 +23,7 @@ on:
|
||||
- Test Postgres client libraries
|
||||
- Trigger E2E Tests
|
||||
- cleanup caches by a branch
|
||||
- Pre-merge checks
|
||||
types: [completed]
|
||||
|
||||
jobs:
|
||||
|
||||
101
Cargo.lock
generated
101
Cargo.lock
generated
@@ -1245,7 +1245,7 @@ dependencies = [
|
||||
"tar",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-postgres 0.7.7",
|
||||
"tokio-postgres",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
@@ -1351,7 +1351,7 @@ dependencies = [
|
||||
"storage_broker",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-postgres 0.7.7",
|
||||
"tokio-postgres",
|
||||
"tokio-util",
|
||||
"toml",
|
||||
"toml_edit",
|
||||
@@ -3620,8 +3620,8 @@ dependencies = [
|
||||
"pageserver_compaction",
|
||||
"pin-project-lite",
|
||||
"postgres",
|
||||
"postgres-protocol 0.6.4",
|
||||
"postgres-types 0.2.4",
|
||||
"postgres-protocol",
|
||||
"postgres-types",
|
||||
"postgres_backend",
|
||||
"postgres_connection",
|
||||
"postgres_ffi",
|
||||
@@ -3649,7 +3649,7 @@ dependencies = [
|
||||
"tokio",
|
||||
"tokio-epoll-uring",
|
||||
"tokio-io-timeout",
|
||||
"tokio-postgres 0.7.7",
|
||||
"tokio-postgres",
|
||||
"tokio-stream",
|
||||
"tokio-tar",
|
||||
"tokio-util",
|
||||
@@ -3707,7 +3707,7 @@ dependencies = [
|
||||
"serde",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-postgres 0.7.7",
|
||||
"tokio-postgres",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"utils",
|
||||
@@ -4006,31 +4006,14 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres"
|
||||
version = "0.19.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
"futures-util",
|
||||
"log",
|
||||
"tokio",
|
||||
"tokio-postgres 0.7.7",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "postgres-protocol"
|
||||
version = "0.6.4"
|
||||
dependencies = [
|
||||
"base64 0.20.0",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
"hmac",
|
||||
"lazy_static",
|
||||
"md-5",
|
||||
"memchr",
|
||||
"rand 0.8.5",
|
||||
"sha2",
|
||||
"stringprep",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4052,17 +4035,6 @@ dependencies = [
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "postgres-types"
|
||||
version = "0.2.4"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
"postgres-protocol 0.6.4",
|
||||
"serde",
|
||||
"serde_json",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "postgres-types"
|
||||
version = "0.2.4"
|
||||
@@ -4070,7 +4042,7 @@ source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
"postgres-protocol 0.6.4 (git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2)",
|
||||
"postgres-protocol",
|
||||
"serde",
|
||||
"serde_json",
|
||||
]
|
||||
@@ -4088,7 +4060,7 @@ dependencies = [
|
||||
"serde",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-postgres 0.7.7",
|
||||
"tokio-postgres",
|
||||
"tokio-postgres-rustls",
|
||||
"tokio-rustls 0.26.0",
|
||||
"tokio-util",
|
||||
@@ -4103,7 +4075,7 @@ dependencies = [
|
||||
"itertools 0.10.5",
|
||||
"once_cell",
|
||||
"postgres",
|
||||
"tokio-postgres 0.7.7",
|
||||
"tokio-postgres",
|
||||
"url",
|
||||
]
|
||||
|
||||
@@ -4155,7 +4127,7 @@ dependencies = [
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"itertools 0.10.5",
|
||||
"postgres-protocol 0.6.4",
|
||||
"postgres-protocol",
|
||||
"rand 0.8.5",
|
||||
"serde",
|
||||
"thiserror",
|
||||
@@ -4341,7 +4313,7 @@ dependencies = [
|
||||
"parquet_derive",
|
||||
"pbkdf2",
|
||||
"pin-project-lite",
|
||||
"postgres-protocol 0.6.4",
|
||||
"postgres-protocol",
|
||||
"postgres_backend",
|
||||
"pq_proto",
|
||||
"prometheus",
|
||||
@@ -4376,7 +4348,7 @@ dependencies = [
|
||||
"tikv-jemalloc-ctl",
|
||||
"tikv-jemallocator",
|
||||
"tokio",
|
||||
"tokio-postgres 0.7.7",
|
||||
"tokio-postgres",
|
||||
"tokio-postgres-rustls",
|
||||
"tokio-rustls 0.26.0",
|
||||
"tokio-tungstenite",
|
||||
@@ -4771,6 +4743,7 @@ dependencies = [
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"rustls 0.22.4",
|
||||
"rustls-native-certs 0.7.0",
|
||||
"rustls-pemfile 2.1.1",
|
||||
"rustls-pki-types",
|
||||
"serde",
|
||||
@@ -5174,6 +5147,7 @@ dependencies = [
|
||||
"chrono",
|
||||
"clap",
|
||||
"crc32c",
|
||||
"criterion",
|
||||
"desim",
|
||||
"fail",
|
||||
"futures",
|
||||
@@ -5181,12 +5155,12 @@ dependencies = [
|
||||
"http 1.1.0",
|
||||
"humantime",
|
||||
"hyper 0.14.30",
|
||||
"itertools 0.10.5",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
"parking_lot 0.12.1",
|
||||
"postgres",
|
||||
"postgres-protocol 0.6.4",
|
||||
"postgres-protocol",
|
||||
"postgres_backend",
|
||||
"postgres_ffi",
|
||||
"pq_proto",
|
||||
@@ -5206,7 +5180,7 @@ dependencies = [
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-io-timeout",
|
||||
"tokio-postgres 0.7.7",
|
||||
"tokio-postgres",
|
||||
"tokio-stream",
|
||||
"tokio-tar",
|
||||
"tokio-util",
|
||||
@@ -5214,7 +5188,6 @@ dependencies = [
|
||||
"tracing-subscriber",
|
||||
"url",
|
||||
"utils",
|
||||
"wal_decoder",
|
||||
"walproposer",
|
||||
"workspace_hack",
|
||||
]
|
||||
@@ -5851,7 +5824,7 @@ dependencies = [
|
||||
"serde_json",
|
||||
"storage_controller_client",
|
||||
"tokio",
|
||||
"tokio-postgres 0.7.7",
|
||||
"tokio-postgres",
|
||||
"tokio-postgres-rustls",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
@@ -6248,28 +6221,6 @@ dependencies = [
|
||||
"syn 2.0.52",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.7"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
"futures-channel",
|
||||
"futures-util",
|
||||
"log",
|
||||
"parking_lot 0.12.1",
|
||||
"percent-encoding",
|
||||
"phf",
|
||||
"pin-project-lite",
|
||||
"postgres-protocol 0.6.4",
|
||||
"postgres-types 0.2.4",
|
||||
"socket2",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.7"
|
||||
@@ -6286,8 +6237,8 @@ dependencies = [
|
||||
"percent-encoding",
|
||||
"phf",
|
||||
"pin-project-lite",
|
||||
"postgres-protocol 0.6.4 (git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2)",
|
||||
"postgres-types 0.2.4 (git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2)",
|
||||
"postgres-protocol",
|
||||
"postgres-types",
|
||||
"socket2",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
@@ -6302,7 +6253,7 @@ dependencies = [
|
||||
"ring",
|
||||
"rustls 0.23.16",
|
||||
"tokio",
|
||||
"tokio-postgres 0.7.7",
|
||||
"tokio-postgres",
|
||||
"tokio-rustls 0.26.0",
|
||||
"x509-certificate",
|
||||
]
|
||||
@@ -6885,7 +6836,7 @@ dependencies = [
|
||||
"serde_json",
|
||||
"sysinfo",
|
||||
"tokio",
|
||||
"tokio-postgres 0.7.7",
|
||||
"tokio-postgres",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
@@ -7392,7 +7343,7 @@ dependencies = [
|
||||
"num-traits",
|
||||
"once_cell",
|
||||
"parquet",
|
||||
"postgres-types 0.2.4 (git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2)",
|
||||
"postgres-types",
|
||||
"prettyplease",
|
||||
"proc-macro2",
|
||||
"prost",
|
||||
@@ -7417,7 +7368,7 @@ dependencies = [
|
||||
"time",
|
||||
"time-macros",
|
||||
"tokio",
|
||||
"tokio-postgres 0.7.7 (git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2)",
|
||||
"tokio-postgres",
|
||||
"tokio-rustls 0.26.0",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
|
||||
15
Cargo.toml
15
Cargo.toml
@@ -214,14 +214,10 @@ log = "0.4"
|
||||
#
|
||||
# When those proxy changes are re-applied (see PR #8747), we can switch using
|
||||
# the tip of the 'neon' branch again.
|
||||
# postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
|
||||
# postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
|
||||
# postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
|
||||
# tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
|
||||
postgres = { path = "../../.cargo/git/checkouts/rust-postgres-e2c00088c8e2b112/20031d7/postgres" }
|
||||
postgres-protocol = { path = "../../.cargo/git/checkouts/rust-postgres-e2c00088c8e2b112/20031d7/postgres-protocol" }
|
||||
postgres-types = { path = "../../.cargo/git/checkouts/rust-postgres-e2c00088c8e2b112/20031d7/postgres-types" }
|
||||
tokio-postgres = { path = "../../.cargo/git/checkouts/rust-postgres-e2c00088c8e2b112/20031d7/tokio-postgres" }
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
|
||||
|
||||
## Local libraries
|
||||
compute_api = { version = "0.1", path = "./libs/compute_api/" }
|
||||
@@ -259,8 +255,7 @@ tonic-build = "0.12"
|
||||
[patch.crates-io]
|
||||
|
||||
# Needed to get `tokio-postgres-rustls` to depend on our fork.
|
||||
tokio-postgres = { path = "../../.cargo/git/checkouts/rust-postgres-e2c00088c8e2b112/20031d7/tokio-postgres" }
|
||||
# tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
|
||||
|
||||
################# Binary contents sections
|
||||
|
||||
|
||||
@@ -1,12 +1,66 @@
|
||||
ARG DEBIAN_VERSION=bullseye
|
||||
|
||||
FROM debian:${DEBIAN_VERSION}-slim
|
||||
FROM debian:bookworm-slim AS pgcopydb_builder
|
||||
ARG DEBIAN_VERSION
|
||||
|
||||
RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
|
||||
set -e && \
|
||||
apt update && \
|
||||
apt install -y --no-install-recommends \
|
||||
ca-certificates wget gpg && \
|
||||
wget -qO - https://www.postgresql.org/media/keys/ACCC4CF8.asc | gpg --dearmor -o /usr/share/keyrings/postgresql-keyring.gpg && \
|
||||
echo "deb [signed-by=/usr/share/keyrings/postgresql-keyring.gpg] http://apt.postgresql.org/pub/repos/apt bookworm-pgdg main" > /etc/apt/sources.list.d/pgdg.list && \
|
||||
apt-get update && \
|
||||
apt install -y --no-install-recommends \
|
||||
build-essential \
|
||||
autotools-dev \
|
||||
libedit-dev \
|
||||
libgc-dev \
|
||||
libpam0g-dev \
|
||||
libreadline-dev \
|
||||
libselinux1-dev \
|
||||
libxslt1-dev \
|
||||
libssl-dev \
|
||||
libkrb5-dev \
|
||||
zlib1g-dev \
|
||||
liblz4-dev \
|
||||
libpq5 \
|
||||
libpq-dev \
|
||||
libzstd-dev \
|
||||
postgresql-16 \
|
||||
postgresql-server-dev-16 \
|
||||
postgresql-common \
|
||||
python3-sphinx && \
|
||||
wget -O /tmp/pgcopydb.tar.gz https://github.com/dimitri/pgcopydb/archive/refs/tags/v0.17.tar.gz && \
|
||||
mkdir /tmp/pgcopydb && \
|
||||
tar -xzf /tmp/pgcopydb.tar.gz -C /tmp/pgcopydb --strip-components=1 && \
|
||||
cd /tmp/pgcopydb && \
|
||||
make -s clean && \
|
||||
make -s -j12 install && \
|
||||
libpq_path=$(find /lib /usr/lib -name "libpq.so.5" | head -n 1) && \
|
||||
mkdir -p /pgcopydb/lib && \
|
||||
cp "$libpq_path" /pgcopydb/lib/; \
|
||||
else \
|
||||
# copy command below will fail if we don't have dummy files, so we create them for other debian versions
|
||||
mkdir -p /usr/lib/postgresql/16/bin && touch /usr/lib/postgresql/16/bin/pgcopydb && \
|
||||
mkdir -p mkdir -p /pgcopydb/lib && touch /pgcopydb/lib/libpq.so.5; \
|
||||
fi
|
||||
|
||||
FROM debian:${DEBIAN_VERSION}-slim AS build_tools
|
||||
ARG DEBIAN_VERSION
|
||||
|
||||
# Add nonroot user
|
||||
RUN useradd -ms /bin/bash nonroot -b /home
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
|
||||
RUN mkdir -p /pgcopydb/bin && \
|
||||
mkdir -p /pgcopydb/lib && \
|
||||
chmod -R 755 /pgcopydb && \
|
||||
chown -R nonroot:nonroot /pgcopydb
|
||||
|
||||
COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/pgcopydb
|
||||
COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5
|
||||
|
||||
# System deps
|
||||
#
|
||||
# 'gdb' is included so that we get backtraces of core dumps produced in
|
||||
@@ -38,7 +92,7 @@ RUN set -e \
|
||||
libseccomp-dev \
|
||||
libsqlite3-dev \
|
||||
libssl-dev \
|
||||
$([[ "${DEBIAN_VERSION}" = "bullseye" ]] && libstdc++-10-dev || libstdc++-11-dev) \
|
||||
$([[ "${DEBIAN_VERSION}" = "bullseye" ]] && echo libstdc++-10-dev || echo libstdc++-11-dev) \
|
||||
libtool \
|
||||
libxml2-dev \
|
||||
libxmlsec1-dev \
|
||||
@@ -235,7 +289,13 @@ RUN whoami \
|
||||
&& cargo --version --verbose \
|
||||
&& rustup --version --verbose \
|
||||
&& rustc --version --verbose \
|
||||
&& clang --version
|
||||
&& clang --version
|
||||
|
||||
RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
|
||||
LD_LIBRARY_PATH=/pgcopydb/lib /pgcopydb/bin/pgcopydb --version; \
|
||||
else \
|
||||
echo "pgcopydb is not available for ${DEBIAN_VERSION}"; \
|
||||
fi
|
||||
|
||||
# Set following flag to check in Makefile if its running in Docker
|
||||
RUN touch /home/nonroot/.docker_build
|
||||
|
||||
@@ -559,8 +559,8 @@ RUN case "${PG_VERSION}" in \
|
||||
export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \
|
||||
;; \
|
||||
"v17") \
|
||||
export TIMESCALEDB_VERSION=2.17.0 \
|
||||
export TIMESCALEDB_CHECKSUM=155bf64391d3558c42f31ca0e523cfc6252921974f75298c9039ccad1c89811a \
|
||||
export TIMESCALEDB_VERSION=2.17.1 \
|
||||
export TIMESCALEDB_CHECKSUM=6277cf43f5695e23dae1c5cfeba00474d730b66ed53665a84b787a6bb1a57e28 \
|
||||
;; \
|
||||
esac && \
|
||||
wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \
|
||||
@@ -1151,8 +1151,8 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# The topmost commit in the `neon` branch at the time of writing this
|
||||
# https://github.com/Mooncake-Labs/pg_mooncake/commits/neon/
|
||||
# https://github.com/Mooncake-Labs/pg_mooncake/commit/568b5a82b5fc16136bdf4ca5aac3e0cc261ab48d
|
||||
ENV PG_MOONCAKE_VERSION=568b5a82b5fc16136bdf4ca5aac3e0cc261ab48d
|
||||
# https://github.com/Mooncake-Labs/pg_mooncake/commit/077c92c452bb6896a7b7776ee95f039984f076af
|
||||
ENV PG_MOONCAKE_VERSION=077c92c452bb6896a7b7776ee95f039984f076af
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
|
||||
RUN case "${PG_VERSION}" in \
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
metrics: [
|
||||
import 'sql_exporter/checkpoints_req.libsonnet',
|
||||
import 'sql_exporter/checkpoints_timed.libsonnet',
|
||||
import 'sql_exporter/compute_backpressure_throttling_ms.libsonnet',
|
||||
import 'sql_exporter/compute_backpressure_throttling_seconds.libsonnet',
|
||||
import 'sql_exporter/compute_current_lsn.libsonnet',
|
||||
import 'sql_exporter/compute_logical_snapshot_files.libsonnet',
|
||||
import 'sql_exporter/compute_receive_lsn.libsonnet',
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
SELECT neon.backpressure_throttling_time() AS throttled;
|
||||
@@ -1,10 +1,10 @@
|
||||
{
|
||||
metric_name: 'compute_backpressure_throttling_ms',
|
||||
metric_name: 'compute_backpressure_throttling_seconds',
|
||||
type: 'gauge',
|
||||
help: 'Time compute has spent throttled',
|
||||
key_labels: null,
|
||||
values: [
|
||||
'throttled',
|
||||
],
|
||||
query: importstr 'sql_exporter/compute_backpressure_throttling_ms.sql',
|
||||
query: importstr 'sql_exporter/compute_backpressure_throttling_seconds.sql',
|
||||
}
|
||||
@@ -0,0 +1 @@
|
||||
SELECT neon.backpressure_throttling_time()::float8 / 1000 AS throttled;
|
||||
@@ -330,6 +330,7 @@ fn wait_spec(
|
||||
build_tag,
|
||||
};
|
||||
let compute = Arc::new(compute_node);
|
||||
launch_lsn_lease_bg_task_for_static(&compute);
|
||||
|
||||
// If this is a pooled VM, prewarm before starting HTTP server and becoming
|
||||
// available for binding. Prewarming helps Postgres start quicker later,
|
||||
@@ -372,8 +373,6 @@ fn wait_spec(
|
||||
state.start_time = now;
|
||||
}
|
||||
|
||||
launch_lsn_lease_bg_task_for_static(&compute);
|
||||
|
||||
Ok(WaitSpecResult {
|
||||
compute,
|
||||
http_port,
|
||||
|
||||
@@ -364,11 +364,29 @@ impl ComputeNode {
|
||||
let pageserver_connect_micros = start_time.elapsed().as_micros() as u64;
|
||||
|
||||
let basebackup_cmd = match lsn {
|
||||
Lsn(0) => format!("basebackup {} {} --gzip", spec.tenant_id, spec.timeline_id),
|
||||
_ => format!(
|
||||
"basebackup {} {} {} --gzip",
|
||||
spec.tenant_id, spec.timeline_id, lsn
|
||||
),
|
||||
Lsn(0) => {
|
||||
if spec.spec.mode != ComputeMode::Primary {
|
||||
format!(
|
||||
"basebackup {} {} --gzip --replica",
|
||||
spec.tenant_id, spec.timeline_id
|
||||
)
|
||||
} else {
|
||||
format!("basebackup {} {} --gzip", spec.tenant_id, spec.timeline_id)
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
if spec.spec.mode != ComputeMode::Primary {
|
||||
format!(
|
||||
"basebackup {} {} {} --gzip --replica",
|
||||
spec.tenant_id, spec.timeline_id, lsn
|
||||
)
|
||||
} else {
|
||||
format!(
|
||||
"basebackup {} {} {} --gzip",
|
||||
spec.tenant_id, spec.timeline_id, lsn
|
||||
)
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let copyreader = client.copy_out(basebackup_cmd.as_str())?;
|
||||
|
||||
@@ -73,6 +73,12 @@ pub fn write_postgres_conf(
|
||||
)?;
|
||||
}
|
||||
|
||||
// Locales
|
||||
writeln!(file, "lc_messages='C.UTF-8'")?;
|
||||
writeln!(file, "lc_monetary='C.UTF-8'")?;
|
||||
writeln!(file, "lc_time='C.UTF-8'")?;
|
||||
writeln!(file, "lc_numeric='C.UTF-8'")?;
|
||||
|
||||
match spec.mode {
|
||||
ComputeMode::Primary => {}
|
||||
ComputeMode::Static(lsn) => {
|
||||
|
||||
@@ -944,6 +944,9 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
|
||||
pg_auth_type: AuthType::Trust,
|
||||
http_auth_type: AuthType::Trust,
|
||||
other: Default::default(),
|
||||
// Typical developer machines use disks with slow fsync, and we don't care
|
||||
// about data integrity: disable disk syncs.
|
||||
no_sync: true,
|
||||
}
|
||||
})
|
||||
.collect(),
|
||||
|
||||
@@ -225,6 +225,7 @@ pub struct PageServerConf {
|
||||
pub listen_http_addr: String,
|
||||
pub pg_auth_type: AuthType,
|
||||
pub http_auth_type: AuthType,
|
||||
pub no_sync: bool,
|
||||
}
|
||||
|
||||
impl Default for PageServerConf {
|
||||
@@ -235,6 +236,7 @@ impl Default for PageServerConf {
|
||||
listen_http_addr: String::new(),
|
||||
pg_auth_type: AuthType::Trust,
|
||||
http_auth_type: AuthType::Trust,
|
||||
no_sync: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -249,6 +251,8 @@ pub struct NeonLocalInitPageserverConf {
|
||||
pub listen_http_addr: String,
|
||||
pub pg_auth_type: AuthType,
|
||||
pub http_auth_type: AuthType,
|
||||
#[serde(default, skip_serializing_if = "std::ops::Not::not")]
|
||||
pub no_sync: bool,
|
||||
#[serde(flatten)]
|
||||
pub other: HashMap<String, toml::Value>,
|
||||
}
|
||||
@@ -261,6 +265,7 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf {
|
||||
listen_http_addr,
|
||||
pg_auth_type,
|
||||
http_auth_type,
|
||||
no_sync,
|
||||
other: _,
|
||||
} = conf;
|
||||
Self {
|
||||
@@ -269,6 +274,7 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf {
|
||||
listen_http_addr: listen_http_addr.clone(),
|
||||
pg_auth_type: *pg_auth_type,
|
||||
http_auth_type: *http_auth_type,
|
||||
no_sync: *no_sync,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -569,6 +575,8 @@ impl LocalEnv {
|
||||
listen_http_addr: String,
|
||||
pg_auth_type: AuthType,
|
||||
http_auth_type: AuthType,
|
||||
#[serde(default)]
|
||||
no_sync: bool,
|
||||
}
|
||||
let config_toml_path = dentry.path().join("pageserver.toml");
|
||||
let config_toml: PageserverConfigTomlSubset = toml_edit::de::from_str(
|
||||
@@ -591,6 +599,7 @@ impl LocalEnv {
|
||||
listen_http_addr,
|
||||
pg_auth_type,
|
||||
http_auth_type,
|
||||
no_sync,
|
||||
} = config_toml;
|
||||
let IdentityTomlSubset {
|
||||
id: identity_toml_id,
|
||||
@@ -607,6 +616,7 @@ impl LocalEnv {
|
||||
listen_http_addr,
|
||||
pg_auth_type,
|
||||
http_auth_type,
|
||||
no_sync,
|
||||
};
|
||||
pageservers.push(conf);
|
||||
}
|
||||
|
||||
@@ -273,6 +273,7 @@ impl PageServerNode {
|
||||
)
|
||||
})?;
|
||||
let args = vec!["-D", datadir_path_str];
|
||||
|
||||
background_process::start_process(
|
||||
"pageserver",
|
||||
&datadir,
|
||||
|
||||
@@ -64,6 +64,7 @@ pub struct ConfigToml {
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub wal_redo_timeout: Duration,
|
||||
pub superuser: String,
|
||||
pub locale: String,
|
||||
pub page_cache_size: usize,
|
||||
pub max_file_descriptors: usize,
|
||||
pub pg_distrib_dir: Option<Utf8PathBuf>,
|
||||
@@ -106,6 +107,8 @@ pub struct ConfigToml {
|
||||
pub ephemeral_bytes_per_memory_kb: usize,
|
||||
pub l0_flush: Option<crate::models::L0FlushConfig>,
|
||||
pub virtual_file_io_mode: Option<crate::models::virtual_file::IoMode>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub no_sync: Option<bool>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
@@ -274,6 +277,7 @@ pub mod defaults {
|
||||
pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";
|
||||
|
||||
pub const DEFAULT_SUPERUSER: &str = "cloud_admin";
|
||||
pub const DEFAULT_LOCALE: &str = "C.UTF-8";
|
||||
|
||||
pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
|
||||
pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;
|
||||
@@ -324,6 +328,7 @@ impl Default for ConfigToml {
|
||||
wal_redo_timeout: (humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT)
|
||||
.expect("cannot parse default wal redo timeout")),
|
||||
superuser: (DEFAULT_SUPERUSER.to_string()),
|
||||
locale: DEFAULT_LOCALE.to_string(),
|
||||
page_cache_size: (DEFAULT_PAGE_CACHE_SIZE),
|
||||
max_file_descriptors: (DEFAULT_MAX_FILE_DESCRIPTORS),
|
||||
pg_distrib_dir: None, // Utf8PathBuf::from("./pg_install"), // TODO: formely, this was std::env::current_dir()
|
||||
@@ -389,6 +394,7 @@ impl Default for ConfigToml {
|
||||
l0_flush: None,
|
||||
virtual_file_io_mode: None,
|
||||
tenant_config: TenantConfigToml::default(),
|
||||
no_sync: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -24,7 +24,7 @@ pub struct Key {
|
||||
|
||||
/// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as
|
||||
/// a struct of fields.
|
||||
#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize, Debug)]
|
||||
#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd)]
|
||||
pub struct CompactKey(i128);
|
||||
|
||||
/// The storage key size.
|
||||
|
||||
@@ -24,7 +24,7 @@ use postgres_ffi::Oid;
|
||||
// FIXME: should move 'forknum' as last field to keep this consistent with Postgres.
|
||||
// Then we could replace the custom Ord and PartialOrd implementations below with
|
||||
// deriving them. This will require changes in walredoproc.c.
|
||||
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize)]
|
||||
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize)]
|
||||
pub struct RelTag {
|
||||
pub forknum: u8,
|
||||
pub spcnode: Oid,
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
use std::ffi::CStr;
|
||||
use std::ffi::{CStr, CString};
|
||||
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use crc32c::crc32c_append;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use super::bindings::{XLogLongPageHeaderData, XLogPageHeaderData, XLOG_PAGE_MAGIC};
|
||||
use super::bindings::{RmgrId, XLogLongPageHeaderData, XLogPageHeaderData, XLOG_PAGE_MAGIC};
|
||||
use super::xlog_utils::{
|
||||
XlLogicalMessage, XLOG_RECORD_CRC_OFFS, XLOG_SIZE_OF_XLOG_RECORD, XLP_BKP_REMOVABLE,
|
||||
XLP_FIRST_IS_CONTRECORD,
|
||||
@@ -16,11 +16,65 @@ use crate::pg_constants::{
|
||||
};
|
||||
use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
|
||||
|
||||
/// Generates binary WAL records for use in tests and benchmarks. Currently only generates logical
|
||||
/// messages (effectively noops) with a fixed payload. It is used as an iterator which yields
|
||||
/// encoded bytes for a single WAL record, including internal page headers if it spans pages.
|
||||
/// Concatenating the bytes will yield a complete, well-formed WAL, which can be chunked at segment
|
||||
/// boundaries if desired. Not optimized for performance.
|
||||
/// A WAL record payload. Will be prefixed by an XLogRecord header when encoded.
|
||||
pub struct Record {
|
||||
pub rmid: RmgrId,
|
||||
pub info: u8,
|
||||
pub data: Bytes,
|
||||
}
|
||||
|
||||
impl Record {
|
||||
/// Encodes the WAL record including an XLogRecord header. prev_lsn is the start position of
|
||||
/// the previous record in the WAL -- this is ignored by the Safekeeper, but not Postgres.
|
||||
pub fn encode(&self, prev_lsn: Lsn) -> Bytes {
|
||||
// Prefix data with block ID and length.
|
||||
let data_header = Bytes::from(match self.data.len() {
|
||||
0 => vec![],
|
||||
1..=255 => vec![XLR_BLOCK_ID_DATA_SHORT, self.data.len() as u8],
|
||||
256.. => {
|
||||
let len_bytes = (self.data.len() as u32).to_le_bytes();
|
||||
[&[XLR_BLOCK_ID_DATA_LONG], len_bytes.as_slice()].concat()
|
||||
}
|
||||
});
|
||||
|
||||
// Construct the WAL record header.
|
||||
let mut header = XLogRecord {
|
||||
xl_tot_len: (XLOG_SIZE_OF_XLOG_RECORD + data_header.len() + self.data.len()) as u32,
|
||||
xl_xid: 0,
|
||||
xl_prev: prev_lsn.into(),
|
||||
xl_info: self.info,
|
||||
xl_rmid: self.rmid,
|
||||
__bindgen_padding_0: [0; 2],
|
||||
xl_crc: 0, // see below
|
||||
};
|
||||
|
||||
// Compute the CRC checksum for the data, and the header up to the CRC field.
|
||||
let mut crc = 0;
|
||||
crc = crc32c_append(crc, &data_header);
|
||||
crc = crc32c_append(crc, &self.data);
|
||||
crc = crc32c_append(crc, &header.encode().unwrap()[0..XLOG_RECORD_CRC_OFFS]);
|
||||
header.xl_crc = crc;
|
||||
|
||||
// Encode the final header and record.
|
||||
let header = header.encode().unwrap();
|
||||
|
||||
[header, data_header, self.data.clone()].concat().into()
|
||||
}
|
||||
}
|
||||
|
||||
/// Generates WAL record payloads.
|
||||
///
|
||||
/// TODO: currently only provides LogicalMessageGenerator for trivial noop messages. Add a generator
|
||||
/// that creates a table and inserts rows.
|
||||
pub trait RecordGenerator: Iterator<Item = Record> {}
|
||||
|
||||
impl<I: Iterator<Item = Record>> RecordGenerator for I {}
|
||||
|
||||
/// Generates binary WAL for use in tests and benchmarks. The provided record generator constructs
|
||||
/// the WAL records. It is used as an iterator which yields encoded bytes for a single WAL record,
|
||||
/// including internal page headers if it spans pages. Concatenating the bytes will yield a
|
||||
/// complete, well-formed WAL, which can be chunked at segment boundaries if desired. Not optimized
|
||||
/// for performance.
|
||||
///
|
||||
/// The WAL format is version-dependant (see e.g. `XLOG_PAGE_MAGIC`), so make sure to import this
|
||||
/// for the appropriate Postgres version (e.g. `postgres_ffi::v17::wal_generator::WalGenerator`).
|
||||
@@ -31,10 +85,10 @@ use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
|
||||
/// | Segment 1 | Segment 2 | Segment 3 |
|
||||
/// | Page 1 | Page 2 | Page 3 | Page 4 | Page 5 | Page 6 | Page 7 | Page 8 | Page 9 |
|
||||
/// | R1 | R2 |R3| R4 | R5 | R6 | R7 | R8 |
|
||||
///
|
||||
/// TODO: support generating actual tables and rows.
|
||||
#[derive(Default)]
|
||||
pub struct WalGenerator {
|
||||
pub struct WalGenerator<R: RecordGenerator> {
|
||||
/// Generates record payloads for the WAL.
|
||||
pub record_generator: R,
|
||||
/// Current LSN to append the next record at.
|
||||
///
|
||||
/// Callers can modify this (and prev_lsn) to restart generation at a different LSN, but should
|
||||
@@ -46,73 +100,35 @@ pub struct WalGenerator {
|
||||
pub prev_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl WalGenerator {
|
||||
// For now, hardcode the message payload.
|
||||
// TODO: support specifying the payload size.
|
||||
const PREFIX: &CStr = c"prefix";
|
||||
const MESSAGE: &[u8] = b"message";
|
||||
|
||||
// Hardcode the sys, timeline, and DB IDs. We can make them configurable if we care about them.
|
||||
impl<R: RecordGenerator> WalGenerator<R> {
|
||||
// Hardcode the sys and timeline ID. We can make them configurable if we care about them.
|
||||
const SYS_ID: u64 = 0;
|
||||
const TIMELINE_ID: u32 = 1;
|
||||
const DB_ID: u32 = 0;
|
||||
|
||||
/// Creates a new WAL generator, which emits logical message records (noops).
|
||||
pub fn new() -> Self {
|
||||
Self::default()
|
||||
/// Creates a new WAL generator with the given record generator.
|
||||
pub fn new(record_generator: R) -> WalGenerator<R> {
|
||||
Self {
|
||||
record_generator,
|
||||
lsn: Lsn(0),
|
||||
prev_lsn: Lsn(0),
|
||||
}
|
||||
}
|
||||
|
||||
/// Encodes a logical message (basically a noop), with the given prefix and message.
|
||||
pub(crate) fn encode_logical_message(prefix: &CStr, message: &[u8]) -> Bytes {
|
||||
let prefix = prefix.to_bytes_with_nul();
|
||||
let header = XlLogicalMessage {
|
||||
db_id: Self::DB_ID,
|
||||
transactional: 0,
|
||||
prefix_size: prefix.len() as u64,
|
||||
message_size: message.len() as u64,
|
||||
};
|
||||
[&header.encode(), prefix, message].concat().into()
|
||||
/// Appends a record with an arbitrary payload at the current LSN, then increments the LSN.
|
||||
/// Returns the WAL bytes for the record, including page headers and padding, and the start LSN.
|
||||
fn append_record(&mut self, record: Record) -> (Lsn, Bytes) {
|
||||
let record = record.encode(self.prev_lsn);
|
||||
let record = Self::insert_pages(record, self.lsn);
|
||||
let record = Self::pad_record(record, self.lsn);
|
||||
let lsn = self.lsn;
|
||||
self.prev_lsn = self.lsn;
|
||||
self.lsn += record.len() as u64;
|
||||
(lsn, record)
|
||||
}
|
||||
|
||||
/// Encode a WAL record with the given payload data (e.g. a logical message).
|
||||
pub(crate) fn encode_record(data: Bytes, rmid: u8, info: u8, prev_lsn: Lsn) -> Bytes {
|
||||
// Prefix data with block ID and length.
|
||||
let data_header = Bytes::from(match data.len() {
|
||||
0 => vec![],
|
||||
1..=255 => vec![XLR_BLOCK_ID_DATA_SHORT, data.len() as u8],
|
||||
256.. => {
|
||||
let len_bytes = (data.len() as u32).to_le_bytes();
|
||||
[&[XLR_BLOCK_ID_DATA_LONG], len_bytes.as_slice()].concat()
|
||||
}
|
||||
});
|
||||
|
||||
// Construct the WAL record header.
|
||||
let mut header = XLogRecord {
|
||||
xl_tot_len: (XLOG_SIZE_OF_XLOG_RECORD + data_header.len() + data.len()) as u32,
|
||||
xl_xid: 0,
|
||||
xl_prev: prev_lsn.into(),
|
||||
xl_info: info,
|
||||
xl_rmid: rmid,
|
||||
__bindgen_padding_0: [0; 2],
|
||||
xl_crc: 0, // see below
|
||||
};
|
||||
|
||||
// Compute the CRC checksum for the data, and the header up to the CRC field.
|
||||
let mut crc = 0;
|
||||
crc = crc32c_append(crc, &data_header);
|
||||
crc = crc32c_append(crc, &data);
|
||||
crc = crc32c_append(crc, &header.encode().unwrap()[0..XLOG_RECORD_CRC_OFFS]);
|
||||
header.xl_crc = crc;
|
||||
|
||||
// Encode the final header and record.
|
||||
let header = header.encode().unwrap();
|
||||
|
||||
[header, data_header, data].concat().into()
|
||||
}
|
||||
|
||||
/// Injects page headers on 8KB page boundaries. Takes the current LSN position where the record
|
||||
/// Inserts page headers on 8KB page boundaries. Takes the current LSN position where the record
|
||||
/// is to be appended.
|
||||
fn encode_pages(record: Bytes, mut lsn: Lsn) -> Bytes {
|
||||
fn insert_pages(record: Bytes, mut lsn: Lsn) -> Bytes {
|
||||
// Fast path: record fits in current page, and the page already has a header.
|
||||
if lsn.remaining_in_block() as usize >= record.len() && lsn.block_offset() > 0 {
|
||||
return record;
|
||||
@@ -173,31 +189,71 @@ impl WalGenerator {
|
||||
}
|
||||
[record, Bytes::from(vec![0; padding])].concat().into()
|
||||
}
|
||||
|
||||
/// Generates a record with an arbitrary payload at the current LSN, then increments the LSN.
|
||||
pub fn generate_record(&mut self, data: Bytes, rmid: u8, info: u8) -> Bytes {
|
||||
let record = Self::encode_record(data, rmid, info, self.prev_lsn);
|
||||
let record = Self::encode_pages(record, self.lsn);
|
||||
let record = Self::pad_record(record, self.lsn);
|
||||
self.prev_lsn = self.lsn;
|
||||
self.lsn += record.len() as u64;
|
||||
record
|
||||
}
|
||||
|
||||
/// Generates a logical message at the current LSN. Can be used to construct arbitrary messages.
|
||||
pub fn generate_logical_message(&mut self, prefix: &CStr, message: &[u8]) -> Bytes {
|
||||
let data = Self::encode_logical_message(prefix, message);
|
||||
self.generate_record(data, RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE)
|
||||
}
|
||||
}
|
||||
|
||||
/// Generate WAL records as an iterator.
|
||||
impl Iterator for WalGenerator {
|
||||
/// Generates WAL records as an iterator.
|
||||
impl<R: RecordGenerator> Iterator for WalGenerator<R> {
|
||||
type Item = (Lsn, Bytes);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
let lsn = self.lsn;
|
||||
let record = self.generate_logical_message(Self::PREFIX, Self::MESSAGE);
|
||||
Some((lsn, record))
|
||||
let record = self.record_generator.next()?;
|
||||
Some(self.append_record(record))
|
||||
}
|
||||
}
|
||||
|
||||
/// Generates logical message records (effectively noops) with a fixed message.
|
||||
pub struct LogicalMessageGenerator {
|
||||
prefix: CString,
|
||||
message: Vec<u8>,
|
||||
}
|
||||
|
||||
impl LogicalMessageGenerator {
|
||||
const DB_ID: u32 = 0; // hardcoded for now
|
||||
const RM_ID: RmgrId = RM_LOGICALMSG_ID;
|
||||
const INFO: u8 = XLOG_LOGICAL_MESSAGE;
|
||||
|
||||
/// Creates a new LogicalMessageGenerator.
|
||||
pub fn new(prefix: &CStr, message: &[u8]) -> Self {
|
||||
Self {
|
||||
prefix: prefix.to_owned(),
|
||||
message: message.to_owned(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Encodes a logical message.
|
||||
fn encode(prefix: &CStr, message: &[u8]) -> Bytes {
|
||||
let prefix = prefix.to_bytes_with_nul();
|
||||
let header = XlLogicalMessage {
|
||||
db_id: Self::DB_ID,
|
||||
transactional: 0,
|
||||
prefix_size: prefix.len() as u64,
|
||||
message_size: message.len() as u64,
|
||||
};
|
||||
[&header.encode(), prefix, message].concat().into()
|
||||
}
|
||||
}
|
||||
|
||||
impl Iterator for LogicalMessageGenerator {
|
||||
type Item = Record;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
Some(Record {
|
||||
rmid: Self::RM_ID,
|
||||
info: Self::INFO,
|
||||
data: Self::encode(&self.prefix, &self.message),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl WalGenerator<LogicalMessageGenerator> {
|
||||
/// Convenience method for appending a WAL record with an arbitrary logical message at the
|
||||
/// current WAL LSN position. Returns the start LSN and resulting WAL bytes.
|
||||
pub fn append_logical_message(&mut self, prefix: &CStr, message: &[u8]) -> (Lsn, Bytes) {
|
||||
let record = Record {
|
||||
rmid: LogicalMessageGenerator::RM_ID,
|
||||
info: LogicalMessageGenerator::INFO,
|
||||
data: LogicalMessageGenerator::encode(prefix, message),
|
||||
};
|
||||
self.append_record(record)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,7 +16,7 @@ use utils::bin_ser::DeserializeError;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
#[derive(Debug)]
|
||||
pub struct XlMultiXactCreate {
|
||||
pub mid: MultiXactId,
|
||||
/* new MultiXact's ID */
|
||||
@@ -46,7 +46,7 @@ impl XlMultiXactCreate {
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
#[derive(Debug)]
|
||||
pub struct XlMultiXactTruncate {
|
||||
pub oldest_multi_db: Oid,
|
||||
/* to-be-truncated range of multixact offsets */
|
||||
@@ -72,7 +72,7 @@ impl XlMultiXactTruncate {
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
#[derive(Debug)]
|
||||
pub struct XlRelmapUpdate {
|
||||
pub dbid: Oid, /* database ID, or 0 for shared map */
|
||||
pub tsid: Oid, /* database's tablespace, or pg_global */
|
||||
@@ -90,7 +90,7 @@ impl XlRelmapUpdate {
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
#[derive(Debug)]
|
||||
pub struct XlReploriginDrop {
|
||||
pub node_id: RepOriginId,
|
||||
}
|
||||
@@ -104,7 +104,7 @@ impl XlReploriginDrop {
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
#[derive(Debug)]
|
||||
pub struct XlReploriginSet {
|
||||
pub remote_lsn: Lsn,
|
||||
pub node_id: RepOriginId,
|
||||
@@ -120,7 +120,7 @@ impl XlReploriginSet {
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct RelFileNode {
|
||||
pub spcnode: Oid, /* tablespace */
|
||||
pub dbnode: Oid, /* database */
|
||||
@@ -911,7 +911,7 @@ impl XlSmgrCreate {
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
#[derive(Debug)]
|
||||
pub struct XlSmgrTruncate {
|
||||
pub blkno: BlockNumber,
|
||||
pub rnode: RelFileNode,
|
||||
@@ -984,7 +984,7 @@ impl XlDropDatabase {
|
||||
/// xl_xact_parsed_abort structs in PostgreSQL, but we use the same
|
||||
/// struct for commits and aborts.
|
||||
///
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
#[derive(Debug)]
|
||||
pub struct XlXactParsedRecord {
|
||||
pub xid: TransactionId,
|
||||
pub info: u8,
|
||||
|
||||
@@ -12,9 +12,9 @@ use super::bindings::{
|
||||
CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, FullTransactionId, TimeLineID, TimestampTz,
|
||||
XLogLongPageHeaderData, XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC,
|
||||
};
|
||||
use super::wal_generator::WalGenerator;
|
||||
use super::wal_generator::LogicalMessageGenerator;
|
||||
use super::PG_MAJORVERSION;
|
||||
use crate::pg_constants::{self, RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE};
|
||||
use crate::pg_constants;
|
||||
use crate::PG_TLI;
|
||||
use crate::{uint32, uint64, Oid};
|
||||
use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
|
||||
@@ -493,12 +493,10 @@ pub fn encode_logical_message(prefix: &str, message: &str) -> Bytes {
|
||||
// This function can take untrusted input, so discard any NUL bytes in the prefix string.
|
||||
let prefix = CString::new(prefix.replace('\0', "")).expect("no NULs");
|
||||
let message = message.as_bytes();
|
||||
WalGenerator::encode_record(
|
||||
WalGenerator::encode_logical_message(&prefix, message),
|
||||
RM_LOGICALMSG_ID,
|
||||
XLOG_LOGICAL_MESSAGE,
|
||||
Lsn(0),
|
||||
)
|
||||
LogicalMessageGenerator::new(&prefix, message)
|
||||
.next()
|
||||
.unwrap()
|
||||
.encode(Lsn(0))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -13,4 +13,3 @@ rand.workspace = true
|
||||
tokio = { workspace = true, features = ["io-util"] }
|
||||
thiserror.workspace = true
|
||||
serde.workspace = true
|
||||
# wal_decoder.workspace = true
|
||||
|
||||
@@ -562,7 +562,6 @@ pub enum BeMessage<'a> {
|
||||
options: &'a [&'a str],
|
||||
},
|
||||
KeepAlive(WalSndKeepAlive),
|
||||
InterpretedWalRecord(InterpretedWalRecordBody<'a>),
|
||||
}
|
||||
|
||||
/// Common shorthands.
|
||||
@@ -666,12 +665,6 @@ pub struct XLogDataBody<'a> {
|
||||
pub data: &'a [u8],
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct InterpretedWalRecordBody<'a> {
|
||||
pub wal_end: u64,
|
||||
pub data: &'a [u8],
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct WalSndKeepAlive {
|
||||
pub wal_end: u64, // current end of WAL on the server
|
||||
@@ -1003,15 +996,6 @@ impl BeMessage<'_> {
|
||||
Ok(())
|
||||
})?
|
||||
}
|
||||
|
||||
BeMessage::InterpretedWalRecord(rec) => {
|
||||
buf.put_u8(b'd'); // arbitrary?
|
||||
write_body(buf, |buf| {
|
||||
buf.put_u8(b'0');
|
||||
buf.put_u64(rec.wal_end);
|
||||
buf.put_slice(rec.data);
|
||||
});
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -euxo pipefail
|
||||
|
||||
@@ -6,9 +6,44 @@ PG_BIN=$1
|
||||
WAL_PATH=$2
|
||||
DATA_DIR=$3
|
||||
PORT=$4
|
||||
PG_VERSION=$5
|
||||
SYSID=$(od -A n -j 24 -N 8 -t d8 "$WAL_PATH"/000000010000000000000002* | cut -c 3-)
|
||||
|
||||
# The way that initdb is invoked must match how the pageserver runs initdb.
|
||||
function initdb_with_args {
|
||||
local cmd=(
|
||||
"$PG_BIN"/initdb
|
||||
-E utf8
|
||||
-U cloud_admin
|
||||
-D "$DATA_DIR"
|
||||
--locale 'C.UTF-8'
|
||||
--lc-collate 'C.UTF-8'
|
||||
--lc-ctype 'C.UTF-8'
|
||||
--lc-messages 'C.UTF-8'
|
||||
--lc-monetary 'C.UTF-8'
|
||||
--lc-numeric 'C.UTF-8'
|
||||
--lc-time 'C.UTF-8'
|
||||
--sysid="$SYSID"
|
||||
)
|
||||
|
||||
case "$PG_VERSION" in
|
||||
14)
|
||||
# Postgres 14 and below didn't support --locale-provider
|
||||
;;
|
||||
15 | 16)
|
||||
cmd+=(--locale-provider 'libc')
|
||||
;;
|
||||
*)
|
||||
# Postgres 17 added the builtin provider
|
||||
cmd+=(--locale-provider 'builtin')
|
||||
;;
|
||||
esac
|
||||
|
||||
eval env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "${cmd[*]}"
|
||||
}
|
||||
|
||||
rm -fr "$DATA_DIR"
|
||||
env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "$PG_BIN"/initdb -E utf8 -U cloud_admin -D "$DATA_DIR" --sysid="$SYSID"
|
||||
initdb_with_args
|
||||
echo "port=$PORT" >> "$DATA_DIR"/postgresql.conf
|
||||
echo "shared_preload_libraries='\$libdir/neon_rmgr.so'" >> "$DATA_DIR"/postgresql.conf
|
||||
REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-)
|
||||
|
||||
@@ -7,65 +7,29 @@ use postgres_connection::{parse_host_port, PgConnectionConfig};
|
||||
|
||||
use crate::id::TenantTimelineId;
|
||||
|
||||
/// Protocol used for safekeeper recovery. This sends raw Postgres WAL.
|
||||
pub const POSTGRES_PROTO_VERSION: u8 = 0;
|
||||
/// Protocol used for safekeeper to pageserver communication.
|
||||
/// This sends interpreted WAL records for the pageserver to ingest
|
||||
/// and is shard-aware.
|
||||
pub const PAGESERVER_SAFEKEEPER_PROTO_VERSION: u8 = 1;
|
||||
|
||||
pub struct ConnectionConfigArgs<'a> {
|
||||
pub protocol_version: u8,
|
||||
|
||||
pub ttid: TenantTimelineId,
|
||||
pub shard_number: Option<u8>,
|
||||
pub shard_count: Option<u8>,
|
||||
pub shard_stripe_size: Option<u32>,
|
||||
|
||||
pub listen_pg_addr_str: &'a str,
|
||||
|
||||
pub auth_token: Option<&'a str>,
|
||||
pub availability_zone: Option<&'a str>,
|
||||
}
|
||||
|
||||
impl<'a> ConnectionConfigArgs<'a> {
|
||||
fn options(&'a self) -> Vec<String> {
|
||||
let mut options = vec![
|
||||
"-c".to_owned(),
|
||||
format!("timeline_id={}", self.ttid.timeline_id),
|
||||
format!("tenant_id={}", self.ttid.tenant_id),
|
||||
format!("protocol_version={}", self.protocol_version),
|
||||
];
|
||||
|
||||
if self.shard_number.is_some() {
|
||||
assert!(self.shard_count.is_some());
|
||||
assert!(self.shard_stripe_size.is_some());
|
||||
|
||||
options.push(format!("shard_count={}", self.shard_count.unwrap()));
|
||||
options.push(format!("shard_number={}", self.shard_number.unwrap()));
|
||||
options.push(format!(
|
||||
"shard_stripe_size={}",
|
||||
self.shard_stripe_size.unwrap()
|
||||
));
|
||||
}
|
||||
|
||||
options
|
||||
}
|
||||
}
|
||||
|
||||
/// Create client config for fetching WAL from safekeeper on particular timeline.
|
||||
/// listen_pg_addr_str is in form host:\[port\].
|
||||
pub fn wal_stream_connection_config(
|
||||
args: ConnectionConfigArgs,
|
||||
TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
}: TenantTimelineId,
|
||||
listen_pg_addr_str: &str,
|
||||
auth_token: Option<&str>,
|
||||
availability_zone: Option<&str>,
|
||||
) -> anyhow::Result<PgConnectionConfig> {
|
||||
let (host, port) =
|
||||
parse_host_port(args.listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?;
|
||||
parse_host_port(listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?;
|
||||
let port = port.unwrap_or(5432);
|
||||
let mut connstr = PgConnectionConfig::new_host_port(host, port)
|
||||
.extend_options(args.options())
|
||||
.set_password(args.auth_token.map(|s| s.to_owned()));
|
||||
.extend_options([
|
||||
"-c".to_owned(),
|
||||
format!("timeline_id={}", timeline_id),
|
||||
format!("tenant_id={}", tenant_id),
|
||||
])
|
||||
.set_password(auth_token.map(|s| s.to_owned()));
|
||||
|
||||
if let Some(availability_zone) = args.availability_zone {
|
||||
if let Some(availability_zone) = availability_zone {
|
||||
connstr = connstr.extend_options([format!("availability_zone={}", availability_zone)]);
|
||||
}
|
||||
|
||||
|
||||
@@ -32,19 +32,16 @@ use postgres_ffi::walrecord::{
|
||||
XlSmgrTruncate, XlXactParsedRecord,
|
||||
};
|
||||
use postgres_ffi::{Oid, TransactionId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::serialized_batch::SerializedValueBatch;
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub enum FlushUncommittedRecords {
|
||||
Yes,
|
||||
No,
|
||||
}
|
||||
|
||||
/// An interpreted Postgres WAL record, ready to be handled by the pageserver
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct InterpretedWalRecord {
|
||||
/// Optional metadata record - may cause writes to metadata keys
|
||||
/// in the storage engine
|
||||
@@ -65,7 +62,6 @@ pub struct InterpretedWalRecord {
|
||||
|
||||
/// The interpreted part of the Postgres WAL record which requires metadata
|
||||
/// writes to the underlying storage engine.
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub enum MetadataRecord {
|
||||
Heapam(HeapamRecord),
|
||||
Neonrmgr(NeonrmgrRecord),
|
||||
@@ -81,12 +77,10 @@ pub enum MetadataRecord {
|
||||
Replorigin(ReploriginRecord),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub enum HeapamRecord {
|
||||
ClearVmBits(ClearVmBits),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ClearVmBits {
|
||||
pub new_heap_blkno: Option<u32>,
|
||||
pub old_heap_blkno: Option<u32>,
|
||||
@@ -94,29 +88,24 @@ pub struct ClearVmBits {
|
||||
pub flags: u8,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub enum NeonrmgrRecord {
|
||||
ClearVmBits(ClearVmBits),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub enum SmgrRecord {
|
||||
Create(SmgrCreate),
|
||||
Truncate(XlSmgrTruncate),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct SmgrCreate {
|
||||
pub rel: RelTag,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub enum DbaseRecord {
|
||||
Create(DbaseCreate),
|
||||
Drop(DbaseDrop),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct DbaseCreate {
|
||||
pub db_id: Oid,
|
||||
pub tablespace_id: Oid,
|
||||
@@ -124,32 +113,27 @@ pub struct DbaseCreate {
|
||||
pub src_tablespace_id: Oid,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct DbaseDrop {
|
||||
pub db_id: Oid,
|
||||
pub tablespace_ids: Vec<Oid>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub enum ClogRecord {
|
||||
ZeroPage(ClogZeroPage),
|
||||
Truncate(ClogTruncate),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ClogZeroPage {
|
||||
pub segno: u32,
|
||||
pub rpageno: u32,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ClogTruncate {
|
||||
pub pageno: u32,
|
||||
pub oldest_xid: TransactionId,
|
||||
pub oldest_xid_db: Oid,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub enum XactRecord {
|
||||
Commit(XactCommon),
|
||||
Abort(XactCommon),
|
||||
@@ -158,7 +142,6 @@ pub enum XactRecord {
|
||||
Prepare(XactPrepare),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct XactCommon {
|
||||
pub parsed: XlXactParsedRecord,
|
||||
pub origin_id: u16,
|
||||
@@ -167,73 +150,61 @@ pub struct XactCommon {
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct XactPrepare {
|
||||
pub xl_xid: TransactionId,
|
||||
pub data: Bytes,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub enum MultiXactRecord {
|
||||
ZeroPage(MultiXactZeroPage),
|
||||
Create(XlMultiXactCreate),
|
||||
Truncate(XlMultiXactTruncate),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct MultiXactZeroPage {
|
||||
pub slru_kind: SlruKind,
|
||||
pub segno: u32,
|
||||
pub rpageno: u32,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub enum RelmapRecord {
|
||||
Update(RelmapUpdate),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct RelmapUpdate {
|
||||
pub update: XlRelmapUpdate,
|
||||
pub buf: Bytes,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub enum XlogRecord {
|
||||
Raw(RawXlogRecord),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct RawXlogRecord {
|
||||
pub info: u8,
|
||||
pub lsn: Lsn,
|
||||
pub buf: Bytes,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub enum LogicalMessageRecord {
|
||||
Put(PutLogicalMessage),
|
||||
#[cfg(feature = "testing")]
|
||||
Failpoint,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct PutLogicalMessage {
|
||||
pub path: String,
|
||||
pub buf: Bytes,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub enum StandbyRecord {
|
||||
RunningXacts(StandbyRunningXacts),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct StandbyRunningXacts {
|
||||
pub oldest_running_xid: TransactionId,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub enum ReploriginRecord {
|
||||
Set(XlReploriginSet),
|
||||
Drop(XlReploriginDrop),
|
||||
|
||||
@@ -16,7 +16,6 @@ use pageserver_api::shard::ShardIdentity;
|
||||
use pageserver_api::{key::CompactKey, value::Value};
|
||||
use postgres_ffi::walrecord::{DecodedBkpBlock, DecodedWALRecord};
|
||||
use postgres_ffi::{page_is_new, page_set_lsn, pg_constants, BLCKSZ};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::bin_ser::BeSer;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
@@ -30,7 +29,6 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
|
||||
/// relation sizes. In the case of "observed" values, we only need to know
|
||||
/// the key and LSN, so two types of metadata are supported to save on network
|
||||
/// bandwidth.
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub enum ValueMeta {
|
||||
Serialized(SerializedValueMeta),
|
||||
Observed(ObservedValueMeta),
|
||||
@@ -77,7 +75,6 @@ impl PartialEq for OrderedValueMeta {
|
||||
impl Eq for OrderedValueMeta {}
|
||||
|
||||
/// Metadata for a [`Value`] serialized into the batch.
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct SerializedValueMeta {
|
||||
pub key: CompactKey,
|
||||
pub lsn: Lsn,
|
||||
@@ -89,14 +86,12 @@ pub struct SerializedValueMeta {
|
||||
}
|
||||
|
||||
/// Metadata for a [`Value`] observed by the batch
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct ObservedValueMeta {
|
||||
pub key: CompactKey,
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
/// Batch of serialized [`Value`]s.
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct SerializedValueBatch {
|
||||
/// [`Value`]s serialized in EphemeralFile's native format,
|
||||
/// ready for disk write by the pageserver
|
||||
|
||||
@@ -154,13 +154,17 @@ fn main() -> anyhow::Result<()> {
|
||||
},
|
||||
};
|
||||
|
||||
let started = Instant::now();
|
||||
syncfs(dirfd)?;
|
||||
let elapsed = started.elapsed();
|
||||
info!(
|
||||
elapsed_ms = elapsed.as_millis(),
|
||||
"made tenant directory contents durable"
|
||||
);
|
||||
if conf.no_sync {
|
||||
info!("Skipping syncfs on startup");
|
||||
} else {
|
||||
let started = Instant::now();
|
||||
syncfs(dirfd)?;
|
||||
let elapsed = started.elapsed();
|
||||
info!(
|
||||
elapsed_ms = elapsed.as_millis(),
|
||||
"made tenant directory contents durable"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize up failpoints support
|
||||
|
||||
@@ -69,6 +69,7 @@ pub struct PageServerConf {
|
||||
pub wal_redo_timeout: Duration,
|
||||
|
||||
pub superuser: String,
|
||||
pub locale: String,
|
||||
|
||||
pub page_cache_size: usize,
|
||||
pub max_file_descriptors: usize,
|
||||
@@ -178,6 +179,9 @@ pub struct PageServerConf {
|
||||
|
||||
/// Direct IO settings
|
||||
pub virtual_file_io_mode: virtual_file::IoMode,
|
||||
|
||||
/// Optionally disable disk syncs (unsafe!)
|
||||
pub no_sync: bool,
|
||||
}
|
||||
|
||||
/// Token for authentication to safekeepers
|
||||
@@ -298,6 +302,7 @@ impl PageServerConf {
|
||||
wait_lsn_timeout,
|
||||
wal_redo_timeout,
|
||||
superuser,
|
||||
locale,
|
||||
page_cache_size,
|
||||
max_file_descriptors,
|
||||
pg_distrib_dir,
|
||||
@@ -332,6 +337,7 @@ impl PageServerConf {
|
||||
concurrent_tenant_size_logical_size_queries,
|
||||
virtual_file_io_engine,
|
||||
tenant_config,
|
||||
no_sync,
|
||||
} = config_toml;
|
||||
|
||||
let mut conf = PageServerConf {
|
||||
@@ -344,6 +350,7 @@ impl PageServerConf {
|
||||
wait_lsn_timeout,
|
||||
wal_redo_timeout,
|
||||
superuser,
|
||||
locale,
|
||||
page_cache_size,
|
||||
max_file_descriptors,
|
||||
http_auth_type,
|
||||
@@ -409,6 +416,7 @@ impl PageServerConf {
|
||||
.map(crate::l0_flush::L0FlushConfig::from)
|
||||
.unwrap_or_default(),
|
||||
virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()),
|
||||
no_sync: no_sync.unwrap_or(false),
|
||||
};
|
||||
|
||||
// ------------------------------------------------------------
|
||||
|
||||
@@ -2002,9 +2002,9 @@ async fn timeline_offload_handler(
|
||||
"timeline has attached children".into(),
|
||||
));
|
||||
}
|
||||
if !timeline.can_offload() {
|
||||
if let (false, reason) = timeline.can_offload() {
|
||||
return Err(ApiError::PreconditionFailed(
|
||||
"Timeline::can_offload() returned false".into(),
|
||||
format!("Timeline::can_offload() check failed: {}", reason) .into(),
|
||||
));
|
||||
}
|
||||
offload_timeline(&tenant, &timeline)
|
||||
@@ -2169,6 +2169,21 @@ async fn timeline_detach_ancestor_handler(
|
||||
let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download);
|
||||
let ctx = &ctx;
|
||||
|
||||
// Flush the upload queues of all timelines before detaching ancestor. We do the same thing again
|
||||
// during shutdown. This early upload ensures the pageserver does not need to upload too many
|
||||
// things and creates downtime during timeline reloads.
|
||||
for timeline in tenant.list_timelines() {
|
||||
timeline
|
||||
.remote_client
|
||||
.wait_completion()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
ApiError::PreconditionFailed(format!("cannot drain upload queue: {e}").into())
|
||||
})?;
|
||||
}
|
||||
|
||||
tracing::info!("all timeline upload queues are drained");
|
||||
|
||||
let timeline = tenant.get_timeline(timeline_id, true)?;
|
||||
|
||||
let progress = timeline
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
//! The Page Service listens for client connections and serves their GetPage@LSN
|
||||
//! requests.
|
||||
|
||||
use anyhow::Context;
|
||||
use anyhow::{bail, Context};
|
||||
use async_compression::tokio::write::GzipEncoder;
|
||||
use bytes::Buf;
|
||||
use futures::FutureExt;
|
||||
use itertools::Itertools;
|
||||
use once_cell::sync::OnceCell;
|
||||
use pageserver_api::models::TenantState;
|
||||
use pageserver_api::models::{
|
||||
@@ -1221,6 +1222,222 @@ impl PageServerHandler {
|
||||
}
|
||||
}
|
||||
|
||||
/// `basebackup tenant timeline [lsn] [--gzip] [--replica]`
|
||||
#[derive(Debug, Clone, Eq, PartialEq)]
|
||||
struct BaseBackupCmd {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Option<Lsn>,
|
||||
gzip: bool,
|
||||
replica: bool,
|
||||
}
|
||||
|
||||
/// `fullbackup tenant timeline [lsn] [prev_lsn]`
|
||||
#[derive(Debug, Clone, Eq, PartialEq)]
|
||||
struct FullBackupCmd {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Option<Lsn>,
|
||||
prev_lsn: Option<Lsn>,
|
||||
}
|
||||
|
||||
/// `pagestream_v2 tenant timeline`
|
||||
#[derive(Debug, Clone, Eq, PartialEq)]
|
||||
struct PageStreamCmd {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
}
|
||||
|
||||
/// `lease lsn tenant timeline lsn`
|
||||
#[derive(Debug, Clone, Eq, PartialEq)]
|
||||
struct LeaseLsnCmd {
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Eq, PartialEq)]
|
||||
enum PageServiceCmd {
|
||||
Set,
|
||||
PageStream(PageStreamCmd),
|
||||
BaseBackup(BaseBackupCmd),
|
||||
FullBackup(FullBackupCmd),
|
||||
LeaseLsn(LeaseLsnCmd),
|
||||
}
|
||||
|
||||
impl PageStreamCmd {
|
||||
fn parse(query: &str) -> anyhow::Result<Self> {
|
||||
let parameters = query.split_whitespace().collect_vec();
|
||||
if parameters.len() != 2 {
|
||||
bail!(
|
||||
"invalid number of parameters for pagestream command: {}",
|
||||
query
|
||||
);
|
||||
}
|
||||
let tenant_id = TenantId::from_str(parameters[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", parameters[0]))?;
|
||||
let timeline_id = TimelineId::from_str(parameters[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", parameters[1]))?;
|
||||
Ok(Self {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl FullBackupCmd {
|
||||
fn parse(query: &str) -> anyhow::Result<Self> {
|
||||
let parameters = query.split_whitespace().collect_vec();
|
||||
if parameters.len() < 2 || parameters.len() > 4 {
|
||||
bail!(
|
||||
"invalid number of parameters for basebackup command: {}",
|
||||
query
|
||||
);
|
||||
}
|
||||
let tenant_id = TenantId::from_str(parameters[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", parameters[0]))?;
|
||||
let timeline_id = TimelineId::from_str(parameters[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", parameters[1]))?;
|
||||
// The caller is responsible for providing correct lsn and prev_lsn.
|
||||
let lsn = if let Some(lsn_str) = parameters.get(2) {
|
||||
Some(
|
||||
Lsn::from_str(lsn_str)
|
||||
.with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let prev_lsn = if let Some(prev_lsn_str) = parameters.get(3) {
|
||||
Some(
|
||||
Lsn::from_str(prev_lsn_str)
|
||||
.with_context(|| format!("Failed to parse Lsn from {prev_lsn_str}"))?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
Ok(Self {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
prev_lsn,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl BaseBackupCmd {
|
||||
fn parse(query: &str) -> anyhow::Result<Self> {
|
||||
let parameters = query.split_whitespace().collect_vec();
|
||||
if parameters.len() < 2 {
|
||||
bail!(
|
||||
"invalid number of parameters for basebackup command: {}",
|
||||
query
|
||||
);
|
||||
}
|
||||
let tenant_id = TenantId::from_str(parameters[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", parameters[0]))?;
|
||||
let timeline_id = TimelineId::from_str(parameters[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", parameters[1]))?;
|
||||
let lsn;
|
||||
let flags_parse_from;
|
||||
if let Some(maybe_lsn) = parameters.get(2) {
|
||||
if *maybe_lsn == "latest" {
|
||||
lsn = None;
|
||||
flags_parse_from = 3;
|
||||
} else if maybe_lsn.starts_with("--") {
|
||||
lsn = None;
|
||||
flags_parse_from = 2;
|
||||
} else {
|
||||
lsn = Some(
|
||||
Lsn::from_str(maybe_lsn)
|
||||
.with_context(|| format!("Failed to parse lsn from {maybe_lsn}"))?,
|
||||
);
|
||||
flags_parse_from = 3;
|
||||
}
|
||||
} else {
|
||||
lsn = None;
|
||||
flags_parse_from = 2;
|
||||
}
|
||||
|
||||
let mut gzip = false;
|
||||
let mut replica = false;
|
||||
|
||||
for ¶m in ¶meters[flags_parse_from..] {
|
||||
match param {
|
||||
"--gzip" => {
|
||||
if gzip {
|
||||
bail!("duplicate parameter for basebackup command: {param}")
|
||||
}
|
||||
gzip = true
|
||||
}
|
||||
"--replica" => {
|
||||
if replica {
|
||||
bail!("duplicate parameter for basebackup command: {param}")
|
||||
}
|
||||
replica = true
|
||||
}
|
||||
_ => bail!("invalid parameter for basebackup command: {param}"),
|
||||
}
|
||||
}
|
||||
Ok(Self {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
gzip,
|
||||
replica,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl LeaseLsnCmd {
|
||||
fn parse(query: &str) -> anyhow::Result<Self> {
|
||||
let parameters = query.split_whitespace().collect_vec();
|
||||
if parameters.len() != 3 {
|
||||
bail!(
|
||||
"invalid number of parameters for lease lsn command: {}",
|
||||
query
|
||||
);
|
||||
}
|
||||
let tenant_shard_id = TenantShardId::from_str(parameters[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", parameters[0]))?;
|
||||
let timeline_id = TimelineId::from_str(parameters[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", parameters[1]))?;
|
||||
let lsn = Lsn::from_str(parameters[2])
|
||||
.with_context(|| format!("Failed to parse lsn from {}", parameters[2]))?;
|
||||
Ok(Self {
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl PageServiceCmd {
|
||||
fn parse(query: &str) -> anyhow::Result<Self> {
|
||||
let query = query.trim();
|
||||
let Some((cmd, other)) = query.split_once(' ') else {
|
||||
bail!("cannot parse query: {query}")
|
||||
};
|
||||
match cmd.to_ascii_lowercase().as_str() {
|
||||
"pagestream_v2" => Ok(Self::PageStream(PageStreamCmd::parse(other)?)),
|
||||
"basebackup" => Ok(Self::BaseBackup(BaseBackupCmd::parse(other)?)),
|
||||
"fullbackup" => Ok(Self::FullBackup(FullBackupCmd::parse(other)?)),
|
||||
"lease" => {
|
||||
let Some((cmd2, other)) = other.split_once(' ') else {
|
||||
bail!("invalid lease command: {cmd}");
|
||||
};
|
||||
let cmd2 = cmd2.to_ascii_lowercase();
|
||||
if cmd2 == "lsn" {
|
||||
Ok(Self::LeaseLsn(LeaseLsnCmd::parse(other)?))
|
||||
} else {
|
||||
bail!("invalid lease command: {cmd}");
|
||||
}
|
||||
}
|
||||
"set" => Ok(Self::Set),
|
||||
_ => Err(anyhow::anyhow!("unsupported command {cmd} in {query}")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<IO> postgres_backend::Handler<IO> for PageServerHandler
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
@@ -1277,206 +1494,137 @@ where
|
||||
fail::fail_point!("ps::connection-start::process-query");
|
||||
|
||||
let ctx = self.connection_ctx.attached_child();
|
||||
debug!("process query {query_string:?}");
|
||||
let parts = query_string.split_whitespace().collect::<Vec<_>>();
|
||||
if let Some(params) = parts.strip_prefix(&["pagestream_v2"]) {
|
||||
if params.len() != 2 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for pagestream command"
|
||||
)));
|
||||
}
|
||||
let tenant_id = TenantId::from_str(params[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
|
||||
let timeline_id = TimelineId::from_str(params[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::PageStreamV2)
|
||||
.inc();
|
||||
|
||||
self.handle_pagerequests(
|
||||
pgb,
|
||||
debug!("process query {query_string}");
|
||||
let query = PageServiceCmd::parse(query_string)?;
|
||||
match query {
|
||||
PageServiceCmd::PageStream(PageStreamCmd {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
PagestreamProtocolVersion::V2,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
} else if let Some(params) = parts.strip_prefix(&["basebackup"]) {
|
||||
if params.len() < 2 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for basebackup command"
|
||||
)));
|
||||
}) => {
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::PageStreamV2)
|
||||
.inc();
|
||||
|
||||
self.handle_pagerequests(
|
||||
pgb,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
PagestreamProtocolVersion::V2,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
PageServiceCmd::BaseBackup(BaseBackupCmd {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
gzip,
|
||||
replica,
|
||||
}) => {
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
let tenant_id = TenantId::from_str(params[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
|
||||
let timeline_id = TimelineId::from_str(params[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::Basebackup)
|
||||
.inc();
|
||||
|
||||
let mut lsn = None;
|
||||
let mut replica = false;
|
||||
let mut gzip = false;
|
||||
for param in ¶ms[2..] {
|
||||
if param.starts_with("--") {
|
||||
match *param {
|
||||
"--gzip" => gzip = true,
|
||||
"--replica" => replica = true,
|
||||
_ => {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"Unknown parameter {param}",
|
||||
)))
|
||||
}
|
||||
}
|
||||
} else {
|
||||
lsn = Some(
|
||||
Lsn::from_str(param)
|
||||
.with_context(|| format!("Failed to parse Lsn from {param}"))?,
|
||||
);
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::Basebackup)
|
||||
.inc();
|
||||
let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
|
||||
let res = async {
|
||||
self.handle_basebackup_request(
|
||||
pgb,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
None,
|
||||
false,
|
||||
gzip,
|
||||
replica,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
Result::<(), QueryError>::Ok(())
|
||||
}
|
||||
.await;
|
||||
metric_recording.observe(&res);
|
||||
res?;
|
||||
}
|
||||
// same as basebackup, but result includes relational data as well
|
||||
PageServiceCmd::FullBackup(FullBackupCmd {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
prev_lsn,
|
||||
}) => {
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
|
||||
let res = async {
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::Fullbackup)
|
||||
.inc();
|
||||
|
||||
// Check that the timeline exists
|
||||
self.handle_basebackup_request(
|
||||
pgb,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
None,
|
||||
prev_lsn,
|
||||
true,
|
||||
false,
|
||||
false,
|
||||
gzip,
|
||||
replica,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
Result::<(), QueryError>::Ok(())
|
||||
}
|
||||
.await;
|
||||
metric_recording.observe(&res);
|
||||
res?;
|
||||
}
|
||||
// same as basebackup, but result includes relational data as well
|
||||
else if let Some(params) = parts.strip_prefix(&["fullbackup"]) {
|
||||
if params.len() < 2 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for fullbackup command"
|
||||
)));
|
||||
PageServiceCmd::Set => {
|
||||
// important because psycopg2 executes "SET datestyle TO 'ISO'"
|
||||
// on connect
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
}
|
||||
|
||||
let tenant_id = TenantId::from_str(params[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
|
||||
let timeline_id = TimelineId::from_str(params[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
// The caller is responsible for providing correct lsn and prev_lsn.
|
||||
let lsn = if let Some(lsn_str) = params.get(2) {
|
||||
Some(
|
||||
Lsn::from_str(lsn_str)
|
||||
.with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let prev_lsn = if let Some(prev_lsn_str) = params.get(3) {
|
||||
Some(
|
||||
Lsn::from_str(prev_lsn_str)
|
||||
.with_context(|| format!("Failed to parse Lsn from {prev_lsn_str}"))?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::Fullbackup)
|
||||
.inc();
|
||||
|
||||
// Check that the timeline exists
|
||||
self.handle_basebackup_request(
|
||||
pgb,
|
||||
tenant_id,
|
||||
PageServiceCmd::LeaseLsn(LeaseLsnCmd {
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
prev_lsn,
|
||||
true,
|
||||
false,
|
||||
false,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.to_ascii_lowercase().starts_with("set ") {
|
||||
// important because psycopg2 executes "SET datestyle TO 'ISO'"
|
||||
// on connect
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.starts_with("lease lsn ") {
|
||||
let params = &parts[2..];
|
||||
if params.len() != 3 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number {} for lease lsn command",
|
||||
params.len()
|
||||
)));
|
||||
}) => {
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_shard_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
self.check_permission(Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::LeaseLsn)
|
||||
.inc();
|
||||
|
||||
match self
|
||||
.handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx)
|
||||
.await
|
||||
{
|
||||
Ok(()) => {
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?
|
||||
}
|
||||
Err(e) => {
|
||||
error!("error obtaining lsn lease for {lsn}: {e:?}");
|
||||
pgb.write_message_noflush(&BeMessage::ErrorResponse(
|
||||
&e.to_string(),
|
||||
Some(e.pg_error_code()),
|
||||
))?
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
let tenant_shard_id = TenantShardId::from_str(params[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
|
||||
let timeline_id = TimelineId::from_str(params[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_shard_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
self.check_permission(Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::LeaseLsn)
|
||||
.inc();
|
||||
|
||||
// The caller is responsible for providing correct lsn.
|
||||
let lsn = Lsn::from_str(params[2])
|
||||
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
|
||||
|
||||
match self
|
||||
.handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx)
|
||||
.await
|
||||
{
|
||||
Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
|
||||
Err(e) => {
|
||||
error!("error obtaining lsn lease for {lsn}: {e:?}");
|
||||
pgb.write_message_noflush(&BeMessage::ErrorResponse(
|
||||
&e.to_string(),
|
||||
Some(e.pg_error_code()),
|
||||
))?
|
||||
}
|
||||
};
|
||||
} else {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"unknown command {query_string}"
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -1525,3 +1673,181 @@ fn set_tracing_field_shard_id(timeline: &Timeline) {
|
||||
);
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use utils::shard::ShardCount;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn pageservice_cmd_parse() {
|
||||
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::generate();
|
||||
let cmd =
|
||||
PageServiceCmd::parse(&format!("pagestream_v2 {tenant_id} {timeline_id}")).unwrap();
|
||||
assert_eq!(
|
||||
cmd,
|
||||
PageServiceCmd::PageStream(PageStreamCmd {
|
||||
tenant_id,
|
||||
timeline_id
|
||||
})
|
||||
);
|
||||
let cmd = PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id}")).unwrap();
|
||||
assert_eq!(
|
||||
cmd,
|
||||
PageServiceCmd::BaseBackup(BaseBackupCmd {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn: None,
|
||||
gzip: false,
|
||||
replica: false
|
||||
})
|
||||
);
|
||||
let cmd =
|
||||
PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id} --gzip")).unwrap();
|
||||
assert_eq!(
|
||||
cmd,
|
||||
PageServiceCmd::BaseBackup(BaseBackupCmd {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn: None,
|
||||
gzip: true,
|
||||
replica: false
|
||||
})
|
||||
);
|
||||
let cmd =
|
||||
PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id} latest")).unwrap();
|
||||
assert_eq!(
|
||||
cmd,
|
||||
PageServiceCmd::BaseBackup(BaseBackupCmd {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn: None,
|
||||
gzip: false,
|
||||
replica: false
|
||||
})
|
||||
);
|
||||
let cmd = PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id} 0/16ABCDE"))
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
cmd,
|
||||
PageServiceCmd::BaseBackup(BaseBackupCmd {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn: Some(Lsn::from_str("0/16ABCDE").unwrap()),
|
||||
gzip: false,
|
||||
replica: false
|
||||
})
|
||||
);
|
||||
let cmd = PageServiceCmd::parse(&format!(
|
||||
"basebackup {tenant_id} {timeline_id} --replica --gzip"
|
||||
))
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
cmd,
|
||||
PageServiceCmd::BaseBackup(BaseBackupCmd {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn: None,
|
||||
gzip: true,
|
||||
replica: true
|
||||
})
|
||||
);
|
||||
let cmd = PageServiceCmd::parse(&format!(
|
||||
"basebackup {tenant_id} {timeline_id} 0/16ABCDE --replica --gzip"
|
||||
))
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
cmd,
|
||||
PageServiceCmd::BaseBackup(BaseBackupCmd {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn: Some(Lsn::from_str("0/16ABCDE").unwrap()),
|
||||
gzip: true,
|
||||
replica: true
|
||||
})
|
||||
);
|
||||
let cmd = PageServiceCmd::parse(&format!("fullbackup {tenant_id} {timeline_id}")).unwrap();
|
||||
assert_eq!(
|
||||
cmd,
|
||||
PageServiceCmd::FullBackup(FullBackupCmd {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn: None,
|
||||
prev_lsn: None
|
||||
})
|
||||
);
|
||||
let cmd = PageServiceCmd::parse(&format!(
|
||||
"fullbackup {tenant_id} {timeline_id} 0/16ABCDE 0/16ABCDF"
|
||||
))
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
cmd,
|
||||
PageServiceCmd::FullBackup(FullBackupCmd {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn: Some(Lsn::from_str("0/16ABCDE").unwrap()),
|
||||
prev_lsn: Some(Lsn::from_str("0/16ABCDF").unwrap()),
|
||||
})
|
||||
);
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
let cmd = PageServiceCmd::parse(&format!(
|
||||
"lease lsn {tenant_shard_id} {timeline_id} 0/16ABCDE"
|
||||
))
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
cmd,
|
||||
PageServiceCmd::LeaseLsn(LeaseLsnCmd {
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
lsn: Lsn::from_str("0/16ABCDE").unwrap(),
|
||||
})
|
||||
);
|
||||
let tenant_shard_id = TenantShardId::split(&tenant_shard_id, ShardCount(8))[1];
|
||||
let cmd = PageServiceCmd::parse(&format!(
|
||||
"lease lsn {tenant_shard_id} {timeline_id} 0/16ABCDE"
|
||||
))
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
cmd,
|
||||
PageServiceCmd::LeaseLsn(LeaseLsnCmd {
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
lsn: Lsn::from_str("0/16ABCDE").unwrap(),
|
||||
})
|
||||
);
|
||||
let cmd = PageServiceCmd::parse("set a = b").unwrap();
|
||||
assert_eq!(cmd, PageServiceCmd::Set);
|
||||
let cmd = PageServiceCmd::parse("SET foo").unwrap();
|
||||
assert_eq!(cmd, PageServiceCmd::Set);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn pageservice_cmd_err_handling() {
|
||||
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::generate();
|
||||
let cmd = PageServiceCmd::parse("unknown_command");
|
||||
assert!(cmd.is_err());
|
||||
let cmd = PageServiceCmd::parse("pagestream_v2");
|
||||
assert!(cmd.is_err());
|
||||
let cmd = PageServiceCmd::parse(&format!("pagestream_v2 {tenant_id}xxx"));
|
||||
assert!(cmd.is_err());
|
||||
let cmd = PageServiceCmd::parse(&format!("pagestream_v2 {tenant_id}xxx {timeline_id}xxx"));
|
||||
assert!(cmd.is_err());
|
||||
let cmd = PageServiceCmd::parse(&format!(
|
||||
"basebackup {tenant_id} {timeline_id} --gzip --gzip"
|
||||
));
|
||||
assert!(cmd.is_err());
|
||||
let cmd = PageServiceCmd::parse(&format!(
|
||||
"basebackup {tenant_id} {timeline_id} --gzip --unknown"
|
||||
));
|
||||
assert!(cmd.is_err());
|
||||
let cmd = PageServiceCmd::parse(&format!(
|
||||
"basebackup {tenant_id} {timeline_id} --gzip 0/16ABCDE"
|
||||
));
|
||||
assert!(cmd.is_err());
|
||||
let cmd = PageServiceCmd::parse(&format!("lease {tenant_id} {timeline_id} gzip 0/16ABCDE"));
|
||||
assert!(cmd.is_err());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -45,7 +45,7 @@ use wal_decoder::serialized_batch::SerializedValueBatch;
|
||||
pub const MAX_AUX_FILE_DELTAS: usize = 1024;
|
||||
|
||||
/// Max number of aux-file-related delta layers. The compaction will create a new image layer once this threshold is reached.
|
||||
pub const MAX_AUX_FILE_V2_DELTAS: usize = 64;
|
||||
pub const MAX_AUX_FILE_V2_DELTAS: usize = 16;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum LsnForTimestamp {
|
||||
@@ -1164,19 +1164,12 @@ impl<'a> DatadirModification<'a> {
|
||||
.get_rel_exists(rel, Version::Modified(self), ctx)
|
||||
.await?
|
||||
{
|
||||
tracing::debug!("Creating relation {rel:?} at lsn {}", self.get_lsn());
|
||||
|
||||
// create it with 0 size initially, the logic below will extend it
|
||||
self.put_rel_creation(rel, 0, ctx)
|
||||
.await
|
||||
.context("Relation Error")?;
|
||||
Ok(0)
|
||||
} else {
|
||||
tracing::debug!(
|
||||
"Skipping relation {rel:?} creation at lsn {}",
|
||||
self.get_lsn()
|
||||
);
|
||||
|
||||
self.tline
|
||||
.get_rel_size(rel, Version::Modified(self), ctx)
|
||||
.await
|
||||
@@ -1217,8 +1210,6 @@ impl<'a> DatadirModification<'a> {
|
||||
shard: &ShardIdentity,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
tracing::debug!("Ingesting batch with metadata: {:?}", batch.metadata);
|
||||
|
||||
let mut gaps_at_lsns = Vec::default();
|
||||
|
||||
for meta in batch.metadata.iter() {
|
||||
|
||||
@@ -2493,7 +2493,8 @@ impl Tenant {
|
||||
timelines_to_compact_or_offload = timelines
|
||||
.iter()
|
||||
.filter_map(|(timeline_id, timeline)| {
|
||||
let (is_active, can_offload) = (timeline.is_active(), timeline.can_offload());
|
||||
let (is_active, (can_offload, _)) =
|
||||
(timeline.is_active(), timeline.can_offload());
|
||||
let has_no_unoffloaded_children = {
|
||||
!timelines
|
||||
.iter()
|
||||
@@ -4779,10 +4780,18 @@ async fn run_initdb(
|
||||
|
||||
let _permit = INIT_DB_SEMAPHORE.acquire().await;
|
||||
|
||||
let initdb_command = tokio::process::Command::new(&initdb_bin_path)
|
||||
let mut initdb_command = tokio::process::Command::new(&initdb_bin_path);
|
||||
initdb_command
|
||||
.args(["--pgdata", initdb_target_dir.as_ref()])
|
||||
.args(["--username", &conf.superuser])
|
||||
.args(["--encoding", "utf8"])
|
||||
.args(["--locale", &conf.locale])
|
||||
.args(["--lc-collate", &conf.locale])
|
||||
.args(["--lc-ctype", &conf.locale])
|
||||
.args(["--lc-messages", &conf.locale])
|
||||
.args(["--lc-monetary", &conf.locale])
|
||||
.args(["--lc-numeric", &conf.locale])
|
||||
.args(["--lc-time", &conf.locale])
|
||||
.arg("--no-instructions")
|
||||
.arg("--no-sync")
|
||||
.env_clear()
|
||||
@@ -4792,15 +4801,27 @@ async fn run_initdb(
|
||||
// stdout invocation produces the same output every time, we don't need it
|
||||
.stdout(std::process::Stdio::null())
|
||||
// we would be interested in the stderr output, if there was any
|
||||
.stderr(std::process::Stdio::piped())
|
||||
.spawn()?;
|
||||
.stderr(std::process::Stdio::piped());
|
||||
|
||||
// Before version 14, only the libc provide was available.
|
||||
if pg_version > 14 {
|
||||
// Version 17 brought with it a builtin locale provider which only provides
|
||||
// C and C.UTF-8. While being safer for collation purposes since it is
|
||||
// guaranteed to be consistent throughout a major release, it is also more
|
||||
// performant.
|
||||
let locale_provider = if pg_version >= 17 { "builtin" } else { "libc" };
|
||||
|
||||
initdb_command.args(["--locale-provider", locale_provider]);
|
||||
}
|
||||
|
||||
let initdb_proc = initdb_command.spawn()?;
|
||||
|
||||
// Ideally we'd select here with the cancellation token, but the problem is that
|
||||
// we can't safely terminate initdb: it launches processes of its own, and killing
|
||||
// initdb doesn't kill them. After we return from this function, we want the target
|
||||
// directory to be able to be cleaned up.
|
||||
// See https://github.com/neondatabase/neon/issues/6385
|
||||
let initdb_output = initdb_command.wait_with_output().await?;
|
||||
let initdb_output = initdb_proc.wait_with_output().await?;
|
||||
if !initdb_output.status.success() {
|
||||
return Err(InitdbError::Failed(
|
||||
initdb_output.status,
|
||||
|
||||
@@ -1959,7 +1959,7 @@ impl TenantManager {
|
||||
attempt.before_reset_tenant();
|
||||
|
||||
let (_guard, progress) = utils::completion::channel();
|
||||
match tenant.shutdown(progress, ShutdownMode::Hard).await {
|
||||
match tenant.shutdown(progress, ShutdownMode::Flush).await {
|
||||
Ok(()) => {
|
||||
slot_guard.drop_old_value().expect("it was just shutdown");
|
||||
}
|
||||
|
||||
@@ -1445,7 +1445,7 @@ impl RemoteTimelineClient {
|
||||
let remote_path = remote_layer_path(
|
||||
&self.tenant_shard_id.tenant_id,
|
||||
&self.timeline_id,
|
||||
self.tenant_shard_id.to_index(),
|
||||
uploaded.metadata().shard,
|
||||
&uploaded.layer_desc().layer_name(),
|
||||
uploaded.metadata().generation,
|
||||
);
|
||||
@@ -1486,7 +1486,7 @@ impl RemoteTimelineClient {
|
||||
&adopted
|
||||
.get_timeline_id()
|
||||
.expect("Source timeline should be alive"),
|
||||
self.tenant_shard_id.to_index(),
|
||||
adopted.metadata().shard,
|
||||
&adopted.layer_desc().layer_name(),
|
||||
adopted.metadata().generation,
|
||||
);
|
||||
@@ -1494,7 +1494,7 @@ impl RemoteTimelineClient {
|
||||
let target_remote_path = remote_layer_path(
|
||||
&self.tenant_shard_id.tenant_id,
|
||||
&self.timeline_id,
|
||||
self.tenant_shard_id.to_index(),
|
||||
adopted_as.metadata().shard,
|
||||
&adopted_as.layer_desc().layer_name(),
|
||||
adopted_as.metadata().generation,
|
||||
);
|
||||
@@ -2201,6 +2201,18 @@ impl RemoteTimelineClient {
|
||||
inner.initialized_mut()?;
|
||||
Ok(UploadQueueAccessor { inner })
|
||||
}
|
||||
|
||||
pub(crate) fn no_pending_work(&self) -> bool {
|
||||
let inner = self.upload_queue.lock().unwrap();
|
||||
match &*inner {
|
||||
UploadQueue::Uninitialized
|
||||
| UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => true,
|
||||
UploadQueue::Stopped(UploadQueueStopped::Deletable(x)) => {
|
||||
x.upload_queue_for_deletion.no_pending_work()
|
||||
}
|
||||
UploadQueue::Initialized(x) => x.no_pending_work(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct UploadQueueAccessor<'a> {
|
||||
|
||||
@@ -12,7 +12,7 @@ pub mod merge_iterator;
|
||||
|
||||
use crate::context::{AccessStatsBehavior, RequestContext};
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::key::{Key, NON_INHERITED_SPARSE_RANGE};
|
||||
use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::value::Value;
|
||||
@@ -196,6 +196,9 @@ impl ValuesReconstructState {
|
||||
/// Returns true if this was the last value needed for the key and false otherwise.
|
||||
///
|
||||
/// If the key is done after the update, mark it as such.
|
||||
///
|
||||
/// If the key is in the sparse keyspace (i.e., aux files), we do not track them in
|
||||
/// `key_done`.
|
||||
pub(crate) fn update_key(
|
||||
&mut self,
|
||||
key: &Key,
|
||||
@@ -206,10 +209,18 @@ impl ValuesReconstructState {
|
||||
.keys
|
||||
.entry(*key)
|
||||
.or_insert(Ok(VectoredValueReconstructState::default()));
|
||||
|
||||
let is_sparse_key = NON_INHERITED_SPARSE_RANGE.contains(key);
|
||||
if let Ok(state) = state {
|
||||
let key_done = match state.situation {
|
||||
ValueReconstructSituation::Complete => unreachable!(),
|
||||
ValueReconstructSituation::Complete => {
|
||||
if is_sparse_key {
|
||||
// Sparse keyspace might be visited multiple times because
|
||||
// we don't track unmapped keyspaces.
|
||||
return ValueReconstructSituation::Complete;
|
||||
} else {
|
||||
unreachable!()
|
||||
}
|
||||
}
|
||||
ValueReconstructSituation::Continue => match value {
|
||||
Value::Image(img) => {
|
||||
state.img = Some((lsn, img));
|
||||
@@ -234,7 +245,9 @@ impl ValuesReconstructState {
|
||||
|
||||
if key_done && state.situation == ValueReconstructSituation::Continue {
|
||||
state.situation = ValueReconstructSituation::Complete;
|
||||
self.keys_done.add_key(*key);
|
||||
if !is_sparse_key {
|
||||
self.keys_done.add_key(*key);
|
||||
}
|
||||
}
|
||||
|
||||
state.situation
|
||||
|
||||
@@ -67,6 +67,8 @@ pub struct InMemoryLayer {
|
||||
/// The above fields never change, except for `end_lsn`, which is only set once.
|
||||
/// All other changing parts are in `inner`, and protected by a mutex.
|
||||
inner: RwLock<InMemoryLayerInner>,
|
||||
|
||||
estimated_in_mem_size: AtomicU64,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for InMemoryLayer {
|
||||
@@ -543,6 +545,10 @@ impl InMemoryLayer {
|
||||
Ok(inner.file.len())
|
||||
}
|
||||
|
||||
pub fn estimated_in_mem_size(&self) -> u64 {
|
||||
self.estimated_in_mem_size.load(AtomicOrdering::Relaxed)
|
||||
}
|
||||
|
||||
/// Create a new, empty, in-memory layer
|
||||
pub async fn create(
|
||||
conf: &'static PageServerConf,
|
||||
@@ -572,6 +578,7 @@ impl InMemoryLayer {
|
||||
file,
|
||||
resource_units: GlobalResourceUnits::new(),
|
||||
}),
|
||||
estimated_in_mem_size: AtomicU64::new(0),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -642,6 +649,12 @@ impl InMemoryLayer {
|
||||
// because this case is unexpected, and we would like tests to fail if this happens.
|
||||
warn!("Key {} at {} written twice at same LSN", key, lsn);
|
||||
}
|
||||
self.estimated_in_mem_size.fetch_add(
|
||||
(std::mem::size_of::<CompactKey>()
|
||||
+ std::mem::size_of::<Lsn>()
|
||||
+ std::mem::size_of::<IndexEntry>()) as u64,
|
||||
AtomicOrdering::Relaxed,
|
||||
);
|
||||
}
|
||||
|
||||
inner.resource_units.maybe_publish_size(new_size);
|
||||
|
||||
@@ -23,6 +23,7 @@ use handle::ShardTimelineId;
|
||||
use offload::OffloadError;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::{
|
||||
config::tenant_conf_defaults::DEFAULT_COMPACTION_THRESHOLD,
|
||||
key::{
|
||||
KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
|
||||
NON_INHERITED_SPARSE_RANGE,
|
||||
@@ -852,6 +853,10 @@ pub(crate) enum ShutdownMode {
|
||||
/// While we are flushing, we continue to accept read I/O for LSNs ingested before
|
||||
/// the call to [`Timeline::shutdown`].
|
||||
FreezeAndFlush,
|
||||
/// Only flush the layers to the remote storage without freezing any open layers. This is the
|
||||
/// mode used by ancestor detach and any other operations that reloads a tenant but not increasing
|
||||
/// the generation number.
|
||||
Flush,
|
||||
/// Shut down immediately, without waiting for any open layers to flush.
|
||||
Hard,
|
||||
}
|
||||
@@ -1565,12 +1570,16 @@ impl Timeline {
|
||||
///
|
||||
/// This is neccessary but not sufficient for offloading of the timeline as it might have
|
||||
/// child timelines that are not offloaded yet.
|
||||
pub(crate) fn can_offload(&self) -> bool {
|
||||
pub(crate) fn can_offload(&self) -> (bool, &'static str) {
|
||||
if self.remote_client.is_archived() != Some(true) {
|
||||
return false;
|
||||
return (false, "the timeline is not archived");
|
||||
}
|
||||
if !self.remote_client.no_pending_work() {
|
||||
// if the remote client is still processing some work, we can't offload
|
||||
return (false, "the upload queue is not drained yet");
|
||||
}
|
||||
|
||||
true
|
||||
(true, "ok")
|
||||
}
|
||||
|
||||
/// Outermost timeline compaction operation; downloads needed layers. Returns whether we have pending
|
||||
@@ -1678,11 +1687,6 @@ impl Timeline {
|
||||
pub(crate) async fn shutdown(&self, mode: ShutdownMode) {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
let try_freeze_and_flush = match mode {
|
||||
ShutdownMode::FreezeAndFlush => true,
|
||||
ShutdownMode::Hard => false,
|
||||
};
|
||||
|
||||
// Regardless of whether we're going to try_freeze_and_flush
|
||||
// or not, stop ingesting any more data. Walreceiver only provides
|
||||
// cancellation but no "wait until gone", because it uses the Timeline::gate.
|
||||
@@ -1704,7 +1708,7 @@ impl Timeline {
|
||||
// ... and inform any waiters for newer LSNs that there won't be any.
|
||||
self.last_record_lsn.shutdown();
|
||||
|
||||
if try_freeze_and_flush {
|
||||
if let ShutdownMode::FreezeAndFlush = mode {
|
||||
if let Some((open, frozen)) = self
|
||||
.layers
|
||||
.read()
|
||||
@@ -1746,6 +1750,20 @@ impl Timeline {
|
||||
warn!("failed to freeze and flush: {e:#}");
|
||||
}
|
||||
}
|
||||
|
||||
// `self.remote_client.shutdown().await` above should have already flushed everything from the queue, but
|
||||
// we also do a final check here to ensure that the queue is empty.
|
||||
if !self.remote_client.no_pending_work() {
|
||||
warn!("still have pending work in remote upload queue, but continuing shutting down anyways");
|
||||
}
|
||||
}
|
||||
|
||||
if let ShutdownMode::Flush = mode {
|
||||
// drain the upload queue
|
||||
self.remote_client.shutdown().await;
|
||||
if !self.remote_client.no_pending_work() {
|
||||
warn!("still have pending work in remote upload queue, but continuing shutting down anyways");
|
||||
}
|
||||
}
|
||||
|
||||
// Signal any subscribers to our cancellation token to drop out
|
||||
@@ -3488,18 +3506,37 @@ impl Timeline {
|
||||
|
||||
let timer = self.metrics.flush_time_histo.start_timer();
|
||||
|
||||
let num_frozen_layers;
|
||||
let frozen_layer_total_size;
|
||||
let layer_to_flush = {
|
||||
let guard = self.layers.read().await;
|
||||
let Ok(lm) = guard.layer_map() else {
|
||||
info!("dropping out of flush loop for timeline shutdown");
|
||||
return;
|
||||
};
|
||||
num_frozen_layers = lm.frozen_layers.len();
|
||||
frozen_layer_total_size = lm
|
||||
.frozen_layers
|
||||
.iter()
|
||||
.map(|l| l.estimated_in_mem_size())
|
||||
.sum::<u64>();
|
||||
lm.frozen_layers.front().cloned()
|
||||
// drop 'layers' lock to allow concurrent reads and writes
|
||||
};
|
||||
let Some(layer_to_flush) = layer_to_flush else {
|
||||
break Ok(());
|
||||
};
|
||||
if num_frozen_layers
|
||||
> std::cmp::max(
|
||||
self.get_compaction_threshold(),
|
||||
DEFAULT_COMPACTION_THRESHOLD,
|
||||
)
|
||||
&& frozen_layer_total_size >= /* 64 MB */ 64000000
|
||||
{
|
||||
tracing::warn!(
|
||||
"too many frozen layers: {num_frozen_layers} layers with estimated in-mem size of {frozen_layer_total_size} bytes",
|
||||
);
|
||||
}
|
||||
match self.flush_frozen_layer(layer_to_flush, ctx).await {
|
||||
Ok(this_layer_to_lsn) => {
|
||||
flushed_to_lsn = std::cmp::max(flushed_to_lsn, this_layer_to_lsn);
|
||||
@@ -4090,6 +4127,7 @@ impl Timeline {
|
||||
) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
|
||||
// Metadata keys image layer creation.
|
||||
let mut reconstruct_state = ValuesReconstructState::default();
|
||||
let begin = Instant::now();
|
||||
let data = self
|
||||
.get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx)
|
||||
.await?;
|
||||
@@ -4106,14 +4144,11 @@ impl Timeline {
|
||||
(new_data, total_kb_retrieved / 1024, total_keys_retrieved)
|
||||
};
|
||||
let delta_files_accessed = reconstruct_state.get_delta_layers_visited();
|
||||
let elapsed = begin.elapsed();
|
||||
|
||||
let trigger_generation = delta_files_accessed as usize >= MAX_AUX_FILE_V2_DELTAS;
|
||||
debug!(
|
||||
trigger_generation,
|
||||
delta_files_accessed,
|
||||
total_kb_retrieved,
|
||||
total_keys_retrieved,
|
||||
"generate metadata images"
|
||||
info!(
|
||||
"metadata key compaction: trigger_generation={trigger_generation}, delta_files_accessed={delta_files_accessed}, total_kb_retrieved={total_kb_retrieved}, total_keys_retrieved={total_keys_retrieved}, read_time={}s", elapsed.as_secs_f64()
|
||||
);
|
||||
|
||||
if !trigger_generation && mode == ImageLayerCreationMode::Try {
|
||||
|
||||
@@ -12,7 +12,7 @@ use crate::{
|
||||
virtual_file::{MaybeFatalIo, VirtualFile},
|
||||
};
|
||||
use anyhow::Context;
|
||||
use pageserver_api::models::detach_ancestor::AncestorDetached;
|
||||
use pageserver_api::{models::detach_ancestor::AncestorDetached, shard::ShardIdentity};
|
||||
use tokio::sync::Semaphore;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::Instrument;
|
||||
@@ -376,8 +376,14 @@ pub(super) async fn prepare(
|
||||
tasks.spawn(
|
||||
async move {
|
||||
let _permit = limiter.acquire().await;
|
||||
let owned =
|
||||
remote_copy(&adopted, &timeline, timeline.generation, &timeline.cancel).await?;
|
||||
let owned = remote_copy(
|
||||
&adopted,
|
||||
&timeline,
|
||||
timeline.generation,
|
||||
timeline.shard_identity,
|
||||
&timeline.cancel,
|
||||
)
|
||||
.await?;
|
||||
tracing::info!(layer=%owned, "remote copied");
|
||||
Ok(owned)
|
||||
}
|
||||
@@ -629,6 +635,7 @@ async fn remote_copy(
|
||||
adopted: &Layer,
|
||||
adoptee: &Arc<Timeline>,
|
||||
generation: Generation,
|
||||
shard_identity: ShardIdentity,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Layer, Error> {
|
||||
// depending if Layer::keep_resident we could hardlink
|
||||
@@ -636,6 +643,7 @@ async fn remote_copy(
|
||||
let mut metadata = adopted.metadata();
|
||||
debug_assert!(metadata.generation <= generation);
|
||||
metadata.generation = generation;
|
||||
metadata.shard = shard_identity.shard_index();
|
||||
|
||||
let owned = crate::tenant::storage_layer::Layer::for_evicted(
|
||||
adoptee.conf,
|
||||
|
||||
@@ -47,21 +47,18 @@ pub(crate) async fn offload_timeline(
|
||||
match is_archived {
|
||||
Some(true) => (),
|
||||
Some(false) => {
|
||||
tracing::warn!(?is_archived, "tried offloading a non-archived timeline");
|
||||
tracing::warn!("tried offloading a non-archived timeline");
|
||||
return Err(OffloadError::NotArchived);
|
||||
}
|
||||
None => {
|
||||
// This is legal: calls to this function can race with the timeline shutting down
|
||||
tracing::info!(
|
||||
?is_archived,
|
||||
"tried offloading a timeline whose remote storage is not initialized"
|
||||
);
|
||||
tracing::info!("tried offloading a timeline whose remote storage is not initialized");
|
||||
return Err(OffloadError::Cancelled);
|
||||
}
|
||||
}
|
||||
|
||||
// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
|
||||
timeline.shutdown(super::ShutdownMode::Hard).await;
|
||||
timeline.shutdown(super::ShutdownMode::Flush).await;
|
||||
|
||||
// TODO extend guard mechanism above with method
|
||||
// to make deletions possible while offloading is in progress
|
||||
|
||||
@@ -36,9 +36,7 @@ use postgres_connection::PgConnectionConfig;
|
||||
use utils::backoff::{
|
||||
exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
};
|
||||
use utils::postgres_client::{
|
||||
wal_stream_connection_config, ConnectionConfigArgs, PAGESERVER_SAFEKEEPER_PROTO_VERSION, POSTGRES_PROTO_VERSION,
|
||||
};
|
||||
use utils::postgres_client::wal_stream_connection_config;
|
||||
use utils::{
|
||||
id::{NodeId, TenantTimelineId},
|
||||
lsn::Lsn,
|
||||
@@ -986,29 +984,15 @@ impl ConnectionManagerState {
|
||||
if info.safekeeper_connstr.is_empty() {
|
||||
return None; // no connection string, ignore sk
|
||||
}
|
||||
|
||||
let shard_identity = self.timeline.get_shard_identity();
|
||||
let connection_conf_args = ConnectionConfigArgs {
|
||||
protocol_version: PAGESERVER_SAFEKEEPER_PROTO_VERSION,
|
||||
ttid: self.id,
|
||||
shard_number: Some(shard_identity.number.0),
|
||||
shard_count: Some(shard_identity.count.0),
|
||||
shard_stripe_size: Some(shard_identity.stripe_size.0),
|
||||
listen_pg_addr_str: info.safekeeper_connstr.as_ref(),
|
||||
auth_token: self.conf.auth_token.as_ref().map(|t| t.as_str()),
|
||||
availability_zone: self.conf.availability_zone.as_deref()
|
||||
};
|
||||
// let connection_conf_args = ConnectionConfigArgs {
|
||||
// protocol_version: POSTGRES_PROTO_VERSION,
|
||||
// ttid: self.id,
|
||||
// shard_number: None,
|
||||
// shard_count: None,
|
||||
// shard_stripe_size: None,
|
||||
// listen_pg_addr_str: info.safekeeper_connstr.as_ref(),
|
||||
// auth_token: self.conf.auth_token.as_ref().map(|t| t.as_str()),
|
||||
// availability_zone: self.conf.availability_zone.as_deref()
|
||||
// };
|
||||
match wal_stream_connection_config(connection_conf_args) {
|
||||
match wal_stream_connection_config(
|
||||
self.id,
|
||||
info.safekeeper_connstr.as_ref(),
|
||||
match &self.conf.auth_token {
|
||||
None => None,
|
||||
Some(x) => Some(x),
|
||||
},
|
||||
self.conf.availability_zone.as_deref(),
|
||||
) {
|
||||
Ok(connstr) => Some((*sk_id, info, connstr)),
|
||||
Err(e) => {
|
||||
error!("Failed to create wal receiver connection string from broker data of safekeeper node {}: {e:#}", sk_id);
|
||||
|
||||
@@ -36,7 +36,7 @@ use crate::{
|
||||
use postgres_backend::is_expected_io_error;
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
use postgres_ffi::waldecoder::WalStreamDecoder;
|
||||
use utils::{bin_ser::BeSer, id::NodeId, lsn::Lsn};
|
||||
use utils::{id::NodeId, lsn::Lsn};
|
||||
use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError};
|
||||
|
||||
/// Status of the connection.
|
||||
@@ -278,7 +278,6 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
// fails (e.g. in walingest), we still want to know latests LSNs from the safekeeper.
|
||||
match &replication_message {
|
||||
ReplicationMessage::XLogData(xlog_data) => {
|
||||
// TODO(vlad) Is this crap needed?
|
||||
connection_status.latest_connection_update = now;
|
||||
connection_status.commit_lsn = Some(Lsn::from(xlog_data.wal_end()));
|
||||
connection_status.streaming_lsn = Some(Lsn::from(
|
||||
@@ -300,24 +299,6 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
}
|
||||
|
||||
let status_update = match replication_message {
|
||||
ReplicationMessage::RawInterpretedWalRecord(raw) => {
|
||||
connection_status.latest_connection_update = now;
|
||||
connection_status.latest_wal_update = now;
|
||||
connection_status.commit_lsn = Some(Lsn::from(raw.wal_end()));
|
||||
|
||||
let interpreted = InterpretedWalRecord::des(raw.data()).unwrap();
|
||||
let end_lsn = interpreted.end_lsn;
|
||||
|
||||
let mut modification = timeline.begin_modification(end_lsn);
|
||||
walingest
|
||||
.ingest_record(interpreted, &mut modification, &ctx)
|
||||
.await
|
||||
.with_context(|| format!("could not ingest record at {}", end_lsn))?;
|
||||
modification.commit(&ctx).await?;
|
||||
|
||||
Some(end_lsn)
|
||||
}
|
||||
|
||||
ReplicationMessage::XLogData(xlog_data) => {
|
||||
// Pass the WAL data to the decoder, and see if we can decode
|
||||
// more records as a result.
|
||||
|
||||
@@ -611,6 +611,17 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun
|
||||
recptr = startptr;
|
||||
nbytes = count;
|
||||
|
||||
/* Try to read directly from WAL buffers first. */
|
||||
#if PG_MAJORVERSION_NUM >= 17
|
||||
{
|
||||
Size rbytes;
|
||||
rbytes = WALReadFromBuffers(p, recptr, nbytes, tli);
|
||||
recptr += rbytes;
|
||||
nbytes -= rbytes;
|
||||
p += rbytes;
|
||||
}
|
||||
#endif
|
||||
|
||||
while (nbytes > 0)
|
||||
{
|
||||
uint32 startoff;
|
||||
|
||||
@@ -1361,29 +1361,35 @@ SendAppendRequests(Safekeeper *sk)
|
||||
if (sk->active_state == SS_ACTIVE_READ_WAL)
|
||||
{
|
||||
char *errmsg;
|
||||
int req_len;
|
||||
|
||||
req = &sk->appendRequest;
|
||||
req_len = req->endLsn - req->beginLsn;
|
||||
|
||||
switch (wp->api.wal_read(sk,
|
||||
&sk->outbuf.data[sk->outbuf.len],
|
||||
req->beginLsn,
|
||||
req->endLsn - req->beginLsn,
|
||||
&errmsg))
|
||||
/* We send zero sized AppenRequests as heartbeats; don't wal_read for these. */
|
||||
if (req_len > 0)
|
||||
{
|
||||
case NEON_WALREAD_SUCCESS:
|
||||
break;
|
||||
case NEON_WALREAD_WOULDBLOCK:
|
||||
return true;
|
||||
case NEON_WALREAD_ERROR:
|
||||
wp_log(WARNING, "WAL reading for node %s:%s failed: %s",
|
||||
sk->host, sk->port, errmsg);
|
||||
ShutdownConnection(sk);
|
||||
return false;
|
||||
default:
|
||||
Assert(false);
|
||||
switch (wp->api.wal_read(sk,
|
||||
&sk->outbuf.data[sk->outbuf.len],
|
||||
req->beginLsn,
|
||||
req_len,
|
||||
&errmsg))
|
||||
{
|
||||
case NEON_WALREAD_SUCCESS:
|
||||
break;
|
||||
case NEON_WALREAD_WOULDBLOCK:
|
||||
return true;
|
||||
case NEON_WALREAD_ERROR:
|
||||
wp_log(WARNING, "WAL reading for node %s:%s failed: %s",
|
||||
sk->host, sk->port, errmsg);
|
||||
ShutdownConnection(sk);
|
||||
return false;
|
||||
default:
|
||||
Assert(false);
|
||||
}
|
||||
}
|
||||
|
||||
sk->outbuf.len += req->endLsn - req->beginLsn;
|
||||
sk->outbuf.len += req_len;
|
||||
|
||||
writeResult = wp->api.conn_async_write(sk, sk->outbuf.data, sk->outbuf.len);
|
||||
|
||||
|
||||
@@ -1489,33 +1489,11 @@ walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count,
|
||||
{
|
||||
NeonWALReadResult res;
|
||||
|
||||
#if PG_MAJORVERSION_NUM >= 17
|
||||
if (!sk->wp->config->syncSafekeepers)
|
||||
{
|
||||
Size rbytes;
|
||||
rbytes = WALReadFromBuffers(buf, startptr, count,
|
||||
walprop_pg_get_timeline_id());
|
||||
|
||||
startptr += rbytes;
|
||||
count -= rbytes;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (count == 0)
|
||||
{
|
||||
res = NEON_WALREAD_SUCCESS;
|
||||
}
|
||||
else
|
||||
{
|
||||
Assert(count > 0);
|
||||
|
||||
/* Now read the remaining WAL from the WAL file */
|
||||
res = NeonWALRead(sk->xlogreader,
|
||||
buf,
|
||||
startptr,
|
||||
count,
|
||||
walprop_pg_get_timeline_id());
|
||||
}
|
||||
res = NeonWALRead(sk->xlogreader,
|
||||
buf,
|
||||
startptr,
|
||||
count,
|
||||
walprop_pg_get_timeline_id());
|
||||
|
||||
if (res == NEON_WALREAD_SUCCESS)
|
||||
{
|
||||
|
||||
@@ -60,7 +60,7 @@ prometheus.workspace = true
|
||||
rand.workspace = true
|
||||
regex.workspace = true
|
||||
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
|
||||
reqwest.workspace = true
|
||||
reqwest = { workspace = true, features = ["rustls-tls-native-roots"] }
|
||||
reqwest-middleware = { workspace = true, features = ["json"] }
|
||||
reqwest-retry.workspace = true
|
||||
reqwest-tracing.workspace = true
|
||||
|
||||
@@ -7,8 +7,11 @@ use arc_swap::ArcSwapOption;
|
||||
use dashmap::DashMap;
|
||||
use jose_jwk::crypto::KeyInfo;
|
||||
use reqwest::{redirect, Client};
|
||||
use reqwest_retry::policies::ExponentialBackoff;
|
||||
use reqwest_retry::RetryTransientMiddleware;
|
||||
use serde::de::Visitor;
|
||||
use serde::{Deserialize, Deserializer};
|
||||
use serde_json::value::RawValue;
|
||||
use signature::Verifier;
|
||||
use thiserror::Error;
|
||||
use tokio::time::Instant;
|
||||
@@ -16,7 +19,7 @@ use tokio::time::Instant;
|
||||
use crate::auth::backend::ComputeCredentialKeys;
|
||||
use crate::context::RequestMonitoring;
|
||||
use crate::control_plane::errors::GetEndpointJwksError;
|
||||
use crate::http::parse_json_body_with_limit;
|
||||
use crate::http::read_body_with_limit;
|
||||
use crate::intern::RoleNameInt;
|
||||
use crate::types::{EndpointId, RoleName};
|
||||
|
||||
@@ -28,6 +31,10 @@ const MAX_RENEW: Duration = Duration::from_secs(3600);
|
||||
const MAX_JWK_BODY_SIZE: usize = 64 * 1024;
|
||||
const JWKS_USER_AGENT: &str = "neon-proxy";
|
||||
|
||||
const JWKS_CONNECT_TIMEOUT: Duration = Duration::from_secs(2);
|
||||
const JWKS_FETCH_TIMEOUT: Duration = Duration::from_secs(5);
|
||||
const JWKS_FETCH_RETRIES: u32 = 3;
|
||||
|
||||
/// How to get the JWT auth rules
|
||||
pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static {
|
||||
fn fetch_auth_rules(
|
||||
@@ -55,7 +62,7 @@ pub(crate) struct AuthRule {
|
||||
}
|
||||
|
||||
pub struct JwkCache {
|
||||
client: reqwest::Client,
|
||||
client: reqwest_middleware::ClientWithMiddleware,
|
||||
|
||||
map: DashMap<(EndpointId, RoleName), Arc<JwkCacheEntryLock>>,
|
||||
}
|
||||
@@ -117,6 +124,14 @@ impl Default for JwkCacheEntryLock {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct JwkSet<'a> {
|
||||
/// we parse into raw-value because not all keys in a JWKS are ones
|
||||
/// we can parse directly, so we parse them lazily.
|
||||
#[serde(borrow)]
|
||||
keys: Vec<&'a RawValue>,
|
||||
}
|
||||
|
||||
impl JwkCacheEntryLock {
|
||||
async fn acquire_permit<'a>(self: &'a Arc<Self>) -> JwkRenewalPermit<'a> {
|
||||
JwkRenewalPermit::acquire_permit(self).await
|
||||
@@ -130,7 +145,7 @@ impl JwkCacheEntryLock {
|
||||
&self,
|
||||
_permit: JwkRenewalPermit<'_>,
|
||||
ctx: &RequestMonitoring,
|
||||
client: &reqwest::Client,
|
||||
client: &reqwest_middleware::ClientWithMiddleware,
|
||||
endpoint: EndpointId,
|
||||
auth_rules: &F,
|
||||
) -> Result<Arc<JwkCacheEntry>, JwtError> {
|
||||
@@ -154,22 +169,73 @@ impl JwkCacheEntryLock {
|
||||
let req = client.get(rule.jwks_url.clone());
|
||||
// TODO(conrad): eventually switch to using reqwest_middleware/`new_client_with_timeout`.
|
||||
// TODO(conrad): We need to filter out URLs that point to local resources. Public internet only.
|
||||
match req.send().await.and_then(|r| r.error_for_status()) {
|
||||
match req.send().await.and_then(|r| {
|
||||
r.error_for_status()
|
||||
.map_err(reqwest_middleware::Error::Reqwest)
|
||||
}) {
|
||||
// todo: should we re-insert JWKs if we want to keep this JWKs URL?
|
||||
// I expect these failures would be quite sparse.
|
||||
Err(e) => tracing::warn!(url=?rule.jwks_url, error=?e, "could not fetch JWKs"),
|
||||
Ok(r) => {
|
||||
let resp: http::Response<reqwest::Body> = r.into();
|
||||
match parse_json_body_with_limit::<jose_jwk::JwkSet>(
|
||||
resp.into_body(),
|
||||
MAX_JWK_BODY_SIZE,
|
||||
)
|
||||
.await
|
||||
|
||||
let bytes = match read_body_with_limit(resp.into_body(), MAX_JWK_BODY_SIZE)
|
||||
.await
|
||||
{
|
||||
Ok(bytes) => bytes,
|
||||
Err(e) => {
|
||||
tracing::warn!(url=?rule.jwks_url, error=?e, "could not decode JWKs");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
match serde_json::from_slice::<JwkSet>(&bytes) {
|
||||
Err(e) => {
|
||||
tracing::warn!(url=?rule.jwks_url, error=?e, "could not decode JWKs");
|
||||
}
|
||||
Ok(jwks) => {
|
||||
// size_of::<&RawValue>() == 16
|
||||
// size_of::<jose_jwk::Jwk>() == 288
|
||||
// better to not pre-allocate this as it might be pretty large - especially if it has many
|
||||
// keys we don't want or need.
|
||||
// trivial 'attack': `{"keys":[` + repeat(`0`).take(30000).join(`,`) + `]}`
|
||||
// this would consume 8MiB just like that!
|
||||
let mut keys = vec![];
|
||||
let mut failed = 0;
|
||||
for key in jwks.keys {
|
||||
match serde_json::from_str::<jose_jwk::Jwk>(key.get()) {
|
||||
Ok(key) => {
|
||||
// if `use` (called `cls` in rust) is specified to be something other than signing,
|
||||
// we can skip storing it.
|
||||
if key
|
||||
.prm
|
||||
.cls
|
||||
.as_ref()
|
||||
.is_some_and(|c| *c != jose_jwk::Class::Signing)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
keys.push(key);
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::debug!(url=?rule.jwks_url, failed=?e, "could not decode JWK");
|
||||
failed += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
keys.shrink_to_fit();
|
||||
|
||||
if failed > 0 {
|
||||
tracing::warn!(url=?rule.jwks_url, failed, "could not decode JWKs");
|
||||
}
|
||||
|
||||
if keys.is_empty() {
|
||||
tracing::warn!(url=?rule.jwks_url, "no valid JWKs found inside the response body");
|
||||
continue;
|
||||
}
|
||||
|
||||
let jwks = jose_jwk::JwkSet { keys };
|
||||
key_sets.insert(
|
||||
rule.id,
|
||||
KeySet {
|
||||
@@ -179,7 +245,7 @@ impl JwkCacheEntryLock {
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -196,7 +262,7 @@ impl JwkCacheEntryLock {
|
||||
async fn get_or_update_jwk_cache<F: FetchAuthRules>(
|
||||
self: &Arc<Self>,
|
||||
ctx: &RequestMonitoring,
|
||||
client: &reqwest::Client,
|
||||
client: &reqwest_middleware::ClientWithMiddleware,
|
||||
endpoint: EndpointId,
|
||||
fetch: &F,
|
||||
) -> Result<Arc<JwkCacheEntry>, JwtError> {
|
||||
@@ -250,7 +316,7 @@ impl JwkCacheEntryLock {
|
||||
self: &Arc<Self>,
|
||||
ctx: &RequestMonitoring,
|
||||
jwt: &str,
|
||||
client: &reqwest::Client,
|
||||
client: &reqwest_middleware::ClientWithMiddleware,
|
||||
endpoint: EndpointId,
|
||||
role_name: &RoleName,
|
||||
fetch: &F,
|
||||
@@ -369,8 +435,19 @@ impl Default for JwkCache {
|
||||
let client = Client::builder()
|
||||
.user_agent(JWKS_USER_AGENT)
|
||||
.redirect(redirect::Policy::none())
|
||||
.tls_built_in_native_certs(true)
|
||||
.connect_timeout(JWKS_CONNECT_TIMEOUT)
|
||||
.timeout(JWKS_FETCH_TIMEOUT)
|
||||
.build()
|
||||
.expect("using &str and standard redirect::Policy");
|
||||
.expect("client config should be valid");
|
||||
|
||||
// Retry up to 3 times with increasing intervals between attempts.
|
||||
let retry_policy = ExponentialBackoff::builder().build_with_max_retries(JWKS_FETCH_RETRIES);
|
||||
|
||||
let client = reqwest_middleware::ClientBuilder::new(client)
|
||||
.with(RetryTransientMiddleware::new_with_policy(retry_policy))
|
||||
.build();
|
||||
|
||||
JwkCache {
|
||||
client,
|
||||
map: DashMap::default(),
|
||||
@@ -1209,4 +1286,63 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn check_jwk_keycloak_regression() {
|
||||
let (rs, valid_jwk) = new_rsa_jwk(RS1, "rs1".into());
|
||||
let valid_jwk = serde_json::to_value(valid_jwk).unwrap();
|
||||
|
||||
// This is valid, but we cannot parse it as we have no support for encryption JWKs, only signature based ones.
|
||||
// This is taken directly from keycloak.
|
||||
let invalid_jwk = serde_json::json! {
|
||||
{
|
||||
"kid": "U-Jc9xRli84eNqRpYQoIPF-GNuRWV3ZvAIhziRW2sbQ",
|
||||
"kty": "RSA",
|
||||
"alg": "RSA-OAEP",
|
||||
"use": "enc",
|
||||
"n": "yypYWsEKmM_wWdcPnSGLSm5ytw1WG7P7EVkKSulcDRlrM6HWj3PR68YS8LySYM2D9Z-79oAdZGKhIfzutqL8rK1vS14zDuPpAM-RWY3JuQfm1O_-1DZM8-07PmVRegP5KPxsKblLf_My8ByH6sUOIa1p2rbe2q_b0dSTXYu1t0dW-cGL5VShc400YymvTwpc-5uYNsaVxZajnB7JP1OunOiuCJ48AuVp3PqsLzgoXqlXEB1ZZdch3xT3bxaTtNruGvG4xmLZY68O_T3yrwTCNH2h_jFdGPyXdyZToCMSMK2qSbytlfwfN55pT9Vv42Lz1YmoB7XRjI9aExKPc5AxFw",
|
||||
"e": "AQAB",
|
||||
"x5c": [
|
||||
"MIICmzCCAYMCBgGS41E6azANBgkqhkiG9w0BAQsFADARMQ8wDQYDVQQDDAZtYXN0ZXIwHhcNMjQxMDMxMTYwMTQ0WhcNMzQxMDMxMTYwMzI0WjARMQ8wDQYDVQQDDAZtYXN0ZXIwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDLKlhawQqYz/BZ1w+dIYtKbnK3DVYbs/sRWQpK6VwNGWszodaPc9HrxhLwvJJgzYP1n7v2gB1kYqEh/O62ovysrW9LXjMO4+kAz5FZjcm5B+bU7/7UNkzz7Ts+ZVF6A/ko/GwpuUt/8zLwHIfqxQ4hrWnatt7ar9vR1JNdi7W3R1b5wYvlVKFzjTRjKa9PClz7m5g2xpXFlqOcHsk/U66c6K4InjwC5Wnc+qwvOCheqVcQHVll1yHfFPdvFpO02u4a8bjGYtljrw79PfKvBMI0faH+MV0Y/Jd3JlOgIxIwrapJvK2V/B83nmlP1W/jYvPViagHtdGMj1oTEo9zkDEXAgMBAAEwDQYJKoZIhvcNAQELBQADggEBAECYX59+Q9v6c9sb6Q0/C6IgLWG2nVCgVE1YWwIzz+68WrhlmNCRuPjY94roB+tc2tdHbj+Nh3LMzJk7L1KCQoW1+LPK6A6E8W9ad0YPcuw8csV2pUA3+H56exQMH0fUAPQAU7tXWvnQ7otcpV1XA8afn/NTMTsnxi9mSkor8MLMYQ3aeRyh1+LAchHBthWiltqsSUqXrbJF59u5p0ghquuKcWR3TXsA7klGYBgGU5KAJifr9XT87rN0bOkGvbeWAgKvnQnjZwxdnLqTfp/pRY/PiJJHhgIBYPIA7STGnMPjmJ995i34zhnbnd8WHXJA3LxrIMqLW/l8eIdvtM1w8KI="
|
||||
],
|
||||
"x5t": "QhfzMMnuAfkReTgZ1HtrfyOeeZs",
|
||||
"x5t#S256": "cmHDUdKgLiRCEN28D5FBy9IJLFmR7QWfm77SLhGTCTU"
|
||||
}
|
||||
};
|
||||
|
||||
let jwks = serde_json::json! {{ "keys": [invalid_jwk, valid_jwk ] }};
|
||||
let jwks_addr = jwks_server(move |path| match path {
|
||||
"/" => Some(serde_json::to_vec(&jwks).unwrap()),
|
||||
_ => None,
|
||||
})
|
||||
.await;
|
||||
|
||||
let role_name = RoleName::from("anonymous");
|
||||
let role = RoleNameInt::from(&role_name);
|
||||
|
||||
let rules = vec![AuthRule {
|
||||
id: "foo".to_owned(),
|
||||
jwks_url: format!("http://{jwks_addr}/").parse().unwrap(),
|
||||
audience: None,
|
||||
role_names: vec![role],
|
||||
}];
|
||||
|
||||
let fetch = Fetch(rules);
|
||||
let jwk_cache = JwkCache::default();
|
||||
|
||||
let endpoint = EndpointId::from("ep");
|
||||
|
||||
let token = new_rsa_jwt("rs1".into(), rs);
|
||||
|
||||
jwk_cache
|
||||
.check_jwt(
|
||||
&RequestMonitoring::test(),
|
||||
endpoint.clone(),
|
||||
&role_name,
|
||||
&fetch,
|
||||
&token,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
74
proxy/src/cache/endpoints.rs
vendored
74
proxy/src/cache/endpoints.rs
vendored
@@ -1,13 +1,12 @@
|
||||
use std::convert::Infallible;
|
||||
use std::future::pending;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use dashmap::DashSet;
|
||||
use redis::streams::{StreamReadOptions, StreamReadReply};
|
||||
use redis::{AsyncCommands, FromRedisValue, Value};
|
||||
use serde::Deserialize;
|
||||
use tokio::sync::Mutex;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::info;
|
||||
|
||||
@@ -32,17 +31,17 @@ struct ControlPlaneEvent {
|
||||
|
||||
#[derive(Deserialize, Debug, Clone, PartialEq)]
|
||||
struct EndpointCreated {
|
||||
endpoint_id: String,
|
||||
endpoint_id: EndpointIdInt,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Debug, Clone, PartialEq)]
|
||||
struct BranchCreated {
|
||||
branch_id: String,
|
||||
branch_id: BranchIdInt,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Debug, Clone, PartialEq)]
|
||||
struct ProjectCreated {
|
||||
project_id: String,
|
||||
project_id: ProjectIdInt,
|
||||
}
|
||||
|
||||
impl TryFrom<&Value> for ControlPlaneEvent {
|
||||
@@ -76,53 +75,72 @@ impl EndpointsCache {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool {
|
||||
pub(crate) fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool {
|
||||
if !self.ready.load(Ordering::Acquire) {
|
||||
// the endpoint cache is not yet fully initialised.
|
||||
return true;
|
||||
}
|
||||
let rejected = self.should_reject(endpoint);
|
||||
ctx.set_rejected(rejected);
|
||||
info!(?rejected, "check endpoint is valid, disabled cache");
|
||||
// If cache is disabled, just collect the metrics and return or
|
||||
// If the limiter allows, we don't need to check the cache.
|
||||
if self.config.disable_cache || self.limiter.lock().await.check() {
|
||||
|
||||
if !self.should_reject(endpoint) {
|
||||
ctx.set_rejected(false);
|
||||
return true;
|
||||
}
|
||||
!rejected
|
||||
|
||||
// report that we might want to reject this endpoint
|
||||
ctx.set_rejected(true);
|
||||
|
||||
// If cache is disabled, just collect the metrics and return.
|
||||
if self.config.disable_cache {
|
||||
return true;
|
||||
}
|
||||
|
||||
// If the limiter allows, we can pretend like it's valid
|
||||
// (incase it is, due to redis channel lag).
|
||||
if self.limiter.lock().unwrap().check() {
|
||||
return true;
|
||||
}
|
||||
|
||||
// endpoint not found, and there's too much load.
|
||||
false
|
||||
}
|
||||
|
||||
fn should_reject(&self, endpoint: &EndpointId) -> bool {
|
||||
if endpoint.is_endpoint() {
|
||||
!self.endpoints.contains(&EndpointIdInt::from(endpoint))
|
||||
let Some(endpoint) = EndpointIdInt::get(endpoint) else {
|
||||
// if we haven't interned this endpoint, it's not in the cache.
|
||||
return true;
|
||||
};
|
||||
!self.endpoints.contains(&endpoint)
|
||||
} else if endpoint.is_branch() {
|
||||
!self
|
||||
.branches
|
||||
.contains(&BranchIdInt::from(&endpoint.as_branch()))
|
||||
let Some(branch) = BranchIdInt::get(endpoint) else {
|
||||
// if we haven't interned this branch, it's not in the cache.
|
||||
return true;
|
||||
};
|
||||
!self.branches.contains(&branch)
|
||||
} else {
|
||||
!self
|
||||
.projects
|
||||
.contains(&ProjectIdInt::from(&endpoint.as_project()))
|
||||
let Some(project) = ProjectIdInt::get(endpoint) else {
|
||||
// if we haven't interned this project, it's not in the cache.
|
||||
return true;
|
||||
};
|
||||
!self.projects.contains(&project)
|
||||
}
|
||||
}
|
||||
|
||||
fn insert_event(&self, event: ControlPlaneEvent) {
|
||||
if let Some(endpoint_created) = event.endpoint_created {
|
||||
self.endpoints
|
||||
.insert(EndpointIdInt::from(&endpoint_created.endpoint_id.into()));
|
||||
self.endpoints.insert(endpoint_created.endpoint_id);
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.redis_events_count
|
||||
.inc(RedisEventsCount::EndpointCreated);
|
||||
} else if let Some(branch_created) = event.branch_created {
|
||||
self.branches
|
||||
.insert(BranchIdInt::from(&branch_created.branch_id.into()));
|
||||
self.branches.insert(branch_created.branch_id);
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.redis_events_count
|
||||
.inc(RedisEventsCount::BranchCreated);
|
||||
} else if let Some(project_created) = event.project_created {
|
||||
self.projects
|
||||
.insert(ProjectIdInt::from(&project_created.project_id.into()));
|
||||
self.projects.insert(project_created.project_id);
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.redis_events_count
|
||||
@@ -247,11 +265,13 @@ mod tests {
|
||||
fn test_parse_control_plane_event() {
|
||||
let s = r#"{"branch_created":null,"endpoint_created":{"endpoint_id":"ep-rapid-thunder-w0qqw2q9"},"project_created":null,"type":"endpoint_created"}"#;
|
||||
|
||||
let endpoint_id: EndpointId = "ep-rapid-thunder-w0qqw2q9".into();
|
||||
|
||||
assert_eq!(
|
||||
serde_json::from_str::<ControlPlaneEvent>(s).unwrap(),
|
||||
ControlPlaneEvent {
|
||||
endpoint_created: Some(EndpointCreated {
|
||||
endpoint_id: "ep-rapid-thunder-w0qqw2q9".into()
|
||||
endpoint_id: endpoint_id.into(),
|
||||
}),
|
||||
branch_created: None,
|
||||
project_created: None,
|
||||
|
||||
@@ -316,7 +316,6 @@ impl ConnCfg {
|
||||
let client_config = client_config.with_no_client_auth();
|
||||
|
||||
let mut mk_tls = tokio_postgres_rustls::MakeRustlsConnect::new(client_config);
|
||||
// TODO(vlad): que?
|
||||
let tls = <MakeRustlsConnect as MakeTlsConnect<tokio::net::TcpStream>>::make_tls_connect(
|
||||
&mut mk_tls,
|
||||
host,
|
||||
|
||||
@@ -72,7 +72,6 @@ impl NeonControlPlaneClient {
|
||||
.caches
|
||||
.endpoints_cache
|
||||
.is_valid(ctx, &user_info.endpoint.normalize())
|
||||
.await
|
||||
{
|
||||
info!("endpoint is not valid, skipping the request");
|
||||
return Ok(AuthInfo::default());
|
||||
@@ -145,7 +144,6 @@ impl NeonControlPlaneClient {
|
||||
.caches
|
||||
.endpoints_cache
|
||||
.is_valid(ctx, &endpoint.normalize())
|
||||
.await
|
||||
{
|
||||
return Err(GetEndpointJwksError::EndpointNotFound);
|
||||
}
|
||||
|
||||
@@ -6,7 +6,6 @@ pub mod health_server;
|
||||
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::bail;
|
||||
use bytes::Bytes;
|
||||
use http::Method;
|
||||
use http_body_util::BodyExt;
|
||||
@@ -16,7 +15,7 @@ use reqwest_middleware::RequestBuilder;
|
||||
pub(crate) use reqwest_middleware::{ClientWithMiddleware, Error};
|
||||
pub(crate) use reqwest_retry::policies::ExponentialBackoff;
|
||||
pub(crate) use reqwest_retry::RetryTransientMiddleware;
|
||||
use serde::de::DeserializeOwned;
|
||||
use thiserror::Error;
|
||||
|
||||
use crate::metrics::{ConsoleRequest, Metrics};
|
||||
use crate::url::ApiUrl;
|
||||
@@ -122,10 +121,19 @@ impl Endpoint {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn parse_json_body_with_limit<D: DeserializeOwned>(
|
||||
#[derive(Error, Debug)]
|
||||
pub(crate) enum ReadBodyError {
|
||||
#[error("Content length exceeds limit of {limit} bytes")]
|
||||
BodyTooLarge { limit: usize },
|
||||
|
||||
#[error(transparent)]
|
||||
Read(#[from] reqwest::Error),
|
||||
}
|
||||
|
||||
pub(crate) async fn read_body_with_limit(
|
||||
mut b: impl Body<Data = Bytes, Error = reqwest::Error> + Unpin,
|
||||
limit: usize,
|
||||
) -> anyhow::Result<D> {
|
||||
) -> Result<Vec<u8>, ReadBodyError> {
|
||||
// We could use `b.limited().collect().await.to_bytes()` here
|
||||
// but this ends up being slightly more efficient as far as I can tell.
|
||||
|
||||
@@ -133,20 +141,20 @@ pub(crate) async fn parse_json_body_with_limit<D: DeserializeOwned>(
|
||||
// in reqwest, this value is influenced by the Content-Length header.
|
||||
let lower_bound = match usize::try_from(b.size_hint().lower()) {
|
||||
Ok(bound) if bound <= limit => bound,
|
||||
_ => bail!("Content length exceeds limit of {limit} bytes"),
|
||||
_ => return Err(ReadBodyError::BodyTooLarge { limit }),
|
||||
};
|
||||
let mut bytes = Vec::with_capacity(lower_bound);
|
||||
|
||||
while let Some(frame) = b.frame().await.transpose()? {
|
||||
if let Ok(data) = frame.into_data() {
|
||||
if bytes.len() + data.len() > limit {
|
||||
bail!("Content length exceeds limit of {limit} bytes")
|
||||
return Err(ReadBodyError::BodyTooLarge { limit });
|
||||
}
|
||||
bytes.extend_from_slice(&data);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(serde_json::from_slice::<D>(&bytes)?)
|
||||
Ok(bytes)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -1,12 +1,6 @@
|
||||
// rustc lints/lint groups
|
||||
// https://doc.rust-lang.org/rustc/lints/groups.html
|
||||
#![deny(
|
||||
deprecated,
|
||||
future_incompatible,
|
||||
let_underscore,
|
||||
nonstandard_style,
|
||||
rust_2024_compatibility
|
||||
)]
|
||||
#![deny(deprecated, future_incompatible, let_underscore, nonstandard_style)]
|
||||
#![warn(clippy::all, clippy::pedantic, clippy::cargo)]
|
||||
// List of denied lints from the clippy::restriction group.
|
||||
// https://rust-lang.github.io/rust-clippy/master/index.html#?groups=restriction
|
||||
|
||||
@@ -16,8 +16,7 @@ use super::http_conn_pool::ClientDataHttp;
|
||||
use super::local_conn_pool::ClientDataLocal;
|
||||
use crate::auth::backend::ComputeUserInfo;
|
||||
use crate::context::RequestMonitoring;
|
||||
use crate::control_plane::messages::ColdStartInfo;
|
||||
use crate::control_plane::messages::MetricsAuxInfo;
|
||||
use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
|
||||
use crate::metrics::{HttpEndpointPoolsGuard, Metrics};
|
||||
use crate::types::{DbName, EndpointCacheKey, RoleName};
|
||||
use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
|
||||
|
||||
@@ -7,7 +7,6 @@ use hyper::client::conn::http2;
|
||||
use hyper_util::rt::{TokioExecutor, TokioIo};
|
||||
use parking_lot::RwLock;
|
||||
use rand::Rng;
|
||||
use std::result::Result::Ok;
|
||||
use tokio::net::TcpStream;
|
||||
use tracing::{debug, error, info, info_span, Instrument};
|
||||
|
||||
|
||||
@@ -64,24 +64,28 @@ macro_rules! smol_str_wrapper {
|
||||
}
|
||||
|
||||
const POOLER_SUFFIX: &str = "-pooler";
|
||||
pub(crate) const LOCAL_PROXY_SUFFIX: &str = "-local-proxy";
|
||||
|
||||
impl EndpointId {
|
||||
#[must_use]
|
||||
pub fn normalize(&self) -> Self {
|
||||
fn normalize_str(&self) -> &str {
|
||||
if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) {
|
||||
stripped.into()
|
||||
stripped
|
||||
} else if let Some(stripped) = self.as_ref().strip_suffix(LOCAL_PROXY_SUFFIX) {
|
||||
stripped
|
||||
} else {
|
||||
self.clone()
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub fn normalize(&self) -> Self {
|
||||
self.normalize_str().into()
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub fn normalize_intern(&self) -> EndpointIdInt {
|
||||
if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) {
|
||||
EndpointIdTag::get_interner().get_or_intern(stripped)
|
||||
} else {
|
||||
self.into()
|
||||
}
|
||||
EndpointIdTag::get_interner().get_or_intern(self.normalize_str())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -110,13 +114,4 @@ impl EndpointId {
|
||||
pub(crate) fn is_branch(&self) -> bool {
|
||||
self.0.starts_with("br-")
|
||||
}
|
||||
// pub(crate) fn is_project(&self) -> bool {
|
||||
// !self.is_endpoint() && !self.is_branch()
|
||||
// }
|
||||
pub(crate) fn as_branch(&self) -> BranchId {
|
||||
BranchId(self.0.clone())
|
||||
}
|
||||
pub(crate) fn as_project(&self) -> ProjectId {
|
||||
ProjectId(self.0.clone())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -28,7 +28,6 @@ hyper0.workspace = true
|
||||
futures.workspace = true
|
||||
once_cell.workspace = true
|
||||
parking_lot.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
postgres.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
rand.workspace = true
|
||||
@@ -58,13 +57,18 @@ sd-notify.workspace = true
|
||||
storage_broker.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
utils.workspace = true
|
||||
wal_decoder.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
criterion.workspace = true
|
||||
itertools.workspace = true
|
||||
walproposer.workspace = true
|
||||
rand.workspace = true
|
||||
desim.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-subscriber = { workspace = true, features = ["json"] }
|
||||
|
||||
[[bench]]
|
||||
name = "receive_wal"
|
||||
harness = false
|
||||
|
||||
22
safekeeper/benches/README.md
Normal file
22
safekeeper/benches/README.md
Normal file
@@ -0,0 +1,22 @@
|
||||
## Safekeeper Benchmarks
|
||||
|
||||
To run benchmarks:
|
||||
|
||||
```sh
|
||||
# All benchmarks.
|
||||
cargo bench --package safekeeper
|
||||
|
||||
# Specific file.
|
||||
cargo bench --package safekeeper --bench receive_wal
|
||||
|
||||
# Specific benchmark.
|
||||
cargo bench --package safekeeper --bench receive_wal process_msg/fsync=false
|
||||
|
||||
# List available benchmarks.
|
||||
cargo bench --package safekeeper --benches -- --list
|
||||
```
|
||||
|
||||
Additional charts and statistics are available in `target/criterion/report/index.html`.
|
||||
|
||||
Benchmarks are automatically compared against the previous run. To compare against other runs, see
|
||||
`--baseline` and `--save-baseline`.
|
||||
102
safekeeper/benches/benchutils.rs
Normal file
102
safekeeper/benches/benchutils.rs
Normal file
@@ -0,0 +1,102 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use camino_tempfile::Utf8TempDir;
|
||||
use safekeeper::rate_limit::RateLimiter;
|
||||
use safekeeper::safekeeper::{ProposerAcceptorMessage, ProposerElected, SafeKeeper, TermHistory};
|
||||
use safekeeper::state::{TimelinePersistentState, TimelineState};
|
||||
use safekeeper::timeline::{get_timeline_dir, SharedState, StateSK, Timeline};
|
||||
use safekeeper::timelines_set::TimelinesSet;
|
||||
use safekeeper::wal_backup::remote_timeline_path;
|
||||
use safekeeper::{control_file, wal_storage, SafeKeeperConf};
|
||||
use tokio::fs::create_dir_all;
|
||||
use utils::id::{NodeId, TenantTimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
/// A Safekeeper benchmarking environment. Uses a tempdir for storage, removed on drop.
|
||||
pub struct Env {
|
||||
/// Whether to enable fsync.
|
||||
pub fsync: bool,
|
||||
/// Benchmark directory. Deleted when dropped.
|
||||
pub tempdir: Utf8TempDir,
|
||||
}
|
||||
|
||||
impl Env {
|
||||
/// Creates a new benchmarking environment in a temporary directory. fsync controls whether to
|
||||
/// enable fsyncing.
|
||||
pub fn new(fsync: bool) -> anyhow::Result<Self> {
|
||||
let tempdir = camino_tempfile::tempdir()?;
|
||||
Ok(Self { fsync, tempdir })
|
||||
}
|
||||
|
||||
/// Constructs a Safekeeper config for the given node ID.
|
||||
fn make_conf(&self, node_id: NodeId) -> SafeKeeperConf {
|
||||
let mut conf = SafeKeeperConf::dummy();
|
||||
conf.my_id = node_id;
|
||||
conf.no_sync = !self.fsync;
|
||||
conf.workdir = self.tempdir.path().join(format!("safekeeper-{node_id}"));
|
||||
conf
|
||||
}
|
||||
|
||||
/// Constructs a Safekeeper with the given node and tenant/timeline ID.
|
||||
///
|
||||
/// TODO: we should support using in-memory storage, to measure non-IO costs. This would be
|
||||
/// easier if SafeKeeper used trait objects for storage rather than generics. It's also not
|
||||
/// currently possible to construct a timeline using non-file storage since StateSK only accepts
|
||||
/// SafeKeeper<control_file::FileStorage, wal_storage::PhysicalStorage>.
|
||||
pub async fn make_safekeeper(
|
||||
&self,
|
||||
node_id: NodeId,
|
||||
ttid: TenantTimelineId,
|
||||
) -> anyhow::Result<SafeKeeper<control_file::FileStorage, wal_storage::PhysicalStorage>> {
|
||||
let conf = self.make_conf(node_id);
|
||||
|
||||
let timeline_dir = get_timeline_dir(&conf, &ttid);
|
||||
create_dir_all(&timeline_dir).await?;
|
||||
|
||||
let mut pstate = TimelinePersistentState::empty();
|
||||
pstate.tenant_id = ttid.tenant_id;
|
||||
pstate.timeline_id = ttid.timeline_id;
|
||||
|
||||
let wal = wal_storage::PhysicalStorage::new(&ttid, &timeline_dir, &pstate, conf.no_sync)?;
|
||||
let ctrl =
|
||||
control_file::FileStorage::create_new(&timeline_dir, pstate, conf.no_sync).await?;
|
||||
let state = TimelineState::new(ctrl);
|
||||
let mut safekeeper = SafeKeeper::new(state, wal, conf.my_id)?;
|
||||
|
||||
// Emulate an initial election.
|
||||
safekeeper
|
||||
.process_msg(&ProposerAcceptorMessage::Elected(ProposerElected {
|
||||
term: 1,
|
||||
start_streaming_at: Lsn(0),
|
||||
term_history: TermHistory(vec![(1, Lsn(0)).into()]),
|
||||
timeline_start_lsn: Lsn(0),
|
||||
}))
|
||||
.await?;
|
||||
|
||||
Ok(safekeeper)
|
||||
}
|
||||
|
||||
/// Constructs a timeline, including a new Safekeeper with the given node ID, and spawns its
|
||||
/// manager task.
|
||||
pub async fn make_timeline(
|
||||
&self,
|
||||
node_id: NodeId,
|
||||
ttid: TenantTimelineId,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
let conf = self.make_conf(node_id);
|
||||
let timeline_dir = get_timeline_dir(&conf, &ttid);
|
||||
let remote_path = remote_timeline_path(&ttid)?;
|
||||
|
||||
let safekeeper = self.make_safekeeper(node_id, ttid).await?;
|
||||
let shared_state = SharedState::new(StateSK::Loaded(safekeeper));
|
||||
|
||||
let timeline = Timeline::new(ttid, &timeline_dir, &remote_path, shared_state);
|
||||
timeline.bootstrap(
|
||||
&mut timeline.write_shared_state().await,
|
||||
&conf,
|
||||
Arc::new(TimelinesSet::default()), // ignored for now
|
||||
RateLimiter::new(0, 0),
|
||||
);
|
||||
Ok(timeline)
|
||||
}
|
||||
}
|
||||
341
safekeeper/benches/receive_wal.rs
Normal file
341
safekeeper/benches/receive_wal.rs
Normal file
@@ -0,0 +1,341 @@
|
||||
//! WAL ingestion benchmarks.
|
||||
|
||||
#[path = "benchutils.rs"]
|
||||
mod benchutils;
|
||||
|
||||
use std::io::Write as _;
|
||||
|
||||
use benchutils::Env;
|
||||
use camino_tempfile::tempfile;
|
||||
use criterion::{criterion_group, criterion_main, BatchSize, Bencher, Criterion};
|
||||
use itertools::Itertools as _;
|
||||
use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator};
|
||||
use safekeeper::receive_wal::{self, WalAcceptor};
|
||||
use safekeeper::safekeeper::{
|
||||
AcceptorProposerMessage, AppendRequest, AppendRequestHeader, ProposerAcceptorMessage,
|
||||
};
|
||||
use tokio::io::AsyncWriteExt as _;
|
||||
use utils::id::{NodeId, TenantTimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
const KB: usize = 1024;
|
||||
const MB: usize = 1024 * KB;
|
||||
const GB: usize = 1024 * MB;
|
||||
|
||||
// Register benchmarks with Criterion.
|
||||
criterion_group!(
|
||||
benches,
|
||||
bench_process_msg,
|
||||
bench_wal_acceptor,
|
||||
bench_wal_acceptor_throughput,
|
||||
bench_file_write
|
||||
);
|
||||
criterion_main!(benches);
|
||||
|
||||
/// Benchmarks SafeKeeper::process_msg() as time per message and throughput. Each message is an
|
||||
/// AppendRequest with a single WAL record containing an XlLogicalMessage of varying size. When
|
||||
/// measuring throughput, only the logical message payload is considered, excluding
|
||||
/// segment/page/record headers.
|
||||
fn bench_process_msg(c: &mut Criterion) {
|
||||
let mut g = c.benchmark_group("process_msg");
|
||||
for fsync in [false, true] {
|
||||
for commit in [false, true] {
|
||||
for size in [8, KB, 8 * KB, 128 * KB, MB] {
|
||||
// Kind of weird to change the group throughput per benchmark, but it's the only way
|
||||
// to vary it per benchmark. It works.
|
||||
g.throughput(criterion::Throughput::Bytes(size as u64));
|
||||
g.bench_function(format!("fsync={fsync}/commit={commit}/size={size}"), |b| {
|
||||
run_bench(b, size, fsync, commit).unwrap()
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// The actual benchmark. If commit is true, advance the commit LSN on every message.
|
||||
fn run_bench(b: &mut Bencher, size: usize, fsync: bool, commit: bool) -> anyhow::Result<()> {
|
||||
let runtime = tokio::runtime::Builder::new_current_thread() // single is fine, sync IO only
|
||||
.enable_all()
|
||||
.build()?;
|
||||
|
||||
// Construct the payload. The prefix counts towards the payload (including NUL terminator).
|
||||
let prefix = c"p";
|
||||
let prefixlen = prefix.to_bytes_with_nul().len();
|
||||
assert!(size >= prefixlen);
|
||||
let message = vec![0; size - prefixlen];
|
||||
|
||||
let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message));
|
||||
|
||||
// Set up the Safekeeper.
|
||||
let env = Env::new(fsync)?;
|
||||
let mut safekeeper =
|
||||
runtime.block_on(env.make_safekeeper(NodeId(1), TenantTimelineId::generate()))?;
|
||||
|
||||
b.iter_batched_ref(
|
||||
// Pre-construct WAL records and requests. Criterion will batch them.
|
||||
|| {
|
||||
let (lsn, record) = walgen.next().expect("endless WAL");
|
||||
ProposerAcceptorMessage::AppendRequest(AppendRequest {
|
||||
h: AppendRequestHeader {
|
||||
term: 1,
|
||||
term_start_lsn: Lsn(0),
|
||||
begin_lsn: lsn,
|
||||
end_lsn: lsn + record.len() as u64,
|
||||
commit_lsn: if commit { lsn } else { Lsn(0) }, // commit previous record
|
||||
truncate_lsn: Lsn(0),
|
||||
proposer_uuid: [0; 16],
|
||||
},
|
||||
wal_data: record,
|
||||
})
|
||||
},
|
||||
// Benchmark message processing (time per message).
|
||||
|msg| {
|
||||
runtime
|
||||
.block_on(safekeeper.process_msg(msg))
|
||||
.expect("message failed")
|
||||
},
|
||||
BatchSize::SmallInput, // automatically determine a batch size
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Benchmarks WalAcceptor message processing time by sending it a batch of WAL records and waiting
|
||||
/// for it to confirm that the last LSN has been flushed to storage. We pipeline a bunch of messages
|
||||
/// instead of measuring each individual message to amortize costs (e.g. fsync), which is more
|
||||
/// realistic. Records are XlLogicalMessage with a tiny payload (~64 bytes per record including
|
||||
/// headers). Records are pre-constructed to avoid skewing the benchmark.
|
||||
///
|
||||
/// TODO: add benchmarks with in-memory storage, see comment on `Env::make_safekeeper()`:
|
||||
fn bench_wal_acceptor(c: &mut Criterion) {
|
||||
let mut g = c.benchmark_group("wal_acceptor");
|
||||
for fsync in [false, true] {
|
||||
for n in [1, 100, 10000] {
|
||||
g.bench_function(format!("fsync={fsync}/n={n}"), |b| {
|
||||
run_bench(b, n, fsync).unwrap()
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// The actual benchmark. n is the number of WAL records to send in a pipelined batch.
|
||||
fn run_bench(b: &mut Bencher, n: usize, fsync: bool) -> anyhow::Result<()> {
|
||||
let runtime = tokio::runtime::Runtime::new()?; // needs multithreaded
|
||||
|
||||
let env = Env::new(fsync)?;
|
||||
let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(c"prefix", b"message"));
|
||||
|
||||
// Create buffered channels that can fit all requests, to avoid blocking on channels.
|
||||
let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(n);
|
||||
let (reply_tx, mut reply_rx) = tokio::sync::mpsc::channel(n);
|
||||
|
||||
// Spawn the WalAcceptor task.
|
||||
runtime.block_on(async {
|
||||
// TODO: WalAcceptor doesn't actually need a full timeline, only
|
||||
// Safekeeper::process_msg(). Consider decoupling them to simplify the setup.
|
||||
let tli = env
|
||||
.make_timeline(NodeId(1), TenantTimelineId::generate())
|
||||
.await?
|
||||
.wal_residence_guard()
|
||||
.await?;
|
||||
WalAcceptor::spawn(tli, msg_rx, reply_tx, Some(0));
|
||||
anyhow::Ok(())
|
||||
})?;
|
||||
|
||||
b.iter_batched(
|
||||
// Pre-construct a batch of WAL records and requests.
|
||||
|| {
|
||||
walgen
|
||||
.take(n)
|
||||
.map(|(lsn, record)| AppendRequest {
|
||||
h: AppendRequestHeader {
|
||||
term: 1,
|
||||
term_start_lsn: Lsn(0),
|
||||
begin_lsn: lsn,
|
||||
end_lsn: lsn + record.len() as u64,
|
||||
commit_lsn: Lsn(0),
|
||||
truncate_lsn: Lsn(0),
|
||||
proposer_uuid: [0; 16],
|
||||
},
|
||||
wal_data: record,
|
||||
})
|
||||
.collect_vec()
|
||||
},
|
||||
// Benchmark batch ingestion (time per batch).
|
||||
|reqs| {
|
||||
runtime.block_on(async {
|
||||
let final_lsn = reqs.last().unwrap().h.end_lsn;
|
||||
// Stuff all the messages into the buffered channel to pipeline them.
|
||||
for req in reqs {
|
||||
let msg = ProposerAcceptorMessage::AppendRequest(req);
|
||||
msg_tx.send(msg).await.expect("send failed");
|
||||
}
|
||||
// Wait for the last message to get flushed.
|
||||
while let Some(reply) = reply_rx.recv().await {
|
||||
if let AcceptorProposerMessage::AppendResponse(resp) = reply {
|
||||
if resp.flush_lsn >= final_lsn {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
panic!("disconnected")
|
||||
})
|
||||
},
|
||||
BatchSize::PerIteration, // only run one request batch at a time
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Benchmarks WalAcceptor throughput by sending 1 GB of data with varying message sizes and waiting
|
||||
/// for the last LSN to be flushed to storage. Only the actual message payload counts towards
|
||||
/// throughput, headers are excluded and considered overhead. Records are XlLogicalMessage.
|
||||
///
|
||||
/// To avoid running out of memory, messages are constructed during the benchmark.
|
||||
fn bench_wal_acceptor_throughput(c: &mut Criterion) {
|
||||
const VOLUME: usize = GB; // NB: excludes message/page/segment headers and padding
|
||||
|
||||
let mut g = c.benchmark_group("wal_acceptor_throughput");
|
||||
g.sample_size(10);
|
||||
g.throughput(criterion::Throughput::Bytes(VOLUME as u64));
|
||||
|
||||
for fsync in [false, true] {
|
||||
for commit in [false, true] {
|
||||
for size in [KB, 8 * KB, 128 * KB, MB] {
|
||||
assert_eq!(VOLUME % size, 0, "volume must be divisible by size");
|
||||
let count = VOLUME / size;
|
||||
g.bench_function(format!("fsync={fsync}/commit={commit}/size={size}"), |b| {
|
||||
run_bench(b, count, size, fsync, commit).unwrap()
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The actual benchmark. size is the payload size per message, count is the number of messages.
|
||||
/// If commit is true, advance the commit LSN on each message.
|
||||
fn run_bench(
|
||||
b: &mut Bencher,
|
||||
count: usize,
|
||||
size: usize,
|
||||
fsync: bool,
|
||||
commit: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let runtime = tokio::runtime::Runtime::new()?; // needs multithreaded
|
||||
|
||||
// Construct the payload. The prefix counts towards the payload (including NUL terminator).
|
||||
let prefix = c"p";
|
||||
let prefixlen = prefix.to_bytes_with_nul().len();
|
||||
assert!(size >= prefixlen);
|
||||
let message = vec![0; size - prefixlen];
|
||||
|
||||
let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message));
|
||||
|
||||
// Construct and spawn the WalAcceptor task.
|
||||
let env = Env::new(fsync)?;
|
||||
|
||||
let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(receive_wal::MSG_QUEUE_SIZE);
|
||||
let (reply_tx, mut reply_rx) = tokio::sync::mpsc::channel(receive_wal::REPLY_QUEUE_SIZE);
|
||||
|
||||
runtime.block_on(async {
|
||||
let tli = env
|
||||
.make_timeline(NodeId(1), TenantTimelineId::generate())
|
||||
.await?
|
||||
.wal_residence_guard()
|
||||
.await?;
|
||||
WalAcceptor::spawn(tli, msg_rx, reply_tx, Some(0));
|
||||
anyhow::Ok(())
|
||||
})?;
|
||||
|
||||
// Ingest the WAL.
|
||||
b.iter(|| {
|
||||
runtime.block_on(async {
|
||||
let reqgen = walgen.take(count).map(|(lsn, record)| AppendRequest {
|
||||
h: AppendRequestHeader {
|
||||
term: 1,
|
||||
term_start_lsn: Lsn(0),
|
||||
begin_lsn: lsn,
|
||||
end_lsn: lsn + record.len() as u64,
|
||||
commit_lsn: if commit { lsn } else { Lsn(0) }, // commit previous record
|
||||
truncate_lsn: Lsn(0),
|
||||
proposer_uuid: [0; 16],
|
||||
},
|
||||
wal_data: record,
|
||||
});
|
||||
|
||||
// Send requests.
|
||||
for req in reqgen {
|
||||
_ = reply_rx.try_recv(); // discard any replies, to avoid blocking
|
||||
let msg = ProposerAcceptorMessage::AppendRequest(req);
|
||||
msg_tx.send(msg).await.expect("send failed");
|
||||
}
|
||||
|
||||
// Wait for last message to get flushed.
|
||||
while let Some(reply) = reply_rx.recv().await {
|
||||
if let AcceptorProposerMessage::AppendResponse(resp) = reply {
|
||||
if resp.flush_lsn >= walgen.lsn {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
panic!("disconnected")
|
||||
})
|
||||
});
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Benchmarks OS write throughput by appending blocks of a given size to a file. This is intended
|
||||
/// to compare Tokio and stdlib writes, and give a baseline for optimal WAL throughput.
|
||||
fn bench_file_write(c: &mut Criterion) {
|
||||
let mut g = c.benchmark_group("file_write");
|
||||
|
||||
for kind in ["stdlib", "tokio"] {
|
||||
for fsync in [false, true] {
|
||||
for size in [8, KB, 8 * KB, 128 * KB, MB] {
|
||||
// Kind of weird to change the group throughput per benchmark, but it's the only way to
|
||||
// vary it per benchmark. It works.
|
||||
g.throughput(criterion::Throughput::Bytes(size as u64));
|
||||
g.bench_function(
|
||||
format!("{kind}/fsync={fsync}/size={size}"),
|
||||
|b| match kind {
|
||||
"stdlib" => run_bench_stdlib(b, size, fsync).unwrap(),
|
||||
"tokio" => run_bench_tokio(b, size, fsync).unwrap(),
|
||||
name => panic!("unknown kind {name}"),
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn run_bench_stdlib(b: &mut Bencher, size: usize, fsync: bool) -> anyhow::Result<()> {
|
||||
let mut file = tempfile()?;
|
||||
let buf = vec![0u8; size];
|
||||
|
||||
b.iter(|| {
|
||||
file.write_all(&buf).unwrap();
|
||||
file.flush().unwrap();
|
||||
if fsync {
|
||||
file.sync_data().unwrap();
|
||||
}
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn run_bench_tokio(b: &mut Bencher, size: usize, fsync: bool) -> anyhow::Result<()> {
|
||||
let runtime = tokio::runtime::Runtime::new()?; // needs multithreaded
|
||||
|
||||
let mut file = tokio::fs::File::from_std(tempfile()?);
|
||||
let buf = vec![0u8; size];
|
||||
|
||||
b.iter(|| {
|
||||
runtime.block_on(async {
|
||||
file.write_all(&buf).await.unwrap();
|
||||
file.flush().await.unwrap();
|
||||
if fsync {
|
||||
file.sync_data().await.unwrap();
|
||||
}
|
||||
})
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -2,14 +2,11 @@
|
||||
//! protocol commands.
|
||||
|
||||
use anyhow::Context;
|
||||
use pageserver_api::shard::{ShardIdentity, ShardStripeSize};
|
||||
use std::future::Future;
|
||||
use std::str::{self, FromStr};
|
||||
use std::sync::Arc;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::{debug, info, info_span, Instrument};
|
||||
use utils::postgres_client::PAGESERVER_SAFEKEEPER_PROTO_VERSION;
|
||||
use utils::shard::{ShardCount, ShardNumber};
|
||||
|
||||
use crate::auth::check_permission;
|
||||
use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage};
|
||||
@@ -38,8 +35,6 @@ pub struct SafekeeperPostgresHandler {
|
||||
pub tenant_id: Option<TenantId>,
|
||||
pub timeline_id: Option<TimelineId>,
|
||||
pub ttid: TenantTimelineId,
|
||||
pub shard: Option<ShardIdentity>,
|
||||
pub protocol_version: Option<u8>,
|
||||
/// Unique connection id is logged in spans for observability.
|
||||
pub conn_id: ConnectionId,
|
||||
/// Auth scope allowed on the connections and public key used to check auth tokens. None if auth is not configured.
|
||||
@@ -112,21 +107,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
|
||||
) -> Result<(), QueryError> {
|
||||
if let FeStartupPacket::StartupMessage { params, .. } = sm {
|
||||
if let Some(options) = params.options_raw() {
|
||||
let mut shard_count: Option<u8> = None;
|
||||
let mut shard_number: Option<u8> = None;
|
||||
let mut shard_stripe_size: Option<u32> = None;
|
||||
|
||||
for opt in options {
|
||||
// FIXME `ztenantid` and `ztimelineid` left for compatibility during deploy,
|
||||
// remove these after the PR gets deployed:
|
||||
// https://github.com/neondatabase/neon/pull/2433#discussion_r970005064
|
||||
match opt.split_once('=') {
|
||||
Some(("protocol_version", value)) => {
|
||||
self.protocol_version =
|
||||
Some(value.parse::<u8>().with_context(|| {
|
||||
format!("Failed to parse {value} as protocol_version")
|
||||
})?);
|
||||
}
|
||||
Some(("ztenantid", value)) | Some(("tenant_id", value)) => {
|
||||
self.tenant_id = Some(value.parse().with_context(|| {
|
||||
format!("Failed to parse {value} as tenant id")
|
||||
@@ -142,44 +127,9 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
|
||||
metrics.set_client_az(client_az)
|
||||
}
|
||||
}
|
||||
Some(("shard_count", value)) => {
|
||||
shard_count = Some(value.parse::<u8>().with_context(|| {
|
||||
format!("Failed to parse {value} as shard count")
|
||||
})?);
|
||||
}
|
||||
Some(("shard_number", value)) => {
|
||||
shard_number = Some(value.parse::<u8>().with_context(|| {
|
||||
format!("Failed to parse {value} as shard number")
|
||||
})?);
|
||||
}
|
||||
Some(("shard_stripe_size", value)) => {
|
||||
shard_stripe_size = Some(value.parse::<u32>().with_context(|| {
|
||||
format!("Failed to parse {value} as shard stripe size")
|
||||
})?);
|
||||
}
|
||||
_ => continue,
|
||||
}
|
||||
}
|
||||
|
||||
if self.protocol_version == Some(PAGESERVER_SAFEKEEPER_PROTO_VERSION) {
|
||||
match (shard_count, shard_number, shard_stripe_size) {
|
||||
(Some(count), Some(number), Some(stripe_size)) => {
|
||||
self.shard = Some(
|
||||
ShardIdentity::new(
|
||||
ShardNumber(number),
|
||||
ShardCount(count),
|
||||
ShardStripeSize(stripe_size),
|
||||
)
|
||||
.with_context(|| "Failed to create shard identity")?,
|
||||
);
|
||||
}
|
||||
_ => {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"Shard params were not specified"
|
||||
)));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(app_name) = params.get("application_name") {
|
||||
@@ -200,11 +150,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
|
||||
tracing::field::debug(self.appname.clone()),
|
||||
);
|
||||
|
||||
if let Some(shard) = self.shard.as_ref() {
|
||||
tracing::Span::current()
|
||||
.record("shard", tracing::field::display(shard.shard_slug()));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
} else {
|
||||
Err(QueryError::Other(anyhow::anyhow!(
|
||||
@@ -313,8 +258,6 @@ impl SafekeeperPostgresHandler {
|
||||
tenant_id: None,
|
||||
timeline_id: None,
|
||||
ttid: TenantTimelineId::empty(),
|
||||
shard: None,
|
||||
protocol_version: None,
|
||||
conn_id,
|
||||
claims: None,
|
||||
auth,
|
||||
|
||||
@@ -112,9 +112,7 @@ impl SafeKeeperConf {
|
||||
}
|
||||
|
||||
impl SafeKeeperConf {
|
||||
#[cfg(test)]
|
||||
#[allow(unused)]
|
||||
fn dummy() -> Self {
|
||||
pub fn dummy() -> Self {
|
||||
SafeKeeperConf {
|
||||
workdir: Utf8PathBuf::from("./"),
|
||||
no_sync: false,
|
||||
|
||||
@@ -17,7 +17,6 @@ use tokio::{
|
||||
use tokio_postgres::replication::ReplicationStream;
|
||||
use tokio_postgres::types::PgLsn;
|
||||
use tracing::*;
|
||||
use utils::postgres_client::{ConnectionConfigArgs, POSTGRES_PROTO_VERSION};
|
||||
use utils::{id::NodeId, lsn::Lsn, postgres_client::wal_stream_connection_config};
|
||||
|
||||
use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE};
|
||||
@@ -326,17 +325,7 @@ async fn recovery_stream(
|
||||
conf: &SafeKeeperConf,
|
||||
) -> anyhow::Result<String> {
|
||||
// TODO: pass auth token
|
||||
let connection_conf_args = ConnectionConfigArgs {
|
||||
protocol_version: POSTGRES_PROTO_VERSION,
|
||||
ttid: tli.ttid,
|
||||
shard_number: None,
|
||||
shard_count: None,
|
||||
shard_stripe_size: None,
|
||||
listen_pg_addr_str: &donor.pg_connstr,
|
||||
auth_token: None,
|
||||
availability_zone: None,
|
||||
};
|
||||
let cfg = wal_stream_connection_config(connection_conf_args)?;
|
||||
let cfg = wal_stream_connection_config(tli.ttid, &donor.pg_connstr, None, None)?;
|
||||
let mut cfg = cfg.to_tokio_postgres_config();
|
||||
// It will make safekeeper give out not committed WAL (up to flush_lsn).
|
||||
cfg.application_name(&format!("safekeeper_{}", conf.my_id));
|
||||
|
||||
@@ -11,21 +11,17 @@ use crate::wal_storage::WalReader;
|
||||
use crate::GlobalTimelines;
|
||||
use anyhow::{bail, Context as AnyhowContext};
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use parking_lot::Mutex;
|
||||
use postgres_backend::PostgresBackend;
|
||||
use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError};
|
||||
use postgres_ffi::get_current_timestamp;
|
||||
use postgres_ffi::waldecoder::WalStreamDecoder;
|
||||
use postgres_ffi::{TimestampTz, MAX_SEND_SIZE};
|
||||
use pq_proto::{BeMessage, InterpretedWalRecordBody, WalSndKeepAlive, XLogDataBody};
|
||||
use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use utils::failpoint_support;
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::pageserver_feedback::PageserverFeedback;
|
||||
use utils::postgres_client::{PAGESERVER_SAFEKEEPER_PROTO_VERSION, POSTGRES_PROTO_VERSION};
|
||||
use wal_decoder::models::InterpretedWalRecord;
|
||||
|
||||
use std::cmp::{max, min};
|
||||
use std::net::SocketAddr;
|
||||
@@ -381,10 +377,6 @@ impl Drop for WalSenderGuard {
|
||||
}
|
||||
|
||||
impl SafekeeperPostgresHandler {
|
||||
pub fn protocol_version(&self) -> u8 {
|
||||
self.protocol_version.unwrap_or(POSTGRES_PROTO_VERSION)
|
||||
}
|
||||
|
||||
/// Wrapper around handle_start_replication_guts handling result. Error is
|
||||
/// handled here while we're still in walsender ttid span; with API
|
||||
/// extension, this can probably be moved into postgres_backend.
|
||||
@@ -420,7 +412,6 @@ impl SafekeeperPostgresHandler {
|
||||
let appname = self.appname.clone();
|
||||
|
||||
// Use a guard object to remove our entry from the timeline when we are done.
|
||||
// TODO(vlad): maybe thread shard stuff into here
|
||||
let ws_guard = Arc::new(tli.get_walsenders().register(
|
||||
self.ttid,
|
||||
*pgb.get_peer_addr(),
|
||||
@@ -484,10 +475,9 @@ impl SafekeeperPostgresHandler {
|
||||
tli,
|
||||
};
|
||||
|
||||
let protocol_version = self.protocol_version();
|
||||
let res = tokio::select! {
|
||||
// todo: add read|write .context to these errors
|
||||
r = sender.run(protocol_version, self.shard.as_ref()) => r,
|
||||
r = sender.run() => r,
|
||||
r = reply_reader.run() => r,
|
||||
};
|
||||
|
||||
@@ -570,35 +560,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
|
||||
///
|
||||
/// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ?
|
||||
/// convenience.
|
||||
/// TODO(vlad): add a run variant which accumulates a full wall record
|
||||
/// and interprets it.
|
||||
async fn run(
|
||||
&mut self,
|
||||
protocol_version: u8,
|
||||
shard: Option<&ShardIdentity>,
|
||||
) -> Result<(), CopyStreamHandlerEnd> {
|
||||
match protocol_version {
|
||||
POSTGRES_PROTO_VERSION => self.run_wal_sender().await,
|
||||
PAGESERVER_SAFEKEEPER_PROTO_VERSION => {
|
||||
self.run_interpreted_record_sender(shard.unwrap()).await
|
||||
}
|
||||
// TODO: make the proto version an enum
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
async fn run_interpreted_record_sender(
|
||||
&mut self,
|
||||
shard: &ShardIdentity,
|
||||
) -> Result<(), CopyStreamHandlerEnd> {
|
||||
let mut last_logged_at = std::time::Instant::now();
|
||||
let mut interpreted_records = 0;
|
||||
let mut interpreted_bytes = 0;
|
||||
let mut useful_bytes = 0;
|
||||
|
||||
let pg_version = self.tli.tli.get_state().await.1.server.pg_version / 10000;
|
||||
let mut wal_decoder = WalStreamDecoder::new(self.start_pos, pg_version);
|
||||
|
||||
async fn run(&mut self) -> Result<(), CopyStreamHandlerEnd> {
|
||||
loop {
|
||||
// Wait for the next portion if it is not there yet, or just
|
||||
// update our end of WAL available for sending value, we
|
||||
@@ -639,141 +601,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
|
||||
};
|
||||
let send_buf = &send_buf[..send_size];
|
||||
|
||||
wal_decoder.feed_bytes(send_buf);
|
||||
|
||||
// How fast or slow is this. Write a little benchmark
|
||||
// to see how quiclky we can decode 1GiB of WAL.
|
||||
// If this is slow, then we have a problem since it bottlenecks
|
||||
// the whole afair. SK can send about 60-70MiB of raw WAL and
|
||||
// about 13-17MiB of useful interpreted WAL per second (these
|
||||
// number are for one shard).
|
||||
while let Some((record_end_lsn, recdata)) = wal_decoder
|
||||
.poll_decode()
|
||||
.with_context(|| "Failed to decode WAL")?
|
||||
{
|
||||
assert!(record_end_lsn.is_aligned());
|
||||
|
||||
// Deserialize and interpret WAL record
|
||||
let interpreted = InterpretedWalRecord::from_bytes_filtered(
|
||||
recdata,
|
||||
shard,
|
||||
record_end_lsn,
|
||||
pg_version,
|
||||
)
|
||||
.with_context(|| "Failed to interpret WAL")?;
|
||||
|
||||
let useful_size = interpreted.batch.buffer_size();
|
||||
|
||||
let mut buf = Vec::new();
|
||||
interpreted
|
||||
.ser_into(&mut buf)
|
||||
.with_context(|| "Failed to serialize interpreted WAL")?;
|
||||
|
||||
let size = buf.len();
|
||||
|
||||
self.pgb
|
||||
.write_message(&BeMessage::InterpretedWalRecord(InterpretedWalRecordBody {
|
||||
wal_end: self.end_pos.0,
|
||||
data: buf.as_slice(),
|
||||
}))
|
||||
.await?;
|
||||
|
||||
interpreted_records += 1;
|
||||
interpreted_bytes += size;
|
||||
useful_bytes += useful_size;
|
||||
}
|
||||
|
||||
// and send it
|
||||
// self.pgb
|
||||
// .write_message(&BeMessage::XLogData(XLogDataBody {
|
||||
// wal_start: self.start_pos.0,
|
||||
// wal_end: self.end_pos.0,
|
||||
// timestamp: get_current_timestamp(),
|
||||
// data: send_buf,
|
||||
// }))
|
||||
// .await?;
|
||||
|
||||
// if let Some(appname) = &self.appname {
|
||||
// if appname == "replica" {
|
||||
// failpoint_support::sleep_millis_async!("sk-send-wal-replica-sleep");
|
||||
// }
|
||||
// }
|
||||
// trace!(
|
||||
// "sent {} bytes of WAL {}-{}",
|
||||
// send_size,
|
||||
// self.start_pos,
|
||||
// self.start_pos + send_size as u64
|
||||
// );
|
||||
|
||||
self.start_pos += send_size as u64;
|
||||
|
||||
let elapsed = last_logged_at.elapsed();
|
||||
if elapsed >= Duration::from_secs(5) {
|
||||
let records_rate = interpreted_records / elapsed.as_millis() * 1000;
|
||||
let bytes_rate = interpreted_bytes / elapsed.as_millis() as usize * 1000;
|
||||
let useful_bytes_rate = useful_bytes / elapsed.as_millis() as usize * 1000;
|
||||
tracing::info!(
|
||||
"Shard {} sender rate: rps={} bps={} ubps={}",
|
||||
shard.number.0,
|
||||
records_rate,
|
||||
bytes_rate,
|
||||
useful_bytes_rate
|
||||
);
|
||||
|
||||
last_logged_at = std::time::Instant::now();
|
||||
interpreted_records = 0;
|
||||
interpreted_bytes = 0;
|
||||
useful_bytes = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn run_wal_sender(&mut self) -> Result<(), CopyStreamHandlerEnd> {
|
||||
let mut useful_bytes = 0;
|
||||
let mut last_logged_at = std::time::Instant::now();
|
||||
|
||||
loop {
|
||||
// Wait for the next portion if it is not there yet, or just
|
||||
// update our end of WAL available for sending value, we
|
||||
// communicate it to the receiver.
|
||||
self.wait_wal().await?;
|
||||
assert!(
|
||||
self.end_pos > self.start_pos,
|
||||
"nothing to send after waiting for WAL"
|
||||
);
|
||||
|
||||
// try to send as much as available, capped by MAX_SEND_SIZE
|
||||
let mut chunk_end_pos = self.start_pos + MAX_SEND_SIZE as u64;
|
||||
// if we went behind available WAL, back off
|
||||
if chunk_end_pos >= self.end_pos {
|
||||
chunk_end_pos = self.end_pos;
|
||||
} else {
|
||||
// If sending not up to end pos, round down to page boundary to
|
||||
// avoid breaking WAL record not at page boundary, as protocol
|
||||
// demands. See walsender.c (XLogSendPhysical).
|
||||
chunk_end_pos = chunk_end_pos
|
||||
.checked_sub(chunk_end_pos.block_offset())
|
||||
.unwrap();
|
||||
}
|
||||
let send_size = (chunk_end_pos.0 - self.start_pos.0) as usize;
|
||||
let send_buf = &mut self.send_buf[..send_size];
|
||||
let send_size: usize;
|
||||
{
|
||||
// If uncommitted part is being pulled, check that the term is
|
||||
// still the expected one.
|
||||
let _term_guard = if let Some(t) = self.term {
|
||||
Some(self.tli.acquire_term(t).await?)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
// Read WAL into buffer. send_size can be additionally capped to
|
||||
// segment boundary here.
|
||||
send_size = self.wal_reader.read(send_buf).await?
|
||||
};
|
||||
let send_buf = &send_buf[..send_size];
|
||||
|
||||
useful_bytes += send_buf.len();
|
||||
|
||||
// and send it
|
||||
self.pgb
|
||||
.write_message(&BeMessage::XLogData(XLogDataBody {
|
||||
@@ -796,18 +623,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
|
||||
self.start_pos + send_size as u64
|
||||
);
|
||||
self.start_pos += send_size as u64;
|
||||
|
||||
let elapsed = last_logged_at.elapsed();
|
||||
if elapsed >= Duration::from_secs(5) {
|
||||
let useful_bytes_rate = useful_bytes / elapsed.as_millis() as usize * 1000;
|
||||
tracing::info!(
|
||||
"Sender rate: ubps={}",
|
||||
useful_bytes_rate
|
||||
);
|
||||
|
||||
last_logged_at = std::time::Instant::now();
|
||||
useful_bytes = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -138,7 +138,6 @@ impl TimelinePersistentState {
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn empty() -> Self {
|
||||
TimelinePersistentState::new(
|
||||
&TenantTimelineId::empty(),
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
//! to glue together SafeKeeper and all other background services.
|
||||
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use camino::Utf8PathBuf;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use remote_storage::RemotePath;
|
||||
use safekeeper_api::models::TimelineTermBumpResponse;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -108,16 +108,11 @@ pub type ReadGuardSharedState<'a> = RwLockReadGuard<'a, SharedState>;
|
||||
pub struct WriteGuardSharedState<'a> {
|
||||
tli: Arc<Timeline>,
|
||||
guard: RwLockWriteGuard<'a, SharedState>,
|
||||
skip_update: bool,
|
||||
}
|
||||
|
||||
impl<'a> WriteGuardSharedState<'a> {
|
||||
fn new(tli: Arc<Timeline>, guard: RwLockWriteGuard<'a, SharedState>) -> Self {
|
||||
WriteGuardSharedState {
|
||||
tli,
|
||||
guard,
|
||||
skip_update: false,
|
||||
}
|
||||
WriteGuardSharedState { tli, guard }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -159,12 +154,10 @@ impl Drop for WriteGuardSharedState<'_> {
|
||||
}
|
||||
});
|
||||
|
||||
if !self.skip_update {
|
||||
// send notification about shared state update
|
||||
self.tli.shared_state_version_tx.send_modify(|old| {
|
||||
*old += 1;
|
||||
});
|
||||
}
|
||||
// send notification about shared state update
|
||||
self.tli.shared_state_version_tx.send_modify(|old| {
|
||||
*old += 1;
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -325,8 +318,17 @@ pub struct SharedState {
|
||||
}
|
||||
|
||||
impl SharedState {
|
||||
/// Creates a new SharedState.
|
||||
pub fn new(sk: StateSK) -> Self {
|
||||
Self {
|
||||
sk,
|
||||
peers_info: PeersInfo(vec![]),
|
||||
wal_removal_on_hold: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Restore SharedState from control file. If file doesn't exist, bails out.
|
||||
fn restore(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Result<Self> {
|
||||
pub fn restore(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Result<Self> {
|
||||
let timeline_dir = get_timeline_dir(conf, ttid);
|
||||
let control_store = control_file::FileStorage::restore_new(&timeline_dir, conf.no_sync)?;
|
||||
if control_store.server.wal_seg_size == 0 {
|
||||
@@ -352,11 +354,7 @@ impl SharedState {
|
||||
}
|
||||
};
|
||||
|
||||
Ok(Self {
|
||||
sk,
|
||||
peers_info: PeersInfo(vec![]),
|
||||
wal_removal_on_hold: false,
|
||||
})
|
||||
Ok(Self::new(sk))
|
||||
}
|
||||
|
||||
pub(crate) fn get_wal_seg_size(&self) -> usize {
|
||||
@@ -480,11 +478,13 @@ pub struct Timeline {
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
/// Load existing timeline from disk.
|
||||
pub fn load_timeline(conf: &SafeKeeperConf, ttid: TenantTimelineId) -> Result<Arc<Timeline>> {
|
||||
let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered();
|
||||
|
||||
let shared_state = SharedState::restore(conf, &ttid)?;
|
||||
/// Constructs a new timeline.
|
||||
pub fn new(
|
||||
ttid: TenantTimelineId,
|
||||
timeline_dir: &Utf8Path,
|
||||
remote_path: &RemotePath,
|
||||
shared_state: SharedState,
|
||||
) -> Arc<Self> {
|
||||
let (commit_lsn_watch_tx, commit_lsn_watch_rx) =
|
||||
watch::channel(shared_state.sk.state().commit_lsn);
|
||||
let (term_flush_lsn_watch_tx, term_flush_lsn_watch_rx) = watch::channel(TermLsn::from((
|
||||
@@ -494,10 +494,11 @@ impl Timeline {
|
||||
let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0);
|
||||
|
||||
let walreceivers = WalReceivers::new();
|
||||
let remote_path = remote_timeline_path(&ttid)?;
|
||||
Ok(Arc::new(Timeline {
|
||||
|
||||
Arc::new(Self {
|
||||
ttid,
|
||||
remote_path,
|
||||
remote_path: remote_path.to_owned(),
|
||||
timeline_dir: timeline_dir.to_owned(),
|
||||
commit_lsn_watch_tx,
|
||||
commit_lsn_watch_rx,
|
||||
term_flush_lsn_watch_tx,
|
||||
@@ -508,13 +509,28 @@ impl Timeline {
|
||||
walsenders: WalSenders::new(walreceivers.clone()),
|
||||
walreceivers,
|
||||
cancel: CancellationToken::default(),
|
||||
timeline_dir: get_timeline_dir(conf, &ttid),
|
||||
manager_ctl: ManagerCtl::new(),
|
||||
broker_active: AtomicBool::new(false),
|
||||
wal_backup_active: AtomicBool::new(false),
|
||||
last_removed_segno: AtomicU64::new(0),
|
||||
mgr_status: AtomicStatus::new(),
|
||||
}))
|
||||
})
|
||||
}
|
||||
|
||||
/// Load existing timeline from disk.
|
||||
pub fn load_timeline(conf: &SafeKeeperConf, ttid: TenantTimelineId) -> Result<Arc<Timeline>> {
|
||||
let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered();
|
||||
|
||||
let shared_state = SharedState::restore(conf, &ttid)?;
|
||||
let timeline_dir = get_timeline_dir(conf, &ttid);
|
||||
let remote_path = remote_timeline_path(&ttid)?;
|
||||
|
||||
Ok(Timeline::new(
|
||||
ttid,
|
||||
&timeline_dir,
|
||||
&remote_path,
|
||||
shared_state,
|
||||
))
|
||||
}
|
||||
|
||||
/// Initialize fresh timeline on disk and start background tasks. If init
|
||||
@@ -1128,13 +1144,13 @@ async fn delete_dir(path: &Utf8PathBuf) -> Result<bool> {
|
||||
|
||||
/// Get a path to the tenant directory. If you just need to get a timeline directory,
|
||||
/// use WalResidentTimeline::get_timeline_dir instead.
|
||||
pub(crate) fn get_tenant_dir(conf: &SafeKeeperConf, tenant_id: &TenantId) -> Utf8PathBuf {
|
||||
pub fn get_tenant_dir(conf: &SafeKeeperConf, tenant_id: &TenantId) -> Utf8PathBuf {
|
||||
conf.workdir.join(tenant_id.to_string())
|
||||
}
|
||||
|
||||
/// Get a path to the timeline directory. If you need to read WAL files from disk,
|
||||
/// use WalResidentTimeline::get_timeline_dir instead. This function does not check
|
||||
/// timeline eviction status and WAL files might not be present on disk.
|
||||
pub(crate) fn get_timeline_dir(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Utf8PathBuf {
|
||||
pub fn get_timeline_dir(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Utf8PathBuf {
|
||||
get_tenant_dir(conf, &ttid.tenant_id).join(ttid.timeline_id.to_string())
|
||||
}
|
||||
|
||||
@@ -31,7 +31,6 @@ use crate::state::TimelinePersistentState;
|
||||
use crate::wal_backup::{read_object, remote_timeline_path};
|
||||
use postgres_ffi::waldecoder::WalStreamDecoder;
|
||||
use postgres_ffi::XLogFileName;
|
||||
use postgres_ffi::XLOG_BLCKSZ;
|
||||
use pq_proto::SystemId;
|
||||
use utils::{id::TenantTimelineId, lsn::Lsn};
|
||||
|
||||
@@ -223,6 +222,15 @@ impl PhysicalStorage {
|
||||
)
|
||||
}
|
||||
|
||||
/// Call fsync if config requires so.
|
||||
async fn fsync_file(&mut self, file: &File) -> Result<()> {
|
||||
if !self.no_sync {
|
||||
self.metrics
|
||||
.observe_flush_seconds(time_io_closure(file.sync_all()).await?);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Call fdatasync if config requires so.
|
||||
async fn fdatasync_file(&mut self, file: &File) -> Result<()> {
|
||||
if !self.no_sync {
|
||||
@@ -256,11 +264,15 @@ impl PhysicalStorage {
|
||||
// half initialized segment, first bake it under tmp filename and
|
||||
// then rename.
|
||||
let tmp_path = self.timeline_dir.join("waltmp");
|
||||
let mut file = File::create(&tmp_path)
|
||||
let file = File::create(&tmp_path)
|
||||
.await
|
||||
.with_context(|| format!("Failed to open tmp wal file {:?}", &tmp_path))?;
|
||||
|
||||
write_zeroes(&mut file, self.wal_seg_size).await?;
|
||||
fail::fail_point!("sk-zero-segment", |_| {
|
||||
info!("sk-zero-segment failpoint hit");
|
||||
Err(anyhow::anyhow!("failpoint: sk-zero-segment"))
|
||||
});
|
||||
file.set_len(self.wal_seg_size as u64).await?;
|
||||
|
||||
// Note: this doesn't get into observe_flush_seconds metric. But
|
||||
// segment init should be separate metric, if any.
|
||||
@@ -486,12 +498,12 @@ impl Storage for PhysicalStorage {
|
||||
// Remove all segments after the given LSN.
|
||||
remove_segments_from_disk(&self.timeline_dir, self.wal_seg_size, |x| x > segno).await?;
|
||||
|
||||
let (mut file, is_partial) = self.open_or_create(segno).await?;
|
||||
let (file, is_partial) = self.open_or_create(segno).await?;
|
||||
|
||||
// Fill end with zeroes
|
||||
file.seek(SeekFrom::Start(xlogoff as u64)).await?;
|
||||
write_zeroes(&mut file, self.wal_seg_size - xlogoff).await?;
|
||||
self.fdatasync_file(&file).await?;
|
||||
file.set_len(xlogoff as u64).await?;
|
||||
file.set_len(self.wal_seg_size as u64).await?;
|
||||
self.fsync_file(&file).await?;
|
||||
|
||||
if !is_partial {
|
||||
// Make segment partial once again
|
||||
@@ -751,25 +763,6 @@ impl WalReader {
|
||||
}
|
||||
}
|
||||
|
||||
/// Zero block for filling created WAL segments.
|
||||
const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ];
|
||||
|
||||
/// Helper for filling file with zeroes.
|
||||
async fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> {
|
||||
fail::fail_point!("sk-write-zeroes", |_| {
|
||||
info!("write_zeroes hit failpoint");
|
||||
Err(anyhow::anyhow!("failpoint: sk-write-zeroes"))
|
||||
});
|
||||
|
||||
while count >= XLOG_BLCKSZ {
|
||||
file.write_all(ZERO_BLOCK).await?;
|
||||
count -= XLOG_BLCKSZ;
|
||||
}
|
||||
file.write_all(&ZERO_BLOCK[0..count]).await?;
|
||||
file.flush().await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Helper function for opening WAL segment `segno` in `dir`. Returns file and
|
||||
/// whether it is .partial.
|
||||
pub(crate) async fn open_wal_file(
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use std::{ffi::CStr, sync::Arc};
|
||||
|
||||
use parking_lot::{Mutex, MutexGuard};
|
||||
use postgres_ffi::v16::wal_generator::WalGenerator;
|
||||
use postgres_ffi::v16::wal_generator::{LogicalMessageGenerator, WalGenerator};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use super::block_storage::BlockStorage;
|
||||
@@ -18,7 +18,7 @@ impl DiskWalProposer {
|
||||
internal_available_lsn: Lsn(0),
|
||||
prev_lsn: Lsn(0),
|
||||
disk: BlockStorage::new(),
|
||||
wal_generator: WalGenerator::new(),
|
||||
wal_generator: WalGenerator::new(LogicalMessageGenerator::new(c"", &[])),
|
||||
}),
|
||||
})
|
||||
}
|
||||
@@ -36,7 +36,7 @@ pub struct State {
|
||||
// actual WAL storage
|
||||
disk: BlockStorage,
|
||||
// WAL record generator
|
||||
wal_generator: WalGenerator,
|
||||
wal_generator: WalGenerator<LogicalMessageGenerator>,
|
||||
}
|
||||
|
||||
impl State {
|
||||
@@ -64,7 +64,7 @@ impl State {
|
||||
|
||||
/// Inserts a logical record in the WAL at the current LSN.
|
||||
pub fn insert_logical_message(&mut self, prefix: &CStr, msg: &[u8]) {
|
||||
let record = self.wal_generator.generate_logical_message(prefix, msg);
|
||||
let (_, record) = self.wal_generator.append_logical_message(prefix, msg);
|
||||
self.disk.write(self.internal_available_lsn.into(), &record);
|
||||
self.prev_lsn = self.internal_available_lsn;
|
||||
self.internal_available_lsn += record.len() as u64;
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::{
|
||||
sync::Arc,
|
||||
};
|
||||
|
||||
use pageserver_api::controller_api::NodeSchedulingPolicy;
|
||||
use pageserver_api::controller_api::{NodeSchedulingPolicy, ShardSchedulingPolicy};
|
||||
use utils::{id::NodeId, shard::TenantShardId};
|
||||
|
||||
use crate::{
|
||||
@@ -98,6 +98,20 @@ impl TenantShardDrain {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Only tenants with a normal (Active) scheduling policy are proactively moved
|
||||
// around during a node drain. Shards which have been manually configured to a different
|
||||
// policy are only rescheduled by manual intervention.
|
||||
match tenant_shard.get_scheduling_policy() {
|
||||
ShardSchedulingPolicy::Active | ShardSchedulingPolicy::Essential => {
|
||||
// A migration during drain is classed as 'essential' because it is required to
|
||||
// uphold our availability goals for the tenant: this shard is elegible for migration.
|
||||
}
|
||||
ShardSchedulingPolicy::Pause | ShardSchedulingPolicy::Stop => {
|
||||
// If we have been asked to avoid rescheduling this shard, then do not migrate it during a drain
|
||||
return None;
|
||||
}
|
||||
}
|
||||
|
||||
match scheduler.node_preferred(tenant_shard.intent.get_secondary()) {
|
||||
Some(node) => Some(node),
|
||||
None => {
|
||||
|
||||
@@ -6721,6 +6721,16 @@ impl Service {
|
||||
.tenants
|
||||
.iter_mut()
|
||||
.filter_map(|(tid, tenant_shard)| {
|
||||
if !matches!(
|
||||
tenant_shard.get_scheduling_policy(),
|
||||
ShardSchedulingPolicy::Active
|
||||
) {
|
||||
// Only include tenants in fills if they have a normal (Active) scheduling policy. We
|
||||
// even exclude Essential, because moving to fill a node is not essential to keeping this
|
||||
// tenant available.
|
||||
return None;
|
||||
}
|
||||
|
||||
if tenant_shard.intent.get_secondary().contains(&node_id) {
|
||||
if let Some(primary) = tenant_shard.intent.get_attached() {
|
||||
return Some((*primary, *tid));
|
||||
|
||||
@@ -45,3 +45,4 @@ class TokenScope(str, Enum):
|
||||
SAFEKEEPER_DATA = "safekeeperdata"
|
||||
TENANT = "tenant"
|
||||
SCRUBBER = "scrubber"
|
||||
INFRA = "infra"
|
||||
|
||||
@@ -80,7 +80,13 @@ class PgBenchRunResult:
|
||||
):
|
||||
stdout_lines = stdout.splitlines()
|
||||
|
||||
number_of_clients = 0
|
||||
number_of_threads = 0
|
||||
number_of_transactions_actually_processed = 0
|
||||
latency_average = 0.0
|
||||
latency_stddev = None
|
||||
tps = 0.0
|
||||
scale = 0
|
||||
|
||||
# we know significant parts of these values from test input
|
||||
# but to be precise take them from output
|
||||
|
||||
@@ -8,7 +8,7 @@ from contextlib import _GeneratorContextManager, contextmanager
|
||||
|
||||
# Type-related stuff
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import TYPE_CHECKING, final
|
||||
|
||||
import pytest
|
||||
from _pytest.fixtures import FixtureRequest
|
||||
@@ -70,12 +70,12 @@ class PgCompare(ABC):
|
||||
|
||||
@contextmanager
|
||||
@abstractmethod
|
||||
def record_pageserver_writes(self, out_name: str):
|
||||
def record_pageserver_writes(self, out_name: str) -> Iterator[None]:
|
||||
pass
|
||||
|
||||
@contextmanager
|
||||
@abstractmethod
|
||||
def record_duration(self, out_name: str):
|
||||
def record_duration(self, out_name: str) -> Iterator[None]:
|
||||
pass
|
||||
|
||||
@contextmanager
|
||||
@@ -105,6 +105,7 @@ class PgCompare(ABC):
|
||||
return results
|
||||
|
||||
|
||||
@final
|
||||
class NeonCompare(PgCompare):
|
||||
"""PgCompare interface for the neon stack."""
|
||||
|
||||
@@ -206,6 +207,7 @@ class NeonCompare(PgCompare):
|
||||
return self.zenbenchmark.record_duration(out_name)
|
||||
|
||||
|
||||
@final
|
||||
class VanillaCompare(PgCompare):
|
||||
"""PgCompare interface for vanilla postgres."""
|
||||
|
||||
@@ -271,6 +273,7 @@ class VanillaCompare(PgCompare):
|
||||
return self.zenbenchmark.record_duration(out_name)
|
||||
|
||||
|
||||
@final
|
||||
class RemoteCompare(PgCompare):
|
||||
"""PgCompare interface for a remote postgres instance."""
|
||||
|
||||
|
||||
@@ -4,11 +4,14 @@ https://python-hyper.org/projects/hyper-h2/en/stable/asyncio-example.html
|
||||
auth-broker -> local-proxy needs a h2 connection, so we need a h2 server :)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import collections
|
||||
import io
|
||||
import json
|
||||
from collections.abc import AsyncIterable
|
||||
from typing import TYPE_CHECKING, final
|
||||
|
||||
import pytest_asyncio
|
||||
from h2.config import H2Configuration
|
||||
@@ -25,34 +28,45 @@ from h2.events import (
|
||||
)
|
||||
from h2.exceptions import ProtocolError, StreamClosedError
|
||||
from h2.settings import SettingCodes
|
||||
from typing_extensions import override
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing import Any, Optional
|
||||
|
||||
|
||||
RequestData = collections.namedtuple("RequestData", ["headers", "data"])
|
||||
|
||||
|
||||
@final
|
||||
class H2Server:
|
||||
def __init__(self, host, port) -> None:
|
||||
def __init__(self, host: str, port: int) -> None:
|
||||
self.host = host
|
||||
self.port = port
|
||||
|
||||
|
||||
@final
|
||||
class H2Protocol(asyncio.Protocol):
|
||||
def __init__(self):
|
||||
config = H2Configuration(client_side=False, header_encoding="utf-8")
|
||||
self.conn = H2Connection(config=config)
|
||||
self.transport = None
|
||||
self.stream_data = {}
|
||||
self.flow_control_futures = {}
|
||||
self.transport: Optional[asyncio.Transport] = None
|
||||
self.stream_data: dict[int, RequestData] = {}
|
||||
self.flow_control_futures: dict[int, asyncio.Future[Any]] = {}
|
||||
|
||||
def connection_made(self, transport: asyncio.Transport): # type: ignore[override]
|
||||
@override
|
||||
def connection_made(self, transport: asyncio.BaseTransport):
|
||||
assert isinstance(transport, asyncio.Transport)
|
||||
self.transport = transport
|
||||
self.conn.initiate_connection()
|
||||
self.transport.write(self.conn.data_to_send())
|
||||
|
||||
def connection_lost(self, _exc):
|
||||
@override
|
||||
def connection_lost(self, exc: Optional[Exception]):
|
||||
for future in self.flow_control_futures.values():
|
||||
future.cancel()
|
||||
self.flow_control_futures = {}
|
||||
|
||||
@override
|
||||
def data_received(self, data: bytes):
|
||||
assert self.transport is not None
|
||||
try:
|
||||
@@ -77,7 +91,7 @@ class H2Protocol(asyncio.Protocol):
|
||||
self.window_updated(event.stream_id, event.delta)
|
||||
elif isinstance(event, RemoteSettingsChanged):
|
||||
if SettingCodes.INITIAL_WINDOW_SIZE in event.changed_settings:
|
||||
self.window_updated(None, 0)
|
||||
self.window_updated(0, 0)
|
||||
|
||||
self.transport.write(self.conn.data_to_send())
|
||||
|
||||
@@ -123,7 +137,7 @@ class H2Protocol(asyncio.Protocol):
|
||||
else:
|
||||
stream_data.data.write(data)
|
||||
|
||||
def stream_reset(self, stream_id):
|
||||
def stream_reset(self, stream_id: int):
|
||||
"""
|
||||
A stream reset was sent. Stop sending data.
|
||||
"""
|
||||
@@ -131,7 +145,7 @@ class H2Protocol(asyncio.Protocol):
|
||||
future = self.flow_control_futures.pop(stream_id)
|
||||
future.cancel()
|
||||
|
||||
async def send_data(self, data, stream_id):
|
||||
async def send_data(self, data: bytes, stream_id: int):
|
||||
"""
|
||||
Send data according to the flow control rules.
|
||||
"""
|
||||
@@ -161,7 +175,7 @@ class H2Protocol(asyncio.Protocol):
|
||||
self.transport.write(self.conn.data_to_send())
|
||||
data = data[chunk_size:]
|
||||
|
||||
async def wait_for_flow_control(self, stream_id):
|
||||
async def wait_for_flow_control(self, stream_id: int):
|
||||
"""
|
||||
Waits for a Future that fires when the flow control window is opened.
|
||||
"""
|
||||
@@ -169,7 +183,7 @@ class H2Protocol(asyncio.Protocol):
|
||||
self.flow_control_futures[stream_id] = f
|
||||
await f
|
||||
|
||||
def window_updated(self, stream_id, delta):
|
||||
def window_updated(self, stream_id: int, delta):
|
||||
"""
|
||||
A window update frame was received. Unblock some number of flow control
|
||||
Futures.
|
||||
|
||||
@@ -1782,7 +1782,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
self.request(
|
||||
"PUT",
|
||||
f"{self.api}/control/v1/node/{node_id}/drain",
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
headers=self.headers(TokenScope.INFRA),
|
||||
)
|
||||
|
||||
def cancel_node_drain(self, node_id):
|
||||
@@ -1790,7 +1790,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
self.request(
|
||||
"DELETE",
|
||||
f"{self.api}/control/v1/node/{node_id}/drain",
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
headers=self.headers(TokenScope.INFRA),
|
||||
)
|
||||
|
||||
def node_fill(self, node_id):
|
||||
@@ -1798,7 +1798,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
self.request(
|
||||
"PUT",
|
||||
f"{self.api}/control/v1/node/{node_id}/fill",
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
headers=self.headers(TokenScope.INFRA),
|
||||
)
|
||||
|
||||
def cancel_node_fill(self, node_id):
|
||||
@@ -1806,14 +1806,14 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
self.request(
|
||||
"DELETE",
|
||||
f"{self.api}/control/v1/node/{node_id}/fill",
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
headers=self.headers(TokenScope.INFRA),
|
||||
)
|
||||
|
||||
def node_status(self, node_id):
|
||||
response = self.request(
|
||||
"GET",
|
||||
f"{self.api}/control/v1/node/{node_id}",
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
headers=self.headers(TokenScope.INFRA),
|
||||
)
|
||||
return response.json()
|
||||
|
||||
@@ -1829,7 +1829,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
response = self.request(
|
||||
"GET",
|
||||
f"{self.api}/control/v1/node",
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
headers=self.headers(TokenScope.INFRA),
|
||||
)
|
||||
return response.json()
|
||||
|
||||
@@ -1857,7 +1857,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
shard_count: Optional[int] = None,
|
||||
shard_stripe_size: Optional[int] = None,
|
||||
tenant_config: Optional[dict[Any, Any]] = None,
|
||||
placement_policy: Optional[Union[dict[Any, Any] | str]] = None,
|
||||
placement_policy: Optional[Union[dict[Any, Any], str]] = None,
|
||||
):
|
||||
"""
|
||||
Use this rather than pageserver_api() when you need to include shard parameters
|
||||
@@ -4177,9 +4177,15 @@ class Safekeeper(LogUtils):
|
||||
return self
|
||||
|
||||
def assert_no_errors(self):
|
||||
assert not self.log_contains("manager task finished prematurely")
|
||||
assert not self.log_contains("error while acquiring WalResidentTimeline guard")
|
||||
assert not self.log_contains("timeout while acquiring WalResidentTimeline guard")
|
||||
not_allowed = [
|
||||
"manager task finished prematurely",
|
||||
"error while acquiring WalResidentTimeline guard",
|
||||
"timeout while acquiring WalResidentTimeline guard",
|
||||
"invalid xlog page header:",
|
||||
"WAL record crc mismatch at",
|
||||
]
|
||||
for na in not_allowed:
|
||||
assert not self.log_contains(na)
|
||||
|
||||
def append_logical_message(
|
||||
self, tenant_id: TenantId, timeline_id: TimelineId, request: dict[str, Any]
|
||||
|
||||
@@ -316,7 +316,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
def tenant_location_conf(
|
||||
self,
|
||||
tenant_id: Union[TenantId, TenantShardId],
|
||||
location_conf=dict[str, Any],
|
||||
location_conf: dict[str, Any],
|
||||
flush_ms=None,
|
||||
lazy: Optional[bool] = None,
|
||||
):
|
||||
|
||||
@@ -56,6 +56,8 @@ def wait_for_upload(
|
||||
lsn: Lsn,
|
||||
):
|
||||
"""waits for local timeline upload up to specified lsn"""
|
||||
|
||||
current_lsn = Lsn(0)
|
||||
for i in range(20):
|
||||
current_lsn = remote_consistent_lsn(pageserver_http, tenant, timeline)
|
||||
if current_lsn >= lsn:
|
||||
@@ -203,6 +205,8 @@ def wait_for_last_record_lsn(
|
||||
lsn: Lsn,
|
||||
) -> Lsn:
|
||||
"""waits for pageserver to catch up to a certain lsn, returns the last observed lsn."""
|
||||
|
||||
current_lsn = Lsn(0)
|
||||
for i in range(1000):
|
||||
current_lsn = last_record_lsn(pageserver_http, tenant, timeline)
|
||||
if current_lsn >= lsn:
|
||||
|
||||
@@ -112,7 +112,7 @@ def compatibility_snapshot_dir() -> Iterator[Path]:
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def compatibility_neon_binpath() -> Optional[Iterator[Path]]:
|
||||
def compatibility_neon_binpath() -> Iterator[Optional[Path]]:
|
||||
if os.getenv("REMOTE_ENV"):
|
||||
return
|
||||
comp_binpath = None
|
||||
@@ -133,7 +133,7 @@ def pg_distrib_dir(base_dir: Path) -> Iterator[Path]:
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def compatibility_pg_distrib_dir() -> Optional[Iterator[Path]]:
|
||||
def compatibility_pg_distrib_dir() -> Iterator[Optional[Path]]:
|
||||
compat_distrib_dir = None
|
||||
if env_compat_postgres_bin := os.environ.get("COMPATIBILITY_POSTGRES_DISTRIB_DIR"):
|
||||
compat_distrib_dir = Path(env_compat_postgres_bin).resolve()
|
||||
|
||||
@@ -2,11 +2,13 @@ from __future__ import annotations
|
||||
|
||||
from contextlib import closing
|
||||
from io import BufferedReader, RawIOBase
|
||||
from typing import Optional
|
||||
from typing import Optional, final
|
||||
|
||||
from fixtures.compare_fixtures import PgCompare
|
||||
from typing_extensions import override
|
||||
|
||||
|
||||
@final
|
||||
class CopyTestData(RawIOBase):
|
||||
def __init__(self, rows: int):
|
||||
self.rows = rows
|
||||
@@ -14,6 +16,7 @@ class CopyTestData(RawIOBase):
|
||||
self.linebuf: Optional[bytes] = None
|
||||
self.ptr = 0
|
||||
|
||||
@override
|
||||
def readable(self):
|
||||
return True
|
||||
|
||||
|
||||
61
test_runner/regress/test_compute_locales.py
Normal file
61
test_runner/regress/test_compute_locales.py
Normal file
@@ -0,0 +1,61 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, cast
|
||||
|
||||
from fixtures.pg_version import PgVersion
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Sequence
|
||||
|
||||
from fixtures.neon_fixtures import NeonEnv
|
||||
|
||||
|
||||
def test_default_locales(neon_simple_env: NeonEnv):
|
||||
"""
|
||||
Test that the default locales for compute databases is C.UTF-8.
|
||||
"""
|
||||
env = neon_simple_env
|
||||
|
||||
endpoint = env.endpoints.create_start("main")
|
||||
|
||||
domain_locales = cast(
|
||||
"Sequence[str]",
|
||||
endpoint.safe_psql(
|
||||
"SELECT current_setting('lc_messages') AS lc_messages,"
|
||||
+ "current_setting('lc_monetary') AS lc_monetary,"
|
||||
+ "current_setting('lc_numeric') AS lc_numeric,"
|
||||
+ "current_setting('lc_time') AS lc_time"
|
||||
)[0],
|
||||
)
|
||||
for dl in domain_locales:
|
||||
assert dl == "C.UTF-8"
|
||||
|
||||
# Postgres 15 added the locale providers
|
||||
if env.pg_version < PgVersion.V15:
|
||||
results = cast(
|
||||
"Sequence[str]",
|
||||
endpoint.safe_psql(
|
||||
"SELECT datcollate, datctype FROM pg_database WHERE datname = current_database()"
|
||||
)[0],
|
||||
)
|
||||
|
||||
datcollate = results[0]
|
||||
datctype = results[1]
|
||||
else:
|
||||
results = cast(
|
||||
"Sequence[str]",
|
||||
endpoint.safe_psql(
|
||||
"SELECT datlocprovider, datcollate, datctype FROM pg_database WHERE datname = current_database()"
|
||||
)[0],
|
||||
)
|
||||
datlocprovider = results[0]
|
||||
datcollate = results[1]
|
||||
datctype = results[2]
|
||||
|
||||
if env.pg_version >= PgVersion.V17:
|
||||
assert datlocprovider == "b", "The locale provider is not builtin"
|
||||
else:
|
||||
assert datlocprovider == "c", "The locale provider is not libc"
|
||||
|
||||
assert datcollate == "C.UTF-8"
|
||||
assert datctype == "C.UTF-8"
|
||||
@@ -656,6 +656,7 @@ def test_upgrade_generationless_local_file_paths(
|
||||
workload.write_rows(1000)
|
||||
|
||||
attached_pageserver = env.get_tenant_pageserver(tenant_id)
|
||||
assert attached_pageserver is not None
|
||||
secondary_pageserver = list([ps for ps in env.pageservers if ps.id != attached_pageserver.id])[
|
||||
0
|
||||
]
|
||||
|
||||
50
test_runner/regress/test_physical_and_logical_replicaiton.py
Normal file
50
test_runner/regress/test_physical_and_logical_replicaiton.py
Normal file
@@ -0,0 +1,50 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
|
||||
from fixtures.neon_fixtures import NeonEnv, logical_replication_sync
|
||||
|
||||
|
||||
def test_physical_and_logical_replication(neon_simple_env: NeonEnv, vanilla_pg):
|
||||
env = neon_simple_env
|
||||
|
||||
n_records = 100000
|
||||
|
||||
primary = env.endpoints.create_start(
|
||||
branch_name="main",
|
||||
endpoint_id="primary",
|
||||
config_lines=["min_wal_size=32MB", "max_wal_size=64MB"],
|
||||
)
|
||||
p_con = primary.connect()
|
||||
p_cur = p_con.cursor()
|
||||
p_cur.execute("CREATE TABLE t(pk bigint primary key, payload text default repeat('?',200))")
|
||||
p_cur.execute("create publication pub1 for table t")
|
||||
|
||||
# start subscriber to primary
|
||||
vanilla_pg.start()
|
||||
vanilla_pg.safe_psql("CREATE TABLE t(pk bigint primary key, payload text)")
|
||||
connstr = primary.connstr().replace("'", "''")
|
||||
vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")
|
||||
|
||||
time.sleep(1)
|
||||
secondary = env.endpoints.new_replica_start(
|
||||
origin=primary,
|
||||
endpoint_id="secondary",
|
||||
config_lines=["min_wal_size=32MB", "max_wal_size=64MB"],
|
||||
)
|
||||
|
||||
s_con = secondary.connect()
|
||||
s_cur = s_con.cursor()
|
||||
|
||||
for pk in range(n_records):
|
||||
p_cur.execute("insert into t (pk) values (%s)", (pk,))
|
||||
|
||||
s_cur.execute("select count(*) from t")
|
||||
assert s_cur.fetchall()[0][0] == n_records
|
||||
|
||||
logical_replication_sync(vanilla_pg, primary)
|
||||
assert vanilla_pg.safe_psql("select count(*) from t")[0][0] == n_records
|
||||
|
||||
# Check that LR slot is not copied to replica
|
||||
s_cur.execute("select count(*) from pg_replication_slots")
|
||||
assert s_cur.fetchall()[0][0] == 0
|
||||
@@ -37,7 +37,7 @@ async def test_websockets(static_proxy: NeonProxy):
|
||||
startup_message.extend(b"\0")
|
||||
length = (4 + len(startup_message)).to_bytes(4, byteorder="big")
|
||||
|
||||
await websocket.send([length, startup_message])
|
||||
await websocket.send([length, bytes(startup_message)])
|
||||
|
||||
startup_response = await websocket.recv()
|
||||
assert isinstance(startup_response, bytes)
|
||||
|
||||
@@ -1,19 +1,22 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from typing import Union
|
||||
|
||||
import pytest
|
||||
from fixtures.common_types import Lsn
|
||||
from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
Endpoint,
|
||||
LogCursor,
|
||||
NeonEnv,
|
||||
NeonEnvBuilder,
|
||||
last_flush_lsn_upload,
|
||||
tenant_get_shards,
|
||||
)
|
||||
from fixtures.pageserver.http import PageserverHttpClient
|
||||
from fixtures.pageserver.utils import wait_for_last_record_lsn
|
||||
from fixtures.utils import query_scalar
|
||||
from fixtures.utils import query_scalar, wait_until
|
||||
|
||||
|
||||
#
|
||||
@@ -119,7 +122,8 @@ def test_readonly_node(neon_simple_env: NeonEnv):
|
||||
)
|
||||
|
||||
|
||||
def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
|
||||
@pytest.mark.parametrize("iter", [i for i in range(20)])
|
||||
def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder, iter: int):
|
||||
"""
|
||||
Test static endpoint is protected from GC by acquiring and renewing lsn leases.
|
||||
"""
|
||||
@@ -169,23 +173,63 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
|
||||
)
|
||||
return last_flush_lsn
|
||||
|
||||
def trigger_gc_and_select(env: NeonEnv, ep_static: Endpoint, ctx: str):
|
||||
def get_layers_protected_by_lease(
|
||||
ps_http: PageserverHttpClient,
|
||||
tenant_id: Union[TenantId, TenantShardId],
|
||||
timeline_id: TimelineId,
|
||||
lease_lsn: Lsn,
|
||||
) -> set[str]:
|
||||
"""Get all layers whose start_lsn is less than or equal to the lease lsn."""
|
||||
layer_map_info = ps_http.layer_map_info(tenant_id, timeline_id)
|
||||
return set(
|
||||
x.layer_file_name
|
||||
for x in layer_map_info.historic_layers
|
||||
if Lsn(x.lsn_start) <= lease_lsn
|
||||
)
|
||||
|
||||
def trigger_gc_and_select(
|
||||
env: NeonEnv,
|
||||
ep_static: Endpoint,
|
||||
lease_lsn: Lsn,
|
||||
ctx: str,
|
||||
offset: None | LogCursor = None,
|
||||
) -> LogCursor:
|
||||
"""
|
||||
Trigger GC manually on all pageservers. Then run an `SELECT` query.
|
||||
"""
|
||||
for shard, ps in tenant_get_shards(env, env.initial_tenant):
|
||||
client = ps.http_client()
|
||||
layers_guarded_before_gc = get_layers_protected_by_lease(
|
||||
client, shard, env.initial_timeline, lease_lsn=lsn
|
||||
)
|
||||
gc_result = client.timeline_gc(shard, env.initial_timeline, 0)
|
||||
layers_guarded_after_gc = get_layers_protected_by_lease(
|
||||
client, shard, env.initial_timeline, lease_lsn=lsn
|
||||
)
|
||||
|
||||
# Note: cannot assert on `layers_removed` here because it could be layers
|
||||
# not guarded by the lease. Rely on successful execution of the query instead.
|
||||
# not guarded by the lease. Instead, use layer map dump.
|
||||
assert layers_guarded_before_gc.issubset(
|
||||
layers_guarded_after_gc
|
||||
), "Layers guarded by lease before GC should not be removed"
|
||||
|
||||
log.info(f"{gc_result=}")
|
||||
|
||||
# wait for lease renewal before running query.
|
||||
_, offset = wait_until(
|
||||
20,
|
||||
0.5,
|
||||
lambda: ep_static.assert_log_contains(
|
||||
"lsn_lease_bg_task.*Request succeeded", offset=offset
|
||||
),
|
||||
)
|
||||
with ep_static.cursor() as cur:
|
||||
# Following query should succeed if pages are properly guarded by leases.
|
||||
cur.execute("SELECT count(*) FROM t0")
|
||||
assert cur.fetchone() == (ROW_COUNT,)
|
||||
|
||||
log.info(f"`SELECT` query succeed after GC, {ctx=}")
|
||||
return offset
|
||||
|
||||
# Insert some records on main branch
|
||||
with env.endpoints.create_start("main") as ep_main:
|
||||
@@ -213,7 +257,9 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
generate_updates_on_main(env, ep_main, 3, end=100)
|
||||
|
||||
trigger_gc_and_select(env, ep_static, ctx="Before pageservers restart")
|
||||
offset = trigger_gc_and_select(
|
||||
env, ep_static, lease_lsn=lsn, ctx="Before pageservers restart"
|
||||
)
|
||||
|
||||
# Trigger Pageserver restarts
|
||||
for ps in env.pageservers:
|
||||
@@ -222,7 +268,13 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
|
||||
time.sleep(LSN_LEASE_LENGTH / 2)
|
||||
ps.start()
|
||||
|
||||
trigger_gc_and_select(env, ep_static, ctx="After pageservers restart")
|
||||
offset = trigger_gc_and_select(
|
||||
env,
|
||||
ep_static,
|
||||
lease_lsn=lsn,
|
||||
ctx="After pageservers restart",
|
||||
offset=offset,
|
||||
)
|
||||
|
||||
# Reconfigure pageservers
|
||||
env.pageservers[0].stop()
|
||||
@@ -231,7 +283,13 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
|
||||
)
|
||||
env.storage_controller.reconcile_until_idle()
|
||||
|
||||
trigger_gc_and_select(env, ep_static, ctx="After putting pageserver 0 offline")
|
||||
trigger_gc_and_select(
|
||||
env,
|
||||
ep_static,
|
||||
lease_lsn=lsn,
|
||||
ctx="After putting pageserver 0 offline",
|
||||
offset=offset,
|
||||
)
|
||||
|
||||
# Do some update so we can increment latest_gc_cutoff
|
||||
generate_updates_on_main(env, ep_main, i, end=100)
|
||||
|
||||
@@ -256,6 +256,7 @@ def test_sharding_split_compaction(
|
||||
# Cleanup part 1: while layers are still in PITR window, we should only drop layers that are fully redundant
|
||||
for shard in shards:
|
||||
ps = env.get_tenant_pageserver(shard)
|
||||
assert ps is not None
|
||||
|
||||
# Invoke compaction: this should drop any layers that don't overlap with the shard's key stripes
|
||||
detail_before = ps.http_client().timeline_detail(shard, timeline_id)
|
||||
|
||||
@@ -1237,6 +1237,7 @@ def test_storage_controller_tenant_deletion(
|
||||
# Assert attachments all have local content
|
||||
for shard_id in shard_ids:
|
||||
pageserver = env.get_tenant_pageserver(shard_id)
|
||||
assert pageserver is not None
|
||||
assert pageserver.tenant_dir(shard_id).exists()
|
||||
|
||||
# Assert all shards have some content in remote storage
|
||||
@@ -2745,6 +2746,7 @@ def test_storage_controller_validate_during_migration(neon_env_builder: NeonEnvB
|
||||
|
||||
# Upload but don't compact
|
||||
origin_pageserver = env.get_tenant_pageserver(tenant_id)
|
||||
assert origin_pageserver is not None
|
||||
dest_ps_id = [p.id for p in env.pageservers if p.id != origin_pageserver.id][0]
|
||||
origin_pageserver.http_client().timeline_checkpoint(
|
||||
tenant_id, timeline_id, wait_until_uploaded=True, compact=False
|
||||
|
||||
@@ -245,6 +245,7 @@ def test_scrubber_physical_gc_ancestors(
|
||||
workload.write_rows(100, upload=False)
|
||||
for shard in shards:
|
||||
ps = env.get_tenant_pageserver(shard)
|
||||
assert ps is not None
|
||||
log.info(f"Waiting for shard {shard} on pageserver {ps.id}")
|
||||
ps.http_client().timeline_checkpoint(
|
||||
shard, timeline_id, compact=False, wait_until_uploaded=True
|
||||
@@ -270,6 +271,7 @@ def test_scrubber_physical_gc_ancestors(
|
||||
workload.churn_rows(100)
|
||||
for shard in shards:
|
||||
ps = env.get_tenant_pageserver(shard)
|
||||
assert ps is not None
|
||||
ps.http_client().timeline_compact(shard, timeline_id, force_image_layer_creation=True)
|
||||
ps.http_client().timeline_gc(shard, timeline_id, 0)
|
||||
|
||||
@@ -336,12 +338,15 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder
|
||||
|
||||
# Issue a deletion queue flush so that the parent shard can't leave behind layers
|
||||
# that will look like unexpected garbage to the scrubber
|
||||
env.get_tenant_pageserver(tenant_id).http_client().deletion_queue_flush(execute=True)
|
||||
ps = env.get_tenant_pageserver(tenant_id)
|
||||
assert ps is not None
|
||||
ps.http_client().deletion_queue_flush(execute=True)
|
||||
|
||||
new_shard_count = 4
|
||||
shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)
|
||||
for shard in shards:
|
||||
ps = env.get_tenant_pageserver(shard)
|
||||
assert ps is not None
|
||||
log.info(f"Waiting for shard {shard} on pageserver {ps.id}")
|
||||
ps.http_client().timeline_checkpoint(
|
||||
shard, timeline_id, compact=False, wait_until_uploaded=True
|
||||
|
||||
@@ -315,6 +315,7 @@ def test_single_branch_get_tenant_size_grows(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> tuple[Lsn, int]:
|
||||
size = 0
|
||||
consistent = False
|
||||
size_debug = None
|
||||
|
||||
@@ -360,7 +361,7 @@ def test_single_branch_get_tenant_size_grows(
|
||||
collected_responses.append(("CREATE", current_lsn, size))
|
||||
|
||||
batch_size = 100
|
||||
|
||||
prev_size = 0
|
||||
for i in range(3):
|
||||
with endpoint.cursor() as cur:
|
||||
cur.execute(
|
||||
|
||||
@@ -146,6 +146,7 @@ def test_threshold_based_eviction(
|
||||
out += [f" {remote} {layer.layer_file_name}"]
|
||||
return "\n".join(out)
|
||||
|
||||
stable_for: float = 0
|
||||
observation_window = 8 * eviction_threshold
|
||||
consider_stable_when_no_change_for_seconds = 3 * eviction_threshold
|
||||
poll_interval = eviction_threshold / 3
|
||||
|
||||
@@ -16,6 +16,7 @@ from fixtures.neon_fixtures import (
|
||||
NeonEnvBuilder,
|
||||
PgBin,
|
||||
flush_ep_to_pageserver,
|
||||
last_flush_lsn_upload,
|
||||
wait_for_last_flush_lsn,
|
||||
)
|
||||
from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException
|
||||
@@ -576,27 +577,49 @@ def test_compaction_induced_by_detaches_in_history(
|
||||
assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, set())
|
||||
|
||||
|
||||
@pytest.mark.parametrize("sharded", [True, False])
|
||||
@pytest.mark.parametrize("shards_initial_after", [(1, 1), (2, 2), (1, 4)])
|
||||
def test_timeline_ancestor_detach_idempotent_success(
|
||||
neon_env_builder: NeonEnvBuilder, sharded: bool
|
||||
neon_env_builder: NeonEnvBuilder, shards_initial_after: tuple[int, int]
|
||||
):
|
||||
shards = 2 if sharded else 1
|
||||
shards_initial = shards_initial_after[0]
|
||||
shards_after = shards_initial_after[1]
|
||||
|
||||
neon_env_builder.num_pageservers = shards
|
||||
env = neon_env_builder.init_start(initial_tenant_shard_count=shards if sharded else None)
|
||||
neon_env_builder.num_pageservers = shards_after
|
||||
env = neon_env_builder.init_start(
|
||||
initial_tenant_shard_count=shards_initial if shards_initial > 1 else None,
|
||||
initial_tenant_conf={
|
||||
# small checkpointing and compaction targets to ensure we generate many upload operations
|
||||
"checkpoint_distance": 512 * 1024,
|
||||
"compaction_threshold": 1,
|
||||
"compaction_target_size": 512 * 1024,
|
||||
# disable background compaction and GC. We invoke it manually when we want it to happen.
|
||||
"gc_period": "0s",
|
||||
"compaction_period": "0s",
|
||||
},
|
||||
)
|
||||
|
||||
pageservers = dict((int(p.id), p) for p in env.pageservers)
|
||||
|
||||
for ps in pageservers.values():
|
||||
ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
|
||||
|
||||
if sharded:
|
||||
if shards_after > 1:
|
||||
# FIXME: should this be in the neon_env_builder.init_start?
|
||||
env.storage_controller.reconcile_until_idle()
|
||||
client = env.storage_controller.pageserver_api()
|
||||
else:
|
||||
client = env.pageserver.http_client()
|
||||
|
||||
# Write some data so that we have some layers to copy
|
||||
with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as endpoint:
|
||||
endpoint.safe_psql_many(
|
||||
[
|
||||
"CREATE TABLE foo(key serial primary key, t text default 'data_content')",
|
||||
"INSERT INTO foo SELECT FROM generate_series(1,1024)",
|
||||
]
|
||||
)
|
||||
last_flush_lsn_upload(env, endpoint, env.initial_tenant, env.initial_timeline)
|
||||
|
||||
first_branch = env.create_branch("first_branch")
|
||||
|
||||
_ = env.create_branch("second_branch", ancestor_branch_name="first_branch")
|
||||
@@ -607,6 +630,12 @@ def test_timeline_ancestor_detach_idempotent_success(
|
||||
reparented1 = env.create_branch("first_reparented", ancestor_branch_name="main")
|
||||
reparented2 = env.create_branch("second_reparented", ancestor_branch_name="main")
|
||||
|
||||
if shards_after > shards_initial:
|
||||
# Do a shard split
|
||||
# This is a reproducer for https://github.com/neondatabase/neon/issues/9667
|
||||
env.storage_controller.tenant_shard_split(env.initial_tenant, shards_after)
|
||||
env.storage_controller.reconcile_until_idle()
|
||||
|
||||
first_reparenting_response = client.detach_ancestor(env.initial_tenant, first_branch)
|
||||
assert set(first_reparenting_response) == {reparented1, reparented2}
|
||||
|
||||
|
||||
@@ -1506,15 +1506,10 @@ class SafekeeperEnv:
|
||||
port=port.http,
|
||||
auth_token=None,
|
||||
)
|
||||
try:
|
||||
safekeeper_process = start_in_background(
|
||||
cmd, safekeeper_dir, "safekeeper.log", safekeeper_client.check_status
|
||||
)
|
||||
return safekeeper_process
|
||||
except Exception as e:
|
||||
log.error(e)
|
||||
safekeeper_process.kill()
|
||||
raise Exception(f"Failed to start safekepeer as {cmd}, reason: {e}") from e
|
||||
safekeeper_process = start_in_background(
|
||||
cmd, safekeeper_dir, "safekeeper.log", safekeeper_client.check_status
|
||||
)
|
||||
return safekeeper_process
|
||||
|
||||
def get_safekeeper_connstrs(self):
|
||||
assert self.safekeepers is not None, "safekeepers are not initialized"
|
||||
|
||||
@@ -602,7 +602,7 @@ async def run_segment_init_failure(env: NeonEnv):
|
||||
|
||||
sk = env.safekeepers[0]
|
||||
sk_http = sk.http_client()
|
||||
sk_http.configure_failpoints([("sk-write-zeroes", "return")])
|
||||
sk_http.configure_failpoints([("sk-zero-segment", "return")])
|
||||
conn = await ep.connect_async()
|
||||
ep.safe_psql("select pg_switch_wal()") # jump to the segment boundary
|
||||
# next insertion should hang until failpoint is disabled.
|
||||
|
||||
@@ -64,6 +64,7 @@ def test_wal_restore(
|
||||
),
|
||||
str(data_dir),
|
||||
str(port),
|
||||
env.pg_version,
|
||||
]
|
||||
)
|
||||
restored.start()
|
||||
@@ -127,6 +128,7 @@ def test_wal_restore_initdb(
|
||||
),
|
||||
str(data_dir),
|
||||
str(port),
|
||||
env.pg_version,
|
||||
]
|
||||
)
|
||||
restored.start()
|
||||
|
||||
@@ -64,7 +64,7 @@ rand = { version = "0.8", features = ["small_rng"] }
|
||||
regex = { version = "1" }
|
||||
regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] }
|
||||
regex-syntax = { version = "0.8" }
|
||||
reqwest = { version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls", "stream"] }
|
||||
reqwest = { version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls", "rustls-tls-native-roots", "stream"] }
|
||||
rustls = { version = "0.23", default-features = false, features = ["logging", "ring", "std", "tls12"] }
|
||||
scopeguard = { version = "1" }
|
||||
serde = { version = "1", features = ["alloc", "derive"] }
|
||||
|
||||
Reference in New Issue
Block a user