Compare commits

..

1 Commits

Author SHA1 Message Date
Erik Grinaker
c563a25f14 compute_tools: disable fast path for safekeeper sync 2024-10-17 10:35:03 +02:00
212 changed files with 4662 additions and 7337 deletions

View File

@@ -27,7 +27,7 @@ jobs:
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: neondatabase/build-tools:pinned-bookworm
image: neondatabase/build-tools:pinned
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

View File

@@ -124,28 +124,28 @@ jobs:
uses: actions/cache@v4
with:
path: pg_install/v14
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
- name: Cache postgres v15 build
id: cache_pg_15
uses: actions/cache@v4
with:
path: pg_install/v15
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
- name: Cache postgres v16 build
id: cache_pg_16
uses: actions/cache@v4
with:
path: pg_install/v16
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
- name: Cache postgres v17 build
id: cache_pg_17
uses: actions/cache@v4
with:
path: pg_install/v17
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
- name: Build postgres v14
if: steps.cache_pg_14.outputs.cache-hit != 'true'

View File

@@ -83,7 +83,7 @@ jobs:
runs-on: ${{ matrix.RUNNER }}
container:
image: neondatabase/build-tools:pinned-bookworm
image: neondatabase/build-tools:pinned
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -178,7 +178,7 @@ jobs:
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: neondatabase/build-tools:pinned-bookworm
image: neondatabase/build-tools:pinned
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -280,7 +280,7 @@ jobs:
region_id_default=${{ env.DEFAULT_REGION_ID }}
runner_default='["self-hosted", "us-east-2", "x64"]'
runner_azure='["self-hosted", "eastus2", "x64"]'
image_default="neondatabase/build-tools:pinned-bookworm"
image_default="neondatabase/build-tools:pinned"
matrix='{
"pg_version" : [
16
@@ -299,9 +299,9 @@ jobs:
"include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]
}'
@@ -665,7 +665,7 @@ jobs:
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: neondatabase/build-tools:pinned-bookworm
image: neondatabase/build-tools:pinned
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -772,7 +772,7 @@ jobs:
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: neondatabase/build-tools:pinned-bookworm
image: neondatabase/build-tools:pinned
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -877,7 +877,7 @@ jobs:
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: neondatabase/build-tools:pinned-bookworm
image: neondatabase/build-tools:pinned
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

View File

@@ -82,7 +82,7 @@ jobs:
- uses: docker/build-push-action@v6
with:
file: build-tools.Dockerfile
file: Dockerfile.build-tools
context: .
provenance: false
push: true

View File

@@ -683,7 +683,7 @@ jobs:
provenance: false
push: true
pull: true
file: compute/compute-node.Dockerfile
file: compute/Dockerfile.compute-node
cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }}
tags: |
@@ -703,7 +703,7 @@ jobs:
provenance: false
push: true
pull: true
file: compute/compute-node.Dockerfile
file: compute/Dockerfile.compute-node
target: neon-pg-ext-test
cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }}
@@ -728,7 +728,7 @@ jobs:
provenance: false
push: true
pull: true
file: compute/compute-node.Dockerfile
file: compute/Dockerfile.compute-node
cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-tools-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }}
tags: |

View File

@@ -31,7 +31,7 @@ jobs:
id: get-build-tools-tag
env:
IMAGE_TAG: |
${{ hashFiles('build-tools.Dockerfile',
${{ hashFiles('Dockerfile.build-tools',
'.github/workflows/check-build-tools-image.yml',
'.github/workflows/build-build-tools-image.yml') }}
run: |

View File

@@ -31,7 +31,7 @@ jobs:
runs-on: us-east-2
container:
image: neondatabase/build-tools:pinned-bookworm
image: neondatabase/build-tools:pinned
options: --init
steps:

View File

@@ -112,7 +112,7 @@ jobs:
# This isn't exhaustive, just the paths that are most directly compute-related.
# For example, compute_ctl also depends on libs/utils, but we don't trigger
# an e2e run on that.
vendor/*|pgxn/*|compute_tools/*|libs/vm_monitor/*|compute/compute-node.Dockerfile)
vendor/*|pgxn/*|compute_tools/*|libs/vm_monitor/*|compute/Dockerfile.compute-node)
platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
;;
*)

235
Cargo.lock generated
View File

@@ -148,9 +148,9 @@ dependencies = [
[[package]]
name = "asn1-rs"
version = "0.6.2"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5493c3bedbacf7fd7382c6346bbd66687d12bbaad3a89a2d2c303ee6cf20b048"
checksum = "7f6fd5ddaf0351dff5b8da21b2fb4ff8e08ddd02857f0bf69c47639106c0fff0"
dependencies = [
"asn1-rs-derive",
"asn1-rs-impl",
@@ -164,25 +164,25 @@ dependencies = [
[[package]]
name = "asn1-rs-derive"
version = "0.5.1"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "965c2d33e53cb6b267e148a4cb0760bc01f4904c1cd4bb4002a085bb016d1490"
checksum = "726535892e8eae7e70657b4c8ea93d26b8553afb1ce617caee529ef96d7dee6c"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.52",
"syn 1.0.109",
"synstructure",
]
[[package]]
name = "asn1-rs-impl"
version = "0.2.0"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b18050c2cd6fe86c3a76584ef5e0baf286d038cda203eb6223df2cc413565f7"
checksum = "2777730b2039ac0f95f093556e61b6d26cebed5393ca6f152717777cec3a42ed"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.52",
"syn 1.0.109",
]
[[package]]
@@ -310,33 +310,6 @@ dependencies = [
"zeroize",
]
[[package]]
name = "aws-lc-rs"
version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2f95446d919226d587817a7d21379e6eb099b97b45110a7f272a444ca5c54070"
dependencies = [
"aws-lc-sys",
"mirai-annotations",
"paste",
"zeroize",
]
[[package]]
name = "aws-lc-sys"
version = "0.21.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b3ddc4a5b231dd6958b140ff3151b6412b3f4321fab354f399eec8f14b06df62"
dependencies = [
"bindgen 0.69.5",
"cc",
"cmake",
"dunce",
"fs_extra",
"libc",
"paste",
]
[[package]]
name = "aws-runtime"
version = "1.4.3"
@@ -622,7 +595,7 @@ dependencies = [
"once_cell",
"pin-project-lite",
"pin-utils",
"rustls 0.21.12",
"rustls 0.21.11",
"tokio",
"tracing",
]
@@ -942,29 +915,6 @@ dependencies = [
"serde",
]
[[package]]
name = "bindgen"
version = "0.69.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088"
dependencies = [
"bitflags 2.4.1",
"cexpr",
"clang-sys",
"itertools 0.10.5",
"lazy_static",
"lazycell",
"log",
"prettyplease",
"proc-macro2",
"quote",
"regex",
"rustc-hash",
"shlex",
"syn 2.0.52",
"which",
]
[[package]]
name = "bindgen"
version = "0.70.1"
@@ -974,7 +924,7 @@ dependencies = [
"bitflags 2.4.1",
"cexpr",
"clang-sys",
"itertools 0.10.5",
"itertools 0.12.1",
"log",
"prettyplease",
"proc-macro2",
@@ -1088,13 +1038,12 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
[[package]]
name = "cc"
version = "1.1.30"
version = "1.0.83"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b16803a61b81d9eabb7eae2588776c4c1e584b738ede45fdbb4c972cec1e9945"
checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0"
dependencies = [
"jobserver",
"libc",
"shlex",
]
[[package]]
@@ -1220,15 +1169,6 @@ version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b"
[[package]]
name = "cmake"
version = "0.1.51"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb1e43aa7fd152b1f968787f7dbcdeb306d1867ff373c69955211876c053f91a"
dependencies = [
"cc",
]
[[package]]
name = "colorchoice"
version = "1.0.0"
@@ -1684,9 +1624,9 @@ dependencies = [
[[package]]
name = "der-parser"
version = "9.0.0"
version = "8.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5cd0a5c643689626bec213c4d8bd4d96acc8ffdb4ad4bb6bc16abf27d5f4b553"
checksum = "dbd676fbbab537128ef0278adb5576cf363cff6aa22a7b24effe97347cfab61e"
dependencies = [
"asn1-rs",
"displaydoc",
@@ -1815,12 +1755,6 @@ dependencies = [
"syn 2.0.52",
]
[[package]]
name = "dunce"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813"
[[package]]
name = "dyn-clone"
version = "1.0.14"
@@ -2125,12 +2059,6 @@ dependencies = [
"tokio-util",
]
[[package]]
name = "fs_extra"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
[[package]]
name = "fsevent-sys"
version = "4.1.0"
@@ -2484,15 +2412,6 @@ dependencies = [
"digest",
]
[[package]]
name = "home"
version = "0.5.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5"
dependencies = [
"windows-sys 0.52.0",
]
[[package]]
name = "hostname"
version = "0.4.0"
@@ -2662,7 +2581,7 @@ dependencies = [
"http 0.2.9",
"hyper 0.14.30",
"log",
"rustls 0.21.12",
"rustls 0.21.11",
"rustls-native-certs 0.6.2",
"tokio",
"tokio-rustls 0.24.0",
@@ -2882,9 +2801,9 @@ checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
[[package]]
name = "jobserver"
version = "0.1.32"
version = "0.1.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0"
checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2"
dependencies = [
"libc",
]
@@ -2988,12 +2907,6 @@ dependencies = [
"spin",
]
[[package]]
name = "lazycell"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
[[package]]
name = "libc"
version = "0.2.150"
@@ -3224,12 +3137,6 @@ dependencies = [
"windows-sys 0.48.0",
]
[[package]]
name = "mirai-annotations"
version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c9be0862c1b3f26a88803c4a49de6889c10e608b3ee9344e6ef5b45fb37ad3d1"
[[package]]
name = "multimap"
version = "0.8.3"
@@ -3449,9 +3356,9 @@ dependencies = [
[[package]]
name = "oid-registry"
version = "0.7.1"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8d8034d9489cdaf79228eb9f6a3b8d7bb32ba00d6645ebd48eef4077ceb5bd9"
checksum = "9bedf36ffb6ba96c2eb7144ef6270557b52e54b20c0a8e1eb2ff99a6c6959bff"
dependencies = [
"asn1-rs",
]
@@ -3749,7 +3656,6 @@ dependencies = [
"tracing",
"url",
"utils",
"wal_decoder",
"walkdir",
"workspace_hack",
]
@@ -4147,14 +4053,14 @@ dependencies = [
"bytes",
"once_cell",
"pq_proto",
"rustls 0.23.7",
"rustls 0.22.4",
"rustls-pemfile 2.1.1",
"serde",
"thiserror",
"tokio",
"tokio-postgres",
"tokio-postgres-rustls",
"tokio-rustls 0.26.0",
"tokio-rustls 0.25.0",
"tokio-util",
"tracing",
]
@@ -4176,7 +4082,7 @@ name = "postgres_ffi"
version = "0.1.0"
dependencies = [
"anyhow",
"bindgen 0.70.1",
"bindgen",
"bytes",
"crc32c",
"env_logger",
@@ -4187,7 +4093,6 @@ dependencies = [
"regex",
"serde",
"thiserror",
"tracing",
"utils",
]
@@ -4314,7 +4219,7 @@ checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15"
dependencies = [
"bytes",
"heck 0.5.0",
"itertools 0.10.5",
"itertools 0.12.1",
"log",
"multimap",
"once_cell",
@@ -4334,7 +4239,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e9552f850d5f0964a4e4d0bf306459ac29323ddfbae05e35a7c0d35cb0803cc5"
dependencies = [
"anyhow",
"itertools 0.10.5",
"itertools 0.12.1",
"proc-macro2",
"quote",
"syn 2.0.52",
@@ -4422,8 +4327,8 @@ dependencies = [
"rsa",
"rstest",
"rustc-hash",
"rustls 0.23.7",
"rustls-native-certs 0.8.0",
"rustls 0.22.4",
"rustls-native-certs 0.7.0",
"rustls-pemfile 2.1.1",
"scopeguard",
"serde",
@@ -4440,7 +4345,7 @@ dependencies = [
"tokio",
"tokio-postgres",
"tokio-postgres-rustls",
"tokio-rustls 0.26.0",
"tokio-rustls 0.25.0",
"tokio-tungstenite",
"tokio-util",
"tracing",
@@ -4604,13 +4509,12 @@ dependencies = [
[[package]]
name = "rcgen"
version = "0.13.1"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "54077e1872c46788540de1ea3d7f4ccb1983d12f9aa909b234468676c1a36779"
checksum = "48406db8ac1f3cbc7dcdb56ec355343817958a356ff430259bb07baf7607e1e1"
dependencies = [
"pem",
"ring",
"rustls-pki-types",
"time",
"yasna",
]
@@ -4789,7 +4693,7 @@ dependencies = [
"once_cell",
"percent-encoding",
"pin-project-lite",
"rustls 0.21.12",
"rustls 0.21.11",
"rustls-pemfile 1.0.2",
"serde",
"serde_json",
@@ -5087,9 +4991,9 @@ dependencies = [
[[package]]
name = "rustls"
version = "0.21.12"
version = "0.21.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f56a14d1f48b391359b22f731fd4bd7e43c97f3c50eee276f3aa09c94784d3e"
checksum = "7fecbfb7b1444f477b345853b1fce097a2c6fb637b2bfb87e6bc5db0f043fae4"
dependencies = [
"log",
"ring",
@@ -5117,7 +5021,6 @@ version = "0.23.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ebbbdb961df0ad3f2652da8f3fdc4b36122f568f968f45ad3316f26c025c677b"
dependencies = [
"aws-lc-rs",
"log",
"once_cell",
"ring",
@@ -5186,9 +5089,9 @@ dependencies = [
[[package]]
name = "rustls-pki-types"
version = "1.10.0"
version = "1.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "16f1201b3c9a7ee8039bcadc17b7e605e2945b27eee7631788c1bd2b0643674b"
checksum = "5ede67b28608b4c60685c7d54122d4400d90f62b40caee7700e700380a390fa8"
[[package]]
name = "rustls-webpki"
@@ -5206,7 +5109,6 @@ version = "0.102.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "faaa0a62740bedb9b2ef5afa303da42764c012f743917351dc9a237ea1663610"
dependencies = [
"aws-lc-rs",
"ring",
"rustls-pki-types",
"untrusted",
@@ -5410,7 +5312,7 @@ checksum = "00421ed8fa0c995f07cde48ba6c89e80f2b312f74ff637326f392fbfd23abe02"
dependencies = [
"httpdate",
"reqwest 0.12.4",
"rustls 0.21.12",
"rustls 0.21.11",
"sentry-backtrace",
"sentry-contexts",
"sentry-core",
@@ -5905,8 +5807,8 @@ dependencies = [
"postgres_ffi",
"remote_storage",
"reqwest 0.12.4",
"rustls 0.23.7",
"rustls-native-certs 0.8.0",
"rustls 0.22.4",
"rustls-native-certs 0.7.0",
"serde",
"serde_json",
"storage_controller_client",
@@ -6028,13 +5930,14 @@ checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394"
[[package]]
name = "synstructure"
version = "0.13.1"
version = "0.12.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971"
checksum = "f36bdaa60a83aca3921b5259d5400cbf5e90fc51931376a9bd4a0eb79aa7210f"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.52",
"syn 1.0.109",
"unicode-xid",
]
[[package]]
@@ -6333,15 +6236,16 @@ dependencies = [
[[package]]
name = "tokio-postgres-rustls"
version = "0.12.0"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04fb792ccd6bbcd4bba408eb8a292f70fc4a3589e5d793626f45190e6454b6ab"
checksum = "0ea13f22eda7127c827983bdaf0d7fff9df21c8817bab02815ac277a21143677"
dependencies = [
"futures",
"ring",
"rustls 0.23.7",
"rustls 0.22.4",
"tokio",
"tokio-postgres",
"tokio-rustls 0.26.0",
"tokio-rustls 0.25.0",
"x509-certificate",
]
@@ -6351,7 +6255,7 @@ version = "0.24.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5"
dependencies = [
"rustls 0.21.12",
"rustls 0.21.11",
"tokio",
]
@@ -6774,15 +6678,16 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
[[package]]
name = "ureq"
version = "2.10.1"
version = "2.9.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b74fc6b57825be3373f7054754755f03ac3a8f5d70015ccad699ba2029956f4a"
checksum = "d11a831e3c0b56e438a28308e7c810799e3c118417f342d30ecec080105395cd"
dependencies = [
"base64 0.22.1",
"log",
"once_cell",
"rustls 0.23.7",
"rustls 0.22.4",
"rustls-pki-types",
"rustls-webpki 0.102.2",
"url",
"webpki-roots 0.26.1",
]
@@ -6956,19 +6861,6 @@ dependencies = [
"utils",
]
[[package]]
name = "wal_decoder"
version = "0.1.0"
dependencies = [
"anyhow",
"bytes",
"pageserver_api",
"postgres_ffi",
"serde",
"tracing",
"utils",
]
[[package]]
name = "walkdir"
version = "2.3.3"
@@ -6984,7 +6876,7 @@ name = "walproposer"
version = "0.1.0"
dependencies = [
"anyhow",
"bindgen 0.70.1",
"bindgen",
"postgres_ffi",
"utils",
]
@@ -7159,18 +7051,6 @@ dependencies = [
"rustls-pki-types",
]
[[package]]
name = "which"
version = "4.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7"
dependencies = [
"either",
"home",
"once_cell",
"rustix",
]
[[package]]
name = "whoami"
version = "1.5.1"
@@ -7415,6 +7295,7 @@ dependencies = [
"digest",
"either",
"fail",
"futures",
"futures-channel",
"futures-executor",
"futures-io",
@@ -7430,7 +7311,7 @@ dependencies = [
"hyper-util",
"indexmap 1.9.3",
"indexmap 2.0.1",
"itertools 0.10.5",
"itertools 0.12.1",
"lazy_static",
"libc",
"log",
@@ -7451,8 +7332,6 @@ dependencies = [
"regex-automata 0.4.3",
"regex-syntax 0.8.2",
"reqwest 0.12.4",
"rustls 0.23.7",
"rustls-webpki 0.102.2",
"scopeguard",
"serde",
"serde_json",
@@ -7461,6 +7340,7 @@ dependencies = [
"smallvec",
"spki 0.7.3",
"subtle",
"syn 1.0.109",
"syn 2.0.52",
"sync_wrapper 0.1.2",
"tikv-jemalloc-sys",
@@ -7468,7 +7348,6 @@ dependencies = [
"time-macros",
"tokio",
"tokio-postgres",
"tokio-rustls 0.26.0",
"tokio-stream",
"tokio-util",
"toml_edit",
@@ -7504,9 +7383,9 @@ dependencies = [
[[package]]
name = "x509-parser"
version = "0.16.0"
version = "0.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fcbc162f30700d6f3f82a24bf7cc62ffe7caea42c0b2cba8bf7f3ae50cf51f69"
checksum = "bab0c2f54ae1d92f4fcb99c0b7ccf0b1e3451cbd395e5f115ccbdbcb18d4f634"
dependencies = [
"asn1-rs",
"data-encoding",

View File

@@ -33,7 +33,6 @@ members = [
"libs/postgres_ffi/wal_craft",
"libs/vm_monitor",
"libs/walproposer",
"libs/wal_decoder",
]
[workspace.package]
@@ -143,7 +142,7 @@ reqwest-retry = "0.5"
routerify = "3"
rpds = "0.13"
rustc-hash = "1.1.0"
rustls = "0.23"
rustls = "0.22"
rustls-pemfile = "2"
scopeguard = "1.1"
sysinfo = "0.29.2"
@@ -173,8 +172,8 @@ tikv-jemalloc-ctl = "0.5"
tokio = { version = "1.17", features = ["macros"] }
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
tokio-io-timeout = "1.2.0"
tokio-postgres-rustls = "0.12.0"
tokio-rustls = "0.26"
tokio-postgres-rustls = "0.11.0"
tokio-rustls = "0.25"
tokio-stream = "0.1"
tokio-tar = "0.3"
tokio-util = { version = "0.7.10", features = ["io", "rt"] }
@@ -193,8 +192,8 @@ url = "2.2"
urlencoding = "2.1"
uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
walkdir = "2.3.2"
rustls-native-certs = "0.8"
x509-parser = "0.16"
rustls-native-certs = "0.7"
x509-parser = "0.15"
whoami = "1.5.1"
## TODO replace this with tracing
@@ -239,14 +238,13 @@ tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
utils = { version = "0.1", path = "./libs/utils/" }
vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" }
walproposer = { version = "0.1", path = "./libs/walproposer/" }
wal_decoder = { version = "0.1", path = "./libs/wal_decoder" }
## Common library dependency
workspace_hack = { version = "0.1", path = "./workspace_hack/" }
## Build dependencies
criterion = "0.5.1"
rcgen = "0.13"
rcgen = "0.12"
rstest = "0.18"
camino-tempfile = "1.0.2"
tonic-build = "0.12"

View File

@@ -72,7 +72,7 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/
&& mv s5cmd /usr/local/bin/s5cmd
# LLVM
ENV LLVM_VERSION=19
ENV LLVM_VERSION=18
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
&& echo "deb http://apt.llvm.org/${DEBIAN_VERSION}/ llvm-toolchain-${DEBIAN_VERSION}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
&& apt update \
@@ -99,7 +99,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws
&& rm awscliv2.zip
# Mold: A Modern Linker
ENV MOLD_VERSION=v2.34.1
ENV MOLD_VERSION=v2.33.0
RUN set -e \
&& git clone https://github.com/rui314/mold.git \
&& mkdir mold/build \
@@ -142,7 +142,7 @@ RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/sourc
# Use the same version of libicu as the compute nodes so that
# clusters created using inidb on pageserver can be used by computes.
#
# TODO: at this time, compute-node.Dockerfile uses the debian bullseye libicu
# TODO: at this time, Dockerfile.compute-node uses the debian bullseye libicu
# package, which is 67.1. We're duplicating that knowledge here, and also, technically,
# Debian has a few patches on top of 67.1 that we're not adding here.
ENV ICU_VERSION=67.1
@@ -192,7 +192,7 @@ WORKDIR /home/nonroot
# Rust
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
ENV RUSTC_VERSION=1.82.0
ENV RUSTC_VERSION=1.81.0
ENV RUSTUP_HOME="/home/nonroot/.rustup"
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
ARG RUSTFILT_VERSION=0.2.1

View File

@@ -297,7 +297,7 @@ clean: postgres-clean neon-pg-clean-ext
# This removes everything
.PHONY: distclean
distclean:
$(RM) -r $(POSTGRES_INSTALL_DIR)
rm -rf $(POSTGRES_INSTALL_DIR)
$(CARGO_CMD_PREFIX) cargo clean
.PHONY: fmt
@@ -329,7 +329,7 @@ postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list
$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/pgindent --typedefs postgres-$*-typedefs-full.list \
$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/ \
--excludes $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/exclude_file_patterns
$(RM) pg*.BAK
rm -f pg*.BAK
# Indent pxgn/neon.
.PHONY: neon-pgindent

View File

@@ -353,10 +353,13 @@ COPY compute/patches/pgvector.patch /pgvector.patch
# because we build the images on different machines than where we run them.
# Pass OPTFLAGS="" to remove it.
#
# vector 0.7.4 supports v17
# last release v0.7.4 - Aug 5, 2024
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.4.tar.gz -O pgvector.tar.gz && \
echo "0341edf89b1924ae0d552f617e14fb7f8867c0194ed775bcc44fa40288642583 pgvector.tar.gz" | sha256sum --check && \
# v17 is not supported yet because of upstream issue
# https://github.com/pgvector/pgvector/issues/669
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \
echo "617fba855c9bcb41a2a9bc78a78567fd2e147c72afd5bf9d37b31b9591632b30 pgvector.tar.gz" | sha256sum --check && \
mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \
patch -p1 < /pgvector.patch && \
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -975,8 +978,8 @@ ARG PG_VERSION
RUN case "${PG_VERSION}" in "v17") \
echo "pg_session_jwt does not yet have a release that supports pg17" && exit 0;; \
esac && \
wget https://github.com/neondatabase/pg_session_jwt/archive/e1310b08ba51377a19e0559e4d1194883b9b2ba2.tar.gz -O pg_session_jwt.tar.gz && \
echo "837932a077888d5545fd54b0abcc79e5f8e37017c2769a930afc2f5c94df6f4e pg_session_jwt.tar.gz" | sha256sum --check && \
wget https://github.com/neondatabase/pg_session_jwt/archive/5aee2625af38213650e1a07ae038fdc427250ee4.tar.gz -O pg_session_jwt.tar.gz && \
echo "5d91b10bc1347d36cffc456cb87bec25047935d6503dc652ca046f04760828e7 pg_session_jwt.tar.gz" | sha256sum --check && \
mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \
sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgrx install --release

View File

@@ -20,21 +20,19 @@ neon_collector_autoscaling.yml: $(jsonnet_files)
sql_exporter.yml: $(jsonnet_files)
JSONNET_PATH=etc jsonnet \
--output-file etc/$@ \
--tla-str collector_name=neon_collector \
--tla-str collector_file=neon_collector.yml \
etc/sql_exporter.jsonnet
sql_exporter_autoscaling.yml: $(jsonnet_files)
JSONNET_PATH=etc jsonnet \
--output-file etc/$@ \
--tla-str collector_name=neon_collector_autoscaling \
--tla-str collector_file=neon_collector_autoscaling.yml \
--tla-str application_name=sql_exporter_autoscaling \
etc/sql_exporter.jsonnet
.PHONY: clean
clean:
$(RM) \
rm --force \
etc/neon_collector.yml \
etc/neon_collector_autoscaling.yml \
etc/sql_exporter.yml \

View File

@@ -1,7 +1,7 @@
This directory contains files that are needed to build the compute
images, or included in the compute images.
compute-node.Dockerfile
Dockerfile.compute-node
To build the compute image
vm-image-spec.yaml
@@ -14,8 +14,8 @@ etc/
patches/
Some extensions need to be patched to work with Neon. This
directory contains such patches. They are applied to the extension
sources in compute-node.Dockerfile
sources in Dockerfile.compute-node
In addition to these, postgres itself, the neon postgres extension,
and compute_ctl are built and copied into the compute image by
compute-node.Dockerfile.
Dockerfile.compute-node.

View File

@@ -1,4 +1,4 @@
function(collector_name, collector_file, application_name='sql_exporter') {
function(collector_file, application_name='sql_exporter') {
// Configuration for sql_exporter for autoscaling-agent
// Global defaults.
global: {
@@ -28,7 +28,7 @@ function(collector_name, collector_file, application_name='sql_exporter') {
// Collectors (referenced by name) to execute on the target.
// Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
collectors: [
collector_name,
'neon_collector',
],
},

View File

@@ -1,7 +1,7 @@
local neon = import 'neon.libsonnet';
local pg_stat_bgwriter = importstr 'sql_exporter/checkpoints_timed.sql';
local pg_stat_checkpointer = importstr 'sql_exporter/checkpoints_timed.17.sql';
local pg_stat_bgwriter = importstr 'sql_exporter/checkpoints_req.sql';
local pg_stat_checkpointer = importstr 'sql_exporter/checkpoints_req.17.sql';
{
metric_name: 'checkpoints_timed',

View File

@@ -1,10 +1,5 @@
SELECT
slot_name,
pg_wal_lsn_diff(
CASE
WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn()
ELSE pg_current_wal_lsn()
END,
restart_lsn)::FLOAT8 AS retained_wal
pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal
FROM pg_replication_slots
WHERE active = false;

View File

@@ -15,7 +15,6 @@ use std::time::Instant;
use anyhow::{Context, Result};
use chrono::{DateTime, Utc};
use compute_api::spec::PgIdent;
use futures::future::join_all;
use futures::stream::FuturesUnordered;
use futures::StreamExt;
@@ -26,9 +25,8 @@ use tracing::{debug, error, info, instrument, warn};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use compute_api::privilege::Privilege;
use compute_api::responses::{ComputeMetrics, ComputeStatus};
use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec, ExtVersion};
use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec};
use utils::measured_stream::MeasuredReader;
use nix::sys::signal::{kill, Signal};
@@ -36,7 +34,6 @@ use nix::sys::signal::{kill, Signal};
use remote_storage::{DownloadError, RemotePath};
use crate::checker::create_availability_check_data;
use crate::installed_extensions::get_installed_extensions_sync;
use crate::local_proxy;
use crate::logger::inlinify;
use crate::pg_helpers::*;
@@ -529,6 +526,10 @@ impl ComputeNode {
// Fast path for sync_safekeepers. If they're already synced we get the lsn
// in one roundtrip. If not, we should do a full sync_safekeepers.
//
// NB: currently unused, see: https://github.com/neondatabase/neon/issues/9259. Consider
// removing this, or improving it e.g. by using a different protocol, bumping the safekeeper
// term, or flushing the Safekeeper WAL on STATUS_TIMELINE.
pub fn check_safekeepers_synced(&self, compute_state: &ComputeState) -> Result<Option<Lsn>> {
let start_time = Utc::now();
@@ -632,14 +633,10 @@ impl ComputeNode {
// cannot sync safekeepers.
let lsn = match spec.mode {
ComputeMode::Primary => {
info!("checking if safekeepers are synced");
let lsn = if let Ok(Some(lsn)) = self.check_safekeepers_synced(compute_state) {
lsn
} else {
info!("starting safekeepers syncing");
self.sync_safekeepers(pspec.storage_auth_token.clone())
.with_context(|| "failed to sync safekeepers")?
};
info!("starting safekeepers syncing");
let lsn = self
.sync_safekeepers(pspec.storage_auth_token.clone())
.with_context(|| "failed to sync safekeepers")?;
info!("safekeepers synced at LSN {}", lsn);
lsn
}
@@ -1124,11 +1121,6 @@ impl ComputeNode {
self.pg_reload_conf()?;
}
self.post_apply_config()?;
let connstr = self.connstr.clone();
thread::spawn(move || {
get_installed_extensions_sync(connstr).context("get_installed_extensions")
});
}
let startup_end_time = Utc::now();
@@ -1375,97 +1367,6 @@ LIMIT 100",
download_size
}
pub async fn set_role_grants(
&self,
db_name: &PgIdent,
schema_name: &PgIdent,
privileges: &[Privilege],
role_name: &PgIdent,
) -> Result<()> {
use tokio_postgres::config::Config;
use tokio_postgres::NoTls;
let mut conf = Config::from_str(self.connstr.as_str()).unwrap();
conf.dbname(db_name);
let (db_client, conn) = conf
.connect(NoTls)
.await
.context("Failed to connect to the database")?;
tokio::spawn(conn);
// TODO: support other types of grants apart from schemas?
let query = format!(
"GRANT {} ON SCHEMA {} TO {}",
privileges
.iter()
// should not be quoted as it's part of the command.
// is already sanitized so it's ok
.map(|p| p.as_str())
.collect::<Vec<&'static str>>()
.join(", "),
// quote the schema and role name as identifiers to sanitize them.
schema_name.pg_quote(),
role_name.pg_quote(),
);
db_client
.simple_query(&query)
.await
.with_context(|| format!("Failed to execute query: {}", query))?;
Ok(())
}
pub async fn install_extension(
&self,
ext_name: &PgIdent,
db_name: &PgIdent,
ext_version: ExtVersion,
) -> Result<ExtVersion> {
use tokio_postgres::config::Config;
use tokio_postgres::NoTls;
let mut conf = Config::from_str(self.connstr.as_str()).unwrap();
conf.dbname(db_name);
let (db_client, conn) = conf
.connect(NoTls)
.await
.context("Failed to connect to the database")?;
tokio::spawn(conn);
let version_query = "SELECT extversion FROM pg_extension WHERE extname = $1";
let version: Option<ExtVersion> = db_client
.query_opt(version_query, &[&ext_name])
.await
.with_context(|| format!("Failed to execute query: {}", version_query))?
.map(|row| row.get(0));
// sanitize the inputs as postgres idents.
let ext_name: String = ext_name.pg_quote();
let quoted_version: String = ext_version.pg_quote();
if let Some(installed_version) = version {
if installed_version == ext_version {
return Ok(installed_version);
}
let query = format!("ALTER EXTENSION {ext_name} UPDATE TO {quoted_version}");
db_client
.simple_query(&query)
.await
.with_context(|| format!("Failed to execute query: {}", query))?;
} else {
let query =
format!("CREATE EXTENSION IF NOT EXISTS {ext_name} WITH VERSION {quoted_version}");
db_client
.simple_query(&query)
.await
.with_context(|| format!("Failed to execute query: {}", query))?;
}
Ok(ext_version)
}
#[tokio::main]
pub async fn prepare_preload_libraries(
&self,
@@ -1583,6 +1484,28 @@ LIMIT 100",
info!("Pageserver config changed");
}
}
// Gather info about installed extensions
pub fn get_installed_extensions(&self) -> Result<()> {
let connstr = self.connstr.clone();
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.expect("failed to create runtime");
let result = rt
.block_on(crate::installed_extensions::get_installed_extensions(
connstr,
))
.expect("failed to get installed extensions");
info!(
"{}",
serde_json::to_string(&result).expect("failed to serialize extensions list")
);
Ok(())
}
}
pub fn forward_termination_signal() {

View File

@@ -107,7 +107,7 @@ pub fn get_pg_version(pgbin: &str) -> String {
// pg_config --version returns a (platform specific) human readable string
// such as "PostgreSQL 15.4". We parse this to v14/v15/v16 etc.
let human_version = get_pg_config("--version", pgbin);
parse_pg_version(&human_version).to_string()
return parse_pg_version(&human_version).to_string();
}
fn parse_pg_version(human_version: &str) -> &str {

View File

@@ -9,11 +9,8 @@ use crate::catalog::SchemaDumpError;
use crate::catalog::{get_database_schema, get_dbs_and_roles};
use crate::compute::forward_termination_signal;
use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
use compute_api::requests::{ConfigurationRequest, ExtensionInstallRequest, SetRoleGrantsRequest};
use compute_api::responses::{
ComputeStatus, ComputeStatusResponse, ExtensionInstallResult, GenericAPIError,
SetRoleGrantsResponse,
};
use compute_api::requests::ConfigurationRequest;
use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
use anyhow::Result;
use hyper::header::CONTENT_TYPE;
@@ -101,38 +98,6 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
}
}
(&Method::POST, "/extensions") => {
info!("serving /extensions POST request");
let status = compute.get_status();
if status != ComputeStatus::Running {
let msg = format!(
"invalid compute status for extensions request: {:?}",
status
);
error!(msg);
return render_json_error(&msg, StatusCode::PRECONDITION_FAILED);
}
let request = hyper::body::to_bytes(req.into_body()).await.unwrap();
let request = serde_json::from_slice::<ExtensionInstallRequest>(&request).unwrap();
let res = compute
.install_extension(&request.extension, &request.database, request.version)
.await;
match res {
Ok(version) => render_json(Body::from(
serde_json::to_string(&ExtensionInstallResult {
extension: request.extension,
version,
})
.unwrap(),
)),
Err(e) => {
error!("install_extension failed: {}", e);
render_json_error(&e.to_string(), StatusCode::INTERNAL_SERVER_ERROR)
}
}
}
(&Method::GET, "/info") => {
let num_cpus = num_cpus::get_physical();
info!("serving /info GET request. num_cpus: {}", num_cpus);
@@ -200,48 +165,6 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
}
}
(&Method::POST, "/grants") => {
info!("serving /grants POST request");
let status = compute.get_status();
if status != ComputeStatus::Running {
let msg = format!(
"invalid compute status for set_role_grants request: {:?}",
status
);
error!(msg);
return render_json_error(&msg, StatusCode::PRECONDITION_FAILED);
}
let request = hyper::body::to_bytes(req.into_body()).await.unwrap();
let request = serde_json::from_slice::<SetRoleGrantsRequest>(&request).unwrap();
let res = compute
.set_role_grants(
&request.database,
&request.schema,
&request.privileges,
&request.role,
)
.await;
match res {
Ok(()) => render_json(Body::from(
serde_json::to_string(&SetRoleGrantsResponse {
database: request.database,
schema: request.schema,
role: request.role,
privileges: request.privileges,
})
.unwrap(),
)),
Err(e) => render_json_error(
&format!("could not grant role privileges to the schema: {e}"),
// TODO: can we filter on role/schema not found errors
// and return appropriate error code?
StatusCode::INTERNAL_SERVER_ERROR,
),
}
}
// get the list of installed extensions
// currently only used in python tests
// TODO: call it from cplane

View File

@@ -127,41 +127,6 @@ paths:
schema:
$ref: "#/components/schemas/GenericError"
/grants:
post:
tags:
- Grants
summary: Apply grants to the database.
description: ""
operationId: setRoleGrants
requestBody:
description: Grants request.
required: true
content:
application/json:
schema:
$ref: "#/components/schemas/SetRoleGrantsRequest"
responses:
200:
description: Grants applied.
content:
application/json:
schema:
$ref: "#/components/schemas/SetRoleGrantsResponse"
412:
description: |
Compute is not in the right state for processing the request.
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
500:
description: Error occurred during grants application.
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
/check_writability:
post:
tags:
@@ -179,41 +144,6 @@ paths:
description: Error text or 'true' if check passed.
example: "true"
/extensions:
post:
tags:
- Extensions
summary: Install extension if possible.
description: ""
operationId: installExtension
requestBody:
description: Extension name and database to install it to.
required: true
content:
application/json:
schema:
$ref: "#/components/schemas/ExtensionInstallRequest"
responses:
200:
description: Result from extension installation
content:
application/json:
schema:
$ref: "#/components/schemas/ExtensionInstallResult"
412:
description: |
Compute is in the wrong state for processing the request.
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
500:
description: Error during extension installation.
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
/configure:
post:
tags:
@@ -439,7 +369,7 @@ components:
moment, when spec was received.
example: "2022-10-12T07:20:50.52Z"
status:
$ref: "#/components/schemas/ComputeStatus"
$ref: '#/components/schemas/ComputeStatus'
last_active:
type: string
description: |
@@ -479,38 +409,6 @@ components:
- configuration
example: running
ExtensionInstallRequest:
type: object
required:
- extension
- database
- version
properties:
extension:
type: string
description: Extension name.
example: "pg_session_jwt"
version:
type: string
description: Version of the extension.
example: "1.0.0"
database:
type: string
description: Database name.
example: "neondb"
ExtensionInstallResult:
type: object
properties:
extension:
description: Name of the extension.
type: string
example: "pg_session_jwt"
version:
description: Version of the extension.
type: string
example: "1.0.0"
InstalledExtensions:
type: object
properties:
@@ -529,60 +427,6 @@ components:
n_databases:
type: integer
SetRoleGrantsRequest:
type: object
required:
- database
- schema
- privileges
- role
properties:
database:
type: string
description: Database name.
example: "neondb"
schema:
type: string
description: Schema name.
example: "public"
privileges:
type: array
items:
type: string
description: List of privileges to set.
example: ["SELECT", "INSERT"]
role:
type: string
description: Role name.
example: "neon"
SetRoleGrantsResponse:
type: object
required:
- database
- schema
- privileges
- role
properties:
database:
type: string
description: Database name.
example: "neondb"
schema:
type: string
description: Schema name.
example: "public"
privileges:
type: array
items:
type: string
description: List of privileges set.
example: ["SELECT", "INSERT"]
role:
type: string
description: Role name.
example: "neon"
#
# Errors
#

View File

@@ -1,7 +1,6 @@
use compute_api::responses::{InstalledExtension, InstalledExtensions};
use std::collections::HashMap;
use std::collections::HashSet;
use tracing::info;
use url::Url;
use anyhow::Result;
@@ -80,23 +79,3 @@ pub async fn get_installed_extensions(connstr: Url) -> Result<InstalledExtension
})
.await?
}
// Gather info about installed extensions
pub fn get_installed_extensions_sync(connstr: Url) -> Result<()> {
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.expect("failed to create runtime");
let result = rt
.block_on(crate::installed_extensions::get_installed_extensions(
connstr,
))
.expect("failed to get installed extensions");
info!(
"[NEON_EXT_STAT] {}",
serde_json::to_string(&result).expect("failed to serialize extensions list")
);
Ok(())
}

View File

@@ -20,16 +20,7 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt;
use postgres_backend::AuthType;
use reqwest::Method;
use serde::{de::DeserializeOwned, Deserialize, Serialize};
use std::{
ffi::OsStr,
fs,
net::SocketAddr,
path::PathBuf,
process::ExitStatus,
str::FromStr,
sync::OnceLock,
time::{Duration, Instant},
};
use std::{fs, net::SocketAddr, path::PathBuf, str::FromStr, sync::OnceLock};
use tokio::process::Command;
use tracing::instrument;
use url::Url;
@@ -177,6 +168,16 @@ impl StorageController {
.expect("non-Unicode path")
}
/// PIDFile for the postgres instance used to store storage controller state
fn postgres_pid_file(&self) -> Utf8PathBuf {
Utf8PathBuf::from_path_buf(
self.env
.base_data_dir
.join("storage_controller_postgres.pid"),
)
.expect("non-Unicode path")
}
/// Find the directory containing postgres subdirectories, such `bin` and `lib`
///
/// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back
@@ -295,31 +296,6 @@ impl StorageController {
.map_err(anyhow::Error::new)
}
/// Wrapper for the pg_ctl binary, which we spawn as a short-lived subprocess when starting and stopping postgres
async fn pg_ctl<I, S>(&self, args: I) -> ExitStatus
where
I: IntoIterator<Item = S>,
S: AsRef<OsStr>,
{
let pg_bin_dir = self.get_pg_bin_dir().await.unwrap();
let bin_path = pg_bin_dir.join("pg_ctl");
let pg_lib_dir = self.get_pg_lib_dir().await.unwrap();
let envs = [
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
];
Command::new(bin_path)
.args(args)
.envs(envs)
.spawn()
.expect("Failed to spawn pg_ctl, binary_missing?")
.wait()
.await
.expect("Failed to wait for pg_ctl termination")
}
pub async fn start(&self, start_args: NeonStorageControllerStartArgs) -> anyhow::Result<()> {
let instance_dir = self.storage_controller_instance_dir(start_args.instance_id);
if let Err(err) = tokio::fs::create_dir(&instance_dir).await {
@@ -428,34 +404,20 @@ impl StorageController {
db_start_args
);
let db_start_status = self.pg_ctl(db_start_args).await;
let start_timeout: Duration = start_args.start_timeout.into();
let db_start_deadline = Instant::now() + start_timeout;
if !db_start_status.success() {
return Err(anyhow::anyhow!(
"Failed to start postgres {}",
db_start_status.code().unwrap()
));
}
loop {
if Instant::now() > db_start_deadline {
return Err(anyhow::anyhow!("Timed out waiting for postgres to start"));
}
match self.pg_isready(&pg_bin_dir, postgres_port).await {
Ok(true) => {
tracing::info!("storage controller postgres is now ready");
break;
}
Ok(false) => {
tokio::time::sleep(Duration::from_millis(100)).await;
}
Err(e) => {
tracing::warn!("Failed to check postgres status: {e}")
}
}
}
background_process::start_process(
"storage_controller_db",
&self.env.base_data_dir,
pg_bin_dir.join("pg_ctl").as_std_path(),
db_start_args,
vec![
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
],
background_process::InitialPidFile::Create(self.postgres_pid_file()),
&start_args.start_timeout,
|| self.pg_isready(&pg_bin_dir, postgres_port),
)
.await?;
self.setup_database(postgres_port).await?;
}
@@ -621,10 +583,15 @@ impl StorageController {
}
let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
let pg_bin_dir = self.get_pg_bin_dir().await?;
println!("Stopping storage controller database...");
let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"];
let stop_status = self.pg_ctl(pg_stop_args).await;
let stop_status = Command::new(pg_bin_dir.join("pg_ctl"))
.args(pg_stop_args)
.spawn()?
.wait()
.await?;
if !stop_status.success() {
match self.is_postgres_running().await {
Ok(false) => {
@@ -645,9 +612,14 @@ impl StorageController {
async fn is_postgres_running(&self) -> anyhow::Result<bool> {
let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
let pg_bin_dir = self.get_pg_bin_dir().await?;
let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
let status_exitcode = self.pg_ctl(pg_status_args).await;
let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
.args(pg_status_args)
.spawn()?
.wait()
.await?;
// pg_ctl status returns this exit code if postgres is not running: in this case it is
// fine that stop failed. Otherwise it is an error that stop failed.

View File

@@ -5,7 +5,7 @@
Currently we build two main images:
- [neondatabase/neon](https://hub.docker.com/repository/docker/neondatabase/neon) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
- [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14. Built from [/compute-node/Dockerfile](/compute/compute-node.Dockerfile).
- [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14. Built from [/compute-node/Dockerfile](/compute/Dockerfile.compute-node).
And additional intermediate image:
@@ -56,7 +56,7 @@ CREATE TABLE
postgres=# insert into t values(1, 1);
INSERT 0 1
postgres=# select * from t;
key | value
key | value
-----+-------
1 | 1
(1 row)
@@ -84,4 +84,4 @@ Access http://localhost:9001 and sign in.
- Username: `minio`
- Password: `password`
You can see durable pages and WAL data in `neon` bucket.
You can see durable pages and WAL data in `neon` bucket.

View File

@@ -1,6 +1,5 @@
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
pub mod privilege;
pub mod requests;
pub mod responses;
pub mod spec;

View File

@@ -1,35 +0,0 @@
#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)]
#[serde(rename_all = "UPPERCASE")]
pub enum Privilege {
Select,
Insert,
Update,
Delete,
Truncate,
References,
Trigger,
Usage,
Create,
Connect,
Temporary,
Execute,
}
impl Privilege {
pub fn as_str(&self) -> &'static str {
match self {
Privilege::Select => "SELECT",
Privilege::Insert => "INSERT",
Privilege::Update => "UPDATE",
Privilege::Delete => "DELETE",
Privilege::Truncate => "TRUNCATE",
Privilege::References => "REFERENCES",
Privilege::Trigger => "TRIGGER",
Privilege::Usage => "USAGE",
Privilege::Create => "CREATE",
Privilege::Connect => "CONNECT",
Privilege::Temporary => "TEMPORARY",
Privilege::Execute => "EXECUTE",
}
}
}

View File

@@ -1,8 +1,6 @@
//! Structs representing the JSON formats used in the compute_ctl's HTTP API.
use crate::{
privilege::Privilege,
spec::{ComputeSpec, ExtVersion, PgIdent},
};
use crate::spec::ComputeSpec;
use serde::Deserialize;
/// Request of the /configure API
@@ -14,18 +12,3 @@ use serde::Deserialize;
pub struct ConfigurationRequest {
pub spec: ComputeSpec,
}
#[derive(Deserialize, Debug)]
pub struct ExtensionInstallRequest {
pub extension: PgIdent,
pub database: PgIdent,
pub version: ExtVersion,
}
#[derive(Deserialize, Debug)]
pub struct SetRoleGrantsRequest {
pub database: PgIdent,
pub schema: PgIdent,
pub privileges: Vec<Privilege>,
pub role: PgIdent,
}

View File

@@ -6,10 +6,7 @@ use std::fmt::Display;
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize, Serializer};
use crate::{
privilege::Privilege,
spec::{ComputeSpec, Database, ExtVersion, PgIdent, Role},
};
use crate::spec::{ComputeSpec, Database, Role};
#[derive(Serialize, Debug, Deserialize)]
pub struct GenericAPIError {
@@ -171,16 +168,3 @@ pub struct InstalledExtension {
pub struct InstalledExtensions {
pub extensions: Vec<InstalledExtension>,
}
#[derive(Clone, Debug, Default, Serialize)]
pub struct ExtensionInstallResult {
pub extension: PgIdent,
pub version: ExtVersion,
}
#[derive(Clone, Debug, Default, Serialize)]
pub struct SetRoleGrantsResponse {
pub database: PgIdent,
pub schema: PgIdent,
pub privileges: Vec<Privilege>,
pub role: PgIdent,
}

View File

@@ -16,9 +16,6 @@ use remote_storage::RemotePath;
/// intended to be used for DB / role names.
pub type PgIdent = String;
/// String type alias representing Postgres extension version
pub type ExtVersion = String;
/// Cluster spec or configuration represented as an optional number of
/// delta operations + final cluster state description.
#[derive(Clone, Debug, Default, Deserialize, Serialize)]

View File

@@ -102,7 +102,6 @@ pub struct ConfigToml {
pub ingest_batch_size: u64,
pub max_vectored_read_bytes: MaxVectoredReadBytes,
pub image_compression: ImageCompressionAlgorithm,
pub timeline_offloading: bool,
pub ephemeral_bytes_per_memory_kb: usize,
pub l0_flush: Option<crate::models::L0FlushConfig>,
pub virtual_file_io_mode: Option<crate::models::virtual_file::IoMode>,
@@ -386,7 +385,6 @@ impl Default for ConfigToml {
NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
)),
image_compression: (DEFAULT_IMAGE_COMPRESSION),
timeline_offloading: false,
ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
l0_flush: None,
virtual_file_io_mode: None,

View File

@@ -5,11 +5,9 @@ pub mod controller_api;
pub mod key;
pub mod keyspace;
pub mod models;
pub mod record;
pub mod reltag;
pub mod shard;
/// Public API types
pub mod upcall_api;
pub mod value;
pub mod config;

View File

@@ -684,25 +684,6 @@ pub struct TimelineArchivalConfigRequest {
pub state: TimelineArchivalState,
}
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TimelinesInfoAndOffloaded {
pub timelines: Vec<TimelineInfo>,
pub offloaded: Vec<OffloadedTimelineInfo>,
}
/// Analog of [`TimelineInfo`] for offloaded timelines.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct OffloadedTimelineInfo {
pub tenant_id: TenantShardId,
pub timeline_id: TimelineId,
/// Whether the timeline has a parent it has been branched off from or not
pub ancestor_timeline_id: Option<TimelineId>,
/// Whether to retain the branch lsn at the ancestor or not
pub ancestor_retain_lsn: Option<Lsn>,
/// The time point when the timeline was archived
pub archived_at: chrono::DateTime<chrono::Utc>,
}
/// This represents the output of the "timeline_detail" and "timeline_list" API calls.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TimelineInfo {
@@ -762,6 +743,8 @@ pub struct TimelineInfo {
// Forward compatibility: a previous version of the pageserver will receive a JSON. serde::Deserialize does
// not deny unknown fields by default so it's safe to set the field to some value, though it won't be
// read.
/// The last aux file policy being used on this timeline
pub last_aux_file_policy: Option<AuxFilePolicy>,
pub is_archived: Option<bool>,
}

View File

@@ -16,7 +16,7 @@ impl serde::Serialize for Partitioning {
{
pub struct KeySpace<'a>(&'a crate::keyspace::KeySpace);
impl serde::Serialize for KeySpace<'_> {
impl<'a> serde::Serialize for KeySpace<'a> {
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
where
S: serde::Serializer,
@@ -44,7 +44,7 @@ impl serde::Serialize for Partitioning {
pub struct WithDisplay<'a, T>(&'a T);
impl<T: std::fmt::Display> serde::Serialize for WithDisplay<'_, T> {
impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> {
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
where
S: serde::Serializer,
@@ -55,7 +55,7 @@ impl<T: std::fmt::Display> serde::Serialize for WithDisplay<'_, T> {
pub struct KeyRange<'a>(&'a std::ops::Range<crate::key::Key>);
impl serde::Serialize for KeyRange<'_> {
impl<'a> serde::Serialize for KeyRange<'a> {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,

View File

@@ -1,113 +0,0 @@
//! This module defines the WAL record format used within the pageserver.
use bytes::Bytes;
use postgres_ffi::record::{describe_postgres_wal_record, MultiXactMember};
use postgres_ffi::{MultiXactId, MultiXactOffset, TimestampTz, TransactionId};
use serde::{Deserialize, Serialize};
use utils::bin_ser::DeserializeError;
/// Each update to a page is represented by a NeonWalRecord. It can be a wrapper
/// around a PostgreSQL WAL record, or a custom neon-specific "record".
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum NeonWalRecord {
/// Native PostgreSQL WAL record
Postgres { will_init: bool, rec: Bytes },
/// Clear bits in heap visibility map. ('flags' is bitmap of bits to clear)
ClearVisibilityMapFlags {
new_heap_blkno: Option<u32>,
old_heap_blkno: Option<u32>,
flags: u8,
},
/// Mark transaction IDs as committed on a CLOG page
ClogSetCommitted {
xids: Vec<TransactionId>,
timestamp: TimestampTz,
},
/// Mark transaction IDs as aborted on a CLOG page
ClogSetAborted { xids: Vec<TransactionId> },
/// Extend multixact offsets SLRU
MultixactOffsetCreate {
mid: MultiXactId,
moff: MultiXactOffset,
},
/// Extend multixact members SLRU.
MultixactMembersCreate {
moff: MultiXactOffset,
members: Vec<MultiXactMember>,
},
/// Update the map of AUX files, either writing or dropping an entry
AuxFile {
file_path: String,
content: Option<Bytes>,
},
/// A testing record for unit testing purposes. It supports append data to an existing image, or clear it.
#[cfg(feature = "testing")]
Test {
/// Append a string to the image.
append: String,
/// Clear the image before appending.
clear: bool,
/// Treat this record as an init record. `clear` should be set to true if this field is set
/// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and
/// its references in `timeline.rs`.
will_init: bool,
},
}
impl NeonWalRecord {
/// Does replaying this WAL record initialize the page from scratch, or does
/// it need to be applied over the previous image of the page?
pub fn will_init(&self) -> bool {
// If you change this function, you'll also need to change ValueBytes::will_init
match self {
NeonWalRecord::Postgres { will_init, rec: _ } => *will_init,
#[cfg(feature = "testing")]
NeonWalRecord::Test { will_init, .. } => *will_init,
// None of the special neon record types currently initialize the page
_ => false,
}
}
#[cfg(feature = "testing")]
pub fn wal_append(s: impl AsRef<str>) -> Self {
Self::Test {
append: s.as_ref().to_string(),
clear: false,
will_init: false,
}
}
#[cfg(feature = "testing")]
pub fn wal_clear() -> Self {
Self::Test {
append: "".to_string(),
clear: true,
will_init: false,
}
}
#[cfg(feature = "testing")]
pub fn wal_init() -> Self {
Self::Test {
append: "".to_string(),
clear: true,
will_init: true,
}
}
}
/// Build a human-readable string to describe a WAL record
///
/// For debugging purposes
pub fn describe_wal_record(rec: &NeonWalRecord) -> Result<String, DeserializeError> {
match rec {
NeonWalRecord::Postgres { will_init, rec } => Ok(format!(
"will_init: {}, {}",
will_init,
describe_postgres_wal_record(rec)?
)),
_ => Ok(format!("{:?}", rec)),
}
}

View File

@@ -738,20 +738,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
QueryError::SimulatedConnectionError => {
return Err(QueryError::SimulatedConnectionError)
}
err @ QueryError::Reconnect => {
// Instruct the client to reconnect, stop processing messages
// from this libpq connection and, finally, disconnect from the
// server side (returning an Err achieves the later).
//
// Note the flushing is done by the caller.
let reconnect_error = short_error(&err);
self.write_message_noflush(&BeMessage::ErrorResponse(
&reconnect_error,
Some(err.pg_error_code()),
))?;
return Err(err);
}
e => {
log_query_error(query_string, &e);
let short_error = short_error(&e);
@@ -935,11 +921,12 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackendReader<IO> {
/// A futures::AsyncWrite implementation that wraps all data written to it in CopyData
/// messages.
///
pub struct CopyDataWriter<'a, IO> {
pgb: &'a mut PostgresBackend<IO>,
}
impl<IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'_, IO> {
impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, IO> {
fn poll_write(
self: Pin<&mut Self>,
cx: &mut std::task::Context<'_>,

View File

@@ -2,7 +2,6 @@
use once_cell::sync::Lazy;
use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError};
use pq_proto::{BeMessage, RowDescriptor};
use rustls::crypto::aws_lc_rs;
use std::io::Cursor;
use std::sync::Arc;
use tokio::io::{AsyncRead, AsyncWrite};
@@ -93,13 +92,10 @@ static CERT: Lazy<rustls::pki_types::CertificateDer<'static>> = Lazy::new(|| {
async fn simple_select_ssl() {
let (client_sock, server_sock) = make_tcp_pair().await;
let server_cfg =
rustls::ServerConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider()))
.with_safe_default_protocol_versions()
.expect("aws_lc_rs should support the default protocol versions")
.with_no_client_auth()
.with_single_cert(vec![CERT.clone()], KEY.clone_key())
.unwrap();
let server_cfg = rustls::ServerConfig::builder()
.with_no_client_auth()
.with_single_cert(vec![CERT.clone()], KEY.clone_key())
.unwrap();
let tls_config = Some(Arc::new(server_cfg));
let pgbackend =
PostgresBackend::new(server_sock, AuthType::Trust, tls_config).expect("pgbackend creation");
@@ -109,16 +105,13 @@ async fn simple_select_ssl() {
pgbackend.run(&mut handler, &CancellationToken::new()).await
});
let client_cfg =
rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider()))
.with_safe_default_protocol_versions()
.expect("aws_lc_rs should support the default protocol versions")
.with_root_certificates({
let mut store = rustls::RootCertStore::empty();
store.add(CERT.clone()).unwrap();
store
})
.with_no_client_auth();
let client_cfg = rustls::ClientConfig::builder()
.with_root_certificates({
let mut store = rustls::RootCertStore::empty();
store.add(CERT.clone()).unwrap();
store
})
.with_no_client_auth();
let mut make_tls_connect = tokio_postgres_rustls::MakeRustlsConnect::new(client_cfg);
let tls_connect = <MakeRustlsConnect as MakeTlsConnect<TcpStream>>::make_tls_connect(
&mut make_tls_connect,

View File

@@ -15,7 +15,6 @@ memoffset.workspace = true
thiserror.workspace = true
serde.workspace = true
utils.workspace = true
tracing.workspace = true
[dev-dependencies]
env_logger.workspace = true

View File

@@ -216,7 +216,6 @@ macro_rules! enum_pgversion {
}
pub mod pg_constants;
pub mod record;
pub mod relfile_utils;
// Export some widely used datatypes that are unlikely to change across Postgres versions

View File

@@ -727,7 +727,7 @@ pub const SQLSTATE_INTERNAL_ERROR: &[u8; 5] = b"XX000";
pub const SQLSTATE_ADMIN_SHUTDOWN: &[u8; 5] = b"57P01";
pub const SQLSTATE_SUCCESSFUL_COMPLETION: &[u8; 5] = b"00000";
impl BeMessage<'_> {
impl<'a> BeMessage<'a> {
/// Serialize `message` to the given `buf`.
/// Apart from smart memory managemet, BytesMut is good here as msg len
/// precedes its body and it is handy to write it down first and then fill

View File

@@ -19,12 +19,7 @@ mod simulate_failures;
mod support;
use std::{
collections::HashMap,
fmt::Debug,
num::NonZeroU32,
ops::Bound,
pin::{pin, Pin},
sync::Arc,
collections::HashMap, fmt::Debug, num::NonZeroU32, ops::Bound, pin::Pin, sync::Arc,
time::SystemTime,
};
@@ -33,7 +28,6 @@ use camino::{Utf8Path, Utf8PathBuf};
use bytes::Bytes;
use futures::{stream::Stream, StreamExt};
use itertools::Itertools as _;
use serde::{Deserialize, Serialize};
use tokio::sync::Semaphore;
use tokio_util::sync::CancellationToken;
@@ -267,7 +261,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> Result<Listing, DownloadError> {
let mut stream = pin!(self.list_streaming(prefix, mode, max_keys, cancel));
let mut stream = std::pin::pin!(self.list_streaming(prefix, mode, max_keys, cancel));
let mut combined = stream.next().await.expect("At least one item required")?;
while let Some(list) = stream.next().await {
let list = list?;
@@ -330,35 +324,6 @@ pub trait RemoteStorage: Send + Sync + 'static {
cancel: &CancellationToken,
) -> anyhow::Result<()>;
/// Deletes all objects matching the given prefix.
///
/// NB: this uses NoDelimiter and will match partial prefixes. For example, the prefix /a/b will
/// delete /a/b, /a/b/*, /a/bc, /a/bc/*, etc.
///
/// If the operation fails because of timeout or cancellation, the root cause of the error will
/// be set to `TimeoutOrCancel`. In such situation it is unknown which deletions, if any, went
/// through.
async fn delete_prefix(
&self,
prefix: &RemotePath,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let mut stream =
pin!(self.list_streaming(Some(prefix), ListingMode::NoDelimiter, None, cancel));
while let Some(result) = stream.next().await {
let keys = match result {
Ok(listing) if listing.keys.is_empty() => continue,
Ok(listing) => listing.keys.into_iter().map(|o| o.key).collect_vec(),
Err(DownloadError::Cancelled) => return Err(TimeoutOrCancel::Cancel.into()),
Err(DownloadError::Timeout) => return Err(TimeoutOrCancel::Timeout.into()),
Err(err) => return Err(err.into()),
};
tracing::info!("Deleting {} keys from remote storage", keys.len());
self.delete_objects(&keys, cancel).await?;
}
Ok(())
}
/// Copy a remote object inside a bucket from one path to another.
async fn copy(
&self,
@@ -523,20 +488,6 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
}
}
/// See [`RemoteStorage::delete_prefix`]
pub async fn delete_prefix(
&self,
prefix: &RemotePath,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
match self {
Self::LocalFs(s) => s.delete_prefix(prefix, cancel).await,
Self::AwsS3(s) => s.delete_prefix(prefix, cancel).await,
Self::AzureBlob(s) => s.delete_prefix(prefix, cancel).await,
Self::Unreliable(s) => s.delete_prefix(prefix, cancel).await,
}
}
/// See [`RemoteStorage::copy`]
pub async fn copy_object(
&self,

View File

@@ -199,138 +199,6 @@ async fn list_no_delimiter_works(
Ok(())
}
/// Tests that giving a partial prefix returns all matches (e.g. "/foo" yields "/foobar/baz"),
/// but only with NoDelimiter.
#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
#[tokio::test]
async fn list_partial_prefix(
ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs,
) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
MaybeEnabledStorageWithSimpleTestBlobs::UploadsFailed(e, _) => {
anyhow::bail!("S3 init failed: {e:?}")
}
};
let cancel = CancellationToken::new();
let test_client = Arc::clone(&ctx.enabled.client);
// Prefix "fold" should match all "folder{i}" directories with NoDelimiter.
let objects: HashSet<_> = test_client
.list(
Some(&RemotePath::from_string("fold")?),
ListingMode::NoDelimiter,
None,
&cancel,
)
.await?
.keys
.into_iter()
.map(|o| o.key)
.collect();
assert_eq!(&objects, &ctx.remote_blobs);
// Prefix "fold" matches nothing with WithDelimiter.
let objects: HashSet<_> = test_client
.list(
Some(&RemotePath::from_string("fold")?),
ListingMode::WithDelimiter,
None,
&cancel,
)
.await?
.keys
.into_iter()
.map(|o| o.key)
.collect();
assert!(objects.is_empty());
// Prefix "" matches everything.
let objects: HashSet<_> = test_client
.list(
Some(&RemotePath::from_string("")?),
ListingMode::NoDelimiter,
None,
&cancel,
)
.await?
.keys
.into_iter()
.map(|o| o.key)
.collect();
assert_eq!(&objects, &ctx.remote_blobs);
// Prefix "" matches nothing with WithDelimiter.
let objects: HashSet<_> = test_client
.list(
Some(&RemotePath::from_string("")?),
ListingMode::WithDelimiter,
None,
&cancel,
)
.await?
.keys
.into_iter()
.map(|o| o.key)
.collect();
assert!(objects.is_empty());
// Prefix "foo" matches nothing.
let objects: HashSet<_> = test_client
.list(
Some(&RemotePath::from_string("foo")?),
ListingMode::NoDelimiter,
None,
&cancel,
)
.await?
.keys
.into_iter()
.map(|o| o.key)
.collect();
assert!(objects.is_empty());
// Prefix "folder2/blob" matches.
let objects: HashSet<_> = test_client
.list(
Some(&RemotePath::from_string("folder2/blob")?),
ListingMode::NoDelimiter,
None,
&cancel,
)
.await?
.keys
.into_iter()
.map(|o| o.key)
.collect();
let expect: HashSet<_> = ctx
.remote_blobs
.iter()
.filter(|o| o.get_path().starts_with("folder2"))
.cloned()
.collect();
assert_eq!(&objects, &expect);
// Prefix "folder2/foo" matches nothing.
let objects: HashSet<_> = test_client
.list(
Some(&RemotePath::from_string("folder2/foo")?),
ListingMode::NoDelimiter,
None,
&cancel,
)
.await?
.keys
.into_iter()
.map(|o| o.key)
.collect();
assert!(objects.is_empty());
Ok(())
}
#[test_context(MaybeEnabledStorage)]
#[tokio::test]
async fn delete_non_exising_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
@@ -397,80 +265,6 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<(
Ok(())
}
/// Tests that delete_prefix() will delete all objects matching a prefix, including
/// partial prefixes (i.e. "/foo" matches "/foobar").
#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
#[tokio::test]
async fn delete_prefix(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
MaybeEnabledStorageWithSimpleTestBlobs::UploadsFailed(e, _) => {
anyhow::bail!("S3 init failed: {e:?}")
}
};
let cancel = CancellationToken::new();
let test_client = Arc::clone(&ctx.enabled.client);
/// Asserts that the S3 listing matches the given paths.
macro_rules! assert_list {
($expect:expr) => {{
let listing = test_client
.list(None, ListingMode::NoDelimiter, None, &cancel)
.await?
.keys
.into_iter()
.map(|o| o.key)
.collect();
assert_eq!($expect, listing);
}};
}
// We start with the full set of uploaded files.
let mut expect = ctx.remote_blobs.clone();
// Deleting a non-existing prefix should do nothing.
test_client
.delete_prefix(&RemotePath::from_string("xyz")?, &cancel)
.await?;
assert_list!(expect);
// Prefixes are case-sensitive.
test_client
.delete_prefix(&RemotePath::from_string("Folder")?, &cancel)
.await?;
assert_list!(expect);
// Deleting a path which overlaps with an existing object should do nothing. We pick the first
// path in the set as our common prefix.
let path = expect.iter().next().expect("empty set").clone().join("xyz");
test_client.delete_prefix(&path, &cancel).await?;
assert_list!(expect);
// Deleting an exact path should work. We pick the first path in the set.
let path = expect.iter().next().expect("empty set").clone();
test_client.delete_prefix(&path, &cancel).await?;
expect.remove(&path);
assert_list!(expect);
// Deleting a prefix should delete all matching objects.
test_client
.delete_prefix(&RemotePath::from_string("folder0/blob_")?, &cancel)
.await?;
expect.retain(|p| !p.get_path().as_str().starts_with("folder0/"));
assert_list!(expect);
// Deleting a common prefix should delete all objects.
test_client
.delete_prefix(&RemotePath::from_string("fold")?, &cancel)
.await?;
expect.clear();
assert_list!(expect);
Ok(())
}
#[test_context(MaybeEnabledStorage)]
#[tokio::test]
async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {

View File

@@ -97,7 +97,7 @@ pub fn draw_svg(
Ok(result)
}
impl SvgDraw<'_> {
impl<'a> SvgDraw<'a> {
fn calculate_svg_layout(&mut self) {
// Find x scale
let segments = &self.storage.segments;

View File

@@ -82,7 +82,7 @@ where
fn extract_remote_context(headers: &HeaderMap) -> opentelemetry::Context {
struct HeaderExtractor<'a>(&'a HeaderMap);
impl opentelemetry::propagation::Extractor for HeaderExtractor<'_> {
impl<'a> opentelemetry::propagation::Extractor for HeaderExtractor<'a> {
fn get(&self, key: &str) -> Option<&str> {
self.0.get(key).and_then(|value| value.to_str().ok())
}

View File

@@ -37,7 +37,7 @@ impl<'de> Deserialize<'de> for Lsn {
is_human_readable_deserializer: bool,
}
impl Visitor<'_> for LsnVisitor {
impl<'de> Visitor<'de> for LsnVisitor {
type Value = Lsn;
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {

View File

@@ -73,7 +73,7 @@ impl<T> Poison<T> {
/// and subsequent calls to [`Poison::check_and_arm`] will fail with an error.
pub struct Guard<'a, T>(&'a mut Poison<T>);
impl<T> Guard<'_, T> {
impl<'a, T> Guard<'a, T> {
pub fn data(&self) -> &T {
&self.0.data
}
@@ -94,7 +94,7 @@ impl<T> Guard<'_, T> {
}
}
impl<T> Drop for Guard<'_, T> {
impl<'a, T> Drop for Guard<'a, T> {
fn drop(&mut self) {
match self.0.state {
State::Clean => {

View File

@@ -164,7 +164,7 @@ impl TenantShardId {
}
}
impl std::fmt::Display for ShardSlug<'_> {
impl<'a> std::fmt::Display for ShardSlug<'a> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,

View File

@@ -152,7 +152,7 @@ pub struct RcuWriteGuard<'a, V> {
inner: RwLockWriteGuard<'a, RcuInner<V>>,
}
impl<V> Deref for RcuWriteGuard<'_, V> {
impl<'a, V> Deref for RcuWriteGuard<'a, V> {
type Target = V;
fn deref(&self) -> &V {
@@ -160,7 +160,7 @@ impl<V> Deref for RcuWriteGuard<'_, V> {
}
}
impl<V> RcuWriteGuard<'_, V> {
impl<'a, V> RcuWriteGuard<'a, V> {
///
/// Store a new value. The new value will be written to the Rcu immediately,
/// and will be immediately seen by any `read` calls that start afterwards.

View File

@@ -219,7 +219,7 @@ impl<'a, T> CountWaitingInitializers<'a, T> {
}
}
impl<T> Drop for CountWaitingInitializers<'_, T> {
impl<'a, T> Drop for CountWaitingInitializers<'a, T> {
fn drop(&mut self) {
self.0.initializers.fetch_sub(1, Ordering::Relaxed);
}
@@ -250,7 +250,7 @@ impl<T> std::ops::DerefMut for Guard<'_, T> {
}
}
impl<T> Guard<'_, T> {
impl<'a, T> Guard<'a, T> {
/// Take the current value, and a new permit for it's deinitialization.
///
/// The permit will be on a semaphore part of the new internal value, and any following

View File

@@ -184,23 +184,23 @@ mod tests {
struct MemoryIdentity<'a>(&'a dyn Extractor);
impl MemoryIdentity<'_> {
impl<'a> MemoryIdentity<'a> {
fn as_ptr(&self) -> *const () {
self.0 as *const _ as *const ()
}
}
impl PartialEq for MemoryIdentity<'_> {
impl<'a> PartialEq for MemoryIdentity<'a> {
fn eq(&self, other: &Self) -> bool {
self.as_ptr() == other.as_ptr()
}
}
impl Eq for MemoryIdentity<'_> {}
impl Hash for MemoryIdentity<'_> {
impl<'a> Eq for MemoryIdentity<'a> {}
impl<'a> Hash for MemoryIdentity<'a> {
fn hash<H: Hasher>(&self, state: &mut H) {
self.as_ptr().hash(state);
}
}
impl fmt::Debug for MemoryIdentity<'_> {
impl<'a> fmt::Debug for MemoryIdentity<'a> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{:p}: {}", self.as_ptr(), self.0.id())
}

View File

@@ -1,17 +0,0 @@
[package]
name = "wal_decoder"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[features]
testing = []
[dependencies]
anyhow.workspace = true
bytes.workspace = true
pageserver_api.workspace = true
postgres_ffi.workspace = true
serde.workspace = true
tracing.workspace = true
utils.workspace = true

View File

@@ -1,2 +0,0 @@
pub mod decoder;
pub mod models;

View File

@@ -1,144 +0,0 @@
//! This module houses types which represent decoded PG WAL records
//! ready for the pageserver to interpret. They are higher level
//! than their counterparts in [`postgres_ffi::record`].
use bytes::Bytes;
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::record::{
XlMultiXactCreate, XlMultiXactTruncate, XlRelmapUpdate, XlReploriginDrop, XlReploriginSet, XlSmgrTruncate, XlXactParsedRecord
};
use postgres_ffi::{Oid, TransactionId};
use utils::lsn::Lsn;
pub enum HeapamRecord {
ClearVmBits(ClearVmBits),
}
pub struct ClearVmBits {
pub new_heap_blkno: Option<u32>,
pub old_heap_blkno: Option<u32>,
pub vm_rel: RelTag,
pub flags: u8,
}
pub enum NeonrmgrRecord {
ClearVmBits(ClearVmBits),
}
pub enum SmgrRecord {
Create(SmgrCreate),
Truncate(XlSmgrTruncate),
}
pub struct SmgrCreate {
pub rel: RelTag,
}
pub enum DbaseRecord {
Create(DbaseCreate),
Drop(DbaseDrop),
}
pub struct DbaseCreate {
pub db_id: Oid,
pub tablespace_id: Oid,
pub src_db_id: Oid,
pub src_tablespace_id: Oid,
}
pub struct DbaseDrop {
pub db_id: Oid,
pub tablespace_ids: Vec<Oid>,
}
pub enum ClogRecord {
ZeroPage(ClogZeroPage),
Truncate(ClogTruncate),
}
pub struct ClogZeroPage {
pub segno: u32,
pub rpageno: u32,
}
pub struct ClogTruncate {
pub pageno: u32,
pub oldest_xid: TransactionId,
pub oldest_xid_db: Oid,
}
pub enum XactRecord {
Commit(XactCommon),
Abort(XactCommon),
CommitPrepared(XactCommon),
AbortPrepared(XactCommon),
Prepare(XactPrepare),
}
pub struct XactCommon {
pub parsed: XlXactParsedRecord,
pub origin_id: u16,
// Fields below are only used for logging
pub xl_xid: TransactionId,
pub lsn: Lsn,
}
pub struct XactPrepare {
pub xl_xid: TransactionId,
pub data: Bytes,
}
pub enum MultiXactRecord {
ZeroPage(MultiXactZeroPage),
Create(XlMultiXactCreate),
Truncate(XlMultiXactTruncate),
}
pub struct MultiXactZeroPage {
pub slru_kind: SlruKind,
pub segno: u32,
pub rpageno: u32,
}
pub enum RelmapRecord {
Update(RelmapUpdate),
}
pub struct RelmapUpdate {
pub update: XlRelmapUpdate,
pub buf: Bytes,
}
pub enum XlogRecord {
Raw(RawXlogRecord),
}
pub struct RawXlogRecord {
pub info: u8,
pub lsn: Lsn,
pub buf: Bytes,
}
pub enum LogicalMessageRecord {
Put(PutLogicalMessage),
#[cfg(feature = "testing")]
Failpoint,
}
pub struct PutLogicalMessage {
pub path: String,
pub buf: Bytes,
}
pub enum StandbyRecord {
RunningXacts(StandbyRunningXacts),
}
pub struct StandbyRunningXacts {
pub oldest_running_xid: TransactionId,
}
pub enum ReploriginRecord {
Set(XlReploriginSet),
Drop(XlReploriginDrop),
}

View File

@@ -8,7 +8,7 @@ license.workspace = true
default = []
# Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
# which adds some runtime cost to run tests on outage conditions
testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing"]
testing = ["fail/failpoints", "pageserver_api/testing" ]
[dependencies]
anyhow.workspace = true
@@ -83,7 +83,6 @@ enum-map.workspace = true
enumset = { workspace = true, features = ["serde"]}
strum.workspace = true
strum_macros.workspace = true
wal_decoder.workspace = true
[target.'cfg(target_os = "linux")'.dependencies]
procfs.workspace = true
@@ -93,7 +92,6 @@ criterion.workspace = true
hex-literal.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
indoc.workspace = true
# pageserver_api = { workspace = true, features = ["testing"] }
[[bench]]
name = "bench_layer_map"

View File

@@ -6,15 +6,15 @@ use criterion::{criterion_group, criterion_main, Criterion};
use pageserver::{
config::PageServerConf,
context::{DownloadBehavior, RequestContext},
gc_result::Value,
l0_flush::{L0FlushConfig, L0FlushGlobalState},
page_cache,
repository::Value,
task_mgr::TaskKind,
tenant::storage_layer::inmemory_layer::SerializedBatch,
tenant::storage_layer::InMemoryLayer,
virtual_file,
};
use pageserver_api::{key::Key, shard::TenantShardId, value::Value};
use pageserver_api::{key::Key, shard::TenantShardId};
use utils::{
bin_ser::BeSer,
id::{TenantId, TimelineId},
@@ -164,11 +164,7 @@ fn criterion_benchmark(c: &mut Criterion) {
let conf: &'static PageServerConf = Box::leak(Box::new(
pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()),
));
virtual_file::init(
16384,
virtual_file::io_engine_for_bench(),
conf.virtual_file_io_mode,
);
virtual_file::init(16384, virtual_file::io_engine_for_bench());
page_cache::init(conf.page_cache_size);
{

View File

@@ -1,9 +1,9 @@
use criterion::measurement::WallTime;
use pageserver::keyspace::{KeyPartitioning, KeySpace};
use pageserver::repository::Key;
use pageserver::tenant::layer_map::LayerMap;
use pageserver::tenant::storage_layer::LayerName;
use pageserver::tenant::storage_layer::PersistentLayerDesc;
use pageserver_api::key::Key;
use pageserver_api::shard::TenantShardId;
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
use std::cmp::{max, min};

View File

@@ -60,8 +60,7 @@ use anyhow::Context;
use bytes::{Buf, Bytes};
use criterion::{BenchmarkId, Criterion};
use once_cell::sync::Lazy;
use pageserver::{config::PageServerConf, walredo::PostgresRedoManager};
use pageserver_api::record::NeonWalRecord;
use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
use pageserver_api::{key::Key, shard::TenantShardId};
use std::{
future::Future,

View File

@@ -133,7 +133,7 @@ enum LazyLoadLayer<'a, E: CompactionJobExecutor> {
Loaded(VecDeque<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>),
Unloaded(&'a E::DeltaLayer),
}
impl<E: CompactionJobExecutor> LazyLoadLayer<'_, E> {
impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> {
fn min_key(&self) -> E::Key {
match self {
Self::Loaded(entries) => entries.front().unwrap().key(),
@@ -147,23 +147,23 @@ impl<E: CompactionJobExecutor> LazyLoadLayer<'_, E> {
}
}
}
impl<E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'_, E> {
impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl<E: CompactionJobExecutor> Ord for LazyLoadLayer<'_, E> {
impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
// reverse order so that we get a min-heap
(other.min_key(), other.min_lsn()).cmp(&(self.min_key(), self.min_lsn()))
}
}
impl<E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'_, E> {
impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> {
fn eq(&self, other: &Self) -> bool {
self.cmp(other) == std::cmp::Ordering::Equal
}
}
impl<E: CompactionJobExecutor> Eq for LazyLoadLayer<'_, E> {}
impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {}
type LoadFuture<'a, E> = BoxFuture<'a, anyhow::Result<Vec<E>>>;

View File

@@ -51,7 +51,7 @@
//!
use anyhow::{Context, Result};
use pageserver_api::key::Key;
use pageserver::repository::Key;
use std::cmp::Ordering;
use std::io::{self, BufRead};
use std::path::PathBuf;

View File

@@ -11,7 +11,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
match cmd {
IndexPartCmd::Dump { path } => {
let bytes = tokio::fs::read(path).await.context("read file")?;
let des: IndexPart = IndexPart::from_json_bytes(&bytes).context("deserialize")?;
let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
let output = serde_json::to_string_pretty(&des).context("serialize output")?;
println!("{output}");
Ok(())

View File

@@ -7,19 +7,18 @@ use camino::{Utf8Path, Utf8PathBuf};
use pageserver::context::{DownloadBehavior, RequestContext};
use pageserver::task_mgr::TaskKind;
use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
use pageserver::virtual_file::api::IoMode;
use std::cmp::Ordering;
use std::collections::BinaryHeap;
use std::ops::Range;
use std::{fs, str};
use pageserver::page_cache::{self, PAGE_SZ};
use pageserver::repository::{Key, KEY_SIZE};
use pageserver::tenant::block_io::FileBlockReader;
use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE};
use pageserver::tenant::storage_layer::range_overlaps;
use pageserver::virtual_file::{self, VirtualFile};
use pageserver_api::key::{Key, KEY_SIZE};
use utils::{bin_ser::BeSer, lsn::Lsn};
@@ -153,11 +152,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
// Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
pageserver::virtual_file::init(
10,
virtual_file::api::IoEngineKind::StdFs,
IoMode::preferred(),
);
pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
pageserver::page_cache::init(100);
let mut total_delta_layers = 0usize;

View File

@@ -11,16 +11,15 @@ use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary};
use pageserver::tenant::storage_layer::{delta_layer, image_layer};
use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer};
use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
use pageserver::virtual_file::api::IoMode;
use pageserver::{page_cache, virtual_file};
use pageserver::{
repository::{Key, KEY_SIZE},
tenant::{
block_io::FileBlockReader, disk_btree::VisitDirection,
storage_layer::delta_layer::DELTA_KEY_SIZE,
},
virtual_file::VirtualFile,
};
use pageserver_api::key::{Key, KEY_SIZE};
use pageserver::{page_cache, virtual_file};
use std::fs;
use utils::bin_ser::BeSer;
use utils::id::{TenantId, TimelineId};
@@ -60,11 +59,7 @@ pub(crate) enum LayerCmd {
async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
virtual_file::init(
10,
virtual_file::api::IoEngineKind::StdFs,
IoMode::preferred(),
);
virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
page_cache::init(100);
let file = VirtualFile::open(path, ctx).await?;
let file_id = page_cache::next_file_id();
@@ -195,11 +190,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
new_tenant_id,
new_timeline_id,
} => {
pageserver::virtual_file::init(
10,
virtual_file::api::IoEngineKind::StdFs,
IoMode::preferred(),
);
pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
pageserver::page_cache::init(100);
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);

View File

@@ -24,7 +24,7 @@ use pageserver::{
page_cache,
task_mgr::TaskKind,
tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
virtual_file::{self, api::IoMode},
virtual_file,
};
use pageserver_api::shard::TenantShardId;
use postgres_ffi::ControlFileData;
@@ -205,11 +205,7 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {
async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
// Basic initialization of things that don't change after startup
virtual_file::init(
10,
virtual_file::api::IoEngineKind::StdFs,
IoMode::preferred(),
);
virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
page_cache::init(100);
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
dump_layerfile_from_path(path, true, &ctx).await

View File

@@ -167,11 +167,7 @@ fn main() -> anyhow::Result<()> {
let scenario = failpoint_support::init();
// Basic initialization of things that don't change after startup
virtual_file::init(
conf.max_file_descriptors,
conf.virtual_file_io_engine,
conf.virtual_file_io_mode,
);
virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine);
page_cache::init(conf.page_cache_size);
start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;

View File

@@ -164,9 +164,6 @@ pub struct PageServerConf {
pub image_compression: ImageCompressionAlgorithm,
/// Whether to offload archived timelines automatically
pub timeline_offloading: bool,
/// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this
/// is exceeded, we start proactively closing ephemeral layers to limit the total amount
/// of ephemeral data.
@@ -324,7 +321,6 @@ impl PageServerConf {
ingest_batch_size,
max_vectored_read_bytes,
image_compression,
timeline_offloading,
ephemeral_bytes_per_memory_kb,
l0_flush,
virtual_file_io_mode,
@@ -368,7 +364,6 @@ impl PageServerConf {
ingest_batch_size,
max_vectored_read_bytes,
image_compression,
timeline_offloading,
ephemeral_bytes_per_memory_kb,
// ------------------------------------------------------------

View File

@@ -198,7 +198,7 @@ fn serialize_in_chunks<'a>(
}
}
impl ExactSizeIterator for Iter<'_> {}
impl<'a> ExactSizeIterator for Iter<'a> {}
let buffer = bytes::BytesMut::new();
let inner = input.chunks(chunk_size);

View File

@@ -696,7 +696,7 @@ impl DeletionQueue {
mod test {
use camino::Utf8Path;
use hex_literal::hex;
use pageserver_api::{key::Key, shard::ShardIndex, upcall_api::ReAttachResponseTenant};
use pageserver_api::{shard::ShardIndex, upcall_api::ReAttachResponseTenant};
use std::{io::ErrorKind, time::Duration};
use tracing::info;
@@ -705,6 +705,7 @@ mod test {
use crate::{
controller_upcall_client::RetryForeverError,
repository::Key,
tenant::{harness::TenantHarness, storage_layer::DeltaLayerName},
};

View File

@@ -654,7 +654,7 @@ impl std::fmt::Debug for EvictionCandidate {
let ts = chrono::DateTime::<chrono::Utc>::from(self.last_activity_ts);
let ts = ts.to_rfc3339_opts(chrono::SecondsFormat::Nanos, true);
struct DisplayIsDebug<'a, T>(&'a T);
impl<T: std::fmt::Display> std::fmt::Debug for DisplayIsDebug<'_, T> {
impl<'a, T: std::fmt::Display> std::fmt::Debug for DisplayIsDebug<'a, T> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0)
}
@@ -1218,7 +1218,16 @@ mod filesystem_level_usage {
let stat = Statvfs::get(tenants_dir, mock_config)
.context("statvfs failed, presumably directory got unlinked")?;
let (avail_bytes, total_bytes) = stat.get_avail_total_bytes();
// https://unix.stackexchange.com/a/703650
let blocksize = if stat.fragment_size() > 0 {
stat.fragment_size()
} else {
stat.block_size()
};
// use blocks_available (b_avail) since, pageserver runs as unprivileged user
let avail_bytes = stat.blocks_available() * blocksize;
let total_bytes = stat.blocks() * blocksize;
Ok(Usage {
config,

View File

@@ -1,57 +0,0 @@
use anyhow::Result;
use serde::Serialize;
use std::ops::AddAssign;
use std::time::Duration;
///
/// Result of performing GC
///
#[derive(Default, Serialize, Debug)]
pub struct GcResult {
pub layers_total: u64,
pub layers_needed_by_cutoff: u64,
pub layers_needed_by_pitr: u64,
pub layers_needed_by_branches: u64,
pub layers_needed_by_leases: u64,
pub layers_not_updated: u64,
pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
#[serde(serialize_with = "serialize_duration_as_millis")]
pub elapsed: Duration,
/// The layers which were garbage collected.
///
/// Used in `/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc` to wait for the layers to be
/// dropped in tests.
#[cfg(feature = "testing")]
#[serde(skip)]
pub(crate) doomed_layers: Vec<crate::tenant::storage_layer::Layer>,
}
// helper function for `GcResult`, serializing a `Duration` as an integer number of milliseconds
fn serialize_duration_as_millis<S>(d: &Duration, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
d.as_millis().serialize(serializer)
}
impl AddAssign for GcResult {
fn add_assign(&mut self, other: Self) {
self.layers_total += other.layers_total;
self.layers_needed_by_pitr += other.layers_needed_by_pitr;
self.layers_needed_by_cutoff += other.layers_needed_by_cutoff;
self.layers_needed_by_branches += other.layers_needed_by_branches;
self.layers_needed_by_leases += other.layers_needed_by_leases;
self.layers_not_updated += other.layers_not_updated;
self.layers_removed += other.layers_removed;
self.elapsed += other.elapsed;
#[cfg(feature = "testing")]
{
let mut other = other;
self.doomed_layers.append(&mut other.doomed_layers);
}
}
}

View File

@@ -18,6 +18,7 @@ use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri};
use metrics::launch_timestamp::LaunchTimestamp;
use pageserver_api::models::virtual_file::IoMode;
use pageserver_api::models::AuxFilePolicy;
use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest;
use pageserver_api::models::IngestAuxFilesRequest;
use pageserver_api::models::ListAuxFilesRequest;
@@ -26,7 +27,6 @@ use pageserver_api::models::LocationConfigListResponse;
use pageserver_api::models::LocationConfigMode;
use pageserver_api::models::LsnLease;
use pageserver_api::models::LsnLeaseRequest;
use pageserver_api::models::OffloadedTimelineInfo;
use pageserver_api::models::ShardParameters;
use pageserver_api::models::TenantDetails;
use pageserver_api::models::TenantLocationConfigRequest;
@@ -38,7 +38,6 @@ use pageserver_api::models::TenantShardSplitRequest;
use pageserver_api::models::TenantShardSplitResponse;
use pageserver_api::models::TenantSorting;
use pageserver_api::models::TimelineArchivalConfigRequest;
use pageserver_api::models::TimelinesInfoAndOffloaded;
use pageserver_api::models::TopTenantShardItem;
use pageserver_api::models::TopTenantShardsRequest;
use pageserver_api::models::TopTenantShardsResponse;
@@ -83,7 +82,6 @@ use crate::tenant::timeline::CompactFlags;
use crate::tenant::timeline::CompactionError;
use crate::tenant::timeline::Timeline;
use crate::tenant::GetTimelineError;
use crate::tenant::OffloadedTimeline;
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
use crate::{disk_usage_eviction_task, tenant};
use pageserver_api::models::{
@@ -476,28 +474,12 @@ async fn build_timeline_info_common(
is_archived: Some(is_archived),
walreceiver_status,
last_aux_file_policy: timeline.last_aux_file_policy.load(),
};
Ok(info)
}
fn build_timeline_offloaded_info(offloaded: &Arc<OffloadedTimeline>) -> OffloadedTimelineInfo {
let &OffloadedTimeline {
tenant_shard_id,
timeline_id,
ancestor_retain_lsn,
ancestor_timeline_id,
archived_at,
..
} = offloaded.as_ref();
OffloadedTimelineInfo {
tenant_id: tenant_shard_id,
timeline_id,
ancestor_retain_lsn,
ancestor_timeline_id,
archived_at: archived_at.and_utc(),
}
}
// healthcheck handler
async fn status_handler(
request: Request<Body>,
@@ -664,7 +646,7 @@ async fn timeline_list_handler(
)
.instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
.await
.context("Failed to build timeline info")
.context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}")
.map_err(ApiError::InternalServerError)?;
response_data.push(timeline_info);
@@ -679,62 +661,6 @@ async fn timeline_list_handler(
json_response(StatusCode::OK, response_data)
}
async fn timeline_and_offloaded_list_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let include_non_incremental_logical_size: Option<bool> =
parse_query_param(&request, "include-non-incremental-logical-size")?;
let force_await_initial_logical_size: Option<bool> =
parse_query_param(&request, "force-await-initial-logical-size")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let response_data = async {
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
let (timelines, offloadeds) = tenant.list_timelines_and_offloaded();
let mut timeline_infos = Vec::with_capacity(timelines.len());
for timeline in timelines {
let timeline_info = build_timeline_info(
&timeline,
include_non_incremental_logical_size.unwrap_or(false),
force_await_initial_logical_size.unwrap_or(false),
&ctx,
)
.instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
.await
.context("Failed to build timeline info")
.map_err(ApiError::InternalServerError)?;
timeline_infos.push(timeline_info);
}
let offloaded_infos = offloadeds
.into_iter()
.map(|offloaded| build_timeline_offloaded_info(&offloaded))
.collect::<Vec<_>>();
let res = TimelinesInfoAndOffloaded {
timelines: timeline_infos,
offloaded: offloaded_infos,
};
Ok::<TimelinesInfoAndOffloaded, ApiError>(res)
}
.instrument(info_span!("timeline_and_offloaded_list",
tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug()))
.await?;
json_response(StatusCode::OK, response_data)
}
async fn timeline_preserve_initdb_handler(
request: Request<Body>,
_cancel: CancellationToken,
@@ -2129,13 +2055,13 @@ async fn getpage_at_lsn_handler(
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
struct Key(pageserver_api::key::Key);
struct Key(crate::repository::Key);
impl std::str::FromStr for Key {
type Err = anyhow::Error;
fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
pageserver_api::key::Key::from_hex(s).map(Key)
crate::repository::Key::from_hex(s).map(Key)
}
}
@@ -2328,7 +2254,7 @@ async fn tenant_scan_remote_handler(
%timeline_id))
.await
{
Ok((index_part, index_generation, _index_mtime)) => {
Ok((index_part, index_generation)) => {
tracing::info!("Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)",
index_part.layer_metadata.len(), index_part.metadata.disk_consistent_lsn());
generation = std::cmp::max(generation, index_generation);
@@ -2473,6 +2399,31 @@ async fn post_tracing_event_handler(
json_response(StatusCode::OK, ())
}
async fn force_aux_policy_switch_handler(
mut r: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
check_permission(&r, None)?;
let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&r, "timeline_id")?;
let policy: AuxFilePolicy = json_request(&mut r).await?;
let state = get_state(&r);
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
timeline
.do_switch_aux_policy(policy)
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
}
async fn put_io_engine_handler(
mut r: Request<Body>,
_cancel: CancellationToken,
@@ -3070,9 +3021,6 @@ pub fn make_router(
.get("/v1/tenant/:tenant_shard_id/timeline", |r| {
api_handler(r, timeline_list_handler)
})
.get("/v1/tenant/:tenant_shard_id/timeline_and_offloaded", |r| {
api_handler(r, timeline_and_offloaded_list_handler)
})
.post("/v1/tenant/:tenant_shard_id/timeline", |r| {
api_handler(r, timeline_create_handler)
})
@@ -3188,6 +3136,10 @@ pub fn make_router(
)
.put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
.put("/v1/io_mode", |r| api_handler(r, put_io_mode_handler))
.put(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch",
|r| api_handler(r, force_aux_policy_switch_handler),
)
.get("/v1/utilization", |r| api_handler(r, get_utilization))
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files",

View File

@@ -19,9 +19,10 @@ use crate::metrics::WAL_INGEST;
use crate::pgdatadir_mapping::*;
use crate::tenant::Timeline;
use crate::walingest::WalIngest;
use crate::walrecord::decode_wal_record;
use crate::walrecord::DecodedWALRecord;
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::pg_constants;
use postgres_ffi::record::{decode_wal_record, DecodedWALRecord};
use postgres_ffi::relfile_utils::*;
use postgres_ffi::waldecoder::WalStreamDecoder;
use postgres_ffi::ControlFileData;
@@ -455,7 +456,6 @@ pub async fn import_wal_from_tar(
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
let mut decoded = DecodedWALRecord::default();
decode_wal_record(recdata, &mut decoded, tline.pg_version)?;
// let (ephemeral_file_ready_buf, special_records) = decode_wal_record(recdata, tline.pg_version);
walingest
.ingest_record(decoded, lsn, &mut modification, ctx)
.await?;

View File

@@ -20,11 +20,11 @@ pub use pageserver_api::keyspace;
use tokio_util::sync::CancellationToken;
mod assert_u64_eq_usize;
pub mod aux_file;
pub mod gc_result;
pub mod metrics;
pub mod page_cache;
pub mod page_service;
pub mod pgdatadir_mapping;
pub mod repository;
pub mod span;
pub(crate) mod statvfs;
pub mod task_mgr;
@@ -32,6 +32,7 @@ pub mod tenant;
pub mod utilization;
pub mod virtual_file;
pub mod walingest;
pub mod walrecord;
pub mod walredo;
use camino::Utf8Path;

View File

@@ -1189,7 +1189,7 @@ struct GlobalAndPerTimelineHistogramTimer<'a, 'c> {
op: SmgrQueryType,
}
impl Drop for GlobalAndPerTimelineHistogramTimer<'_, '_> {
impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> {
fn drop(&mut self) {
let elapsed = self.start.elapsed();
let ex_throttled = self
@@ -1560,7 +1560,7 @@ impl BasebackupQueryTime {
}
}
impl BasebackupQueryTimeOngoingRecording<'_, '_> {
impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
pub(crate) fn observe<T>(self, res: &Result<T, QueryError>) {
let elapsed = self.start.elapsed();
let ex_throttled = self
@@ -2092,7 +2092,6 @@ pub(crate) struct WalIngestMetrics {
pub(crate) records_received: IntCounter,
pub(crate) records_committed: IntCounter,
pub(crate) records_filtered: IntCounter,
pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter,
}
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
@@ -2116,11 +2115,6 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMet
"Number of WAL records filtered out due to sharding"
)
.expect("failed to define a metric"),
gap_blocks_zeroed_on_rel_extend: register_int_counter!(
"pageserver_gap_blocks_zeroed_on_rel_extend",
"Total number of zero gap blocks written on relation extends"
)
.expect("failed to define a metric"),
});
pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {

View File

@@ -82,7 +82,6 @@ use once_cell::sync::OnceCell;
use crate::{
context::RequestContext,
metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics},
virtual_file::{IoBufferMut, IoPageSlice},
};
static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
@@ -145,7 +144,7 @@ struct SlotInner {
key: Option<CacheKey>,
// for `coalesce_readers_permit`
permit: std::sync::Mutex<Weak<PinnedSlotsPermit>>,
buf: IoPageSlice<'static>,
buf: &'static mut [u8; PAGE_SZ],
}
impl Slot {
@@ -235,13 +234,13 @@ impl std::ops::Deref for PageReadGuard<'_> {
type Target = [u8; PAGE_SZ];
fn deref(&self) -> &Self::Target {
self.slot_guard.buf.deref()
self.slot_guard.buf
}
}
impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
fn as_ref(&self) -> &[u8; PAGE_SZ] {
self.slot_guard.buf.as_ref()
self.slot_guard.buf
}
}
@@ -267,7 +266,7 @@ enum PageWriteGuardState<'i> {
impl std::ops::DerefMut for PageWriteGuard<'_> {
fn deref_mut(&mut self) -> &mut Self::Target {
match &mut self.state {
PageWriteGuardState::Invalid { inner, _permit } => inner.buf.deref_mut(),
PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
PageWriteGuardState::Downgraded => unreachable!(),
}
}
@@ -278,7 +277,7 @@ impl std::ops::Deref for PageWriteGuard<'_> {
fn deref(&self) -> &Self::Target {
match &self.state {
PageWriteGuardState::Invalid { inner, _permit } => inner.buf.deref(),
PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
PageWriteGuardState::Downgraded => unreachable!(),
}
}
@@ -644,7 +643,7 @@ impl PageCache {
// We could use Vec::leak here, but that potentially also leaks
// uninitialized reserved capacity. With into_boxed_slice and Box::leak
// this is avoided.
let page_buffer = IoBufferMut::with_capacity_zeroed(num_pages * PAGE_SZ).leak();
let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());
let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
size_metrics.max_bytes.set_page_sz(num_pages);
@@ -653,8 +652,7 @@ impl PageCache {
let slots = page_buffer
.chunks_exact_mut(PAGE_SZ)
.map(|chunk| {
// SAFETY: Each chunk has `PAGE_SZ` (8192) bytes, greater than 512, still aligned.
let buf = unsafe { IoPageSlice::new_unchecked(chunk.try_into().unwrap()) };
let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();
Slot {
inner: tokio::sync::RwLock::new(SlotInner {

View File

@@ -1326,22 +1326,22 @@ where
.for_command(ComputeCommandKind::Basebackup)
.inc();
let (lsn, gzip) = match (params.get(2), params.get(3)) {
(None, _) => (None, false),
(Some(&"--gzip"), _) => (None, true),
(Some(lsn_str), gzip_str_opt) => {
let lsn = Lsn::from_str(lsn_str)
.with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?;
let gzip = match gzip_str_opt {
Some(&"--gzip") => true,
None => false,
Some(third_param) => {
return Err(QueryError::Other(anyhow::anyhow!(
"Parameter in position 3 unknown {third_param}",
)))
}
};
(Some(lsn), gzip)
let lsn = if let Some(lsn_str) = params.get(2) {
Some(
Lsn::from_str(lsn_str)
.with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
)
} else {
None
};
let gzip = match params.get(3) {
Some(&"--gzip") => true,
None => false,
Some(third_param) => {
return Err(QueryError::Other(anyhow::anyhow!(
"Parameter in position 3 unknown {third_param}",
)))
}
};

View File

@@ -7,10 +7,11 @@
//! Clarify that)
//!
use super::tenant::{PageReconstructError, Timeline};
use crate::aux_file;
use crate::context::RequestContext;
use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
use crate::walrecord::NeonWalRecord;
use crate::{aux_file, repository::*};
use anyhow::{ensure, Context};
use bytes::{Buf, Bytes, BytesMut};
use enum_map::Enum;
@@ -21,10 +22,8 @@ use pageserver_api::key::{
CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
};
use pageserver_api::keyspace::SparseKeySpace;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::models::AuxFilePolicy;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use pageserver_api::value::Value;
use pageserver_api::key::Key;
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::BLCKSZ;
use postgres_ffi::{Oid, RepOriginId, TimestampTz, TransactionId};
@@ -34,7 +33,7 @@ use std::ops::ControlFlow;
use std::ops::Range;
use strum::IntoEnumIterator;
use tokio_util::sync::CancellationToken;
use tracing::{debug, trace, warn};
use tracing::{debug, info, trace, warn};
use utils::bin_ser::DeserializeError;
use utils::pausable_failpoint;
use utils::{bin_ser::BeSer, lsn::Lsn};
@@ -678,6 +677,21 @@ impl Timeline {
self.get(CHECKPOINT_KEY, lsn, ctx).await
}
async fn list_aux_files_v1(
&self,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
match self.get(AUX_FILES_KEY, lsn, ctx).await {
Ok(buf) => Ok(AuxFilesDirectory::des(&buf)?.files),
Err(e) => {
// This is expected: historical databases do not have the key.
debug!("Failed to get info about AUX files: {}", e);
Ok(HashMap::new())
}
}
}
async fn list_aux_files_v2(
&self,
lsn: Lsn,
@@ -708,7 +722,10 @@ impl Timeline {
lsn: Lsn,
ctx: &RequestContext,
) -> Result<(), PageReconstructError> {
self.list_aux_files_v2(lsn, ctx).await?;
let current_policy = self.last_aux_file_policy.load();
if let Some(AuxFilePolicy::V2) | Some(AuxFilePolicy::CrossValidation) = current_policy {
self.list_aux_files_v2(lsn, ctx).await?;
}
Ok(())
}
@@ -717,7 +734,51 @@ impl Timeline {
lsn: Lsn,
ctx: &RequestContext,
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
self.list_aux_files_v2(lsn, ctx).await
let current_policy = self.last_aux_file_policy.load();
match current_policy {
Some(AuxFilePolicy::V1) => {
let res = self.list_aux_files_v1(lsn, ctx).await?;
let empty_str = if res.is_empty() { ", empty" } else { "" };
warn!(
"this timeline is using deprecated aux file policy V1 (policy=v1{empty_str})"
);
Ok(res)
}
None => {
let res = self.list_aux_files_v1(lsn, ctx).await?;
if !res.is_empty() {
warn!("this timeline is using deprecated aux file policy V1 (policy=None)");
}
Ok(res)
}
Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await,
Some(AuxFilePolicy::CrossValidation) => {
let v1_result = self.list_aux_files_v1(lsn, ctx).await;
let v2_result = self.list_aux_files_v2(lsn, ctx).await;
match (v1_result, v2_result) {
(Ok(v1), Ok(v2)) => {
if v1 != v2 {
tracing::error!(
"unmatched aux file v1 v2 result:\nv1 {v1:?}\nv2 {v2:?}"
);
return Err(PageReconstructError::Other(anyhow::anyhow!(
"unmatched aux file v1 v2 result"
)));
}
Ok(v1)
}
(Ok(_), Err(v2)) => {
tracing::error!("aux file v1 returns Ok while aux file v2 returns an err");
Err(v2)
}
(Err(v1), Ok(_)) => {
tracing::error!("aux file v2 returns Ok while aux file v1 returns an err");
Err(v1)
}
(Err(_), Err(v2)) => Err(v2),
}
}
}
}
pub(crate) async fn get_replorigins(
@@ -893,6 +954,9 @@ impl Timeline {
result.add_key(CONTROLFILE_KEY);
result.add_key(CHECKPOINT_KEY);
if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() {
result.add_key(AUX_FILES_KEY);
}
// Add extra keyspaces in the test cases. Some test cases write keys into the storage without
// creating directory keys. These test cases will add such keyspaces into `extra_test_dense_keyspace`
@@ -1102,6 +1166,9 @@ impl<'a> DatadirModification<'a> {
self.pending_directory_entries.push((DirectoryKind::Db, 0));
self.put(DBDIR_KEY, Value::Image(buf.into()));
// Create AuxFilesDirectory
self.init_aux_dir()?;
let buf = if self.tline.pg_version >= 17 {
TwoPhaseDirectoryV17::ser(&TwoPhaseDirectoryV17 {
xids: HashSet::new(),
@@ -1280,6 +1347,9 @@ impl<'a> DatadirModification<'a> {
// 'true', now write the updated 'dbdirs' map back.
let buf = DbDirectory::ser(&dbdir)?;
self.put(DBDIR_KEY, Value::Image(buf.into()));
// Create AuxFilesDirectory as well
self.init_aux_dir()?;
}
if r.is_none() {
// Create RelDirectory
@@ -1656,60 +1726,200 @@ impl<'a> DatadirModification<'a> {
Ok(())
}
pub fn init_aux_dir(&mut self) -> anyhow::Result<()> {
if let AuxFilePolicy::V2 = self.tline.get_switch_aux_file_policy() {
return Ok(());
}
let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
files: HashMap::new(),
})?;
self.pending_directory_entries
.push((DirectoryKind::AuxFiles, 0));
self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
Ok(())
}
pub async fn put_file(
&mut self,
path: &str,
content: &[u8],
ctx: &RequestContext,
) -> anyhow::Result<()> {
let key = aux_file::encode_aux_file_key(path);
// retrieve the key from the engine
let old_val = match self.get(key, ctx).await {
Ok(val) => Some(val),
Err(PageReconstructError::MissingKey(_)) => None,
Err(e) => return Err(e.into()),
};
let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val {
aux_file::decode_file_value(old_val)?
} else {
Vec::new()
};
let mut other_files = Vec::with_capacity(files.len());
let mut modifying_file = None;
for file @ (p, content) in files {
if path == p {
assert!(
modifying_file.is_none(),
"duplicated entries found for {}",
path
);
modifying_file = Some(content);
let switch_policy = self.tline.get_switch_aux_file_policy();
let policy = {
let current_policy = self.tline.last_aux_file_policy.load();
// Allowed switch path:
// * no aux files -> v1/v2/cross-validation
// * cross-validation->v2
let current_policy = if current_policy.is_none() {
// This path will only be hit once per tenant: we will decide the final policy in this code block.
// The next call to `put_file` will always have `last_aux_file_policy != None`.
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
let aux_files_key_v1 = self.tline.list_aux_files_v1(lsn, ctx).await?;
if aux_files_key_v1.is_empty() {
None
} else {
warn!("this timeline is using deprecated aux file policy V1 (detected existing v1 files)");
self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
Some(AuxFilePolicy::V1)
}
} else {
other_files.push(file);
current_policy
};
if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) {
self.tline.do_switch_aux_policy(switch_policy)?;
info!(current=?current_policy, next=?switch_policy, "switching aux file policy");
switch_policy
} else {
// This branch handles non-valid migration path, and the case that switch_policy == current_policy.
// And actually, because the migration path always allow unspecified -> *, this unwrap_or will never be hit.
current_policy.unwrap_or(AuxFilePolicy::default_tenant_config())
}
};
if let AuxFilePolicy::V2 | AuxFilePolicy::CrossValidation = policy {
let key = aux_file::encode_aux_file_key(path);
// retrieve the key from the engine
let old_val = match self.get(key, ctx).await {
Ok(val) => Some(val),
Err(PageReconstructError::MissingKey(_)) => None,
Err(e) => return Err(e.into()),
};
let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val {
aux_file::decode_file_value(old_val)?
} else {
Vec::new()
};
let mut other_files = Vec::with_capacity(files.len());
let mut modifying_file = None;
for file @ (p, content) in files {
if path == p {
assert!(
modifying_file.is_none(),
"duplicated entries found for {}",
path
);
modifying_file = Some(content);
} else {
other_files.push(file);
}
}
let mut new_files = other_files;
match (modifying_file, content.is_empty()) {
(Some(old_content), false) => {
self.tline
.aux_file_size_estimator
.on_update(old_content.len(), content.len());
new_files.push((path, content));
}
(Some(old_content), true) => {
self.tline
.aux_file_size_estimator
.on_remove(old_content.len());
// not adding the file key to the final `new_files` vec.
}
(None, false) => {
self.tline.aux_file_size_estimator.on_add(content.len());
new_files.push((path, content));
}
(None, true) => warn!("removing non-existing aux file: {}", path),
}
let new_val = aux_file::encode_file_value(&new_files)?;
self.put(key, Value::Image(new_val.into()));
}
let mut new_files = other_files;
match (modifying_file, content.is_empty()) {
(Some(old_content), false) => {
self.tline
.aux_file_size_estimator
.on_update(old_content.len(), content.len());
new_files.push((path, content));
if let AuxFilePolicy::V1 | AuxFilePolicy::CrossValidation = policy {
let file_path = path.to_string();
let content = if content.is_empty() {
None
} else {
Some(Bytes::copy_from_slice(content))
};
let n_files;
let mut aux_files = self.tline.aux_files.lock().await;
if let Some(mut dir) = aux_files.dir.take() {
// We already updated aux files in `self`: emit a delta and update our latest value.
dir.upsert(file_path.clone(), content.clone());
n_files = dir.files.len();
if aux_files.n_deltas == MAX_AUX_FILE_DELTAS {
self.put(
AUX_FILES_KEY,
Value::Image(Bytes::from(
AuxFilesDirectory::ser(&dir).context("serialize")?,
)),
);
aux_files.n_deltas = 0;
} else {
self.put(
AUX_FILES_KEY,
Value::WalRecord(NeonWalRecord::AuxFile { file_path, content }),
);
aux_files.n_deltas += 1;
}
aux_files.dir = Some(dir);
} else {
// Check if the AUX_FILES_KEY is initialized
match self.get(AUX_FILES_KEY, ctx).await {
Ok(dir_bytes) => {
let mut dir = AuxFilesDirectory::des(&dir_bytes)?;
// Key is already set, we may append a delta
self.put(
AUX_FILES_KEY,
Value::WalRecord(NeonWalRecord::AuxFile {
file_path: file_path.clone(),
content: content.clone(),
}),
);
dir.upsert(file_path, content);
n_files = dir.files.len();
aux_files.dir = Some(dir);
}
Err(
e @ (PageReconstructError::Cancelled
| PageReconstructError::AncestorLsnTimeout(_)),
) => {
// Important that we do not interpret a shutdown error as "not found" and thereby
// reset the map.
return Err(e.into());
}
// Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but
// the original code assumes all other errors are missing keys. Therefore, we keep the code path
// the same for now, though in theory, we should only match the `MissingKey` variant.
Err(
e @ (PageReconstructError::Other(_)
| PageReconstructError::WalRedo(_)
| PageReconstructError::MissingKey(_)),
) => {
// Key is missing, we must insert an image as the basis for subsequent deltas.
if !matches!(e, PageReconstructError::MissingKey(_)) {
let e = utils::error::report_compact_sources(&e);
tracing::warn!("treating error as if it was a missing key: {}", e);
}
let mut dir = AuxFilesDirectory {
files: HashMap::new(),
};
dir.upsert(file_path, content);
self.put(
AUX_FILES_KEY,
Value::Image(Bytes::from(
AuxFilesDirectory::ser(&dir).context("serialize")?,
)),
);
n_files = 1;
aux_files.dir = Some(dir);
}
}
}
(Some(old_content), true) => {
self.tline
.aux_file_size_estimator
.on_remove(old_content.len());
// not adding the file key to the final `new_files` vec.
}
(None, false) => {
self.tline.aux_file_size_estimator.on_add(content.len());
new_files.push((path, content));
}
(None, true) => warn!("removing non-existing aux file: {}", path),
self.pending_directory_entries
.push((DirectoryKind::AuxFiles, n_files));
}
let new_val = aux_file::encode_file_value(&new_files)?;
self.put(key, Value::Image(new_val.into()));
Ok(())
}
@@ -1879,6 +2089,12 @@ impl<'a> DatadirModification<'a> {
self.tline.get(key, lsn, ctx).await
}
/// Only used during unit tests, force putting a key into the modification.
#[cfg(test)]
pub(crate) fn put_for_test(&mut self, key: Key, val: Value) {
self.put(key, val);
}
fn put(&mut self, key: Key, val: Value) {
if Self::is_data_key(&key) {
self.put_data(key.to_compact(), val)
@@ -1996,6 +2212,21 @@ struct RelDirectory {
rels: HashSet<(Oid, u8)>,
}
#[derive(Debug, Serialize, Deserialize, Default, PartialEq)]
pub(crate) struct AuxFilesDirectory {
pub(crate) files: HashMap<String, Bytes>,
}
impl AuxFilesDirectory {
pub(crate) fn upsert(&mut self, key: String, value: Option<Bytes>) {
if let Some(value) = value {
self.files.insert(key, value);
} else {
self.files.remove(&key);
}
}
}
#[derive(Debug, Serialize, Deserialize)]
struct RelSizeEntry {
nblocks: u32,

View File

@@ -1,9 +1,13 @@
//! This module defines the value type used by the storage engine.
use crate::record::NeonWalRecord;
use crate::walrecord::NeonWalRecord;
use anyhow::Result;
use bytes::Bytes;
use serde::{Deserialize, Serialize};
use std::ops::AddAssign;
use std::time::Duration;
pub use pageserver_api::key::{Key, KEY_SIZE};
/// A 'value' stored for a one Key.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum Value {
/// An Image value contains a full copy of the value
@@ -29,17 +33,17 @@ impl Value {
}
#[derive(Debug, PartialEq)]
pub enum InvalidInput {
pub(crate) enum InvalidInput {
TooShortValue,
TooShortPostgresRecord,
}
/// We could have a ValueRef where everything is `serde(borrow)`. Before implementing that, lets
/// use this type for querying if a slice looks some particular way.
pub struct ValueBytes;
pub(crate) struct ValueBytes;
impl ValueBytes {
pub fn will_init(raw: &[u8]) -> Result<bool, InvalidInput> {
pub(crate) fn will_init(raw: &[u8]) -> Result<bool, InvalidInput> {
if raw.len() < 12 {
return Err(InvalidInput::TooShortValue);
}
@@ -75,7 +79,6 @@ impl ValueBytes {
mod test {
use super::*;
use bytes::Bytes;
use utils::bin_ser::BeSer;
macro_rules! roundtrip {
@@ -226,3 +229,56 @@ mod test {
assert!(!ValueBytes::will_init(&expected).unwrap());
}
}
///
/// Result of performing GC
///
#[derive(Default, Serialize, Debug)]
pub struct GcResult {
pub layers_total: u64,
pub layers_needed_by_cutoff: u64,
pub layers_needed_by_pitr: u64,
pub layers_needed_by_branches: u64,
pub layers_needed_by_leases: u64,
pub layers_not_updated: u64,
pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
#[serde(serialize_with = "serialize_duration_as_millis")]
pub elapsed: Duration,
/// The layers which were garbage collected.
///
/// Used in `/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc` to wait for the layers to be
/// dropped in tests.
#[cfg(feature = "testing")]
#[serde(skip)]
pub(crate) doomed_layers: Vec<crate::tenant::storage_layer::Layer>,
}
// helper function for `GcResult`, serializing a `Duration` as an integer number of milliseconds
fn serialize_duration_as_millis<S>(d: &Duration, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
d.as_millis().serialize(serializer)
}
impl AddAssign for GcResult {
fn add_assign(&mut self, other: Self) {
self.layers_total += other.layers_total;
self.layers_needed_by_pitr += other.layers_needed_by_pitr;
self.layers_needed_by_cutoff += other.layers_needed_by_cutoff;
self.layers_needed_by_branches += other.layers_needed_by_branches;
self.layers_needed_by_leases += other.layers_needed_by_leases;
self.layers_not_updated += other.layers_not_updated;
self.layers_removed += other.layers_removed;
self.elapsed += other.elapsed;
#[cfg(feature = "testing")]
{
let mut other = other;
self.doomed_layers.append(&mut other.doomed_layers);
}
}
}

View File

@@ -53,22 +53,6 @@ impl Statvfs {
Statvfs::Mock(stat) => stat.block_size,
}
}
/// Get the available and total bytes on the filesystem.
pub fn get_avail_total_bytes(&self) -> (u64, u64) {
// https://unix.stackexchange.com/a/703650
let blocksize = if self.fragment_size() > 0 {
self.fragment_size()
} else {
self.block_size()
};
// use blocks_available (b_avail) since, pageserver runs as unprivileged user
let avail_bytes = self.blocks_available() * blocksize;
let total_bytes = self.blocks() * blocksize;
(avail_bytes, total_bytes)
}
}
pub mod mock {
@@ -90,7 +74,7 @@ pub mod mock {
let used_bytes = walk_dir_disk_usage(tenants_dir, name_filter.as_deref()).unwrap();
// round it up to the nearest block multiple
let used_blocks = used_bytes.div_ceil(*blocksize);
let used_blocks = (used_bytes + (blocksize - 1)) / blocksize;
if used_blocks > *total_blocks {
panic!(

File diff suppressed because it is too large Load Diff

View File

@@ -5,8 +5,6 @@
use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
use crate::context::RequestContext;
use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ};
#[cfg(test)]
use crate::virtual_file::IoBufferMut;
use crate::virtual_file::VirtualFile;
use bytes::Bytes;
use std::ops::Deref;
@@ -42,7 +40,7 @@ pub enum BlockLease<'a> {
#[cfg(test)]
Arc(std::sync::Arc<[u8; PAGE_SZ]>),
#[cfg(test)]
IoBufferMut(IoBufferMut),
Vec(Vec<u8>),
}
impl From<PageReadGuard<'static>> for BlockLease<'static> {
@@ -52,13 +50,13 @@ impl From<PageReadGuard<'static>> for BlockLease<'static> {
}
#[cfg(test)]
impl From<std::sync::Arc<[u8; PAGE_SZ]>> for BlockLease<'_> {
impl<'a> From<std::sync::Arc<[u8; PAGE_SZ]>> for BlockLease<'a> {
fn from(value: std::sync::Arc<[u8; PAGE_SZ]>) -> Self {
BlockLease::Arc(value)
}
}
impl Deref for BlockLease<'_> {
impl<'a> Deref for BlockLease<'a> {
type Target = [u8; PAGE_SZ];
fn deref(&self) -> &Self::Target {
@@ -69,7 +67,7 @@ impl Deref for BlockLease<'_> {
#[cfg(test)]
BlockLease::Arc(v) => v.deref(),
#[cfg(test)]
BlockLease::IoBufferMut(v) => {
BlockLease::Vec(v) => {
TryFrom::try_from(&v[..]).expect("caller must ensure that v has PAGE_SZ")
}
}

View File

@@ -131,7 +131,7 @@ struct OnDiskNode<'a, const L: usize> {
values: &'a [u8],
}
impl<const L: usize> OnDiskNode<'_, L> {
impl<'a, const L: usize> OnDiskNode<'a, L> {
///
/// Interpret a PAGE_SZ page as a node.
///

View File

@@ -6,11 +6,10 @@ use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::page_cache;
use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File;
use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut;
use crate::virtual_file::owned_buffers_io::slice::SliceMutExt;
use crate::virtual_file::owned_buffers_io::util::size_tracking_writer;
use crate::virtual_file::owned_buffers_io::write::Buffer;
use crate::virtual_file::{self, owned_buffers_io, IoBufferMut, VirtualFile};
use crate::virtual_file::{self, owned_buffers_io, VirtualFile};
use bytes::BytesMut;
use camino::Utf8PathBuf;
use num_traits::Num;
@@ -108,18 +107,15 @@ impl EphemeralFile {
self.page_cache_file_id
}
pub(crate) async fn load_to_io_buf(
&self,
ctx: &RequestContext,
) -> Result<IoBufferMut, io::Error> {
pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result<Vec<u8>, io::Error> {
let size = self.len().into_usize();
let buf = IoBufferMut::with_capacity(size);
let (slice, nread) = self.read_exact_at_eof_ok(0, buf.slice_full(), ctx).await?;
let vec = Vec::with_capacity(size);
let (slice, nread) = self.read_exact_at_eof_ok(0, vec.slice_full(), ctx).await?;
assert_eq!(nread, size);
let buf = slice.into_inner();
assert_eq!(buf.len(), nread);
assert_eq!(buf.capacity(), size, "we shouldn't be reallocating");
Ok(buf)
let vec = slice.into_inner();
assert_eq!(vec.len(), nread);
assert_eq!(vec.capacity(), size, "we shouldn't be reallocating");
Ok(vec)
}
/// Returns the offset at which the first byte of the input was written, for use
@@ -162,7 +158,7 @@ impl EphemeralFile {
}
impl super::storage_layer::inmemory_layer::vectored_dio_read::File for EphemeralFile {
async fn read_exact_at_eof_ok<'a, 'b, B: IoBufAlignedMut + Send>(
async fn read_exact_at_eof_ok<'a, 'b, B: tokio_epoll_uring::IoBufMut + Send>(
&'b self,
start: u64,
dst: tokio_epoll_uring::Slice<B>,
@@ -349,7 +345,7 @@ mod tests {
assert!(file.len() as usize == write_nbytes);
for i in 0..write_nbytes {
assert_eq!(value_offsets[i], i.into_u64());
let buf = IoBufferMut::with_capacity(1);
let buf = Vec::with_capacity(1);
let (buf_slice, nread) = file
.read_exact_at_eof_ok(i.into_u64(), buf.slice_full(), &ctx)
.await
@@ -389,7 +385,7 @@ mod tests {
// assert the state is as this test expects it to be
assert_eq!(
&file.load_to_io_buf(&ctx).await.unwrap(),
&file.load_to_vec(&ctx).await.unwrap(),
&content[0..cap + cap / 2]
);
let md = file
@@ -444,7 +440,7 @@ mod tests {
let (buf, nread) = file
.read_exact_at_eof_ok(
start.into_u64(),
IoBufferMut::with_capacity(len).slice_full(),
Vec::with_capacity(len).slice_full(),
ctx,
)
.await

View File

@@ -48,9 +48,9 @@ mod layer_coverage;
use crate::context::RequestContext;
use crate::keyspace::KeyPartitioning;
use crate::repository::Key;
use crate::tenant::storage_layer::InMemoryLayer;
use anyhow::Result;
use pageserver_api::key::Key;
use pageserver_api::keyspace::{KeySpace, KeySpaceAccum};
use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze};
use std::collections::{HashMap, VecDeque};

View File

@@ -11,7 +11,6 @@ use pageserver_api::shard::{
};
use pageserver_api::upcall_api::ReAttachResponseTenant;
use rand::{distributions::Alphanumeric, Rng};
use remote_storage::TimeoutOrCancel;
use std::borrow::Cow;
use std::cmp::Ordering;
use std::collections::{BTreeMap, HashMap, HashSet};
@@ -1351,17 +1350,47 @@ impl TenantManager {
}
}
async fn delete_tenant_remote(
&self,
tenant_shard_id: TenantShardId,
) -> Result<(), DeleteTenantError> {
let remote_path = remote_tenant_path(&tenant_shard_id);
let mut keys_stream = self.resources.remote_storage.list_streaming(
Some(&remote_path),
remote_storage::ListingMode::NoDelimiter,
None,
&self.cancel,
);
while let Some(chunk) = keys_stream.next().await {
let keys = match chunk {
Ok(listing) => listing.keys,
Err(remote_storage::DownloadError::Cancelled) => {
return Err(DeleteTenantError::Cancelled)
}
Err(remote_storage::DownloadError::NotFound) => return Ok(()),
Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
};
if keys.is_empty() {
tracing::info!("Remote storage already deleted");
} else {
tracing::info!("Deleting {} keys from remote storage", keys.len());
let keys = keys.into_iter().map(|o| o.key).collect::<Vec<_>>();
self.resources
.remote_storage
.delete_objects(&keys, &self.cancel)
.await?;
}
}
Ok(())
}
/// If a tenant is attached, detach it. Then remove its data from remote storage.
///
/// A tenant is considered deleted once it is gone from remote storage. It is the caller's
/// responsibility to avoid trying to attach the tenant again or use it any way once deletion
/// has started: this operation is not atomic, and must be retried until it succeeds.
///
/// As a special case, if an unsharded tenant ID is given for a sharded tenant, it will remove
/// all tenant shards in remote storage (removing all paths with the tenant prefix). The storage
/// controller uses this to purge all remote tenant data, including any stale parent shards that
/// may remain after splits. Ideally, this special case would be handled elsewhere. See:
/// <https://github.com/neondatabase/neon/pull/9394>.
pub(crate) async fn delete_tenant(
&self,
tenant_shard_id: TenantShardId,
@@ -1413,29 +1442,25 @@ impl TenantManager {
// in 500 responses to delete requests.
// - We keep the `SlotGuard` during this I/O, so that if a concurrent delete request comes in, it will
// 503/retry, rather than kicking off a wasteful concurrent deletion.
// NB: this also deletes partial prefixes, i.e. a <tenant_id> path will delete all
// <tenant_id>_<shard_id>/* objects. See method comment for why.
backoff::retry(
|| async move {
self.resources
.remote_storage
.delete_prefix(&remote_tenant_path(&tenant_shard_id), &self.cancel)
.await
match backoff::retry(
|| async move { self.delete_tenant_remote(tenant_shard_id).await },
|e| match e {
DeleteTenantError::Cancelled => true,
DeleteTenantError::SlotError(_) => {
unreachable!("Remote deletion doesn't touch slots")
}
_ => false,
},
|_| false, // backoff::retry handles cancellation
1,
3,
&format!("delete_tenant[tenant_shard_id={tenant_shard_id}]"),
&self.cancel,
)
.await
.unwrap_or(Err(TimeoutOrCancel::Cancel.into()))
.map_err(|err| {
if TimeoutOrCancel::caused_by_cancel(&err) {
return DeleteTenantError::Cancelled;
}
DeleteTenantError::Other(err)
})
{
Some(r) => r,
None => Err(DeleteTenantError::Cancelled),
}
}
#[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))]
@@ -2811,7 +2836,7 @@ where
}
use {
crate::gc_result::GcResult, pageserver_api::models::TimelineGcRequest,
crate::repository::GcResult, pageserver_api::models::TimelineGcRequest,
utils::http::error::ApiError,
};

View File

@@ -180,7 +180,6 @@
pub(crate) mod download;
pub mod index;
pub mod manifest;
pub(crate) mod upload;
use anyhow::Context;
@@ -188,10 +187,11 @@ use camino::Utf8Path;
use chrono::{NaiveDateTime, Utc};
pub(crate) use download::download_initdb_tar_zst;
use pageserver_api::models::TimelineArchivalState;
use pageserver_api::models::{AuxFilePolicy, TimelineArchivalState};
use pageserver_api::shard::{ShardIndex, TenantShardId};
use scopeguard::ScopeGuard;
use tokio_util::sync::CancellationToken;
pub(crate) use upload::upload_initdb_dir;
use utils::backoff::{
self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
};
@@ -245,11 +245,9 @@ use super::upload_queue::{NotInitialized, SetDeletedFlagProgress};
use super::Generation;
pub(crate) use download::{
do_download_tenant_manifest, download_index_part, is_temp_download_file,
list_remote_tenant_shards, list_remote_timelines,
download_index_part, is_temp_download_file, list_remote_tenant_shards, list_remote_timelines,
};
pub(crate) use index::LayerFileMetadata;
pub(crate) use upload::{upload_initdb_dir, upload_tenant_manifest};
// Occasional network issues and such can cause remote operations to fail, and
// that's expected. If a download fails, we log it at info-level, and retry.
@@ -274,12 +272,6 @@ pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
/// which we warn and skip.
const DELETION_QUEUE_FLUSH_TIMEOUT: Duration = Duration::from_secs(10);
/// Hardcode a generation for the tenant manifest for now so that we don't
/// need to deal with generation-less manifests in the future.
///
/// TODO: add proper generation support to all the places that use this.
pub(crate) const TENANT_MANIFEST_GENERATION: Generation = Generation::new(1);
pub enum MaybeDeletedIndexPart {
IndexPart(IndexPart),
Deleted(IndexPart),
@@ -303,10 +295,6 @@ pub enum WaitCompletionError {
UploadQueueShutDownOrStopped,
}
#[derive(Debug, thiserror::Error)]
#[error("Upload queue either in unexpected state or hasn't downloaded manifest yet")]
pub struct UploadQueueNotReadyError;
/// A client for accessing a timeline's data in remote storage.
///
/// This takes care of managing the number of connections, and balancing them
@@ -480,20 +468,6 @@ impl RemoteTimelineClient {
.ok()
}
/// Returns `Ok(Some(timestamp))` if the timeline has been archived, `Ok(None)` if the timeline hasn't been archived.
///
/// Return Err(_) if the remote index_part hasn't been downloaded yet, or the timeline hasn't been stopped yet.
pub(crate) fn archived_at_stopped_queue(
&self,
) -> Result<Option<NaiveDateTime>, UploadQueueNotReadyError> {
self.upload_queue
.lock()
.unwrap()
.stopped_mut()
.map(|q| q.upload_queue_for_deletion.clean.0.archived_at)
.map_err(|_| UploadQueueNotReadyError)
}
fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) {
let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part {
current_remote_index_part
@@ -531,7 +505,7 @@ impl RemoteTimelineClient {
},
);
let (index_part, index_generation, index_last_modified) = download::download_index_part(
let (index_part, _index_generation) = download::download_index_part(
&self.storage_impl,
&self.tenant_shard_id,
&self.timeline_id,
@@ -545,49 +519,6 @@ impl RemoteTimelineClient {
)
.await?;
// Defense in depth: monotonicity of generation numbers is an important correctness guarantee, so when we see a very
// old index, we do extra checks in case this is the result of backward time-travel of the generation number (e.g.
// in case of a bug in the service that issues generation numbers). Indices are allowed to be old, but we expect that
// when we load an old index we are loading the _latest_ index: if we are asked to load an old index and there is
// also a newer index available, that is surprising.
const INDEX_AGE_CHECKS_THRESHOLD: Duration = Duration::from_secs(14 * 24 * 3600);
let index_age = index_last_modified.elapsed().unwrap_or_else(|e| {
if e.duration() > Duration::from_secs(5) {
// We only warn if the S3 clock and our local clock are >5s out: because this is a low resolution
// timestamp, it is common to be out by at least 1 second.
tracing::warn!("Index has modification time in the future: {e}");
}
Duration::ZERO
});
if index_age > INDEX_AGE_CHECKS_THRESHOLD {
tracing::info!(
?index_generation,
age = index_age.as_secs_f64(),
"Loaded an old index, checking for other indices..."
);
// Find the highest-generation index
let (_latest_index_part, latest_index_generation, latest_index_mtime) =
download::download_index_part(
&self.storage_impl,
&self.tenant_shard_id,
&self.timeline_id,
Generation::MAX,
cancel,
)
.await?;
if latest_index_generation > index_generation {
// Unexpected! Why are we loading such an old index if a more recent one exists?
tracing::warn!(
?index_generation,
?latest_index_generation,
?latest_index_mtime,
"Found a newer index while loading an old one"
);
}
}
if index_part.deleted_at.is_some() {
Ok(MaybeDeletedIndexPart::Deleted(index_part))
} else {
@@ -697,6 +628,18 @@ impl RemoteTimelineClient {
Ok(())
}
/// Launch an index-file upload operation in the background, with only the `aux_file_policy` flag updated.
pub(crate) fn schedule_index_upload_for_aux_file_policy_update(
self: &Arc<Self>,
last_aux_file_policy: Option<AuxFilePolicy>,
) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
upload_queue.dirty.last_aux_file_policy = last_aux_file_policy;
self.schedule_index_upload(upload_queue)?;
Ok(())
}
/// Launch an index-file upload operation in the background, with only the `archived_at` field updated.
///
/// Returns whether it is required to wait for the queue to be empty to ensure that the change is uploaded,
@@ -2208,7 +2151,7 @@ pub(crate) struct UploadQueueAccessor<'a> {
inner: std::sync::MutexGuard<'a, UploadQueue>,
}
impl UploadQueueAccessor<'_> {
impl<'a> UploadQueueAccessor<'a> {
pub(crate) fn latest_uploaded_index_part(&self) -> &IndexPart {
match &*self.inner {
UploadQueue::Initialized(x) => &x.clean.0,
@@ -2224,17 +2167,6 @@ pub fn remote_tenant_path(tenant_shard_id: &TenantShardId) -> RemotePath {
RemotePath::from_string(&path).expect("Failed to construct path")
}
pub fn remote_tenant_manifest_path(
tenant_shard_id: &TenantShardId,
generation: Generation,
) -> RemotePath {
let path = format!(
"tenants/{tenant_shard_id}/tenant-manifest{}.json",
generation.get_suffix()
);
RemotePath::from_string(&path).expect("Failed to construct path")
}
pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath {
let path = format!("tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}");
RemotePath::from_string(&path).expect("Failed to construct path")

View File

@@ -6,7 +6,6 @@
use std::collections::HashSet;
use std::future::Future;
use std::str::FromStr;
use std::time::SystemTime;
use anyhow::{anyhow, Context};
use camino::{Utf8Path, Utf8PathBuf};
@@ -34,11 +33,10 @@ use utils::id::{TenantId, TimelineId};
use utils::pausable_failpoint;
use super::index::{IndexPart, LayerFileMetadata};
use super::manifest::TenantManifest;
use super::{
parse_remote_index_path, remote_index_path, remote_initdb_archive_path,
remote_initdb_preserved_archive_path, remote_tenant_manifest_path, remote_tenant_path,
FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
remote_initdb_preserved_archive_path, remote_tenant_path, FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
};
///
@@ -339,15 +337,19 @@ pub async fn list_remote_timelines(
list_identifiers::<TimelineId>(storage, remote_path, cancel).await
}
async fn do_download_remote_path_retry_forever(
async fn do_download_index_part(
storage: &GenericRemoteStorage,
remote_path: &RemotePath,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
index_generation: Generation,
cancel: &CancellationToken,
) -> Result<(Vec<u8>, SystemTime), DownloadError> {
download_retry_forever(
) -> Result<(IndexPart, Generation), DownloadError> {
let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);
let index_part_bytes = download_retry_forever(
|| async {
let download = storage
.download(remote_path, &DownloadOpts::default(), cancel)
.download(&remote_path, &DownloadOpts::default(), cancel)
.await?;
let mut bytes = Vec::new();
@@ -357,50 +359,18 @@ async fn do_download_remote_path_retry_forever(
tokio::io::copy_buf(&mut stream, &mut bytes).await?;
Ok((bytes, download.last_modified))
Ok(bytes)
},
&format!("download {remote_path:?}"),
cancel,
)
.await
}
pub async fn do_download_tenant_manifest(
storage: &GenericRemoteStorage,
tenant_shard_id: &TenantShardId,
cancel: &CancellationToken,
) -> Result<(TenantManifest, Generation), DownloadError> {
// TODO: generation support
let generation = super::TENANT_MANIFEST_GENERATION;
let remote_path = remote_tenant_manifest_path(tenant_shard_id, generation);
let (manifest_bytes, _manifest_bytes_mtime) =
do_download_remote_path_retry_forever(storage, &remote_path, cancel).await?;
let tenant_manifest = TenantManifest::from_json_bytes(&manifest_bytes)
.with_context(|| format!("deserialize tenant manifest file at {remote_path:?}"))
.map_err(DownloadError::Other)?;
Ok((tenant_manifest, generation))
}
async fn do_download_index_part(
storage: &GenericRemoteStorage,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
index_generation: Generation,
cancel: &CancellationToken,
) -> Result<(IndexPart, Generation, SystemTime), DownloadError> {
let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);
let (index_part_bytes, index_part_mtime) =
do_download_remote_path_retry_forever(storage, &remote_path, cancel).await?;
.await?;
let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
.with_context(|| format!("deserialize index part file at {remote_path:?}"))
.map_err(DownloadError::Other)?;
Ok((index_part, index_generation, index_part_mtime))
Ok((index_part, index_generation))
}
/// index_part.json objects are suffixed with a generation number, so we cannot
@@ -415,7 +385,7 @@ pub(crate) async fn download_index_part(
timeline_id: &TimelineId,
my_generation: Generation,
cancel: &CancellationToken,
) -> Result<(IndexPart, Generation, SystemTime), DownloadError> {
) -> Result<(IndexPart, Generation), DownloadError> {
debug_assert_current_span_has_tenant_and_timeline_id();
if my_generation.is_none() {

View File

@@ -121,11 +121,11 @@ impl IndexPart {
self.disk_consistent_lsn
}
pub fn from_json_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
pub fn from_s3_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
serde_json::from_slice::<IndexPart>(bytes)
}
pub fn to_json_bytes(&self) -> serde_json::Result<Vec<u8>> {
pub fn to_s3_bytes(&self) -> serde_json::Result<Vec<u8>> {
serde_json::to_vec(self)
}
@@ -133,6 +133,10 @@ impl IndexPart {
pub(crate) fn example() -> Self {
Self::empty(TimelineMetadata::example())
}
pub(crate) fn last_aux_file_policy(&self) -> Option<AuxFilePolicy> {
self.last_aux_file_policy
}
}
/// Metadata gathered for each of the layer files.
@@ -383,7 +387,7 @@ mod tests {
last_aux_file_policy: None,
};
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
assert_eq!(part, expected);
}
@@ -427,7 +431,7 @@ mod tests {
last_aux_file_policy: None,
};
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
assert_eq!(part, expected);
}
@@ -472,7 +476,7 @@ mod tests {
last_aux_file_policy: None,
};
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
assert_eq!(part, expected);
}
@@ -520,7 +524,7 @@ mod tests {
last_aux_file_policy: None,
};
let empty_layers_parsed = IndexPart::from_json_bytes(empty_layers_json.as_bytes()).unwrap();
let empty_layers_parsed = IndexPart::from_s3_bytes(empty_layers_json.as_bytes()).unwrap();
assert_eq!(empty_layers_parsed, expected);
}
@@ -563,7 +567,7 @@ mod tests {
last_aux_file_policy: None,
};
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
assert_eq!(part, expected);
}
@@ -609,7 +613,7 @@ mod tests {
last_aux_file_policy: None,
};
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
assert_eq!(part, expected);
}
@@ -660,7 +664,7 @@ mod tests {
last_aux_file_policy: Some(AuxFilePolicy::V2),
};
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
assert_eq!(part, expected);
}
@@ -716,7 +720,7 @@ mod tests {
last_aux_file_policy: Default::default(),
};
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
assert_eq!(part, expected);
}
@@ -773,7 +777,7 @@ mod tests {
last_aux_file_policy: Default::default(),
};
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
assert_eq!(part, expected);
}
@@ -835,7 +839,7 @@ mod tests {
archived_at: None,
};
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
assert_eq!(part, expected);
}

View File

@@ -1,53 +0,0 @@
use chrono::NaiveDateTime;
use serde::{Deserialize, Serialize};
use utils::{id::TimelineId, lsn::Lsn};
/// Tenant-shard scoped manifest
#[derive(Clone, Serialize, Deserialize)]
pub struct TenantManifest {
/// Debugging aid describing the version of this manifest.
/// Can also be used for distinguishing breaking changes later on.
pub version: usize,
/// The list of offloaded timelines together with enough information
/// to not have to actually load them.
///
/// Note: the timelines mentioned in this list might be deleted, i.e.
/// we don't hold an invariant that the references aren't dangling.
/// Existence of index-part.json is the actual indicator of timeline existence.
pub offloaded_timelines: Vec<OffloadedTimelineManifest>,
}
/// The remote level representation of an offloaded timeline.
///
/// Very similar to [`pageserver_api::models::OffloadedTimelineInfo`],
/// but the two datastructures serve different needs, this is for a persistent disk format
/// that must be backwards compatible, while the other is only for informative purposes.
#[derive(Clone, Serialize, Deserialize, Copy)]
pub struct OffloadedTimelineManifest {
pub timeline_id: TimelineId,
/// Whether the timeline has a parent it has been branched off from or not
pub ancestor_timeline_id: Option<TimelineId>,
/// Whether to retain the branch lsn at the ancestor or not
pub ancestor_retain_lsn: Option<Lsn>,
/// The time point when the timeline was archived
pub archived_at: NaiveDateTime,
}
pub const LATEST_TENANT_MANIFEST_VERSION: usize = 1;
impl TenantManifest {
pub(crate) fn empty() -> Self {
Self {
version: LATEST_TENANT_MANIFEST_VERSION,
offloaded_timelines: vec![],
}
}
pub(crate) fn from_json_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
serde_json::from_slice::<Self>(bytes)
}
pub(crate) fn to_json_bytes(&self) -> serde_json::Result<Vec<u8>> {
serde_json::to_vec(self)
}
}

View File

@@ -13,11 +13,9 @@ use tokio_util::sync::CancellationToken;
use utils::{backoff, pausable_failpoint};
use super::index::IndexPart;
use super::manifest::TenantManifest;
use super::Generation;
use crate::tenant::remote_timeline_client::{
remote_index_path, remote_initdb_archive_path, remote_initdb_preserved_archive_path,
remote_tenant_manifest_path,
};
use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError};
use utils::id::{TenantId, TimelineId};
@@ -41,7 +39,7 @@ pub(crate) async fn upload_index_part<'a>(
pausable_failpoint!("before-upload-index-pausable");
// FIXME: this error comes too late
let serialized = index_part.to_json_bytes()?;
let serialized = index_part.to_s3_bytes()?;
let serialized = Bytes::from(serialized);
let index_part_size = serialized.len();
@@ -57,37 +55,6 @@ pub(crate) async fn upload_index_part<'a>(
.await
.with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
}
/// Serializes and uploads the given tenant manifest data to the remote storage.
pub(crate) async fn upload_tenant_manifest(
storage: &GenericRemoteStorage,
tenant_shard_id: &TenantShardId,
generation: Generation,
tenant_manifest: &TenantManifest,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
tracing::trace!("uploading new tenant manifest");
fail_point!("before-upload-manifest", |_| {
bail!("failpoint before-upload-manifest")
});
pausable_failpoint!("before-upload-manifest-pausable");
let serialized = tenant_manifest.to_json_bytes()?;
let serialized = Bytes::from(serialized);
let tenant_manifest_site = serialized.len();
let remote_path = remote_tenant_manifest_path(tenant_shard_id, generation);
storage
.upload_storage_object(
futures::stream::once(futures::future::ready(Ok(serialized))),
tenant_manifest_site,
&remote_path,
cancel,
)
.await
.with_context(|| format!("upload tenant manifest for '{tenant_shard_id}'"))
}
/// Attempts to upload given layer files.
/// No extra checks for overlapping files is made and any files that are already present remotely will be overwritten, if submitted during the upload.

View File

@@ -108,6 +108,7 @@ impl scheduler::Completion for WriteComplete {
/// when we last did a write. We only populate this after doing at least one
/// write for a tenant -- this avoids holding state for tenants that have
/// uploads disabled.
struct UploaderTenantState {
// This Weak only exists to enable culling idle instances of this type
// when the Tenant has been deallocated.

View File

@@ -11,11 +11,11 @@ pub mod merge_iterator;
pub mod split_writer;
use crate::context::{AccessStatsBehavior, RequestContext};
use crate::repository::Value;
use crate::walrecord::NeonWalRecord;
use bytes::Bytes;
use pageserver_api::key::Key;
use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
use pageserver_api::record::NeonWalRecord;
use pageserver_api::value::Value;
use std::cmp::{Ordering, Reverse};
use std::collections::hash_map::Entry;
use std::collections::{BinaryHeap, HashMap};
@@ -705,7 +705,7 @@ pub mod tests {
/// Useful with `Key`, which has too verbose `{:?}` for printing multiple layers.
struct RangeDisplayDebug<'a, T: std::fmt::Display>(&'a Range<T>);
impl<T: std::fmt::Display> std::fmt::Debug for RangeDisplayDebug<'_, T> {
impl<'a, T: std::fmt::Display> std::fmt::Debug for RangeDisplayDebug<'a, T> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}..{}", self.0.start, self.0.end)
}

View File

@@ -30,6 +30,7 @@
use crate::config::PageServerConf;
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
use crate::page_cache::{self, FileId, PAGE_SZ};
use crate::repository::{Key, Value, KEY_SIZE};
use crate::tenant::blob_io::BlobWriter;
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
use crate::tenant::disk_btree::{
@@ -43,21 +44,19 @@ use crate::tenant::vectored_blob_io::{
};
use crate::tenant::PageReconstructError;
use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
use crate::virtual_file::IoBufferMut;
use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
use crate::TEMP_FILE_SUFFIX;
use crate::{walrecord, TEMP_FILE_SUFFIX};
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
use anyhow::{anyhow, bail, ensure, Context, Result};
use bytes::BytesMut;
use camino::{Utf8Path, Utf8PathBuf};
use futures::StreamExt;
use itertools::Itertools;
use pageserver_api::config::MaxVectoredReadBytes;
use pageserver_api::key::DBDIR_KEY;
use pageserver_api::key::{Key, KEY_SIZE};
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::ImageCompressionAlgorithm;
use pageserver_api::shard::TenantShardId;
use pageserver_api::value::Value;
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
use std::collections::VecDeque;
@@ -516,8 +515,8 @@ impl DeltaLayerWriterInner {
) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
let temp_path = self.path.clone();
let result = self.finish0(key_end, ctx).await;
if let Err(ref e) = result {
tracing::info!(%temp_path, "cleaning up temporary file after error during writing: {e}");
if result.is_err() {
tracing::info!(%temp_path, "cleaning up temporary file after error during writing");
if let Err(e) = std::fs::remove_file(&temp_path) {
tracing::warn!(error=%e, %temp_path, "error cleaning up temporary layer file after error during writing");
}
@@ -530,7 +529,8 @@ impl DeltaLayerWriterInner {
key_end: Key,
ctx: &RequestContext,
) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
let index_start_blk = self.blob_writer.size().div_ceil(PAGE_SZ as u64) as u32;
let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
let mut file = self.blob_writer.into_inner(ctx).await?;
@@ -1003,7 +1003,7 @@ impl DeltaLayerInner {
.0
.into();
let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes);
let mut buf = Some(IoBufferMut::with_capacity(buf_size));
let mut buf = Some(BytesMut::with_capacity(buf_size));
// Note that reads are processed in reverse order (from highest key+lsn).
// This is the order that `ReconstructState` requires such that it can
@@ -1030,7 +1030,7 @@ impl DeltaLayerInner {
// We have "lost" the buffer since the lower level IO api
// doesn't return the buffer on error. Allocate a new one.
buf = Some(IoBufferMut::with_capacity(buf_size));
buf = Some(BytesMut::with_capacity(buf_size));
continue;
}
@@ -1204,7 +1204,7 @@ impl DeltaLayerInner {
.map(|x| x.0.get())
.unwrap_or(8192);
let mut buffer = Some(IoBufferMut::with_capacity(max_read_size));
let mut buffer = Some(BytesMut::with_capacity(max_read_size));
// FIXME: buffering of DeltaLayerWriter
let mut per_blob_copy = Vec::new();
@@ -1294,7 +1294,7 @@ impl DeltaLayerInner {
// is it an image or will_init walrecord?
// FIXME: this could be handled by threading the BlobRef to the
// VectoredReadBuilder
let will_init = pageserver_api::value::ValueBytes::will_init(&data)
let will_init = crate::repository::ValueBytes::will_init(&data)
.inspect_err(|_e| {
#[cfg(feature = "testing")]
tracing::error!(data=?utils::Hex(&data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value");
@@ -1357,7 +1357,7 @@ impl DeltaLayerInner {
format!(" img {} bytes", img.len())
}
Value::WalRecord(rec) => {
let wal_desc = pageserver_api::record::describe_wal_record(&rec)?;
let wal_desc = walrecord::describe_wal_record(&rec)?;
format!(
" rec {} bytes will_init: {} {}",
buf.len(),
@@ -1562,11 +1562,12 @@ impl<'a> DeltaLayerIterator<'a> {
let vectored_blob_reader = VectoredBlobReader::new(&self.delta_layer.file);
let mut next_batch = std::collections::VecDeque::new();
let buf_size = plan.size();
let buf = IoBufferMut::with_capacity(buf_size);
let buf = BytesMut::with_capacity(buf_size);
let blobs_buf = vectored_blob_reader
.read_blobs(&plan, buf, self.ctx)
.await?;
let view = BufView::new_slice(&blobs_buf.buf);
let frozen_buf = blobs_buf.buf.freeze();
let view = BufView::new_bytes(frozen_buf);
for meta in blobs_buf.blobs.iter() {
let blob_read = meta.read(&view).await?;
let value = Value::des(&blob_read)?;
@@ -1601,6 +1602,7 @@ pub(crate) mod test {
use rand::RngCore;
use super::*;
use crate::repository::Value;
use crate::tenant::harness::TIMELINE_ID;
use crate::tenant::storage_layer::{Layer, ResidentLayer};
use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
@@ -1612,7 +1614,6 @@ pub(crate) mod test {
DEFAULT_PG_VERSION,
};
use bytes::Bytes;
use pageserver_api::value::Value;
/// Construct an index for a fictional delta layer and and then
/// traverse in order to plan vectored reads for a query. Finally,
@@ -1941,7 +1942,7 @@ pub(crate) mod test {
&vectored_reads,
constants::MAX_VECTORED_READ_BYTES,
);
let mut buf = Some(IoBufferMut::with_capacity(buf_size));
let mut buf = Some(BytesMut::with_capacity(buf_size));
for read in vectored_reads {
let blobs_buf = vectored_blob_reader
@@ -1965,8 +1966,8 @@ pub(crate) mod test {
#[tokio::test]
async fn copy_delta_prefix_smoke() {
use crate::walrecord::NeonWalRecord;
use bytes::Bytes;
use pageserver_api::record::NeonWalRecord;
let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke")
.await

View File

@@ -7,7 +7,7 @@ use pageserver_api::{
};
use utils::lsn::Lsn;
use pageserver_api::value::Value;
use crate::repository::Value;
use super::merge_iterator::MergeIterator;
@@ -121,8 +121,8 @@ mod tests {
#[tokio::test]
async fn filter_keyspace_iterator() {
use crate::repository::Value;
use bytes::Bytes;
use pageserver_api::value::Value;
let harness = TenantHarness::create("filter_iterator_filter_keyspace_iterator")
.await

View File

@@ -28,6 +28,7 @@
use crate::config::PageServerConf;
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
use crate::page_cache::{self, FileId, PAGE_SZ};
use crate::repository::{Key, Value, KEY_SIZE};
use crate::tenant::blob_io::BlobWriter;
use crate::tenant::block_io::{BlockBuf, FileBlockReader};
use crate::tenant::disk_btree::{
@@ -40,20 +41,17 @@ use crate::tenant::vectored_blob_io::{
};
use crate::tenant::PageReconstructError;
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
use crate::virtual_file::IoBufferMut;
use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
use anyhow::{anyhow, bail, ensure, Context, Result};
use bytes::Bytes;
use bytes::{Bytes, BytesMut};
use camino::{Utf8Path, Utf8PathBuf};
use hex;
use itertools::Itertools;
use pageserver_api::config::MaxVectoredReadBytes;
use pageserver_api::key::DBDIR_KEY;
use pageserver_api::key::{Key, KEY_SIZE};
use pageserver_api::keyspace::KeySpace;
use pageserver_api::shard::{ShardIdentity, TenantShardId};
use pageserver_api::value::Value;
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
use std::collections::VecDeque;
@@ -549,10 +547,10 @@ impl ImageLayerInner {
for read in plan.into_iter() {
let buf_size = read.size();
let buf = IoBufferMut::with_capacity(buf_size);
let buf = BytesMut::with_capacity(buf_size);
let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?;
let view = BufView::new_slice(&blobs_buf.buf);
let frozen_buf = blobs_buf.buf.freeze();
let view = BufView::new_bytes(frozen_buf);
for meta in blobs_buf.blobs.iter() {
let img_buf = meta.read(&view).await?;
@@ -611,12 +609,13 @@ impl ImageLayerInner {
}
}
let buf = IoBufferMut::with_capacity(buf_size);
let buf = BytesMut::with_capacity(buf_size);
let res = vectored_blob_reader.read_blobs(&read, buf, ctx).await;
match res {
Ok(blobs_buf) => {
let view = BufView::new_slice(&blobs_buf.buf);
let frozen_buf = blobs_buf.buf.freeze();
let view = BufView::new_bytes(frozen_buf);
for meta in blobs_buf.blobs.iter() {
let img_buf = meta.read(&view).await;
@@ -829,26 +828,8 @@ impl ImageLayerWriterInner {
ctx: &RequestContext,
end_key: Option<Key>,
) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
let temp_path = self.path.clone();
let result = self.finish0(ctx, end_key).await;
if let Err(ref e) = result {
tracing::info!(%temp_path, "cleaning up temporary file after error during writing: {e}");
if let Err(e) = std::fs::remove_file(&temp_path) {
tracing::warn!(error=%e, %temp_path, "error cleaning up temporary layer file after error during writing");
}
}
result
}
///
/// Finish writing the image layer.
///
async fn finish0(
self,
ctx: &RequestContext,
end_key: Option<Key>,
) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
let index_start_blk = self.blob_writer.size().div_ceil(PAGE_SZ as u64) as u32;
let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
// Calculate compression ratio
let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header
@@ -1070,11 +1051,12 @@ impl<'a> ImageLayerIterator<'a> {
let vectored_blob_reader = VectoredBlobReader::new(&self.image_layer.file);
let mut next_batch = std::collections::VecDeque::new();
let buf_size = plan.size();
let buf = IoBufferMut::with_capacity(buf_size);
let buf = BytesMut::with_capacity(buf_size);
let blobs_buf = vectored_blob_reader
.read_blobs(&plan, buf, self.ctx)
.await?;
let view = BufView::new_slice(&blobs_buf.buf);
let frozen_buf = blobs_buf.buf.freeze();
let view = BufView::new_bytes(frozen_buf);
for meta in blobs_buf.blobs.iter() {
let img_buf = meta.read(&view).await?;
next_batch.push_back((
@@ -1110,7 +1092,6 @@ mod test {
use itertools::Itertools;
use pageserver_api::{
key::Key,
value::Value,
shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize},
};
use utils::{
@@ -1121,6 +1102,7 @@ mod test {
use crate::{
context::RequestContext,
repository::Value,
tenant::{
config::TenantConf,
harness::{TenantHarness, TIMELINE_ID},

View File

@@ -7,19 +7,19 @@
use crate::assert_u64_eq_usize::{u64_to_usize, U64IsUsize, UsizeIsU64};
use crate::config::PageServerConf;
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
use crate::repository::{Key, Value};
use crate::tenant::ephemeral_file::EphemeralFile;
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::PageReconstructError;
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
use crate::{l0_flush, page_cache};
use anyhow::{anyhow, Context, Result};
use bytes::Bytes;
use camino::Utf8PathBuf;
use pageserver_api::key::CompactKey;
use pageserver_api::key::Key;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::InMemoryLayerInfo;
use pageserver_api::shard::TenantShardId;
use pageserver_api::value::Value;
use std::collections::{BTreeMap, HashMap};
use std::sync::{Arc, OnceLock};
use std::time::Instant;
@@ -809,8 +809,9 @@ impl InMemoryLayer {
match l0_flush_global_state {
l0_flush::Inner::Direct { .. } => {
let file_contents = inner.file.load_to_io_buf(ctx).await?;
let file_contents = file_contents.freeze();
let file_contents: Vec<u8> = inner.file.load_to_vec(ctx).await?;
let file_contents = Bytes::from(file_contents);
for (key, vec_map) in inner.index.iter() {
// Write all page versions
@@ -824,7 +825,7 @@ impl InMemoryLayer {
len,
will_init,
} = entry;
let buf = file_contents.slice(pos as usize..(pos + len) as usize);
let buf = Bytes::slice(&file_contents, pos as usize..(pos + len) as usize);
let (_buf, res) = delta_layer_writer
.put_value_bytes(
Key::from_compact(*key),

View File

@@ -9,7 +9,6 @@ use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice};
use crate::{
assert_u64_eq_usize::{U64IsUsize, UsizeIsU64},
context::RequestContext,
virtual_file::{owned_buffers_io::io_buf_aligned::IoBufAlignedMut, IoBufferMut},
};
/// The file interface we require. At runtime, this is a [`crate::tenant::ephemeral_file::EphemeralFile`].
@@ -25,7 +24,7 @@ pub trait File: Send {
/// [`std::io::ErrorKind::UnexpectedEof`] error if the file is shorter than `start+dst.len()`.
///
/// No guarantees are made about the remaining bytes in `dst` in case of a short read.
async fn read_exact_at_eof_ok<'a, 'b, B: IoBufAlignedMut + Send>(
async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>(
&'b self,
start: u64,
dst: Slice<B>,
@@ -228,7 +227,7 @@ where
// Execute physical reads and fill the logical read buffers
// TODO: pipelined reads; prefetch;
let get_io_buffer = |nchunks| IoBufferMut::with_capacity(nchunks * DIO_CHUNK_SIZE);
let get_io_buffer = |nchunks| Vec::with_capacity(nchunks * DIO_CHUNK_SIZE);
for PhysicalRead {
start_chunk_no,
nchunks,
@@ -460,7 +459,7 @@ mod tests {
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
let file = InMemoryFile::new_random(10);
let test_read = |pos, len| {
let buf = IoBufferMut::with_capacity_zeroed(len);
let buf = vec![0; len];
let fut = file.read_exact_at_eof_ok(pos, buf.slice_full(), &ctx);
use futures::FutureExt;
let (slice, nread) = fut
@@ -471,9 +470,9 @@ mod tests {
buf.truncate(nread);
buf
};
assert_eq!(&test_read(0, 1), &file.content[0..1]);
assert_eq!(&test_read(1, 2), &file.content[1..3]);
assert_eq!(&test_read(9, 2), &file.content[9..]);
assert_eq!(test_read(0, 1), &file.content[0..1]);
assert_eq!(test_read(1, 2), &file.content[1..3]);
assert_eq!(test_read(9, 2), &file.content[9..]);
assert!(test_read(10, 2).is_empty());
assert!(test_read(11, 2).is_empty());
}
@@ -610,7 +609,7 @@ mod tests {
}
impl<'x> File for RecorderFile<'x> {
async fn read_exact_at_eof_ok<'a, 'b, B: IoBufAlignedMut + Send>(
async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>(
&'b self,
start: u64,
dst: Slice<B>,
@@ -783,7 +782,7 @@ mod tests {
2048, 1024 => Err("foo".to_owned()),
};
let buf = IoBufferMut::with_capacity(512);
let buf = Vec::with_capacity(512);
let (buf, nread) = mock_file
.read_exact_at_eof_ok(0, buf.slice_full(), &ctx)
.await
@@ -791,7 +790,7 @@ mod tests {
assert_eq!(nread, 512);
assert_eq!(&buf.into_inner()[..nread], &[0; 512]);
let buf = IoBufferMut::with_capacity(512);
let buf = Vec::with_capacity(512);
let (buf, nread) = mock_file
.read_exact_at_eof_ok(512, buf.slice_full(), &ctx)
.await
@@ -799,7 +798,7 @@ mod tests {
assert_eq!(nread, 512);
assert_eq!(&buf.into_inner()[..nread], &[1; 512]);
let buf = IoBufferMut::with_capacity(512);
let buf = Vec::with_capacity(512);
let (buf, nread) = mock_file
.read_exact_at_eof_ok(1024, buf.slice_full(), &ctx)
.await
@@ -807,7 +806,7 @@ mod tests {
assert_eq!(nread, 10);
assert_eq!(&buf.into_inner()[..nread], &[2; 10]);
let buf = IoBufferMut::with_capacity(1024);
let buf = Vec::with_capacity(1024);
let err = mock_file
.read_exact_at_eof_ok(2048, buf.slice_full(), &ctx)
.await

View File

@@ -341,10 +341,6 @@ impl Layer {
Ok(())
}
pub(crate) async fn needs_download(&self) -> Result<Option<NeedsDownload>, std::io::Error> {
self.0.needs_download().await
}
/// Assuming the layer is already downloaded, returns a guard which will prohibit eviction
/// while the guard exists.
///
@@ -978,7 +974,7 @@ impl LayerInner {
let timeline = self
.timeline
.upgrade()
.ok_or(DownloadError::TimelineShutdown)?;
.ok_or_else(|| DownloadError::TimelineShutdown)?;
// count cancellations, which currently remain largely unexpected
let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());

View File

@@ -760,8 +760,8 @@ async fn evict_and_wait_does_not_wait_for_download() {
/// Also checks that the same does not happen on a non-evicted layer (regression test).
#[tokio::test(start_paused = true)]
async fn eviction_cancellation_on_drop() {
use crate::repository::Value;
use bytes::Bytes;
use pageserver_api::value::Value;
// this is the runtime on which Layer spawns the blocking tasks on
let handle = tokio::runtime::Handle::current();
@@ -782,7 +782,7 @@ async fn eviction_cancellation_on_drop() {
let mut writer = timeline.writer().await;
writer
.put(
pageserver_api::key::Key::from_i128(5),
crate::repository::Key::from_i128(5),
Lsn(0x20),
&Value::Image(Bytes::from_static(b"this does not matter either")),
&ctx,

View File

@@ -3,7 +3,7 @@ use pageserver_api::shard::TenantShardId;
use std::ops::Range;
use utils::{id::TimelineId, lsn::Lsn};
use pageserver_api::key::Key;
use crate::repository::Key;
use super::{DeltaLayerName, ImageLayerName, LayerName};

Some files were not shown because too many files have changed in this diff Show More