mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-25 17:10:38 +00:00
Compare commits
61 Commits
hackathon/
...
hackaneon/
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6037b8e6c1 | ||
|
|
c108872a36 | ||
|
|
588e6dd1ef | ||
|
|
2fafe47e09 | ||
|
|
e6e72e8b68 | ||
|
|
2a0695e01a | ||
|
|
af6c168265 | ||
|
|
1b3209497f | ||
|
|
9f97523d0d | ||
|
|
33196c90bc | ||
|
|
c72c5df922 | ||
|
|
4c7599e8df | ||
|
|
4b2b0a24da | ||
|
|
ef5a95010b | ||
|
|
c736f9d6ef | ||
|
|
adc798b59e | ||
|
|
f0668a7a4d | ||
|
|
6d73642a93 | ||
|
|
9012a18fa1 | ||
|
|
a6a7550bb4 | ||
|
|
10556f25df | ||
|
|
f54cf567ff | ||
|
|
4303914681 | ||
|
|
539225d792 | ||
|
|
c118736c9c | ||
|
|
3f85246a42 | ||
|
|
709a6ad29b | ||
|
|
3640824553 | ||
|
|
fea1f34f6a | ||
|
|
5d40d1ccdd | ||
|
|
b2cb10590e | ||
|
|
2923fd2a5b | ||
|
|
2a5336b9ab | ||
|
|
6f20726610 | ||
|
|
29f741e1e9 | ||
|
|
2b37a40079 | ||
|
|
af2b65a2fb | ||
|
|
5d194c7824 | ||
|
|
ac2702afd3 | ||
|
|
88fd46d795 | ||
|
|
2d6763882e | ||
|
|
c0c23cde72 | ||
|
|
942bc9544b | ||
|
|
02b7cdb305 | ||
|
|
7d7d1f354b | ||
|
|
16c200d6d9 | ||
|
|
3dbd34aa78 | ||
|
|
fa3fc73c1b | ||
|
|
ac5815b594 | ||
|
|
30583cb626 | ||
|
|
c1a51416db | ||
|
|
8eab7009c1 | ||
|
|
11cf16e3f3 | ||
|
|
af6f63617e | ||
|
|
e287f36a05 | ||
|
|
cbcd4058ed | ||
|
|
e86fef05dd | ||
|
|
a1323231bc | ||
|
|
06e840b884 | ||
|
|
cf11c8ab6a | ||
|
|
04f99a87bf |
7
.github/actionlint.yml
vendored
7
.github/actionlint.yml
vendored
@@ -7,6 +7,13 @@ self-hosted-runner:
|
||||
- small-arm64
|
||||
- us-east-2
|
||||
config-variables:
|
||||
- AZURE_DEV_CLIENT_ID
|
||||
- AZURE_DEV_REGISTRY_NAME
|
||||
- AZURE_DEV_SUBSCRIPTION_ID
|
||||
- AZURE_PROD_CLIENT_ID
|
||||
- AZURE_PROD_REGISTRY_NAME
|
||||
- AZURE_PROD_SUBSCRIPTION_ID
|
||||
- AZURE_TENANT_ID
|
||||
- BENCHMARK_PROJECT_ID_PUB
|
||||
- BENCHMARK_PROJECT_ID_SUB
|
||||
- REMOTE_STORAGE_AZURE_CONTAINER
|
||||
|
||||
56
.github/workflows/_push-to-acr.yml
vendored
Normal file
56
.github/workflows/_push-to-acr.yml
vendored
Normal file
@@ -0,0 +1,56 @@
|
||||
name: Push images to ACR
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
client_id:
|
||||
description: Client ID of Azure managed identity or Entra app
|
||||
required: true
|
||||
type: string
|
||||
image_tag:
|
||||
description: Tag for the container image
|
||||
required: true
|
||||
type: string
|
||||
images:
|
||||
description: Images to push
|
||||
required: true
|
||||
type: string
|
||||
registry_name:
|
||||
description: Name of the container registry
|
||||
required: true
|
||||
type: string
|
||||
subscription_id:
|
||||
description: Azure subscription ID
|
||||
required: true
|
||||
type: string
|
||||
tenant_id:
|
||||
description: Azure tenant ID
|
||||
required: true
|
||||
type: string
|
||||
|
||||
jobs:
|
||||
push-to-acr:
|
||||
runs-on: ubuntu-22.04
|
||||
permissions:
|
||||
contents: read # This is required for actions/checkout
|
||||
id-token: write # This is required for Azure Login to work.
|
||||
|
||||
steps:
|
||||
- name: Azure login
|
||||
uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1
|
||||
with:
|
||||
client-id: ${{ inputs.client_id }}
|
||||
subscription-id: ${{ inputs.subscription_id }}
|
||||
tenant-id: ${{ inputs.tenant_id }}
|
||||
|
||||
- name: Login to ACR
|
||||
run: |
|
||||
az acr login --name=${{ inputs.registry_name }}
|
||||
|
||||
- name: Copy docker images to ACR ${{ inputs.registry_name }}
|
||||
run: |
|
||||
images='${{ inputs.images }}'
|
||||
for image in ${images}; do
|
||||
docker buildx imagetools create \
|
||||
-t ${{ inputs.registry_name }}.azurecr.io/neondatabase/${image}:${{ inputs.image_tag }} \
|
||||
neondatabase/${image}:${{ inputs.image_tag }}
|
||||
done
|
||||
53
.github/workflows/build_and_test.yml
vendored
53
.github/workflows/build_and_test.yml
vendored
@@ -794,9 +794,6 @@ jobs:
|
||||
docker compose -f ./docker-compose/docker-compose.yml down
|
||||
|
||||
promote-images:
|
||||
permissions:
|
||||
contents: read # This is required for actions/checkout
|
||||
id-token: write # This is required for Azure Login to work.
|
||||
needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
@@ -823,28 +820,6 @@ jobs:
|
||||
neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
|
||||
done
|
||||
|
||||
- name: Azure login
|
||||
if: github.ref_name == 'main'
|
||||
uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1
|
||||
with:
|
||||
client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
|
||||
tenant-id: ${{ secrets.AZURE_TENANT_ID }}
|
||||
subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
|
||||
|
||||
- name: Login to ACR
|
||||
if: github.ref_name == 'main'
|
||||
run: |
|
||||
az acr login --name=neoneastus2
|
||||
|
||||
- name: Copy docker images to ACR-dev
|
||||
if: github.ref_name == 'main'
|
||||
run: |
|
||||
for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do
|
||||
docker buildx imagetools create \
|
||||
-t neoneastus2.azurecr.io/neondatabase/${image}:${{ needs.tag.outputs.build-tag }} \
|
||||
neondatabase/${image}:${{ needs.tag.outputs.build-tag }}
|
||||
done
|
||||
|
||||
- name: Add latest tag to images
|
||||
if: github.ref_name == 'main'
|
||||
run: |
|
||||
@@ -882,6 +857,30 @@ jobs:
|
||||
369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }}
|
||||
done
|
||||
|
||||
push-to-acr-dev:
|
||||
if: github.ref_name == 'main'
|
||||
needs: [ tag, promote-images ]
|
||||
uses: ./.github/workflows/_push-to-acr.yml
|
||||
with:
|
||||
client_id: ${{ vars.AZURE_DEV_CLIENT_ID }}
|
||||
image_tag: ${{ needs.tag.outputs.build-tag }}
|
||||
images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16
|
||||
registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
|
||||
subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
|
||||
tenant_id: ${{ vars.AZURE_TENANT_ID }}
|
||||
|
||||
push-to-acr-prod:
|
||||
if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
|
||||
needs: [ tag, promote-images ]
|
||||
uses: ./.github/workflows/_push-to-acr.yml
|
||||
with:
|
||||
client_id: ${{ vars.AZURE_PROD_CLIENT_ID }}
|
||||
image_tag: ${{ needs.tag.outputs.build-tag }}
|
||||
images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16
|
||||
registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
|
||||
subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
|
||||
tenant_id: ${{ vars.AZURE_TENANT_ID }}
|
||||
|
||||
trigger-custom-extensions-build-and-wait:
|
||||
needs: [ check-permissions, tag ]
|
||||
runs-on: ubuntu-22.04
|
||||
@@ -957,8 +956,8 @@ jobs:
|
||||
exit 1
|
||||
|
||||
deploy:
|
||||
needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
|
||||
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
|
||||
needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ]
|
||||
if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy') && !failure() && !cancelled()
|
||||
|
||||
runs-on: [ self-hosted, small ]
|
||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
|
||||
|
||||
34
.github/workflows/label-for-external-users.yml
vendored
34
.github/workflows/label-for-external-users.yml
vendored
@@ -7,6 +7,11 @@ on:
|
||||
pull_request_target:
|
||||
types:
|
||||
- opened
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
github-actor:
|
||||
description: 'GitHub username. If empty, the username of the current user will be used'
|
||||
required: false
|
||||
|
||||
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
|
||||
permissions: {}
|
||||
@@ -26,12 +31,31 @@ jobs:
|
||||
id: check-user
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
ACTOR: ${{ inputs.github-actor || github.actor }}
|
||||
run: |
|
||||
if gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${GITHUB_ACTOR}"; then
|
||||
is_member=true
|
||||
else
|
||||
is_member=false
|
||||
fi
|
||||
expected_error="User does not exist or is not a member of the organization"
|
||||
output_file=output.txt
|
||||
|
||||
for i in $(seq 1 10); do
|
||||
if gh api "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${ACTOR}" \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" > ${output_file}; then
|
||||
|
||||
is_member=true
|
||||
break
|
||||
elif grep -q "${expected_error}" ${output_file}; then
|
||||
is_member=false
|
||||
break
|
||||
elif [ $i -eq 10 ]; then
|
||||
title="Failed to get memmbership status for ${ACTOR}"
|
||||
message="The latest GitHub API error message: '$(cat ${output_file})'"
|
||||
echo "::error file=.github/workflows/label-for-external-users.yml,title=${title}::${message}"
|
||||
|
||||
exit 1
|
||||
fi
|
||||
|
||||
sleep 1
|
||||
done
|
||||
|
||||
echo "is-member=${is_member}" | tee -a ${GITHUB_OUTPUT}
|
||||
|
||||
|
||||
35
Cargo.lock
generated
35
Cargo.lock
generated
@@ -915,25 +915,22 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "bindgen"
|
||||
version = "0.65.1"
|
||||
version = "0.70.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cfdf7b466f9a4903edc73f95d6d2bcd5baf8ae620638762244d3f60143643cc5"
|
||||
checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f"
|
||||
dependencies = [
|
||||
"bitflags 1.3.2",
|
||||
"bitflags 2.4.1",
|
||||
"cexpr",
|
||||
"clang-sys",
|
||||
"lazy_static",
|
||||
"lazycell",
|
||||
"itertools 0.12.1",
|
||||
"log",
|
||||
"peeking_take_while",
|
||||
"prettyplease 0.2.6",
|
||||
"prettyplease 0.2.17",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"regex",
|
||||
"rustc-hash",
|
||||
"shlex",
|
||||
"syn 2.0.52",
|
||||
"which",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2949,12 +2946,6 @@ dependencies = [
|
||||
"spin 0.5.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lazycell"
|
||||
version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.150"
|
||||
@@ -3977,12 +3968,6 @@ dependencies = [
|
||||
"sha2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "peeking_take_while"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
|
||||
|
||||
[[package]]
|
||||
name = "pem"
|
||||
version = "3.0.3"
|
||||
@@ -4280,9 +4265,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "prettyplease"
|
||||
version = "0.2.6"
|
||||
version = "0.2.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1"
|
||||
checksum = "8d3928fb5db768cb86f891ff014f0144589297e3c6a1aba6ed7cecfdace270c7"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"syn 2.0.52",
|
||||
@@ -6094,8 +6079,9 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"
|
||||
|
||||
[[package]]
|
||||
name = "svg_fmt"
|
||||
version = "0.4.2"
|
||||
source = "git+https://github.com/nical/rust_debug?rev=28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4#28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4"
|
||||
version = "0.4.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "20e16a0f46cf5fd675563ef54f26e83e20f2366bcf027bcb3cc3ed2b98aaf2ca"
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
@@ -7627,6 +7613,7 @@ dependencies = [
|
||||
"hyper 0.14.26",
|
||||
"indexmap 1.9.3",
|
||||
"itertools 0.10.5",
|
||||
"itertools 0.12.1",
|
||||
"lazy_static",
|
||||
"libc",
|
||||
"log",
|
||||
|
||||
@@ -64,7 +64,7 @@ aws-types = "1.2.0"
|
||||
axum = { version = "0.6.20", features = ["ws"] }
|
||||
base64 = "0.13.0"
|
||||
bincode = "1.3"
|
||||
bindgen = "0.65"
|
||||
bindgen = "0.70"
|
||||
bit_field = "0.10.2"
|
||||
bstr = "1.0"
|
||||
byteorder = "1.4"
|
||||
@@ -161,8 +161,7 @@ socket2 = "0.5"
|
||||
strum = "0.24"
|
||||
strum_macros = "0.24"
|
||||
"subtle" = "2.5.0"
|
||||
# Our PR https://github.com/nical/rust_debug/pull/4 has been merged but no new version released yet
|
||||
svg_fmt = { git = "https://github.com/nical/rust_debug", rev = "28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4" }
|
||||
svg_fmt = "0.4.3"
|
||||
sync_wrapper = "0.1.2"
|
||||
tar = "0.4"
|
||||
task-local-extensions = "0.1.4"
|
||||
|
||||
@@ -87,6 +87,7 @@ RUN mkdir -p /data/.neon/ && \
|
||||
"pg_distrib_dir='/usr/local/'\n" \
|
||||
"listen_pg_addr='0.0.0.0:6400'\n" \
|
||||
"listen_http_addr='0.0.0.0:9898'\n" \
|
||||
"availability_zone='local'\n" \
|
||||
> /data/.neon/pageserver.toml && \
|
||||
chown -R neon:neon /data/.neon
|
||||
|
||||
|
||||
@@ -192,7 +192,7 @@ WORKDIR /home/nonroot
|
||||
|
||||
# Rust
|
||||
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
|
||||
ENV RUSTC_VERSION=1.80.1
|
||||
ENV RUSTC_VERSION=1.81.0
|
||||
ENV RUSTUP_HOME="/home/nonroot/.rustup"
|
||||
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
|
||||
ARG RUSTFILT_VERSION=0.2.1
|
||||
@@ -207,7 +207,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
|
||||
export PATH="$HOME/.cargo/bin:$PATH" && \
|
||||
. "$HOME/.cargo/env" && \
|
||||
cargo --version && rustup --version && \
|
||||
rustup component add llvm-tools-preview rustfmt clippy && \
|
||||
rustup component add llvm-tools rustfmt clippy && \
|
||||
cargo install rustfilt --version ${RUSTFILT_VERSION} && \
|
||||
cargo install cargo-hakari --version ${CARGO_HAKARI_VERSION} && \
|
||||
cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \
|
||||
|
||||
@@ -22,9 +22,10 @@ use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
|
||||
|
||||
const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds
|
||||
|
||||
/// Escape a string for including it in a SQL literal. Wrapping the result
|
||||
/// with `E'{}'` or `'{}'` is not required, as it returns a ready-to-use
|
||||
/// SQL string literal, e.g. `'db'''` or `E'db\\'`.
|
||||
/// Escape a string for including it in a SQL literal.
|
||||
///
|
||||
/// Wrapping the result with `E'{}'` or `'{}'` is not required,
|
||||
/// as it returns a ready-to-use SQL string literal, e.g. `'db'''` or `E'db\\'`.
|
||||
/// See <https://github.com/postgres/postgres/blob/da98d005cdbcd45af563d0c4ac86d0e9772cd15f/src/backend/utils/adt/quote.c#L47>
|
||||
/// for the original implementation.
|
||||
pub fn escape_literal(s: &str) -> String {
|
||||
|
||||
@@ -4,8 +4,8 @@ use std::{str::FromStr, time::Duration};
|
||||
use clap::{Parser, Subcommand};
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest,
|
||||
TenantDescribeResponse, TenantPolicyRequest,
|
||||
NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, ShardSchedulingPolicy,
|
||||
TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
|
||||
},
|
||||
models::{
|
||||
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
|
||||
@@ -80,7 +80,10 @@ enum Command {
|
||||
/// List nodes known to the storage controller
|
||||
Nodes {},
|
||||
/// List tenants known to the storage controller
|
||||
Tenants {},
|
||||
Tenants {
|
||||
/// If this field is set, it will list the tenants on a specific node
|
||||
node_id: Option<NodeId>,
|
||||
},
|
||||
/// Create a new tenant in the storage controller, and by extension on pageservers.
|
||||
TenantCreate {
|
||||
#[arg(long)]
|
||||
@@ -336,7 +339,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
listen_pg_port,
|
||||
listen_http_addr,
|
||||
listen_http_port,
|
||||
availability_zone_id: Some(availability_zone_id),
|
||||
availability_zone_id,
|
||||
}),
|
||||
)
|
||||
.await?;
|
||||
@@ -403,7 +406,41 @@ async fn main() -> anyhow::Result<()> {
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
Command::Tenants {} => {
|
||||
Command::Tenants {
|
||||
node_id: Some(node_id),
|
||||
} => {
|
||||
let describe_response = storcon_client
|
||||
.dispatch::<(), NodeShardResponse>(
|
||||
Method::GET,
|
||||
format!("control/v1/node/{node_id}/shards"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
let shards = describe_response.shards;
|
||||
let mut table = comfy_table::Table::new();
|
||||
table.set_header([
|
||||
"Shard",
|
||||
"Intended Primary/Secondary",
|
||||
"Observed Primary/Secondary",
|
||||
]);
|
||||
for shard in shards {
|
||||
table.add_row([
|
||||
format!("{}", shard.tenant_shard_id),
|
||||
match shard.is_intended_secondary {
|
||||
None => "".to_string(),
|
||||
Some(true) => "Secondary".to_string(),
|
||||
Some(false) => "Primary".to_string(),
|
||||
},
|
||||
match shard.is_observed_secondary {
|
||||
None => "".to_string(),
|
||||
Some(true) => "Secondary".to_string(),
|
||||
Some(false) => "Primary".to_string(),
|
||||
},
|
||||
]);
|
||||
}
|
||||
println!("{table}");
|
||||
}
|
||||
Command::Tenants { node_id: None } => {
|
||||
let mut resp = storcon_client
|
||||
.dispatch::<(), Vec<TenantDescribeResponse>>(
|
||||
Method::GET,
|
||||
|
||||
@@ -68,6 +68,7 @@ macro_rules! register_uint_gauge {
|
||||
static INTERNAL_REGISTRY: Lazy<Registry> = Lazy::new(Registry::new);
|
||||
|
||||
/// Register a collector in the internal registry. MUST be called before the first call to `gather()`.
|
||||
///
|
||||
/// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector
|
||||
/// while holding the lock.
|
||||
pub fn register_internal(c: Box<dyn Collector>) -> prometheus::Result<()> {
|
||||
|
||||
@@ -104,7 +104,9 @@ pub struct ConfigToml {
|
||||
pub image_compression: ImageCompressionAlgorithm,
|
||||
pub ephemeral_bytes_per_memory_kb: usize,
|
||||
pub l0_flush: Option<crate::models::L0FlushConfig>,
|
||||
pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess,
|
||||
#[serde(skip_serializing)]
|
||||
// TODO(https://github.com/neondatabase/neon/issues/8184): remove after this field is removed from all pageserver.toml's
|
||||
pub compact_level0_phase1_value_access: serde::de::IgnoredAny,
|
||||
pub virtual_file_direct_io: crate::models::virtual_file::DirectIoMode,
|
||||
pub io_buffer_alignment: usize,
|
||||
}
|
||||
@@ -209,43 +211,6 @@ pub enum GetImpl {
|
||||
#[serde(transparent)]
|
||||
pub struct MaxVectoredReadBytes(pub NonZeroUsize);
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
|
||||
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
|
||||
pub enum CompactL0Phase1ValueAccess {
|
||||
/// The old way.
|
||||
PageCachedBlobIo,
|
||||
/// The new way.
|
||||
StreamingKmerge {
|
||||
/// If set, we run both the old way and the new way, validate that
|
||||
/// they are identical (=> [`CompactL0BypassPageCacheValidation`]),
|
||||
/// and if the validation fails,
|
||||
/// - in tests: fail them with a panic or
|
||||
/// - in prod, log a rate-limited warning and use the old way's results.
|
||||
///
|
||||
/// If not set, we only run the new way and trust its results.
|
||||
validate: Option<CompactL0BypassPageCacheValidation>,
|
||||
},
|
||||
}
|
||||
|
||||
/// See [`CompactL0Phase1ValueAccess::StreamingKmerge`].
|
||||
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
|
||||
#[serde(rename_all = "kebab-case")]
|
||||
pub enum CompactL0BypassPageCacheValidation {
|
||||
/// Validate that the series of (key, lsn) pairs are the same.
|
||||
KeyLsn,
|
||||
/// Validate that the entire output of old and new way is identical.
|
||||
KeyLsnValue,
|
||||
}
|
||||
|
||||
impl Default for CompactL0Phase1ValueAccess {
|
||||
fn default() -> Self {
|
||||
CompactL0Phase1ValueAccess::StreamingKmerge {
|
||||
// TODO(https://github.com/neondatabase/neon/issues/8184): change to None once confident
|
||||
validate: Some(CompactL0BypassPageCacheValidation::KeyLsnValue),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A tenant's calculated configuration, which is the result of merging a
|
||||
/// tenant's TenantConfOpt with the global TenantConf from PageServerConf.
|
||||
///
|
||||
@@ -452,7 +417,7 @@ impl Default for ConfigToml {
|
||||
image_compression: (DEFAULT_IMAGE_COMPRESSION),
|
||||
ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
|
||||
l0_flush: None,
|
||||
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
||||
compact_level0_phase1_value_access: Default::default(),
|
||||
virtual_file_direct_io: crate::models::virtual_file::DirectIoMode::default(),
|
||||
|
||||
io_buffer_alignment: DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::collections::HashSet;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::str::FromStr;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
@@ -57,7 +57,7 @@ pub struct NodeRegisterRequest {
|
||||
pub listen_http_addr: String,
|
||||
pub listen_http_port: u16,
|
||||
|
||||
pub availability_zone_id: Option<String>,
|
||||
pub availability_zone_id: String,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
@@ -74,6 +74,17 @@ pub struct TenantPolicyRequest {
|
||||
pub scheduling: Option<ShardSchedulingPolicy>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ShardsPreferredAzsRequest {
|
||||
#[serde(flatten)]
|
||||
pub preferred_az_ids: HashMap<TenantShardId, String>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ShardsPreferredAzsResponse {
|
||||
pub updated: Vec<TenantShardId>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantLocateResponseShard {
|
||||
pub shard_id: TenantShardId,
|
||||
@@ -101,6 +112,21 @@ pub struct TenantDescribeResponse {
|
||||
pub config: TenantConfig,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct NodeShardResponse {
|
||||
pub node_id: NodeId,
|
||||
pub shards: Vec<NodeShard>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct NodeShard {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
/// Whether the shard is observed secondary on a specific node. True = yes, False = no, None = not on this node.
|
||||
pub is_observed_secondary: Option<bool>,
|
||||
/// Whether the shard is intended to be a secondary on a specific node. True = yes, False = no, None = not on this node.
|
||||
pub is_intended_secondary: Option<bool>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct NodeDescribeResponse {
|
||||
pub id: NodeId,
|
||||
@@ -132,8 +158,12 @@ pub struct TenantDescribeResponseShard {
|
||||
pub is_splitting: bool,
|
||||
|
||||
pub scheduling_policy: ShardSchedulingPolicy,
|
||||
|
||||
pub preferred_az_id: Option<String>,
|
||||
}
|
||||
|
||||
/// Migration request for a given tenant shard to a given node.
|
||||
///
|
||||
/// Explicitly migrating a particular shard is a low level operation
|
||||
/// TODO: higher level "Reschedule tenant" operation where the request
|
||||
/// specifies some constraints, e.g. asking it to get off particular node(s)
|
||||
|
||||
@@ -305,8 +305,10 @@ pub struct TenantConfig {
|
||||
pub lsn_lease_length_for_ts: Option<String>,
|
||||
}
|
||||
|
||||
/// The policy for the aux file storage. It can be switched through `switch_aux_file_policy`
|
||||
/// tenant config. When the first aux file is written, the policy will be persisted in the
|
||||
/// The policy for the aux file storage.
|
||||
///
|
||||
/// It can be switched through `switch_aux_file_policy` tenant config.
|
||||
/// When the first aux file is written, the policy will be persisted in the
|
||||
/// `index_part.json` file and has a limited migration path.
|
||||
///
|
||||
/// Currently, we only allow the following migration path:
|
||||
@@ -896,7 +898,9 @@ pub struct WalRedoManagerStatus {
|
||||
pub process: Option<WalRedoManagerProcessStatus>,
|
||||
}
|
||||
|
||||
/// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
|
||||
/// The progress of a secondary tenant.
|
||||
///
|
||||
/// It is mostly useful when doing a long running download: e.g. initiating
|
||||
/// a download job, timing out while waiting for it to run, and then inspecting this status to understand
|
||||
/// what's happening.
|
||||
#[derive(Default, Debug, Serialize, Deserialize, Clone)]
|
||||
|
||||
@@ -69,8 +69,10 @@ impl QueryError {
|
||||
}
|
||||
|
||||
/// Returns true if the given error is a normal consequence of a network issue,
|
||||
/// or the client closing the connection. These errors can happen during normal
|
||||
/// operations, and don't indicate a bug in our code.
|
||||
/// or the client closing the connection.
|
||||
///
|
||||
/// These errors can happen during normal operations,
|
||||
/// and don't indicate a bug in our code.
|
||||
pub fn is_expected_io_error(e: &io::Error) -> bool {
|
||||
use io::ErrorKind::*;
|
||||
matches!(
|
||||
|
||||
@@ -7,6 +7,7 @@ use std::fmt;
|
||||
use url::Host;
|
||||
|
||||
/// Parses a string of format either `host:port` or `host` into a corresponding pair.
|
||||
///
|
||||
/// The `host` part should be a correct `url::Host`, while `port` (if present) should be
|
||||
/// a valid decimal u16 of digits only.
|
||||
pub fn parse_host_port<S: AsRef<str>>(host_port: S) -> Result<(Host, Option<u16>), anyhow::Error> {
|
||||
|
||||
@@ -14,7 +14,7 @@ impl ParseCallbacks for PostgresFfiCallbacks {
|
||||
fn include_file(&self, filename: &str) {
|
||||
// This does the equivalent of passing bindgen::CargoCallbacks
|
||||
// to the builder .parse_callbacks() method.
|
||||
let cargo_callbacks = bindgen::CargoCallbacks;
|
||||
let cargo_callbacks = bindgen::CargoCallbacks::new();
|
||||
cargo_callbacks.include_file(filename)
|
||||
}
|
||||
|
||||
|
||||
@@ -45,6 +45,8 @@ pub use azure_core::Etag;
|
||||
|
||||
pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};
|
||||
|
||||
/// Default concurrency limit for S3 operations
|
||||
///
|
||||
/// Currently, sync happens with AWS S3, that has two limits on requests per second:
|
||||
/// ~200 RPS for IAM services
|
||||
/// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
|
||||
@@ -300,7 +302,9 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
) -> Result<(), TimeTravelError>;
|
||||
}
|
||||
|
||||
/// DownloadStream is sensitive to the timeout and cancellation used with the original
|
||||
/// Data part of an ongoing [`Download`].
|
||||
///
|
||||
/// `DownloadStream` is sensitive to the timeout and cancellation used with the original
|
||||
/// [`RemoteStorage::download`] request. The type yields `std::io::Result<Bytes>` to be compatible
|
||||
/// with `tokio::io::copy_buf`.
|
||||
// This has 'static because safekeepers do not use cancellation tokens (yet)
|
||||
|
||||
@@ -60,3 +60,16 @@ pub struct TimelineCopyRequest {
|
||||
pub target_timeline_id: TimelineId,
|
||||
pub until_lsn: Lsn,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct TimelineTermBumpRequest {
|
||||
/// bump to
|
||||
pub term: Option<u64>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct TimelineTermBumpResponse {
|
||||
// before the request
|
||||
pub previous_term: u64,
|
||||
pub current_term: u64,
|
||||
}
|
||||
|
||||
@@ -5,9 +5,10 @@
|
||||
mod calculation;
|
||||
pub mod svg;
|
||||
|
||||
/// StorageModel is the input to the synthetic size calculation. It represents
|
||||
/// a tree of timelines, with just the information that's needed for the
|
||||
/// calculation. This doesn't track timeline names or where each timeline
|
||||
/// StorageModel is the input to the synthetic size calculation.
|
||||
///
|
||||
/// It represents a tree of timelines, with just the information that's needed
|
||||
/// for the calculation. This doesn't track timeline names or where each timeline
|
||||
/// begins and ends, for example. Instead, it consists of "points of interest"
|
||||
/// on the timelines. A point of interest could be the timeline start or end point,
|
||||
/// the oldest point on a timeline that needs to be retained because of PITR
|
||||
|
||||
@@ -5,8 +5,10 @@ use std::{
|
||||
|
||||
use metrics::IntCounter;
|
||||
|
||||
/// Circuit breakers are for operations that are expensive and fallible: if they fail repeatedly,
|
||||
/// we will stop attempting them for some period of time, to avoid denial-of-service from retries, and
|
||||
/// Circuit breakers are for operations that are expensive and fallible.
|
||||
///
|
||||
/// If a circuit breaker fails repeatedly, we will stop attempting it for some
|
||||
/// period of time, to avoid denial-of-service from retries, and
|
||||
/// to mitigate the log spam from repeated failures.
|
||||
pub struct CircuitBreaker {
|
||||
/// An identifier that enables us to log useful errors when a circuit is broken
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use std::os::fd::AsRawFd;
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
fs::{self, File},
|
||||
@@ -203,6 +204,27 @@ pub fn overwrite(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Syncs the filesystem for the given file descriptor.
|
||||
#[cfg_attr(target_os = "macos", allow(unused_variables))]
|
||||
pub fn syncfs(fd: impl AsRawFd) -> anyhow::Result<()> {
|
||||
// Linux guarantees durability for syncfs.
|
||||
// POSIX doesn't have syncfs, and further does not actually guarantee durability of sync().
|
||||
#[cfg(target_os = "linux")]
|
||||
{
|
||||
use anyhow::Context;
|
||||
nix::unistd::syncfs(fd.as_raw_fd()).context("syncfs")?;
|
||||
}
|
||||
#[cfg(target_os = "macos")]
|
||||
{
|
||||
// macOS is not a production platform for Neon, don't even bother.
|
||||
}
|
||||
#[cfg(not(any(target_os = "linux", target_os = "macos")))]
|
||||
{
|
||||
compile_error!("Unsupported OS");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
|
||||
@@ -249,8 +249,10 @@ macro_rules! id_newtype {
|
||||
};
|
||||
}
|
||||
|
||||
/// Neon timeline IDs are different from PostgreSQL timeline
|
||||
/// IDs. They serve a similar purpose though: they differentiate
|
||||
/// Neon timeline ID.
|
||||
///
|
||||
/// They are different from PostgreSQL timeline
|
||||
/// IDs, but serve a similar purpose: they differentiate
|
||||
/// between different "histories" of the same cluster. However,
|
||||
/// PostgreSQL timeline IDs are a bit cumbersome, because they are only
|
||||
/// 32-bits wide, and they must be in ascending order in any given
|
||||
|
||||
@@ -100,7 +100,9 @@ pub enum LockFileRead {
|
||||
}
|
||||
|
||||
/// Open & try to lock the lock file at the given `path`, returning a [handle][`LockFileRead`] to
|
||||
/// inspect its content. It is not an `Err(...)` if the file does not exist or is already locked.
|
||||
/// inspect its content.
|
||||
///
|
||||
/// It is not an `Err(...)` if the file does not exist or is already locked.
|
||||
/// Check the [`LockFileRead`] variants for details.
|
||||
pub fn read_and_hold_lock_file(path: &Utf8Path) -> anyhow::Result<LockFileRead> {
|
||||
let res = fs::OpenOptions::new().read(true).open(path);
|
||||
|
||||
@@ -190,7 +190,7 @@ impl Drop for TracingPanicHookGuard {
|
||||
}
|
||||
|
||||
/// Named symbol for our panic hook, which logs the panic.
|
||||
fn tracing_panic_hook(info: &std::panic::PanicInfo) {
|
||||
fn tracing_panic_hook(info: &std::panic::PanicHookInfo) {
|
||||
// following rust 1.66.1 std implementation:
|
||||
// https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288
|
||||
let location = info.location();
|
||||
|
||||
@@ -8,6 +8,7 @@ use tracing::{trace, warn};
|
||||
use crate::lsn::Lsn;
|
||||
|
||||
/// Feedback pageserver sends to safekeeper and safekeeper resends to compute.
|
||||
///
|
||||
/// Serialized in custom flexible key/value format. In replication protocol, it
|
||||
/// is marked with NEON_STATUS_UPDATE_TAG_BYTE to differentiate from postgres
|
||||
/// Standby status update / Hot standby feedback messages.
|
||||
|
||||
@@ -65,6 +65,8 @@ impl<T> Poison<T> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Armed pointer to a [`Poison`].
|
||||
///
|
||||
/// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state.
|
||||
/// Once modifications are done, use [`Self::disarm`].
|
||||
/// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned
|
||||
|
||||
@@ -13,10 +13,11 @@ pub struct ShardNumber(pub u8);
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
||||
pub struct ShardCount(pub u8);
|
||||
|
||||
/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant,
|
||||
/// when we need to know which shard we're dealing with, but do not need to know the full
|
||||
/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
|
||||
/// the fully qualified TenantShardId.
|
||||
/// Combination of ShardNumber and ShardCount.
|
||||
///
|
||||
/// For use within the context of a particular tenant, when we need to know which shard we're
|
||||
/// dealing with, but do not need to know the full ShardIdentity (because we won't be doing
|
||||
/// any page->shard mapping), and do not need to know the fully qualified TenantShardId.
|
||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||
pub struct ShardIndex {
|
||||
pub shard_number: ShardNumber,
|
||||
|
||||
@@ -49,12 +49,11 @@ use std::sync::{RwLock, RwLockWriteGuard};
|
||||
|
||||
use tokio::sync::watch;
|
||||
|
||||
///
|
||||
/// Rcu allows multiple readers to read and hold onto a value without blocking
|
||||
/// (for very long). Storing to the Rcu updates the value, making new readers
|
||||
/// immediately see the new value, but it also waits for all current readers to
|
||||
/// finish.
|
||||
/// (for very long).
|
||||
///
|
||||
/// Storing to the Rcu updates the value, making new readers immediately see
|
||||
/// the new value, but it also waits for all current readers to finish.
|
||||
pub struct Rcu<V> {
|
||||
inner: RwLock<RcuInner<V>>,
|
||||
}
|
||||
|
||||
@@ -5,7 +5,9 @@ use std::sync::{
|
||||
use tokio::sync::Semaphore;
|
||||
|
||||
/// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of
|
||||
/// `SemaphorePermit`, allowing use of `take` which does not require holding an outer mutex guard
|
||||
/// `SemaphorePermit`.
|
||||
///
|
||||
/// Allows use of `take` which does not require holding an outer mutex guard
|
||||
/// for the duration of initialization.
|
||||
///
|
||||
/// Has no unsafe, builds upon [`tokio::sync::Semaphore`] and [`std::sync::Mutex`].
|
||||
|
||||
@@ -7,6 +7,7 @@ pub enum VecMapOrdering {
|
||||
}
|
||||
|
||||
/// Ordered map datastructure implemented in a Vec.
|
||||
///
|
||||
/// Append only - can only add keys that are larger than the
|
||||
/// current max key.
|
||||
/// Ordering can be adjusted using [`VecMapOrdering`]
|
||||
|
||||
@@ -6,9 +6,10 @@ pub enum YieldingLoopError {
|
||||
Cancelled,
|
||||
}
|
||||
|
||||
/// Helper for long synchronous loops, e.g. over all tenants in the system. Periodically
|
||||
/// yields to avoid blocking the executor, and after resuming checks the provided
|
||||
/// cancellation token to drop out promptly on shutdown.
|
||||
/// Helper for long synchronous loops, e.g. over all tenants in the system.
|
||||
///
|
||||
/// Periodically yields to avoid blocking the executor, and after resuming
|
||||
/// checks the provided cancellation token to drop out promptly on shutdown.
|
||||
#[inline(always)]
|
||||
pub async fn yielding_loop<I, T, F>(
|
||||
interval: usize,
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
use std::{env, path::PathBuf, process::Command};
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use bindgen::CargoCallbacks;
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
// Tell cargo to invalidate the built crate whenever the wrapper changes
|
||||
@@ -64,16 +63,25 @@ fn main() -> anyhow::Result<()> {
|
||||
.map_err(|s| anyhow!("Bad postgres server path {s:?}"))?
|
||||
};
|
||||
|
||||
let unwind_abi_functions = [
|
||||
"log_internal",
|
||||
"recovery_download",
|
||||
"start_streaming",
|
||||
"finish_sync_safekeepers",
|
||||
"wait_event_set",
|
||||
"WalProposerStart",
|
||||
];
|
||||
|
||||
// The bindgen::Builder is the main entry point
|
||||
// to bindgen, and lets you build up options for
|
||||
// the resulting bindings.
|
||||
let bindings = bindgen::Builder::default()
|
||||
let mut builder = bindgen::Builder::default()
|
||||
// The input header we would like to generate
|
||||
// bindings for.
|
||||
.header("bindgen_deps.h")
|
||||
// Tell cargo to invalidate the built crate whenever any of the
|
||||
// included header files changed.
|
||||
.parse_callbacks(Box::new(CargoCallbacks))
|
||||
.parse_callbacks(Box::new(bindgen::CargoCallbacks::new()))
|
||||
.allowlist_type("WalProposer")
|
||||
.allowlist_type("WalProposerConfig")
|
||||
.allowlist_type("walproposer_api")
|
||||
@@ -105,7 +113,12 @@ fn main() -> anyhow::Result<()> {
|
||||
.allowlist_var("WL_SOCKET_MASK")
|
||||
.clang_arg("-DWALPROPOSER_LIB")
|
||||
.clang_arg(format!("-I{pgxn_neon}"))
|
||||
.clang_arg(format!("-I{inc_server_path}"))
|
||||
.clang_arg(format!("-I{inc_server_path}"));
|
||||
|
||||
for name in unwind_abi_functions {
|
||||
builder = builder.override_abi(bindgen::Abi::CUnwind, name);
|
||||
}
|
||||
let bindings = builder
|
||||
// Finish the builder and generate the bindings.
|
||||
.generate()
|
||||
// Unwrap the Result and panic on failure.
|
||||
|
||||
@@ -33,7 +33,7 @@ extern "C" fn get_shmem_state(wp: *mut WalProposer) -> *mut WalproposerShmemStat
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) {
|
||||
extern "C-unwind" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
@@ -187,7 +187,7 @@ extern "C" fn conn_blocking_write(
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool {
|
||||
extern "C-unwind" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
@@ -272,7 +272,7 @@ extern "C" fn rm_safekeeper_event_set(sk: *mut Safekeeper) {
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn wait_event_set(
|
||||
extern "C-unwind" fn wait_event_set(
|
||||
wp: *mut WalProposer,
|
||||
timeout: ::std::os::raw::c_long,
|
||||
event_sk: *mut *mut Safekeeper,
|
||||
@@ -324,7 +324,7 @@ extern "C" fn get_redo_start_lsn(wp: *mut WalProposer) -> XLogRecPtr {
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
|
||||
extern "C-unwind" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
@@ -340,7 +340,7 @@ extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, sk: *mut Safekee
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn log_internal(
|
||||
extern "C-unwind" fn log_internal(
|
||||
wp: *mut WalProposer,
|
||||
level: ::std::os::raw::c_int,
|
||||
line: *const ::std::os::raw::c_char,
|
||||
|
||||
@@ -1,2 +1,20 @@
|
||||
pub mod mgmt_api;
|
||||
pub mod page_service;
|
||||
|
||||
/// For timeline_block_unblock_gc, distinguish the two different operations. This could be a bool.
///
/// The `Display` form is the lowercase operation name: `block` or `unblock`.
// If file structure is per-kind not per-feature then where to put this?
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BlockUnblock {
    /// Displayed as `block`.
    Block,
    /// Displayed as `unblock`.
    Unblock,
}

impl std::fmt::Display for BlockUnblock {
    /// Writes the lowercase operation name without any padding or decoration.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(match self {
            BlockUnblock::Block => "block",
            BlockUnblock::Unblock => "unblock",
        })
    }
}
|
||||
|
||||
@@ -12,6 +12,8 @@ use utils::{
|
||||
|
||||
pub use reqwest::Body as ReqwestBody;
|
||||
|
||||
use crate::BlockUnblock;
|
||||
|
||||
pub mod util;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
@@ -454,6 +456,20 @@ impl Client {
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn timeline_block_unblock_gc(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
dir: BlockUnblock,
|
||||
) -> Result<()> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/{dir}_gc",
|
||||
self.mgmt_api_endpoint,
|
||||
);
|
||||
|
||||
self.request(Method::POST, &uri, ()).await.map(|_| ())
|
||||
}
|
||||
|
||||
pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{}/reset",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::pin::Pin;
|
||||
use std::{pin::Pin, sync::Arc};
|
||||
|
||||
use futures::SinkExt;
|
||||
use futures::{SinkExt, StreamExt};
|
||||
use pageserver_api::{
|
||||
models::{
|
||||
PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest,
|
||||
@@ -10,7 +10,6 @@ use pageserver_api::{
|
||||
};
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_postgres::CopyOutStream;
|
||||
use tokio_stream::StreamExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
@@ -136,18 +135,68 @@ impl PagestreamClient {
|
||||
drop(copy_both);
|
||||
}
|
||||
|
||||
pub async fn getpage(
|
||||
&mut self,
|
||||
req: PagestreamGetPageRequest,
|
||||
) -> anyhow::Result<PagestreamGetPageResponse> {
|
||||
pub fn split(self) -> (PagestreamTx, PagestreamRx) {
|
||||
let Self {
|
||||
copy_both,
|
||||
cancel_on_client_drop,
|
||||
conn_task,
|
||||
} = self;
|
||||
let keep_client_alive = KeepClientAlive {
|
||||
client: conn_task,
|
||||
cancel_on_client_drop: cancel_on_client_drop.unwrap(),
|
||||
};
|
||||
let keep_client_alive = Arc::new(keep_client_alive);
|
||||
let (sink, stream): (
|
||||
futures::stream::SplitSink<
|
||||
Pin<Box<tokio_postgres::CopyBothDuplex<bytes::Bytes>>>,
|
||||
bytes::Bytes,
|
||||
>,
|
||||
futures::stream::SplitStream<Pin<Box<tokio_postgres::CopyBothDuplex<bytes::Bytes>>>>,
|
||||
) = copy_both.split();
|
||||
(
|
||||
PagestreamTx {
|
||||
sink,
|
||||
keep_client_alive: keep_client_alive.clone(),
|
||||
},
|
||||
PagestreamRx {
|
||||
stream,
|
||||
keep_client_alive,
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
struct KeepClientAlive {
|
||||
client: JoinHandle<()>,
|
||||
cancel_on_client_drop: tokio_util::sync::DropGuard,
|
||||
}
|
||||
|
||||
pub struct PagestreamTx {
|
||||
sink: futures::stream::SplitSink<
|
||||
Pin<Box<tokio_postgres::CopyBothDuplex<bytes::Bytes>>>,
|
||||
bytes::Bytes,
|
||||
>,
|
||||
keep_client_alive: Arc<KeepClientAlive>,
|
||||
}
|
||||
|
||||
pub struct PagestreamRx {
|
||||
stream: futures::stream::SplitStream<Pin<Box<tokio_postgres::CopyBothDuplex<bytes::Bytes>>>>,
|
||||
keep_client_alive: Arc<KeepClientAlive>,
|
||||
}
|
||||
|
||||
impl PagestreamTx {
|
||||
pub async fn send_getpage(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> {
|
||||
let req = PagestreamFeMessage::GetPage(req);
|
||||
let req: bytes::Bytes = req.serialize();
|
||||
// let mut req = tokio_util::io::ReaderStream::new(&req);
|
||||
let mut req = tokio_stream::once(Ok(req));
|
||||
let mut req = tokio_stream::once(Ok(req.clone()));
|
||||
self.sink.send_all(&mut req).await?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
self.copy_both.send_all(&mut req).await?;
|
||||
|
||||
let next: Option<Result<bytes::Bytes, _>> = self.copy_both.next().await;
|
||||
impl PagestreamRx {
|
||||
pub async fn recv_getpage(&mut self) -> anyhow::Result<PagestreamGetPageResponse> {
|
||||
let next: Option<Result<bytes::Bytes, _>> = self.stream.next().await;
|
||||
let next: bytes::Bytes = next.unwrap()?;
|
||||
|
||||
let msg = PagestreamBeMessage::deserialize(next)?;
|
||||
|
||||
@@ -108,6 +108,7 @@ fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
|
||||
enum LineKind {
|
||||
GcCutoff,
|
||||
Branch,
|
||||
KeyVertical,
|
||||
}
|
||||
|
||||
impl From<LineKind> for Fill {
|
||||
@@ -115,6 +116,7 @@ impl From<LineKind> for Fill {
|
||||
match value {
|
||||
LineKind::GcCutoff => Fill::Color(rgb(255, 0, 0)),
|
||||
LineKind::Branch => Fill::Color(rgb(0, 255, 0)),
|
||||
LineKind::KeyVertical => Fill::Color(rgb(0, 0, 255)),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -126,6 +128,7 @@ impl FromStr for LineKind {
|
||||
Ok(match s {
|
||||
"gc_cutoff" => LineKind::GcCutoff,
|
||||
"branch" => LineKind::Branch,
|
||||
"key" => LineKind::KeyVertical,
|
||||
_ => anyhow::bail!("unsupported linekind: {s}"),
|
||||
})
|
||||
}
|
||||
@@ -142,25 +145,31 @@ pub fn main() -> Result<()> {
|
||||
let stdin = io::stdin();
|
||||
|
||||
let mut lines: Vec<(Lsn, LineKind)> = vec![];
|
||||
let mut vertical_lines: Vec<(Key, LineKind)> = vec![];
|
||||
|
||||
for (lineno, line) in stdin.lock().lines().enumerate() {
|
||||
let lineno = lineno + 1;
|
||||
|
||||
let line = line.unwrap();
|
||||
if let Some((kind, lsn)) = line.split_once(':') {
|
||||
let (kind, lsn) = LineKind::from_str(kind)
|
||||
.context("parse kind")
|
||||
.and_then(|kind| {
|
||||
if lsn.contains('/') {
|
||||
Lsn::from_str(lsn)
|
||||
} else {
|
||||
Lsn::from_hex(lsn)
|
||||
if let Some((kind, what)) = line.split_once(':') {
|
||||
(|| {
|
||||
match LineKind::from_str(kind).context("parse kind")? {
|
||||
kind @ LineKind::Branch | kind @ LineKind::GcCutoff => {
|
||||
let lsn = if what.contains('/') {
|
||||
Lsn::from_str(what)?
|
||||
} else {
|
||||
Lsn::from_hex(what)?
|
||||
};
|
||||
lines.push((lsn, kind));
|
||||
}
|
||||
.map(|lsn| (kind, lsn))
|
||||
.context("parse lsn")
|
||||
})
|
||||
.with_context(|| format!("parse {line:?} on {lineno}"))?;
|
||||
lines.push((lsn, kind));
|
||||
kind @ LineKind::KeyVertical => {
|
||||
let key = Key::from_hex(what).context("parse key")?;
|
||||
vertical_lines.push((key, kind));
|
||||
}
|
||||
}
|
||||
anyhow::Ok(())
|
||||
})()
|
||||
.with_context(|| format!("parse {line:?} on {lineno}"))?;
|
||||
continue;
|
||||
}
|
||||
let line = PathBuf::from_str(&line).unwrap();
|
||||
@@ -175,7 +184,7 @@ pub fn main() -> Result<()> {
|
||||
}
|
||||
|
||||
// Collect all coordinates
|
||||
let mut keys: Vec<Key> = Vec::with_capacity(files.len());
|
||||
let mut keys: Vec<Key> = Vec::with_capacity(files.len() + vertical_lines.len());
|
||||
let mut lsns: Vec<Lsn> = Vec::with_capacity(files.len() + lines.len());
|
||||
|
||||
for Layer {
|
||||
@@ -192,6 +201,8 @@ pub fn main() -> Result<()> {
|
||||
|
||||
lsns.extend(lines.iter().map(|(lsn, _)| *lsn));
|
||||
|
||||
keys.extend(vertical_lines.iter().map(|(key, _)| *key));
|
||||
|
||||
// Analyze
|
||||
let key_map = build_coordinate_compression_map(keys);
|
||||
let lsn_map = build_coordinate_compression_map(lsns);
|
||||
@@ -283,6 +294,25 @@ pub fn main() -> Result<()> {
|
||||
);
|
||||
}
|
||||
|
||||
for (key, kind) in vertical_lines {
|
||||
let key = *key_map.get(&key).unwrap();
|
||||
let stretch = 2.0;
|
||||
let xmargin = 0.05;
|
||||
let ymargin = 0.05;
|
||||
let lsn_diff = 0.3;
|
||||
let lsn_offset = -lsn_diff / 2.0;
|
||||
println!(
|
||||
"{}",
|
||||
rectangle(
|
||||
5.0 + key as f32,
|
||||
0.0,
|
||||
(key_map.len() + 10) as f32,
|
||||
lsn_map.len() as f32,
|
||||
)
|
||||
.fill(kind)
|
||||
);
|
||||
}
|
||||
|
||||
println!("{}", EndSvg);
|
||||
|
||||
eprintln!("num_images: {}", num_images);
|
||||
|
||||
@@ -13,7 +13,7 @@ use rand::prelude::*;
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::info;
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::collections::{HashSet, VecDeque};
|
||||
use std::future::Future;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::pin::Pin;
|
||||
@@ -295,64 +295,58 @@ async fn main_impl(
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let (mut pagestream_tx, mut pagestream_rx) = client.split();
|
||||
|
||||
start_work_barrier.wait().await;
|
||||
let client_start = Instant::now();
|
||||
let mut ticks_processed = 0;
|
||||
while !cancel.is_cancelled() {
|
||||
// Detect if a request took longer than the RPS rate
|
||||
if let Some(period) = &rps_period {
|
||||
let periods_passed_until_now =
|
||||
usize::try_from(client_start.elapsed().as_micros() / period.as_micros())
|
||||
let (rq_tx, mut rq_rx) = tokio::sync::mpsc::channel(4096);
|
||||
let sender = tokio::spawn(async move {
|
||||
while !cancel.is_cancelled() {
|
||||
let start = Instant::now();
|
||||
let req = {
|
||||
let mut rng = rand::thread_rng();
|
||||
let r = &ranges[weights.sample(&mut rng)];
|
||||
let key: i128 = rng.gen_range(r.start..r.end);
|
||||
let key = Key::from_i128(key);
|
||||
assert!(key.is_rel_block_key());
|
||||
let (rel_tag, block_no) = key
|
||||
.to_rel_block()
|
||||
.expect("we filter non-rel-block keys out above");
|
||||
PagestreamGetPageRequest {
|
||||
request_lsn: if rng.gen_bool(args.req_latest_probability) {
|
||||
Lsn::MAX
|
||||
} else {
|
||||
r.timeline_lsn
|
||||
},
|
||||
not_modified_since: r.timeline_lsn,
|
||||
rel: rel_tag,
|
||||
blkno: block_no,
|
||||
}
|
||||
};
|
||||
pagestream_tx.send_getpage(req).await.unwrap();
|
||||
rq_tx.send(start).await.unwrap();
|
||||
}
|
||||
});
|
||||
|
||||
let receiver = tokio::spawn(async move {
|
||||
while let Some(start) = rq_rx.recv().await {
|
||||
let response = pagestream_rx.recv_getpage().await.unwrap();
|
||||
let end = Instant::now();
|
||||
live_stats.request_done();
|
||||
STATS.with(|stats| {
|
||||
stats
|
||||
.borrow()
|
||||
.lock()
|
||||
.unwrap()
|
||||
.observe(end.duration_since(start))
|
||||
.unwrap();
|
||||
|
||||
if periods_passed_until_now > ticks_processed {
|
||||
live_stats.missed((periods_passed_until_now - ticks_processed) as u64);
|
||||
}
|
||||
ticks_processed = periods_passed_until_now;
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
let start = Instant::now();
|
||||
let req = {
|
||||
let mut rng = rand::thread_rng();
|
||||
let r = &ranges[weights.sample(&mut rng)];
|
||||
let key: i128 = rng.gen_range(r.start..r.end);
|
||||
let key = Key::from_i128(key);
|
||||
assert!(key.is_rel_block_key());
|
||||
let (rel_tag, block_no) = key
|
||||
.to_rel_block()
|
||||
.expect("we filter non-rel-block keys out above");
|
||||
PagestreamGetPageRequest {
|
||||
request_lsn: if rng.gen_bool(args.req_latest_probability) {
|
||||
Lsn::MAX
|
||||
} else {
|
||||
r.timeline_lsn
|
||||
},
|
||||
not_modified_since: r.timeline_lsn,
|
||||
rel: rel_tag,
|
||||
blkno: block_no,
|
||||
}
|
||||
};
|
||||
client.getpage(req).await.unwrap();
|
||||
let end = Instant::now();
|
||||
live_stats.request_done();
|
||||
ticks_processed += 1;
|
||||
STATS.with(|stats| {
|
||||
stats
|
||||
.borrow()
|
||||
.lock()
|
||||
.unwrap()
|
||||
.observe(end.duration_since(start))
|
||||
.unwrap();
|
||||
});
|
||||
|
||||
if let Some(period) = &rps_period {
|
||||
let next_at = client_start
|
||||
+ Duration::from_micros(
|
||||
(ticks_processed) as u64 * u64::try_from(period.as_micros()).unwrap(),
|
||||
);
|
||||
tokio::time::sleep_until(next_at.into()).await;
|
||||
}
|
||||
}
|
||||
sender.await.unwrap();
|
||||
receiver.await.unwrap();
|
||||
})
|
||||
};
|
||||
|
||||
|
||||
@@ -37,6 +37,7 @@ use pageserver::{
|
||||
virtual_file,
|
||||
};
|
||||
use postgres_backend::AuthType;
|
||||
use utils::crashsafe::syncfs;
|
||||
use utils::failpoint_support;
|
||||
use utils::logging::TracingErrorLayerEnablement;
|
||||
use utils::{
|
||||
@@ -125,7 +126,6 @@ fn main() -> anyhow::Result<()> {
|
||||
// after setting up logging, log the effective IO engine choice and read path implementations
|
||||
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
|
||||
info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
|
||||
info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");
|
||||
info!(?conf.io_buffer_alignment, "starting with setting for IO buffer alignment");
|
||||
|
||||
// The tenants directory contains all the pageserver local disk state.
|
||||
@@ -156,23 +156,7 @@ fn main() -> anyhow::Result<()> {
|
||||
};
|
||||
|
||||
let started = Instant::now();
|
||||
// Linux guarantees durability for syncfs.
|
||||
// POSIX doesn't have syncfs, and further does not actually guarantee durability of sync().
|
||||
#[cfg(target_os = "linux")]
|
||||
{
|
||||
use std::os::fd::AsRawFd;
|
||||
nix::unistd::syncfs(dirfd.as_raw_fd()).context("syncfs")?;
|
||||
}
|
||||
#[cfg(target_os = "macos")]
|
||||
{
|
||||
// macOS is not a production platform for Neon, don't even bother.
|
||||
drop(dirfd);
|
||||
}
|
||||
#[cfg(not(any(target_os = "linux", target_os = "macos")))]
|
||||
{
|
||||
compile_error!("Unsupported OS");
|
||||
}
|
||||
|
||||
syncfs(dirfd)?;
|
||||
let elapsed = started.elapsed();
|
||||
info!(
|
||||
elapsed_ms = elapsed.as_millis(),
|
||||
|
||||
@@ -174,16 +174,14 @@ pub struct PageServerConf {
|
||||
|
||||
pub l0_flush: crate::l0_flush::L0FlushConfig,
|
||||
|
||||
/// This flag is temporary and will be removed after gradual rollout.
|
||||
/// See <https://github.com/neondatabase/neon/issues/8184>.
|
||||
pub compact_level0_phase1_value_access: pageserver_api::config::CompactL0Phase1ValueAccess,
|
||||
|
||||
/// Direct IO settings
|
||||
pub virtual_file_direct_io: virtual_file::DirectIoMode,
|
||||
|
||||
pub io_buffer_alignment: usize,
|
||||
}
|
||||
|
||||
/// Token for authentication to safekeepers
|
||||
///
|
||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||
/// and/or serialized at a whim, while the token is secret. Currently this token is the
|
||||
/// same for accessing all tenants/timelines, but may become per-tenant/per-timeline in
|
||||
@@ -338,7 +336,7 @@ impl PageServerConf {
|
||||
max_vectored_read_bytes,
|
||||
image_compression,
|
||||
ephemeral_bytes_per_memory_kb,
|
||||
compact_level0_phase1_value_access,
|
||||
compact_level0_phase1_value_access: _,
|
||||
l0_flush,
|
||||
virtual_file_direct_io,
|
||||
concurrent_tenant_warmup,
|
||||
@@ -383,7 +381,6 @@ impl PageServerConf {
|
||||
max_vectored_read_bytes,
|
||||
image_compression,
|
||||
ephemeral_bytes_per_memory_kb,
|
||||
compact_level0_phase1_value_access,
|
||||
virtual_file_direct_io,
|
||||
io_buffer_alignment,
|
||||
|
||||
@@ -561,6 +558,16 @@ mod tests {
|
||||
.expect("parse_and_validate");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_compactl0_phase1_access_mode_is_ignored_silently() {
|
||||
let input = indoc::indoc! {r#"
|
||||
[compact_level0_phase1_value_access]
|
||||
mode = "streaming-kmerge"
|
||||
validate = "key-lsn-value"
|
||||
"#};
|
||||
toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input).unwrap();
|
||||
}
|
||||
|
||||
/// If there's a typo in the pageserver config, we'd rather catch that typo
|
||||
/// and fail pageserver startup than silently ignoring the typo, leaving whoever
|
||||
/// made it in the belief that their config change is effective.
|
||||
@@ -637,14 +644,5 @@ mod tests {
|
||||
// some_invalid_field = 23
|
||||
// "#}
|
||||
// );
|
||||
|
||||
test!(
|
||||
compact_level0_phase1_value_access,
|
||||
indoc! {r#"
|
||||
[compact_level0_phase1_value_access]
|
||||
mode = "streaming-kmerge"
|
||||
some_invalid_field = 23
|
||||
"#}
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
//! This module defines `RequestContext`, a structure that we use throughout
|
||||
//! the pageserver to propagate high-level context from places
|
||||
//! that _originate_ activity down to the shared code paths at the
|
||||
//! heart of the pageserver. It's inspired by Golang's `context.Context`.
|
||||
//! Defines [`RequestContext`].
|
||||
//!
|
||||
//! It is a structure that we use throughout the pageserver to propagate
|
||||
//! high-level context from places that _originate_ activity down to the
|
||||
//! shared code paths at the heart of the pageserver. It's inspired by
|
||||
//! Golang's `context.Context`.
|
||||
//!
|
||||
//! For example, in `Timeline::get(page_nr, lsn)` we need to answer the following questions:
|
||||
//! 1. What high-level activity ([`TaskKind`]) needs this page?
|
||||
|
||||
@@ -141,10 +141,24 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
||||
m.other
|
||||
);
|
||||
|
||||
let az_id = m
|
||||
.other
|
||||
.get("availability_zone_id")
|
||||
.and_then(|jv| jv.as_str().map(|str| str.to_owned()));
|
||||
let az_id = {
|
||||
let az_id_from_metadata = m
|
||||
.other
|
||||
.get("availability_zone_id")
|
||||
.and_then(|jv| jv.as_str().map(|str| str.to_owned()));
|
||||
|
||||
match az_id_from_metadata {
|
||||
Some(az_id) => Some(az_id),
|
||||
None => {
|
||||
tracing::warn!("metadata.json does not contain an 'availability_zone_id' field");
|
||||
conf.availability_zone.clone()
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
if az_id.is_none() {
|
||||
panic!("Availablity zone id could not be inferred from metadata.json or pageserver config");
|
||||
}
|
||||
|
||||
Some(NodeRegisterRequest {
|
||||
node_id: conf.id,
|
||||
@@ -152,7 +166,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
||||
listen_pg_port: m.postgres_port,
|
||||
listen_http_addr: m.http_host,
|
||||
listen_http_port: m.http_port,
|
||||
availability_zone_id: az_id,
|
||||
availability_zone_id: az_id.expect("Checked above"),
|
||||
})
|
||||
}
|
||||
Err(e) => {
|
||||
|
||||
@@ -1185,6 +1185,7 @@ struct GlobalAndPerTimelineHistogramTimer<'a, 'c> {
|
||||
ctx: &'c RequestContext,
|
||||
start: std::time::Instant,
|
||||
op: SmgrQueryType,
|
||||
count: usize,
|
||||
}
|
||||
|
||||
impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> {
|
||||
@@ -1212,9 +1213,11 @@ impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> {
|
||||
elapsed
|
||||
}
|
||||
};
|
||||
self.global_metric.observe(ex_throttled.as_secs_f64());
|
||||
if let Some(timeline_metric) = self.timeline_metric {
|
||||
timeline_metric.observe(ex_throttled.as_secs_f64());
|
||||
for _ in 0..self.count {
|
||||
self.global_metric.observe(ex_throttled.as_secs_f64());
|
||||
if let Some(timeline_metric) = self.timeline_metric {
|
||||
timeline_metric.observe(ex_throttled.as_secs_f64());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1343,6 +1346,14 @@ impl SmgrQueryTimePerTimeline {
|
||||
&'a self,
|
||||
op: SmgrQueryType,
|
||||
ctx: &'c RequestContext,
|
||||
) -> Option<impl Drop + '_> {
|
||||
self.start_timer_many(op, 1, ctx)
|
||||
}
|
||||
pub(crate) fn start_timer_many<'c: 'a, 'a>(
|
||||
&'a self,
|
||||
op: SmgrQueryType,
|
||||
count: usize,
|
||||
ctx: &'c RequestContext,
|
||||
) -> Option<impl Drop + '_> {
|
||||
let global_metric = &self.global_metrics[op as usize];
|
||||
let start = Instant::now();
|
||||
@@ -1376,6 +1387,7 @@ impl SmgrQueryTimePerTimeline {
|
||||
ctx,
|
||||
start,
|
||||
op,
|
||||
count,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -3170,6 +3182,16 @@ static TOKIO_EXECUTOR_THREAD_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub(crate) static CONSECUTIVE_NONBLOCKING_GETPAGE_REQUESTS_HISTOGRAM: Lazy<Histogram> =
|
||||
Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_consecutive_nonblocking_getpage_requests",
|
||||
"Number of consecutive nonblocking getpage requests",
|
||||
(0..=256).map(|x| x as f64).collect::<Vec<f64>>(),
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) {
|
||||
static SERIALIZE: std::sync::Mutex<()> = std::sync::Mutex::new(());
|
||||
let _guard = SERIALIZE.lock().unwrap();
|
||||
|
||||
@@ -5,14 +5,14 @@ use anyhow::Context;
|
||||
use async_compression::tokio::write::GzipEncoder;
|
||||
use bytes::Buf;
|
||||
use futures::FutureExt;
|
||||
use once_cell::sync::OnceCell;
|
||||
use pageserver_api::models::TenantState;
|
||||
use once_cell::sync::{Lazy, OnceCell};
|
||||
use pageserver_api::models::{self, TenantState};
|
||||
use pageserver_api::models::{
|
||||
PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
|
||||
PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
|
||||
PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
|
||||
PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest,
|
||||
PagestreamNblocksResponse, PagestreamProtocolVersion,
|
||||
PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetSlruSegmentRequest,
|
||||
PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, PagestreamNblocksResponse,
|
||||
PagestreamProtocolVersion,
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError};
|
||||
@@ -43,7 +43,7 @@ use crate::basebackup;
|
||||
use crate::basebackup::BasebackupError;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::metrics;
|
||||
use crate::metrics::{self, CONSECUTIVE_NONBLOCKING_GETPAGE_REQUESTS_HISTOGRAM};
|
||||
use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS};
|
||||
use crate::pgdatadir_mapping::Version;
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
@@ -58,7 +58,7 @@ use crate::tenant::GetTimelineError;
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::tenant::Timeline;
|
||||
use pageserver_api::key::rel_block_to_key;
|
||||
use pageserver_api::reltag::SlruKind;
|
||||
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
|
||||
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
|
||||
use postgres_ffi::BLCKSZ;
|
||||
|
||||
@@ -577,124 +577,326 @@ impl PageServerHandler {
|
||||
}
|
||||
}
|
||||
|
||||
loop {
|
||||
// read request bytes (it's exactly 1 PagestreamFeMessage per CopyData)
|
||||
let msg = tokio::select! {
|
||||
biased;
|
||||
_ = self.cancel.cancelled() => {
|
||||
return Err(QueryError::Shutdown)
|
||||
}
|
||||
msg = pgb.read_message() => { msg }
|
||||
};
|
||||
let copy_data_bytes = match msg? {
|
||||
Some(FeMessage::CopyData(bytes)) => bytes,
|
||||
Some(FeMessage::Terminate) => break,
|
||||
Some(m) => {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"unexpected message: {m:?} during COPY"
|
||||
)));
|
||||
}
|
||||
None => break, // client disconnected
|
||||
};
|
||||
let mut batched = None;
|
||||
'outer: loop {
|
||||
enum DebouncedFeMessage {
|
||||
Exists(models::PagestreamExistsRequest),
|
||||
Nblocks(models::PagestreamNblocksRequest),
|
||||
GetPage {
|
||||
span: Span,
|
||||
shard: timeline::handle::Handle<TenantManagerTypes>,
|
||||
effective_request_lsn: Lsn,
|
||||
pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>,
|
||||
},
|
||||
DbSize(models::PagestreamDbSizeRequest),
|
||||
GetSlruSegment(models::PagestreamGetSlruSegmentRequest),
|
||||
RespondError(Span, PageStreamError),
|
||||
}
|
||||
let mut debounce: Option<std::time::Instant> = None;
|
||||
// return or `?` on protocol error
|
||||
// `break EXPR` to stop batching. The EXPR will be the first message in the next batch.
|
||||
let next_batched: Option<DebouncedFeMessage> = loop {
|
||||
static BOUNCE_TIMEOUT: Lazy<Duration> = Lazy::new(|| {
|
||||
utils::env::var::<humantime::Duration, _>("NEON_PAGESERVER_DEBOUNCE")
|
||||
.unwrap()
|
||||
.into()
|
||||
});
|
||||
let sleep_fut = if let Some(started_at) = debounce {
|
||||
futures::future::Either::Left(tokio::time::sleep_until(
|
||||
(started_at + *BOUNCE_TIMEOUT).into(),
|
||||
))
|
||||
} else {
|
||||
futures::future::Either::Right(futures::future::pending())
|
||||
};
|
||||
let msg = tokio::select! {
|
||||
biased;
|
||||
_ = self.cancel.cancelled() => {
|
||||
return Err(QueryError::Shutdown)
|
||||
}
|
||||
msg = pgb.read_message() => {
|
||||
msg
|
||||
}
|
||||
_ = sleep_fut => {
|
||||
assert!(batched.is_some());
|
||||
break None;
|
||||
}
|
||||
};
|
||||
let copy_data_bytes = match msg? {
|
||||
Some(FeMessage::CopyData(bytes)) => bytes,
|
||||
Some(FeMessage::Terminate) => break 'outer,
|
||||
Some(m) => {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"unexpected message: {m:?} during COPY"
|
||||
)));
|
||||
}
|
||||
None => break 'outer, // client disconnected
|
||||
};
|
||||
trace!("query: {copy_data_bytes:?}");
|
||||
fail::fail_point!("ps::handle-pagerequest-message");
|
||||
|
||||
trace!("query: {copy_data_bytes:?}");
|
||||
fail::fail_point!("ps::handle-pagerequest-message");
|
||||
// parse request
|
||||
let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
|
||||
|
||||
// parse request
|
||||
let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
|
||||
let this_msg = match neon_fe_msg {
|
||||
PagestreamFeMessage::Exists(msg) => DebouncedFeMessage::Exists(msg),
|
||||
PagestreamFeMessage::Nblocks(msg) => DebouncedFeMessage::Nblocks(msg),
|
||||
PagestreamFeMessage::DbSize(msg) => DebouncedFeMessage::DbSize(msg),
|
||||
PagestreamFeMessage::GetSlruSegment(msg) => {
|
||||
DebouncedFeMessage::GetSlruSegment(msg)
|
||||
}
|
||||
PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
rel,
|
||||
blkno,
|
||||
}) => {
|
||||
let span = tracing::info_span!("handle_get_page_at_lsn_request_batched", %tenant_id, %timeline_id, shard_id = tracing::field::Empty, req_lsn = %request_lsn, batch_size = tracing::field::Empty, batch_id = tracing::field::Empty);
|
||||
let key = rel_block_to_key(rel, blkno);
|
||||
let shard = match self
|
||||
.timeline_handles
|
||||
.get(tenant_id, timeline_id, ShardSelector::Page(key))
|
||||
.instrument(span.clone())
|
||||
.await
|
||||
{
|
||||
Ok(tl) => tl,
|
||||
Err(GetActiveTimelineError::Tenant(
|
||||
GetActiveTenantError::NotFound(_),
|
||||
)) => {
|
||||
// We already know this tenant exists in general, because we resolved it at
|
||||
// start of connection. Getting a NotFound here indicates that the shard containing
|
||||
// the requested page is not present on this node: the client's knowledge of shard->pageserver
|
||||
// mapping is out of date.
|
||||
//
|
||||
// Closing the connection by returning `::Reconnect` has the side effect of rate-limiting above message, via
|
||||
// client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
|
||||
// and talk to a different pageserver.
|
||||
break Some(DebouncedFeMessage::RespondError(
|
||||
span,
|
||||
PageStreamError::Reconnect(
|
||||
"getpage@lsn request routed to wrong shard".into(),
|
||||
),
|
||||
));
|
||||
}
|
||||
Err(e) => break Some(DebouncedFeMessage::RespondError(span, e.into())),
|
||||
};
|
||||
let effective_request_lsn = match Self::wait_or_get_last_lsn(
|
||||
&shard,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
&shard.get_latest_gc_cutoff_lsn(),
|
||||
&ctx,
|
||||
)
|
||||
// TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait
|
||||
.await
|
||||
{
|
||||
Ok(lsn) => lsn,
|
||||
Err(e) => {
|
||||
break Some(DebouncedFeMessage::RespondError(span, e));
|
||||
}
|
||||
};
|
||||
DebouncedFeMessage::GetPage {
|
||||
span,
|
||||
shard,
|
||||
effective_request_lsn,
|
||||
pages: smallvec::smallvec![(rel, blkno)],
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// check if we can debounce
|
||||
match (&mut batched, this_msg) {
|
||||
(None, this_msg) => {
|
||||
batched = Some(this_msg);
|
||||
}
|
||||
(
|
||||
Some(DebouncedFeMessage::GetPage {
|
||||
span: _,
|
||||
shard: accum_shard,
|
||||
pages: accum_pages,
|
||||
effective_request_lsn: accum_lsn,
|
||||
}),
|
||||
DebouncedFeMessage::GetPage {
|
||||
span: _,
|
||||
shard: this_shard,
|
||||
pages: this_pages,
|
||||
effective_request_lsn: this_lsn,
|
||||
},
|
||||
) if async {
|
||||
assert_eq!(this_pages.len(), 1);
|
||||
if accum_pages.len() >= Timeline::MAX_GET_VECTORED_KEYS as usize {
|
||||
assert_eq!(accum_pages.len(), Timeline::MAX_GET_VECTORED_KEYS as usize);
|
||||
return false;
|
||||
}
|
||||
if (accum_shard.tenant_shard_id, accum_shard.timeline_id)
|
||||
!= (this_shard.tenant_shard_id, this_shard.timeline_id)
|
||||
{
|
||||
// TODO: we _could_ batch & execute each shard separately (and in parallel).
|
||||
// But the current logic for keeping responses in order does not support that.
|
||||
return false;
|
||||
}
|
||||
// the vectored get currently only supports a single LSN, so, bounce as soon
|
||||
// as the effective request_lsn changes
|
||||
if (*accum_lsn != this_lsn) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
.await =>
|
||||
{
|
||||
// ok to batch
|
||||
accum_pages.extend(this_pages);
|
||||
}
|
||||
(Some(_), this_msg) => {
|
||||
// by default, don't continue batching
|
||||
break Some(this_msg);
|
||||
}
|
||||
}
|
||||
|
||||
// debounce impl piece
|
||||
let started_at = debounce.get_or_insert_with(Instant::now);
|
||||
if started_at.elapsed() > *BOUNCE_TIMEOUT {
|
||||
break None;
|
||||
}
|
||||
};
|
||||
|
||||
// invoke handler function
|
||||
let (handler_result, span) = match neon_fe_msg {
|
||||
PagestreamFeMessage::Exists(req) => {
|
||||
let (handler_results, span): (
|
||||
smallvec::SmallVec<[Result<PagestreamBeMessage, PageStreamError>; 1]>,
|
||||
_,
|
||||
) = match batched.take().expect("loop above ensures this") {
|
||||
DebouncedFeMessage::Exists(req) => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::exists");
|
||||
let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn);
|
||||
(
|
||||
self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx)
|
||||
.instrument(span.clone())
|
||||
.await,
|
||||
smallvec::smallvec![
|
||||
self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx)
|
||||
.instrument(span.clone())
|
||||
.await
|
||||
],
|
||||
span,
|
||||
)
|
||||
}
|
||||
PagestreamFeMessage::Nblocks(req) => {
|
||||
DebouncedFeMessage::Nblocks(req) => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::nblocks");
|
||||
let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn);
|
||||
(
|
||||
self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
|
||||
.instrument(span.clone())
|
||||
.await,
|
||||
smallvec::smallvec![
|
||||
self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
|
||||
.instrument(span.clone())
|
||||
.await,
|
||||
],
|
||||
span,
|
||||
)
|
||||
}
|
||||
PagestreamFeMessage::GetPage(req) => {
|
||||
DebouncedFeMessage::GetPage {
|
||||
span,
|
||||
shard,
|
||||
effective_request_lsn,
|
||||
pages,
|
||||
} => {
|
||||
CONSECUTIVE_NONBLOCKING_GETPAGE_REQUESTS_HISTOGRAM.observe(pages.len() as f64);
|
||||
span.record("batch_size", pages.len() as u64);
|
||||
static BATCH_ID: Lazy<std::sync::atomic::AtomicUsize> =
|
||||
Lazy::new(|| std::sync::atomic::AtomicUsize::new(0));
|
||||
span.record(
|
||||
"batch_id",
|
||||
BATCH_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed) as u64,
|
||||
);
|
||||
fail::fail_point!("ps::handle-pagerequest-message::getpage");
|
||||
// shard_id is filled in by the handler
|
||||
let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.request_lsn);
|
||||
(
|
||||
self.handle_get_page_at_lsn_request(tenant_id, timeline_id, &req, &ctx)
|
||||
.instrument(span.clone())
|
||||
.await,
|
||||
{
|
||||
let npages = pages.len();
|
||||
let res = self
|
||||
.handle_get_page_at_lsn_request_batched(
|
||||
&shard,
|
||||
effective_request_lsn,
|
||||
pages,
|
||||
&ctx,
|
||||
)
|
||||
.instrument(span.clone())
|
||||
.await;
|
||||
assert_eq!(res.len(), npages);
|
||||
res
|
||||
},
|
||||
span,
|
||||
)
|
||||
}
|
||||
PagestreamFeMessage::DbSize(req) => {
|
||||
DebouncedFeMessage::DbSize(req) => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::dbsize");
|
||||
let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn);
|
||||
(
|
||||
self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
|
||||
.instrument(span.clone())
|
||||
.await,
|
||||
smallvec::smallvec![
|
||||
self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
|
||||
.instrument(span.clone())
|
||||
.await
|
||||
],
|
||||
span,
|
||||
)
|
||||
}
|
||||
PagestreamFeMessage::GetSlruSegment(req) => {
|
||||
DebouncedFeMessage::GetSlruSegment(req) => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::slrusegment");
|
||||
let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn);
|
||||
(
|
||||
self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx)
|
||||
smallvec::smallvec![
|
||||
self.handle_get_slru_segment_request(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
&req,
|
||||
&ctx
|
||||
)
|
||||
.instrument(span.clone())
|
||||
.await,
|
||||
.await
|
||||
],
|
||||
span,
|
||||
)
|
||||
}
|
||||
DebouncedFeMessage::RespondError(span, e) => {
|
||||
// We've already decided to respond with an error, so we don't need to
|
||||
// call the handler.
|
||||
(smallvec::smallvec![Err(e)], span)
|
||||
}
|
||||
};
|
||||
|
||||
// Map handler result to protocol behavior.
|
||||
// Some handler errors cause exit from pagestream protocol.
|
||||
// Other handler errors are sent back as an error message and we stay in pagestream protocol.
|
||||
let response_msg = match handler_result {
|
||||
Err(e) => match &e {
|
||||
PageStreamError::Shutdown => {
|
||||
// If we fail to fulfil a request during shutdown, which may be _because_ of
|
||||
// shutdown, then do not send the error to the client. Instead just drop the
|
||||
// connection.
|
||||
span.in_scope(|| info!("dropping connection due to shutdown"));
|
||||
return Err(QueryError::Shutdown);
|
||||
}
|
||||
PageStreamError::Reconnect(reason) => {
|
||||
span.in_scope(|| info!("handler requested reconnect: {reason}"));
|
||||
return Err(QueryError::Reconnect);
|
||||
}
|
||||
PageStreamError::Read(_)
|
||||
| PageStreamError::LsnTimeout(_)
|
||||
| PageStreamError::NotFound(_)
|
||||
| PageStreamError::BadRequest(_) => {
|
||||
// print the all details to the log with {:#}, but for the client the
|
||||
// error message is enough. Do not log if shutting down, as the anyhow::Error
|
||||
// here includes cancellation which is not an error.
|
||||
let full = utils::error::report_compact_sources(&e);
|
||||
span.in_scope(|| {
|
||||
error!("error reading relation or page version: {full:#}")
|
||||
});
|
||||
PagestreamBeMessage::Error(PagestreamErrorResponse {
|
||||
message: e.to_string(),
|
||||
})
|
||||
}
|
||||
},
|
||||
Ok(response_msg) => response_msg,
|
||||
};
|
||||
for handler_result in handler_results {
|
||||
let response_msg = match handler_result {
|
||||
Err(e) => match &e {
|
||||
PageStreamError::Shutdown => {
|
||||
// If we fail to fulfil a request during shutdown, which may be _because_ of
|
||||
// shutdown, then do not send the error to the client. Instead just drop the
|
||||
// connection.
|
||||
span.in_scope(|| info!("dropping connection due to shutdown"));
|
||||
return Err(QueryError::Shutdown);
|
||||
}
|
||||
PageStreamError::Reconnect(reason) => {
|
||||
span.in_scope(|| info!("handler requested reconnect: {reason}"));
|
||||
return Err(QueryError::Reconnect);
|
||||
}
|
||||
PageStreamError::Read(_)
|
||||
| PageStreamError::LsnTimeout(_)
|
||||
| PageStreamError::NotFound(_)
|
||||
| PageStreamError::BadRequest(_) => {
|
||||
// print all the details to the log with {:#}, but for the client the
|
||||
// error message is enough. Do not log if shutting down, as the anyhow::Error
|
||||
// here includes cancellation which is not an error.
|
||||
let full = utils::error::report_compact_sources(&e);
|
||||
span.in_scope(|| {
|
||||
error!("error reading relation or page version: {full:#}")
|
||||
});
|
||||
PagestreamBeMessage::Error(PagestreamErrorResponse {
|
||||
message: e.to_string(),
|
||||
})
|
||||
}
|
||||
},
|
||||
Ok(response_msg) => response_msg,
|
||||
};
|
||||
|
||||
// marshal & transmit response message
|
||||
pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
|
||||
// marshal & transmit response message
|
||||
pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
|
||||
}
|
||||
tokio::select! {
|
||||
biased;
|
||||
_ = self.cancel.cancelled() => {
|
||||
@@ -706,6 +908,9 @@ impl PageServerHandler {
|
||||
res?;
|
||||
}
|
||||
}
|
||||
|
||||
assert!(batched.is_none(), "we take() earlier");
|
||||
batched = next_batched;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -949,60 +1154,30 @@ impl PageServerHandler {
|
||||
}))
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(shard_id))]
|
||||
async fn handle_get_page_at_lsn_request(
|
||||
#[instrument(skip_all)]
|
||||
async fn handle_get_page_at_lsn_request_batched(
|
||||
&mut self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
req: &PagestreamGetPageRequest,
|
||||
timeline: &Timeline,
|
||||
effective_lsn: Lsn,
|
||||
pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<PagestreamBeMessage, PageStreamError> {
|
||||
let timeline = match self
|
||||
.timeline_handles
|
||||
.get(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
ShardSelector::Page(rel_block_to_key(req.rel, req.blkno)),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(tl) => tl,
|
||||
Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
|
||||
// We already know this tenant exists in general, because we resolved it at
|
||||
// start of connection. Getting a NotFound here indicates that the shard containing
|
||||
// the requested page is not present on this node: the client's knowledge of shard->pageserver
|
||||
// mapping is out of date.
|
||||
//
|
||||
// Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via
|
||||
// client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
|
||||
// and talk to a different pageserver.
|
||||
return Err(PageStreamError::Reconnect(
|
||||
"getpage@lsn request routed to wrong shard".into(),
|
||||
));
|
||||
}
|
||||
Err(e) => return Err(e.into()),
|
||||
};
|
||||
|
||||
let _timer = timeline
|
||||
.query_metrics
|
||||
.start_timer(metrics::SmgrQueryType::GetPageAtLsn, ctx);
|
||||
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(
|
||||
&timeline,
|
||||
req.request_lsn,
|
||||
req.not_modified_since,
|
||||
&latest_gc_cutoff_lsn,
|
||||
) -> smallvec::SmallVec<[Result<PagestreamBeMessage, PageStreamError>; 1]> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
let _timer = timeline.query_metrics.start_timer_many(
|
||||
metrics::SmgrQueryType::GetPageAtLsn,
|
||||
pages.len(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
);
|
||||
|
||||
let page = timeline
|
||||
.get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), ctx)
|
||||
.await?;
|
||||
let pages = timeline
|
||||
.get_rel_page_at_lsn_batched(pages, Version::Lsn(effective_lsn), ctx)
|
||||
.await;
|
||||
|
||||
Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
|
||||
page,
|
||||
smallvec::SmallVec::from_iter(pages.into_iter().map(|page| {
|
||||
page.map(|page| {
|
||||
PagestreamBeMessage::GetPage(models::PagestreamGetPageResponse { page })
|
||||
})
|
||||
.map_err(PageStreamError::Read)
|
||||
}))
|
||||
}
|
||||
|
||||
@@ -1499,3 +1674,10 @@ fn set_tracing_field_shard_id(timeline: &Timeline) {
|
||||
);
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
}
|
||||
|
||||
struct WaitedForLsn(Lsn);
|
||||
impl From<WaitedForLsn> for Lsn {
|
||||
fn from(WaitedForLsn(lsn): WaitedForLsn) -> Self {
|
||||
lsn
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,12 +9,17 @@
|
||||
use super::tenant::{PageReconstructError, Timeline};
|
||||
use crate::context::RequestContext;
|
||||
use crate::keyspace::{KeySpace, KeySpaceAccum};
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
|
||||
use crate::span::{
|
||||
debug_assert_current_span_has_tenant_and_timeline_id,
|
||||
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
|
||||
};
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use crate::{aux_file, repository::*};
|
||||
use anyhow::{ensure, Context};
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use enum_map::Enum;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::{
|
||||
dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
|
||||
relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
|
||||
@@ -28,7 +33,7 @@ use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use postgres_ffi::{Oid, RepOriginId, TimestampTz, TransactionId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::{hash_map, HashMap, HashSet};
|
||||
use std::collections::{hash_map, BTreeMap, HashMap, HashSet};
|
||||
use std::ops::ControlFlow;
|
||||
use std::ops::Range;
|
||||
use strum::IntoEnumIterator;
|
||||
@@ -191,26 +196,184 @@ impl Timeline {
|
||||
version: Version<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
if tag.relnode == 0 {
|
||||
return Err(PageReconstructError::Other(
|
||||
RelationError::InvalidRelnode.into(),
|
||||
));
|
||||
}
|
||||
let pages = smallvec::smallvec![(tag, blknum)];
|
||||
let res = self.get_rel_page_at_lsn_batched(pages, version, ctx).await;
|
||||
assert_eq!(res.len(), 1);
|
||||
res.into_iter().next().unwrap()
|
||||
}
|
||||
|
||||
let nblocks = self.get_rel_size(tag, version, ctx).await?;
|
||||
if blknum >= nblocks {
|
||||
debug!(
|
||||
"read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
|
||||
tag,
|
||||
blknum,
|
||||
version.get_lsn(),
|
||||
nblocks
|
||||
);
|
||||
return Ok(ZERO_PAGE.clone());
|
||||
/// Like [`Self::get_rel_page_at_lsn`], but returns a batch of pages.
|
||||
pub(crate) async fn get_rel_page_at_lsn_batched(
|
||||
&self,
|
||||
pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>,
|
||||
version: Version<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> smallvec::SmallVec<[Result<Bytes, PageReconstructError>; 1]> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
let request_lsn = match version {
|
||||
Version::Lsn(lsn) => lsn,
|
||||
Version::Modified(_) => panic!("unsupported"),
|
||||
};
|
||||
enum KeyState {
|
||||
NeedsVectoredGet,
|
||||
Done(Result<Bytes, PageReconstructError>),
|
||||
}
|
||||
let mut key_states = BTreeMap::new();
|
||||
let mut vectored_gets: smallvec::SmallVec<[_; 1]> =
|
||||
smallvec::SmallVec::with_capacity(pages.len());
|
||||
for (response_order, (tag, blknum)) in pages.into_iter().enumerate() {
|
||||
let key = rel_block_to_key(tag, blknum);
|
||||
use std::collections::btree_map::Entry;
|
||||
let key_state_slot = match key_states.entry((key, response_order)) {
|
||||
Entry::Occupied(_entry) => unreachable!(
|
||||
"enumerate makes keys unique, even if batch contains same key twice"
|
||||
),
|
||||
Entry::Vacant(entry) => entry,
|
||||
};
|
||||
|
||||
let key = rel_block_to_key(tag, blknum);
|
||||
version.get(self, key, ctx).await
|
||||
if tag.relnode == 0 {
|
||||
key_state_slot.insert(KeyState::Done(Err(PageReconstructError::Other(
|
||||
RelationError::InvalidRelnode.into(),
|
||||
))));
|
||||
continue;
|
||||
}
|
||||
|
||||
let nblocks = match self.get_rel_size(tag, version, ctx).await {
|
||||
Ok(nblocks) => nblocks,
|
||||
Err(err) => {
|
||||
key_state_slot.insert(KeyState::Done(Err(err)));
|
||||
continue;
|
||||
}
|
||||
};
|
||||
if blknum >= nblocks {
|
||||
debug!(
|
||||
"read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
|
||||
tag,
|
||||
blknum,
|
||||
version.get_lsn(),
|
||||
nblocks
|
||||
);
|
||||
key_state_slot.insert(KeyState::Done(Ok(ZERO_PAGE.clone())));
|
||||
continue;
|
||||
}
|
||||
|
||||
vectored_gets.push(key);
|
||||
key_state_slot.insert(KeyState::NeedsVectoredGet);
|
||||
}
|
||||
// turn vectored_gets into a keyspace
|
||||
let keyspace = {
|
||||
// add_key requires monotonicity
|
||||
vectored_gets.sort_unstable();
|
||||
let mut acc = KeySpaceAccum::new();
|
||||
for key in vectored_gets
|
||||
.into_iter()
|
||||
// in fact it requires strict monotonicity
|
||||
.dedup()
|
||||
{
|
||||
acc.add_key(key);
|
||||
}
|
||||
acc.to_keyspace()
|
||||
};
|
||||
|
||||
match self.get_vectored(keyspace, request_lsn, ctx).await {
|
||||
Ok(results) => {
|
||||
for (key, res) in results {
|
||||
if let Err(err) = &res {
|
||||
warn!(%key, ?err, "a key inside get_vectored failed with a per-key error");
|
||||
}
|
||||
let mut interests = key_states.range_mut((key, 0)..(key.next(), 0)).peekable();
|
||||
let first_interest = interests.next().unwrap();
|
||||
let next_interest = interests.peek().is_some();
|
||||
if !next_interest {
|
||||
match first_interest.1 {
|
||||
KeyState::NeedsVectoredGet => {
|
||||
*first_interest.1 = KeyState::Done(res);
|
||||
}
|
||||
KeyState::Done(_) => unreachable!(),
|
||||
}
|
||||
continue;
|
||||
} else {
|
||||
for ((_, _), state) in [first_interest].into_iter().chain(interests) {
|
||||
match state {
|
||||
KeyState::NeedsVectoredGet => {
|
||||
*state = KeyState::Done(match &res {
|
||||
Ok(buf) => Ok(buf.clone()),
|
||||
// this `match` is working around the fact that we cannot Clone the PageReconstructError
|
||||
Err(err) => Err(match err {
|
||||
PageReconstructError::Cancelled => {
|
||||
PageReconstructError::Cancelled
|
||||
}
|
||||
|
||||
x @ PageReconstructError::Other(_) |
|
||||
x @ PageReconstructError::AncestorLsnTimeout(_) |
|
||||
x @ PageReconstructError::WalRedo(_) |
|
||||
x @ PageReconstructError::MissingKey(_) => {
|
||||
PageReconstructError::Other(anyhow::anyhow!("there was more than one request for this key in the batch, error logged once: {x:?}"))
|
||||
},
|
||||
}),
|
||||
});
|
||||
}
|
||||
KeyState::Done(_) => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
warn!(?err, "get_vectored failed with a global error, mapping that error to per-key failure");
|
||||
// this cannot really happen because get_vectored only errors globally on invalid LSN or too large batch size
|
||||
for ((_, _), state) in key_states.iter_mut() {
|
||||
// this whole `match` is a lot like `From<GetVectoredError> for PageReconstructError`
|
||||
// but without taking ownership of the GetVectoredError
|
||||
match &err {
|
||||
GetVectoredError::Cancelled => {
|
||||
*state = KeyState::Done(Err(PageReconstructError::Cancelled));
|
||||
}
|
||||
// TODO: restructure get_vectored API to make this error per-key
|
||||
GetVectoredError::MissingKey(err) => {
|
||||
*state = KeyState::Done(Err(PageReconstructError::Other(anyhow::anyhow!("whole vectored get request failed because one or more of the requested keys were missing: {err:?}"))));
|
||||
}
|
||||
// TODO: restructure get_vectored API to make this error per-key
|
||||
GetVectoredError::GetReadyAncestorError(err) => {
|
||||
*state = KeyState::Done(Err(PageReconstructError::Other(anyhow::anyhow!("whole vectored get request failed because one or more key required ancestor that wasn't ready: {err:?}"))));
|
||||
}
|
||||
// TODO: restructure get_vectored API to make this error per-key
|
||||
GetVectoredError::Other(err) => {
|
||||
*state = KeyState::Done(Err(PageReconstructError::Other(
|
||||
anyhow::anyhow!("whole vectored get request failed: {err:?}"),
|
||||
)));
|
||||
}
|
||||
// TODO: we can prevent this error class by moving this check into the type system
|
||||
GetVectoredError::InvalidLsn(e) => {
|
||||
*state =
|
||||
KeyState::Done(Err(anyhow::anyhow!("invalid LSN: {e:?}").into()));
|
||||
}
|
||||
// NB: this should never happen in practice because we limit MAX_GET_VECTORED_KEYS
|
||||
// TODO: we can prevent this error class by moving this check into the type system
|
||||
GetVectoredError::Oversized(err) => {
|
||||
*state = KeyState::Done(Err(anyhow::anyhow!(
|
||||
"batching oversized: {err:?}"
|
||||
)
|
||||
.into()));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// get the results into the order in which they were requested
|
||||
let mut return_order: smallvec::SmallVec<[_; Timeline::MAX_GET_VECTORED_KEYS as usize]> =
|
||||
smallvec::SmallVec::with_capacity(key_states.len());
|
||||
return_order.extend(key_states.keys().map(|(key, idx)| (*key, *idx)));
|
||||
return_order.sort_unstable_by_key(|(_, idx)| *idx);
|
||||
let mut res = smallvec::SmallVec::with_capacity(key_states.len());
|
||||
res.extend(return_order.into_iter().map(|key_states_key| {
|
||||
match key_states.remove(&key_states_key).unwrap() {
|
||||
KeyState::Done(res) => res,
|
||||
KeyState::NeedsVectoredGet => unreachable!(),
|
||||
}
|
||||
}));
|
||||
res
|
||||
}
|
||||
|
||||
// Get size of a database in blocks
|
||||
@@ -1021,9 +1184,10 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// DatadirModification represents an operation to ingest an atomic set of
|
||||
/// updates to the repository. It is created by the 'begin_record'
|
||||
/// function. It is called for each WAL record, so that all the modifications
|
||||
/// by a one WAL record appear atomic.
|
||||
/// updates to the repository.
|
||||
///
|
||||
/// It is created by the 'begin_record' function. It is called for each WAL
|
||||
/// record, so that all the modifications by one WAL record appear atomic.
|
||||
pub struct DatadirModification<'a> {
|
||||
/// The timeline this modification applies to. You can access this to
|
||||
/// read the state, but note that any pending updates are *not* reflected
|
||||
@@ -2048,6 +2212,7 @@ impl<'a> DatadirModification<'a> {
|
||||
|
||||
/// This struct facilitates accessing either a committed key from the timeline at a
|
||||
/// specific LSN, or the latest uncommitted key from a pending modification.
|
||||
///
|
||||
/// During WAL ingestion, the records from multiple LSNs may be batched in the same
|
||||
/// modification before being flushed to the timeline. Hence, the routines in WalIngest
|
||||
/// need to look up the keys in the modification first before looking them up in the
|
||||
|
||||
@@ -73,6 +73,21 @@ impl ValueBytes {
|
||||
|
||||
Ok(raw[8] == 1)
|
||||
}
|
||||
|
||||
pub(crate) fn is_image(raw: &[u8]) -> Result<bool, InvalidInput> {
|
||||
if raw.len() < 12 {
|
||||
return Err(InvalidInput::TooShortValue);
|
||||
}
|
||||
|
||||
let value_discriminator = &raw[0..4];
|
||||
|
||||
if value_discriminator == [0, 0, 0, 0] {
|
||||
// A zero discriminator marks the value as Value::Image
|
||||
return Ok(true);
|
||||
}
|
||||
|
||||
Ok(false)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
//! Timeline repository implementation that keeps old data in layer files, and
|
||||
//! the recent changes in ephemeral files.
|
||||
//!
|
||||
//! Timeline repository implementation that keeps old data in files on disk, and
|
||||
//! the recent changes in memory. See tenant/*_layer.rs files.
|
||||
//! The functions here are responsible for locating the correct layer for the
|
||||
//! get/put call, walking back the timeline branching history as needed.
|
||||
//! See tenant/*_layer.rs files. The functions here are responsible for locating
|
||||
//! the correct layer for the get/put call, walking back the timeline branching
|
||||
//! history as needed.
|
||||
//!
|
||||
//! The files are stored in the .neon/tenants/<tenant_id>/timelines/<timeline_id>
|
||||
//! directory. See docs/pageserver-storage.md for how the files are managed.
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
//! Describes the legacy now hopefully no longer modified per-timeline metadata stored in
|
||||
//! `index_part.json` managed by [`remote_timeline_client`]. For many tenants and their timelines,
|
||||
//! this struct and it's original serialization format is still needed because they were written a
|
||||
//! long time ago.
|
||||
//! Describes the legacy now hopefully no longer modified per-timeline metadata.
|
||||
//!
|
||||
//! It is stored in `index_part.json` managed by [`remote_timeline_client`]. For many tenants and
|
||||
//! their timelines, this struct and its original serialization format are still needed because
|
||||
//! they were written a long time ago.
|
||||
//!
|
||||
//! Instead of changing and adding versioning to this, just change [`IndexPart`] with soft json
|
||||
//! versioning.
|
||||
|
||||
@@ -282,9 +282,10 @@ impl BackgroundPurges {
|
||||
static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
|
||||
Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing));
|
||||
|
||||
/// The TenantManager is responsible for storing and mutating the collection of all tenants
|
||||
/// that this pageserver process has state for. Every Tenant and SecondaryTenant instance
|
||||
/// lives inside the TenantManager.
|
||||
/// Responsible for storing and mutating the collection of all tenants
|
||||
/// that this pageserver has state for.
|
||||
///
|
||||
/// Every Tenant and SecondaryTenant instance lives inside the TenantManager.
|
||||
///
|
||||
/// The most important role of the TenantManager is to prevent conflicts: e.g. trying to attach
|
||||
/// the same tenant twice concurrently, or trying to configure the same tenant into secondary
|
||||
@@ -2346,8 +2347,9 @@ pub enum TenantMapError {
|
||||
ShuttingDown,
|
||||
}
|
||||
|
||||
/// Guards a particular tenant_id's content in the TenantsMap. While this
|
||||
/// structure exists, the TenantsMap will contain a [`TenantSlot::InProgress`]
|
||||
/// Guards a particular tenant_id's content in the TenantsMap.
|
||||
///
|
||||
/// While this structure exists, the TenantsMap will contain a [`TenantSlot::InProgress`]
|
||||
/// for this tenant, which acts as a marker for any operations targeting
|
||||
/// this tenant to retry later, or wait for the InProgress state to end.
|
||||
///
|
||||
|
||||
@@ -2184,6 +2184,8 @@ pub fn remote_timeline_path(
|
||||
remote_timelines_path(tenant_shard_id).join(Utf8Path::new(&timeline_id.to_string()))
|
||||
}
|
||||
|
||||
/// Obtains the path of the given Layer in the remote
|
||||
///
|
||||
/// Note that the shard component of a remote layer path is _not_ always the same
|
||||
/// as in the TenantShardId of the caller: tenants may reference layers from a different
|
||||
/// ShardIndex. Use the ShardIndex from the layer's metadata.
|
||||
|
||||
@@ -548,7 +548,7 @@ pub(crate) async fn download_initdb_tar_zst(
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
.inspect_err(|_e| {
|
||||
// Do a best-effort attempt at deleting the temporary file upon encountering an error.
|
||||
// We don't have async here nor do we want to pile on any extra errors.
|
||||
if let Err(e) = std::fs::remove_file(&temp_path) {
|
||||
@@ -556,7 +556,6 @@ pub(crate) async fn download_initdb_tar_zst(
|
||||
warn!("error deleting temporary file {temp_path}: {e}");
|
||||
}
|
||||
}
|
||||
e
|
||||
})?;
|
||||
|
||||
Ok((temp_path, file))
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
//! In-memory index to track the tenant files on the remote storage.
|
||||
//!
|
||||
//! Able to restore itself from the storage index parts, that are located in every timeline's remote directory and contain all data about
|
||||
//! remote timeline layers and its metadata.
|
||||
|
||||
|
||||
@@ -8,15 +8,18 @@ mod layer_desc;
|
||||
mod layer_name;
|
||||
pub mod merge_iterator;
|
||||
|
||||
use tokio::sync::{self};
|
||||
use tracing::{debug, Instrument};
|
||||
use utils::bin_ser::BeSer;
|
||||
pub mod split_writer;
|
||||
|
||||
use crate::context::{AccessStatsBehavior, RequestContext};
|
||||
use crate::repository::Value;
|
||||
use crate::repository::{Value, ValueBytes};
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::key::{Key, DBDIR_KEY};
|
||||
use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
|
||||
use std::cmp::{Ordering, Reverse};
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::{BinaryHeap, HashMap};
|
||||
use std::ops::Range;
|
||||
@@ -79,30 +82,57 @@ pub(crate) enum ValueReconstructSituation {
|
||||
}
|
||||
|
||||
/// Reconstruct data accumulated for a single key during a vectored get
|
||||
#[derive(Debug, Default, Clone)]
|
||||
#[derive(Debug, Default)]
|
||||
pub(crate) struct VectoredValueReconstructState {
|
||||
pub(crate) records: Vec<(Lsn, NeonWalRecord)>,
|
||||
pub(crate) img: Option<(Lsn, Bytes)>,
|
||||
pub(crate) records: Vec<(
|
||||
Lsn,
|
||||
tokio::sync::oneshot::Receiver<Result<Bytes, std::io::Error>>,
|
||||
)>,
|
||||
pub(crate) will_init_lsn: Option<Lsn>,
|
||||
|
||||
situation: ValueReconstructSituation,
|
||||
pub(crate) situation: ValueReconstructSituation,
|
||||
}
|
||||
|
||||
impl VectoredValueReconstructState {
|
||||
fn get_cached_lsn(&self) -> Option<Lsn> {
|
||||
self.img.as_ref().map(|img| img.0)
|
||||
self.will_init_lsn
|
||||
}
|
||||
}
|
||||
|
||||
impl From<VectoredValueReconstructState> for ValueReconstructState {
|
||||
fn from(mut state: VectoredValueReconstructState) -> Self {
|
||||
// walredo expects the records to be descending in terms of Lsn
|
||||
state.records.sort_by_key(|(lsn, _)| Reverse(*lsn));
|
||||
pub(crate) async fn convert(
|
||||
_key: Key,
|
||||
from: VectoredValueReconstructState,
|
||||
) -> Result<ValueReconstructState, PageReconstructError> {
|
||||
let mut to = ValueReconstructState::default();
|
||||
|
||||
ValueReconstructState {
|
||||
records: state.records,
|
||||
img: state.img,
|
||||
for (lsn, fut) in from.records {
|
||||
match fut.await {
|
||||
Ok(res) => match res {
|
||||
Ok(bytes) => {
|
||||
let value = Value::des(&bytes)
|
||||
.map_err(|err| PageReconstructError::Other(err.into()))?;
|
||||
|
||||
match value {
|
||||
Value::WalRecord(rec) => {
|
||||
to.records.push((lsn, rec));
|
||||
}
|
||||
Value::Image(img) => {
|
||||
assert!(to.img.is_none());
|
||||
to.img = Some((lsn, img));
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
return Err(PageReconstructError::Other(err.into()));
|
||||
}
|
||||
},
|
||||
Err(err) => {
|
||||
return Err(PageReconstructError::Other(err.into()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(to)
|
||||
}
|
||||
|
||||
/// Bag of data accumulated during a vectored get.
|
||||
@@ -119,6 +149,47 @@ pub(crate) struct ValuesReconstructState {
|
||||
// Statistics that are still accessible as a caller of `get_vectored_impl`.
|
||||
layers_visited: u32,
|
||||
delta_layers_visited: u32,
|
||||
|
||||
io_concurrency: IoConcurrency,
|
||||
}
|
||||
|
||||
/// How IO futures handed to `spawn_io` are scheduled relative to each other.
enum IoConcurrency {
|
||||
/// Run spawned IOs one after another: each newly spawned task first waits
/// for the previously spawned task before running its own future.
Serial {
|
||||
/// Id and join handle of the most recently spawned IO task; `None` until
/// the first IO has been spawned.
prev_io: Option<(usize, tokio::task::JoinHandle<()>)>,
|
||||
},
|
||||
/// Spawn every IO immediately, without waiting for earlier ones.
Parallel,
|
||||
}
|
||||
|
||||
impl IoConcurrency {
|
||||
pub(crate) fn spawn_io<F>(&mut self, fut: F)
|
||||
where
|
||||
F: std::future::Future<Output = ()> + Send + 'static,
|
||||
{
|
||||
static IO_COUNTER: std::sync::atomic::AtomicUsize = std::sync::atomic::AtomicUsize::new(0);
|
||||
let io_id = IO_COUNTER.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
|
||||
let span = tracing::debug_span!("spawned_io", io_id,);
|
||||
match self {
|
||||
IoConcurrency::Serial { prev_io } => {
|
||||
let prev = prev_io.take();
|
||||
*prev_io = Some((
|
||||
io_id,
|
||||
tokio::spawn(
|
||||
async move {
|
||||
if let Some((prev_id, prev_task)) = prev {
|
||||
debug!(prev_io = prev_id, "Waiting for previous IO to complete");
|
||||
prev_task.await.unwrap();
|
||||
}
|
||||
fut.await;
|
||||
}
|
||||
.instrument(span),
|
||||
),
|
||||
));
|
||||
}
|
||||
IoConcurrency::Parallel => {
|
||||
tokio::spawn(fut);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ValuesReconstructState {
|
||||
@@ -129,9 +200,30 @@ impl ValuesReconstructState {
|
||||
keys_with_image_coverage: None,
|
||||
layers_visited: 0,
|
||||
delta_layers_visited: 0,
|
||||
io_concurrency: {
|
||||
static IO_CONCURRENCY: once_cell::sync::Lazy<String> =
|
||||
once_cell::sync::Lazy::new(|| {
|
||||
std::env::var("NEON_PAGESERVER_VALUE_RECONSTRUCT_IO_CONCURRENCY").unwrap()
|
||||
});
|
||||
match IO_CONCURRENCY.as_str() {
|
||||
"parallel" => IoConcurrency::Parallel,
|
||||
"serial" => IoConcurrency::Serial { prev_io: None },
|
||||
x => panic!(
|
||||
"Invalid value for NEON_PAGESERVER_VALUE_RECONSTRUCT_IO_CONCURRENCY: {}",
|
||||
x
|
||||
),
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn spawn_io<F>(&mut self, fut: F)
|
||||
where
|
||||
F: std::future::Future<Output = ()> + Send + 'static,
|
||||
{
|
||||
self.io_concurrency.spawn_io(fut);
|
||||
}
|
||||
|
||||
/// Associate a key with the error which it encountered and mark it as done
|
||||
pub(crate) fn on_key_error(&mut self, key: Key, err: PageReconstructError) {
|
||||
let previous = self.keys.insert(key, Err(err));
|
||||
@@ -200,7 +292,8 @@ impl ValuesReconstructState {
|
||||
&mut self,
|
||||
key: &Key,
|
||||
lsn: Lsn,
|
||||
value: Value,
|
||||
completes: bool,
|
||||
value: sync::oneshot::Receiver<Result<Bytes, std::io::Error>>,
|
||||
) -> ValueReconstructSituation {
|
||||
let state = self
|
||||
.keys
|
||||
@@ -208,31 +301,16 @@ impl ValuesReconstructState {
|
||||
.or_insert(Ok(VectoredValueReconstructState::default()));
|
||||
|
||||
if let Ok(state) = state {
|
||||
let key_done = match state.situation {
|
||||
match state.situation {
|
||||
ValueReconstructSituation::Complete => unreachable!(),
|
||||
ValueReconstructSituation::Continue => match value {
|
||||
Value::Image(img) => {
|
||||
state.img = Some((lsn, img));
|
||||
true
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
debug_assert!(
|
||||
Some(lsn) > state.get_cached_lsn(),
|
||||
"Attempt to collect a record below cached LSN for walredo: {} < {}",
|
||||
lsn,
|
||||
state
|
||||
.get_cached_lsn()
|
||||
.expect("Assertion can only fire if a cached lsn is present")
|
||||
);
|
||||
ValueReconstructSituation::Continue => {
|
||||
state.records.push((lsn, value));
|
||||
}
|
||||
}
|
||||
|
||||
let will_init = rec.will_init();
|
||||
state.records.push((lsn, rec));
|
||||
will_init
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
if key_done && state.situation == ValueReconstructSituation::Continue {
|
||||
if completes && state.situation == ValueReconstructSituation::Continue {
|
||||
assert_eq!(state.will_init_lsn, None);
|
||||
state.will_init_lsn = Some(lsn);
|
||||
state.situation = ValueReconstructSituation::Complete;
|
||||
self.keys_done.add_key(*key);
|
||||
}
|
||||
@@ -279,7 +357,7 @@ pub(crate) enum LayerId {
|
||||
/// Layer wrapper for the read path. Note that it is valid
|
||||
/// to use these layers even after external operations have
|
||||
/// been performed on them (compaction, freeze, etc.).
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, Clone)]
|
||||
pub(crate) enum ReadableLayer {
|
||||
PersistentLayer(Layer),
|
||||
InMemoryLayer(Arc<InMemoryLayer>),
|
||||
@@ -292,6 +370,8 @@ struct ReadDesc {
|
||||
layer_id: LayerId,
|
||||
/// Lsn range for the read, used for selecting the next read
|
||||
lsn_range: Range<Lsn>,
|
||||
/// This read's key in [`LayerKeyspace::reads`].
|
||||
read_id: LayerKeyspaceReadId,
|
||||
}
|
||||
|
||||
/// Data structure which maintains a fringe of layers for the
|
||||
@@ -310,9 +390,13 @@ pub(crate) struct LayerFringe {
|
||||
/// Per-layer entry kept by the fringe: the layer itself plus the reads
/// that have been planned against it and not yet taken.
#[derive(Debug)]
|
||||
struct LayerKeyspace {
|
||||
/// Layer the planned reads target; cloned out whenever one of its reads is taken.
layer: ReadableLayer,
|
||||
/// Keyspace accumulated for this layer, seeded with the first keyspace when
/// the entry is created.
/// NOTE(review): the per-read keyspaces are also stored in `reads`; confirm
/// whether this accumulator is still consumed anywhere.
target_keyspace: KeySpaceRandomAccum,
|
||||
/// Id that will be assigned to the next read planned on this layer; it is
/// advanced by one each time a read is added to an existing entry.
next_read_id: LayerKeyspaceReadId,
|
||||
/// Planned reads that have not been taken yet, keyed by read id, each with
/// its LSN range and keyspace. The fringe removes the whole entry once this
/// map becomes empty.
reads: HashMap<LayerKeyspaceReadId, (Range<Lsn>, KeySpace)>,
|
||||
}
|
||||
|
||||
/// Identifies one planned read within a single [`LayerKeyspace`].
///
/// Ids are handed out sequentially per layer entry, starting at 0.
#[derive(PartialEq, Eq, Hash, Debug, Clone, Copy)]
|
||||
struct LayerKeyspaceReadId(usize);
|
||||
|
||||
impl LayerFringe {
|
||||
pub(crate) fn new() -> Self {
|
||||
LayerFringe {
|
||||
@@ -327,22 +411,24 @@ impl LayerFringe {
|
||||
None => return None,
|
||||
};
|
||||
|
||||
let removed = self.layers.remove_entry(&read_desc.layer_id);
|
||||
let mut entry = match self.layers.entry(read_desc.layer_id) {
|
||||
Entry::Occupied(o) => o,
|
||||
Entry::Vacant(_) => unreachable!("fringe internals are always consistent"),
|
||||
};
|
||||
|
||||
match removed {
|
||||
Some((
|
||||
_,
|
||||
LayerKeyspace {
|
||||
layer,
|
||||
mut target_keyspace,
|
||||
},
|
||||
)) => Some((
|
||||
layer,
|
||||
target_keyspace.consume_keyspace(),
|
||||
read_desc.lsn_range,
|
||||
)),
|
||||
None => unreachable!("fringe internals are always consistent"),
|
||||
let (lsn_range, keyspace) = entry
|
||||
.get_mut()
|
||||
.reads
|
||||
.remove(&read_desc.read_id)
|
||||
.expect("fringe internals are always consistent");
|
||||
|
||||
let layer = entry.get().layer.clone();
|
||||
|
||||
if entry.get().reads.is_empty() {
|
||||
entry.remove();
|
||||
}
|
||||
|
||||
Some((layer, keyspace, lsn_range))
|
||||
}
|
||||
|
||||
pub(crate) fn update(
|
||||
@@ -355,18 +441,31 @@ impl LayerFringe {
|
||||
let entry = self.layers.entry(layer_id.clone());
|
||||
match entry {
|
||||
Entry::Occupied(mut entry) => {
|
||||
entry.get_mut().target_keyspace.add_keyspace(keyspace);
|
||||
let read_id = {
|
||||
let r = &mut entry.get_mut().next_read_id;
|
||||
let read_id = *r;
|
||||
*r = LayerKeyspaceReadId(r.0 + 1);
|
||||
read_id
|
||||
};
|
||||
self.planned_reads_by_lsn.push(ReadDesc {
|
||||
lsn_range: lsn_range.clone(),
|
||||
layer_id: layer_id.clone(),
|
||||
read_id,
|
||||
});
|
||||
let replaced = entry.get_mut().reads.insert(read_id, (lsn_range, keyspace));
|
||||
assert!(replaced.is_none());
|
||||
}
|
||||
Entry::Vacant(entry) => {
|
||||
let read_id = LayerKeyspaceReadId(0);
|
||||
self.planned_reads_by_lsn.push(ReadDesc {
|
||||
lsn_range,
|
||||
lsn_range: lsn_range.clone(),
|
||||
layer_id: layer_id.clone(),
|
||||
read_id,
|
||||
});
|
||||
let mut accum = KeySpaceRandomAccum::new();
|
||||
accum.add_keyspace(keyspace);
|
||||
entry.insert(LayerKeyspace {
|
||||
layer,
|
||||
target_keyspace: accum,
|
||||
next_read_id: LayerKeyspaceReadId(1),
|
||||
reads: [(read_id, (lsn_range, keyspace))].into(),
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -434,10 +533,11 @@ impl ReadableLayer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Layers contain a hint indicating whether they are likely to be used for reads. This is a hint rather
|
||||
/// than an authoritative value, so that we do not have to update it synchronously when changing the visibility
|
||||
/// of layers (for example when creating a branch that makes some previously covered layers visible). It should
|
||||
/// be used for cache management but not for correctness-critical checks.
|
||||
/// Layers contain a hint indicating whether they are likely to be used for reads.
|
||||
///
|
||||
/// This is a hint rather than an authoritative value, so that we do not have to update it synchronously
|
||||
/// when changing the visibility of layers (for example when creating a branch that makes some previously
|
||||
/// covered layers visible). It should be used for cache management but not for correctness-critical checks.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum LayerVisibilityHint {
|
||||
/// A Visible layer might be read while serving a read, because there is not an image layer between it
|
||||
|
||||
@@ -42,13 +42,12 @@ use crate::tenant::vectored_blob_io::{
|
||||
BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
|
||||
VectoredReadCoalesceMode, VectoredReadPlanner,
|
||||
};
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
use crate::{walrecord, TEMP_FILE_SUFFIX};
|
||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use bytes::BytesMut;
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use futures::StreamExt;
|
||||
use itertools::Itertools;
|
||||
@@ -58,14 +57,14 @@ use pageserver_api::models::ImageCompressionAlgorithm;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::VecDeque;
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::fs::File;
|
||||
use std::io::SeekFrom;
|
||||
use std::ops::Range;
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::OnceCell;
|
||||
use tokio::sync::{self, OnceCell};
|
||||
use tokio_epoll_uring::IoBuf;
|
||||
use tracing::*;
|
||||
|
||||
@@ -136,10 +135,11 @@ impl Summary {
|
||||
// Flag indicating that this version initializes the page
|
||||
const WILL_INIT: u64 = 1;
|
||||
|
||||
/// Struct representing reference to BLOB in layers. Reference contains BLOB
|
||||
/// offset, and for WAL records it also contains `will_init` flag. The flag
|
||||
/// helps to determine the range of records that needs to be applied, without
|
||||
/// reading/deserializing records themselves.
|
||||
/// Struct representing reference to BLOB in layers.
|
||||
///
|
||||
/// Reference contains BLOB offset, and for WAL records it also contains
|
||||
/// `will_init` flag. The flag helps to determine the range of records
|
||||
/// that needs to be applied, without reading/deserializing records themselves.
|
||||
#[derive(Debug, Serialize, Deserialize, Copy, Clone)]
|
||||
pub struct BlobRef(pub u64);
|
||||
|
||||
@@ -223,7 +223,7 @@ pub struct DeltaLayerInner {
|
||||
index_start_blk: u32,
|
||||
index_root_blk: u32,
|
||||
|
||||
file: VirtualFile,
|
||||
file: Arc<VirtualFile>,
|
||||
file_id: FileId,
|
||||
|
||||
layer_key_range: Range<Key>,
|
||||
@@ -787,9 +787,11 @@ impl DeltaLayerInner {
|
||||
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Self> {
|
||||
let file = VirtualFile::open(path, ctx)
|
||||
.await
|
||||
.context("open layer file")?;
|
||||
let file = Arc::new(
|
||||
VirtualFile::open(path, ctx)
|
||||
.await
|
||||
.context("open layer file")?,
|
||||
);
|
||||
|
||||
let file_id = page_cache::next_file_id();
|
||||
|
||||
@@ -840,6 +842,7 @@ impl DeltaLayerInner {
|
||||
// can be further optimised to visit the index only once.
|
||||
pub(super) async fn get_values_reconstruct_data(
|
||||
&self,
|
||||
self_desc: PersistentLayerDesc,
|
||||
keyspace: KeySpace,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
@@ -862,6 +865,7 @@ impl DeltaLayerInner {
|
||||
let data_end_offset = self.index_start_offset();
|
||||
|
||||
let reads = Self::plan_reads(
|
||||
self_desc,
|
||||
&keyspace,
|
||||
lsn_range.clone(),
|
||||
data_end_offset,
|
||||
@@ -882,6 +886,7 @@ impl DeltaLayerInner {
|
||||
}
|
||||
|
||||
async fn plan_reads<Reader>(
|
||||
self_desc: PersistentLayerDesc,
|
||||
keyspace: &KeySpace,
|
||||
lsn_range: Range<Lsn>,
|
||||
data_end_offset: u64,
|
||||
@@ -910,6 +915,8 @@ impl DeltaLayerInner {
|
||||
let lsn = DeltaKey::extract_lsn_from_buf(&raw_key);
|
||||
let blob_ref = BlobRef(value);
|
||||
|
||||
debug!(file = %self_desc.layer_name(), %key, %lsn, will_init = blob_ref.will_init(), "delta layer found key");
|
||||
|
||||
// Lsns are not monotonically increasing across keys, so we don't assert on them.
|
||||
assert!(key >= range.start);
|
||||
|
||||
@@ -932,7 +939,7 @@ impl DeltaLayerInner {
|
||||
range_end_handled = true;
|
||||
break;
|
||||
} else {
|
||||
planner.handle(key, lsn, blob_ref.pos(), flag);
|
||||
planner.handle(key, lsn, blob_ref.pos(), flag, blob_ref.will_init());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -979,77 +986,59 @@ impl DeltaLayerInner {
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) {
|
||||
let vectored_blob_reader = VectoredBlobReader::new(&self.file);
|
||||
let mut ignore_key_with_err = None;
|
||||
|
||||
let max_vectored_read_bytes = self
|
||||
.max_vectored_read_bytes
|
||||
.expect("Layer is loaded with max vectored bytes config")
|
||||
.0
|
||||
.into();
|
||||
let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes);
|
||||
let mut buf = Some(BytesMut::with_capacity(buf_size));
|
||||
|
||||
// Note that reads are processed in reverse order (from highest key+lsn).
|
||||
// This is the order that `ReconstructState` requires such that it can
|
||||
// track when a key is done.
|
||||
for read in reads.into_iter().rev() {
|
||||
let res = vectored_blob_reader
|
||||
.read_blobs(&read, buf.take().expect("Should have a buffer"), ctx)
|
||||
.await;
|
||||
|
||||
let blobs_buf = match res {
|
||||
Ok(blobs_buf) => blobs_buf,
|
||||
Err(err) => {
|
||||
let kind = err.kind();
|
||||
for (_, blob_meta) in read.blobs_at.as_slice() {
|
||||
reconstruct_state.on_key_error(
|
||||
blob_meta.key,
|
||||
PageReconstructError::Other(anyhow!(
|
||||
"Failed to read blobs from virtual file {}: {}",
|
||||
self.file.path,
|
||||
kind
|
||||
)),
|
||||
);
|
||||
}
|
||||
|
||||
// We have "lost" the buffer since the lower level IO api
|
||||
// doesn't return the buffer on error. Allocate a new one.
|
||||
buf = Some(BytesMut::with_capacity(buf_size));
|
||||
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
for meta in blobs_buf.blobs.iter().rev() {
|
||||
if Some(meta.meta.key) == ignore_key_with_err {
|
||||
continue;
|
||||
}
|
||||
|
||||
let value = Value::des(&blobs_buf.buf[meta.start..meta.end]);
|
||||
let value = match value {
|
||||
Ok(v) => v,
|
||||
Err(e) => {
|
||||
reconstruct_state.on_key_error(
|
||||
meta.meta.key,
|
||||
PageReconstructError::Other(anyhow!(e).context(format!(
|
||||
"Failed to deserialize blob from virtual file {}",
|
||||
self.file.path,
|
||||
))),
|
||||
);
|
||||
|
||||
ignore_key_with_err = Some(meta.meta.key);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
// Invariant: once a key reaches [`ValueReconstructSituation::Complete`]
|
||||
// state, no further updates shall be made to it. The call below will
|
||||
// panic if the invariant is violated.
|
||||
reconstruct_state.update_key(&meta.meta.key, meta.meta.lsn, value);
|
||||
let mut senders: HashMap<
|
||||
(Key, Lsn),
|
||||
sync::oneshot::Sender<Result<Bytes, std::io::Error>>,
|
||||
> = Default::default();
|
||||
for (_, blob_meta) in read.blobs_at.as_slice().iter().rev() {
|
||||
let (tx, rx) = sync::oneshot::channel();
|
||||
senders.insert((blob_meta.key, blob_meta.lsn), tx);
|
||||
reconstruct_state.update_key(
|
||||
&blob_meta.key,
|
||||
blob_meta.lsn,
|
||||
blob_meta.will_init,
|
||||
rx,
|
||||
);
|
||||
}
|
||||
|
||||
buf = Some(blobs_buf.buf);
|
||||
let read_from = self.file.clone();
|
||||
let read_ctx = ctx.attached_child();
|
||||
reconstruct_state.spawn_io(async move {
|
||||
let vectored_blob_reader = VectoredBlobReader::new(&read_from);
|
||||
let buf = BytesMut::with_capacity(buf_size);
|
||||
|
||||
let res = vectored_blob_reader.read_blobs(&read, buf, &read_ctx).await;
|
||||
match res {
|
||||
Ok(blobs_buf) => {
|
||||
for meta in blobs_buf.blobs.iter().rev() {
|
||||
let buf = &blobs_buf.buf[meta.start..meta.end];
|
||||
let sender = senders
|
||||
.remove(&(meta.meta.key, meta.meta.lsn))
|
||||
.expect("sender must exist");
|
||||
let _ = sender.send(Ok(Bytes::copy_from_slice(buf)));
|
||||
}
|
||||
|
||||
assert!(senders.is_empty());
|
||||
}
|
||||
Err(err) => {
|
||||
for (_, sender) in senders {
|
||||
let _ = sender
|
||||
.send(Err(std::io::Error::new(err.kind(), "vec read failed")));
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1189,7 +1178,14 @@ impl DeltaLayerInner {
|
||||
let actionable = if let Some((key, lsn, start_offset)) = prev.take() {
|
||||
let end_offset = offset;
|
||||
|
||||
Some((BlobMeta { key, lsn }, start_offset..end_offset))
|
||||
Some((
|
||||
BlobMeta {
|
||||
key,
|
||||
lsn,
|
||||
will_init: false,
|
||||
},
|
||||
start_offset..end_offset,
|
||||
))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
//! An ImageLayer represents an image or a snapshot of a key-range at
|
||||
//! one particular LSN. It contains an image of all key-value pairs
|
||||
//! in its key-range. Any key that falls into the image layer's range
|
||||
//! but does not exist in the layer, does not exist.
|
||||
//! one particular LSN.
|
||||
//!
|
||||
//! It contains an image of all key-value pairs in its key-range. Any key
|
||||
//! that falls into the image layer's range but does not exist in the layer,
|
||||
//! does not exist.
|
||||
//!
|
||||
//! An image layer is stored in a file on disk. The file is stored in
|
||||
//! timelines/<timeline_id> directory. Currently, there are no
|
||||
@@ -19,7 +21,7 @@
|
||||
//!
|
||||
//! Every image layer file consists of three parts: "summary",
|
||||
//! "index", and "values". The summary is a fixed size header at the
|
||||
//! beginning of the file, and it contains basic information about the
|
||||
//! beginning of the file, and it contains basic information about the
|
||||
//! layer, and offsets to the other parts. The "index" is a B-tree,
|
||||
//! mapping from Key to an offset in the "values" part. The
|
||||
//! actual page images are stored in the "values" part.
|
||||
@@ -36,11 +38,11 @@ use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::vectored_blob_io::{
|
||||
BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
|
||||
};
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
use crate::tenant::Timeline;
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use hex;
|
||||
@@ -50,13 +52,14 @@ use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::shard::{ShardIdentity, TenantShardId};
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::VecDeque;
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::fs::File;
|
||||
use std::io::SeekFrom;
|
||||
use std::ops::Range;
|
||||
use std::os::unix::prelude::FileExt;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::oneshot;
|
||||
use tokio::sync::OnceCell;
|
||||
use tokio_stream::StreamExt;
|
||||
use tracing::*;
|
||||
@@ -161,7 +164,7 @@ pub struct ImageLayerInner {
|
||||
key_range: Range<Key>,
|
||||
lsn: Lsn,
|
||||
|
||||
file: VirtualFile,
|
||||
file: Arc<VirtualFile>,
|
||||
file_id: FileId,
|
||||
|
||||
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
|
||||
@@ -388,9 +391,11 @@ impl ImageLayerInner {
|
||||
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Self> {
|
||||
let file = VirtualFile::open(path, ctx)
|
||||
.await
|
||||
.context("open layer file")?;
|
||||
let file = Arc::new(
|
||||
VirtualFile::open(path, ctx)
|
||||
.await
|
||||
.context("open layer file")?,
|
||||
);
|
||||
let file_id = page_cache::next_file_id();
|
||||
let block_reader = FileBlockReader::new(&file, file_id);
|
||||
let summary_blk = block_reader
|
||||
@@ -436,12 +441,13 @@ impl ImageLayerInner {
|
||||
// the reconstruct state with whatever is found.
|
||||
pub(super) async fn get_values_reconstruct_data(
|
||||
&self,
|
||||
self_desc: PersistentLayerDesc,
|
||||
keyspace: KeySpace,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
let reads = self
|
||||
.plan_reads(keyspace, None, ctx)
|
||||
.plan_reads(self_desc, keyspace, None, ctx)
|
||||
.await
|
||||
.map_err(GetVectoredError::Other)?;
|
||||
|
||||
@@ -460,6 +466,7 @@ impl ImageLayerInner {
|
||||
/// this shard.
|
||||
async fn plan_reads(
|
||||
&self,
|
||||
self_desc: PersistentLayerDesc,
|
||||
keyspace: KeySpace,
|
||||
shard_identity: Option<&ShardIdentity>,
|
||||
ctx: &RequestContext,
|
||||
@@ -503,12 +510,14 @@ impl ImageLayerInner {
|
||||
BlobFlag::None
|
||||
};
|
||||
|
||||
debug!(file = %self_desc.layer_name(), %key, %self.lsn, will_init=true, "image layer found key");
|
||||
|
||||
if key >= range.end {
|
||||
planner.handle_range_end(offset);
|
||||
range_end_handled = true;
|
||||
break;
|
||||
} else {
|
||||
planner.handle(key, self.lsn, offset, flag);
|
||||
planner.handle(key, self.lsn, offset, flag, true);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -532,6 +541,7 @@ impl ImageLayerInner {
|
||||
// Fragment the range into the regions owned by this ShardIdentity
|
||||
let plan = self
|
||||
.plan_reads(
|
||||
todo!(),
|
||||
KeySpace {
|
||||
// If asked for the total key space, plan_reads will give us all the keys in the layer
|
||||
ranges: vec![Key::MIN..Key::MAX],
|
||||
@@ -577,8 +587,16 @@ impl ImageLayerInner {
|
||||
.0
|
||||
.into();
|
||||
|
||||
let vectored_blob_reader = VectoredBlobReader::new(&self.file);
|
||||
for read in reads.into_iter() {
|
||||
let mut senders: HashMap<(Key, Lsn), oneshot::Sender<Result<Bytes, std::io::Error>>> =
|
||||
Default::default();
|
||||
for (_, blob_meta) in read.blobs_at.as_slice() {
|
||||
let (tx, rx) = oneshot::channel();
|
||||
senders.insert((blob_meta.key, blob_meta.lsn), tx);
|
||||
|
||||
reconstruct_state.update_key(&blob_meta.key, blob_meta.lsn, true, rx);
|
||||
}
|
||||
|
||||
let buf_size = read.size();
|
||||
|
||||
if buf_size > max_vectored_read_bytes {
|
||||
@@ -597,36 +615,36 @@ impl ImageLayerInner {
|
||||
);
|
||||
}
|
||||
|
||||
let buf = BytesMut::with_capacity(buf_size);
|
||||
let res = vectored_blob_reader.read_blobs(&read, buf, ctx).await;
|
||||
let read_from = self.file.clone();
|
||||
let read_ctx = ctx.attached_child();
|
||||
reconstruct_state.spawn_io(async move {
|
||||
let buf = BytesMut::with_capacity(buf_size);
|
||||
let vectored_blob_reader = VectoredBlobReader::new(&read_from);
|
||||
let res = vectored_blob_reader.read_blobs(&read, buf, &read_ctx).await;
|
||||
|
||||
match res {
|
||||
Ok(blobs_buf) => {
|
||||
let frozen_buf = blobs_buf.buf.freeze();
|
||||
match res {
|
||||
Ok(blobs_buf) => {
|
||||
for meta in blobs_buf.blobs.iter() {
|
||||
let buf = &blobs_buf.buf[meta.start..meta.end];
|
||||
let sender = senders
|
||||
.remove(&(meta.meta.key, meta.meta.lsn))
|
||||
.expect("sender must exist");
|
||||
// TODO: this is silly - sort it out
|
||||
let bytes = Value::ser(&Value::Image(Bytes::copy_from_slice(buf)))
|
||||
.expect("stupid but correct");
|
||||
let _ = sender.send(Ok(bytes.into()));
|
||||
}
|
||||
|
||||
for meta in blobs_buf.blobs.iter() {
|
||||
let img_buf = frozen_buf.slice(meta.start..meta.end);
|
||||
reconstruct_state.update_key(
|
||||
&meta.meta.key,
|
||||
self.lsn,
|
||||
Value::Image(img_buf),
|
||||
);
|
||||
assert!(senders.is_empty());
|
||||
}
|
||||
Err(err) => {
|
||||
for (_, sender) in senders {
|
||||
let _ = sender
|
||||
.send(Err(std::io::Error::new(err.kind(), "vec read failed")));
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
let kind = err.kind();
|
||||
for (_, blob_meta) in read.blobs_at.as_slice() {
|
||||
reconstruct_state.on_key_error(
|
||||
blob_meta.key,
|
||||
PageReconstructError::from(anyhow!(
|
||||
"Failed to read blobs from virtual file {}: {}",
|
||||
self.file.path,
|
||||
kind
|
||||
)),
|
||||
);
|
||||
}
|
||||
}
|
||||
};
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -10,10 +10,9 @@ use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::tenant::ephemeral_file::EphemeralFile;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
|
||||
use crate::{l0_flush, page_cache};
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use anyhow::{Context, Result};
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::key::CompactKey;
|
||||
@@ -35,9 +34,7 @@ use std::sync::atomic::Ordering as AtomicOrdering;
|
||||
use std::sync::atomic::{AtomicU64, AtomicUsize};
|
||||
use tokio::sync::RwLock;
|
||||
|
||||
use super::{
|
||||
DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState,
|
||||
};
|
||||
use super::{DeltaLayerWriter, PersistentLayerDesc, ValuesReconstructState};
|
||||
|
||||
pub(crate) mod vectored_dio_read;
|
||||
|
||||
@@ -87,7 +84,7 @@ pub struct InMemoryLayerInner {
|
||||
/// The values are stored in a serialized format in this file.
|
||||
/// Each serialized Value is preceded by a 'u32' length field.
|
||||
/// PerSeg::page_versions map stores offsets into this file.
|
||||
file: EphemeralFile,
|
||||
file: Arc<tokio::sync::RwLock<EphemeralFile>>,
|
||||
|
||||
resource_units: GlobalResourceUnits,
|
||||
}
|
||||
@@ -381,7 +378,11 @@ impl InMemoryLayer {
|
||||
}
|
||||
|
||||
pub(crate) fn try_len(&self) -> Option<u64> {
|
||||
self.inner.try_read().map(|i| i.file.len()).ok()
|
||||
self.inner
|
||||
.try_read()
|
||||
.map(|i| i.file.try_read().map(|i| i.len()).ok())
|
||||
.ok()
|
||||
.flatten()
|
||||
}
|
||||
|
||||
pub(crate) fn assert_writable(&self) {
|
||||
@@ -432,6 +433,10 @@ impl InMemoryLayer {
|
||||
read: vectored_dio_read::LogicalRead<Vec<u8>>,
|
||||
}
|
||||
let mut reads: HashMap<Key, Vec<ValueRead>> = HashMap::new();
|
||||
let mut senders: HashMap<
|
||||
(Key, Lsn),
|
||||
tokio::sync::oneshot::Sender<Result<Bytes, std::io::Error>>,
|
||||
> = Default::default();
|
||||
|
||||
for range in keyspace.ranges.iter() {
|
||||
for (key, vec_map) in inner
|
||||
@@ -459,6 +464,13 @@ impl InMemoryLayer {
|
||||
Vec::with_capacity(len as usize),
|
||||
),
|
||||
});
|
||||
|
||||
let (tx, rx) = tokio::sync::oneshot::channel();
|
||||
senders.insert((key, *entry_lsn), tx);
|
||||
reconstruct_state.update_key(&key, *entry_lsn, will_init, rx);
|
||||
|
||||
debug!(%key, %entry_lsn, will_init, "inmemory layer found key");
|
||||
|
||||
if will_init {
|
||||
break;
|
||||
}
|
||||
@@ -466,46 +478,39 @@ impl InMemoryLayer {
|
||||
}
|
||||
}
|
||||
|
||||
// Execute the reads.
|
||||
let read_from = inner.file.clone();
|
||||
let read_ctx = ctx.attached_child();
|
||||
reconstruct_state.spawn_io(async move {
|
||||
let locked = read_from.read().await;
|
||||
let f = vectored_dio_read::execute(
|
||||
&*locked,
|
||||
reads
|
||||
.iter()
|
||||
.flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)),
|
||||
&read_ctx,
|
||||
);
|
||||
send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865
|
||||
.await;
|
||||
|
||||
let f = vectored_dio_read::execute(
|
||||
&inner.file,
|
||||
reads
|
||||
.iter()
|
||||
.flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)),
|
||||
&ctx,
|
||||
);
|
||||
send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865
|
||||
.await;
|
||||
|
||||
// Process results into the reconstruct state
|
||||
'next_key: for (key, value_reads) in reads {
|
||||
for ValueRead { entry_lsn, read } in value_reads {
|
||||
match read.into_result().expect("we run execute() above") {
|
||||
Err(e) => {
|
||||
reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e)));
|
||||
continue 'next_key;
|
||||
}
|
||||
Ok(value_buf) => {
|
||||
let value = Value::des(&value_buf);
|
||||
if let Err(e) = value {
|
||||
reconstruct_state
|
||||
.on_key_error(key, PageReconstructError::from(anyhow!(e)));
|
||||
continue 'next_key;
|
||||
for (key, value_reads) in reads {
|
||||
for ValueRead { entry_lsn, read } in value_reads {
|
||||
let sender = senders
|
||||
.remove(&(key, entry_lsn))
|
||||
.expect("sender must exist");
|
||||
match read.into_result().expect("we run execute() above") {
|
||||
Err(e) => {
|
||||
let _ = sender
|
||||
.send(Err(std::io::Error::new(e.kind(), "dio vec read failed")));
|
||||
}
|
||||
|
||||
let key_situation =
|
||||
reconstruct_state.update_key(&key, entry_lsn, value.unwrap());
|
||||
if key_situation == ValueReconstructSituation::Complete {
|
||||
// TODO: metric to see if we fetched more values than necessary
|
||||
continue 'next_key;
|
||||
Ok(value_buf) => {
|
||||
let _ = sender.send(Ok(value_buf.into()));
|
||||
}
|
||||
|
||||
// process the next value in the next iteration of the loop
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
assert!(senders.is_empty());
|
||||
});
|
||||
|
||||
reconstruct_state.on_lsn_advanced(&keyspace, self.start_lsn);
|
||||
|
||||
@@ -600,7 +605,8 @@ impl InMemoryLayer {
|
||||
/// Get layer size.
|
||||
pub async fn size(&self) -> Result<u64> {
|
||||
let inner = self.inner.read().await;
|
||||
Ok(inner.file.len())
|
||||
let locked = inner.file.try_read().expect("no contention");
|
||||
Ok(locked.len())
|
||||
}
|
||||
|
||||
/// Create a new, empty, in-memory layer
|
||||
@@ -614,9 +620,10 @@ impl InMemoryLayer {
|
||||
) -> Result<InMemoryLayer> {
|
||||
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
|
||||
|
||||
let file =
|
||||
EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?;
|
||||
let key = InMemoryLayerFileId(file.page_cache_file_id());
|
||||
let file = Arc::new(tokio::sync::RwLock::new(
|
||||
EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?,
|
||||
));
|
||||
let key = InMemoryLayerFileId(file.read().await.page_cache_file_id());
|
||||
|
||||
Ok(InMemoryLayer {
|
||||
file_id: key,
|
||||
@@ -648,7 +655,7 @@ impl InMemoryLayer {
|
||||
let mut inner = self.inner.write().await;
|
||||
self.assert_writable();
|
||||
|
||||
let base_offset = inner.file.len();
|
||||
let base_offset = inner.file.read().await.len();
|
||||
|
||||
let SerializedBatch {
|
||||
raw,
|
||||
@@ -672,8 +679,13 @@ impl InMemoryLayer {
|
||||
}
|
||||
|
||||
// Write the batch to the file
|
||||
inner.file.write_raw(&raw, ctx).await?;
|
||||
let new_size = inner.file.len();
|
||||
// FIXME: can't borrow arc
|
||||
let new_size = {
|
||||
let mut locked = inner.file.write().await;
|
||||
locked.write_raw(&raw, ctx).await?;
|
||||
locked.len()
|
||||
};
|
||||
|
||||
let expected_new_len = base_offset
|
||||
.checked_add(raw.len().into_u64())
|
||||
// write_raw would error if we were to overflow u64.
|
||||
@@ -713,7 +725,7 @@ impl InMemoryLayer {
|
||||
|
||||
pub(crate) async fn tick(&self) -> Option<u64> {
|
||||
let mut inner = self.inner.write().await;
|
||||
let size = inner.file.len();
|
||||
let size = inner.file.read().await.len();
|
||||
inner.resource_units.publish_size(size)
|
||||
}
|
||||
|
||||
@@ -809,7 +821,7 @@ impl InMemoryLayer {
|
||||
|
||||
match l0_flush_global_state {
|
||||
l0_flush::Inner::Direct { .. } => {
|
||||
let file_contents: Vec<u8> = inner.file.load_to_vec(ctx).await?;
|
||||
let file_contents: Vec<u8> = inner.file.read().await.load_to_vec(ctx).await?;
|
||||
|
||||
let file_contents = Bytes::from(file_contents);
|
||||
|
||||
|
||||
@@ -1755,11 +1755,17 @@ impl DownloadedLayer {
|
||||
.map_err(GetVectoredError::Other)?
|
||||
{
|
||||
Delta(d) => {
|
||||
d.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, ctx)
|
||||
.await
|
||||
d.get_values_reconstruct_data(
|
||||
owner.desc.clone(),
|
||||
keyspace,
|
||||
lsn_range,
|
||||
reconstruct_data,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
}
|
||||
Image(i) => {
|
||||
i.get_values_reconstruct_data(keyspace, reconstruct_data, ctx)
|
||||
i.get_values_reconstruct_data(owner.desc.clone(), keyspace, reconstruct_data, ctx)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -107,6 +107,8 @@ async fn smoke_test() {
|
||||
.expect("tenant harness writes the control file")
|
||||
};
|
||||
|
||||
let img_before = (img_before.0, img_before.1.await.unwrap().unwrap());
|
||||
let img_after = (img_after.0, img_after.1.await.unwrap().unwrap());
|
||||
assert_eq!(img_before, img_after);
|
||||
|
||||
// evict_and_wait can timeout, but it doesn't cancel the evicting itself
|
||||
|
||||
@@ -12,8 +12,10 @@ use serde::{Deserialize, Serialize};
|
||||
#[cfg(test)]
|
||||
use utils::id::TenantId;
|
||||
|
||||
/// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the
|
||||
/// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides
|
||||
/// A unique identifier of a persistent layer.
|
||||
///
|
||||
/// This is different from `LayerDescriptor`, which is only used in the benchmarks.
|
||||
/// This struct contains all necessary information to find the image / delta layer. It also provides
|
||||
/// a unified way to generate layer information like file name.
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Hash)]
|
||||
pub struct PersistentLayerDesc {
|
||||
|
||||
@@ -217,8 +217,9 @@ impl fmt::Display for ImageLayerName {
|
||||
}
|
||||
}
|
||||
|
||||
/// LayerName is the logical identity of a layer within a LayerMap at a moment in time. The
|
||||
/// LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations
|
||||
/// LayerName is the logical identity of a layer within a LayerMap at a moment in time.
|
||||
///
|
||||
/// The LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations
|
||||
/// over time (e.g. across shard splits or compression). The physical filenames of layers in local
|
||||
/// storage and object names in remote storage consist of the LayerName plus some extra qualifiers
|
||||
/// that uniquely identify the physical incarnation of a layer (see [crate::tenant::remote_timeline_client::remote_layer_path])
|
||||
|
||||
@@ -226,9 +226,11 @@ impl<'a> IteratorWrapper<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
/// A merge iterator over delta/image layer iterators. When duplicated records are
|
||||
/// found, the iterator will not perform any deduplication, and the caller should handle
|
||||
/// these situation. By saying duplicated records, there are many possibilities:
|
||||
/// A merge iterator over delta/image layer iterators.
|
||||
///
|
||||
/// When duplicated records are found, the iterator will not perform any
|
||||
/// deduplication, and the caller should handle these situation. By saying
|
||||
/// duplicated records, there are many possibilities:
|
||||
///
|
||||
/// * Two same delta at the same LSN.
|
||||
/// * Two same image at the same LSN.
|
||||
|
||||
@@ -34,9 +34,10 @@ impl SplitWriterResult {
|
||||
}
|
||||
}
|
||||
|
||||
/// An image writer that takes images and produces multiple image layers. The interface does not
|
||||
/// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files
|
||||
/// to be cleaned up)
|
||||
/// An image writer that takes images and produces multiple image layers.
|
||||
///
|
||||
/// The interface does not guarantee atomicity (i.e., if the image layer generation
|
||||
/// fails, there might be leftover files to be cleaned up)
|
||||
#[must_use]
|
||||
pub struct SplitImageLayerWriter {
|
||||
inner: ImageLayerWriter,
|
||||
@@ -193,9 +194,10 @@ impl SplitImageLayerWriter {
|
||||
}
|
||||
}
|
||||
|
||||
/// A delta writer that takes key-lsn-values and produces multiple delta layers. The interface does not
|
||||
/// guarantee atomicity (i.e., if the delta layer generation fails, there might be leftover files
|
||||
/// to be cleaned up).
|
||||
/// A delta writer that takes key-lsn-values and produces multiple delta layers.
|
||||
///
|
||||
/// The interface does not guarantee atomicity (i.e., if the delta layer generation fails,
|
||||
/// there might be leftover files to be cleaned up).
|
||||
///
|
||||
/// Note that if updates of a single key exceed the target size limit, all of the updates will be batched
|
||||
/// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm
|
||||
|
||||
@@ -18,6 +18,7 @@ use camino::Utf8Path;
|
||||
use chrono::{DateTime, Utc};
|
||||
use enumset::EnumSet;
|
||||
use fail::fail_point;
|
||||
use futures::{stream::FuturesUnordered, StreamExt};
|
||||
use handle::ShardTimelineId;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::{
|
||||
@@ -48,7 +49,6 @@ use utils::{
|
||||
sync::gate::{Gate, GateGuard},
|
||||
};
|
||||
|
||||
use std::pin::pin;
|
||||
use std::sync::atomic::Ordering as AtomicOrdering;
|
||||
use std::sync::{Arc, Mutex, RwLock, Weak};
|
||||
use std::time::{Duration, Instant, SystemTime};
|
||||
@@ -62,13 +62,16 @@ use std::{
|
||||
collections::btree_map::Entry,
|
||||
ops::{Deref, Range},
|
||||
};
|
||||
use std::{pin::pin, sync::atomic::AtomicUsize};
|
||||
|
||||
use crate::{
|
||||
aux_file::AuxFileSizeEstimator,
|
||||
tenant::{
|
||||
layer_map::{LayerMap, SearchResult},
|
||||
metadata::TimelineMetadata,
|
||||
storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc},
|
||||
storage_layer::{
|
||||
convert, inmemory_layer::IndexEntry, PersistentLayerDesc, ValueReconstructSituation,
|
||||
},
|
||||
},
|
||||
walredo,
|
||||
};
|
||||
@@ -565,7 +568,7 @@ impl From<layer_manager::Shutdown> for GetVectoredError {
|
||||
|
||||
#[derive(thiserror::Error)]
|
||||
pub struct MissingKeyError {
|
||||
key: Key,
|
||||
keys: KeySpace,
|
||||
shard: ShardNumber,
|
||||
cont_lsn: Lsn,
|
||||
request_lsn: Lsn,
|
||||
@@ -583,8 +586,8 @@ impl std::fmt::Display for MissingKeyError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"could not find data for key {} (shard {:?}) at LSN {}, request LSN {}",
|
||||
self.key, self.shard, self.cont_lsn, self.request_lsn
|
||||
"could not find data for keys (shard {:?}) at LSN {}, request LSN {} keys {}",
|
||||
self.shard, self.cont_lsn, self.request_lsn, self.keys
|
||||
)?;
|
||||
if let Some(ref ancestor_lsn) = self.ancestor_lsn {
|
||||
write!(f, ", ancestor {}", ancestor_lsn)?;
|
||||
@@ -952,7 +955,11 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
None => Err(PageReconstructError::MissingKey(MissingKeyError {
|
||||
key,
|
||||
keys: {
|
||||
let mut accum = KeySpaceAccum::new();
|
||||
accum.add_key(key);
|
||||
accum.to_keyspace()
|
||||
},
|
||||
shard: self.shard_identity.get_shard_number(&key),
|
||||
cont_lsn: Lsn(0),
|
||||
request_lsn: lsn,
|
||||
@@ -1122,29 +1129,53 @@ impl Timeline {
|
||||
let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME
|
||||
.for_get_kind(get_kind)
|
||||
.start_timer();
|
||||
static INVOCATION: Lazy<AtomicUsize> = Lazy::new(|| AtomicUsize::new(0));
|
||||
let invocation = INVOCATION.fetch_add(1, AtomicOrdering::Relaxed);
|
||||
self.get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx)
|
||||
.await?;
|
||||
.instrument(debug_span!("get_vectored_reconstruct_data", invocation))
|
||||
.await
|
||||
.map_err(|err| {
|
||||
anyhow::anyhow!(
|
||||
"get_vectored_reconstruct_data invocation {invocation} lsn={lsn} keyspace={keyspace} {err:?}",
|
||||
)
|
||||
})?;
|
||||
get_data_timer.stop_and_record();
|
||||
|
||||
let reconstruct_timer = crate::metrics::RECONSTRUCT_TIME
|
||||
.for_get_kind(get_kind)
|
||||
.start_timer();
|
||||
let mut results: BTreeMap<Key, Result<Bytes, PageReconstructError>> = BTreeMap::new();
|
||||
let layers_visited = reconstruct_state.get_layers_visited();
|
||||
|
||||
let futs = FuturesUnordered::new();
|
||||
for (key, res) in std::mem::take(&mut reconstruct_state.keys) {
|
||||
match res {
|
||||
Err(err) => {
|
||||
results.insert(key, Err(err));
|
||||
}
|
||||
Ok(state) => {
|
||||
let state = ValueReconstructState::from(state);
|
||||
futs.push({
|
||||
let walredo_self = self.myself.upgrade().expect("&self method holds the arc");
|
||||
async move {
|
||||
let state = res.expect("Read path is infallible");
|
||||
assert!(matches!(
|
||||
state.situation,
|
||||
ValueReconstructSituation::Complete
|
||||
));
|
||||
|
||||
let reconstruct_res = self.reconstruct_value(key, lsn, state).await;
|
||||
results.insert(key, reconstruct_res);
|
||||
let converted = match convert(key, state).await {
|
||||
Ok(ok) => ok,
|
||||
Err(err) => {
|
||||
return (key, Err(err));
|
||||
}
|
||||
};
|
||||
|
||||
(
|
||||
key,
|
||||
walredo_self.reconstruct_value(key, lsn, converted).await,
|
||||
)
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
let results = futs
|
||||
.collect::<BTreeMap<Key, Result<Bytes, PageReconstructError>>>()
|
||||
.await;
|
||||
|
||||
reconstruct_timer.stop_and_record();
|
||||
|
||||
// For aux file keys (v1 or v2) the vectored read path does not return an error
|
||||
@@ -3120,7 +3151,7 @@ impl Timeline {
|
||||
|
||||
if let Some(missing_keyspace) = missing_keyspace {
|
||||
return Err(GetVectoredError::MissingKey(MissingKeyError {
|
||||
key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */
|
||||
keys: missing_keyspace.clone(),
|
||||
shard: self
|
||||
.shard_identity
|
||||
.get_shard_number(&missing_keyspace.start().unwrap()),
|
||||
@@ -5496,30 +5527,30 @@ impl Timeline {
|
||||
#[cfg(test)]
|
||||
pub(crate) async fn inspect_image_layers(
|
||||
self: &Arc<Timeline>,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
_lsn: Lsn,
|
||||
_ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<(Key, Bytes)>> {
|
||||
let mut all_data = Vec::new();
|
||||
let guard = self.layers.read().await;
|
||||
for layer in guard.layer_map()?.iter_historic_layers() {
|
||||
if !layer.is_delta() && layer.image_layer_lsn() == lsn {
|
||||
let layer = guard.get_from_desc(&layer);
|
||||
let mut reconstruct_data = ValuesReconstructState::default();
|
||||
layer
|
||||
.get_values_reconstruct_data(
|
||||
KeySpace::single(Key::MIN..Key::MAX),
|
||||
lsn..Lsn(lsn.0 + 1),
|
||||
&mut reconstruct_data,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
for (k, v) in reconstruct_data.keys {
|
||||
all_data.push((k, v?.img.unwrap().1));
|
||||
}
|
||||
}
|
||||
}
|
||||
all_data.sort();
|
||||
Ok(all_data)
|
||||
// let mut all_data = Vec::new();
|
||||
// let guard = self.layers.read().await;
|
||||
// for layer in guard.layer_map()?.iter_historic_layers() {
|
||||
// if !layer.is_delta() && layer.image_layer_lsn() == lsn {
|
||||
// let layer = guard.get_from_desc(&layer);
|
||||
// let mut reconstruct_data = ValuesReconstructState::default();
|
||||
// layer
|
||||
// .get_values_reconstruct_data(
|
||||
// KeySpace::single(Key::MIN..Key::MAX),
|
||||
// lsn..Lsn(lsn.0 + 1),
|
||||
// &mut reconstruct_data,
|
||||
// ctx,
|
||||
// )
|
||||
// .await?;
|
||||
// for (k, v) in reconstruct_data.keys {
|
||||
// all_data.push((k, v?.img.unwrap().1));
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// all_data.sort();
|
||||
Ok(Vec::new())
|
||||
}
|
||||
|
||||
/// Get all historic layer descriptors in the layer map
|
||||
|
||||
@@ -19,7 +19,6 @@ use bytes::Bytes;
|
||||
use enumset::EnumSet;
|
||||
use fail::fail_point;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::config::{CompactL0BypassPageCacheValidation, CompactL0Phase1ValueAccess};
|
||||
use pageserver_api::key::KEY_SIZE;
|
||||
use pageserver_api::keyspace::ShardedRange;
|
||||
use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
|
||||
@@ -912,137 +911,13 @@ impl Timeline {
|
||||
// we're compacting, in key, LSN order.
|
||||
// If there's both a Value::Image and Value::WalRecord for the same (key,lsn),
|
||||
// then the Value::Image is ordered before Value::WalRecord.
|
||||
//
|
||||
// TODO(https://github.com/neondatabase/neon/issues/8184): remove the page cached blob_io
|
||||
// option and validation code once we've reached confidence.
|
||||
enum AllValuesIter<'a> {
|
||||
PageCachedBlobIo {
|
||||
all_keys_iter: VecIter<'a>,
|
||||
},
|
||||
StreamingKmergeBypassingPageCache {
|
||||
merge_iter: MergeIterator<'a>,
|
||||
},
|
||||
ValidatingStreamingKmergeBypassingPageCache {
|
||||
mode: CompactL0BypassPageCacheValidation,
|
||||
merge_iter: MergeIterator<'a>,
|
||||
all_keys_iter: VecIter<'a>,
|
||||
},
|
||||
}
|
||||
type VecIter<'a> = std::slice::Iter<'a, DeltaEntry<'a>>; // TODO: distinguished lifetimes
|
||||
impl AllValuesIter<'_> {
|
||||
async fn next_all_keys_iter(
|
||||
iter: &mut VecIter<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
|
||||
let Some(DeltaEntry {
|
||||
key,
|
||||
lsn,
|
||||
val: value_ref,
|
||||
..
|
||||
}) = iter.next()
|
||||
else {
|
||||
return Ok(None);
|
||||
};
|
||||
let value = value_ref.load(ctx).await?;
|
||||
Ok(Some((*key, *lsn, value)))
|
||||
}
|
||||
async fn next(
|
||||
&mut self,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
|
||||
match self {
|
||||
AllValuesIter::PageCachedBlobIo { all_keys_iter: iter } => {
|
||||
Self::next_all_keys_iter(iter, ctx).await
|
||||
}
|
||||
AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter } => merge_iter.next().await,
|
||||
AllValuesIter::ValidatingStreamingKmergeBypassingPageCache { mode, merge_iter, all_keys_iter } => async {
|
||||
// advance both iterators
|
||||
let all_keys_iter_item = Self::next_all_keys_iter(all_keys_iter, ctx).await;
|
||||
let merge_iter_item = merge_iter.next().await;
|
||||
// compare results & log warnings as needed
|
||||
macro_rules! rate_limited_warn {
|
||||
($($arg:tt)*) => {{
|
||||
if cfg!(debug_assertions) || cfg!(feature = "testing") {
|
||||
warn!($($arg)*);
|
||||
panic!("CompactL0BypassPageCacheValidation failure, check logs");
|
||||
}
|
||||
use once_cell::sync::Lazy;
|
||||
use utils::rate_limit::RateLimit;
|
||||
use std::sync::Mutex;
|
||||
use std::time::Duration;
|
||||
static LOGGED: Lazy<Mutex<RateLimit>> =
|
||||
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
|
||||
let mut rate_limit = LOGGED.lock().unwrap();
|
||||
rate_limit.call(|| {
|
||||
warn!($($arg)*);
|
||||
});
|
||||
}}
|
||||
}
|
||||
match (&all_keys_iter_item, &merge_iter_item) {
|
||||
(Err(_), Err(_)) => {
|
||||
// don't bother asserting equivality of the errors
|
||||
}
|
||||
(Err(all_keys), Ok(merge)) => {
|
||||
rate_limited_warn!(?merge, "all_keys_iter returned an error where merge did not: {all_keys:?}");
|
||||
},
|
||||
(Ok(all_keys), Err(merge)) => {
|
||||
rate_limited_warn!(?all_keys, "merge returned an error where all_keys_iter did not: {merge:?}");
|
||||
},
|
||||
(Ok(None), Ok(None)) => { }
|
||||
(Ok(Some(all_keys)), Ok(None)) => {
|
||||
rate_limited_warn!(?all_keys, "merge returned None where all_keys_iter returned Some");
|
||||
}
|
||||
(Ok(None), Ok(Some(merge))) => {
|
||||
rate_limited_warn!(?merge, "all_keys_iter returned None where merge returned Some");
|
||||
}
|
||||
(Ok(Some((all_keys_key, all_keys_lsn, all_keys_value))), Ok(Some((merge_key, merge_lsn, merge_value)))) => {
|
||||
match mode {
|
||||
// TODO: in this mode, we still load the value from disk for both iterators, even though we only need the all_keys_iter one
|
||||
CompactL0BypassPageCacheValidation::KeyLsn => {
|
||||
let all_keys = (all_keys_key, all_keys_lsn);
|
||||
let merge = (merge_key, merge_lsn);
|
||||
if all_keys != merge {
|
||||
rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN) than all_keys_iter");
|
||||
}
|
||||
}
|
||||
CompactL0BypassPageCacheValidation::KeyLsnValue => {
|
||||
let all_keys = (all_keys_key, all_keys_lsn, all_keys_value);
|
||||
let merge = (merge_key, merge_lsn, merge_value);
|
||||
if all_keys != merge {
|
||||
rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN,Value) than all_keys_iter");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// in case of mismatch, trust the legacy all_keys_iter_item
|
||||
all_keys_iter_item
|
||||
}.instrument(info_span!("next")).await
|
||||
}
|
||||
}
|
||||
}
|
||||
let mut all_values_iter = match &self.conf.compact_level0_phase1_value_access {
|
||||
CompactL0Phase1ValueAccess::PageCachedBlobIo => AllValuesIter::PageCachedBlobIo {
|
||||
all_keys_iter: all_keys.iter(),
|
||||
},
|
||||
CompactL0Phase1ValueAccess::StreamingKmerge { validate } => {
|
||||
let merge_iter = {
|
||||
let mut deltas = Vec::with_capacity(deltas_to_compact.len());
|
||||
for l in deltas_to_compact.iter() {
|
||||
let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?;
|
||||
deltas.push(l);
|
||||
}
|
||||
MergeIterator::create(&deltas, &[], ctx)
|
||||
};
|
||||
match validate {
|
||||
None => AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter },
|
||||
Some(validate) => AllValuesIter::ValidatingStreamingKmergeBypassingPageCache {
|
||||
mode: validate.clone(),
|
||||
merge_iter,
|
||||
all_keys_iter: all_keys.iter(),
|
||||
},
|
||||
}
|
||||
let mut all_values_iter = {
|
||||
let mut deltas = Vec::with_capacity(deltas_to_compact.len());
|
||||
for l in deltas_to_compact.iter() {
|
||||
let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?;
|
||||
deltas.push(l);
|
||||
}
|
||||
MergeIterator::create(&deltas, &[], ctx)
|
||||
};
|
||||
|
||||
// This iterator walks through all keys and is needed to calculate size used by each key
|
||||
@@ -1119,7 +994,7 @@ impl Timeline {
|
||||
let mut keys = 0;
|
||||
|
||||
while let Some((key, lsn, value)) = all_values_iter
|
||||
.next(ctx)
|
||||
.next()
|
||||
.await
|
||||
.map_err(CompactionError::Other)?
|
||||
{
|
||||
|
||||
@@ -33,6 +33,7 @@ use crate::virtual_file::{self, VirtualFile};
|
||||
pub struct BlobMeta {
|
||||
pub key: Key,
|
||||
pub lsn: Lsn,
|
||||
pub will_init: bool,
|
||||
}
|
||||
|
||||
/// Blob offsets into [`VectoredBlobsBuf::buf`]
|
||||
@@ -355,9 +356,10 @@ pub enum BlobFlag {
|
||||
/// * Iterate over the collected blobs and coalesce them into reads at the end
|
||||
pub struct VectoredReadPlanner {
|
||||
// Track all the blob offsets. Start offsets must be ordered.
|
||||
blobs: BTreeMap<Key, Vec<(Lsn, u64, u64)>>,
|
||||
// Note: last bool is will_init
|
||||
blobs: BTreeMap<Key, Vec<(Lsn, u64, u64, bool)>>,
|
||||
// Arguments for previous blob passed into [`VectoredReadPlanner::handle`]
|
||||
prev: Option<(Key, Lsn, u64, BlobFlag)>,
|
||||
prev: Option<(Key, Lsn, u64, BlobFlag, bool)>,
|
||||
|
||||
max_read_size: usize,
|
||||
|
||||
@@ -392,40 +394,62 @@ impl VectoredReadPlanner {
|
||||
/// This is used for WAL records that `will_init`.
|
||||
/// * [`BlobFlag::Ignore`]: This blob should not be included in the read. This happens
|
||||
/// if the blob is cached.
|
||||
pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64, flag: BlobFlag) {
|
||||
pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64, flag: BlobFlag, will_init: bool) {
|
||||
// Implementation note: internally lag behind by one blob such that
|
||||
// we have a start and end offset when initialising [`VectoredRead`]
|
||||
let (prev_key, prev_lsn, prev_offset, prev_flag) = match self.prev {
|
||||
let (prev_key, prev_lsn, prev_offset, prev_flag, prev_will_init) = match self.prev {
|
||||
None => {
|
||||
self.prev = Some((key, lsn, offset, flag));
|
||||
self.prev = Some((key, lsn, offset, flag, will_init));
|
||||
return;
|
||||
}
|
||||
Some(prev) => prev,
|
||||
};
|
||||
|
||||
self.add_blob(prev_key, prev_lsn, prev_offset, offset, prev_flag);
|
||||
self.add_blob(
|
||||
prev_key,
|
||||
prev_lsn,
|
||||
prev_offset,
|
||||
offset,
|
||||
prev_flag,
|
||||
prev_will_init,
|
||||
);
|
||||
|
||||
self.prev = Some((key, lsn, offset, flag));
|
||||
self.prev = Some((key, lsn, offset, flag, will_init));
|
||||
}
|
||||
|
||||
pub fn handle_range_end(&mut self, offset: u64) {
|
||||
if let Some((prev_key, prev_lsn, prev_offset, prev_flag)) = self.prev {
|
||||
self.add_blob(prev_key, prev_lsn, prev_offset, offset, prev_flag);
|
||||
if let Some((prev_key, prev_lsn, prev_offset, prev_flag, prev_will_init)) = self.prev {
|
||||
self.add_blob(
|
||||
prev_key,
|
||||
prev_lsn,
|
||||
prev_offset,
|
||||
offset,
|
||||
prev_flag,
|
||||
prev_will_init,
|
||||
);
|
||||
}
|
||||
|
||||
self.prev = None;
|
||||
}
|
||||
|
||||
fn add_blob(&mut self, key: Key, lsn: Lsn, start_offset: u64, end_offset: u64, flag: BlobFlag) {
|
||||
fn add_blob(
|
||||
&mut self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
start_offset: u64,
|
||||
end_offset: u64,
|
||||
flag: BlobFlag,
|
||||
will_init: bool,
|
||||
) {
|
||||
match flag {
|
||||
BlobFlag::None => {
|
||||
let blobs_for_key = self.blobs.entry(key).or_default();
|
||||
blobs_for_key.push((lsn, start_offset, end_offset));
|
||||
blobs_for_key.push((lsn, start_offset, end_offset, will_init));
|
||||
}
|
||||
BlobFlag::ReplaceAll => {
|
||||
let blobs_for_key = self.blobs.entry(key).or_default();
|
||||
blobs_for_key.clear();
|
||||
blobs_for_key.push((lsn, start_offset, end_offset));
|
||||
blobs_for_key.push((lsn, start_offset, end_offset, will_init));
|
||||
}
|
||||
BlobFlag::Ignore => {}
|
||||
}
|
||||
@@ -436,11 +460,17 @@ impl VectoredReadPlanner {
|
||||
let mut reads = Vec::new();
|
||||
|
||||
for (key, blobs_for_key) in self.blobs {
|
||||
for (lsn, start_offset, end_offset) in blobs_for_key {
|
||||
for (lsn, start_offset, end_offset, will_init) in blobs_for_key {
|
||||
let extended = match &mut current_read_builder {
|
||||
Some(read_builder) => {
|
||||
read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn })
|
||||
}
|
||||
Some(read_builder) => read_builder.extend(
|
||||
start_offset,
|
||||
end_offset,
|
||||
BlobMeta {
|
||||
key,
|
||||
lsn,
|
||||
will_init,
|
||||
},
|
||||
),
|
||||
None => VectoredReadExtended::No,
|
||||
};
|
||||
|
||||
@@ -448,7 +478,11 @@ impl VectoredReadPlanner {
|
||||
let next_read_builder = VectoredReadBuilder::new(
|
||||
start_offset,
|
||||
end_offset,
|
||||
BlobMeta { key, lsn },
|
||||
BlobMeta {
|
||||
key,
|
||||
lsn,
|
||||
will_init,
|
||||
},
|
||||
self.max_read_size,
|
||||
self.mode,
|
||||
);
|
||||
@@ -593,8 +627,10 @@ impl<'a> VectoredBlobReader<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. It provides a streaming API for
|
||||
/// getting read blobs. It returns a batch when `handle` gets called and when the current key would just exceed the read_size and
|
||||
/// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`].
|
||||
///
|
||||
/// It provides a streaming API for getting read blobs. It returns a batch when
|
||||
/// `handle` gets called and when the current key would just exceed the read_size and
|
||||
/// max_cnt constraints.
|
||||
pub struct StreamingVectoredReadPlanner {
|
||||
read_builder: Option<VectoredReadBuilder>,
|
||||
@@ -666,7 +702,15 @@ impl StreamingVectoredReadPlanner {
|
||||
) -> Option<VectoredRead> {
|
||||
match &mut self.read_builder {
|
||||
Some(read_builder) => {
|
||||
let extended = read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn });
|
||||
let extended = read_builder.extend(
|
||||
start_offset,
|
||||
end_offset,
|
||||
BlobMeta {
|
||||
key,
|
||||
lsn,
|
||||
will_init: todo!(),
|
||||
},
|
||||
);
|
||||
assert_eq!(extended, VectoredReadExtended::Yes);
|
||||
}
|
||||
None => {
|
||||
@@ -674,7 +718,11 @@ impl StreamingVectoredReadPlanner {
|
||||
Some(VectoredReadBuilder::new_streaming(
|
||||
start_offset,
|
||||
end_offset,
|
||||
BlobMeta { key, lsn },
|
||||
BlobMeta {
|
||||
key,
|
||||
lsn,
|
||||
will_init: todo!(),
|
||||
},
|
||||
self.mode,
|
||||
))
|
||||
};
|
||||
@@ -1006,6 +1054,7 @@ mod tests {
|
||||
let meta = BlobMeta {
|
||||
key: Key::MIN,
|
||||
lsn: Lsn(0),
|
||||
will_init: false,
|
||||
};
|
||||
|
||||
for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
//!
|
||||
//! VirtualFile is like a normal File, but it's not bound directly to
|
||||
//! a file descriptor. Instead, the file is opened when it's read from,
|
||||
//! a file descriptor.
|
||||
//!
|
||||
//! Instead, the file is opened when it's read from,
|
||||
//! and if too many files are open globally in the system, least-recently
|
||||
//! used ones are closed.
|
||||
//!
|
||||
|
||||
@@ -43,13 +43,12 @@ use utils::lsn::Lsn;
|
||||
use utils::sync::gate::GateError;
|
||||
use utils::sync::heavier_once_cell;
|
||||
|
||||
/// The real implementation that uses a Postgres process to
|
||||
/// perform WAL replay.
|
||||
///
|
||||
/// This is the real implementation that uses a Postgres process to
|
||||
/// perform WAL replay. Only one thread can use the process at a time,
|
||||
/// that is controlled by the Mutex. In the future, we might want to
|
||||
/// launch a pool of processes to allow concurrent replay of multiple
|
||||
/// records.
|
||||
///
|
||||
/// Only one thread can use the process at a time, that is controlled by the
|
||||
/// Mutex. In the future, we might want to launch a pool of processes to allow
|
||||
/// concurrent replay of multiple records.
|
||||
pub struct PostgresRedoManager {
|
||||
tenant_shard_id: TenantShardId,
|
||||
conf: &'static PageServerConf,
|
||||
|
||||
@@ -1038,9 +1038,12 @@ DetermineEpochStartLsn(WalProposer *wp)
|
||||
if (SkipXLogPageHeader(wp, wp->propEpochStartLsn) != wp->api.get_redo_start_lsn(wp))
|
||||
{
|
||||
/*
|
||||
* However, allow to proceed if previously elected leader was me;
|
||||
* plain restart of walproposer not intervened by concurrent
|
||||
* compute (who could generate WAL) is ok.
|
||||
* However, allow to proceed if last_log_term on the node which gave
|
||||
* the highest vote (i.e. point where we are going to start writing)
|
||||
* actually had been won by me; plain restart of walproposer not
|
||||
* intervened by concurrent compute which wrote WAL is ok.
|
||||
*
|
||||
* This avoids compute crash after manual term_bump.
|
||||
*/
|
||||
if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term ==
|
||||
pg_atomic_read_u64(&walprop_shared->mineLastElectedTerm))))
|
||||
@@ -1442,12 +1445,17 @@ RecvAppendResponses(Safekeeper *sk)
|
||||
if (sk->appendResponse.term > wp->propTerm)
|
||||
{
|
||||
/*
|
||||
* Another compute with higher term is running. Panic to restart
|
||||
* PG as we likely need to retake basebackup. However, don't dump
|
||||
* core as this is kinda expected scenario.
|
||||
*
|
||||
* Term has changed to higher one, probably another compute is
|
||||
* running. If this is the case we could PANIC as well because
|
||||
* likely it inserted some data and our basebackup is unsuitable
|
||||
* anymore. However, we also bump term manually (term_bump endpoint)
|
||||
* on safekeepers for migration purposes, in this case we do want
|
||||
* compute to stay alive. So restart walproposer with FATAL instead
|
||||
* of panicking; if basebackup is spoiled next election will notice
|
||||
* this.
|
||||
*/
|
||||
disable_core_dump();
|
||||
wp_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us",
|
||||
wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us",
|
||||
sk->host, sk->port,
|
||||
sk->appendResponse.term, wp->propTerm);
|
||||
}
|
||||
|
||||
2
proxy/src/cache/timed_lru.rs
vendored
2
proxy/src/cache/timed_lru.rs
vendored
@@ -16,7 +16,7 @@ use tracing::debug;
|
||||
// On the other hand, `hashlink` has good download stats and appears to be maintained.
|
||||
use hashlink::{linked_hash_map::RawEntryMut, LruCache};
|
||||
|
||||
use super::{common::Cached, *};
|
||||
use super::{common::Cached, timed_lru, Cache};
|
||||
|
||||
/// An implementation of timed LRU cache with fixed capacity.
|
||||
/// Key properties:
|
||||
|
||||
@@ -38,10 +38,7 @@ impl Api {
|
||||
locks: &'static ApiLocks<EndpointCacheKey>,
|
||||
wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
|
||||
) -> Self {
|
||||
let jwt = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") {
|
||||
Ok(v) => v,
|
||||
Err(_) => String::new(),
|
||||
};
|
||||
let jwt = std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN").unwrap_or_default();
|
||||
Self {
|
||||
endpoint,
|
||||
caches,
|
||||
|
||||
@@ -44,16 +44,14 @@
|
||||
clippy::items_after_statements,
|
||||
)]
|
||||
// List of temporarily allowed lints.
|
||||
// TODO: Switch to except() once stable with 1.81.
|
||||
// TODO: fix code and reduce list or move to permanent list above.
|
||||
#![allow(
|
||||
#![expect(
|
||||
clippy::cargo_common_metadata,
|
||||
clippy::cast_possible_truncation,
|
||||
clippy::cast_possible_wrap,
|
||||
clippy::cast_precision_loss,
|
||||
clippy::cast_sign_loss,
|
||||
clippy::doc_markdown,
|
||||
clippy::implicit_hasher,
|
||||
clippy::inline_always,
|
||||
clippy::match_same_arms,
|
||||
clippy::match_wild_err_arm,
|
||||
@@ -61,21 +59,28 @@
|
||||
clippy::missing_panics_doc,
|
||||
clippy::module_name_repetitions,
|
||||
clippy::needless_pass_by_value,
|
||||
clippy::needless_raw_string_hashes,
|
||||
clippy::redundant_closure_for_method_calls,
|
||||
clippy::return_self_not_must_use,
|
||||
clippy::similar_names,
|
||||
clippy::single_match_else,
|
||||
clippy::struct_excessive_bools,
|
||||
clippy::struct_field_names,
|
||||
clippy::too_many_lines,
|
||||
clippy::unreadable_literal,
|
||||
clippy::unused_async,
|
||||
clippy::unused_self,
|
||||
clippy::wildcard_imports
|
||||
clippy::unused_self
|
||||
)]
|
||||
#![cfg_attr(
|
||||
any(test, feature = "testing"),
|
||||
allow(
|
||||
clippy::needless_raw_string_hashes,
|
||||
clippy::unreadable_literal,
|
||||
clippy::unused_async,
|
||||
)
|
||||
)]
|
||||
// List of temporarily allowed lints to unblock beta/nightly.
|
||||
#![allow(unknown_lints, clippy::manual_inspect)]
|
||||
#![allow(
|
||||
unknown_lints,
|
||||
// TODO: 1.82: Add `use<T>` where necessary and remove from this list.
|
||||
impl_trait_overcaptures,
|
||||
)]
|
||||
|
||||
use std::{convert::Infallible, future::Future};
|
||||
|
||||
|
||||
@@ -217,6 +217,7 @@ impl sasl::Mechanism for Exchange<'_> {
|
||||
self.state = ExchangeState::SaltSent(sent);
|
||||
Ok(Step::Continue(self, msg))
|
||||
}
|
||||
#[allow(unreachable_patterns)] // TODO: 1.82: simply drop this match
|
||||
Step::Success(x, _) => match x {},
|
||||
Step::Failure(msg) => Ok(Step::Failure(msg)),
|
||||
}
|
||||
@@ -224,6 +225,7 @@ impl sasl::Mechanism for Exchange<'_> {
|
||||
ExchangeState::SaltSent(sent) => {
|
||||
match sent.transition(self.secret, &self.tls_server_end_point, input)? {
|
||||
Step::Success(keys, msg) => Ok(Step::Success(keys, msg)),
|
||||
#[allow(unreachable_patterns)] // TODO: 1.82: simply drop this match
|
||||
Step::Continue(x, _) => match x {},
|
||||
Step::Failure(msg) => Ok(Step::Failure(msg)),
|
||||
}
|
||||
|
||||
@@ -745,22 +745,20 @@ impl BatchQueryData {
|
||||
builder = builder.deferrable(true);
|
||||
}
|
||||
|
||||
let transaction = builder.start().await.map_err(|e| {
|
||||
let transaction = builder.start().await.inspect_err(|_| {
|
||||
// if we cannot start a transaction, we should return immediately
|
||||
// and not return to the pool. connection is clearly broken
|
||||
discard.discard();
|
||||
e
|
||||
})?;
|
||||
|
||||
let json_output =
|
||||
match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await {
|
||||
Ok(json_output) => {
|
||||
info!("commit");
|
||||
let status = transaction.commit().await.map_err(|e| {
|
||||
let status = transaction.commit().await.inspect_err(|_| {
|
||||
// if we cannot commit - for now don't return connection to pool
|
||||
// TODO: get a query status from the error
|
||||
discard.discard();
|
||||
e
|
||||
})?;
|
||||
discard.check_idle(status);
|
||||
json_output
|
||||
@@ -776,11 +774,10 @@ impl BatchQueryData {
|
||||
}
|
||||
Err(err) => {
|
||||
info!("rollback");
|
||||
let status = transaction.rollback().await.map_err(|e| {
|
||||
let status = transaction.rollback().await.inspect_err(|_| {
|
||||
// if we cannot rollback - for now don't return connection to pool
|
||||
// TODO: get a query status from the error
|
||||
discard.discard();
|
||||
e
|
||||
})?;
|
||||
discard.check_idle(status);
|
||||
return Err(err);
|
||||
|
||||
@@ -14,6 +14,7 @@ use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};
|
||||
use tokio_rustls::server::TlsStream;
|
||||
|
||||
/// Stream wrapper which implements libpq's protocol.
|
||||
///
|
||||
/// NOTE: This object deliberately doesn't implement [`AsyncRead`]
|
||||
/// or [`AsyncWrite`] to prevent subtle errors (e.g. trying
|
||||
/// to pass random malformed bytes through the connection).
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
[toolchain]
|
||||
channel = "1.80.1"
|
||||
channel = "1.81.0"
|
||||
profile = "default"
|
||||
# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
|
||||
# https://rust-lang.github.io/rustup/concepts/profiles.html
|
||||
# but we also need `llvm-tools-preview` for coverage data merges on CI
|
||||
components = ["llvm-tools-preview", "rustfmt", "clippy"]
|
||||
# but we also need `llvm-tools` for coverage data merges on CI
|
||||
components = ["llvm-tools", "rustfmt", "clippy"]
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
use utils::auth::{AuthError, Claims, Scope};
|
||||
use utils::id::TenantId;
|
||||
|
||||
/// If tenant_id is provided, allow if token (claims) is for this tenant or
|
||||
/// whole safekeeper scope (SafekeeperData). Else, allow only if token is
|
||||
/// SafekeeperData.
|
||||
pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<(), AuthError> {
|
||||
match (&claims.scope, tenant_id) {
|
||||
(Scope::Tenant, None) => Err(AuthError(
|
||||
|
||||
@@ -19,7 +19,7 @@ use std::fs::{self, File};
|
||||
use std::io::{ErrorKind, Write};
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::time::{Duration, Instant};
|
||||
use storage_broker::Uri;
|
||||
|
||||
use tracing::*;
|
||||
@@ -261,6 +261,15 @@ async fn main() -> anyhow::Result<()> {
|
||||
// Change into the data directory.
|
||||
std::env::set_current_dir(&workdir)?;
|
||||
|
||||
// Prevent running multiple safekeepers on the same directory
|
||||
let lock_file_path = workdir.join(PID_FILE_NAME);
|
||||
let lock_file =
|
||||
pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
|
||||
info!("claimed pid file at {lock_file_path:?}");
|
||||
// ensure that the lock file is held even if the main thread of the process panics
|
||||
// we need to release the lock file only when the current process is gone
|
||||
std::mem::forget(lock_file);
|
||||
|
||||
// Set or read our ID.
|
||||
let id = set_id(&workdir, args.id.map(NodeId))?;
|
||||
if args.init {
|
||||
@@ -364,15 +373,15 @@ async fn main() -> anyhow::Result<()> {
|
||||
type JoinTaskRes = Result<anyhow::Result<()>, JoinError>;
|
||||
|
||||
async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
// Prevent running multiple safekeepers on the same directory
|
||||
let lock_file_path = conf.workdir.join(PID_FILE_NAME);
|
||||
let lock_file =
|
||||
pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
|
||||
info!("claimed pid file at {lock_file_path:?}");
|
||||
|
||||
// ensure that the lock file is held even if the main thread of the process panics
|
||||
// we need to release the lock file only when the current process is gone
|
||||
std::mem::forget(lock_file);
|
||||
// fsync the datadir to make sure we have a consistent state on disk.
|
||||
let dfd = File::open(&conf.workdir).context("open datadir for syncfs")?;
|
||||
let started = Instant::now();
|
||||
utils::crashsafe::syncfs(dfd)?;
|
||||
let elapsed = started.elapsed();
|
||||
info!(
|
||||
elapsed_ms = elapsed.as_millis(),
|
||||
"syncfs data directory done"
|
||||
);
|
||||
|
||||
info!("starting safekeeper WAL service on {}", conf.listen_pg_addr);
|
||||
let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
|
||||
|
||||
@@ -18,8 +18,8 @@ use utils::http::endpoint::{prometheus_metrics_handler, request_span, ChannelWri
|
||||
use utils::http::request::parse_query_param;
|
||||
|
||||
use postgres_ffi::WAL_SEGMENT_SIZE;
|
||||
use safekeeper_api::models::TimelineCreateRequest;
|
||||
use safekeeper_api::models::{SkTimelineInfo, TimelineCopyRequest};
|
||||
use safekeeper_api::models::{TimelineCreateRequest, TimelineTermBumpRequest};
|
||||
use utils::{
|
||||
auth::SwappableJwtAuth,
|
||||
http::{
|
||||
@@ -408,6 +408,28 @@ async fn timeline_backup_partial_reset(request: Request<Body>) -> Result<Respons
|
||||
json_response(StatusCode::OK, response)
|
||||
}
|
||||
|
||||
/// Make term at least as high as one in request. If one in request is None,
|
||||
/// increment current one.
|
||||
async fn timeline_term_bump_handler(
|
||||
mut request: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let ttid = TenantTimelineId::new(
|
||||
parse_request_param(&request, "tenant_id")?,
|
||||
parse_request_param(&request, "timeline_id")?,
|
||||
);
|
||||
check_permission(&request, Some(ttid.tenant_id))?;
|
||||
|
||||
let request_data: TimelineTermBumpRequest = json_request(&mut request).await?;
|
||||
|
||||
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
|
||||
let response = tli
|
||||
.term_bump(request_data.term)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, response)
|
||||
}
|
||||
|
||||
/// Used only in tests to hand craft required data.
|
||||
async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let ttid = TenantTimelineId::new(
|
||||
@@ -630,6 +652,10 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/backup_partial_reset",
|
||||
|r| request_span(r, timeline_backup_partial_reset),
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/term_bump",
|
||||
|r| request_span(r, timeline_term_bump_handler),
|
||||
)
|
||||
.post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| {
|
||||
request_span(r, record_safekeeper_info)
|
||||
})
|
||||
|
||||
@@ -484,6 +484,7 @@ pub async fn validate_temp_timeline(
|
||||
}
|
||||
|
||||
/// Move timeline from a temp directory to the main storage, and load it to the global map.
|
||||
///
|
||||
/// This operation is done under a lock to prevent bugs if several concurrent requests are
|
||||
/// trying to load the same timeline. Note that it doesn't guard against creating the
|
||||
/// timeline with the same ttid, but no one should be doing this anyway.
|
||||
|
||||
@@ -448,8 +448,10 @@ async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1);
|
||||
|
||||
/// Encapsulates a task which takes messages from msg_rx, processes and pushes
|
||||
/// replies to reply_tx; reading from socket and writing to disk in parallel is
|
||||
/// beneficial for performance, this struct provides writing to disk part.
|
||||
/// replies to reply_tx.
|
||||
///
|
||||
/// Reading from socket and writing to disk in parallel is beneficial for
|
||||
/// performance, this struct provides the writing to disk part.
|
||||
pub struct WalAcceptor {
|
||||
tli: WalResidentTimeline,
|
||||
msg_rx: Receiver<ProposerAcceptorMessage>,
|
||||
|
||||
@@ -938,8 +938,9 @@ where
|
||||
}
|
||||
|
||||
trace!(
|
||||
"processed AppendRequest of len {}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, flushed={:?}",
|
||||
"processed AppendRequest of len {}, begin_lsn={}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, flushed={:?}",
|
||||
msg.wal_data.len(),
|
||||
msg.h.begin_lsn,
|
||||
msg.h.end_lsn,
|
||||
msg.h.commit_lsn,
|
||||
msg.h.truncate_lsn,
|
||||
|
||||
@@ -758,9 +758,8 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {
|
||||
// pq_sendint32(&reply_message, xmin);
|
||||
// pq_sendint32(&reply_message, xmin_epoch);
|
||||
// So it is two big-endian 32-bit words in little-endian order!
|
||||
hs_feedback.xmin = (hs_feedback.xmin >> 32) | (hs_feedback.xmin << 32);
|
||||
hs_feedback.catalog_xmin =
|
||||
(hs_feedback.catalog_xmin >> 32) | (hs_feedback.catalog_xmin << 32);
|
||||
hs_feedback.xmin = hs_feedback.xmin.rotate_left(32);
|
||||
hs_feedback.catalog_xmin = hs_feedback.catalog_xmin.rotate_left(32);
|
||||
self.ws_guard
|
||||
.walsenders
|
||||
.record_hs_feedback(self.ws_guard.id, &hs_feedback);
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
//! Defines per timeline data stored persistently (SafeKeeperPersistentState)
|
||||
//! and its wrapper with in memory layer (SafekeeperState).
|
||||
|
||||
use std::ops::Deref;
|
||||
use std::{cmp::max, ops::Deref};
|
||||
|
||||
use anyhow::Result;
|
||||
use safekeeper_api::models::TimelineTermBumpResponse;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::{
|
||||
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
|
||||
@@ -12,7 +13,7 @@ use utils::{
|
||||
|
||||
use crate::{
|
||||
control_file,
|
||||
safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, TermHistory},
|
||||
safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, Term, TermHistory},
|
||||
wal_backup_partial::{self},
|
||||
};
|
||||
|
||||
@@ -147,9 +148,11 @@ pub struct TimelineMemState {
|
||||
pub proposer_uuid: PgUuid,
|
||||
}
|
||||
|
||||
/// Safekeeper persistent state plus in memory layer, to avoid frequent fsyncs
|
||||
/// when we update fields like commit_lsn which don't need immediate
|
||||
/// persistence. Provides transactional like API to atomically update the state.
|
||||
/// Safekeeper persistent state plus in memory layer.
|
||||
///
|
||||
/// Allows us to avoid frequent fsyncs when we update fields like commit_lsn
|
||||
/// which don't need immediate persistence. Provides transactional like API
|
||||
/// to atomically update the state.
|
||||
///
|
||||
/// Implements Deref into *persistent* part.
|
||||
pub struct TimelineState<CTRL: control_file::Storage> {
|
||||
@@ -209,6 +212,27 @@ where
|
||||
let s = self.start_change();
|
||||
self.finish_change(&s).await
|
||||
}
|
||||
|
||||
/// Make term at least as high as `to`. If `to` is None, increment current one. This
|
||||
/// is not in safekeeper.rs because we want to be able to do it even if
|
||||
/// timeline is offloaded.
|
||||
pub async fn term_bump(&mut self, to: Option<Term>) -> Result<TimelineTermBumpResponse> {
|
||||
let before = self.acceptor_state.term;
|
||||
let mut state = self.start_change();
|
||||
let new = match to {
|
||||
Some(to) => max(state.acceptor_state.term, to),
|
||||
None => state.acceptor_state.term + 1,
|
||||
};
|
||||
if new > state.acceptor_state.term {
|
||||
state.acceptor_state.term = new;
|
||||
self.finish_change(&state).await?;
|
||||
}
|
||||
let after = self.acceptor_state.term;
|
||||
Ok(TimelineTermBumpResponse {
|
||||
previous_term: before,
|
||||
current_term: after,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<CTRL> Deref for TimelineState<CTRL>
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use camino::Utf8PathBuf;
|
||||
use remote_storage::RemotePath;
|
||||
use safekeeper_api::models::TimelineTermBumpResponse;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::fs::{self};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -169,6 +170,7 @@ impl<'a> Drop for WriteGuardSharedState<'a> {
|
||||
}
|
||||
|
||||
/// This structure is stored in shared state and represents the state of the timeline.
|
||||
///
|
||||
/// Usually it holds SafeKeeper, but it also supports offloaded timeline state. In this
|
||||
/// case, SafeKeeper is not available (because WAL is not present on disk) and all
|
||||
/// operations can be done only with control file.
|
||||
@@ -214,6 +216,10 @@ impl StateSK {
|
||||
.get_last_log_term(self.flush_lsn())
|
||||
}
|
||||
|
||||
pub async fn term_bump(&mut self, to: Option<Term>) -> Result<TimelineTermBumpResponse> {
|
||||
self.state_mut().term_bump(to).await
|
||||
}
|
||||
|
||||
/// Close open WAL files to release FDs.
|
||||
fn close_wal_store(&mut self) {
|
||||
if let StateSK::Loaded(sk) = self {
|
||||
@@ -853,6 +859,11 @@ impl Timeline {
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
pub async fn term_bump(self: &Arc<Self>, to: Option<Term>) -> Result<TimelineTermBumpResponse> {
|
||||
let mut state = self.write_shared_state().await;
|
||||
state.sk.term_bump(to).await
|
||||
}
|
||||
|
||||
/// Get the timeline guard for reading/writing WAL files.
|
||||
/// If WAL files are not present on disk (evicted), they will be automatically
|
||||
/// downloaded from remote storage. This is done in the manager task, which is
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
//! Code related to evicting WAL files to remote storage. The actual upload is done by the
|
||||
//! partial WAL backup code. This file has code to delete and re-download WAL files,
|
||||
//! cross-validate with partial WAL backup if local file is still present.
|
||||
//! Code related to evicting WAL files to remote storage.
|
||||
//!
|
||||
//! The actual upload is done by the partial WAL backup code. This file has
|
||||
//! code to delete and re-download WAL files, cross-validate with partial WAL
|
||||
//! backup if local file is still present.
|
||||
|
||||
use anyhow::Context;
|
||||
use camino::Utf8PathBuf;
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
//! Timeline residence guard is needed to ensure that WAL segments are present on disk,
|
||||
//! Timeline residence guard
|
||||
//!
|
||||
//! It is needed to ensure that WAL segments are present on disk,
|
||||
//! as long as the code is holding the guard. This file implements guard logic, to issue
|
||||
//! and drop guards, and to notify the manager when the guard is dropped.
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
//! The timeline manager task is responsible for managing the timeline's background tasks.
|
||||
//!
|
||||
//! It is spawned alongside each timeline and exits when the timeline is deleted.
|
||||
//! It watches for changes in the timeline state and decides when to spawn or kill background tasks.
|
||||
//! It also can manage some reactive state, like should the timeline be active for broker pushes or not.
|
||||
|
||||
@@ -60,7 +60,8 @@ impl TimelinesSet {
|
||||
}
|
||||
}
|
||||
|
||||
/// Guard is used to add or remove timeline from the set.
|
||||
/// Guard is used to add or remove timelines from the set.
|
||||
///
|
||||
/// If the timeline is present in the set, it will be removed from it on drop.
|
||||
/// Note: do not use more than one guard for the same timeline, it caches the presence state.
|
||||
/// It is designed to be used in the manager task only.
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
//! Safekeeper timeline has a background task which is subscribed to `commit_lsn`
|
||||
//! and `flush_lsn` updates. After the partial segment was updated (`flush_lsn`
|
||||
//! was changed), the segment will be uploaded to S3 in about 15 minutes.
|
||||
//! and `flush_lsn` updates.
|
||||
//!
|
||||
//! After the partial segment was updated (`flush_lsn` was changed), the segment
|
||||
//! will be uploaded to S3 within the configured `partial_backup_timeout`.
|
||||
//!
|
||||
//! The filename format for partial segments is
|
||||
//! `Segment_Term_Flush_Commit_skNN.partial`, where:
|
||||
|
||||
@@ -17,6 +17,7 @@ use crate::SafeKeeperConf;
|
||||
use postgres_backend::{AuthType, PostgresBackend};
|
||||
|
||||
/// Accept incoming TCP connections and spawn them into a background thread.
|
||||
///
|
||||
/// allowed_auth_scope is either SafekeeperData (wide JWT tokens giving access
|
||||
/// to any tenant are allowed) or Tenant (only tokens giving access to specific
|
||||
/// tenant are allowed). Doesn't matter if auth is disabled in conf.
|
||||
|
||||
@@ -98,7 +98,19 @@ pub struct PhysicalStorage {
|
||||
/// Also can be ahead of record_lsn, if happen to be in the middle of a WAL record.
|
||||
write_lsn: Lsn,
|
||||
|
||||
/// The LSN of the last WAL record written to disk. Still can be not fully flushed.
|
||||
/// The LSN of the last WAL record written to disk. Still can be not fully
|
||||
/// flushed.
|
||||
///
|
||||
/// Note: Normally it (and flush_record_lsn) is <= write_lsn, but after xlog
|
||||
/// switch ingest the reverse is true because we don't bump write_lsn up to
|
||||
/// the next segment: WAL stream from the compute doesn't have the gap and
|
||||
/// for simplicity / as a sanity check we disallow any non-sequential
|
||||
/// writes, so write zeros as is.
|
||||
///
|
||||
/// Similar effect is in theory possible due to LSN alignment: if record
|
||||
/// ends at *2, decoder will report end lsn as *8 even though we haven't
|
||||
/// written these zeros yet. In practice compute likely never sends
|
||||
/// non-aligned chunks of data.
|
||||
write_record_lsn: Lsn,
|
||||
|
||||
/// The LSN of the last WAL record flushed to disk.
|
||||
@@ -167,8 +179,7 @@ impl PhysicalStorage {
|
||||
)
|
||||
};
|
||||
|
||||
// TODO: do we really know that write_lsn is fully flushed to disk?
|
||||
// If not, maybe it's better to call fsync() here to be sure?
|
||||
// note: this assumes we fsync'ed whole datadir on start.
|
||||
let flush_lsn = write_lsn;
|
||||
|
||||
debug!(
|
||||
@@ -440,11 +451,12 @@ impl Storage for PhysicalStorage {
|
||||
.with_label_values(&["truncate_wal"])
|
||||
.start_timer();
|
||||
|
||||
// Streaming must not create a hole, so truncate cannot be called on non-written lsn
|
||||
if self.write_lsn != Lsn(0) && end_pos > self.write_lsn {
|
||||
// Streaming must not create a hole, so truncate cannot be called on
|
||||
// non-written lsn.
|
||||
if self.write_record_lsn != Lsn(0) && end_pos > self.write_record_lsn {
|
||||
bail!(
|
||||
"truncate_wal called on non-written WAL, write_lsn={}, end_pos={}",
|
||||
self.write_lsn,
|
||||
"truncate_wal called on non-written WAL, write_record_lsn={}, end_pos={}",
|
||||
self.write_record_lsn,
|
||||
end_pos
|
||||
);
|
||||
}
|
||||
|
||||
@@ -134,7 +134,7 @@ class LLVM:
|
||||
# Show a user-friendly warning
|
||||
raise Exception(' '.join([
|
||||
f"It appears that you don't have `{name}` installed.",
|
||||
"Please execute `rustup component add llvm-tools-preview`,",
|
||||
"Please execute `rustup component add llvm-tools`,",
|
||||
"or install it via your package manager of choice.",
|
||||
"LLVM tools should be the same version as LLVM in `rustc --version --verbose`.",
|
||||
]))
|
||||
@@ -518,7 +518,7 @@ def main() -> None:
|
||||
example = f"""
|
||||
prerequisites:
|
||||
# alternatively, install a system package for `llvm-tools`
|
||||
rustup component add llvm-tools-preview
|
||||
rustup component add llvm-tools
|
||||
|
||||
self-contained example:
|
||||
{app} run make
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
ALTER TABLE nodes ALTER availability_zone_id DROP NOT NULL;
|
||||
@@ -0,0 +1 @@
|
||||
ALTER TABLE nodes ALTER availability_zone_id SET NOT NULL;
|
||||
@@ -0,0 +1 @@
|
||||
ALTER TABLE tenant_shards DROP preferred_az_id;
|
||||
@@ -0,0 +1 @@
|
||||
ALTER TABLE tenant_shards ADD preferred_az_id VARCHAR;
|
||||
@@ -14,14 +14,14 @@ use metrics::{BuildInfo, NeonMetrics};
|
||||
use pageserver_api::controller_api::{
|
||||
MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse,
|
||||
MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse,
|
||||
TenantCreateRequest,
|
||||
ShardsPreferredAzsRequest, TenantCreateRequest,
|
||||
};
|
||||
use pageserver_api::models::{
|
||||
TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
|
||||
TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest,
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api;
|
||||
use pageserver_client::{mgmt_api, BlockUnblock};
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -369,6 +369,23 @@ async fn handle_tenant_timeline_detach_ancestor(
|
||||
json_response(StatusCode::OK, res)
|
||||
}
|
||||
|
||||
async fn handle_tenant_timeline_block_unblock_gc(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
dir: BlockUnblock,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
|
||||
let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
|
||||
|
||||
service
|
||||
.tenant_timeline_block_unblock_gc(tenant_id, timeline_id, dir)
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn handle_tenant_timeline_passthrough(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
@@ -539,6 +556,17 @@ async fn handle_node_status(req: Request<Body>) -> Result<Response<Body>, ApiErr
|
||||
json_response(StatusCode::OK, node_status)
|
||||
}
|
||||
|
||||
async fn handle_node_shards(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let state = get_state(&req);
|
||||
let node_id: NodeId = parse_request_param(&req, "node_id")?;
|
||||
|
||||
let node_status = state.service.get_node_shards(node_id).await?;
|
||||
|
||||
json_response(StatusCode::OK, node_status)
|
||||
}
|
||||
|
||||
async fn handle_get_leader(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
@@ -688,6 +716,18 @@ async fn handle_tenant_update_policy(mut req: Request<Body>) -> Result<Response<
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_update_preferred_azs(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let azs_req = json_request::<ShardsPreferredAzsRequest>(&mut req).await?;
|
||||
let state = get_state(&req);
|
||||
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
state.service.update_shards_preferred_azs(azs_req).await?,
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_step_down(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
@@ -1097,6 +1137,13 @@ pub fn make_router(
|
||||
.get("/control/v1/node/:node_id", |r| {
|
||||
named_request_span(r, handle_node_status, RequestName("control_v1_node_status"))
|
||||
})
|
||||
.get("/control/v1/node/:node_id/shards", |r| {
|
||||
named_request_span(
|
||||
r,
|
||||
handle_node_shards,
|
||||
RequestName("control_v1_node_describe"),
|
||||
)
|
||||
})
|
||||
.get("/control/v1/leader", |r| {
|
||||
named_request_span(r, handle_get_leader, RequestName("control_v1_get_leader"))
|
||||
})
|
||||
@@ -1174,6 +1221,13 @@ pub fn make_router(
|
||||
RequestName("control_v1_tenant_policy"),
|
||||
)
|
||||
})
|
||||
.put("/control/v1/preferred_azs", |r| {
|
||||
named_request_span(
|
||||
r,
|
||||
handle_update_preferred_azs,
|
||||
RequestName("control_v1_preferred_azs"),
|
||||
)
|
||||
})
|
||||
.put("/control/v1/step_down", |r| {
|
||||
named_request_span(r, handle_step_down, RequestName("control_v1_step_down"))
|
||||
})
|
||||
@@ -1255,6 +1309,26 @@ pub fn make_router(
|
||||
)
|
||||
},
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/block_gc",
|
||||
|r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
|s, r| handle_tenant_timeline_block_unblock_gc(s, r, BlockUnblock::Block),
|
||||
RequestName("v1_tenant_timeline_block_unblock_gc"),
|
||||
)
|
||||
},
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/unblock_gc",
|
||||
|r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
|s, r| handle_tenant_timeline_block_unblock_gc(s, r, BlockUnblock::Unblock),
|
||||
RequestName("v1_tenant_timeline_block_unblock_gc"),
|
||||
)
|
||||
},
|
||||
)
|
||||
// Tenant detail GET passthrough to shard zero:
|
||||
.get("/v1/tenant/:tenant_id", |r| {
|
||||
tenant_service_handler(
|
||||
|
||||
@@ -36,7 +36,7 @@ pub(crate) struct Node {
|
||||
listen_pg_addr: String,
|
||||
listen_pg_port: u16,
|
||||
|
||||
availability_zone_id: Option<String>,
|
||||
availability_zone_id: String,
|
||||
|
||||
// This cancellation token means "stop any RPCs in flight to this node, and don't start
|
||||
// any more". It is not related to process shutdown.
|
||||
@@ -63,8 +63,9 @@ impl Node {
|
||||
self.id
|
||||
}
|
||||
|
||||
pub(crate) fn get_availability_zone_id(&self) -> Option<&str> {
|
||||
self.availability_zone_id.as_deref()
|
||||
#[allow(unused)]
|
||||
pub(crate) fn get_availability_zone_id(&self) -> &str {
|
||||
self.availability_zone_id.as_str()
|
||||
}
|
||||
|
||||
pub(crate) fn get_scheduling(&self) -> NodeSchedulingPolicy {
|
||||
@@ -78,22 +79,12 @@ impl Node {
|
||||
/// Does this registration request match `self`? This is used when deciding whether a registration
|
||||
/// request should be allowed to update an existing record with the same node ID.
|
||||
pub(crate) fn registration_match(&self, register_req: &NodeRegisterRequest) -> bool {
|
||||
let az_ids_match = {
|
||||
match (
|
||||
self.availability_zone_id.as_deref(),
|
||||
register_req.availability_zone_id.as_deref(),
|
||||
) {
|
||||
(Some(current_az), Some(register_req_az)) => current_az == register_req_az,
|
||||
_ => true,
|
||||
}
|
||||
};
|
||||
|
||||
az_ids_match
|
||||
&& self.id == register_req.node_id
|
||||
self.id == register_req.node_id
|
||||
&& self.listen_http_addr == register_req.listen_http_addr
|
||||
&& self.listen_http_port == register_req.listen_http_port
|
||||
&& self.listen_pg_addr == register_req.listen_pg_addr
|
||||
&& self.listen_pg_port == register_req.listen_pg_port
|
||||
&& self.availability_zone_id == register_req.availability_zone_id
|
||||
}
|
||||
|
||||
/// For a shard located on this node, populate a response object
|
||||
@@ -190,7 +181,7 @@ impl Node {
|
||||
listen_http_port: u16,
|
||||
listen_pg_addr: String,
|
||||
listen_pg_port: u16,
|
||||
availability_zone_id: Option<String>,
|
||||
availability_zone_id: String,
|
||||
) -> Self {
|
||||
Self {
|
||||
id,
|
||||
|
||||
@@ -7,7 +7,10 @@ use pageserver_api::{
|
||||
},
|
||||
shard::TenantShardId,
|
||||
};
|
||||
use pageserver_client::mgmt_api::{Client, Result};
|
||||
use pageserver_client::{
|
||||
mgmt_api::{Client, Result},
|
||||
BlockUnblock,
|
||||
};
|
||||
use reqwest::StatusCode;
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
|
||||
@@ -258,6 +261,24 @@ impl PageserverClient {
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) async fn timeline_block_unblock_gc(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
dir: BlockUnblock,
|
||||
) -> Result<()> {
|
||||
// measuring these makes no sense because we synchronize with the gc loop and remote
|
||||
// storage on block_gc so there should be huge outliers
|
||||
measured_request!(
|
||||
"timeline_block_unblock_gc",
|
||||
crate::metrics::Method::Post,
|
||||
&self.node_id_label,
|
||||
self.inner
|
||||
.timeline_block_unblock_gc(tenant_shard_id, timeline_id, dir)
|
||||
.await
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) async fn get_utilization(&self) -> Result<PageserverUtilization> {
|
||||
measured_request!(
|
||||
"utilization",
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user