Merge branch 'main' into immutable_bst_layer_map

This commit is contained in:
Bojan Serafimov
2023-01-10 11:02:45 -05:00
93 changed files with 2982 additions and 2123 deletions

View File

@@ -0,0 +1,10 @@
## Describe your changes
## Issue ticket number and link
## Checklist before requesting a review
- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.

View File

@@ -27,6 +27,8 @@ storage:
ansible_host: i-0c3e70929edb5d691
pageserver-1.us-east-2.aws.neon.build:
ansible_host: i-0565a8b4008aa3f40
pageserver-2.us-east-2.aws.neon.build:
ansible_host: i-01e31cdf7e970586a
safekeepers:
hosts:

View File

@@ -9,6 +9,7 @@ settings:
authEndpoint: "http://console-staging.local/management/api/v2"
domain: "*.eu-west-1.aws.neon.build"
sentryEnvironment: "development"
wssPort: 8443
# -- Additional labels for neon-proxy pods
podLabels:
@@ -23,6 +24,7 @@ exposedService:
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: eu-west-1.aws.neon.build
httpsPort: 443
#metrics:
# enabled: true

View File

@@ -9,6 +9,7 @@ settings:
authEndpoint: "http://console-staging.local/management/api/v2"
domain: "*.cloud.stage.neon.tech"
sentryEnvironment: "development"
wssPort: 8443
# -- Additional labels for neon-proxy pods
podLabels:
@@ -23,6 +24,7 @@ exposedService:
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: neon-proxy-scram-legacy.beta.us-east-2.aws.neon.build
httpsPort: 443
#metrics:
# enabled: true

View File

@@ -9,6 +9,7 @@ settings:
authEndpoint: "http://console-staging.local/management/api/v2"
domain: "*.us-east-2.aws.neon.build"
sentryEnvironment: "development"
wssPort: 8443
# -- Additional labels for neon-proxy pods
podLabels:
@@ -23,6 +24,7 @@ exposedService:
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.build
httpsPort: 443
#metrics:
# enabled: true

View File

@@ -9,6 +9,7 @@ settings:
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.ap-southeast-1.aws.neon.tech"
sentryEnvironment: "production"
wssPort: 8443
# -- Additional labels for neon-proxy pods
podLabels:
@@ -23,6 +24,7 @@ exposedService:
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: ap-southeast-1.aws.neon.tech
httpsPort: 443
#metrics:
# enabled: true

View File

@@ -9,6 +9,7 @@ settings:
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.eu-central-1.aws.neon.tech"
sentryEnvironment: "production"
wssPort: 8443
# -- Additional labels for neon-proxy pods
podLabels:
@@ -23,6 +24,7 @@ exposedService:
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: eu-central-1.aws.neon.tech
httpsPort: 443
#metrics:
# enabled: true

View File

@@ -9,6 +9,7 @@ settings:
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.us-east-2.aws.neon.tech"
sentryEnvironment: "production"
wssPort: 8443
# -- Additional labels for neon-proxy pods
podLabels:
@@ -23,6 +24,7 @@ exposedService:
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.tech
httpsPort: 443
#metrics:
# enabled: true

View File

@@ -9,6 +9,7 @@ settings:
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.us-west-2.aws.neon.tech"
sentryEnvironment: "production"
wssPort: 8443
# -- Additional labels for neon-proxy pods
podLabels:
@@ -23,6 +24,7 @@ exposedService:
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: us-west-2.aws.neon.tech
httpsPort: 443
#metrics:
# enabled: true

View File

@@ -3,6 +3,7 @@ settings:
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.cloud.neon.tech"
sentryEnvironment: "production"
wssPort: 8443
podLabels:
zenith_service: proxy-scram
@@ -16,6 +17,7 @@ exposedService:
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: '*.cloud.neon.tech'
httpsPort: 443
metrics:
enabled: true

View File

@@ -111,6 +111,7 @@ jobs:
# Some of our rust modules use FFI and need those to be checked
- name: Get postgres headers
run: make postgres-headers -j$(nproc)
- name: Run cargo clippy
run: ./run_clippy.sh
@@ -126,6 +127,11 @@ jobs:
cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date
cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack
# https://github.com/EmbarkStudios/cargo-deny
- name: Check rust licenses/bans/advisories/sources
if: ${{ !cancelled() }}
run: cargo deny check
build-neon:
runs-on: [ self-hosted, dev, x64 ]
container:
@@ -177,13 +183,12 @@ jobs:
# corresponding Cargo.toml files for their descriptions.
- name: Set env variables
run: |
CARGO_FEATURES="--features testing"
if [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
CARGO_FEATURES="--features testing"
CARGO_FLAGS="--locked $CARGO_FEATURES"
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=""
CARGO_FEATURES="--features testing,profiling"
CARGO_FLAGS="--locked --release $CARGO_FEATURES"
fi
echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
@@ -789,6 +794,8 @@ jobs:
strategy:
matrix:
include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
environment:
name: prod-old
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -834,7 +841,9 @@ jobs:
shell: bash
strategy:
matrix:
target_region: [ us-east-2 ]
target_region: [ eu-west-1, us-east-2 ]
environment:
name: dev-${{ matrix.target_region }}
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -906,6 +915,8 @@ jobs:
strategy:
matrix:
target_region: [ us-east-2, us-west-2, eu-central-1, ap-southeast-1 ]
environment:
name: prod-${{ matrix.target_region }}
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -945,6 +956,8 @@ jobs:
strategy:
matrix:
include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
environment:
name: prod-old
env:
KUBECONFIG: .kubeconfig
steps:
@@ -970,8 +983,8 @@ jobs:
- name: Re-deploy proxy
run: |
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
deploy-storage-broker:
name: deploy storage broker on old staging and old prod
@@ -988,6 +1001,8 @@ jobs:
strategy:
matrix:
include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
environment:
name: prod-old
env:
KUBECONFIG: .kubeconfig
steps:
@@ -1036,6 +1051,8 @@ jobs:
target_cluster: dev-eu-west-1-zeta
deploy_link_proxy: false
deploy_legacy_scram_proxy: false
environment:
name: dev-${{ matrix.target_region }}
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -1051,19 +1068,19 @@ jobs:
- name: Re-deploy scram proxy
run: |
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
- name: Re-deploy link proxy
if: matrix.deploy_link_proxy
run: |
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
- name: Re-deploy legacy scram proxy
if: matrix.deploy_legacy_scram_proxy
run: |
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
deploy-storage-broker-dev-new:
runs-on: [ self-hosted, dev, x64 ]
@@ -1083,6 +1100,8 @@ jobs:
target_cluster: dev-us-east-2-beta
- target_region: eu-west-1
target_cluster: dev-eu-west-1-zeta
environment:
name: dev-${{ matrix.target_region }}
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -1121,6 +1140,8 @@ jobs:
target_cluster: prod-eu-central-1-gamma
- target_region: ap-southeast-1
target_cluster: prod-ap-southeast-1-epsilon
environment:
name: prod-${{ matrix.target_region }}
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -1136,7 +1157,7 @@ jobs:
- name: Re-deploy proxy
run: |
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
deploy-storage-broker-prod-new:
runs-on: prod
@@ -1160,6 +1181,8 @@ jobs:
target_cluster: prod-eu-central-1-gamma
- target_region: ap-southeast-1
target_cluster: prod-ap-southeast-1-epsilon
environment:
name: prod-${{ matrix.target_region }}
steps:
- name: Checkout
uses: actions/checkout@v3

575
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -31,7 +31,8 @@ libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler
* On Fedora, these packages are needed:
```bash
dnf install flex bison readline-devel zlib-devel openssl-devel \
libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler
libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
protobuf-devel
```
2. [Install Rust](https://www.rust-lang.org/tools/install)
@@ -117,11 +118,8 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r
# Later that would be responsibility of a package install script
> ./target/debug/neon_local init
Starting pageserver at '127.0.0.1:64000' in '.neon'.
pageserver started, pid: 2545906
Successfully initialized timeline de200bd42b49cc1814412c7e592dd6e9
Stopped pageserver 1 process with pid 2545906
# start pageserver and safekeeper
# start pageserver, safekeeper, and broker for their intercommunication
> ./target/debug/neon_local start
Starting neon broker at 127.0.0.1:50051
storage_broker started, pid: 2918372
@@ -130,6 +128,12 @@ pageserver started, pid: 2918386
Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'.
safekeeper 1 started, pid: 2918437
# create initial tenant and use it as a default for every future neon_local invocation
> ./target/debug/neon_local tenant create --set-default
tenant 9ef87a5bf0d92544f6fafeeb3239695c successfully created on the pageserver
Created an initial timeline 'de200bd42b49cc1814412c7e592dd6e9' at Lsn 0/16B5A50 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c
Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one
# start postgres compute node
> ./target/debug/neon_local pg start main
Starting new postgres (v14) main on timeline de200bd42b49cc1814412c7e592dd6e9 ...

View File

@@ -2,6 +2,7 @@
name = "compute_tools"
version = "0.1.0"
edition = "2021"
license = "Apache-2.0"
[dependencies]
anyhow = "1.0"

View File

@@ -52,10 +52,16 @@ fn watch_compute_activity(compute: &ComputeNode) {
let mut idle_backs: Vec<DateTime<Utc>> = vec![];
for b in backs.into_iter() {
let state: String = b.get("state");
let change: String = b.get("state_change");
let state: String = match b.try_get("state") {
Ok(state) => state,
Err(_) => continue,
};
if state == "idle" {
let change: String = match b.try_get("state_change") {
Ok(state_change) => state_change,
Err(_) => continue,
};
let change = DateTime::parse_from_rfc3339(&change);
match change {
Ok(t) => idle_backs.push(t.with_timezone(&Utc)),

View File

@@ -1,5 +1,6 @@
use std::path::Path;
use std::str::FromStr;
use std::time::Instant;
use anyhow::Result;
use log::{info, log_enabled, warn, Level};
@@ -197,22 +198,18 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
/// Reassign all dependent objects and delete requested roles.
pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<()> {
let spec = &node.spec;
// First, reassign all dependent objects to db owners.
if let Some(ops) = &spec.delta_operations {
if let Some(ops) = &node.spec.delta_operations {
// First, reassign all dependent objects to db owners.
info!("reassigning dependent objects of to-be-deleted roles");
for op in ops {
if op.action == "delete_role" {
reassign_owned_objects(node, &op.name)?;
}
}
}
// Second, proceed with role deletions.
let mut xact = client.transaction()?;
if let Some(ops) = &spec.delta_operations {
// Second, proceed with role deletions.
info!("processing role deletions");
let mut xact = client.transaction()?;
for op in ops {
// We do not check either role exists or not,
// Postgres will take care of it for us
@@ -223,6 +220,7 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<
xact.execute(query.as_str(), &[])?;
}
}
xact.commit()?;
}
Ok(())
@@ -317,6 +315,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
// XXX: with a limited number of databases it is fine, but consider making it a HashMap
let pg_db = existing_dbs.iter().find(|r| r.name == *name);
let start_time = Instant::now();
if let Some(r) = pg_db {
// XXX: db owner name is returned as quoted string from Postgres,
// when quoting is needed.
@@ -335,6 +334,8 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
info_print!(" -> update");
client.execute(query.as_str(), &[])?;
let elapsed = start_time.elapsed().as_millis();
info_print!(" ({} ms)", elapsed);
}
} else {
let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote());
@@ -342,6 +343,9 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
query.push_str(&db.to_pg_options());
client.execute(query.as_str(), &[])?;
let elapsed = start_time.elapsed().as_millis();
info_print!(" ({} ms)", elapsed);
}
info_print!("\n");

View File

@@ -2,6 +2,7 @@
name = "control_plane"
version = "0.1.0"
edition = "2021"
license = "Apache-2.0"
[dependencies]
anyhow = "1.0"

View File

@@ -136,22 +136,6 @@ where
anyhow::bail!("{process_name} did not start in {RETRY_UNTIL_SECS} seconds");
}
/// Send SIGTERM to child process
pub fn send_stop_child_process(child: &std::process::Child) -> anyhow::Result<()> {
let pid = child.id();
match kill(
nix::unistd::Pid::from_raw(pid.try_into().unwrap()),
Signal::SIGTERM,
) {
Ok(()) => Ok(()),
Err(Errno::ESRCH) => {
println!("child process with pid {pid} does not exist");
Ok(())
}
Err(e) => anyhow::bail!("Failed to send signal to child process with pid {pid}: {e}"),
}
}
/// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> anyhow::Result<()> {
let pid = match pid_file::read(pid_file)

View File

@@ -263,7 +263,7 @@ fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::R
} else if let Some(default_id) = env.default_tenant_id {
Ok(default_id)
} else {
bail!("No tenant id. Use --tenant-id, or set 'default_tenant_id' in the config file");
anyhow::bail!("No tenant id. Use --tenant-id, or set a default tenant");
}
}
@@ -284,8 +284,6 @@ fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId
}
fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
let initial_timeline_id_arg = parse_timeline_id(init_match)?;
// Create config file
let toml_file: String = if let Some(config_path) = init_match.get_one::<PathBuf>("config") {
// load and parse the file
@@ -309,30 +307,16 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
env.init(pg_version)
.context("Failed to initialize neon repository")?;
let initial_tenant_id = env
.default_tenant_id
.expect("default_tenant_id should be generated by the `env.init()` call above");
// Initialize pageserver, create initial tenant and timeline.
let pageserver = PageServerNode::from_env(&env);
let initial_timeline_id = pageserver
.initialize(
Some(initial_tenant_id),
initial_timeline_id_arg,
&pageserver_config_overrides(init_match),
pg_version,
)
pageserver
.initialize(&pageserver_config_overrides(init_match))
.unwrap_or_else(|e| {
eprintln!("pageserver init failed: {e:?}");
exit(1);
});
env.register_branch_mapping(
DEFAULT_BRANCH_NAME.to_owned(),
initial_tenant_id,
initial_timeline_id,
)?;
Ok(env)
}
@@ -388,6 +372,17 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
println!(
"Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {new_tenant_id}",
);
if create_match.get_flag("set-default") {
println!("Setting tenant {new_tenant_id} as a default one");
env.default_tenant_id = Some(new_tenant_id);
}
}
Some(("set-default", set_default_match)) => {
let tenant_id =
parse_tenant_id(set_default_match)?.context("No tenant id specified")?;
println!("Setting tenant {tenant_id} as a default one");
env.default_tenant_id = Some(tenant_id);
}
Some(("config", create_match)) => {
let tenant_id = get_tenant_id(create_match, env)?;
@@ -928,9 +923,8 @@ fn cli() -> Command {
.version(GIT_VERSION)
.subcommand(
Command::new("init")
.about("Initialize a new Neon repository")
.about("Initialize a new Neon repository, preparing configs for services to start with")
.arg(pageserver_config_args.clone())
.arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
.arg(
Arg::new("config")
.long("config")
@@ -992,11 +986,14 @@ fn cli() -> Command {
.arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))
.arg(pg_version_arg.clone())
.arg(Arg::new("set-default").long("set-default").action(ArgAction::SetTrue).required(false)
.help("Use this tenant in future CLI commands where tenant_id is needed, but not specified"))
)
.subcommand(Command::new("set-default").arg(tenant_id_arg.clone().required(true))
.about("Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified"))
.subcommand(Command::new("config")
.arg(tenant_id_arg.clone())
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))
)
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
)
.subcommand(
Command::new("pageserver")

View File

@@ -296,11 +296,6 @@ impl LocalEnv {
env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
}
// If no initial tenant ID was given, generate it.
if env.default_tenant_id.is_none() {
env.default_tenant_id = Some(TenantId::generate());
}
env.base_data_dir = base_path();
Ok(env)

View File

@@ -7,7 +7,7 @@ use std::path::PathBuf;
use std::process::{Child, Command};
use std::{io, result};
use anyhow::{bail, ensure, Context};
use anyhow::{bail, Context};
use pageserver_api::models::{
TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo,
};
@@ -130,83 +130,15 @@ impl PageServerNode {
overrides
}
/// Initializes a pageserver node by creating its config with the overrides provided,
/// and creating an initial tenant and timeline afterwards.
pub fn initialize(
&self,
create_tenant: Option<TenantId>,
initial_timeline_id: Option<TimelineId>,
config_overrides: &[&str],
pg_version: u32,
) -> anyhow::Result<TimelineId> {
/// Initializes a pageserver node by creating its config with the overrides provided.
pub fn initialize(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
// First, run `pageserver --init` and wait for it to write a config into FS and exit.
self.pageserver_init(config_overrides).with_context(|| {
format!(
"Failed to run init for pageserver node {}",
self.env.pageserver.id,
)
})?;
// Then, briefly start it fully to run HTTP commands on it,
// to create initial tenant and timeline.
// We disable the remote storage, since we stop pageserver right after the timeline creation,
// hence most of the uploads will either aborted or not started: no point to start them at all.
let disabled_remote_storage_override = "remote_storage={}";
let mut pageserver_process = self
.start_node(
&[disabled_remote_storage_override],
// Previous overrides will be taken from the config created before, don't overwrite them.
false,
)
.with_context(|| {
format!(
"Failed to start a process for pageserver node {}",
self.env.pageserver.id,
)
})?;
let init_result = self
.try_init_timeline(create_tenant, initial_timeline_id, pg_version)
.context("Failed to create initial tenant and timeline for pageserver");
match &init_result {
Ok(initial_timeline_id) => {
println!("Successfully initialized timeline {initial_timeline_id}")
}
Err(e) => eprintln!("{e:#}"),
}
background_process::send_stop_child_process(&pageserver_process)?;
let exit_code = pageserver_process.wait()?;
ensure!(
exit_code.success(),
format!(
"pageserver init failed with exit code {:?}",
exit_code.code()
)
);
println!(
"Stopped pageserver {} process with pid {}",
self.env.pageserver.id,
pageserver_process.id(),
);
init_result
}
fn try_init_timeline(
&self,
new_tenant_id: Option<TenantId>,
new_timeline_id: Option<TimelineId>,
pg_version: u32,
) -> anyhow::Result<TimelineId> {
let initial_tenant_id = self.tenant_create(new_tenant_id, HashMap::new())?;
let initial_timeline_info = self.timeline_create(
initial_tenant_id,
new_timeline_id,
None,
None,
Some(pg_version),
)?;
Ok(initial_timeline_info.timeline_id)
})
}
pub fn repo_path(&self) -> PathBuf {

90
deny.toml Normal file
View File

@@ -0,0 +1,90 @@
# This file was auto-generated using `cargo deny init`.
# cargo-deny is a cargo plugin that lets you lint your project's
# dependency graph to ensure all your dependencies conform
# to your expectations and requirements.
# Root options
targets = []
all-features = false
no-default-features = false
feature-depth = 1
# This section is considered when running `cargo deny check advisories`
# More documentation for the advisories section can be found here:
# https://embarkstudios.github.io/cargo-deny/checks/advisories/cfg.html
[advisories]
db-urls = ["https://github.com/rustsec/advisory-db"]
vulnerability = "deny"
unmaintained = "warn"
yanked = "warn"
notice = "warn"
ignore = []
# This section is considered when running `cargo deny check licenses`
# More documentation for the licenses section can be found here:
# https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html
[licenses]
unlicensed = "deny"
allow = [
"Apache-2.0",
"Artistic-2.0",
"BSD-2-Clause",
"BSD-3-Clause",
"ISC",
"MIT",
"MPL-2.0",
"OpenSSL",
"Unicode-DFS-2016",
]
deny = []
copyleft = "warn"
allow-osi-fsf-free = "neither"
default = "deny"
confidence-threshold = 0.8
exceptions = [
# Zlib license has some restrictions if we decide to change sth
{ allow = ["Zlib"], name = "const_format_proc_macros", version = "*" },
{ allow = ["Zlib"], name = "const_format", version = "*" },
]
[[licenses.clarify]]
name = "ring"
version = "*"
expression = "MIT AND ISC AND OpenSSL"
license-files = [
{ path = "LICENSE", hash = 0xbd0eed23 },
]
[licenses.private]
ignore = true
registries = []
# This section is considered when running `cargo deny check bans`.
# More documentation about the 'bans' section can be found here:
# https://embarkstudios.github.io/cargo-deny/checks/bans/cfg.html
[bans]
multiple-versions = "warn"
wildcards = "allow"
highlight = "all"
workspace-default-features = "allow"
external-default-features = "allow"
allow = []
deny = []
skip = []
skip-tree = []
# This section is considered when running `cargo deny check sources`.
# More documentation about the 'sources' section can be found here:
# https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html
[sources]
unknown-registry = "warn"
unknown-git = "warn"
allow-registry = ["https://github.com/rust-lang/crates.io-index"]
allow-git = []
[sources.allow-org]
github = [
"neondatabase",
]
gitlab = []
bitbucket = []

View File

@@ -2,6 +2,7 @@
name = "metrics"
version = "0.1.0"
edition = "2021"
license = "Apache-2.0"
[dependencies]
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency

View File

@@ -2,6 +2,7 @@
name = "pageserver_api"
version = "0.1.0"
edition = "2021"
license = "Apache-2.0"
[dependencies]
serde = { version = "1.0", features = ["derive"] }

View File

@@ -2,6 +2,7 @@
name = "postgres_connection"
version = "0.1.0"
edition = "2021"
license = "Apache-2.0"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

View File

@@ -2,6 +2,7 @@
name = "postgres_ffi"
version = "0.1.0"
edition = "2021"
license = "Apache-2.0"
[dependencies]
rand = "0.8.3"

View File

@@ -2,7 +2,7 @@
name = "wal_craft"
version = "0.1.0"
edition = "2021"
license = "Apache-2.0"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]

View File

@@ -2,6 +2,7 @@
name = "pq_proto"
version = "0.1.0"
edition = "2021"
license = "Apache-2.0"
[dependencies]
anyhow = "1.0"
@@ -12,5 +13,6 @@ rand = "0.8.3"
serde = { version = "1.0", features = ["derive"] }
tokio = { version = "1.17", features = ["macros"] }
tracing = "0.1"
thiserror = "1.0"
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -5,7 +5,7 @@
// Tools for calling certain async methods in sync contexts.
pub mod sync;
use anyhow::{bail, ensure, Context, Result};
use anyhow::{ensure, Context, Result};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use postgres_protocol::PG_EPOCH;
use serde::{Deserialize, Serialize};
@@ -194,6 +194,35 @@ macro_rules! retry_read {
};
}
/// An error occured during connection being open.
#[derive(thiserror::Error, Debug)]
pub enum ConnectionError {
/// IO error during writing to or reading from the connection socket.
#[error("Socket IO error: {0}")]
Socket(std::io::Error),
/// Invalid packet was received from client
#[error("Protocol error: {0}")]
Protocol(String),
/// Failed to parse a protocol mesage
#[error("Message parse error: {0}")]
MessageParse(anyhow::Error),
}
impl From<anyhow::Error> for ConnectionError {
fn from(e: anyhow::Error) -> Self {
Self::MessageParse(e)
}
}
impl ConnectionError {
pub fn into_io_error(self) -> io::Error {
match self {
ConnectionError::Socket(io) => io,
other => io::Error::new(io::ErrorKind::Other, other.to_string()),
}
}
}
impl FeMessage {
/// Read one message from the stream.
/// This function returns `Ok(None)` in case of EOF.
@@ -216,7 +245,9 @@ impl FeMessage {
/// }
/// ```
#[inline(never)]
pub fn read(stream: &mut (impl io::Read + Unpin)) -> anyhow::Result<Option<FeMessage>> {
pub fn read(
stream: &mut (impl io::Read + Unpin),
) -> Result<Option<FeMessage>, ConnectionError> {
Self::read_fut(&mut AsyncishRead(stream)).wait()
}
@@ -224,7 +255,7 @@ impl FeMessage {
/// See documentation for `Self::read`.
pub fn read_fut<Reader>(
stream: &mut Reader,
) -> SyncFuture<Reader, impl Future<Output = anyhow::Result<Option<FeMessage>>> + '_>
) -> SyncFuture<Reader, impl Future<Output = Result<Option<FeMessage>, ConnectionError>> + '_>
where
Reader: tokio::io::AsyncRead + Unpin,
{
@@ -238,17 +269,21 @@ impl FeMessage {
let tag = match retry_read!(stream.read_u8().await) {
Ok(b) => b,
Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None),
Err(e) => return Err(e.into()),
Err(e) => return Err(ConnectionError::Socket(e)),
};
// The message length includes itself, so it better be at least 4.
let len = retry_read!(stream.read_u32().await)?
let len = retry_read!(stream.read_u32().await)
.map_err(ConnectionError::Socket)?
.checked_sub(4)
.context("invalid message length")?;
.ok_or_else(|| ConnectionError::Protocol("invalid message length".to_string()))?;
let body = {
let mut buffer = vec![0u8; len as usize];
stream.read_exact(&mut buffer).await?;
stream
.read_exact(&mut buffer)
.await
.map_err(ConnectionError::Socket)?;
Bytes::from(buffer)
};
@@ -265,7 +300,11 @@ impl FeMessage {
b'c' => Ok(Some(FeMessage::CopyDone)),
b'f' => Ok(Some(FeMessage::CopyFail)),
b'p' => Ok(Some(FeMessage::PasswordMessage(body))),
tag => bail!("unknown message tag: {},'{:?}'", tag, body),
tag => {
return Err(ConnectionError::Protocol(format!(
"unknown message tag: {tag},'{body:?}'"
)))
}
}
})
}
@@ -275,7 +314,9 @@ impl FeStartupPacket {
/// Read startup message from the stream.
// XXX: It's tempting yet undesirable to accept `stream` by value,
// since such a change will cause user-supplied &mut references to be consumed
pub fn read(stream: &mut (impl io::Read + Unpin)) -> anyhow::Result<Option<FeMessage>> {
pub fn read(
stream: &mut (impl io::Read + Unpin),
) -> Result<Option<FeMessage>, ConnectionError> {
Self::read_fut(&mut AsyncishRead(stream)).wait()
}
@@ -284,7 +325,7 @@ impl FeStartupPacket {
// since such a change will cause user-supplied &mut references to be consumed
pub fn read_fut<Reader>(
stream: &mut Reader,
) -> SyncFuture<Reader, impl Future<Output = anyhow::Result<Option<FeMessage>>> + '_>
) -> SyncFuture<Reader, impl Future<Output = Result<Option<FeMessage>, ConnectionError>> + '_>
where
Reader: tokio::io::AsyncRead + Unpin,
{
@@ -302,31 +343,41 @@ impl FeStartupPacket {
let len = match retry_read!(stream.read_u32().await) {
Ok(len) => len as usize,
Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None),
Err(e) => return Err(e.into()),
Err(e) => return Err(ConnectionError::Socket(e)),
};
#[allow(clippy::manual_range_contains)]
if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
bail!("invalid message length");
return Err(ConnectionError::Protocol(format!(
"invalid message length {len}"
)));
}
let request_code = retry_read!(stream.read_u32().await)?;
let request_code =
retry_read!(stream.read_u32().await).map_err(ConnectionError::Socket)?;
// the rest of startup packet are params
let params_len = len - 8;
let mut params_bytes = vec![0u8; params_len];
stream.read_exact(params_bytes.as_mut()).await?;
stream
.read_exact(params_bytes.as_mut())
.await
.map_err(ConnectionError::Socket)?;
// Parse params depending on request code
let req_hi = request_code >> 16;
let req_lo = request_code & ((1 << 16) - 1);
let message = match (req_hi, req_lo) {
(RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => {
ensure!(params_len == 8, "expected 8 bytes for CancelRequest params");
if params_len != 8 {
return Err(ConnectionError::Protocol(
"expected 8 bytes for CancelRequest params".to_string(),
));
}
let mut cursor = Cursor::new(params_bytes);
FeStartupPacket::CancelRequest(CancelKeyData {
backend_pid: cursor.read_i32().await?,
cancel_key: cursor.read_i32().await?,
backend_pid: cursor.read_i32().await.map_err(ConnectionError::Socket)?,
cancel_key: cursor.read_i32().await.map_err(ConnectionError::Socket)?,
})
}
(RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => {
@@ -338,7 +389,9 @@ impl FeStartupPacket {
FeStartupPacket::GssEncRequest
}
(RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => {
bail!("Unrecognized request code {}", unrecognized_code)
return Err(ConnectionError::Protocol(format!(
"Unrecognized request code {unrecognized_code}"
)));
}
// TODO bail if protocol major_version is not 3?
(major_version, minor_version) => {
@@ -346,15 +399,21 @@ impl FeStartupPacket {
// See `postgres: ProcessStartupPacket, build_startup_packet`.
let mut tokens = str::from_utf8(&params_bytes)
.context("StartupMessage params: invalid utf-8")?
.strip_suffix('\0') // drop packet's own null terminator
.context("StartupMessage params: missing null terminator")?
.strip_suffix('\0') // drop packet's own null
.ok_or_else(|| {
ConnectionError::Protocol(
"StartupMessage params: missing null terminator".to_string(),
)
})?
.split_terminator('\0');
let mut params = HashMap::new();
while let Some(name) = tokens.next() {
let value = tokens
.next()
.context("StartupMessage params: key without value")?;
let value = tokens.next().ok_or_else(|| {
ConnectionError::Protocol(
"StartupMessage params: key without value".to_string(),
)
})?;
params.insert(name.to_owned(), value.to_owned());
}
@@ -458,7 +517,7 @@ pub enum BeMessage<'a> {
CloseComplete,
// None means column is NULL
DataRow(&'a [Option<&'a [u8]>]),
ErrorResponse(&'a str),
ErrorResponse(&'a str, Option<&'a [u8; 5]>),
/// Single byte - used in response to SSLRequest/GSSENCRequest.
EncryptionResponse(bool),
NoData,
@@ -606,7 +665,7 @@ fn write_body<R>(buf: &mut BytesMut, f: impl FnOnce(&mut BytesMut) -> R) -> R {
}
/// Safe write of s into buf as cstring (String in the protocol).
fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> Result<(), io::Error> {
fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> io::Result<()> {
let bytes = s.as_ref();
if bytes.contains(&0) {
return Err(io::Error::new(
@@ -626,6 +685,8 @@ fn read_cstr(buf: &mut Bytes) -> anyhow::Result<Bytes> {
Ok(result)
}
pub const SQLSTATE_INTERNAL_ERROR: &[u8; 5] = b"XX000";
impl<'a> BeMessage<'a> {
/// Write message to the given buf.
// Unlike the reading side, we use BytesMut
@@ -765,10 +826,7 @@ impl<'a> BeMessage<'a> {
// First byte of each field represents type of this field. Set just enough fields
// to satisfy rust-postgres client: 'S' -- severity, 'C' -- error, 'M' -- error
// message text.
BeMessage::ErrorResponse(error_msg) => {
// For all the errors set Severity to Error and error code to
// 'internal error'.
BeMessage::ErrorResponse(error_msg, pg_error_code) => {
// 'E' signalizes ErrorResponse messages
buf.put_u8(b'E');
write_body(buf, |buf| {
@@ -776,7 +834,9 @@ impl<'a> BeMessage<'a> {
buf.put_slice(b"ERROR\0");
buf.put_u8(b'C'); // SQLSTATE error code
buf.put_slice(b"CXX000\0");
buf.put_slice(&terminate_code(
pg_error_code.unwrap_or(SQLSTATE_INTERNAL_ERROR),
));
buf.put_u8(b'M'); // the message
write_cstr(error_msg, buf)?;
@@ -799,7 +859,7 @@ impl<'a> BeMessage<'a> {
buf.put_slice(b"NOTICE\0");
buf.put_u8(b'C'); // SQLSTATE error code
buf.put_slice(b"CXX000\0");
buf.put_slice(&terminate_code(SQLSTATE_INTERNAL_ERROR));
buf.put_u8(b'M'); // the message
write_cstr(error_msg.as_bytes(), buf)?;
@@ -1087,3 +1147,12 @@ mod tests {
let _ = FeStartupPacket::read_fut(stream).await;
}
}
fn terminate_code(code: &[u8; 5]) -> [u8; 6] {
let mut terminated = [0; 6];
for (i, &elem) in code.iter().enumerate() {
terminated[i] = elem;
}
terminated
}

View File

@@ -2,6 +2,7 @@
name = "remote_storage"
version = "0.1.0"
edition = "2021"
license = "Apache-2.0"
[dependencies]
anyhow = { version = "1.0", features = ["backtrace"] }

View File

@@ -2,6 +2,7 @@
name = "safekeeper_api"
version = "0.1.0"
edition = "2021"
license = "Apache-2.0"
[dependencies]
serde = { version = "1.0", features = ["derive"] }

View File

@@ -3,6 +3,7 @@ name = "tenant_size_model"
version = "0.1.0"
edition = "2021"
publish = false
license = "Apache-2.0"
[dependencies]
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -2,9 +2,10 @@
name = "utils"
version = "0.1.0"
edition = "2021"
license = "Apache-2.0"
[dependencies]
sentry = "0.29.0"
sentry = { version = "0.29.0", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
async-trait = "0.1"
anyhow = "1.0"
bincode = "1.3"

View File

@@ -3,11 +3,11 @@
//! implementation determining how to process the queries. Currently its API
//! is rather narrow, but we can extend it once required.
use crate::postgres_backend_async::{log_query_error, short_error, QueryError};
use crate::sock_split::{BidiStream, ReadStream, WriteStream};
use anyhow::{bail, ensure, Context, Result};
use anyhow::Context;
use bytes::{Bytes, BytesMut};
use pq_proto::{BeMessage, FeMessage, FeStartupPacket};
use rand::Rng;
use serde::{Deserialize, Serialize};
use std::fmt;
use std::io::{self, Write};
@@ -22,25 +22,32 @@ pub trait Handler {
/// postgres_backend will issue ReadyForQuery after calling this (this
/// might be not what we want after CopyData streaming, but currently we don't
/// care).
fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()>;
fn process_query(
&mut self,
pgb: &mut PostgresBackend,
query_string: &str,
) -> Result<(), QueryError>;
/// Called on startup packet receival, allows to process params.
///
/// If Ok(false) is returned postgres_backend will skip auth -- that is needed for new users
/// creation is the proxy code. That is quite hacky and ad-hoc solution, may be we could allow
/// to override whole init logic in implementations.
fn startup(&mut self, _pgb: &mut PostgresBackend, _sm: &FeStartupPacket) -> Result<()> {
fn startup(
&mut self,
_pgb: &mut PostgresBackend,
_sm: &FeStartupPacket,
) -> Result<(), QueryError> {
Ok(())
}
/// Check auth md5
fn check_auth_md5(&mut self, _pgb: &mut PostgresBackend, _md5_response: &[u8]) -> Result<()> {
bail!("MD5 auth failed")
}
/// Check auth jwt
fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> {
bail!("JWT auth failed")
fn check_auth_jwt(
&mut self,
_pgb: &mut PostgresBackend,
_jwt_response: &[u8],
) -> Result<(), QueryError> {
Err(QueryError::Other(anyhow::anyhow!("JWT auth failed")))
}
fn is_shutdown_requested(&self) -> bool {
@@ -61,7 +68,6 @@ pub enum ProtoState {
#[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)]
pub enum AuthType {
Trust,
MD5,
// This mimics postgres's AuthenticationCleartextPassword but instead of password expects JWT
NeonJWT,
}
@@ -72,9 +78,8 @@ impl FromStr for AuthType {
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"Trust" => Ok(Self::Trust),
"MD5" => Ok(Self::MD5),
"NeonJWT" => Ok(Self::NeonJWT),
_ => bail!("invalid value \"{s}\" for auth type"),
_ => anyhow::bail!("invalid value \"{s}\" for auth type"),
}
}
}
@@ -83,7 +88,6 @@ impl fmt::Display for AuthType {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.write_str(match self {
AuthType::Trust => "Trust",
AuthType::MD5 => "MD5",
AuthType::NeonJWT => "NeonJWT",
})
}
@@ -134,7 +138,6 @@ pub struct PostgresBackend {
pub state: ProtoState,
md5_salt: [u8; 4],
auth_type: AuthType,
peer_addr: SocketAddr,
@@ -164,7 +167,7 @@ pub fn is_socket_read_timed_out(error: &anyhow::Error) -> bool {
}
// Cast a byte slice to a string slice, dropping null terminator if there's one.
fn cstr_to_str(bytes: &[u8]) -> Result<&str> {
fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> {
let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes);
std::str::from_utf8(without_null).map_err(|e| e.into())
}
@@ -187,7 +190,6 @@ impl PostgresBackend {
stream: Some(Stream::Bidirectional(BidiStream::from_tcp(socket))),
buf_out: BytesMut::with_capacity(10 * 1024),
state: ProtoState::Initialization,
md5_salt: [0u8; 4],
auth_type,
tls_config,
peer_addr,
@@ -199,10 +201,10 @@ impl PostgresBackend {
}
/// Get direct reference (into the Option) to the read stream.
fn get_stream_in(&mut self) -> Result<&mut BidiStream> {
fn get_stream_in(&mut self) -> anyhow::Result<&mut BidiStream> {
match &mut self.stream {
Some(Stream::Bidirectional(stream)) => Ok(stream),
_ => bail!("reader taken"),
_ => anyhow::bail!("reader taken"),
}
}
@@ -226,7 +228,7 @@ impl PostgresBackend {
}
/// Read full message or return None if connection is closed.
pub fn read_message(&mut self) -> Result<Option<FeMessage>> {
pub fn read_message(&mut self) -> Result<Option<FeMessage>, QueryError> {
let (state, stream) = (self.state, self.get_stream_in()?);
use ProtoState::*;
@@ -234,6 +236,7 @@ impl PostgresBackend {
Initialization | Encrypted => FeStartupPacket::read(stream),
Authentication | Established => FeMessage::read(stream),
}
.map_err(QueryError::from)
}
/// Write message into internal output buffer.
@@ -257,7 +260,7 @@ impl PostgresBackend {
}
// Wrapper for run_message_loop() that shuts down socket when we are done
pub fn run(mut self, handler: &mut impl Handler) -> Result<()> {
pub fn run(mut self, handler: &mut impl Handler) -> Result<(), QueryError> {
let ret = self.run_message_loop(handler);
if let Some(stream) = self.stream.as_mut() {
let _ = stream.shutdown(Shutdown::Both);
@@ -265,7 +268,7 @@ impl PostgresBackend {
ret
}
fn run_message_loop(&mut self, handler: &mut impl Handler) -> Result<()> {
fn run_message_loop(&mut self, handler: &mut impl Handler) -> Result<(), QueryError> {
trace!("postgres backend to {:?} started", self.peer_addr);
let mut unnamed_query_string = Bytes::new();
@@ -274,7 +277,7 @@ impl PostgresBackend {
match self.read_message() {
Ok(message) => {
if let Some(msg) = message {
trace!("got message {:?}", msg);
trace!("got message {msg:?}");
match self.process_message(handler, msg, &mut unnamed_query_string)? {
ProcessMsgResult::Continue => continue,
@@ -285,10 +288,12 @@ impl PostgresBackend {
}
}
Err(e) => {
// If it is a timeout error, continue the loop
if !is_socket_read_timed_out(&e) {
return Err(e);
if let QueryError::Other(e) = &e {
if is_socket_read_timed_out(e) {
continue;
}
}
return Err(e);
}
}
}
@@ -306,7 +311,7 @@ impl PostgresBackend {
}
stream => {
self.stream = stream;
bail!("can't start TLs without bidi stream");
anyhow::bail!("can't start TLs without bidi stream");
}
}
}
@@ -316,17 +321,16 @@ impl PostgresBackend {
handler: &mut impl Handler,
msg: FeMessage,
unnamed_query_string: &mut Bytes,
) -> Result<ProcessMsgResult> {
) -> Result<ProcessMsgResult, QueryError> {
// Allow only startup and password messages during auth. Otherwise client would be able to bypass auth
// TODO: change that to proper top-level match of protocol state with separate message handling for each state
if self.state < ProtoState::Established {
ensure!(
matches!(
msg,
FeMessage::PasswordMessage(_) | FeMessage::StartupPacket(_)
),
"protocol violation"
);
if self.state < ProtoState::Established
&& !matches!(
msg,
FeMessage::PasswordMessage(_) | FeMessage::StartupPacket(_)
)
{
return Err(QueryError::Other(anyhow::anyhow!("protocol violation")));
}
let have_tls = self.tls_config.is_some();
@@ -350,8 +354,13 @@ impl PostgresBackend {
}
FeStartupPacket::StartupMessage { .. } => {
if have_tls && !matches!(self.state, ProtoState::Encrypted) {
self.write_message(&BeMessage::ErrorResponse("must connect with TLS"))?;
bail!("client did not connect with TLS");
self.write_message(&BeMessage::ErrorResponse(
"must connect with TLS",
None,
))?;
return Err(QueryError::Other(anyhow::anyhow!(
"client did not connect with TLS"
)));
}
// NB: startup() may change self.auth_type -- we are using that in proxy code
@@ -367,13 +376,6 @@ impl PostgresBackend {
.write_message(&BeMessage::ReadyForQuery)?;
self.state = ProtoState::Established;
}
AuthType::MD5 => {
rand::thread_rng().fill(&mut self.md5_salt);
self.write_message(&BeMessage::AuthenticationMD5Password(
self.md5_salt,
))?;
self.state = ProtoState::Authentication;
}
AuthType::NeonJWT => {
self.write_message(&BeMessage::AuthenticationCleartextPassword)?;
self.state = ProtoState::Authentication;
@@ -393,20 +395,15 @@ impl PostgresBackend {
match self.auth_type {
AuthType::Trust => unreachable!(),
AuthType::MD5 => {
let (_, md5_response) = m.split_last().context("protocol violation")?;
if let Err(e) = handler.check_auth_md5(self, md5_response) {
self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
bail!("auth failed: {}", e);
}
}
AuthType::NeonJWT => {
let (_, jwt_response) = m.split_last().context("protocol violation")?;
if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
bail!("auth failed: {}", e);
self.write_message(&BeMessage::ErrorResponse(
&e.to_string(),
Some(e.pg_error_code()),
))?;
return Err(e);
}
}
}
@@ -420,33 +417,14 @@ impl PostgresBackend {
// remove null terminator
let query_string = cstr_to_str(&body)?;
trace!("got query {:?}", query_string);
// xxx distinguish fatal and recoverable errors?
trace!("got query {query_string:?}");
if let Err(e) = handler.process_query(self, query_string) {
// ":?" uses the alternate formatting style, which makes anyhow display the
// full cause of the error, not just the top-level context + its trace.
// We don't want to send that in the ErrorResponse though,
// because it's not relevant to the compute node logs.
//
// We also don't want to log full stacktrace when the error is primitive,
// such as usual connection closed.
let short_error = format!("{:#}", e);
let root_cause = e.root_cause().to_string();
if root_cause.contains("connection closed unexpectedly")
|| root_cause.contains("Broken pipe (os error 32)")
{
error!(
"query handler for '{}' failed: {}",
query_string, short_error
);
} else {
error!("query handler for '{}' failed: {:?}", query_string, e);
}
self.write_message_noflush(&BeMessage::ErrorResponse(&short_error))?;
// TODO: untangle convoluted control flow
if e.to_string().contains("failed to run") {
return Ok(ProcessMsgResult::Break);
}
log_query_error(query_string, &e);
let short_error = short_error(&e);
self.write_message_noflush(&BeMessage::ErrorResponse(
&short_error,
Some(e.pg_error_code()),
))?;
}
self.write_message(&BeMessage::ReadyForQuery)?;
}
@@ -471,11 +449,13 @@ impl PostgresBackend {
FeMessage::Execute(_) => {
let query_string = cstr_to_str(unnamed_query_string)?;
trace!("got execute {:?}", query_string);
// xxx distinguish fatal and recoverable errors?
trace!("got execute {query_string:?}");
if let Err(e) = handler.process_query(self, query_string) {
error!("query handler for '{}' failed: {:?}", query_string, e);
self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
log_query_error(query_string, &e);
self.write_message(&BeMessage::ErrorResponse(
&e.to_string(),
Some(e.pg_error_code()),
))?;
}
// NOTE there is no ReadyForQuery message. This handler is used
// for basebackup and it uses CopyOut which doesn't require
@@ -494,7 +474,9 @@ impl PostgresBackend {
// We prefer explicit pattern matching to wildcards, because
// this helps us spot the places where new variants are missing
FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail => {
bail!("unexpected message type: {:?}", msg);
return Err(QueryError::Other(anyhow::anyhow!(
"unexpected message type: {msg:?}"
)));
}
}

View File

@@ -4,45 +4,87 @@
//! is rather narrow, but we can extend it once required.
use crate::postgres_backend::AuthType;
use anyhow::{bail, Context, Result};
use bytes::{Bytes, BytesMut};
use pq_proto::{BeMessage, FeMessage, FeStartupPacket};
use rand::Rng;
use anyhow::Context;
use bytes::{Buf, Bytes, BytesMut};
use pq_proto::{BeMessage, ConnectionError, FeMessage, FeStartupPacket, SQLSTATE_INTERNAL_ERROR};
use std::future::Future;
use std::io;
use std::net::SocketAddr;
use std::pin::Pin;
use std::sync::Arc;
use std::task::Poll;
use tracing::{debug, error, trace};
use tracing::{debug, error, info, trace};
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufReader};
use tokio_rustls::TlsAcceptor;
pub fn is_expected_io_error(e: &io::Error) -> bool {
use io::ErrorKind::*;
matches!(
e.kind(),
ConnectionRefused | ConnectionAborted | ConnectionReset
)
}
/// An error, occurred during query processing:
/// either during the connection ([`ConnectionError`]) or before/after it.
#[derive(thiserror::Error, Debug)]
pub enum QueryError {
/// The connection was lost while processing the query.
#[error(transparent)]
Disconnected(#[from] ConnectionError),
/// Some other error
#[error(transparent)]
Other(#[from] anyhow::Error),
}
impl From<io::Error> for QueryError {
fn from(e: io::Error) -> Self {
Self::Disconnected(ConnectionError::Socket(e))
}
}
impl QueryError {
pub fn pg_error_code(&self) -> &'static [u8; 5] {
match self {
Self::Disconnected(_) => b"08006", // connection failure
Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
}
}
}
#[async_trait::async_trait]
pub trait Handler {
/// Handle single query.
/// postgres_backend will issue ReadyForQuery after calling this (this
/// might be not what we want after CopyData streaming, but currently we don't
/// care).
async fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()>;
async fn process_query(
&mut self,
pgb: &mut PostgresBackend,
query_string: &str,
) -> Result<(), QueryError>;
/// Called on startup packet receival, allows to process params.
///
/// If Ok(false) is returned postgres_backend will skip auth -- that is needed for new users
/// creation is the proxy code. That is quite hacky and ad-hoc solution, may be we could allow
/// to override whole init logic in implementations.
fn startup(&mut self, _pgb: &mut PostgresBackend, _sm: &FeStartupPacket) -> Result<()> {
fn startup(
&mut self,
_pgb: &mut PostgresBackend,
_sm: &FeStartupPacket,
) -> Result<(), QueryError> {
Ok(())
}
/// Check auth md5
fn check_auth_md5(&mut self, _pgb: &mut PostgresBackend, _md5_response: &[u8]) -> Result<()> {
bail!("MD5 auth failed")
}
/// Check auth jwt
fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> {
bail!("JWT auth failed")
fn check_auth_jwt(
&mut self,
_pgb: &mut PostgresBackend,
_jwt_response: &[u8],
) -> Result<(), QueryError> {
Err(QueryError::Other(anyhow::anyhow!("JWT auth failed")))
}
}
@@ -76,17 +118,14 @@ impl AsyncWrite for Stream {
self: Pin<&mut Self>,
cx: &mut std::task::Context<'_>,
buf: &[u8],
) -> Poll<Result<usize, std::io::Error>> {
) -> Poll<io::Result<usize>> {
match self.get_mut() {
Self::Unencrypted(stream) => Pin::new(stream).poll_write(cx, buf),
Self::Tls(stream) => Pin::new(stream).poll_write(cx, buf),
Self::Broken => unreachable!(),
}
}
fn poll_flush(
self: Pin<&mut Self>,
cx: &mut std::task::Context<'_>,
) -> Poll<Result<(), std::io::Error>> {
fn poll_flush(self: Pin<&mut Self>, cx: &mut std::task::Context<'_>) -> Poll<io::Result<()>> {
match self.get_mut() {
Self::Unencrypted(stream) => Pin::new(stream).poll_flush(cx),
Self::Tls(stream) => Pin::new(stream).poll_flush(cx),
@@ -96,7 +135,7 @@ impl AsyncWrite for Stream {
fn poll_shutdown(
self: Pin<&mut Self>,
cx: &mut std::task::Context<'_>,
) -> Poll<Result<(), std::io::Error>> {
) -> Poll<io::Result<()>> {
match self.get_mut() {
Self::Unencrypted(stream) => Pin::new(stream).poll_shutdown(cx),
Self::Tls(stream) => Pin::new(stream).poll_shutdown(cx),
@@ -109,7 +148,7 @@ impl AsyncRead for Stream {
self: Pin<&mut Self>,
cx: &mut std::task::Context<'_>,
buf: &mut tokio::io::ReadBuf<'_>,
) -> Poll<Result<(), std::io::Error>> {
) -> Poll<io::Result<()>> {
match self.get_mut() {
Self::Unencrypted(stream) => Pin::new(stream).poll_read(cx, buf),
Self::Tls(stream) => Pin::new(stream).poll_read(cx, buf),
@@ -120,12 +159,14 @@ impl AsyncRead for Stream {
pub struct PostgresBackend {
stream: Stream,
// Output buffer. c.f. BeMessage::write why we are using BytesMut here.
// The data between 0 and "current position" as tracked by the bytes::Buf
// implementation of BytesMut, have already been written.
buf_out: BytesMut,
pub state: ProtoState,
md5_salt: [u8; 4],
auth_type: AuthType,
peer_addr: SocketAddr,
@@ -143,7 +184,7 @@ pub fn query_from_cstring(query_string: Bytes) -> Vec<u8> {
}
// Cast a byte slice to a string slice, dropping null terminator if there's one.
fn cstr_to_str(bytes: &[u8]) -> Result<&str> {
fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> {
let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes);
std::str::from_utf8(without_null).map_err(|e| e.into())
}
@@ -153,14 +194,13 @@ impl PostgresBackend {
socket: tokio::net::TcpStream,
auth_type: AuthType,
tls_config: Option<Arc<rustls::ServerConfig>>,
) -> std::io::Result<Self> {
) -> io::Result<Self> {
let peer_addr = socket.peer_addr()?;
Ok(Self {
stream: Stream::Unencrypted(BufReader::new(socket)),
buf_out: BytesMut::with_capacity(10 * 1024),
state: ProtoState::Initialization,
md5_salt: [0u8; 4],
auth_type,
tls_config,
peer_addr,
@@ -172,30 +212,68 @@ impl PostgresBackend {
}
/// Read full message or return None if connection is closed.
pub async fn read_message(&mut self) -> Result<Option<FeMessage>> {
pub async fn read_message(&mut self) -> Result<Option<FeMessage>, QueryError> {
use ProtoState::*;
match self.state {
Initialization | Encrypted => FeStartupPacket::read_fut(&mut self.stream).await,
Authentication | Established => FeMessage::read_fut(&mut self.stream).await,
Closed => Ok(None),
}
.map_err(QueryError::from)
}
/// Flush output buffer into the socket.
pub async fn flush(&mut self) -> std::io::Result<&mut Self> {
self.stream.write_all(&self.buf_out).await?;
pub async fn flush(&mut self) -> io::Result<()> {
while self.buf_out.has_remaining() {
let bytes_written = self.stream.write(self.buf_out.chunk()).await?;
self.buf_out.advance(bytes_written);
}
self.buf_out.clear();
Ok(self)
Ok(())
}
/// Write message into internal output buffer.
pub fn write_message(&mut self, message: &BeMessage<'_>) -> Result<&mut Self, std::io::Error> {
pub fn write_message(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> {
BeMessage::write(&mut self.buf_out, message)?;
Ok(self)
}
/// Returns an AsyncWrite implementation that wraps all the data written
/// to it in CopyData messages, and writes them to the connection
///
/// The caller is responsible for sending CopyOutResponse and CopyDone messages.
pub fn copyout_writer(&mut self) -> CopyDataWriter {
CopyDataWriter { pgb: self }
}
/// A polling function that tries to write all the data from 'buf_out' to the
/// underlying stream.
fn poll_write_buf(
&mut self,
cx: &mut std::task::Context<'_>,
) -> Poll<Result<(), std::io::Error>> {
while self.buf_out.has_remaining() {
match Pin::new(&mut self.stream).poll_write(cx, self.buf_out.chunk()) {
Poll::Ready(Ok(bytes_written)) => {
self.buf_out.advance(bytes_written);
}
Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
Poll::Pending => return Poll::Pending,
}
}
Poll::Ready(Ok(()))
}
fn poll_flush(&mut self, cx: &mut std::task::Context<'_>) -> Poll<Result<(), std::io::Error>> {
Pin::new(&mut self.stream).poll_flush(cx)
}
// Wrapper for run_message_loop() that shuts down socket when we are done
pub async fn run<F, S>(mut self, handler: &mut impl Handler, shutdown_watcher: F) -> Result<()>
pub async fn run<F, S>(
mut self,
handler: &mut impl Handler,
shutdown_watcher: F,
) -> Result<(), QueryError>
where
F: Fn() -> S,
S: Future,
@@ -209,7 +287,7 @@ impl PostgresBackend {
&mut self,
handler: &mut impl Handler,
shutdown_watcher: F,
) -> Result<()>
) -> Result<(), QueryError>
where
F: Fn() -> S,
S: Future,
@@ -245,7 +323,7 @@ impl PostgresBackend {
return Ok(());
}
}
Ok::<(), anyhow::Error>(())
Ok::<(), QueryError>(())
} => {
// Handshake complete.
result?;
@@ -290,14 +368,14 @@ impl PostgresBackend {
self.stream = Stream::Tls(Box::new(tls_stream));
return Ok(());
};
bail!("TLS already started");
anyhow::bail!("TLS already started");
}
async fn process_handshake_message(
&mut self,
handler: &mut impl Handler,
msg: FeMessage,
) -> Result<ProcessMsgResult> {
) -> Result<ProcessMsgResult, QueryError> {
assert!(self.state < ProtoState::Established);
let have_tls = self.tls_config.is_some();
match msg {
@@ -320,8 +398,13 @@ impl PostgresBackend {
}
FeStartupPacket::StartupMessage { .. } => {
if have_tls && !matches!(self.state, ProtoState::Encrypted) {
self.write_message(&BeMessage::ErrorResponse("must connect with TLS"))?;
bail!("client did not connect with TLS");
self.write_message(&BeMessage::ErrorResponse(
"must connect with TLS",
None,
))?;
return Err(QueryError::Other(anyhow::anyhow!(
"client did not connect with TLS"
)));
}
// NB: startup() may change self.auth_type -- we are using that in proxy code
@@ -337,13 +420,6 @@ impl PostgresBackend {
.write_message(&BeMessage::ReadyForQuery)?;
self.state = ProtoState::Established;
}
AuthType::MD5 => {
rand::thread_rng().fill(&mut self.md5_salt);
self.write_message(&BeMessage::AuthenticationMD5Password(
self.md5_salt,
))?;
self.state = ProtoState::Authentication;
}
AuthType::NeonJWT => {
self.write_message(&BeMessage::AuthenticationCleartextPassword)?;
self.state = ProtoState::Authentication;
@@ -364,20 +440,15 @@ impl PostgresBackend {
match self.auth_type {
AuthType::Trust => unreachable!(),
AuthType::MD5 => {
let (_, md5_response) = m.split_last().context("protocol violation")?;
if let Err(e) = handler.check_auth_md5(self, md5_response) {
self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
bail!("auth failed: {}", e);
}
}
AuthType::NeonJWT => {
let (_, jwt_response) = m.split_last().context("protocol violation")?;
if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
bail!("auth failed: {}", e);
self.write_message(&BeMessage::ErrorResponse(
&e.to_string(),
Some(e.pg_error_code()),
))?;
return Err(e);
}
}
}
@@ -400,33 +471,28 @@ impl PostgresBackend {
handler: &mut impl Handler,
msg: FeMessage,
unnamed_query_string: &mut Bytes,
) -> Result<ProcessMsgResult> {
) -> Result<ProcessMsgResult, QueryError> {
// Allow only startup and password messages during auth. Otherwise client would be able to bypass auth
// TODO: change that to proper top-level match of protocol state with separate message handling for each state
assert!(self.state == ProtoState::Established);
match msg {
FeMessage::StartupPacket(_) | FeMessage::PasswordMessage(_) => {
bail!("protocol violation");
return Err(QueryError::Other(anyhow::anyhow!("protocol violation")));
}
FeMessage::Query(body) => {
// remove null terminator
let query_string = cstr_to_str(&body)?;
trace!("got query {:?}", query_string);
// xxx distinguish fatal and recoverable errors?
trace!("got query {query_string:?}");
if let Err(e) = handler.process_query(self, query_string).await {
// ":?" uses the alternate formatting style, which makes anyhow display the
// full cause of the error, not just the top-level context + its trace.
// We don't want to send that in the ErrorResponse though,
// because it's not relevant to the compute node logs.
error!("query handler for '{}' failed: {:?}", query_string, e);
self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
// TODO: untangle convoluted control flow
if e.to_string().contains("failed to run") {
return Ok(ProcessMsgResult::Break);
}
log_query_error(query_string, &e);
let short_error = short_error(&e);
self.write_message(&BeMessage::ErrorResponse(
&short_error,
Some(e.pg_error_code()),
))?;
}
self.write_message(&BeMessage::ReadyForQuery)?;
}
@@ -451,11 +517,13 @@ impl PostgresBackend {
FeMessage::Execute(_) => {
let query_string = cstr_to_str(unnamed_query_string)?;
trace!("got execute {:?}", query_string);
// xxx distinguish fatal and recoverable errors?
trace!("got execute {query_string:?}");
if let Err(e) = handler.process_query(self, query_string).await {
error!("query handler for '{}' failed: {:?}", query_string, e);
self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
log_query_error(query_string, &e);
self.write_message(&BeMessage::ErrorResponse(
&e.to_string(),
Some(e.pg_error_code()),
))?;
}
// NOTE there is no ReadyForQuery message. This handler is used
// for basebackup and it uses CopyOut which doesn't require
@@ -474,10 +542,99 @@ impl PostgresBackend {
// We prefer explicit pattern matching to wildcards, because
// this helps us spot the places where new variants are missing
FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail => {
bail!("unexpected message type: {:?}", msg);
return Err(QueryError::Other(anyhow::anyhow!(
"unexpected message type: {:?}",
msg
)));
}
}
Ok(ProcessMsgResult::Continue)
}
}
///
/// A futures::AsyncWrite implementation that wraps all data written to it in CopyData
/// messages.
///
pub struct CopyDataWriter<'a> {
pgb: &'a mut PostgresBackend,
}
impl<'a> AsyncWrite for CopyDataWriter<'a> {
fn poll_write(
self: Pin<&mut Self>,
cx: &mut std::task::Context<'_>,
buf: &[u8],
) -> Poll<Result<usize, std::io::Error>> {
let this = self.get_mut();
// It's not strictly required to flush between each message, but makes it easier
// to view in wireshark, and usually the messages that the callers write are
// decently-sized anyway.
match this.pgb.poll_write_buf(cx) {
Poll::Ready(Ok(())) => {}
Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
Poll::Pending => return Poll::Pending,
}
// CopyData
// XXX: if the input is large, we should split it into multiple messages.
// Not sure what the threshold should be, but the ultimate hard limit is that
// the length cannot exceed u32.
this.pgb.write_message(&BeMessage::CopyData(buf))?;
Poll::Ready(Ok(buf.len()))
}
fn poll_flush(
self: Pin<&mut Self>,
cx: &mut std::task::Context<'_>,
) -> Poll<Result<(), std::io::Error>> {
let this = self.get_mut();
match this.pgb.poll_write_buf(cx) {
Poll::Ready(Ok(())) => {}
Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
Poll::Pending => return Poll::Pending,
}
this.pgb.poll_flush(cx)
}
fn poll_shutdown(
self: Pin<&mut Self>,
cx: &mut std::task::Context<'_>,
) -> Poll<Result<(), std::io::Error>> {
let this = self.get_mut();
match this.pgb.poll_write_buf(cx) {
Poll::Ready(Ok(())) => {}
Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
Poll::Pending => return Poll::Pending,
}
this.pgb.poll_flush(cx)
}
}
pub fn short_error(e: &QueryError) -> String {
match e {
QueryError::Disconnected(connection_error) => connection_error.to_string(),
QueryError::Other(e) => format!("{e:#}"),
}
}
pub(super) fn log_query_error(query: &str, e: &QueryError) {
match e {
QueryError::Disconnected(ConnectionError::Socket(io_error)) => {
if is_expected_io_error(io_error) {
info!("query handler for '{query}' failed with expected io error: {io_error}");
} else {
error!("query handler for '{query}' failed with io error: {io_error}");
}
}
QueryError::Disconnected(other_connection_error) => {
error!("query handler for '{query}' failed with connection error: {other_connection_error:?}")
}
QueryError::Other(e) => {
error!("query handler for '{query}' failed: {e:?}");
}
}
}

View File

@@ -9,7 +9,10 @@ use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use once_cell::sync::Lazy;
use utils::postgres_backend::{AuthType, Handler, PostgresBackend};
use utils::{
postgres_backend::{AuthType, Handler, PostgresBackend},
postgres_backend_async::QueryError,
};
fn make_tcp_pair() -> (TcpStream, TcpStream) {
let listener = TcpListener::bind("127.0.0.1:0").unwrap();
@@ -105,7 +108,7 @@ fn ssl() {
&mut self,
_pgb: &mut PostgresBackend,
query_string: &str,
) -> anyhow::Result<()> {
) -> Result<(), QueryError> {
self.got_query = query_string == QUERY;
Ok(())
}
@@ -152,7 +155,7 @@ fn no_ssl() {
&mut self,
_pgb: &mut PostgresBackend,
_query_string: &str,
) -> anyhow::Result<()> {
) -> Result<(), QueryError> {
panic!()
}
}
@@ -212,7 +215,7 @@ fn server_forces_ssl() {
&mut self,
_pgb: &mut PostgresBackend,
_query_string: &str,
) -> anyhow::Result<()> {
) -> Result<(), QueryError> {
panic!()
}
}

View File

@@ -2,6 +2,7 @@
name = "pageserver"
version = "0.1.0"
edition = "2021"
license = "Apache-2.0"
[features]
default = []
@@ -9,8 +10,6 @@ default = []
# which adds some runtime cost to run tests on outage conditions
testing = ["fail/failpoints"]
profiling = ["pprof"]
[dependencies]
amplify_num = { git = "https://github.com/hlinnaka/rust-amplify.git", branch = "unsigned-int-perf" }
anyhow = { version = "1.0", features = ["backtrace"] }
@@ -39,7 +38,6 @@ pin-project-lite = "0.2.7"
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true }
rand = "0.8.3"
regex = "1.4.5"
rstar = "0.9.3"
@@ -49,7 +47,7 @@ serde_json = { version = "1.0", features = ["raw_value"] }
serde_with = "2.0"
signal-hook = "0.3.10"
svg_fmt = "0.4.1"
tar = "0.4.33"
tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }
thiserror = "1.0"
tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
@@ -70,7 +68,7 @@ tenant_size_model = { path = "../libs/tenant_size_model" }
utils = { path = "../libs/utils" }
workspace_hack = { version = "0.1", path = "../workspace_hack" }
rpds = "0.12.0"
reqwest = "0.11.13"
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
[dev-dependencies]
criterion = "0.4"

View File

@@ -10,20 +10,24 @@
//! This module is responsible for creation of such tarball
//! from data stored in object storage.
//!
use anyhow::{anyhow, bail, ensure, Context, Result};
use anyhow::{anyhow, bail, ensure, Context};
use bytes::{BufMut, BytesMut};
use fail::fail_point;
use itertools::Itertools;
use std::fmt::Write as FmtWrite;
use std::io;
use std::io::Write;
use std::sync::Arc;
use std::time::SystemTime;
use tar::{Builder, EntryType, Header};
use tokio::io;
use tokio::io::AsyncWrite;
use tracing::*;
use crate::task_mgr;
use crate::tenant::{with_ondemand_download, PageReconstructResult, Timeline};
/// NB: This relies on a modified version of tokio_tar that does *not* write the
/// end-of-archive marker (1024 zero bytes), when the Builder struct is dropped
/// without explicitly calling 'finish' or 'into_inner'!
///
/// See https://github.com/neondatabase/tokio-tar/pull/1
///
use tokio_tar::{Builder, EntryType, Header};
use crate::tenant::{with_ondemand_download, Timeline};
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
@@ -34,116 +38,130 @@ use postgres_ffi::PG_TLI;
use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE};
use utils::lsn::Lsn;
/// Create basebackup with non-rel data in it.
/// Only include relational data if 'full_backup' is true.
///
/// Currently we use empty 'req_lsn' in two cases:
/// * During the basebackup right after timeline creation
/// * When working without safekeepers. In this situation it is important to match the lsn
/// we are taking basebackup on with the lsn that is used in pageserver's walreceiver
/// to start the replication.
pub async fn send_basebackup_tarball<'a, W>(
write: &'a mut W,
timeline: &'a Timeline,
req_lsn: Option<Lsn>,
prev_lsn: Option<Lsn>,
full_backup: bool,
) -> anyhow::Result<()>
where
W: AsyncWrite + Send + Sync + Unpin,
{
// Compute postgres doesn't have any previous WAL files, but the first
// record that it's going to write needs to include the LSN of the
// previous record (xl_prev). We include prev_record_lsn in the
// "zenith.signal" file, so that postgres can read it during startup.
//
// We don't keep full history of record boundaries in the page server,
// however, only the predecessor of the latest record on each
// timeline. So we can only provide prev_record_lsn when you take a
// base backup at the end of the timeline, i.e. at last_record_lsn.
// Even at the end of the timeline, we sometimes don't have a valid
// prev_lsn value; that happens if the timeline was just branched from
// an old LSN and it doesn't have any WAL of its own yet. We will set
// prev_lsn to Lsn(0) if we cannot provide the correct value.
let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn {
// Backup was requested at a particular LSN. The caller should've
// already checked that it's a valid LSN.
// If the requested point is the end of the timeline, we can
// provide prev_lsn. (get_last_record_rlsn() might return it as
// zero, though, if no WAL has been generated on this timeline
// yet.)
let end_of_timeline = timeline.get_last_record_rlsn();
if req_lsn == end_of_timeline.last {
(end_of_timeline.prev, req_lsn)
} else {
(Lsn(0), req_lsn)
}
} else {
// Backup was requested at end of the timeline.
let end_of_timeline = timeline.get_last_record_rlsn();
(end_of_timeline.prev, end_of_timeline.last)
};
// Consolidate the derived and the provided prev_lsn values
let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
if backup_prev != Lsn(0) {
ensure!(backup_prev == provided_prev_lsn);
}
provided_prev_lsn
} else {
backup_prev
};
info!(
"taking basebackup lsn={}, prev_lsn={} (full_backup={})",
backup_lsn, prev_lsn, full_backup
);
let basebackup = Basebackup {
ar: Builder::new_non_terminated(write),
timeline,
lsn: backup_lsn,
prev_record_lsn: prev_lsn,
full_backup,
};
basebackup
.send_tarball()
.instrument(info_span!("send_tarball", backup_lsn=%backup_lsn))
.await
}
/// This is short-living object only for the time of tarball creation,
/// created mostly to avoid passing a lot of parameters between various functions
/// used for constructing tarball.
pub struct Basebackup<'a, W>
struct Basebackup<'a, W>
where
W: Write,
W: AsyncWrite + Send + Sync + Unpin,
{
ar: Builder<AbortableWrite<W>>,
timeline: &'a Arc<Timeline>,
pub lsn: Lsn,
ar: Builder<&'a mut W>,
timeline: &'a Timeline,
lsn: Lsn,
prev_record_lsn: Lsn,
full_backup: bool,
finished: bool,
}
// Create basebackup with non-rel data in it.
// Only include relational data if 'full_backup' is true.
//
// Currently we use empty lsn in two cases:
// * During the basebackup right after timeline creation
// * When working without safekeepers. In this situation it is important to match the lsn
// we are taking basebackup on with the lsn that is used in pageserver's walreceiver
// to start the replication.
impl<'a, W> Basebackup<'a, W>
where
W: Write,
W: AsyncWrite + Send + Sync + Unpin,
{
pub fn new(
write: W,
timeline: &'a Arc<Timeline>,
req_lsn: Option<Lsn>,
prev_lsn: Option<Lsn>,
full_backup: bool,
) -> Result<Basebackup<'a, W>> {
// Compute postgres doesn't have any previous WAL files, but the first
// record that it's going to write needs to include the LSN of the
// previous record (xl_prev). We include prev_record_lsn in the
// "zenith.signal" file, so that postgres can read it during startup.
//
// We don't keep full history of record boundaries in the page server,
// however, only the predecessor of the latest record on each
// timeline. So we can only provide prev_record_lsn when you take a
// base backup at the end of the timeline, i.e. at last_record_lsn.
// Even at the end of the timeline, we sometimes don't have a valid
// prev_lsn value; that happens if the timeline was just branched from
// an old LSN and it doesn't have any WAL of its own yet. We will set
// prev_lsn to Lsn(0) if we cannot provide the correct value.
let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn {
// Backup was requested at a particular LSN. The caller should've
// already checked that it's a valid LSN.
// If the requested point is the end of the timeline, we can
// provide prev_lsn. (get_last_record_rlsn() might return it as
// zero, though, if no WAL has been generated on this timeline
// yet.)
let end_of_timeline = timeline.get_last_record_rlsn();
if req_lsn == end_of_timeline.last {
(end_of_timeline.prev, req_lsn)
} else {
(Lsn(0), req_lsn)
}
} else {
// Backup was requested at end of the timeline.
let end_of_timeline = timeline.get_last_record_rlsn();
(end_of_timeline.prev, end_of_timeline.last)
};
// Consolidate the derived and the provided prev_lsn values
let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
if backup_prev != Lsn(0) {
ensure!(backup_prev == provided_prev_lsn)
}
provided_prev_lsn
} else {
backup_prev
};
info!(
"taking basebackup lsn={}, prev_lsn={} (full_backup={})",
backup_lsn, prev_lsn, full_backup
);
Ok(Basebackup {
ar: Builder::new(AbortableWrite::new(write)),
timeline,
lsn: backup_lsn,
prev_record_lsn: prev_lsn,
full_backup,
finished: false,
})
}
pub fn send_tarball(mut self) -> anyhow::Result<()> {
async fn send_tarball(mut self) -> anyhow::Result<()> {
// TODO include checksum
// Create pgdata subdirs structure
for dir in PGDATA_SUBDIRS.iter() {
let header = new_tar_header_dir(dir)?;
self.ar.append(&header, &mut io::empty())?;
self.ar
.append(&header, &mut io::empty())
.await
.context("could not add directory to basebackup tarball")?;
}
// Send empty config files.
// Send config files.
for filepath in PGDATA_SPECIAL_FILES.iter() {
if *filepath == "pg_hba.conf" {
let data = PG_HBA.as_bytes();
let header = new_tar_header(filepath, data.len() as u64)?;
self.ar.append(&header, data)?;
self.ar
.append(&header, data)
.await
.context("could not add config file to basebackup tarball")?;
} else {
let header = new_tar_header(filepath, 0)?;
self.ar.append(&header, &mut io::empty())?;
self.ar
.append(&header, &mut io::empty())
.await
.context("could not add config file to basebackup tarball")?;
}
}
@@ -154,29 +172,30 @@ where
SlruKind::MultiXactMembers,
] {
for segno in
with_ondemand_download_sync(|| self.timeline.list_slru_segments(kind, self.lsn))?
with_ondemand_download(|| self.timeline.list_slru_segments(kind, self.lsn)).await?
{
self.add_slru_segment(kind, segno)?;
self.add_slru_segment(kind, segno).await?;
}
}
// Create tablespace directories
for ((spcnode, dbnode), has_relmap_file) in
with_ondemand_download_sync(|| self.timeline.list_dbdirs(self.lsn))?
with_ondemand_download(|| self.timeline.list_dbdirs(self.lsn)).await?
{
self.add_dbdir(spcnode, dbnode, has_relmap_file)?;
self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;
// Gather and send relational files in each database if full backup is requested.
if self.full_backup {
for rel in with_ondemand_download_sync(|| {
self.timeline.list_rels(spcnode, dbnode, self.lsn)
})? {
self.add_rel(rel)?;
for rel in
with_ondemand_download(|| self.timeline.list_rels(spcnode, dbnode, self.lsn))
.await?
{
self.add_rel(rel).await?;
}
}
}
for xid in with_ondemand_download_sync(|| self.timeline.list_twophase_files(self.lsn))? {
self.add_twophase_file(xid)?;
for xid in with_ondemand_download(|| self.timeline.list_twophase_files(self.lsn)).await? {
self.add_twophase_file(xid).await?;
}
fail_point!("basebackup-before-control-file", |_| {
@@ -184,44 +203,46 @@ where
});
// Generate pg_control and bootstrap WAL segment.
self.add_pgcontrol_file()?;
self.ar.finish()?;
self.finished = true;
self.add_pgcontrol_file().await?;
self.ar.finish().await?;
debug!("all tarred up!");
Ok(())
}
fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> {
async fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> {
let nblocks =
with_ondemand_download_sync(|| self.timeline.get_rel_size(tag, self.lsn, false))?;
// Function that adds relation segment data to archive
let mut add_file = |segment_index, data: &Vec<u8>| -> anyhow::Result<()> {
let file_name = tag.to_segfile_name(segment_index as u32);
let header = new_tar_header(&file_name, data.len() as u64)?;
self.ar.append(&header, data.as_slice())?;
Ok(())
};
with_ondemand_download(|| self.timeline.get_rel_size(tag, self.lsn, false)).await?;
// If the relation is empty, create an empty file
if nblocks == 0 {
add_file(0, &vec![])?;
let file_name = tag.to_segfile_name(0);
let header = new_tar_header(&file_name, 0)?;
self.ar.append(&header, &mut io::empty()).await?;
return Ok(());
}
// Add a file for each chunk of blocks (aka segment)
let chunks = (0..nblocks).chunks(RELSEG_SIZE as usize);
for (seg, blocks) in chunks.into_iter().enumerate() {
let mut startblk = 0;
let mut seg = 0;
while startblk < nblocks {
let endblk = std::cmp::min(startblk + RELSEG_SIZE, nblocks);
let mut segment_data: Vec<u8> = vec![];
for blknum in blocks {
let img = self
.timeline
.get_rel_page_at_lsn(tag, blknum, self.lsn, false)
.no_ondemand_download()?;
for blknum in startblk..endblk {
let img = with_ondemand_download(|| {
self.timeline
.get_rel_page_at_lsn(tag, blknum, self.lsn, false)
})
.await?;
segment_data.extend_from_slice(&img[..]);
}
add_file(seg, &segment_data)?;
let file_name = tag.to_segfile_name(seg as u32);
let header = new_tar_header(&file_name, segment_data.len() as u64)?;
self.ar.append(&header, segment_data.as_slice()).await?;
seg += 1;
startblk = endblk;
}
Ok(())
@@ -230,17 +251,18 @@ where
//
// Generate SLRU segment files from repository.
//
fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
let nblocks = with_ondemand_download_sync(|| {
self.timeline.get_slru_segment_size(slru, segno, self.lsn)
})?;
async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
let nblocks =
with_ondemand_download(|| self.timeline.get_slru_segment_size(slru, segno, self.lsn))
.await?;
let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
for blknum in 0..nblocks {
let img = with_ondemand_download_sync(|| {
let img = with_ondemand_download(|| {
self.timeline
.get_slru_page_at_lsn(slru, segno, blknum, self.lsn)
})?;
})
.await?;
if slru == SlruKind::Clog {
ensure!(img.len() == BLCKSZ as usize || img.len() == BLCKSZ as usize + 8);
@@ -253,7 +275,7 @@ where
let segname = format!("{}/{:>04X}", slru.to_str(), segno);
let header = new_tar_header(&segname, slru_buf.len() as u64)?;
self.ar.append(&header, slru_buf.as_slice())?;
self.ar.append(&header, slru_buf.as_slice()).await?;
trace!("Added to basebackup slru {} relsize {}", segname, nblocks);
Ok(())
@@ -265,16 +287,16 @@ where
// Each directory contains a PG_VERSION file, and the default database
// directories also contain pg_filenode.map files.
//
fn add_dbdir(
async fn add_dbdir(
&mut self,
spcnode: u32,
dbnode: u32,
has_relmap_file: bool,
) -> anyhow::Result<()> {
let relmap_img = if has_relmap_file {
let img = with_ondemand_download_sync(|| {
self.timeline.get_relmap_file(spcnode, dbnode, self.lsn)
})?;
let img =
with_ondemand_download(|| self.timeline.get_relmap_file(spcnode, dbnode, self.lsn))
.await?;
ensure!(img.len() == 512);
Some(img)
} else {
@@ -284,14 +306,14 @@ where
if spcnode == GLOBALTABLESPACE_OID {
let pg_version_str = self.timeline.pg_version.to_string();
let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?;
self.ar.append(&header, pg_version_str.as_bytes())?;
self.ar.append(&header, pg_version_str.as_bytes()).await?;
info!("timeline.pg_version {}", self.timeline.pg_version);
if let Some(img) = relmap_img {
// filenode map for global tablespace
let header = new_tar_header("global/pg_filenode.map", img.len() as u64)?;
self.ar.append(&header, &img[..])?;
self.ar.append(&header, &img[..]).await?;
} else {
warn!("global/pg_filenode.map is missing");
}
@@ -307,10 +329,8 @@ where
// XLOG_TBLSPC_DROP records. But we probably should just
// throw an error on CREATE TABLESPACE in the first place.
if !has_relmap_file
&& self
.timeline
.list_rels(spcnode, dbnode, self.lsn)
.no_ondemand_download()?
&& with_ondemand_download(|| self.timeline.list_rels(spcnode, dbnode, self.lsn))
.await?
.is_empty()
{
return Ok(());
@@ -321,18 +341,18 @@ where
// Append dir path for each database
let path = format!("base/{}", dbnode);
let header = new_tar_header_dir(&path)?;
self.ar.append(&header, &mut io::empty())?;
self.ar.append(&header, &mut io::empty()).await?;
if let Some(img) = relmap_img {
let dst_path = format!("base/{}/PG_VERSION", dbnode);
let pg_version_str = self.timeline.pg_version.to_string();
let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?;
self.ar.append(&header, pg_version_str.as_bytes())?;
self.ar.append(&header, pg_version_str.as_bytes()).await?;
let relmap_path = format!("base/{}/pg_filenode.map", dbnode);
let header = new_tar_header(&relmap_path, img.len() as u64)?;
self.ar.append(&header, &img[..])?;
self.ar.append(&header, &img[..]).await?;
}
};
Ok(())
@@ -341,8 +361,8 @@ where
//
// Extract twophase state files
//
fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
let img = with_ondemand_download_sync(|| self.timeline.get_twophase_file(xid, self.lsn))?;
async fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
let img = with_ondemand_download(|| self.timeline.get_twophase_file(xid, self.lsn)).await?;
let mut buf = BytesMut::new();
buf.extend_from_slice(&img[..]);
@@ -350,7 +370,7 @@ where
buf.put_u32_le(crc);
let path = format!("pg_twophase/{:>08X}", xid);
let header = new_tar_header(&path, buf.len() as u64)?;
self.ar.append(&header, &buf[..])?;
self.ar.append(&header, &buf[..]).await?;
Ok(())
}
@@ -359,7 +379,7 @@ where
// Add generated pg_control file and bootstrap WAL segment.
// Also send zenith.signal file with extra bootstrap data.
//
fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
async fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
// add zenith.signal file
let mut zenith_signal = String::new();
if self.prev_record_lsn == Lsn(0) {
@@ -371,17 +391,19 @@ where
} else {
write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?;
}
self.ar.append(
&new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
zenith_signal.as_bytes(),
)?;
self.ar
.append(
&new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
zenith_signal.as_bytes(),
)
.await?;
let checkpoint_bytes =
with_ondemand_download_sync(|| self.timeline.get_checkpoint(self.lsn))
.context("failed to get checkpoint bytes")?;
let pg_control_bytes =
with_ondemand_download_sync(|| self.timeline.get_control_file(self.lsn))
.context("failed get control bytes")?;
let checkpoint_bytes = with_ondemand_download(|| self.timeline.get_checkpoint(self.lsn))
.await
.context("failed to get checkpoint bytes")?;
let pg_control_bytes = with_ondemand_download(|| self.timeline.get_control_file(self.lsn))
.await
.context("failed get control bytes")?;
let (pg_control_bytes, system_identifier) = postgres_ffi::generate_pg_control(
&pg_control_bytes,
@@ -392,7 +414,7 @@ where
//send pg_control
let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?;
self.ar.append(&header, &pg_control_bytes[..])?;
self.ar.append(&header, &pg_control_bytes[..]).await?;
//send wal segment
let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE);
@@ -404,24 +426,11 @@ where
postgres_ffi::generate_wal_segment(segno, system_identifier, self.timeline.pg_version)
.map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
ensure!(wal_seg.len() == WAL_SEGMENT_SIZE);
self.ar.append(&header, &wal_seg[..])?;
self.ar.append(&header, &wal_seg[..]).await?;
Ok(())
}
}
impl<'a, W> Drop for Basebackup<'a, W>
where
W: Write,
{
/// If the basebackup was not finished, prevent the Archive::drop() from
/// writing the end-of-archive marker.
fn drop(&mut self) {
if !self.finished {
self.ar.get_mut().abort();
}
}
}
//
// Create new tarball entry header
//
@@ -457,57 +466,3 @@ fn new_tar_header_dir(path: &str) -> anyhow::Result<Header> {
header.set_cksum();
Ok(header)
}
/// A wrapper that passes through all data to the underlying Write,
/// until abort() is called.
///
/// tar::Builder has an annoying habit of finishing the archive with
/// a valid tar end-of-archive marker (two 512-byte sectors of zeros),
/// even if an error occurs and we don't finish building the archive.
/// We'd rather abort writing the tarball immediately than construct
/// a seemingly valid but incomplete archive. This wrapper allows us
/// to swallow the end-of-archive marker that Builder::drop() emits,
/// without writing it to the underlying sink.
///
struct AbortableWrite<W> {
w: W,
aborted: bool,
}
impl<W> AbortableWrite<W> {
pub fn new(w: W) -> Self {
AbortableWrite { w, aborted: false }
}
pub fn abort(&mut self) {
self.aborted = true;
}
}
impl<W> Write for AbortableWrite<W>
where
W: Write,
{
fn write(&mut self, data: &[u8]) -> io::Result<usize> {
if self.aborted {
Ok(data.len())
} else {
self.w.write(data)
}
}
fn flush(&mut self) -> io::Result<()> {
if self.aborted {
Ok(())
} else {
self.w.flush()
}
}
}
fn with_ondemand_download_sync<F, T>(f: F) -> anyhow::Result<T>
where
F: Send + Fn() -> PageReconstructResult<T>,
T: Send,
{
task_mgr::COMPUTE_REQUEST_RUNTIME.block_on(with_ondemand_download(f))
}

View File

@@ -13,7 +13,7 @@ use tracing::*;
use metrics::set_build_info_metric;
use pageserver::{
config::{defaults::*, PageServerConf},
http, page_cache, page_service, profiling, task_mgr,
http, page_cache, page_service, task_mgr,
task_mgr::TaskKind,
task_mgr::{
BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
@@ -40,8 +40,6 @@ const FEATURES: &[&str] = &[
"testing",
#[cfg(feature = "fail/failpoints")]
"fail/failpoints",
#[cfg(feature = "profiling")]
"profiling",
];
fn version() -> String {
@@ -247,15 +245,12 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
// Install signal handlers
let signals = signals::install_shutdown_handlers()?;
// Start profiler (if enabled)
let profiler_guard = profiling::init_profiler(conf);
// Launch broker client
WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_broker_client(conf))?;
// Initialize authentication for incoming connections
let auth = match &conf.auth_type {
AuthType::Trust | AuthType::MD5 => None,
AuthType::Trust => None,
AuthType::NeonJWT => {
// unwrap is ok because check is performed when creating config, so path is set and file exists
let key_path = conf.auth_validation_public_key_path.as_ref().unwrap();
@@ -372,7 +367,6 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
"Got {}. Terminating in immediate shutdown mode",
signal.name()
);
profiling::exit_profiler(conf, &profiler_guard);
std::process::exit(111);
}
@@ -381,7 +375,6 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
"Got {}. Terminating gracefully in fast shutdown mode",
signal.name()
);
profiling::exit_profiler(conf, &profiler_guard);
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0));
unreachable!()
}

View File

@@ -138,7 +138,6 @@ pub struct PageServerConf {
pub auth_validation_public_key_path: Option<PathBuf>,
pub remote_storage_config: Option<RemoteStorageConfig>,
pub profiling: ProfilingConfig,
pub default_tenant_conf: TenantConf,
/// Storage broker endpoints to connect to.
@@ -165,25 +164,6 @@ pub struct PageServerConf {
/// startup code to the connection code through a dozen layers.
pub static SAFEKEEPER_AUTH_TOKEN: OnceCell<Arc<String>> = OnceCell::new();
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ProfilingConfig {
Disabled,
PageRequests,
}
impl FromStr for ProfilingConfig {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<ProfilingConfig, Self::Err> {
let result = match s {
"disabled" => ProfilingConfig::Disabled,
"page_requests" => ProfilingConfig::PageRequests,
_ => bail!("invalid value \"{s}\" for profiling option, valid values are \"disabled\" and \"page_requests\""),
};
Ok(result)
}
}
// use dedicated enum for builder to better indicate the intention
// and avoid possible confusion with nested options
pub enum BuilderValue<T> {
@@ -226,7 +206,6 @@ struct PageServerConfigBuilder {
id: BuilderValue<NodeId>,
profiling: BuilderValue<ProfilingConfig>,
broker_endpoint: BuilderValue<Uri>,
broker_keepalive_interval: BuilderValue<Duration>,
@@ -262,7 +241,6 @@ impl Default for PageServerConfigBuilder {
auth_validation_public_key_path: Set(None),
remote_storage_config: Set(None),
id: NotSet,
profiling: Set(ProfilingConfig::Disabled),
broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT
.parse()
.expect("failed to parse default broker endpoint")),
@@ -348,10 +326,6 @@ impl PageServerConfigBuilder {
self.id = BuilderValue::Set(node_id)
}
pub fn profiling(&mut self, profiling: ProfilingConfig) {
self.profiling = BuilderValue::Set(profiling)
}
pub fn log_format(&mut self, log_format: LogFormat) {
self.log_format = BuilderValue::Set(log_format)
}
@@ -405,7 +379,6 @@ impl PageServerConfigBuilder {
.remote_storage_config
.ok_or(anyhow!("missing remote_storage_config"))?,
id: self.id.ok_or(anyhow!("missing id"))?,
profiling: self.profiling.ok_or(anyhow!("missing profiling"))?,
// TenantConf is handled separately
default_tenant_conf: TenantConf::default(),
broker_endpoint: self
@@ -588,7 +561,6 @@ impl PageServerConf {
t_conf = Self::parse_toml_tenant_conf(item)?;
}
"id" => builder.id(NodeId(parse_toml_u64(key, item)?)),
"profiling" => builder.profiling(parse_toml_from_str(key, item)?),
"broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
"broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?),
"log_format" => builder.log_format(
@@ -722,7 +694,6 @@ impl PageServerConf {
auth_type: AuthType::Trust,
auth_validation_public_key_path: None,
remote_storage_config: None,
profiling: ProfilingConfig::Disabled,
default_tenant_conf: TenantConf::default(),
broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
broker_keepalive_interval: Duration::from_secs(5000),
@@ -898,7 +869,6 @@ log_format = 'json'
auth_type: AuthType::Trust,
auth_validation_public_key_path: None,
remote_storage_config: None,
profiling: ProfilingConfig::Disabled,
default_tenant_conf: TenantConf::default(),
broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
broker_keepalive_interval: humantime::parse_duration(
@@ -949,7 +919,6 @@ log_format = 'json'
auth_type: AuthType::Trust,
auth_validation_public_key_path: None,
remote_storage_config: None,
profiling: ProfilingConfig::Disabled,
default_tenant_conf: TenantConf::default(),
broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
broker_keepalive_interval: Duration::from_secs(5),

View File

@@ -738,17 +738,17 @@ async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Bod
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
let tenant = mgr::get_tenant(tenant_id, true)
.await
.map_err(ApiError::NotFound)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(ApiError::NotFound)?;
timeline
.compact()
let result_receiver = mgr::immediate_compact(tenant_id, timeline_id)
.await
.context("spawn compaction task")
.map_err(ApiError::InternalServerError)?;
let result: anyhow::Result<()> = result_receiver
.await
.context("receive compaction result")
.map_err(ApiError::InternalServerError)?;
result.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
}

View File

@@ -2,12 +2,13 @@
//! Import data and WAL from a PostgreSQL data directory and WAL segments into
//! a neon Timeline.
//!
use std::fs::File;
use std::io::{Read, Seek, SeekFrom};
use std::path::{Path, PathBuf};
use anyhow::{bail, ensure, Context, Result};
use bytes::Bytes;
use futures::StreamExt;
use tokio::io::{AsyncRead, AsyncReadExt};
use tokio_tar::Archive;
use tracing::*;
use walkdir::WalkDir;
@@ -42,7 +43,7 @@ pub fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
/// This is currently only used to import a cluster freshly created by initdb.
/// The code that deals with the checkpoint would not work right if the
/// cluster was not shut down cleanly.
pub fn import_timeline_from_postgres_datadir(
pub async fn import_timeline_from_postgres_datadir(
tline: &Timeline,
pgdata_path: &Path,
pgdata_lsn: Lsn,
@@ -65,9 +66,11 @@ pub fn import_timeline_from_postgres_datadir(
let absolute_path = entry.path();
let relative_path = absolute_path.strip_prefix(pgdata_path)?;
let file = File::open(absolute_path)?;
let mut file = tokio::fs::File::open(absolute_path).await?;
let len = metadata.len() as usize;
if let Some(control_file) = import_file(&mut modification, relative_path, file, len)? {
if let Some(control_file) =
import_file(&mut modification, relative_path, &mut file, len).await?
{
pg_control = Some(control_file);
}
modification.flush()?;
@@ -96,18 +99,19 @@ pub fn import_timeline_from_postgres_datadir(
tline,
Lsn(pg_control.checkPointCopy.redo),
pgdata_lsn,
)?;
)
.await?;
Ok(())
}
// subroutine of import_timeline_from_postgres_datadir(), to load one relation file.
fn import_rel<Reader: Read>(
modification: &mut DatadirModification,
async fn import_rel(
modification: &mut DatadirModification<'_>,
path: &Path,
spcoid: Oid,
dboid: Oid,
mut reader: Reader,
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
len: usize,
) -> anyhow::Result<()> {
// Does it look like a relation file?
@@ -148,7 +152,7 @@ fn import_rel<Reader: Read>(
}
loop {
let r = reader.read_exact(&mut buf);
let r = reader.read_exact(&mut buf).await;
match r {
Ok(_) => {
modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
@@ -181,11 +185,11 @@ fn import_rel<Reader: Read>(
/// Import an SLRU segment file
///
fn import_slru<Reader: Read>(
modification: &mut DatadirModification,
async fn import_slru(
modification: &mut DatadirModification<'_>,
slru: SlruKind,
path: &Path,
mut reader: Reader,
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
len: usize,
) -> anyhow::Result<()> {
info!("importing slru file {path:?}");
@@ -206,7 +210,7 @@ fn import_slru<Reader: Read>(
let mut rpageno = 0;
loop {
let r = reader.read_exact(&mut buf);
let r = reader.read_exact(&mut buf).await;
match r {
Ok(_) => {
modification.put_slru_page_image(
@@ -237,19 +241,20 @@ fn import_slru<Reader: Read>(
/// Scan PostgreSQL WAL files in given directory and load all records between
/// 'startpoint' and 'endpoint' into the repository.
fn import_wal(
async fn import_wal(
walpath: &Path,
tline: &Timeline,
startpoint: Lsn,
endpoint: Lsn,
) -> anyhow::Result<()> {
use std::io::Read;
let mut waldecoder = WalStreamDecoder::new(startpoint, tline.pg_version);
let mut segno = startpoint.segment_number(WAL_SEGMENT_SIZE);
let mut offset = startpoint.segment_offset(WAL_SEGMENT_SIZE);
let mut last_lsn = startpoint;
let mut walingest = WalIngest::new(tline, startpoint).no_ondemand_download()?;
let mut walingest = WalIngest::new(tline, startpoint).await?;
while last_lsn <= endpoint {
// FIXME: assume postgresql tli 1 for now
@@ -265,10 +270,11 @@ fn import_wal(
}
// Slurp the WAL file
let mut file = File::open(&path)?;
let mut file = std::fs::File::open(&path)?;
if offset > 0 {
file.seek(SeekFrom::Start(offset as u64))?;
use std::io::Seek;
file.seek(std::io::SeekFrom::Start(offset as u64))?;
}
let nread = file.read_to_end(&mut buf)?;
@@ -286,7 +292,7 @@ fn import_wal(
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
walingest
.ingest_record(recdata, lsn, &mut modification, &mut decoded)
.no_ondemand_download()?;
.await?;
last_lsn = lsn;
nrecords += 1;
@@ -310,9 +316,9 @@ fn import_wal(
Ok(())
}
pub fn import_basebackup_from_tar<Reader: Read>(
pub async fn import_basebackup_from_tar(
tline: &Timeline,
reader: Reader,
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
base_lsn: Lsn,
) -> Result<()> {
info!("importing base at {base_lsn}");
@@ -322,21 +328,24 @@ pub fn import_basebackup_from_tar<Reader: Read>(
let mut pg_control: Option<ControlFileData> = None;
// Import base
for base_tar_entry in tar::Archive::new(reader).entries()? {
let entry = base_tar_entry?;
let mut entries = Archive::new(reader).entries()?;
while let Some(base_tar_entry) = entries.next().await {
let mut entry = base_tar_entry?;
let header = entry.header();
let len = header.entry_size()? as usize;
let file_path = header.path()?.into_owned();
match header.entry_type() {
tar::EntryType::Regular => {
if let Some(res) = import_file(&mut modification, file_path.as_ref(), entry, len)? {
tokio_tar::EntryType::Regular => {
if let Some(res) =
import_file(&mut modification, file_path.as_ref(), &mut entry, len).await?
{
// We found the pg_control file.
pg_control = Some(res);
}
modification.flush()?;
}
tar::EntryType::Directory => {
tokio_tar::EntryType::Directory => {
debug!("directory {:?}", file_path);
}
_ => {
@@ -356,9 +365,9 @@ pub fn import_basebackup_from_tar<Reader: Read>(
Ok(())
}
pub fn import_wal_from_tar<Reader: Read>(
pub async fn import_wal_from_tar(
tline: &Timeline,
reader: Reader,
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
start_lsn: Lsn,
end_lsn: Lsn,
) -> Result<()> {
@@ -367,20 +376,23 @@ pub fn import_wal_from_tar<Reader: Read>(
let mut segno = start_lsn.segment_number(WAL_SEGMENT_SIZE);
let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE);
let mut last_lsn = start_lsn;
let mut walingest = WalIngest::new(tline, start_lsn).no_ondemand_download()?;
let mut walingest = WalIngest::new(tline, start_lsn).await?;
// Ingest wal until end_lsn
info!("importing wal until {}", end_lsn);
let mut pg_wal_tar = tar::Archive::new(reader);
let mut pg_wal_entries_iter = pg_wal_tar.entries()?;
let mut pg_wal_tar = Archive::new(reader);
let mut pg_wal_entries = pg_wal_tar.entries()?;
while last_lsn <= end_lsn {
let bytes = {
let entry = pg_wal_entries_iter.next().expect("expected more wal")?;
let mut entry = pg_wal_entries
.next()
.await
.ok_or_else(|| anyhow::anyhow!("expected more wal"))??;
let header = entry.header();
let file_path = header.path()?.into_owned();
match header.entry_type() {
tar::EntryType::Regular => {
tokio_tar::EntryType::Regular => {
// FIXME: assume postgresql tli 1 for now
let expected_filename = XLogFileName(1, segno, WAL_SEGMENT_SIZE);
let file_name = file_path
@@ -390,9 +402,9 @@ pub fn import_wal_from_tar<Reader: Read>(
ensure!(expected_filename == file_name);
debug!("processing wal file {:?}", file_path);
read_all_bytes(entry)?
read_all_bytes(&mut entry).await?
}
tar::EntryType::Directory => {
tokio_tar::EntryType::Directory => {
debug!("directory {:?}", file_path);
continue;
}
@@ -414,7 +426,7 @@ pub fn import_wal_from_tar<Reader: Read>(
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
walingest
.ingest_record(recdata, lsn, &mut modification, &mut decoded)
.no_ondemand_download()?;
.await?;
last_lsn = lsn;
debug!("imported record at {} (end {})", lsn, end_lsn);
@@ -433,7 +445,7 @@ pub fn import_wal_from_tar<Reader: Read>(
}
// Log any extra unused files
for e in &mut pg_wal_entries_iter {
while let Some(e) = pg_wal_entries.next().await {
let entry = e?;
let header = entry.header();
let file_path = header.path()?.into_owned();
@@ -443,10 +455,10 @@ pub fn import_wal_from_tar<Reader: Read>(
Ok(())
}
fn import_file<Reader: Read>(
modification: &mut DatadirModification,
async fn import_file(
modification: &mut DatadirModification<'_>,
file_path: &Path,
reader: Reader,
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
len: usize,
) -> Result<Option<ControlFileData>> {
let file_name = match file_path.file_name() {
@@ -466,7 +478,7 @@ fn import_file<Reader: Read>(
match file_name.as_ref() {
"pg_control" => {
let bytes = read_all_bytes(reader)?;
let bytes = read_all_bytes(reader).await?;
// Extract the checkpoint record and import it separately.
let pg_control = ControlFileData::decode(&bytes[..])?;
@@ -479,7 +491,7 @@ fn import_file<Reader: Read>(
return Ok(Some(pg_control));
}
"pg_filenode.map" => {
let bytes = read_all_bytes(reader)?;
let bytes = read_all_bytes(reader).await?;
modification.put_relmap_file(spcnode, dbnode, bytes)?;
debug!("imported relmap file")
}
@@ -487,7 +499,7 @@ fn import_file<Reader: Read>(
debug!("ignored PG_VERSION file");
}
_ => {
import_rel(modification, file_path, spcnode, dbnode, reader, len)?;
import_rel(modification, file_path, spcnode, dbnode, reader, len).await?;
debug!("imported rel creation");
}
}
@@ -502,7 +514,7 @@ fn import_file<Reader: Read>(
match file_name.as_ref() {
"pg_filenode.map" => {
let bytes = read_all_bytes(reader)?;
let bytes = read_all_bytes(reader).await?;
modification.put_relmap_file(spcnode, dbnode, bytes)?;
debug!("imported relmap file")
}
@@ -510,36 +522,36 @@ fn import_file<Reader: Read>(
debug!("ignored PG_VERSION file");
}
_ => {
import_rel(modification, file_path, spcnode, dbnode, reader, len)?;
import_rel(modification, file_path, spcnode, dbnode, reader, len).await?;
debug!("imported rel creation");
}
}
} else if file_path.starts_with("pg_xact") {
let slru = SlruKind::Clog;
import_slru(modification, slru, file_path, reader, len)?;
import_slru(modification, slru, file_path, reader, len).await?;
debug!("imported clog slru");
} else if file_path.starts_with("pg_multixact/offsets") {
let slru = SlruKind::MultiXactOffsets;
import_slru(modification, slru, file_path, reader, len)?;
import_slru(modification, slru, file_path, reader, len).await?;
debug!("imported multixact offsets slru");
} else if file_path.starts_with("pg_multixact/members") {
let slru = SlruKind::MultiXactMembers;
import_slru(modification, slru, file_path, reader, len)?;
import_slru(modification, slru, file_path, reader, len).await?;
debug!("imported multixact members slru");
} else if file_path.starts_with("pg_twophase") {
let xid = u32::from_str_radix(file_name.as_ref(), 16)?;
let bytes = read_all_bytes(reader)?;
let bytes = read_all_bytes(reader).await?;
modification.put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]))?;
debug!("imported twophase file");
} else if file_path.starts_with("pg_wal") {
debug!("found wal file in base section. ignore it");
} else if file_path.starts_with("zenith.signal") {
// Parse zenith signal file to set correct previous LSN
let bytes = read_all_bytes(reader)?;
let bytes = read_all_bytes(reader).await?;
// zenith.signal format is "PREV LSN: prev_lsn"
// TODO write serialization and deserialization in the same place.
let zenith_signal = std::str::from_utf8(&bytes)?.trim();
@@ -576,8 +588,8 @@ fn import_file<Reader: Read>(
Ok(None)
}
fn read_all_bytes<Reader: Read>(mut reader: Reader) -> Result<Bytes> {
async fn read_all_bytes(reader: &mut (impl AsyncRead + Send + Sync + Unpin)) -> Result<Bytes> {
let mut buf: Vec<u8> = vec![];
reader.read_to_end(&mut buf)?;
reader.read_to_end(&mut buf).await?;
Ok(Bytes::copy_from_slice(&buf[..]))
}

View File

@@ -9,7 +9,6 @@ pub(crate) mod metrics;
pub mod page_cache;
pub mod page_service;
pub mod pgdatadir_mapping;
pub mod profiling;
pub mod repository;
pub mod task_mgr;
pub mod tenant;

View File

@@ -209,15 +209,34 @@ pub static NUM_ONDISK_LAYERS: Lazy<IntGauge> = Lazy::new(|| {
// remote storage metrics
static REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS: Lazy<IntGaugeVec> = Lazy::new(|| {
/// NB: increment _after_ recording the current value into [`REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST`].
static REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_remote_upload_queue_unfinished_tasks",
"Number of tasks in the upload queue that are not finished yet.",
"pageserver_remote_timeline_client_calls_unfinished",
"Number of ongoing calls to remote timeline client. \
Used to populate pageserver_remote_timeline_client_calls_started. \
This metric is not useful for sampling from Prometheus, but useful in tests.",
&["tenant_id", "timeline_id", "file_kind", "op_kind"],
)
.expect("failed to define a metric")
});
static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_remote_timeline_client_calls_started",
"When calling a remote timeline client method, we record the current value \
of the calls_unfinished gauge in this histogram. Plot the histogram \
over time in a heatmap to visualize how many operations were ongoing \
at a given instant. It gives you a better idea of the queue depth \
than plotting the gauge directly, since operations may complete faster \
than the sampling interval.",
&["tenant_id", "timeline_id", "file_kind", "op_kind"],
// The calls_unfinished gauge is an integer gauge, hence we have integer buckets.
vec![0.0, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0, 15.0, 20.0, 40.0, 60.0, 80.0, 100.0, 500.0],
)
.expect("failed to define a metric")
});
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RemoteOpKind {
Upload,
@@ -248,15 +267,12 @@ impl RemoteOpFileKind {
}
}
pub static REMOTE_OPERATION_KINDS: &[&str] = &["upload", "download", "delete"];
pub static REMOTE_OPERATION_FILE_KINDS: &[&str] = &["layer", "index"];
pub static REMOTE_OPERATION_STATUSES: &[&str] = &["success", "failure"];
pub static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_remote_operation_seconds",
"Time spent on remote storage operations. \
Grouped by tenant, timeline, operation_kind and status",
Grouped by tenant, timeline, operation_kind and status. \
Does not account for time spent waiting in remote timeline client's queues.",
&["tenant_id", "timeline_id", "file_kind", "op_kind", "status"]
)
.expect("failed to define a metric")
@@ -475,21 +491,6 @@ impl Drop for TimelineMetrics {
for op in SMGR_QUERY_TIME_OPERATIONS {
let _ = SMGR_QUERY_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
}
let _ = REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS.remove_label_values(&[tenant_id, timeline_id]);
for file_kind in REMOTE_OPERATION_FILE_KINDS {
for op in REMOTE_OPERATION_KINDS {
for status in REMOTE_OPERATION_STATUSES {
let _ = REMOTE_OPERATION_TIME.remove_label_values(&[
tenant_id,
timeline_id,
file_kind,
op,
status,
]);
}
}
}
}
}
@@ -510,7 +511,8 @@ pub struct RemoteTimelineClientMetrics {
timeline_id: String,
remote_physical_size_gauge: Mutex<Option<UIntGauge>>,
remote_operation_time: Mutex<HashMap<(&'static str, &'static str, &'static str), Histogram>>,
unfinished_tasks: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
calls_started_hist: Mutex<HashMap<(&'static str, &'static str), Histogram>>,
}
impl RemoteTimelineClientMetrics {
@@ -519,7 +521,8 @@ impl RemoteTimelineClientMetrics {
tenant_id: tenant_id.to_string(),
timeline_id: timeline_id.to_string(),
remote_operation_time: Mutex::new(HashMap::default()),
unfinished_tasks: Mutex::new(HashMap::default()),
calls_unfinished_gauge: Mutex::new(HashMap::default()),
calls_started_hist: Mutex::new(HashMap::default()),
remote_physical_size_gauge: Mutex::new(None),
}
}
@@ -558,16 +561,37 @@ impl RemoteTimelineClientMetrics {
});
metric.clone()
}
pub fn unfinished_tasks(
fn calls_unfinished_gauge(
&self,
file_kind: &RemoteOpFileKind,
op_kind: &RemoteOpKind,
) -> IntGauge {
// XXX would be nice to have an upgradable RwLock
let mut guard = self.unfinished_tasks.lock().unwrap();
let mut guard = self.calls_unfinished_gauge.lock().unwrap();
let key = (file_kind.as_str(), op_kind.as_str());
let metric = guard.entry(key).or_insert_with(move || {
REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS
REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE
.get_metric_with_label_values(&[
&self.tenant_id.to_string(),
&self.timeline_id.to_string(),
key.0,
key.1,
])
.unwrap()
});
metric.clone()
}
fn calls_started_hist(
&self,
file_kind: &RemoteOpFileKind,
op_kind: &RemoteOpKind,
) -> Histogram {
// XXX would be nice to have an upgradable RwLock
let mut guard = self.calls_started_hist.lock().unwrap();
let key = (file_kind.as_str(), op_kind.as_str());
let metric = guard.entry(key).or_insert_with(move || {
REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
.get_metric_with_label_values(&[
&self.tenant_id.to_string(),
&self.timeline_id.to_string(),
@@ -580,6 +604,58 @@ impl RemoteTimelineClientMetrics {
}
}
/// See [`RemoteTimelineClientMetrics::call_begin`].
#[must_use]
pub(crate) struct RemoteTimelineClientCallMetricGuard(Option<IntGauge>);
impl RemoteTimelineClientCallMetricGuard {
/// Consume this guard object without decrementing the metric.
/// The caller vouches to do this manually, so that the prior increment of the gauge will cancel out.
pub fn will_decrement_manually(mut self) {
self.0 = None; // prevent drop() from decrementing
}
}
impl Drop for RemoteTimelineClientCallMetricGuard {
fn drop(&mut self) {
if let RemoteTimelineClientCallMetricGuard(Some(guard)) = self {
guard.dec();
}
}
}
impl RemoteTimelineClientMetrics {
/// Increment the metrics that track ongoing calls to the remote timeline client instance.
///
/// Drop the returned guard object once the operation is finished to decrement the values.
/// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`] if that
/// is more suitable.
/// Never do both.
pub(crate) fn call_begin(
&self,
file_kind: &RemoteOpFileKind,
op_kind: &RemoteOpKind,
) -> RemoteTimelineClientCallMetricGuard {
let unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
self.calls_started_hist(file_kind, op_kind)
.observe(unfinished_metric.get() as f64);
unfinished_metric.inc();
RemoteTimelineClientCallMetricGuard(Some(unfinished_metric))
}
/// Manually decrement the metric instead of using the guard object.
/// Using the guard object is generally preferable.
/// See [`call_begin`] for more context.
pub(crate) fn call_end(&self, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind) {
let unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
debug_assert!(
unfinished_metric.get() > 0,
"begin and end should cancel out"
);
unfinished_metric.dec();
}
}
impl Drop for RemoteTimelineClientMetrics {
fn drop(&mut self) {
let RemoteTimelineClientMetrics {
@@ -587,13 +663,22 @@ impl Drop for RemoteTimelineClientMetrics {
timeline_id,
remote_physical_size_gauge,
remote_operation_time,
unfinished_tasks,
calls_unfinished_gauge,
calls_started_hist,
} = self;
for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() {
let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]);
}
for ((a, b), _) in unfinished_tasks.get_mut().unwrap().drain() {
let _ = REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS.remove_label_values(&[
for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() {
let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[
tenant_id,
timeline_id,
a,
b,
]);
}
for ((a, b), _) in calls_started_hist.get_mut().unwrap().drain() {
let _ = REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST.remove_label_values(&[
tenant_id,
timeline_id,
a,

View File

@@ -9,7 +9,7 @@
// custom protocol.
//
use anyhow::{bail, ensure, Context, Result};
use anyhow::Context;
use bytes::Buf;
use bytes::Bytes;
use futures::{Stream, StreamExt};
@@ -19,6 +19,8 @@ use pageserver_api::models::{
PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
PagestreamNblocksRequest, PagestreamNblocksResponse,
};
use pq_proto::ConnectionError;
use pq_proto::FeStartupPacket;
use pq_proto::{BeMessage, FeMessage, RowDescriptor};
use std::io;
use std::net::TcpListener;
@@ -26,11 +28,9 @@ use std::str;
use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use tokio::pin;
use tokio_util::io::StreamReader;
use tokio_util::io::SyncIoBridge;
use tracing::*;
use utils::id::ConnectionId;
use utils::postgres_backend_async::QueryError;
use utils::{
auth::{Claims, JwtAuth, Scope},
id::{TenantId, TimelineId},
@@ -42,10 +42,9 @@ use utils::{
use crate::auth::check_permission;
use crate::basebackup;
use crate::config::{PageServerConf, ProfilingConfig};
use crate::config::PageServerConf;
use crate::import_datadir::import_wal_from_tar;
use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
use crate::profiling::profpoint_start;
use crate::task_mgr;
use crate::task_mgr::TaskKind;
use crate::tenant::mgr;
@@ -64,8 +63,8 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
_ = task_mgr::shutdown_watcher() => {
// We were requested to shut down.
let msg = format!("pageserver is shutting down");
let _ = pgb.write_message(&BeMessage::ErrorResponse(&msg));
Err(anyhow::anyhow!(msg))
let _ = pgb.write_message(&BeMessage::ErrorResponse(&msg, None));
Err(QueryError::Other(anyhow::anyhow!(msg)))
}
msg = pgb.read_message() => { msg }
@@ -78,14 +77,15 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
FeMessage::CopyDone => { break },
FeMessage::Sync => continue,
FeMessage::Terminate => {
let msg = format!("client terminated connection with Terminate message during COPY");
pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
let msg = "client terminated connection with Terminate message during COPY";
let query_error_error = QueryError::Disconnected(ConnectionError::Socket(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
pgb.write_message(&BeMessage::ErrorResponse(msg, Some(query_error_error.pg_error_code())))?;
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
break;
}
m => {
let msg = format!("unexpected message {:?}", m);
pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
let msg = format!("unexpected message {m:?}");
pgb.write_message(&BeMessage::ErrorResponse(&msg, None))?;
Err(io::Error::new(io::ErrorKind::Other, msg))?;
break;
}
@@ -95,12 +95,16 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
}
Ok(None) => {
let msg = "client closed connection during COPY";
pgb.write_message(&BeMessage::ErrorResponse(msg))?;
let query_error_error = QueryError::Disconnected(ConnectionError::Socket(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
pgb.write_message(&BeMessage::ErrorResponse(msg, Some(query_error_error.pg_error_code())))?;
pgb.flush().await?;
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
}
Err(e) => {
Err(io::Error::new(io::ErrorKind::Other, e))?;
Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => {
Err(io_error)?;
}
Err(other) => {
Err(io::Error::new(io::ErrorKind::Other, other))?;
}
};
}
@@ -198,23 +202,19 @@ async fn page_service_conn_main(
// we've been requested to shut down
Ok(())
}
Err(err) => {
let root_cause_io_err_kind = err
.root_cause()
.downcast_ref::<io::Error>()
.map(|e| e.kind());
Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => {
// `ConnectionReset` error happens when the Postgres client closes the connection.
// As this disconnection happens quite often and is expected,
// we decided to downgrade the logging level to `INFO`.
// See: https://github.com/neondatabase/neon/issues/1683.
if root_cause_io_err_kind == Some(io::ErrorKind::ConnectionReset) {
if io_error.kind() == io::ErrorKind::ConnectionReset {
info!("Postgres client disconnected");
Ok(())
} else {
Err(err)
Err(io_error).context("Postgres connection error")
}
}
other => other.context("Postgres query error"),
}
}
@@ -253,7 +253,7 @@ impl PageRequestMetrics {
#[derive(Debug)]
struct PageServerHandler {
conf: &'static PageServerConf,
_conf: &'static PageServerConf,
auth: Option<Arc<JwtAuth>>,
claims: Option<Claims>,
}
@@ -261,7 +261,7 @@ struct PageServerHandler {
impl PageServerHandler {
pub fn new(conf: &'static PageServerConf, auth: Option<Arc<JwtAuth>>) -> Self {
PageServerHandler {
conf,
_conf: conf,
auth,
claims: None,
}
@@ -316,7 +316,7 @@ impl PageServerHandler {
Some(FeMessage::CopyData(bytes)) => bytes,
Some(FeMessage::Terminate) => break,
Some(m) => {
bail!("unexpected message: {m:?} during COPY");
anyhow::bail!("unexpected message: {m:?} during COPY");
}
None => break, // client disconnected
};
@@ -373,7 +373,7 @@ impl PageServerHandler {
base_lsn: Lsn,
_end_lsn: Lsn,
pg_version: u32,
) -> anyhow::Result<()> {
) -> Result<(), QueryError> {
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
// Create empty timeline
info!("creating new timeline");
@@ -395,9 +395,7 @@ impl PageServerHandler {
pgb.write_message(&BeMessage::CopyInResponse)?;
pgb.flush().await?;
let copyin_stream = copyin_stream(pgb);
pin!(copyin_stream);
let mut copyin_stream = Box::pin(copyin_stream(pgb));
timeline
.import_basebackup_from_tar(&mut copyin_stream, base_lsn)
.await?;
@@ -429,11 +427,16 @@ impl PageServerHandler {
timeline_id: TimelineId,
start_lsn: Lsn,
end_lsn: Lsn,
) -> anyhow::Result<()> {
) -> Result<(), QueryError> {
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?;
ensure!(timeline.get_last_record_lsn() == start_lsn);
let last_record_lsn = timeline.get_last_record_lsn();
if last_record_lsn != start_lsn {
return Err(QueryError::Other(
anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))
);
}
// TODO leave clean state on error. For now you can use detach to clean
// up broken state from a failed import.
@@ -443,8 +446,8 @@ impl PageServerHandler {
pgb.write_message(&BeMessage::CopyInResponse)?;
pgb.flush().await?;
let mut copyin_stream = Box::pin(copyin_stream(pgb));
let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream));
tokio::task::block_in_place(|| import_wal_from_tar(&timeline, reader, start_lsn, end_lsn))?;
let mut reader = tokio_util::io::StreamReader::new(&mut copyin_stream);
import_wal_from_tar(&timeline, &mut reader, start_lsn, end_lsn).await?;
info!("wal import complete");
// Drain the rest of the Copy data
@@ -457,7 +460,11 @@ impl PageServerHandler {
}
// TODO Does it make sense to overshoot?
ensure!(timeline.get_last_record_lsn() >= end_lsn);
if timeline.get_last_record_lsn() < end_lsn {
return Err(QueryError::Other(
anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))
);
}
// Flush data to disk, then upload to s3. No need for a forced checkpoint.
// We only want to persist the data, and it doesn't matter if it's in the
@@ -486,7 +493,7 @@ impl PageServerHandler {
mut lsn: Lsn,
latest: bool,
latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
) -> Result<Lsn> {
) -> anyhow::Result<Lsn> {
if latest {
// Latest page version was requested. If LSN is given, it is a hint
// to the page server that there have been no modifications to the
@@ -517,11 +524,11 @@ impl PageServerHandler {
}
} else {
if lsn == Lsn(0) {
bail!("invalid LSN(0) in request");
anyhow::bail!("invalid LSN(0) in request");
}
timeline.wait_lsn(lsn).await?;
}
ensure!(
anyhow::ensure!(
lsn >= **latest_gc_cutoff_lsn,
"tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
lsn, **latest_gc_cutoff_lsn
@@ -534,7 +541,7 @@ impl PageServerHandler {
&self,
timeline: &Timeline,
req: &PagestreamExistsRequest,
) -> Result<PagestreamBeMessage> {
) -> anyhow::Result<PagestreamBeMessage> {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
.await?;
@@ -554,7 +561,7 @@ impl PageServerHandler {
&self,
timeline: &Timeline,
req: &PagestreamNblocksRequest,
) -> Result<PagestreamBeMessage> {
) -> anyhow::Result<PagestreamBeMessage> {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
.await?;
@@ -574,7 +581,7 @@ impl PageServerHandler {
&self,
timeline: &Timeline,
req: &PagestreamDbSizeRequest,
) -> Result<PagestreamBeMessage> {
) -> anyhow::Result<PagestreamBeMessage> {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
.await?;
@@ -595,7 +602,7 @@ impl PageServerHandler {
&self,
timeline: &Timeline,
req: &PagestreamGetPageRequest,
) -> Result<PagestreamBeMessage> {
) -> anyhow::Result<PagestreamBeMessage> {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
.await?;
@@ -609,10 +616,6 @@ impl PageServerHandler {
*/
let page = crate::tenant::with_ondemand_download(|| {
// FIXME: this profiling now happens at different place than it used to. The
// current profiling is based on a thread-local variable, so it doesn't work
// across awaits
let _profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests);
timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)
})
.await?;
@@ -649,16 +652,12 @@ impl PageServerHandler {
pgb.flush().await?;
/* Send a tarball of the latest layer on the timeline */
let mut writer = CopyDataSink {
pgb,
rt: tokio::runtime::Handle::current(),
};
tokio::task::block_in_place(|| {
let basebackup =
basebackup::Basebackup::new(&mut writer, &timeline, lsn, prev_lsn, full_backup)?;
tracing::Span::current().record("lsn", basebackup.lsn.to_string().as_str());
basebackup.send_tarball()
})?;
{
let mut writer = pgb.copyout_writer();
basebackup::send_basebackup_tarball(&mut writer, &timeline, lsn, prev_lsn, full_backup)
.await?;
}
pgb.write_message(&BeMessage::CopyDone)?;
pgb.flush().await?;
info!("basebackup complete");
@@ -668,7 +667,7 @@ impl PageServerHandler {
// when accessing management api supply None as an argument
// when using to authorize tenant pass corresponding tenant id
fn check_permission(&self, tenant_id: Option<TenantId>) -> Result<()> {
fn check_permission(&self, tenant_id: Option<TenantId>) -> anyhow::Result<()> {
if self.auth.is_none() {
// auth is set to Trust, nothing to check so just return ok
return Ok(());
@@ -690,20 +689,19 @@ impl postgres_backend_async::Handler for PageServerHandler {
&mut self,
_pgb: &mut PostgresBackend,
jwt_response: &[u8],
) -> anyhow::Result<()> {
) -> Result<(), QueryError> {
// this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT
// which requires auth to be present
let data = self
.auth
.as_ref()
.unwrap()
.decode(str::from_utf8(jwt_response)?)?;
.decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;
if matches!(data.claims.scope, Scope::Tenant) {
ensure!(
data.claims.tenant_id.is_some(),
if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() {
return Err(QueryError::Other(anyhow::anyhow!(
"jwt token scope is Tenant, but tenant id is missing"
)
)));
}
info!(
@@ -715,22 +713,33 @@ impl postgres_backend_async::Handler for PageServerHandler {
Ok(())
}
fn startup(
&mut self,
_pgb: &mut PostgresBackend,
_sm: &FeStartupPacket,
) -> Result<(), QueryError> {
Ok(())
}
async fn process_query(
&mut self,
pgb: &mut PostgresBackend,
query_string: &str,
) -> anyhow::Result<()> {
debug!("process query {:?}", query_string);
) -> Result<(), QueryError> {
debug!("process query {query_string:?}");
if query_string.starts_with("pagestream ") {
let (_, params_raw) = query_string.split_at("pagestream ".len());
let params = params_raw.split(' ').collect::<Vec<_>>();
ensure!(
params.len() == 2,
"invalid param number for pagestream command"
);
let tenant_id = TenantId::from_str(params[0])?;
let timeline_id = TimelineId::from_str(params[1])?;
if params.len() != 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for pagestream command"
)));
}
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
self.check_permission(Some(tenant_id))?;
@@ -740,18 +749,24 @@ impl postgres_backend_async::Handler for PageServerHandler {
let (_, params_raw) = query_string.split_at("basebackup ".len());
let params = params_raw.split_whitespace().collect::<Vec<_>>();
ensure!(
params.len() >= 2,
"invalid param number for basebackup command"
);
if params.len() < 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for basebackup command"
)));
}
let tenant_id = TenantId::from_str(params[0])?;
let timeline_id = TimelineId::from_str(params[1])?;
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
self.check_permission(Some(tenant_id))?;
let lsn = if params.len() == 3 {
Some(Lsn::from_str(params[2])?)
Some(
Lsn::from_str(params[2])
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
)
} else {
None
};
@@ -766,13 +781,16 @@ impl postgres_backend_async::Handler for PageServerHandler {
let (_, params_raw) = query_string.split_at("get_last_record_rlsn ".len());
let params = params_raw.split_whitespace().collect::<Vec<_>>();
ensure!(
params.len() == 2,
"invalid param number for get_last_record_rlsn command"
);
if params.len() != 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for get_last_record_rlsn command"
)));
}
let tenant_id = TenantId::from_str(params[0])?;
let timeline_id = TimelineId::from_str(params[1])?;
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
self.check_permission(Some(tenant_id))?;
let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?;
@@ -794,22 +812,31 @@ impl postgres_backend_async::Handler for PageServerHandler {
let (_, params_raw) = query_string.split_at("fullbackup ".len());
let params = params_raw.split_whitespace().collect::<Vec<_>>();
ensure!(
params.len() >= 2,
"invalid param number for fullbackup command"
);
if params.len() < 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for fullbackup command"
)));
}
let tenant_id = TenantId::from_str(params[0])?;
let timeline_id = TimelineId::from_str(params[1])?;
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
// The caller is responsible for providing correct lsn and prev_lsn.
let lsn = if params.len() > 2 {
Some(Lsn::from_str(params[2])?)
Some(
Lsn::from_str(params[2])
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
)
} else {
None
};
let prev_lsn = if params.len() > 3 {
Some(Lsn::from_str(params[3])?)
Some(
Lsn::from_str(params[3])
.with_context(|| format!("Failed to parse Lsn from {}", params[3]))?,
)
} else {
None
};
@@ -834,12 +861,21 @@ impl postgres_backend_async::Handler for PageServerHandler {
// -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION"
let (_, params_raw) = query_string.split_at("import basebackup ".len());
let params = params_raw.split_whitespace().collect::<Vec<_>>();
ensure!(params.len() == 5);
let tenant_id = TenantId::from_str(params[0])?;
let timeline_id = TimelineId::from_str(params[1])?;
let base_lsn = Lsn::from_str(params[2])?;
let end_lsn = Lsn::from_str(params[3])?;
let pg_version = u32::from_str(params[4])?;
if params.len() != 5 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for import basebackup command"
)));
}
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
let base_lsn = Lsn::from_str(params[2])
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
let end_lsn = Lsn::from_str(params[3])
.with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
let pg_version = u32::from_str(params[4])
.with_context(|| format!("Failed to parse pg_version from {}", params[4]))?;
self.check_permission(Some(tenant_id))?;
@@ -857,7 +893,10 @@ impl postgres_backend_async::Handler for PageServerHandler {
Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?,
Err(e) => {
error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}");
pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))?
pgb.write_message(&BeMessage::ErrorResponse(
&e.to_string(),
Some(e.pg_error_code()),
))?
}
};
} else if query_string.starts_with("import wal ") {
@@ -867,11 +906,19 @@ impl postgres_backend_async::Handler for PageServerHandler {
// caller should poll the http api to check when that is done.
let (_, params_raw) = query_string.split_at("import wal ".len());
let params = params_raw.split_whitespace().collect::<Vec<_>>();
ensure!(params.len() == 4);
let tenant_id = TenantId::from_str(params[0])?;
let timeline_id = TimelineId::from_str(params[1])?;
let start_lsn = Lsn::from_str(params[2])?;
let end_lsn = Lsn::from_str(params[3])?;
if params.len() != 4 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for import wal command"
)));
}
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
let start_lsn = Lsn::from_str(params[2])
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
let end_lsn = Lsn::from_str(params[3])
.with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
self.check_permission(Some(tenant_id))?;
@@ -882,7 +929,10 @@ impl postgres_backend_async::Handler for PageServerHandler {
Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?,
Err(e) => {
error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}");
pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))?
pgb.write_message(&BeMessage::ErrorResponse(
&e.to_string(),
Some(e.pg_error_code()),
))?
}
};
} else if query_string.to_ascii_lowercase().starts_with("set ") {
@@ -893,8 +943,13 @@ impl postgres_backend_async::Handler for PageServerHandler {
// show <tenant_id>
let (_, params_raw) = query_string.split_at("show ".len());
let params = params_raw.split(' ').collect::<Vec<_>>();
ensure!(params.len() == 1, "invalid param number for config command");
let tenant_id = TenantId::from_str(params[0])?;
if params.len() != 1 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for config command"
)));
}
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
self.check_permission(Some(tenant_id))?;
@@ -935,7 +990,9 @@ impl postgres_backend_async::Handler for PageServerHandler {
]))?
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else {
bail!("unknown command");
return Err(QueryError::Other(anyhow::anyhow!(
"unknown command {query_string}"
)));
}
Ok(())
@@ -947,7 +1004,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
/// If the tenant is Loading, waits for it to become Active, for up to 30 s. That
/// ensures that queries don't fail immediately after pageserver startup, because
/// all tenants are still loading.
async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> Result<Arc<Tenant>> {
async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> anyhow::Result<Arc<Tenant>> {
let tenant = mgr::get_tenant(tenant_id, false).await?;
match tokio::time::timeout(Duration::from_secs(30), tenant.wait_to_become_active()).await {
Ok(wait_result) => wait_result
@@ -961,37 +1018,8 @@ async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> Result<Arc<Tenan
async fn get_active_timeline_with_timeout(
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<Arc<Timeline>> {
) -> anyhow::Result<Arc<Timeline>> {
get_active_tenant_with_timeout(tenant_id)
.await
.and_then(|tenant| tenant.get_timeline(timeline_id, true))
}
///
/// A std::io::Write implementation that wraps all data written to it in CopyData
/// messages.
///
struct CopyDataSink<'a> {
pgb: &'a mut PostgresBackend,
rt: tokio::runtime::Handle,
}
impl<'a> io::Write for CopyDataSink<'a> {
fn write(&mut self, data: &[u8]) -> io::Result<usize> {
// CopyData
// FIXME: if the input is large, we should split it into multiple messages.
// Not sure what the threshold should be, but the ultimate hard limit is that
// the length cannot exceed u32.
// FIXME: flush isn't really required, but makes it easier
// to view in wireshark
self.pgb.write_message(&BeMessage::CopyData(data))?;
self.rt.block_on(self.pgb.flush())?;
trace!("CopyData sent for {} bytes!", data.len());
Ok(data.len())
}
fn flush(&mut self) -> io::Result<()> {
// no-op
Ok(())
}
}

View File

@@ -1,107 +0,0 @@
//!
//! Support for profiling
//!
//! This relies on a modified version of the 'pprof-rs' crate. That's not very
//! nice, so to avoid a hard dependency on that, this is an optional feature.
//!
use crate::config::{PageServerConf, ProfilingConfig};
/// The actual implementation is in the `profiling_impl` submodule. If the profiling
/// feature is not enabled, it's just a dummy implementation that panics if you
/// try to enabled profiling in the configuration.
pub use profiling_impl::*;
#[cfg(feature = "profiling")]
mod profiling_impl {
use super::*;
use pprof;
use std::marker::PhantomData;
/// Start profiling the current thread. Returns a guard object;
/// the profiling continues until the guard is dropped.
///
/// Note: profiling is not re-entrant. If you call 'profpoint_start' while
/// profiling is already started, nothing happens, and the profiling will be
/// stopped when either guard object is dropped.
#[inline]
pub fn profpoint_start(
conf: &crate::config::PageServerConf,
point: ProfilingConfig,
) -> Option<ProfilingGuard> {
if conf.profiling == point {
pprof::start_profiling();
Some(ProfilingGuard(PhantomData))
} else {
None
}
}
/// A hack to remove Send and Sync from the ProfilingGuard. Because the
/// profiling is attached to current thread.
////
/// See comments in https://github.com/rust-lang/rust/issues/68318
type PhantomUnsend = std::marker::PhantomData<*mut u8>;
pub struct ProfilingGuard(PhantomUnsend);
impl Drop for ProfilingGuard {
fn drop(&mut self) {
pprof::stop_profiling();
}
}
/// Initialize the profiler. This must be called before any 'profpoint_start' calls.
pub fn init_profiler(conf: &PageServerConf) -> Option<pprof::ProfilerGuard> {
if conf.profiling != ProfilingConfig::Disabled {
Some(pprof::ProfilerGuardBuilder::default().build().unwrap())
} else {
None
}
}
/// Exit the profiler. Writes the flamegraph to current workdir.
pub fn exit_profiler(_conf: &PageServerConf, profiler_guard: &Option<pprof::ProfilerGuard>) {
// Write out the flamegraph
if let Some(profiler_guard) = profiler_guard {
if let Ok(report) = profiler_guard.report().build() {
// this gets written under the workdir
let file = std::fs::File::create("flamegraph.svg").unwrap();
let mut options = pprof::flamegraph::Options::default();
options.image_width = Some(2500);
report.flamegraph_with_options(file, &mut options).unwrap();
}
}
}
}
/// Dummy implementation when compiling without profiling feature or for non-linux OSes.
#[cfg(not(feature = "profiling"))]
mod profiling_impl {
use super::*;
pub struct DummyProfilerGuard;
impl Drop for DummyProfilerGuard {
fn drop(&mut self) {
// do nothing, this exists to calm Clippy down
}
}
pub fn profpoint_start(
_conf: &PageServerConf,
_point: ProfilingConfig,
) -> Option<DummyProfilerGuard> {
None
}
pub fn init_profiler(conf: &PageServerConf) -> Option<DummyProfilerGuard> {
if conf.profiling != ProfilingConfig::Disabled {
// shouldn't happen, we don't allow profiling in the config if the support
// for it is disabled.
panic!("profiling enabled but the binary was compiled without profiling support");
}
None
}
pub fn exit_profiler(_conf: &PageServerConf, _guard: &Option<DummyProfilerGuard>) {}
}

View File

@@ -13,13 +13,13 @@
use anyhow::{bail, Context};
use bytes::Bytes;
use futures::FutureExt;
use futures::Stream;
use pageserver_api::models::TimelineState;
use remote_storage::DownloadError;
use remote_storage::GenericRemoteStorage;
use tokio::sync::watch;
use tokio_util::io::StreamReader;
use tokio_util::io::SyncIoBridge;
use tokio::task::JoinSet;
use tracing::*;
use utils::crashsafe::path_with_suffix_extension;
@@ -36,7 +36,6 @@ use std::io::Write;
use std::ops::Bound::Included;
use std::path::Path;
use std::path::PathBuf;
use std::pin::Pin;
use std::process::Command;
use std::process::Stdio;
use std::sync::Arc;
@@ -239,21 +238,15 @@ impl UninitializedTimeline<'_> {
/// Prepares timeline data by loading it from the basebackup archive.
pub async fn import_basebackup_from_tar(
self,
mut copyin_stream: &mut Pin<&mut impl Stream<Item = io::Result<Bytes>>>,
copyin_stream: &mut (impl Stream<Item = io::Result<Bytes>> + Sync + Send + Unpin),
base_lsn: Lsn,
) -> anyhow::Result<Arc<Timeline>> {
let raw_timeline = self.raw_timeline()?;
// import_basebackup_from_tar() is not async, mainly because the Tar crate
// it uses is not async. So we need to jump through some hoops:
// - convert the input from client connection to a synchronous Read
// - use block_in_place()
let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream));
tokio::task::block_in_place(|| {
import_datadir::import_basebackup_from_tar(raw_timeline, reader, base_lsn)
.context("Failed to import basebackup")
})?;
let mut reader = tokio_util::io::StreamReader::new(copyin_stream);
import_datadir::import_basebackup_from_tar(raw_timeline, &mut reader, base_lsn)
.await
.context("Failed to import basebackup")?;
// Flush loop needs to be spawned in order to be able to flush.
// We want to run proper checkpoint before we mark timeline as available to outside world
@@ -606,7 +599,7 @@ impl Tenant {
match tenant_clone.attach().await {
Ok(_) => {}
Err(e) => {
tenant_clone.set_broken();
tenant_clone.set_broken(&e.to_string());
error!("error attaching tenant: {:?}", e);
}
}
@@ -651,26 +644,62 @@ impl Tenant {
.as_ref()
.ok_or_else(|| anyhow::anyhow!("cannot attach without remote storage"))?;
let remote_timelines = remote_timeline_client::list_remote_timelines(
let remote_timeline_ids = remote_timeline_client::list_remote_timelines(
remote_storage,
self.conf,
self.tenant_id,
)
.await?;
info!("found {} timelines", remote_timelines.len());
info!("found {} timelines", remote_timeline_ids.len());
let mut timeline_ancestors: HashMap<TimelineId, TimelineMetadata> = HashMap::new();
let mut index_parts: HashMap<TimelineId, IndexPart> = HashMap::new();
for (timeline_id, index_part) in remote_timelines {
let remote_metadata = index_part.parse_metadata().with_context(|| {
format!(
"Failed to parse metadata file from remote storage for tenant {} timeline {}",
self.tenant_id, timeline_id
)
})?;
// Download & parse index parts
let mut part_downloads = JoinSet::new();
for timeline_id in remote_timeline_ids {
let client = RemoteTimelineClient::new(
remote_storage.clone(),
self.conf,
self.tenant_id,
timeline_id,
);
part_downloads.spawn(
async move {
debug!("starting index part download");
let index_part = client
.download_index_file()
.await
.context("download index file")?;
let remote_metadata = index_part.parse_metadata().context("parse metadata")?;
debug!("finished index part download");
Result::<_, anyhow::Error>::Ok((
timeline_id,
client,
index_part,
remote_metadata,
))
}
.map(move |res| {
res.with_context(|| format!("download index part for timeline {timeline_id}"))
})
.instrument(info_span!("download_index_part", timeline=%timeline_id)),
);
}
// Wait for all the download tasks to complete & collect results.
let mut remote_clients = HashMap::new();
let mut index_parts = HashMap::new();
let mut timeline_ancestors = HashMap::new();
while let Some(result) = part_downloads.join_next().await {
// NB: we already added timeline_id as context to the error
let result: Result<_, anyhow::Error> = result.context("joinset task join")?;
let (timeline_id, client, index_part, remote_metadata) = result?;
debug!("successfully downloaded index part for timeline {timeline_id}");
timeline_ancestors.insert(timeline_id, remote_metadata);
index_parts.insert(timeline_id, index_part);
remote_clients.insert(timeline_id, client);
}
// For every timeline, download the metadata file, scan the local directory,
@@ -683,7 +712,7 @@ impl Tenant {
timeline_id,
index_parts.remove(&timeline_id).unwrap(),
remote_metadata,
remote_storage.clone(),
remote_clients.remove(&timeline_id).unwrap(),
)
.await
.with_context(|| {
@@ -726,22 +755,19 @@ impl Tenant {
Ok(size)
}
#[instrument(skip(self, index_part, remote_metadata, remote_storage), fields(timeline_id=%timeline_id))]
#[instrument(skip_all, fields(timeline_id=%timeline_id))]
async fn load_remote_timeline(
&self,
timeline_id: TimelineId,
index_part: IndexPart,
remote_metadata: TimelineMetadata,
remote_storage: GenericRemoteStorage,
remote_client: RemoteTimelineClient,
) -> anyhow::Result<()> {
info!("downloading index file for timeline {}", timeline_id);
tokio::fs::create_dir_all(self.conf.timeline_path(&timeline_id, &self.tenant_id))
.await
.context("Failed to create new timeline directory")?;
let remote_client =
RemoteTimelineClient::new(remote_storage, self.conf, self.tenant_id, timeline_id)?;
let ancestor = if let Some(ancestor_id) = remote_metadata.ancestor_timeline() {
let timelines = self.timelines.lock().unwrap();
Some(Arc::clone(timelines.get(&ancestor_id).ok_or_else(
@@ -837,7 +863,7 @@ impl Tenant {
match tenant_clone.load().await {
Ok(()) => {}
Err(err) => {
tenant_clone.set_broken();
tenant_clone.set_broken(&err.to_string());
error!("could not load tenant {tenant_id}: {err:?}");
}
}
@@ -998,18 +1024,14 @@ impl Tenant {
None
};
let remote_client = self
.remote_storage
.as_ref()
.map(|remote_storage| {
RemoteTimelineClient::new(
remote_storage.clone(),
self.conf,
self.tenant_id,
timeline_id,
)
})
.transpose()?;
let remote_client = self.remote_storage.as_ref().map(|remote_storage| {
RemoteTimelineClient::new(
remote_storage.clone(),
self.conf,
self.tenant_id,
timeline_id,
)
});
let remote_startup_data = match &remote_client {
Some(remote_client) => match remote_client.download_index_file().await {
@@ -1477,7 +1499,7 @@ impl Tenant {
});
}
pub fn set_broken(&self) {
pub fn set_broken(&self, reason: &str) {
self.state.send_modify(|current_state| {
match *current_state {
TenantState::Active => {
@@ -1486,18 +1508,22 @@ impl Tenant {
// activated should never be marked as broken. We cope with it the best
// we can, but it shouldn't happen.
*current_state = TenantState::Broken;
warn!("Changing Active tenant to Broken state");
warn!("Changing Active tenant to Broken state, reason: {}", reason);
}
TenantState::Broken => {
// This shouldn't happen either
warn!("Tenant is already broken");
warn!("Tenant is already in Broken state");
}
TenantState::Stopping => {
// This shouldn't happen either
*current_state = TenantState::Broken;
warn!("Marking Stopping tenant as Broken");
warn!(
"Marking Stopping tenant as Broken state, reason: {}",
reason
);
}
TenantState::Loading | TenantState::Attaching => {
info!("Setting tenant as Broken state, reason: {}", reason);
*current_state = TenantState::Broken;
}
}
@@ -1851,7 +1877,12 @@ impl Tenant {
utils::failpoint_sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
info!("starting on {} timelines", gc_timelines.len());
// If there is nothing to GC, we don't want any messages in the INFO log.
if !gc_timelines.is_empty() {
info!("{} timelines need GC", gc_timelines.len());
} else {
debug!("{} timelines need GC", gc_timelines.len());
}
// Perform GC for each timeline.
//
@@ -2142,13 +2173,12 @@ impl Tenant {
let tenant_id = raw_timeline.owning_tenant.tenant_id;
let unfinished_timeline = raw_timeline.raw_timeline()?;
tokio::task::block_in_place(|| {
import_datadir::import_timeline_from_postgres_datadir(
unfinished_timeline,
pgdata_path,
pgdata_lsn,
)
})
import_datadir::import_timeline_from_postgres_datadir(
unfinished_timeline,
pgdata_path,
pgdata_lsn,
)
.await
.with_context(|| {
format!("Failed to import pgdatadir for timeline {tenant_id}/{timeline_id}")
})?;
@@ -2204,7 +2234,7 @@ impl Tenant {
self.conf,
tenant_id,
new_timeline_id,
)?;
);
remote_client.init_upload_queue_for_empty_remote(&new_metadata)?;
Some(remote_client)
} else {

View File

@@ -30,7 +30,7 @@ pub mod defaults {
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
pub const DEFAULT_GC_PERIOD: &str = "100 s";
pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
pub const DEFAULT_PITR_INTERVAL: &str = "30 days";
pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds";
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "3 seconds";
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;

View File

@@ -87,6 +87,9 @@ where
/// contain the version, even if it's missing from the returned
/// layer.
///
/// NOTE: This only searches the 'historic' layers, *not* the
/// 'open' and 'frozen' layers!
///
pub fn search(&self, key: Key, end_lsn: Lsn) -> Option<SearchResult<L>> {
match self.index.query(key.to_i128(), end_lsn.0 - 1) {
(None, None) => None,

View File

@@ -430,7 +430,7 @@ where
Err(e) => {
let tenants_accessor = TENANTS.read().await;
match tenants_accessor.get(&tenant_id) {
Some(tenant) => tenant.set_broken(),
Some(tenant) => tenant.set_broken(&e.to_string()),
None => warn!("Tenant {tenant_id} got removed from memory"),
}
Err(e)
@@ -492,3 +492,53 @@ pub async fn immediate_gc(
Ok(wait_task_done)
}
#[cfg(feature = "testing")]
pub async fn immediate_compact(
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<tokio::sync::oneshot::Receiver<anyhow::Result<()>>, ApiError> {
let guard = TENANTS.read().await;
let tenant = guard
.get(&tenant_id)
.map(Arc::clone)
.with_context(|| format!("Tenant {tenant_id} not found"))
.map_err(ApiError::NotFound)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(ApiError::NotFound)?;
// Run in task_mgr to avoid race with detach operation
let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
task_mgr::spawn(
&tokio::runtime::Handle::current(),
TaskKind::Compaction,
Some(tenant_id),
Some(timeline_id),
&format!(
"timeline_compact_handler compaction run for tenant {tenant_id} timeline {timeline_id}"
),
false,
async move {
let result = timeline
.compact()
.instrument(
info_span!("manual_compact", tenant = %tenant_id, timeline = %timeline_id),
)
.await;
match task_done.send(result) {
Ok(_) => (),
Err(result) => error!("failed to send compaction result: {result:?}"),
}
Ok(())
},
);
// drop the guard until after we've spawned the task so that timeline shutdown will wait for the task
drop(guard);
Ok(wait_task_done)
}

View File

@@ -298,8 +298,8 @@ impl RemoteTimelineClient {
conf: &'static PageServerConf,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> anyhow::Result<RemoteTimelineClient> {
Ok(RemoteTimelineClient {
) -> RemoteTimelineClient {
RemoteTimelineClient {
conf,
runtime: &BACKGROUND_RUNTIME,
tenant_id,
@@ -307,7 +307,7 @@ impl RemoteTimelineClient {
storage_impl: remote_storage,
upload_queue: Mutex::new(UploadQueue::Uninitialized),
metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
})
}
}
/// Initialize the upload queue for a remote storage that already received
@@ -367,6 +367,10 @@ impl RemoteTimelineClient {
/// Download index file
pub async fn download_index_file(&self) -> Result<IndexPart, DownloadError> {
let _unfinished_gauge_guard = self
.metrics
.call_begin(&RemoteOpFileKind::Index, &RemoteOpKind::Download);
download::download_index_part(
self.conf,
&self.storage_impl,
@@ -393,22 +397,27 @@ impl RemoteTimelineClient {
layer_file_name: &LayerFileName,
layer_metadata: &LayerFileMetadata,
) -> anyhow::Result<u64> {
let downloaded_size = download::download_layer_file(
self.conf,
&self.storage_impl,
self.tenant_id,
self.timeline_id,
layer_file_name,
layer_metadata,
)
.measure_remote_op(
self.tenant_id,
self.timeline_id,
RemoteOpFileKind::Layer,
RemoteOpKind::Download,
Arc::clone(&self.metrics),
)
.await?;
let downloaded_size = {
let _unfinished_gauge_guard = self
.metrics
.call_begin(&RemoteOpFileKind::Layer, &RemoteOpKind::Download);
download::download_layer_file(
self.conf,
&self.storage_impl,
self.tenant_id,
self.timeline_id,
layer_file_name,
layer_metadata,
)
.measure_remote_op(
self.tenant_id,
self.timeline_id,
RemoteOpFileKind::Layer,
RemoteOpKind::Download,
Arc::clone(&self.metrics),
)
.await?
};
// Update the metadata for given layer file. The remote index file
// might be missing some information for the file; this allows us
@@ -517,7 +526,7 @@ impl RemoteTimelineClient {
metadata_bytes,
);
let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
self.update_upload_queue_unfinished_metric(1, &op);
self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op);
upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0;
@@ -549,7 +558,7 @@ impl RemoteTimelineClient {
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
let op = UploadOp::UploadLayer(layer_file_name.clone(), layer_metadata.clone());
self.update_upload_queue_unfinished_metric(1, &op);
self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op);
info!(
@@ -601,7 +610,7 @@ impl RemoteTimelineClient {
// schedule the actual deletions
for name in names {
let op = UploadOp::Delete(RemoteOpFileKind::Layer, name.clone());
self.update_upload_queue_unfinished_metric(1, &op);
self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op);
info!("scheduled layer file deletion {}", name.file_name());
}
@@ -753,7 +762,7 @@ impl RemoteTimelineClient {
// upload finishes or times out soon enough.
if task_mgr::is_shutdown_requested() {
info!("upload task cancelled by shutdown request");
self.update_upload_queue_unfinished_metric(-1, &task.op);
self.calls_unfinished_metric_end(&task.op);
self.stop();
return;
}
@@ -901,22 +910,40 @@ impl RemoteTimelineClient {
// Launch any queued tasks that were unblocked by this one.
self.launch_queued_tasks(upload_queue);
}
self.update_upload_queue_unfinished_metric(-1, &task.op);
self.calls_unfinished_metric_end(&task.op);
}
fn update_upload_queue_unfinished_metric(&self, delta: i64, op: &UploadOp) {
let (file_kind, op_kind) = match op {
fn calls_unfinished_metric_impl(
&self,
op: &UploadOp,
) -> Option<(RemoteOpFileKind, RemoteOpKind)> {
let res = match op {
UploadOp::UploadLayer(_, _) => (RemoteOpFileKind::Layer, RemoteOpKind::Upload),
UploadOp::UploadMetadata(_, _) => (RemoteOpFileKind::Index, RemoteOpKind::Upload),
UploadOp::Delete(file_kind, _) => (*file_kind, RemoteOpKind::Delete),
UploadOp::Barrier(_) => {
// we do not account these
return;
return None;
}
};
self.metrics
.unfinished_tasks(&file_kind, &op_kind)
.add(delta)
Some(res)
}
fn calls_unfinished_metric_begin(&self, op: &UploadOp) {
let (file_kind, op_kind) = match self.calls_unfinished_metric_impl(op) {
Some(x) => x,
None => return,
};
let guard = self.metrics.call_begin(&file_kind, &op_kind);
guard.will_decrement_manually(); // in unfinished_ops_metric_end()
}
fn calls_unfinished_metric_end(&self, op: &UploadOp) {
let (file_kind, op_kind) = match self.calls_unfinished_metric_impl(op) {
Some(x) => x,
None => return,
};
self.metrics.call_end(&file_kind, &op_kind);
}
fn stop(&self) {
@@ -967,7 +994,7 @@ impl RemoteTimelineClient {
// Tear down queued ops
for op in qi.queued_operations.into_iter() {
self.update_upload_queue_unfinished_metric(-1, &op);
self.calls_unfinished_metric_end(&op);
// Dropping UploadOp::Barrier() here will make wait_completion() return with an Err()
// which is exactly what we want to happen.
drop(op);

View File

@@ -8,10 +8,9 @@ use std::future::Future;
use std::path::Path;
use anyhow::{anyhow, Context};
use futures::stream::{FuturesUnordered, StreamExt};
use tokio::fs;
use tokio::io::AsyncWriteExt;
use tracing::{debug, error, info, info_span, warn, Instrument};
use tracing::{error, info, warn};
use crate::config::PageServerConf;
use crate::tenant::storage_layer::LayerFileName;
@@ -175,7 +174,7 @@ pub async fn list_remote_timelines<'a>(
storage: &'a GenericRemoteStorage,
conf: &'static PageServerConf,
tenant_id: TenantId,
) -> anyhow::Result<Vec<(TimelineId, IndexPart)>> {
) -> anyhow::Result<HashSet<TimelineId>> {
let tenant_path = conf.timelines_path(&tenant_id);
let tenant_storage_path = conf.remote_path(&tenant_path)?;
@@ -194,7 +193,6 @@ pub async fn list_remote_timelines<'a>(
}
let mut timeline_ids = HashSet::new();
let mut part_downloads = FuturesUnordered::new();
for timeline_remote_storage_key in timelines {
let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
@@ -205,35 +203,22 @@ pub async fn list_remote_timelines<'a>(
format!("failed to parse object name into timeline id '{object_name}'")
})?;
// list_prefixes returns all files with the prefix. If we haven't seen this timeline ID
// yet, launch a download task for it.
if !timeline_ids.contains(&timeline_id) {
timeline_ids.insert(timeline_id);
let storage_clone = storage.clone();
part_downloads.push(async move {
(
timeline_id,
download_index_part(conf, &storage_clone, tenant_id, timeline_id)
.instrument(info_span!("download_index_part", timeline=%timeline_id))
.await,
)
});
}
// list_prefixes is assumed to return unique names. Ensure this here.
// NB: it's safer to bail out than warn-log this because the pageserver
// needs to absolutely know about _all_ timelines that exist, so that
// GC knows all the branchpoints. If we skipped over a timeline instead,
// GC could delete a layer that's still needed by that timeline.
anyhow::ensure!(
!timeline_ids.contains(&timeline_id),
"list_prefixes contains duplicate timeline id {timeline_id}"
);
timeline_ids.insert(timeline_id);
}
// Wait for all the download tasks to complete.
let mut timeline_parts = Vec::new();
while let Some((timeline_id, part_upload_result)) = part_downloads.next().await {
let index_part = part_upload_result
.with_context(|| format!("Failed to fetch index part for timeline {timeline_id}"))?;
debug!("Successfully fetched index part for timeline {timeline_id}");
timeline_parts.push((timeline_id, index_part));
}
Ok(timeline_parts)
Ok(timeline_ids)
}
pub async fn download_index_part(
pub(super) async fn download_index_part(
conf: &'static PageServerConf,
storage: &GenericRemoteStorage,
tenant_id: TenantId,

View File

@@ -1655,8 +1655,7 @@ impl Timeline {
// For debugging purposes, collect the path of layers that we traversed
// through. It's included in the error message if we fail to find the key.
let mut traversal_path =
Vec::<(ValueReconstructResult, Lsn, Box<dyn TraversalLayerExt>)>::new();
let mut traversal_path = Vec::<TraversalPathItem>::new();
let cached_lsn = if let Some((cached_lsn, _)) = &reconstruct_state.img {
*cached_lsn
@@ -1721,82 +1720,132 @@ impl Timeline {
timeline_owned = ancestor;
timeline = &*timeline_owned;
prev_lsn = Lsn(u64::MAX);
continue;
continue 'outer;
}
let layers = timeline.layers.read().unwrap();
#[allow(clippy::never_loop)] // see comment at bottom of this loop
'_layer_map_search: loop {
let remote_layer = {
let layers = timeline.layers.read().unwrap();
// Check the open and frozen in-memory layers first, in order from newest
// to oldest.
if let Some(open_layer) = &layers.open_layer {
let start_lsn = open_layer.get_lsn_range().start;
if cont_lsn > start_lsn {
//info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display());
// Get all the data needed to reconstruct the page version from this layer.
// But if we have an older cached page image, no need to go past that.
let lsn_floor = max(cached_lsn + 1, start_lsn);
result = match open_layer.get_value_reconstruct_data(
key,
lsn_floor..cont_lsn,
reconstruct_state,
) {
Ok(result) => result,
Err(e) => return PageReconstructResult::from(e),
};
cont_lsn = lsn_floor;
traversal_path.push((result, cont_lsn, Box::new(open_layer.clone())));
continue;
}
}
for frozen_layer in layers.frozen_layers.iter().rev() {
let start_lsn = frozen_layer.get_lsn_range().start;
if cont_lsn > start_lsn {
//info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
let lsn_floor = max(cached_lsn + 1, start_lsn);
result = match frozen_layer.get_value_reconstruct_data(
key,
lsn_floor..cont_lsn,
reconstruct_state,
) {
Ok(result) => result,
Err(e) => return PageReconstructResult::from(e),
};
cont_lsn = lsn_floor;
traversal_path.push((result, cont_lsn, Box::new(frozen_layer.clone())));
continue 'outer;
}
}
// Check the open and frozen in-memory layers first, in order from newest
// to oldest.
if let Some(open_layer) = &layers.open_layer {
let start_lsn = open_layer.get_lsn_range().start;
if cont_lsn > start_lsn {
//info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display());
// Get all the data needed to reconstruct the page version from this layer.
// But if we have an older cached page image, no need to go past that.
let lsn_floor = max(cached_lsn + 1, start_lsn);
result = match open_layer.get_value_reconstruct_data(
key,
lsn_floor..cont_lsn,
reconstruct_state,
) {
Ok(result) => result,
Err(e) => return PageReconstructResult::from(e),
};
cont_lsn = lsn_floor;
traversal_path.push((
result,
cont_lsn,
Box::new({
let open_layer = Arc::clone(open_layer);
move || open_layer.traversal_id()
}),
));
continue 'outer;
}
}
for frozen_layer in layers.frozen_layers.iter().rev() {
let start_lsn = frozen_layer.get_lsn_range().start;
if cont_lsn > start_lsn {
//info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
let lsn_floor = max(cached_lsn + 1, start_lsn);
result = match frozen_layer.get_value_reconstruct_data(
key,
lsn_floor..cont_lsn,
reconstruct_state,
) {
Ok(result) => result,
Err(e) => return PageReconstructResult::from(e),
};
cont_lsn = lsn_floor;
traversal_path.push((
result,
cont_lsn,
Box::new({
let frozen_layer = Arc::clone(frozen_layer);
move || frozen_layer.traversal_id()
}),
));
continue 'outer;
}
}
if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
//info!("CHECKING for {} at {} on historic layer {}", key, cont_lsn, layer.filename().display());
// If it's a remote layer, the caller can do the download and retry.
if let Some(remote_layer) = super::storage_layer::downcast_remote_layer(&layer) {
info!("need remote layer {}", layer.traversal_id());
return PageReconstructResult::NeedsDownload(
Weak::clone(&timeline.myself),
Arc::downgrade(&remote_layer),
);
}
let lsn_floor = max(cached_lsn + 1, lsn_floor);
result = match layer.get_value_reconstruct_data(
key,
lsn_floor..cont_lsn,
reconstruct_state,
) {
Ok(result) => result,
Err(e) => return PageReconstructResult::from(e),
if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
// If it's a remote layer, download it and retry.
if let Some(remote_layer) =
super::storage_layer::downcast_remote_layer(&layer)
{
// TODO: push a breadcrumb to 'traversal_path' to record the fact that
// we downloaded / would need to download this layer.
remote_layer // download happens outside the scope of `layers` guard object
} else {
// Get all the data needed to reconstruct the page version from this layer.
// But if we have an older cached page image, no need to go past that.
let lsn_floor = max(cached_lsn + 1, lsn_floor);
result = match layer.get_value_reconstruct_data(
key,
lsn_floor..cont_lsn,
reconstruct_state,
) {
Ok(result) => result,
Err(e) => return PageReconstructResult::from(e),
};
cont_lsn = lsn_floor;
traversal_path.push((
result,
cont_lsn,
Box::new({
let layer = Arc::clone(&layer);
move || layer.traversal_id()
}),
));
continue 'outer;
}
} else if timeline.ancestor_timeline.is_some() {
// Nothing on this timeline. Traverse to parent
result = ValueReconstructResult::Continue;
cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1);
continue 'outer;
} else {
// Nothing found
result = ValueReconstructResult::Missing;
continue 'outer;
}
};
cont_lsn = lsn_floor;
traversal_path.push((result, cont_lsn, Box::new(layer.clone())));
} else if timeline.ancestor_timeline.is_some() {
// Nothing on this timeline. Traverse to parent
result = ValueReconstructResult::Continue;
cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1);
} else {
// Nothing found
result = ValueReconstructResult::Missing;
// Indicate to the caller that we need remote_layer replaced with a downloaded
// layer in the layer map. The control flow could be a lot simpler, but the point
// of this commit is to prepare this function to
// 1. become async
// 2. do the download right here, using
// ```
// download_remote_layer().await?;
// continue 'layer_map_search;
// ```
// For (2), current rustc requires that the layers lock guard is not in scope.
// Hence, the complicated control flow.
let remote_layer_as_persistent: Arc<dyn PersistentLayer> =
Arc::clone(&remote_layer) as Arc<dyn PersistentLayer>;
info!(
"need remote layer {}",
remote_layer_as_persistent.traversal_id()
);
return PageReconstructResult::NeedsDownload(
Weak::clone(&timeline.myself),
Arc::downgrade(&remote_layer),
);
}
}
}
@@ -3376,22 +3425,25 @@ where
}
}
type TraversalPathItem = (
ValueReconstructResult,
Lsn,
Box<dyn FnOnce() -> TraversalId>,
);
/// Helper function for get_reconstruct_data() to add the path of layers traversed
/// to an error, as anyhow context information.
fn layer_traversal_error(
msg: String,
path: Vec<(ValueReconstructResult, Lsn, Box<dyn TraversalLayerExt>)>,
) -> PageReconstructResult<()> {
fn layer_traversal_error(msg: String, path: Vec<TraversalPathItem>) -> PageReconstructResult<()> {
// We want the original 'msg' to be the outermost context. The outermost context
// is the most high-level information, which also gets propagated to the client.
let mut msg_iter = path
.iter()
.into_iter()
.map(|(r, c, l)| {
format!(
"layer traversal: result {:?}, cont_lsn {}, layer: {}",
r,
c,
l.traversal_id(),
l(),
)
})
.chain(std::iter::once(msg));

View File

@@ -21,7 +21,6 @@
//! redo Postgres process, but some records it can handle directly with
//! bespoken Rust code.
use anyhow::Context;
use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
@@ -31,12 +30,10 @@ use bytes::{Buf, Bytes, BytesMut};
use tracing::*;
use crate::pgdatadir_mapping::*;
use crate::tenant::PageReconstructResult;
use crate::tenant::Timeline;
use crate::try_page_reconstruct_result as try_prr;
use crate::tenant::{with_ondemand_download, PageReconstructError};
use crate::walrecord::*;
use crate::ZERO_PAGE;
use crate::{try_no_ondemand_download, try_page_reconstruct_result};
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::{FSM_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
@@ -55,16 +52,15 @@ pub struct WalIngest<'a> {
}
impl<'a> WalIngest<'a> {
pub fn new(timeline: &Timeline, startpoint: Lsn) -> PageReconstructResult<WalIngest> {
pub async fn new(timeline: &Timeline, startpoint: Lsn) -> anyhow::Result<WalIngest> {
// Fetch the latest checkpoint into memory, so that we can compare with it
// quickly in `ingest_record` and update it when it changes.
let checkpoint_bytes = try_no_ondemand_download!(timeline.get_checkpoint(startpoint));
let checkpoint = try_page_reconstruct_result!(
CheckPoint::decode(&checkpoint_bytes).context("Failed to decode checkpoint bytes")
);
let checkpoint_bytes =
with_ondemand_download(|| timeline.get_checkpoint(startpoint)).await?;
let checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);
PageReconstructResult::Success(WalIngest {
Ok(WalIngest {
timeline,
checkpoint,
checkpoint_modified: false,
@@ -79,18 +75,15 @@ impl<'a> WalIngest<'a> {
/// Helper function to parse a WAL record and call the Timeline's PUT functions for all the
/// relations/pages that the record affects.
///
pub fn ingest_record(
pub async fn ingest_record(
&mut self,
recdata: Bytes,
lsn: Lsn,
modification: &mut DatadirModification,
modification: &mut DatadirModification<'_>,
decoded: &mut DecodedWALRecord,
) -> PageReconstructResult<()> {
) -> anyhow::Result<()> {
modification.lsn = lsn;
try_prr!(
decode_wal_record(recdata, decoded, self.timeline.pg_version)
.context("failed decoding wal record")
);
decode_wal_record(recdata, decoded, self.timeline.pg_version)?;
let mut buf = decoded.record.clone();
buf.advance(decoded.main_data_offset);
@@ -105,7 +98,8 @@ impl<'a> WalIngest<'a> {
if decoded.xl_rmid == pg_constants::RM_HEAP_ID
|| decoded.xl_rmid == pg_constants::RM_HEAP2_ID
{
try_prr!(self.ingest_heapam_record(&mut buf, modification, decoded));
self.ingest_heapam_record(&mut buf, modification, decoded)
.await?;
}
// Handle other special record types
if decoded.xl_rmid == pg_constants::RM_SMGR_ID
@@ -113,13 +107,14 @@ impl<'a> WalIngest<'a> {
== pg_constants::XLOG_SMGR_CREATE
{
let create = XlSmgrCreate::decode(&mut buf);
try_prr!(self.ingest_xlog_smgr_create(modification, &create));
self.ingest_xlog_smgr_create(modification, &create)?;
} else if decoded.xl_rmid == pg_constants::RM_SMGR_ID
&& (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
== pg_constants::XLOG_SMGR_TRUNCATE
{
let truncate = XlSmgrTruncate::decode(&mut buf);
try_prr!(self.ingest_xlog_smgr_truncate(modification, &truncate));
self.ingest_xlog_smgr_truncate(modification, &truncate)
.await?;
} else if decoded.xl_rmid == pg_constants::RM_DBASE_ID {
debug!(
"handle RM_DBASE_ID for Postgres version {:?}",
@@ -132,14 +127,15 @@ impl<'a> WalIngest<'a> {
let createdb = XlCreateDatabase::decode(&mut buf);
debug!("XLOG_DBASE_CREATE v14");
try_prr!(self.ingest_xlog_dbase_create(modification, &createdb));
self.ingest_xlog_dbase_create(modification, &createdb)
.await?;
} else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
== postgres_ffi::v14::bindings::XLOG_DBASE_DROP
{
let dropdb = XlDropDatabase::decode(&mut buf);
for tablespace_id in dropdb.tablespace_ids {
trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
try_prr!(modification.drop_dbdir(tablespace_id, dropdb.db_id));
modification.drop_dbdir(tablespace_id, dropdb.db_id)?;
}
}
} else if self.timeline.pg_version == 15 {
@@ -155,14 +151,15 @@ impl<'a> WalIngest<'a> {
// So we can reuse XlCreateDatabase here.
debug!("XLOG_DBASE_CREATE_FILE_COPY");
let createdb = XlCreateDatabase::decode(&mut buf);
try_prr!(self.ingest_xlog_dbase_create(modification, &createdb));
self.ingest_xlog_dbase_create(modification, &createdb)
.await?;
} else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
== postgres_ffi::v15::bindings::XLOG_DBASE_DROP
{
let dropdb = XlDropDatabase::decode(&mut buf);
for tablespace_id in dropdb.tablespace_ids {
trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
try_prr!(modification.drop_dbdir(tablespace_id, dropdb.db_id));
modification.drop_dbdir(tablespace_id, dropdb.db_id)?;
}
}
}
@@ -174,38 +171,42 @@ impl<'a> WalIngest<'a> {
let pageno = buf.get_u32_le();
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
try_prr!(self.put_slru_page_image(
self.put_slru_page_image(
modification,
SlruKind::Clog,
segno,
rpageno,
ZERO_PAGE.clone(),
));
)
.await?;
} else {
assert!(info == pg_constants::CLOG_TRUNCATE);
let xlrec = XlClogTruncate::decode(&mut buf);
try_prr!(self.ingest_clog_truncate_record(modification, &xlrec));
self.ingest_clog_truncate_record(modification, &xlrec)
.await?;
}
} else if decoded.xl_rmid == pg_constants::RM_XACT_ID {
let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK;
if info == pg_constants::XLOG_XACT_COMMIT || info == pg_constants::XLOG_XACT_ABORT {
let parsed_xact =
XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info);
try_prr!(self.ingest_xact_record(
self.ingest_xact_record(
modification,
&parsed_xact,
info == pg_constants::XLOG_XACT_COMMIT,
));
)
.await?;
} else if info == pg_constants::XLOG_XACT_COMMIT_PREPARED
|| info == pg_constants::XLOG_XACT_ABORT_PREPARED
{
let parsed_xact =
XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info);
try_prr!(self.ingest_xact_record(
self.ingest_xact_record(
modification,
&parsed_xact,
info == pg_constants::XLOG_XACT_COMMIT_PREPARED,
));
)
.await?;
// Remove twophase file. see RemoveTwoPhaseFile() in postgres code
trace!(
"Drop twophaseFile for xid {} parsed_xact.xid {} here at {}",
@@ -213,10 +214,9 @@ impl<'a> WalIngest<'a> {
parsed_xact.xid,
lsn,
);
try_prr!(modification.drop_twophase_file(parsed_xact.xid));
modification.drop_twophase_file(parsed_xact.xid)?;
} else if info == pg_constants::XLOG_XACT_PREPARE {
try_prr!(modification
.put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..])));
modification.put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]))?;
}
} else if decoded.xl_rmid == pg_constants::RM_MULTIXACT_ID {
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
@@ -225,34 +225,36 @@ impl<'a> WalIngest<'a> {
let pageno = buf.get_u32_le();
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
try_prr!(self.put_slru_page_image(
self.put_slru_page_image(
modification,
SlruKind::MultiXactOffsets,
segno,
rpageno,
ZERO_PAGE.clone(),
));
)
.await?;
} else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE {
let pageno = buf.get_u32_le();
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
try_prr!(self.put_slru_page_image(
self.put_slru_page_image(
modification,
SlruKind::MultiXactMembers,
segno,
rpageno,
ZERO_PAGE.clone(),
));
)
.await?;
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
let xlrec = XlMultiXactCreate::decode(&mut buf);
try_prr!(self.ingest_multixact_create_record(modification, &xlrec));
self.ingest_multixact_create_record(modification, &xlrec)?;
} else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
let xlrec = XlMultiXactTruncate::decode(&mut buf);
try_prr!(self.ingest_multixact_truncate_record(modification, &xlrec));
self.ingest_multixact_truncate_record(modification, &xlrec)?;
}
} else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID {
let xlrec = XlRelmapUpdate::decode(&mut buf);
try_prr!(self.ingest_relmap_page(modification, &xlrec, decoded));
self.ingest_relmap_page(modification, &xlrec, decoded)?;
} else if decoded.xl_rmid == pg_constants::RM_XLOG_ID {
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_NEXTOID {
@@ -266,9 +268,7 @@ impl<'a> WalIngest<'a> {
{
let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT];
buf.copy_to_slice(&mut checkpoint_bytes);
let xlog_checkpoint = try_prr!(
CheckPoint::decode(&checkpoint_bytes).context("deserialize CheckPoint")
);
let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
trace!(
"xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}",
xlog_checkpoint.oldestXid,
@@ -289,32 +289,32 @@ impl<'a> WalIngest<'a> {
// Iterate through all the blocks that the record modifies, and
// "put" a separate copy of the record for each block.
for blk in decoded.blocks.iter() {
try_no_ondemand_download!(self.ingest_decoded_block(modification, lsn, decoded, blk));
self.ingest_decoded_block(modification, lsn, decoded, blk)
.await?;
}
// If checkpoint data was updated, store the new version in the repository
if self.checkpoint_modified {
let new_checkpoint_bytes =
try_prr!(self.checkpoint.encode().context("encode checkpoint"));
let new_checkpoint_bytes = self.checkpoint.encode()?;
try_prr!(modification.put_checkpoint(new_checkpoint_bytes));
modification.put_checkpoint(new_checkpoint_bytes)?;
self.checkpoint_modified = false;
}
// Now that this record has been fully handled, including updating the
// checkpoint data, let the repository know that it is up-to-date to this LSN
try_prr!(modification.commit());
modification.commit()?;
PageReconstructResult::Success(())
Ok(())
}
fn ingest_decoded_block(
async fn ingest_decoded_block(
&mut self,
modification: &mut DatadirModification,
modification: &mut DatadirModification<'_>,
lsn: Lsn,
decoded: &DecodedWALRecord,
blk: &DecodedBkpBlock,
) -> PageReconstructResult<()> {
) -> Result<(), PageReconstructError> {
let rel = RelTag {
spcnode: blk.rnode_spcnode,
dbnode: blk.rnode_dbnode,
@@ -334,7 +334,7 @@ impl<'a> WalIngest<'a> {
&& (decoded.xl_info == pg_constants::XLOG_FPI
|| decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
// compression of WAL is not yet supported: fall back to storing the original WAL record
&& !try_prr!(postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version))
&& !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)?
{
// Extract page image from FPI record
let img_len = blk.bimg_len as usize;
@@ -356,28 +356,25 @@ impl<'a> WalIngest<'a> {
page_set_lsn(&mut image, lsn)
}
assert_eq!(image.len(), BLCKSZ as usize);
try_no_ondemand_download!(self.put_rel_page_image(
modification,
rel,
blk.blkno,
image.freeze()
));
self.put_rel_page_image(modification, rel, blk.blkno, image.freeze())
.await?;
} else {
let rec = NeonWalRecord::Postgres {
will_init: blk.will_init || blk.apply_image,
rec: decoded.record.clone(),
};
try_prr!(self.put_rel_wal_record(modification, rel, blk.blkno, rec));
self.put_rel_wal_record(modification, rel, blk.blkno, rec)
.await?;
}
PageReconstructResult::Success(())
Ok(())
}
fn ingest_heapam_record(
async fn ingest_heapam_record(
&mut self,
buf: &mut Bytes,
modification: &mut DatadirModification,
modification: &mut DatadirModification<'_>,
decoded: &mut DecodedWALRecord,
) -> Result<()> {
) -> anyhow::Result<()> {
// Handle VM bit updates that are implicitly part of heap records.
// First, look at the record to determine which VM bits need
@@ -456,7 +453,7 @@ impl<'a> WalIngest<'a> {
// replaying it would fail to find the previous image of the page, because
// it doesn't exist. So check if the VM page(s) exist, and skip the WAL
// record if it doesn't.
let vm_size = self.get_relsize(vm_rel, modification.lsn)?;
let vm_size = self.get_relsize(vm_rel, modification.lsn).await?;
if let Some(blknum) = new_vm_blk {
if blknum >= vm_size {
new_vm_blk = None;
@@ -481,7 +478,8 @@ impl<'a> WalIngest<'a> {
old_heap_blkno,
flags: pg_constants::VISIBILITYMAP_VALID_BITS,
},
)?;
)
.await?;
} else {
// Clear VM bits for one heap page, or for two pages that reside on
// different VM pages.
@@ -495,7 +493,8 @@ impl<'a> WalIngest<'a> {
old_heap_blkno: None,
flags: pg_constants::VISIBILITYMAP_VALID_BITS,
},
)?;
)
.await?;
}
if let Some(old_vm_blk) = old_vm_blk {
self.put_rel_wal_record(
@@ -507,7 +506,8 @@ impl<'a> WalIngest<'a> {
old_heap_blkno,
flags: pg_constants::VISIBILITYMAP_VALID_BITS,
},
)?;
)
.await?;
}
}
}
@@ -517,9 +517,9 @@ impl<'a> WalIngest<'a> {
}
/// Subroutine of ingest_record(), to handle an XLOG_DBASE_CREATE record.
fn ingest_xlog_dbase_create(
async fn ingest_xlog_dbase_create(
&mut self,
modification: &mut DatadirModification,
modification: &mut DatadirModification<'_>,
rec: &XlCreateDatabase,
) -> anyhow::Result<()> {
let db_id = rec.db_id;
@@ -534,18 +534,22 @@ impl<'a> WalIngest<'a> {
// get calls instead.
let req_lsn = modification.tline.get_last_record_lsn();
let rels = modification
.tline
.list_rels(src_tablespace_id, src_db_id, req_lsn)
.no_ondemand_download()?;
let rels = with_ondemand_download(|| {
modification
.tline
.list_rels(src_tablespace_id, src_db_id, req_lsn)
})
.await?;
debug!("ingest_xlog_dbase_create: {} rels", rels.len());
// Copy relfilemap
let filemap = modification
.tline
.get_relmap_file(src_tablespace_id, src_db_id, req_lsn)
.no_ondemand_download()?;
let filemap = with_ondemand_download(|| {
modification
.tline
.get_relmap_file(src_tablespace_id, src_db_id, req_lsn)
})
.await?;
modification.put_relmap_file(tablespace_id, db_id, filemap)?;
let mut num_rels_copied = 0;
@@ -554,10 +558,9 @@ impl<'a> WalIngest<'a> {
assert_eq!(src_rel.spcnode, src_tablespace_id);
assert_eq!(src_rel.dbnode, src_db_id);
let nblocks = modification
.tline
.get_rel_size(src_rel, req_lsn, true)
.no_ondemand_download()?;
let nblocks =
with_ondemand_download(|| modification.tline.get_rel_size(src_rel, req_lsn, true))
.await?;
let dst_rel = RelTag {
spcnode: tablespace_id,
dbnode: db_id,
@@ -572,10 +575,12 @@ impl<'a> WalIngest<'a> {
for blknum in 0..nblocks {
debug!("copying block {} from {} to {}", blknum, src_rel, dst_rel);
let content = modification
.tline
.get_rel_page_at_lsn(src_rel, blknum, req_lsn, true)
.no_ondemand_download()?;
let content = with_ondemand_download(|| {
modification
.tline
.get_rel_page_at_lsn(src_rel, blknum, req_lsn, true)
})
.await?;
modification.put_rel_page_image(dst_rel, blknum, content)?;
num_blocks_copied += 1;
}
@@ -594,7 +599,7 @@ impl<'a> WalIngest<'a> {
&mut self,
modification: &mut DatadirModification,
rec: &XlSmgrCreate,
) -> Result<()> {
) -> anyhow::Result<()> {
let rel = RelTag {
spcnode: rec.rnode.spcnode,
dbnode: rec.rnode.dbnode,
@@ -608,11 +613,11 @@ impl<'a> WalIngest<'a> {
/// Subroutine of ingest_record(), to handle an XLOG_SMGR_TRUNCATE record.
///
/// This is the same logic as in PostgreSQL's smgr_redo() function.
fn ingest_xlog_smgr_truncate(
async fn ingest_xlog_smgr_truncate(
&mut self,
modification: &mut DatadirModification,
modification: &mut DatadirModification<'_>,
rec: &XlSmgrTruncate,
) -> Result<()> {
) -> anyhow::Result<()> {
let spcnode = rec.rnode.spcnode;
let dbnode = rec.rnode.dbnode;
let relnode = rec.rnode.relnode;
@@ -642,7 +647,7 @@ impl<'a> WalIngest<'a> {
modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?;
fsm_physical_page_no += 1;
}
let nblocks = self.get_relsize(rel, modification.lsn)?;
let nblocks = self.get_relsize(rel, modification.lsn).await?;
if nblocks > fsm_physical_page_no {
// check if something to do: FSM is larger than truncate position
self.put_rel_truncation(modification, rel, fsm_physical_page_no)?;
@@ -663,7 +668,7 @@ impl<'a> WalIngest<'a> {
modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?;
vm_page_no += 1;
}
let nblocks = self.get_relsize(rel, modification.lsn)?;
let nblocks = self.get_relsize(rel, modification.lsn).await?;
if nblocks > vm_page_no {
// check if something to do: VM is larger than truncate position
self.put_rel_truncation(modification, rel, vm_page_no)?;
@@ -674,9 +679,9 @@ impl<'a> WalIngest<'a> {
/// Subroutine of ingest_record(), to handle an XLOG_XACT_* records.
///
fn ingest_xact_record(
async fn ingest_xact_record(
&mut self,
modification: &mut DatadirModification,
modification: &mut DatadirModification<'_>,
parsed: &XlXactParsedRecord,
is_commit: bool,
) -> anyhow::Result<()> {
@@ -735,10 +740,8 @@ impl<'a> WalIngest<'a> {
relnode: xnode.relnode,
};
let last_lsn = self.timeline.get_last_record_lsn();
if modification
.tline
.get_rel_exists(rel, last_lsn, true)
.no_ondemand_download()?
if with_ondemand_download(|| modification.tline.get_rel_exists(rel, last_lsn, true))
.await?
{
self.put_rel_drop(modification, rel)?;
}
@@ -747,9 +750,9 @@ impl<'a> WalIngest<'a> {
Ok(())
}
fn ingest_clog_truncate_record(
async fn ingest_clog_truncate_record(
&mut self,
modification: &mut DatadirModification,
modification: &mut DatadirModification<'_>,
xlrec: &XlClogTruncate,
) -> anyhow::Result<()> {
info!(
@@ -791,11 +794,14 @@ impl<'a> WalIngest<'a> {
// it. So we use the previous record's LSN in the get calls
// instead.
let req_lsn = modification.tline.get_last_record_lsn();
for segno in modification
.tline
.list_slru_segments(SlruKind::Clog, req_lsn)
.no_ondemand_download()?
{
let slru_segments = with_ondemand_download(|| {
modification
.tline
.list_slru_segments(SlruKind::Clog, req_lsn)
})
.await?;
for segno in slru_segments {
let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
if slru_may_delete_clogsegment(segpage, xlrec.pageno) {
modification.drop_slru_segment(SlruKind::Clog, segno)?;
@@ -944,27 +950,26 @@ impl<'a> WalIngest<'a> {
Ok(())
}
fn put_rel_page_image(
async fn put_rel_page_image(
&mut self,
modification: &mut DatadirModification,
modification: &mut DatadirModification<'_>,
rel: RelTag,
blknum: BlockNumber,
img: Bytes,
) -> PageReconstructResult<()> {
try_no_ondemand_download!(self.handle_rel_extend(modification, rel, blknum));
try_prr!(modification.put_rel_page_image(rel, blknum, img));
PageReconstructResult::Success(())
) -> anyhow::Result<()> {
self.handle_rel_extend(modification, rel, blknum).await?;
modification.put_rel_page_image(rel, blknum, img)?;
Ok(())
}
fn put_rel_wal_record(
async fn put_rel_wal_record(
&mut self,
modification: &mut DatadirModification,
modification: &mut DatadirModification<'_>,
rel: RelTag,
blknum: BlockNumber,
rec: NeonWalRecord,
) -> Result<()> {
self.handle_rel_extend(modification, rel, blknum)
.no_ondemand_download()?;
) -> anyhow::Result<()> {
self.handle_rel_extend(modification, rel, blknum).await?;
modification.put_rel_wal_record(rel, blknum, rec)?;
Ok(())
}
@@ -984,69 +989,67 @@ impl<'a> WalIngest<'a> {
Ok(())
}
fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> anyhow::Result<BlockNumber> {
let nblocks = if !self
.timeline
.get_rel_exists(rel, lsn, true)
.no_ondemand_download()?
{
async fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> anyhow::Result<BlockNumber> {
let exists =
with_ondemand_download(|| self.timeline.get_rel_exists(rel, lsn, true)).await?;
let nblocks = if !exists {
0
} else {
self.timeline
.get_rel_size(rel, lsn, true)
.no_ondemand_download()?
with_ondemand_download(|| self.timeline.get_rel_size(rel, lsn, true)).await?
};
Ok(nblocks)
}
fn handle_rel_extend(
async fn handle_rel_extend(
&mut self,
modification: &mut DatadirModification,
modification: &mut DatadirModification<'_>,
rel: RelTag,
blknum: BlockNumber,
) -> PageReconstructResult<()> {
) -> anyhow::Result<()> {
let new_nblocks = blknum + 1;
// Check if the relation exists. We implicitly create relations on first
// record.
// TODO: would be nice if to be more explicit about it
let last_lsn = modification.lsn;
let old_nblocks =
if !try_no_ondemand_download!(self.timeline.get_rel_exists(rel, last_lsn, true)) {
if !with_ondemand_download(|| self.timeline.get_rel_exists(rel, last_lsn, true)).await?
{
// create it with 0 size initially, the logic below will extend it
try_prr!(modification.put_rel_creation(rel, 0));
modification.put_rel_creation(rel, 0)?;
0
} else {
try_no_ondemand_download!(self.timeline.get_rel_size(rel, last_lsn, true))
with_ondemand_download(|| self.timeline.get_rel_size(rel, last_lsn, true)).await?
};
if new_nblocks > old_nblocks {
//info!("extending {} {} to {}", rel, old_nblocks, new_nblocks);
try_prr!(modification.put_rel_extend(rel, new_nblocks));
modification.put_rel_extend(rel, new_nblocks)?;
// fill the gap with zeros
for gap_blknum in old_nblocks..blknum {
try_prr!(modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone()));
modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?;
}
}
PageReconstructResult::Success(())
Ok(())
}
fn put_slru_page_image(
async fn put_slru_page_image(
&mut self,
modification: &mut DatadirModification,
modification: &mut DatadirModification<'_>,
kind: SlruKind,
segno: u32,
blknum: BlockNumber,
img: Bytes,
) -> Result<()> {
self.handle_slru_extend(modification, kind, segno, blknum)?;
) -> anyhow::Result<()> {
self.handle_slru_extend(modification, kind, segno, blknum)
.await?;
modification.put_slru_page_image(kind, segno, blknum, img)?;
Ok(())
}
fn handle_slru_extend(
async fn handle_slru_extend(
&mut self,
modification: &mut DatadirModification,
modification: &mut DatadirModification<'_>,
kind: SlruKind,
segno: u32,
blknum: BlockNumber,
@@ -1060,18 +1063,17 @@ impl<'a> WalIngest<'a> {
// record.
// TODO: would be nice if to be more explicit about it
let last_lsn = self.timeline.get_last_record_lsn();
let old_nblocks = if !self
.timeline
.get_slru_segment_exists(kind, segno, last_lsn)
.no_ondemand_download()?
let old_nblocks = if !with_ondemand_download(|| {
self.timeline.get_slru_segment_exists(kind, segno, last_lsn)
})
.await?
{
// create it with 0 size initially, the logic below will extend it
modification.put_slru_segment_creation(kind, segno, 0)?;
0
} else {
self.timeline
.get_slru_segment_size(kind, segno, last_lsn)
.no_ondemand_download()?
with_ondemand_download(|| self.timeline.get_slru_segment_size(kind, segno, last_lsn))
.await?
};
if new_nblocks > old_nblocks {
@@ -1119,12 +1121,12 @@ mod tests {
static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);
fn init_walingest_test(tline: &Timeline) -> Result<WalIngest> {
async fn init_walingest_test(tline: &Timeline) -> Result<WalIngest> {
let mut m = tline.begin_modification(Lsn(0x10));
m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
m.put_relmap_file(0, 111, Bytes::from(""))?; // dummy relmapper file
m.commit()?;
let walingest = WalIngest::new(tline, Lsn(0x10)).no_ondemand_download()?;
let walingest = WalIngest::new(tline, Lsn(0x10)).await?;
Ok(walingest)
}
@@ -1133,28 +1135,28 @@ mod tests {
async fn test_relsize() -> Result<()> {
let tenant = TenantHarness::create("test_relsize")?.load().await;
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
let mut walingest = init_walingest_test(&tline)?;
let mut walingest = init_walingest_test(&tline).await?;
let mut m = tline.begin_modification(Lsn(0x20));
walingest.put_rel_creation(&mut m, TESTREL_A)?;
walingest
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))
.no_ondemand_download()?;
.await?;
m.commit()?;
let mut m = tline.begin_modification(Lsn(0x30));
walingest
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"))
.no_ondemand_download()?;
.await?;
m.commit()?;
let mut m = tline.begin_modification(Lsn(0x40));
walingest
.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"))
.no_ondemand_download()?;
.await?;
m.commit()?;
let mut m = tline.begin_modification(Lsn(0x50));
walingest
.put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"))
.no_ondemand_download()?;
.await?;
m.commit()?;
assert_current_logical_size(&tline, Lsn(0x50));
@@ -1292,7 +1294,7 @@ mod tests {
let mut m = tline.begin_modification(Lsn(0x70));
walingest
.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"))
.no_ondemand_download()?;
.await?;
m.commit()?;
assert_eq!(
tline
@@ -1317,7 +1319,7 @@ mod tests {
let mut m = tline.begin_modification(Lsn(0x80));
walingest
.put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"))
.no_ondemand_download()?;
.await?;
m.commit()?;
assert_eq!(
tline
@@ -1349,12 +1351,12 @@ mod tests {
async fn test_drop_extend() -> Result<()> {
let tenant = TenantHarness::create("test_drop_extend")?.load().await;
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
let mut walingest = init_walingest_test(&tline)?;
let mut walingest = init_walingest_test(&tline).await?;
let mut m = tline.begin_modification(Lsn(0x20));
walingest
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))
.no_ondemand_download()?;
.await?;
m.commit()?;
// Check that rel exists and size is correct
@@ -1391,7 +1393,7 @@ mod tests {
let mut m = tline.begin_modification(Lsn(0x40));
walingest
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"))
.no_ondemand_download()?;
.await?;
m.commit()?;
// Check that rel exists and size is correct
@@ -1418,7 +1420,7 @@ mod tests {
async fn test_truncate_extend() -> Result<()> {
let tenant = TenantHarness::create("test_truncate_extend")?.load().await;
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
let mut walingest = init_walingest_test(&tline)?;
let mut walingest = init_walingest_test(&tline).await?;
// Create a 20 MB relation (the size is arbitrary)
let relsize = 20 * 1024 * 1024 / 8192;
@@ -1427,7 +1429,7 @@ mod tests {
let data = format!("foo blk {} at {}", blkno, Lsn(0x20));
walingest
.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))
.no_ondemand_download()?;
.await?;
}
m.commit()?;
@@ -1519,7 +1521,7 @@ mod tests {
let data = format!("foo blk {} at {}", blkno, lsn);
walingest
.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))
.no_ondemand_download()?;
.await?;
}
m.commit()?;
@@ -1556,7 +1558,7 @@ mod tests {
async fn test_large_rel() -> Result<()> {
let tenant = TenantHarness::create("test_large_rel")?.load().await;
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
let mut walingest = init_walingest_test(&tline)?;
let mut walingest = init_walingest_test(&tline).await?;
let mut lsn = 0x10;
for blknum in 0..RELSEG_SIZE + 1 {
@@ -1565,7 +1567,7 @@ mod tests {
let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
walingest
.put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img)
.no_ondemand_download()?;
.await?;
m.commit()?;
}

View File

@@ -1,6 +1,7 @@
//! Actual Postgres connection handler to stream WAL to the server.
use std::{
error::Error,
str::FromStr,
sync::Arc,
time::{Duration, SystemTime},
@@ -11,7 +12,7 @@ use bytes::BytesMut;
use chrono::{NaiveDateTime, Utc};
use fail::fail_point;
use futures::StreamExt;
use postgres::{SimpleQueryMessage, SimpleQueryRow};
use postgres::{error::SqlState, SimpleQueryMessage, SimpleQueryRow};
use postgres_ffi::v14::xlog_utils::normalize_lsn;
use postgres_ffi::WAL_SEGMENT_SIZE;
use postgres_protocol::message::backend::ReplicationMessage;
@@ -20,9 +21,7 @@ use tokio::{pin, select, sync::watch, time};
use tokio_postgres::{replication::ReplicationStream, Client};
use tracing::{debug, error, info, trace, warn};
use crate::{
metrics::LIVE_CONNECTIONS_COUNT, tenant::with_ondemand_download, walreceiver::TaskStateUpdate,
};
use crate::{metrics::LIVE_CONNECTIONS_COUNT, walreceiver::TaskStateUpdate};
use crate::{
task_mgr,
task_mgr::TaskKind,
@@ -34,7 +33,7 @@ use crate::{
use postgres_connection::PgConnectionConfig;
use postgres_ffi::waldecoder::WalStreamDecoder;
use pq_proto::ReplicationFeedback;
use utils::lsn::Lsn;
use utils::{lsn::Lsn, postgres_backend_async::is_expected_io_error};
/// Status of the connection.
#[derive(Debug, Clone, Copy)]
@@ -70,10 +69,17 @@ pub async fn handle_walreceiver_connection(
let mut config = wal_source_connconf.to_tokio_postgres_config();
config.application_name("pageserver");
config.replication_mode(tokio_postgres::config::ReplicationMode::Physical);
time::timeout(connect_timeout, config.connect(postgres::NoTls))
.await
.context("Timed out while waiting for walreceiver connection to open")?
.context("Failed to open walreceiver connection")?
match time::timeout(connect_timeout, config.connect(postgres::NoTls)).await {
Ok(Ok(client_and_conn)) => client_and_conn,
Ok(Err(conn_err)) => {
let expected_error = ignore_expected_errors(conn_err)?;
info!("DB connection stream finished: {expected_error}");
return Ok(());
}
Err(elapsed) => anyhow::bail!(
"Timed out while waiting {elapsed} for walreceiver connection to open"
),
}
};
info!("connected!");
@@ -105,10 +111,8 @@ pub async fn handle_walreceiver_connection(
connection_result = connection => match connection_result{
Ok(()) => info!("Walreceiver db connection closed"),
Err(connection_error) => {
if connection_error.is_closed() {
info!("Connection closed regularly: {connection_error}")
} else {
warn!("Connection aborted: {connection_error}")
if let Err(e) = ignore_expected_errors(connection_error) {
warn!("Connection aborted: {e:#}")
}
}
},
@@ -175,8 +179,7 @@ pub async fn handle_walreceiver_connection(
let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version);
let mut walingest =
with_ondemand_download(|| WalIngest::new(timeline.as_ref(), startpoint)).await?;
let mut walingest = WalIngest::new(timeline.as_ref(), startpoint).await?;
while let Some(replication_message) = {
select! {
@@ -190,14 +193,9 @@ pub async fn handle_walreceiver_connection(
let replication_message = match replication_message {
Ok(message) => message,
Err(replication_error) => {
if replication_error.is_closed() {
info!("Replication stream got closed");
return Ok(());
} else {
return Err(
anyhow::Error::new(replication_error).context("replication stream error")
);
}
let expected_error = ignore_expected_errors(replication_error)?;
info!("Replication stream finished: {expected_error}");
return Ok(());
}
};
@@ -251,16 +249,10 @@ pub async fn handle_walreceiver_connection(
// at risk of hitting a deadlock.
ensure!(lsn.is_aligned());
with_ondemand_download(|| {
walingest.ingest_record(
recdata.clone(),
lsn,
&mut modification,
&mut decoded,
)
})
.await
.with_context(|| format!("could not ingest record at {lsn}"))?;
walingest
.ingest_record(recdata.clone(), lsn, &mut modification, &mut decoded)
.await
.with_context(|| format!("could not ingest record at {lsn}"))?;
fail_point!("walreceiver-after-ingest");
@@ -409,3 +401,32 @@ async fn identify_system(client: &mut Client) -> anyhow::Result<IdentifySystem>
Err(IdentifyError.into())
}
}
/// We don't want to report connectivity problems as real errors towards connection manager because
/// 1. they happen frequently enough to make server logs hard to read and
/// 2. the connection manager can retry other safekeeper.
///
/// If this function returns `Ok(pg_error)`, it's such an error.
/// The caller should log it at info level and then report to connection manager that we're done handling this connection.
/// Connection manager will then handle reconnections.
///
/// If this function returns an `Err()`, the caller can bubble it up using `?`.
/// The connection manager will log the error at ERROR level.
fn ignore_expected_errors(pg_error: postgres::Error) -> anyhow::Result<postgres::Error> {
if pg_error.is_closed()
|| pg_error
.source()
.and_then(|source| source.downcast_ref::<std::io::Error>())
.map(is_expected_io_error)
.unwrap_or(false)
{
return Ok(pg_error);
} else if let Some(db_error) = pg_error.as_db_error() {
if db_error.code() == &SqlState::CONNECTION_FAILURE
&& db_error.message().contains("end streaming")
{
return Ok(pg_error);
}
}
Err(pg_error).context("connection error")
}

View File

@@ -111,6 +111,7 @@ pageserver_connect()
PQfinish(pageserver_conn);
pageserver_conn = NULL;
FreeWaitEventSet(pageserver_conn_wes);
pageserver_conn_wes = NULL;
neon_log(ERROR, "could not complete handshake with pageserver: %s",
msg);
@@ -179,7 +180,10 @@ pageserver_disconnect(void)
prefetch_on_ps_disconnect();
}
if (pageserver_conn_wes != NULL)
{
FreeWaitEventSet(pageserver_conn_wes);
pageserver_conn_wes = NULL;
}
}
static void
@@ -206,7 +210,7 @@ pageserver_send(NeonRequest * request)
*/
if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
{
char *msg = PQerrorMessage(pageserver_conn);
char *msg = pchomp(PQerrorMessage(pageserver_conn));
pageserver_disconnect();
neon_log(ERROR, "failed to send page request: %s", msg);
@@ -239,29 +243,33 @@ pageserver_receive(void)
PG_TRY();
{
/* read response */
resp_buff.len = call_PQgetCopyData(&resp_buff.data);
resp_buff.cursor = 0;
int rc;
if (resp_buff.len < 0)
rc = call_PQgetCopyData(&resp_buff.data);
if (rc >= 0)
{
if (resp_buff.len == -1)
resp_buff.len = rc;
resp_buff.cursor = 0;
resp = nm_unpack_response(&resp_buff);
PQfreemem(resp_buff.data);
if (message_level_is_interesting(PageStoreTrace))
{
pageserver_disconnect();
return NULL;
char *msg = nm_to_string((NeonMessage *) resp);
neon_log(PageStoreTrace, "got response: %s", msg);
pfree(msg);
}
else if (resp_buff.len == -2)
neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn));
}
resp = nm_unpack_response(&resp_buff);
PQfreemem(resp_buff.data);
if (message_level_is_interesting(PageStoreTrace))
else if (rc == -1)
{
char *msg = nm_to_string((NeonMessage *) resp);
neon_log(PageStoreTrace, "got response: %s", msg);
pfree(msg);
pageserver_disconnect();
resp = NULL;
}
else if (rc == -2)
neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn));
else
neon_log(ERROR, "unexpected PQgetCopyData return value: %d", rc);
}
PG_CATCH();
{

8
poetry.lock generated
View File

@@ -1418,7 +1418,7 @@ pbr = "*"
[[package]]
name = "setuptools"
version = "65.5.0"
version = "65.5.1"
description = "Easily download, build, install, upgrade, and uninstall Python packages"
category = "main"
optional = false
@@ -1426,7 +1426,7 @@ python-versions = ">=3.7"
[package.extras]
docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mock", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
[[package]]
@@ -2283,8 +2283,8 @@ sarif-om = [
{file = "sarif_om-1.0.4.tar.gz", hash = "sha256:cd5f416b3083e00d402a92e449a7ff67af46f11241073eea0461802a3b5aef98"},
]
setuptools = [
{file = "setuptools-65.5.0-py3-none-any.whl", hash = "sha256:f62ea9da9ed6289bfe868cd6845968a2c854d1427f8548d52cae02a42b4f0356"},
{file = "setuptools-65.5.0.tar.gz", hash = "sha256:512e5536220e38146176efb833d4a62aa726b7bbff82cfbc8ba9eaa3996e0b17"},
{file = "setuptools-65.5.1-py3-none-any.whl", hash = "sha256:d0b9a8433464d5800cbe05094acf5c6d52a91bfac9b52bcfc4d41382be5d5d31"},
{file = "setuptools-65.5.1.tar.gz", hash = "sha256:e197a19aa8ec9722928f2206f8de752def0e4c9fc6953527360d1c36d94ddb2f"},
]
six = [
{file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"},

View File

@@ -2,6 +2,7 @@
name = "proxy"
version = "0.1.0"
edition = "2021"
license = "Apache-2.0"
[dependencies]
anyhow = "1.0"
@@ -16,12 +17,14 @@ hashbrown = "0.12"
hex = "0.4.3"
hmac = "0.12.1"
hyper = "0.14"
hyper-tungstenite = "0.8.1"
itertools = "0.10.3"
md5 = "0.7.0"
once_cell = "1.13.0"
parking_lot = "0.12"
pin-project-lite = "0.2.7"
rand = "0.8.3"
regex = "1.4.5"
reqwest = { version = "0.11", default-features = false, features = [ "json", "rustls-tls" ] }
routerify = "3"
rustls = "0.20.0"
@@ -35,10 +38,12 @@ thiserror = "1.0.30"
tokio = { version = "1.17", features = ["macros"] }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
tokio-rustls = "0.23.0"
tls-listener = { version = "0.5.1", features = ["rustls", "hyper-h1"] }
tracing = "0.1.36"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
url = "2.2.2"
uuid = { version = "1.2", features = ["v4", "serde"] }
webpki-roots = "0.22.5"
x509-parser = "0.14"
metrics = { path = "../libs/metrics" }

View File

@@ -149,7 +149,7 @@ impl BackendType<'_, ClientCredentials<'_>> {
// If there's no project so far, that entails that client doesn't
// support SNI or other means of passing the project name.
// We now expect to see a very specific payload in the place of password.
let fetch_magic_payload = async {
let fetch_magic_payload = |client| async {
warn!("project name not specified, resorting to the password hack auth flow");
let payload = AuthFlow::new(client)
.begin(auth::PasswordHack)
@@ -161,10 +161,26 @@ impl BackendType<'_, ClientCredentials<'_>> {
auth::Result::Ok(payload)
};
// If we want to use cleartext password flow, we can read the password
// from the client and pretend that it's a magic payload (PasswordHack hack).
let fetch_plaintext_password = |client| async {
info!("using cleartext password flow");
let payload = AuthFlow::new(client)
.begin(auth::CleartextPassword)
.await?
.authenticate()
.await?;
auth::Result::Ok(auth::password_hack::PasswordHackPayload {
project: String::new(),
password: payload,
})
};
// TODO: find a proper way to merge those very similar blocks.
let (mut node, payload) = match self {
Console(endpoint, creds) if creds.project.is_none() => {
let payload = fetch_magic_payload.await?;
let payload = fetch_magic_payload(client).await?;
let mut creds = creds.as_ref();
creds.project = Some(payload.project.as_str().into());
@@ -174,8 +190,18 @@ impl BackendType<'_, ClientCredentials<'_>> {
(node, payload)
}
Console(endpoint, creds) if creds.use_cleartext_password_flow => {
// This is a hack to allow cleartext password in secure connections (wss).
let payload = fetch_plaintext_password(client).await?;
let creds = creds.as_ref();
let node = console::Api::new(endpoint, extra, &creds)
.wake_compute()
.await?;
(node, payload)
}
Postgres(endpoint, creds) if creds.project.is_none() => {
let payload = fetch_magic_payload.await?;
let payload = fetch_magic_payload(client).await?;
let mut creds = creds.as_ref();
creds.project = Some(payload.project.as_str().into());

View File

@@ -34,6 +34,9 @@ pub struct ClientCredentials<'a> {
pub user: &'a str,
pub dbname: &'a str,
pub project: Option<Cow<'a, str>>,
/// If `True`, we'll use the old cleartext password flow. This is used for
/// websocket connections, which want to minimize the number of round trips.
pub use_cleartext_password_flow: bool,
}
impl ClientCredentials<'_> {
@@ -50,6 +53,7 @@ impl<'a> ClientCredentials<'a> {
user: self.user,
dbname: self.dbname,
project: self.project().map(Cow::Borrowed),
use_cleartext_password_flow: self.use_cleartext_password_flow,
}
}
}
@@ -59,6 +63,7 @@ impl<'a> ClientCredentials<'a> {
params: &'a StartupMessageParams,
sni: Option<&str>,
common_name: Option<&str>,
use_cleartext_password_flow: bool,
) -> Result<Self, ClientCredsParseError> {
use ClientCredsParseError::*;
@@ -108,6 +113,7 @@ impl<'a> ClientCredentials<'a> {
user = user,
dbname = dbname,
project = project.as_deref(),
use_cleartext_password_flow = use_cleartext_password_flow,
"credentials"
);
@@ -115,6 +121,7 @@ impl<'a> ClientCredentials<'a> {
user,
dbname,
project,
use_cleartext_password_flow,
})
}
}
@@ -141,7 +148,7 @@ mod tests {
let options = StartupMessageParams::new([("user", "john_doe")]);
// TODO: check that `creds.dbname` is None.
let creds = ClientCredentials::parse(&options, None, None)?;
let creds = ClientCredentials::parse(&options, None, None, false)?;
assert_eq!(creds.user, "john_doe");
Ok(())
@@ -151,7 +158,7 @@ mod tests {
fn parse_missing_project() -> anyhow::Result<()> {
let options = StartupMessageParams::new([("user", "john_doe"), ("database", "world")]);
let creds = ClientCredentials::parse(&options, None, None)?;
let creds = ClientCredentials::parse(&options, None, None, false)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.dbname, "world");
assert_eq!(creds.project, None);
@@ -166,7 +173,7 @@ mod tests {
let sni = Some("foo.localhost");
let common_name = Some("localhost");
let creds = ClientCredentials::parse(&options, sni, common_name)?;
let creds = ClientCredentials::parse(&options, sni, common_name, false)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.dbname, "world");
assert_eq!(creds.project.as_deref(), Some("foo"));
@@ -182,7 +189,7 @@ mod tests {
("options", "-ckey=1 project=bar -c geqo=off"),
]);
let creds = ClientCredentials::parse(&options, None, None)?;
let creds = ClientCredentials::parse(&options, None, None, false)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.dbname, "world");
assert_eq!(creds.project.as_deref(), Some("bar"));
@@ -201,7 +208,7 @@ mod tests {
let sni = Some("baz.localhost");
let common_name = Some("localhost");
let creds = ClientCredentials::parse(&options, sni, common_name)?;
let creds = ClientCredentials::parse(&options, sni, common_name, false)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.dbname, "world");
assert_eq!(creds.project.as_deref(), Some("baz"));
@@ -220,7 +227,8 @@ mod tests {
let sni = Some("second.localhost");
let common_name = Some("localhost");
let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail");
let err =
ClientCredentials::parse(&options, sni, common_name, false).expect_err("should fail");
match err {
InconsistentProjectNames { domain, option } => {
assert_eq!(option, "first");
@@ -237,7 +245,8 @@ mod tests {
let sni = Some("project.localhost");
let common_name = Some("example.com");
let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail");
let err =
ClientCredentials::parse(&options, sni, common_name, false).expect_err("should fail");
match err {
InconsistentSni { sni, cn } => {
assert_eq!(sni, "project.localhost");

View File

@@ -37,6 +37,17 @@ impl AuthMethod for PasswordHack {
}
}
/// Use clear-text password auth called `password` in docs
/// <https://www.postgresql.org/docs/current/auth-password.html>
pub struct CleartextPassword;
impl AuthMethod for CleartextPassword {
#[inline(always)]
fn first_message(&self) -> BeMessage<'_> {
Be::AuthenticationCleartextPassword
}
}
/// This wrapper for [`PqStream`] performs client authentication.
#[must_use]
pub struct AuthFlow<'a, Stream, State> {
@@ -86,6 +97,18 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, PasswordHack> {
}
}
impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, CleartextPassword> {
/// Perform user authentication. Raise an error in case authentication failed.
pub async fn authenticate(self) -> super::Result<Vec<u8>> {
let msg = self.stream.read_password_message().await?;
let password = msg
.strip_suffix(&[0])
.ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?;
Ok(password.to_vec())
}
}
/// Stream wrapper for handling [SCRAM](crate::scram) auth.
impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
/// Perform user authentication. Raise an error in case authentication failed.

View File

@@ -1,4 +1,5 @@
pub mod server;
pub mod websocket;
use crate::url::ApiUrl;

263
proxy/src/http/websocket.rs Normal file
View File

@@ -0,0 +1,263 @@
use bytes::{Buf, Bytes};
use futures::{Sink, Stream, StreamExt};
use hyper::server::accept::{self};
use hyper::server::conn::AddrIncoming;
use hyper::upgrade::Upgraded;
use hyper::{Body, Request, Response, StatusCode};
use hyper_tungstenite::{tungstenite, WebSocketStream};
use hyper_tungstenite::{tungstenite::Message, HyperWebsocket};
use pin_project_lite::pin_project;
use tokio::net::TcpListener;
use std::convert::Infallible;
use std::future::ready;
use std::pin::Pin;
use std::sync::Arc;
use std::task::{Context, Poll};
use tls_listener::TlsListener;
use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf};
use tracing::{error, info, info_span, warn, Instrument};
use utils::http::{error::ApiError, json::json_response};
use crate::cancellation::CancelMap;
use crate::config::ProxyConfig;
use crate::proxy::handle_ws_client;
pin_project! {
/// This is a wrapper around a WebSocketStream that implements AsyncRead and AsyncWrite.
pub struct WebSocketRW {
#[pin]
stream: WebSocketStream<Upgraded>,
chunk: Option<bytes::Bytes>,
}
}
// FIXME: explain why this is safe or try to remove `unsafe impl`.
unsafe impl Sync for WebSocketRW {}
impl WebSocketRW {
pub fn new(stream: WebSocketStream<Upgraded>) -> Self {
Self {
stream,
chunk: None,
}
}
fn has_chunk(&self) -> bool {
if let Some(ref chunk) = self.chunk {
chunk.remaining() > 0
} else {
false
}
}
}
fn ws_err_into(e: tungstenite::Error) -> io::Error {
io::Error::new(io::ErrorKind::Other, e.to_string())
}
impl AsyncWrite for WebSocketRW {
fn poll_write(
self: Pin<&mut Self>,
cx: &mut Context<'_>,
buf: &[u8],
) -> Poll<Result<usize, io::Error>> {
let mut this = self.project();
match this.stream.as_mut().poll_ready(cx) {
Poll::Ready(Ok(())) => {
if let Err(e) = this
.stream
.as_mut()
.start_send(Message::Binary(buf.to_vec()))
{
Poll::Ready(Err(ws_err_into(e)))
} else {
Poll::Ready(Ok(buf.len()))
}
}
Poll::Ready(Err(e)) => Poll::Ready(Err(ws_err_into(e))),
Poll::Pending => {
cx.waker().wake_by_ref();
Poll::Pending
}
}
}
fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), io::Error>> {
self.project().stream.poll_flush(cx).map_err(ws_err_into)
}
fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), io::Error>> {
self.project().stream.poll_close(cx).map_err(ws_err_into)
}
}
impl AsyncRead for WebSocketRW {
fn poll_read(
mut self: Pin<&mut Self>,
cx: &mut Context<'_>,
buf: &mut ReadBuf<'_>,
) -> Poll<io::Result<()>> {
if buf.remaining() == 0 {
return Poll::Ready(Ok(()));
}
let inner_buf = match self.as_mut().poll_fill_buf(cx) {
Poll::Ready(Ok(buf)) => buf,
Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
Poll::Pending => return Poll::Pending,
};
let len = std::cmp::min(inner_buf.len(), buf.remaining());
buf.put_slice(&inner_buf[..len]);
self.consume(len);
Poll::Ready(Ok(()))
}
}
impl AsyncBufRead for WebSocketRW {
fn poll_fill_buf(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<&[u8]>> {
loop {
if self.as_mut().has_chunk() {
let buf = self.project().chunk.as_ref().unwrap().chunk();
return Poll::Ready(Ok(buf));
} else {
match self.as_mut().project().stream.poll_next(cx) {
Poll::Ready(Some(Ok(message))) => match message {
Message::Text(_) => {}
Message::Binary(chunk) => {
*self.as_mut().project().chunk = Some(Bytes::from(chunk));
}
Message::Ping(_) => {
// No need to send a reply: tungstenite takes care of this for you.
}
Message::Pong(_) => {}
Message::Close(_) => {
// No need to send a reply: tungstenite takes care of this for you.
return Poll::Ready(Ok(&[]));
}
Message::Frame(_) => {
unreachable!();
}
},
Poll::Ready(Some(Err(err))) => return Poll::Ready(Err(ws_err_into(err))),
Poll::Ready(None) => return Poll::Ready(Ok(&[])),
Poll::Pending => return Poll::Pending,
}
}
}
}
fn consume(self: Pin<&mut Self>, amt: usize) {
if amt > 0 {
self.project()
.chunk
.as_mut()
.expect("No chunk present")
.advance(amt);
}
}
}
async fn serve_websocket(
websocket: HyperWebsocket,
config: &ProxyConfig,
cancel_map: &CancelMap,
session_id: uuid::Uuid,
hostname: Option<String>,
) -> anyhow::Result<()> {
let websocket = websocket.await?;
handle_ws_client(
config,
cancel_map,
session_id,
WebSocketRW::new(websocket),
hostname,
)
.await?;
Ok(())
}
async fn ws_handler(
mut request: Request<Body>,
config: &'static ProxyConfig,
cancel_map: Arc<CancelMap>,
session_id: uuid::Uuid,
) -> Result<Response<Body>, ApiError> {
let host = request
.headers()
.get("host")
.and_then(|h| h.to_str().ok())
.and_then(|h| h.split(':').next())
.map(|s| s.to_string());
// Check if the request is a websocket upgrade request.
if hyper_tungstenite::is_upgrade_request(&request) {
let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
.map_err(|e| ApiError::BadRequest(e.into()))?;
tokio::spawn(async move {
if let Err(e) = serve_websocket(websocket, config, &cancel_map, session_id, host).await
{
error!("error in websocket connection: {:?}", e);
}
});
// Return the response so the spawned future can continue.
Ok(response)
} else {
json_response(StatusCode::OK, "Connect with a websocket client")
}
}
pub async fn task_main(
ws_listener: TcpListener,
config: &'static ProxyConfig,
) -> anyhow::Result<()> {
scopeguard::defer! {
info!("websocket server has shut down");
}
let tls_config = config.tls_config.as_ref().map(|cfg| cfg.to_server_config());
let tls_acceptor: tokio_rustls::TlsAcceptor = match tls_config {
Some(config) => config.into(),
None => {
warn!("TLS config is missing, WebSocket Secure server will not be started");
return Ok(());
}
};
let addr_incoming = AddrIncoming::from_listener(ws_listener)?;
let tls_listener = TlsListener::new(tls_acceptor, addr_incoming).filter(|conn| {
if let Err(err) = conn {
error!("failed to accept TLS connection for websockets: {:?}", err);
ready(false)
} else {
ready(true)
}
});
let make_svc = hyper::service::make_service_fn(|_stream| async move {
Ok::<_, Infallible>(hyper::service::service_fn(
move |req: Request<Body>| async move {
let cancel_map = Arc::new(CancelMap::default());
let session_id = uuid::Uuid::new_v4();
ws_handler(req, config, cancel_map, session_id)
.instrument(info_span!(
"ws-client",
session = format_args!("{session_id}")
))
.await
},
))
});
hyper::Server::builder(accept::from_stream(tls_listener))
.serve(make_svc)
.await?;
Ok(())
}

View File

@@ -110,12 +110,23 @@ async fn main() -> anyhow::Result<()> {
info!("Starting proxy on {proxy_address}");
let proxy_listener = TcpListener::bind(proxy_address).await?;
let tasks = [
let mut tasks = vec![
tokio::spawn(http::server::task_main(http_listener)),
tokio::spawn(proxy::task_main(config, proxy_listener)),
tokio::task::spawn_blocking(move || mgmt::thread_main(mgmt_listener)),
]
.map(flatten_err);
];
if let Some(wss_address) = arg_matches.get_one::<String>("wss") {
let wss_address: SocketAddr = wss_address.parse()?;
info!("Starting wss on {}", wss_address);
let wss_listener = TcpListener::bind(wss_address).await?;
tasks.push(tokio::spawn(http::websocket::task_main(
wss_listener,
config,
)));
}
let tasks = tasks.into_iter().map(flatten_err);
set_build_info_metric(GIT_VERSION);
// This will block until all tasks have completed.
@@ -155,6 +166,11 @@ fn cli() -> clap::Command {
.help("listen for incoming http connections (metrics, etc) on ip:port")
.default_value("127.0.0.1:7001"),
)
.arg(
Arg::new("wss")
.long("wss")
.help("listen for incoming wss connections on ip:port"),
)
.arg(
Arg::new("uri")
.short('u')

View File

@@ -9,7 +9,10 @@ use std::{
thread,
};
use tracing::{error, info, info_span};
use utils::postgres_backend::{self, AuthType, PostgresBackend};
use utils::{
postgres_backend::{self, AuthType, PostgresBackend},
postgres_backend_async::QueryError,
};
/// Console management API listener thread.
/// It spawns console response handlers needed for the link auth.
@@ -47,7 +50,7 @@ pub fn thread_main(listener: TcpListener) -> anyhow::Result<()> {
}
}
fn handle_connection(socket: TcpStream) -> anyhow::Result<()> {
fn handle_connection(socket: TcpStream) -> Result<(), QueryError> {
let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None, true)?;
pgbackend.run(&mut MgmtHandler)
}
@@ -58,7 +61,7 @@ pub type ComputeReady = Result<DatabaseInfo, String>;
// TODO: replace with an http-based protocol.
struct MgmtHandler;
impl postgres_backend::Handler for MgmtHandler {
fn process_query(&mut self, pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<()> {
fn process_query(&mut self, pgb: &mut PostgresBackend, query: &str) -> Result<(), QueryError> {
try_process_query(pgb, query).map_err(|e| {
error!("failed to process response: {e:?}");
e
@@ -66,8 +69,8 @@ impl postgres_backend::Handler for MgmtHandler {
}
}
fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<()> {
let resp: KickSession = serde_json::from_str(query)?;
fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> Result<(), QueryError> {
let resp: KickSession = serde_json::from_str(query).context("Failed to parse query as json")?;
let span = info_span!("event", session_id = resp.session_id);
let _enter = span.enter();
@@ -81,7 +84,7 @@ fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<(
}
Err(e) => {
error!("failed to deliver response to per-client task");
pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
pgb.write_message(&BeMessage::ErrorResponse(&e.to_string(), None))?;
}
}

View File

@@ -82,6 +82,47 @@ pub async fn task_main(
}
}
pub async fn handle_ws_client(
config: &ProxyConfig,
cancel_map: &CancelMap,
session_id: uuid::Uuid,
stream: impl AsyncRead + AsyncWrite + Unpin + Send,
hostname: Option<String>,
) -> anyhow::Result<()> {
// The `closed` counter will increase when this future is destroyed.
NUM_CONNECTIONS_ACCEPTED_COUNTER.inc();
scopeguard::defer! {
NUM_CONNECTIONS_CLOSED_COUNTER.inc();
}
let tls = config.tls_config.as_ref();
let hostname = hostname.as_deref();
// TLS is None here, because the connection is already encrypted.
let do_handshake = handshake(stream, None, cancel_map).instrument(info_span!("handshake"));
let (mut stream, params) = match do_handshake.await? {
Some(x) => x,
None => return Ok(()), // it's a cancellation request
};
// Extract credentials which we're going to use for auth.
let creds = {
let common_name = tls.and_then(|tls| tls.common_name.as_deref());
let result = config
.auth_backend
.as_ref()
.map(|_| auth::ClientCredentials::parse(&params, hostname, common_name, true))
.transpose();
async { result }.or_else(|e| stream.throw_error(e)).await?
};
let client = Client::new(stream, creds, &params, session_id);
cancel_map
.with_session(|session| client.connect_to_db(session))
.await
}
async fn handle_client(
config: &ProxyConfig,
cancel_map: &CancelMap,
@@ -108,7 +149,7 @@ async fn handle_client(
let result = config
.auth_backend
.as_ref()
.map(|_| auth::ClientCredentials::parse(&params, sni, common_name))
.map(|_| auth::ClientCredentials::parse(&params, sni, common_name, false))
.transpose();
async { result }.or_else(|e| stream.throw_error(e)).await?

View File

@@ -2,7 +2,7 @@ use crate::error::UserFacingError;
use anyhow::bail;
use bytes::BytesMut;
use pin_project_lite::pin_project;
use pq_proto::{BeMessage, FeMessage, FeStartupPacket};
use pq_proto::{BeMessage, ConnectionError, FeMessage, FeStartupPacket};
use rustls::ServerConfig;
use std::pin::Pin;
use std::sync::Arc;
@@ -47,18 +47,13 @@ fn err_connection() -> io::Error {
io::Error::new(io::ErrorKind::ConnectionAborted, "connection is lost")
}
// TODO: change error type of `FeMessage::read_fut`
fn from_anyhow(e: anyhow::Error) -> io::Error {
io::Error::new(io::ErrorKind::Other, e.to_string())
}
impl<S: AsyncRead + Unpin> PqStream<S> {
/// Receive [`FeStartupPacket`], which is a first packet sent by a client.
pub async fn read_startup_packet(&mut self) -> io::Result<FeStartupPacket> {
// TODO: `FeStartupPacket::read_fut` should return `FeStartupPacket`
let msg = FeStartupPacket::read_fut(&mut self.stream)
.await
.map_err(from_anyhow)?
.map_err(ConnectionError::into_io_error)?
.ok_or_else(err_connection)?;
match msg {
@@ -80,7 +75,7 @@ impl<S: AsyncRead + Unpin> PqStream<S> {
async fn read_message(&mut self) -> io::Result<FeMessage> {
FeMessage::read_fut(&mut self.stream)
.await
.map_err(from_anyhow)?
.map_err(ConnectionError::into_io_error)?
.ok_or_else(err_connection)
}
}
@@ -112,7 +107,8 @@ impl<S: AsyncWrite + Unpin> PqStream<S> {
/// This method exists due to `&str` not implementing `Into<anyhow::Error>`.
pub async fn throw_error_str<T>(&mut self, error: &'static str) -> anyhow::Result<T> {
tracing::info!("forwarding error to user: {error}");
self.write_message(&BeMessage::ErrorResponse(error)).await?;
self.write_message(&BeMessage::ErrorResponse(error, None))
.await?;
bail!(error)
}
@@ -124,7 +120,8 @@ impl<S: AsyncWrite + Unpin> PqStream<S> {
{
let msg = error.to_string_client();
tracing::info!("forwarding error to user: {msg}");
self.write_message(&BeMessage::ErrorResponse(&msg)).await?;
self.write_message(&BeMessage::ErrorResponse(&msg, None))
.await?;
bail!(error)
}
}

View File

@@ -9,8 +9,8 @@
# In vscode, this setting is Rust-analyzer>Check On Save:Command
# Not every feature is supported in macOS builds, e.g. `profiling`,
# avoid running regular linting script that checks every feature.
# Not every feature is supported in macOS builds. Avoid running regular linting
# script that checks every feature.
if [[ "$OSTYPE" == "darwin"* ]]; then
# no extra features to test currently, add more here when needed
cargo clippy --locked --all --all-targets --features testing -- -A unknown_lints -D warnings

View File

@@ -2,6 +2,7 @@
name = "safekeeper"
version = "0.1.0"
edition = "2021"
license = "Apache-2.0"
[dependencies]
async-stream = "0.3"

View File

@@ -143,6 +143,19 @@ fn main() -> anyhow::Result<()> {
return Ok(());
}
let auth = match args.auth_validation_public_key_path.as_ref() {
None => {
info!("auth is disabled");
None
}
Some(path) => {
info!("loading JWT auth key from {}", path.display());
Some(Arc::new(
JwtAuth::from_key_path(path).context("failed to load the auth key")?,
))
}
};
let conf = SafeKeeperConf {
workdir,
my_id: id,
@@ -156,7 +169,7 @@ fn main() -> anyhow::Result<()> {
max_offloader_lag_bytes: args.max_offloader_lag,
backup_runtime_threads: args.wal_backup_threads,
wal_backup_enabled: !args.disable_wal_backup,
auth_validation_public_key_path: args.auth_validation_public_key_path,
auth,
};
// initialize sentry if SENTRY_DSN is provided
@@ -186,19 +199,6 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
e
})?;
let auth = match conf.auth_validation_public_key_path.as_ref() {
None => {
info!("auth is disabled");
None
}
Some(path) => {
info!("loading JWT auth key from {}", path.display());
Some(Arc::new(
JwtAuth::from_key_path(path).context("failed to load the auth key")?,
))
}
};
// Register metrics collector for active timelines. It's important to do this
// after daemonizing, otherwise process collector will be upset.
let timeline_collector = safekeeper::metrics::TimelineCollector::new();
@@ -212,12 +212,11 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx)?;
let conf_ = conf.clone();
let auth_ = auth.clone();
threads.push(
thread::Builder::new()
.name("http_endpoint_thread".into())
.spawn(|| {
let router = http::make_router(conf_, auth_);
let router = http::make_router(conf_);
endpoint::serve_thread_main(
router,
http_listener,
@@ -230,11 +229,7 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
let conf_cloned = conf.clone();
let safekeeper_thread = thread::Builder::new()
.name("safekeeper thread".into())
.spawn(|| {
if let Err(e) = wal_service::thread_main(conf_cloned, pg_listener, auth) {
info!("safekeeper thread terminated: {e}");
}
})
.spawn(|| wal_service::thread_main(conf_cloned, pg_listener))
.unwrap();
threads.push(safekeeper_thread);
@@ -244,7 +239,6 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
thread::Builder::new()
.name("broker thread".into())
.spawn(|| {
// TODO: add auth?
broker::thread_main(conf_);
})?,
);

View File

@@ -8,16 +8,16 @@ use crate::receive_wal::ReceiveWalConn;
use crate::send_wal::ReplicationConn;
use crate::{GlobalTimelines, SafeKeeperConf};
use anyhow::{bail, ensure, Context, Result};
use anyhow::Context;
use postgres_ffi::PG_TLI;
use regex::Regex;
use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID};
use std::str;
use std::sync::Arc;
use tracing::info;
use utils::auth::{Claims, JwtAuth, Scope};
use utils::auth::{Claims, Scope};
use utils::postgres_backend_async::QueryError;
use utils::{
id::{TenantId, TenantTimelineId, TimelineId},
lsn::Lsn,
@@ -32,7 +32,6 @@ pub struct SafekeeperPostgresHandler {
pub tenant_id: Option<TenantId>,
pub timeline_id: Option<TimelineId>,
pub ttid: TenantTimelineId,
auth: Option<Arc<JwtAuth>>,
claims: Option<Claims>,
}
@@ -44,7 +43,7 @@ enum SafekeeperPostgresCommand {
JSONCtrl { cmd: AppendLogicalMessage },
}
fn parse_cmd(cmd: &str) -> Result<SafekeeperPostgresCommand> {
fn parse_cmd(cmd: &str) -> anyhow::Result<SafekeeperPostgresCommand> {
if cmd.starts_with("START_WAL_PUSH") {
Ok(SafekeeperPostgresCommand::StartWalPush)
} else if cmd.starts_with("START_REPLICATION") {
@@ -64,13 +63,17 @@ fn parse_cmd(cmd: &str) -> Result<SafekeeperPostgresCommand> {
cmd: serde_json::from_str(cmd)?,
})
} else {
bail!("unsupported command {}", cmd);
anyhow::bail!("unsupported command {cmd}");
}
}
impl postgres_backend::Handler for SafekeeperPostgresHandler {
// tenant_id and timeline_id are passed in connection string params
fn startup(&mut self, _pgb: &mut PostgresBackend, sm: &FeStartupPacket) -> Result<()> {
fn startup(
&mut self,
_pgb: &mut PostgresBackend,
sm: &FeStartupPacket,
) -> Result<(), QueryError> {
if let FeStartupPacket::StartupMessage { params, .. } = sm {
if let Some(options) = params.options_raw() {
for opt in options {
@@ -79,10 +82,14 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
// https://github.com/neondatabase/neon/pull/2433#discussion_r970005064
match opt.split_once('=') {
Some(("ztenantid", value)) | Some(("tenant_id", value)) => {
self.tenant_id = Some(value.parse()?);
self.tenant_id = Some(value.parse().with_context(|| {
format!("Failed to parse {value} as tenant id")
})?);
}
Some(("ztimelineid", value)) | Some(("timeline_id", value)) => {
self.timeline_id = Some(value.parse()?);
self.timeline_id = Some(value.parse().with_context(|| {
format!("Failed to parse {value} as timeline id")
})?);
}
_ => continue,
}
@@ -95,7 +102,9 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
Ok(())
} else {
bail!("Safekeeper received unexpected initial message: {:?}", sm);
Err(QueryError::Other(anyhow::anyhow!(
"Safekeeper received unexpected initial message: {sm:?}"
)))
}
}
@@ -103,20 +112,20 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
&mut self,
_pgb: &mut PostgresBackend,
jwt_response: &[u8],
) -> anyhow::Result<()> {
) -> Result<(), QueryError> {
// this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT
// which requires auth to be present
let data = self
.conf
.auth
.as_ref()
.unwrap()
.decode(str::from_utf8(jwt_response)?)?;
.decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;
if matches!(data.claims.scope, Scope::Tenant) {
ensure!(
data.claims.tenant_id.is_some(),
if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() {
return Err(QueryError::Other(anyhow::anyhow!(
"jwt token scope is Tenant, but tenant id is missing"
)
)));
}
info!(
@@ -128,7 +137,11 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
Ok(())
}
fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()> {
fn process_query(
&mut self,
pgb: &mut PostgresBackend,
query_string: &str,
) -> Result<(), QueryError> {
if query_string
.to_ascii_lowercase()
.starts_with("set datestyle to ")
@@ -149,39 +162,45 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
self.check_permission(Some(tenant_id))?;
self.ttid = TenantTimelineId::new(tenant_id, timeline_id);
match cmd {
let res = match cmd {
SafekeeperPostgresCommand::StartWalPush => ReceiveWalConn::new(pgb).run(self),
SafekeeperPostgresCommand::StartReplication { start_lsn } => {
ReplicationConn::new(pgb).run(self, pgb, start_lsn)
}
SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb),
SafekeeperPostgresCommand::JSONCtrl { ref cmd } => handle_json_ctrl(self, pgb, cmd),
}
.context(format!(
"Failed to process query for timeline {timeline_id}"
))?;
};
Ok(())
match res {
Ok(()) => Ok(()),
Err(QueryError::Disconnected(connection_error)) => {
info!("Timeline {tenant_id}/{timeline_id} query failed with connection error: {connection_error}");
Err(QueryError::Disconnected(connection_error))
}
Err(QueryError::Other(e)) => Err(QueryError::Other(e.context(format!(
"Failed to process query for timeline {}",
self.ttid
)))),
}
}
}
impl SafekeeperPostgresHandler {
pub fn new(conf: SafeKeeperConf, auth: Option<Arc<JwtAuth>>) -> Self {
pub fn new(conf: SafeKeeperConf) -> Self {
SafekeeperPostgresHandler {
conf,
appname: None,
tenant_id: None,
timeline_id: None,
ttid: TenantTimelineId::empty(),
auth,
claims: None,
}
}
// when accessing management api supply None as an argument
// when using to authorize tenant pass corresponding tenant id
fn check_permission(&self, tenant_id: Option<TenantId>) -> Result<()> {
if self.auth.is_none() {
fn check_permission(&self, tenant_id: Option<TenantId>) -> anyhow::Result<()> {
if self.conf.auth.is_none() {
// auth is set to Trust, nothing to check so just return ok
return Ok(());
}
@@ -198,7 +217,7 @@ impl SafekeeperPostgresHandler {
///
/// Handle IDENTIFY_SYSTEM replication command
///
fn handle_identify_system(&mut self, pgb: &mut PostgresBackend) -> Result<()> {
fn handle_identify_system(&mut self, pgb: &mut PostgresBackend) -> Result<(), QueryError> {
let tli = GlobalTimelines::get(self.ttid)?;
let lsn = if self.is_walproposer_recovery() {

View File

@@ -277,12 +277,9 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
}
/// Safekeeper http router.
pub fn make_router(
conf: SafeKeeperConf,
auth: Option<Arc<JwtAuth>>,
) -> RouterBuilder<hyper::Body, ApiError> {
pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError> {
let mut router = endpoint::make_router();
if auth.is_some() {
if conf.auth.is_some() {
router = router.middleware(auth_middleware(|request| {
#[allow(clippy::mutable_key_type)]
static ALLOWLIST_ROUTES: Lazy<HashSet<Uri>> =
@@ -298,6 +295,7 @@ pub fn make_router(
// NB: on any changes do not forget to update the OpenAPI spec
// located nearby (/safekeeper/src/http/openapi_spec.yaml).
let auth = conf.auth.clone();
router
.data(Arc::new(conf))
.data(auth)

View File

@@ -8,11 +8,12 @@
use std::sync::Arc;
use anyhow::Result;
use anyhow::Context;
use bytes::Bytes;
use serde::{Deserialize, Serialize};
use tracing::*;
use utils::id::TenantTimelineId;
use utils::postgres_backend_async::QueryError;
use crate::handler::SafekeeperPostgresHandler;
use crate::safekeeper::{AcceptorProposerMessage, AppendResponse, ServerInfo};
@@ -47,7 +48,7 @@ pub struct AppendLogicalMessage {
pg_version: u32,
}
#[derive(Serialize, Deserialize)]
#[derive(Debug, Serialize, Deserialize)]
struct AppendResult {
// safekeeper state after append
state: SafeKeeperState,
@@ -62,8 +63,8 @@ pub fn handle_json_ctrl(
spg: &SafekeeperPostgresHandler,
pgb: &mut PostgresBackend,
append_request: &AppendLogicalMessage,
) -> Result<()> {
info!("JSON_CTRL request: {:?}", append_request);
) -> Result<(), QueryError> {
info!("JSON_CTRL request: {append_request:?}");
// need to init safekeeper state before AppendRequest
let tli = prepare_safekeeper(spg.ttid, append_request.pg_version)?;
@@ -78,7 +79,8 @@ pub fn handle_json_ctrl(
state: tli.get_state().1,
inserted_wal,
};
let response_data = serde_json::to_vec(&response)?;
let response_data = serde_json::to_vec(&response)
.with_context(|| format!("Response {response:?} is not a json array"))?;
pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor {
name: b"json",
@@ -93,7 +95,7 @@ pub fn handle_json_ctrl(
/// Prepare safekeeper to process append requests without crashes,
/// by sending ProposerGreeting with default server.wal_seg_size.
fn prepare_safekeeper(ttid: TenantTimelineId, pg_version: u32) -> Result<Arc<Timeline>> {
fn prepare_safekeeper(ttid: TenantTimelineId, pg_version: u32) -> anyhow::Result<Arc<Timeline>> {
GlobalTimelines::create(
ttid,
ServerInfo {
@@ -106,7 +108,7 @@ fn prepare_safekeeper(ttid: TenantTimelineId, pg_version: u32) -> Result<Arc<Tim
)
}
fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> Result<()> {
fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> anyhow::Result<()> {
// add new term to existing history
let history = tli.get_state().1.acceptor_state.term_history;
let history = history.up_to(lsn.checked_sub(1u64).unwrap());
@@ -125,7 +127,7 @@ fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> Result<()
Ok(())
}
#[derive(Serialize, Deserialize)]
#[derive(Debug, Serialize, Deserialize)]
struct InsertedWAL {
begin_lsn: Lsn,
end_lsn: Lsn,
@@ -134,7 +136,10 @@ struct InsertedWAL {
/// Extend local WAL with new LogicalMessage record. To do that,
/// create AppendRequest with new WAL and pass it to safekeeper.
fn append_logical_message(tli: &Arc<Timeline>, msg: &AppendLogicalMessage) -> Result<InsertedWAL> {
fn append_logical_message(
tli: &Arc<Timeline>,
msg: &AppendLogicalMessage,
) -> anyhow::Result<InsertedWAL> {
let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message);
let sk_state = tli.get_state().1;

View File

@@ -24,7 +24,9 @@ pub mod wal_service;
pub mod wal_storage;
mod timelines_global_map;
use std::sync::Arc;
pub use timelines_global_map::GlobalTimelines;
use utils::auth::JwtAuth;
pub mod defaults {
pub use safekeeper_api::{
@@ -57,7 +59,7 @@ pub struct SafeKeeperConf {
pub max_offloader_lag_bytes: u64,
pub backup_runtime_threads: Option<usize>,
pub wal_backup_enabled: bool,
pub auth_validation_public_key_path: Option<PathBuf>,
pub auth: Option<Arc<JwtAuth>>,
}
impl SafeKeeperConf {
@@ -87,7 +89,7 @@ impl SafeKeeperConf {
broker_keepalive_interval: Duration::from_secs(5),
backup_runtime_threads: None,
wal_backup_enabled: true,
auth_validation_public_key_path: None,
auth: None,
heartbeat_timeout: Duration::new(5, 0),
max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES,
}

View File

@@ -2,11 +2,13 @@
//! Gets messages from the network, passes them down to consensus module and
//! sends replies back.
use anyhow::{anyhow, bail, Result};
use anyhow::anyhow;
use anyhow::Context;
use bytes::BytesMut;
use tracing::*;
use utils::lsn::Lsn;
use utils::postgres_backend_async::QueryError;
use crate::safekeeper::ServerInfo;
use crate::timeline::Timeline;
@@ -43,7 +45,7 @@ impl<'pg> ReceiveWalConn<'pg> {
}
// Send message to the postgres
fn write_msg(&mut self, msg: &AcceptorProposerMessage) -> Result<()> {
fn write_msg(&mut self, msg: &AcceptorProposerMessage) -> anyhow::Result<()> {
let mut buf = BytesMut::with_capacity(128);
msg.serialize(&mut buf)?;
self.pg_backend.write_message(&BeMessage::CopyData(&buf))?;
@@ -51,7 +53,7 @@ impl<'pg> ReceiveWalConn<'pg> {
}
/// Receive WAL from wal_proposer
pub fn run(&mut self, spg: &mut SafekeeperPostgresHandler) -> Result<()> {
pub fn run(&mut self, spg: &mut SafekeeperPostgresHandler) -> Result<(), QueryError> {
let _enter = info_span!("WAL acceptor", ttid = %spg.ttid).entered();
// Notify the libpq client that it's allowed to send `CopyData` messages
@@ -79,7 +81,11 @@ impl<'pg> ReceiveWalConn<'pg> {
};
GlobalTimelines::create(spg.ttid, server_info, Lsn::INVALID, Lsn::INVALID)?
}
_ => bail!("unexpected message {:?} instead of greeting", next_msg),
_ => {
return Err(QueryError::Other(anyhow::anyhow!(
"unexpected message {next_msg:?} instead of greeting"
)))
}
};
let mut next_msg = Some(next_msg);
@@ -134,25 +140,32 @@ impl<'pg> ReceiveWalConn<'pg> {
struct ProposerPollStream {
msg_rx: Receiver<ProposerAcceptorMessage>,
read_thread: Option<thread::JoinHandle<Result<()>>>,
read_thread: Option<thread::JoinHandle<Result<(), QueryError>>>,
}
impl ProposerPollStream {
fn new(mut r: ReadStream) -> Result<Self> {
fn new(mut r: ReadStream) -> anyhow::Result<Self> {
let (msg_tx, msg_rx) = channel();
let read_thread = thread::Builder::new()
.name("Read WAL thread".into())
.spawn(move || -> Result<()> {
.spawn(move || -> Result<(), QueryError> {
loop {
let copy_data = match FeMessage::read(&mut r)? {
Some(FeMessage::CopyData(bytes)) => bytes,
Some(msg) => bail!("expected `CopyData` message, found {:?}", msg),
None => bail!("connection closed unexpectedly"),
};
Some(FeMessage::CopyData(bytes)) => Ok(bytes),
Some(msg) => Err(QueryError::Other(anyhow::anyhow!(
"expected `CopyData` message, found {msg:?}"
))),
None => Err(QueryError::from(std::io::Error::new(
std::io::ErrorKind::ConnectionAborted,
"walproposer closed the connection",
))),
}?;
let msg = ProposerAcceptorMessage::parse(copy_data)?;
msg_tx.send(msg)?;
msg_tx
.send(msg)
.context("Failed to send the proposer message")?;
}
// msg_tx will be dropped here, this will also close msg_rx
})?;
@@ -163,17 +176,19 @@ impl ProposerPollStream {
})
}
fn recv_msg(&mut self) -> Result<ProposerAcceptorMessage> {
fn recv_msg(&mut self) -> Result<ProposerAcceptorMessage, QueryError> {
self.msg_rx.recv().map_err(|_| {
// return error from the read thread
let res = match self.read_thread.take() {
Some(thread) => thread.join(),
None => return anyhow!("read thread is gone"),
None => return QueryError::Other(anyhow::anyhow!("read thread is gone")),
};
match res {
Ok(Ok(())) => anyhow!("unexpected result from read thread"),
Err(err) => anyhow!("read thread panicked: {:?}", err),
Ok(Ok(())) => {
QueryError::Other(anyhow::anyhow!("unexpected result from read thread"))
}
Err(err) => QueryError::Other(anyhow::anyhow!("read thread panicked: {err:?}")),
Ok(Err(err)) => err,
}
})

View File

@@ -5,7 +5,7 @@ use crate::handler::SafekeeperPostgresHandler;
use crate::timeline::{ReplicaState, Timeline};
use crate::wal_storage::WalReader;
use crate::GlobalTimelines;
use anyhow::{bail, Context, Result};
use anyhow::Context;
use bytes::Bytes;
use postgres_ffi::get_current_timestamp;
@@ -15,7 +15,8 @@ use std::cmp::min;
use std::net::Shutdown;
use std::sync::Arc;
use std::time::Duration;
use std::{str, thread};
use std::{io, str, thread};
use utils::postgres_backend_async::QueryError;
use pq_proto::{BeMessage, FeMessage, ReplicationFeedback, WalSndKeepAlive, XLogDataBody};
use tokio::sync::watch::Receiver;
@@ -91,7 +92,7 @@ impl ReplicationConn {
fn background_thread(
mut stream_in: ReadStream,
replica_guard: Arc<ReplicationConnGuard>,
) -> Result<()> {
) -> anyhow::Result<()> {
let replica_id = replica_guard.replica;
let timeline = &replica_guard.timeline;
@@ -140,7 +141,7 @@ impl ReplicationConn {
// Shutdown the connection, because rust-postgres client cannot be dropped
// when connection is alive.
let _ = stream_in.shutdown(Shutdown::Both);
bail!("Copy failed");
anyhow::bail!("Copy failed");
}
_ => {
// We only handle `CopyData`, 'Sync', 'CopyFail' messages. Anything else is ignored.
@@ -160,7 +161,7 @@ impl ReplicationConn {
spg: &mut SafekeeperPostgresHandler,
pgb: &mut PostgresBackend,
mut start_pos: Lsn,
) -> Result<()> {
) -> Result<(), QueryError> {
let _enter = info_span!("WAL sender", ttid = %spg.ttid).entered();
let tli = GlobalTimelines::get(spg.ttid)?;
@@ -256,8 +257,10 @@ impl ReplicationConn {
// to right pageserver.
if tli.should_walsender_stop(replica_id) {
// Shut down, timeline is suspended.
// TODO create proper error type for this
bail!("end streaming to {:?}", spg.appname);
return Err(QueryError::from(io::Error::new(
io::ErrorKind::ConnectionAborted,
format!("end streaming to {:?}", spg.appname),
)));
}
// timeout expired: request pageserver status
@@ -265,8 +268,7 @@ impl ReplicationConn {
sent_ptr: end_pos.0,
timestamp: get_current_timestamp(),
request_reply: true,
}))
.context("Failed to send KeepAlive message")?;
}))?;
continue;
}
}
@@ -301,7 +303,7 @@ impl ReplicationConn {
const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1);
// Wait until we have commit_lsn > lsn or timeout expires. Returns latest commit_lsn.
async fn wait_for_lsn(rx: &mut Receiver<Lsn>, lsn: Lsn) -> Result<Option<Lsn>> {
async fn wait_for_lsn(rx: &mut Receiver<Lsn>, lsn: Lsn) -> anyhow::Result<Option<Lsn>> {
let commit_lsn: Lsn = *rx.borrow();
if commit_lsn > lsn {
return Ok(Some(commit_lsn));

View File

@@ -2,35 +2,28 @@
//! WAL service listens for client connections and
//! receive WAL from wal_proposer and send it to WAL receivers
//!
use anyhow::Result;
use regex::Regex;
use std::net::{TcpListener, TcpStream};
use std::sync::Arc;
use std::thread;
use tracing::*;
use utils::auth::JwtAuth;
use utils::postgres_backend_async::QueryError;
use crate::handler::SafekeeperPostgresHandler;
use crate::SafeKeeperConf;
use utils::postgres_backend::{AuthType, PostgresBackend};
/// Accept incoming TCP connections and spawn them into a background thread.
pub fn thread_main(
conf: SafeKeeperConf,
listener: TcpListener,
auth: Option<Arc<JwtAuth>>,
) -> Result<()> {
pub fn thread_main(conf: SafeKeeperConf, listener: TcpListener) -> ! {
loop {
match listener.accept() {
Ok((socket, peer_addr)) => {
debug!("accepted connection from {}", peer_addr);
let conf = conf.clone();
let auth = auth.clone();
let _ = thread::Builder::new()
.name("WAL service thread".into())
.spawn(move || {
if let Err(err) = handle_socket(socket, conf, auth) {
if let Err(err) = handle_socket(socket, conf) {
error!("connection handler exited: {}", err);
}
})
@@ -51,25 +44,17 @@ fn get_tid() -> u64 {
/// This is run by `thread_main` above, inside a background thread.
///
fn handle_socket(
socket: TcpStream,
conf: SafeKeeperConf,
auth: Option<Arc<JwtAuth>>,
) -> Result<()> {
fn handle_socket(socket: TcpStream, conf: SafeKeeperConf) -> Result<(), QueryError> {
let _enter = info_span!("", tid = ?get_tid()).entered();
socket.set_nodelay(true)?;
let mut conn_handler = SafekeeperPostgresHandler::new(conf, auth.clone());
let pgbackend = PostgresBackend::new(
socket,
match auth {
None => AuthType::Trust,
Some(_) => AuthType::NeonJWT,
},
None,
false,
)?;
let auth_type = match conf.auth {
None => AuthType::Trust,
Some(_) => AuthType::NeonJWT,
};
let mut conn_handler = SafekeeperPostgresHandler::new(conf);
let pgbackend = PostgresBackend::new(socket, auth_type, None, false)?;
// libpq replication protocol between safekeeper and replicas/pagers
pgbackend.run(&mut conn_handler)?;

View File

@@ -2,6 +2,7 @@
name = "storage_broker"
version = "0.1.0"
edition = "2021"
license = "Apache-2.0"
[features]
bench = []

View File

@@ -40,10 +40,9 @@ def parse_metrics(text: str, name: str = "") -> Metrics:
PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
"pageserver_remote_upload_queue_unfinished_tasks",
"pageserver_remote_operation_seconds_bucket",
"pageserver_remote_operation_seconds_count",
"pageserver_remote_operation_seconds_sum",
"pageserver_remote_timeline_client_calls_unfinished",
*[f"pageserver_remote_timeline_client_calls_started_{x}" for x in ["bucket", "count", "sum"]],
*[f"pageserver_remote_operation_seconds_{x}" for x in ["bucket", "count", "sum"]],
"pageserver_remote_physical_size",
)

View File

@@ -18,6 +18,7 @@ from contextlib import closing, contextmanager
from dataclasses import dataclass, field
from enum import Flag, auto
from functools import cached_property
from itertools import chain, product
from pathlib import Path
from types import TracebackType
from typing import Any, Dict, Iterator, List, Optional, Tuple, Type, Union, cast
@@ -34,6 +35,7 @@ from _pytest.config import Config
from _pytest.config.argparsing import Parser
from _pytest.fixtures import FixtureRequest
from fixtures.log_helper import log
from fixtures.metrics import parse_metrics
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import (
ATTACHMENT_NAME_REGEX,
@@ -595,6 +597,7 @@ class NeonEnvBuilder:
rust_log_override: Optional[str] = None,
default_branch_name: str = DEFAULT_BRANCH_NAME,
preserve_database_files: bool = False,
initial_tenant: Optional[TenantId] = None,
):
self.repo_dir = repo_dir
self.rust_log_override = rust_log_override
@@ -617,8 +620,9 @@ class NeonEnvBuilder:
self.pg_distrib_dir = pg_distrib_dir
self.pg_version = pg_version
self.preserve_database_files = preserve_database_files
self.initial_tenant = initial_tenant or TenantId.generate()
def init(self) -> NeonEnv:
def init_configs(self) -> NeonEnv:
# Cannot create more than one environment from one builder
assert self.env is None, "environment already initialized"
self.env = NeonEnv(self)
@@ -629,8 +633,17 @@ class NeonEnvBuilder:
self.env.start()
def init_start(self) -> NeonEnv:
env = self.init()
env = self.init_configs()
self.start()
# Prepare the default branch to start the postgres on later.
# Pageserver itself does not create tenants and timelines, until started first and asked via HTTP API.
log.info(
f"Services started, creating initial tenant {env.initial_tenant} and its initial timeline"
)
initial_tenant, initial_timeline = env.neon_cli.create_tenant(tenant_id=env.initial_tenant)
log.info(f"Initial timeline {initial_tenant}/{initial_timeline} created successfully")
return env
def enable_remote_storage(
@@ -889,12 +902,12 @@ class NeonEnv:
# generate initial tenant ID here instead of letting 'neon init' generate it,
# so that we don't need to dig it out of the config file afterwards.
self.initial_tenant = TenantId.generate()
self.initial_tenant = config.initial_tenant
# Create a config file corresponding to the options
toml = textwrap.dedent(
f"""
default_tenant_id = '{self.initial_tenant}'
default_tenant_id = '{config.initial_tenant}'
"""
)
@@ -1409,6 +1422,33 @@ class PageserverHttpClient(requests.Session):
]
return sample.value
def get_remote_timeline_client_metric(
self,
metric_name: str,
tenant_id: TenantId,
timeline_id: TimelineId,
file_kind: str,
op_kind: str,
) -> Optional[float]:
metrics = parse_metrics(self.get_metrics(), "pageserver")
matches = metrics.query_all(
name=metric_name,
filter={
"tenant_id": str(tenant_id),
"timeline_id": str(timeline_id),
"file_kind": str(file_kind),
"op_kind": str(op_kind),
},
)
if len(matches) == 0:
value = None
elif len(matches) == 1:
value = matches[0].value
assert value is not None
else:
assert len(matches) < 2, "above filter should uniquely identify metric"
return value
def get_metric_value(self, name: str) -> Optional[str]:
metrics = self.get_metrics()
relevant = [line for line in metrics.splitlines() if line.startswith(name)]
@@ -1528,6 +1568,7 @@ class NeonCli(AbstractNeonCli):
tenant_id: Optional[TenantId] = None,
timeline_id: Optional[TimelineId] = None,
conf: Optional[Dict[str, str]] = None,
set_default: bool = False,
) -> Tuple[TenantId, TimelineId]:
"""
Creates a new tenant, returns its id and its initial timeline's id.
@@ -1536,47 +1577,51 @@ class NeonCli(AbstractNeonCli):
tenant_id = TenantId.generate()
if timeline_id is None:
timeline_id = TimelineId.generate()
if conf is None:
res = self.raw_cli(
[
"tenant",
"create",
"--tenant-id",
str(tenant_id),
"--timeline-id",
str(timeline_id),
"--pg-version",
self.env.pg_version,
]
)
else:
res = self.raw_cli(
[
"tenant",
"create",
"--tenant-id",
str(tenant_id),
"--timeline-id",
str(timeline_id),
"--pg-version",
self.env.pg_version,
]
+ sum(list(map(lambda kv: (["-c", kv[0] + ":" + kv[1]]), conf.items())), [])
args = [
"tenant",
"create",
"--tenant-id",
str(tenant_id),
"--timeline-id",
str(timeline_id),
"--pg-version",
self.env.pg_version,
]
if conf is not None:
args.extend(
chain.from_iterable(
product(["-c"], (f"{key}:{value}" for key, value in conf.items()))
)
)
if set_default:
args.append("--set-default")
res = self.raw_cli(args)
res.check_returncode()
return tenant_id, timeline_id
def set_default(self, tenant_id: TenantId):
"""
Update default tenant for future operations that require tenant_id.
"""
res = self.raw_cli(["tenant", "set-default", "--tenant-id", str(tenant_id)])
res.check_returncode()
def config_tenant(self, tenant_id: TenantId, conf: Dict[str, str]):
"""
Update tenant config.
"""
if conf is None:
res = self.raw_cli(["tenant", "config", "--tenant-id", str(tenant_id)])
else:
res = self.raw_cli(
["tenant", "config", "--tenant-id", str(tenant_id)]
+ sum(list(map(lambda kv: (["-c", kv[0] + ":" + kv[1]]), conf.items())), [])
args = ["tenant", "config", "--tenant-id", str(tenant_id)]
if conf is not None:
args.extend(
chain.from_iterable(
product(["-c"], (f"{key}:{value}" for key, value in conf.items()))
)
)
res = self.raw_cli(args)
res.check_returncode()
def list_tenants(self) -> "subprocess.CompletedProcess[str]":
@@ -1611,36 +1656,6 @@ class NeonCli(AbstractNeonCli):
return TimelineId(str(created_timeline_id))
def create_root_branch(
self,
branch_name: str,
tenant_id: Optional[TenantId] = None,
):
cmd = [
"timeline",
"create",
"--branch-name",
branch_name,
"--tenant-id",
str(tenant_id or self.env.initial_tenant),
"--pg-version",
self.env.pg_version,
]
res = self.raw_cli(cmd)
res.check_returncode()
matches = CREATE_TIMELINE_ID_EXTRACTOR.search(res.stdout)
created_timeline_id = None
if matches is not None:
created_timeline_id = matches.group("timeline_id")
if created_timeline_id is None:
raise Exception("could not find timeline id after `neon timeline create` invocation")
else:
return TimelineId(created_timeline_id)
def create_branch(
self,
new_branch_name: str = DEFAULT_BRANCH_NAME,
@@ -1696,17 +1711,12 @@ class NeonCli(AbstractNeonCli):
def init(
self,
config_toml: str,
initial_timeline_id: Optional[TimelineId] = None,
) -> "subprocess.CompletedProcess[str]":
with tempfile.NamedTemporaryFile(mode="w+") as tmp:
tmp.write(config_toml)
tmp.flush()
cmd = ["init", f"--config={tmp.name}"]
if initial_timeline_id:
cmd.extend(["--timeline-id", str(initial_timeline_id)])
cmd.extend(["--pg-version", self.env.pg_version])
cmd = ["init", f"--config={tmp.name}", "--pg-version", self.env.pg_version]
append_pageserver_param_overrides(
params_to_update=cmd,
@@ -1903,14 +1913,17 @@ class NeonPageserver(PgProtocol):
".*wal receiver task finished with an error: walreceiver connection handling failure.*",
".*Shutdown task error: walreceiver connection handling failure.*",
".*wal_connection_manager.*tcp connect error: Connection refused.*",
".*query handler for .* failed: Connection reset by peer.*",
".*serving compute connection task.*exited with error: Broken pipe.*",
".*Connection aborted: error communicating with the server: Broken pipe.*",
".*Connection aborted: error communicating with the server: Transport endpoint is not connected.*",
".*Connection aborted: error communicating with the server: Connection reset by peer.*",
".*query handler for .* failed: Socket IO error: Connection reset by peer.*",
".*serving compute connection task.*exited with error: Postgres connection error.*",
".*serving compute connection task.*exited with error: Connection reset by peer.*",
".*serving compute connection task.*exited with error: Postgres query error.*",
".*Connection aborted: connection error: error communicating with the server: Broken pipe.*",
".*Connection aborted: connection error: error communicating with the server: Transport endpoint is not connected.*",
".*Connection aborted: connection error: error communicating with the server: Connection reset by peer.*",
".*kill_and_wait_impl.*: wait successful.*",
".*end streaming to Some.*",
".*Replication stream finished: db error: ERROR: Socket IO error: end streaming to Some.*",
".*query handler for 'pagestream.*failed: Broken pipe.*", # pageserver notices compute shut down
".*query handler for 'pagestream.*failed: Connection reset by peer.*", # pageserver notices compute shut down
# safekeeper connection can fail with this, in the window between timeline creation
# and streaming start
".*Failed to process query for timeline .*: state uninitialized, no data to read.*",
@@ -1980,10 +1993,6 @@ class NeonPageserver(PgProtocol):
if '"testing"' not in self.version:
pytest.skip("pageserver was built without 'testing' feature")
def is_profiling_enabled_or_skip(self):
if '"profiling"' not in self.version:
pytest.skip("pageserver was built without 'profiling' feature")
def http_client(self, auth_token: Optional[str] = None) -> PageserverHttpClient:
return PageserverHttpClient(
port=self.service_port.http,

View File

@@ -148,7 +148,7 @@ def get_scale_for_db(size_mb: int) -> int:
ATTACHMENT_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg]
r"flamegraph\.svg|regression\.diffs|.+\.(?:log|stderr|stdout|filediff|metrics|html)"
r"regression\.diffs|.+\.(?:log|stderr|stdout|filediff|metrics|html)"
)

View File

@@ -1,12 +1,8 @@
# Running locally
First make a release build. The profiling flag is optional, used only for tests that
generate flame graphs. The `-s` flag just silences a lot of output, and makes it
First make a release build. The `-s` flag silences a lot of output, and makes it
easier to see if you have compile errors without scrolling up.
`BUILD_TYPE=release CARGO_BUILD_FLAGS="--features=testing,profiling" make -s -j8`
NOTE: the `profiling` flag only works on linux because we use linux-specific
libc APIs like `libc::timer_t`.
`BUILD_TYPE=release CARGO_BUILD_FLAGS="--features=testing" make -s -j8`
Then run the tests
`NEON_BIN=./target/release poetry run pytest test_runner/performance"`

View File

@@ -8,7 +8,7 @@ from typing import Dict, List
import pytest
from fixtures.benchmark_fixture import MetricReport, PgBenchInitResult, PgBenchRunResult
from fixtures.compare_fixtures import NeonCompare, PgCompare
from fixtures.compare_fixtures import PgCompare
from fixtures.utils import get_scale_for_db
@@ -176,28 +176,6 @@ def test_pgbench(neon_with_baseline: PgCompare, scale: int, duration: int):
run_test_pgbench(neon_with_baseline, scale, duration, PgBenchLoadType.SELECT_ONLY)
# Run the pgbench tests, and generate a flamegraph from it
# This requires that the pageserver was built with the 'profiling' feature.
#
# TODO: If the profiling is cheap enough, there's no need to run the same test
# twice, with and without profiling. But for now, run it separately, so that we
# can see how much overhead the profiling adds.
@pytest.mark.parametrize("scale", get_scales_matrix())
@pytest.mark.parametrize("duration", get_durations_matrix())
def test_pgbench_flamegraph(zenbenchmark, pg_bin, neon_env_builder, scale: int, duration: int):
neon_env_builder.pageserver_config_override = """
profiling="page_requests"
"""
env = neon_env_builder.init_start()
env.pageserver.is_profiling_enabled_or_skip()
env.neon_cli.create_branch("empty", "main")
neon_compare = NeonCompare(zenbenchmark, env, pg_bin, "pgbench")
run_test_pgbench(neon_compare, scale, duration, PgBenchLoadType.INIT)
run_test_pgbench(neon_compare, scale, duration, PgBenchLoadType.SIMPLE_UPDATE)
run_test_pgbench(neon_compare, scale, duration, PgBenchLoadType.SELECT_ONLY)
# The following 3 tests run on an existing database as it was set up by previous tests,
# and leaves the database in a state that would be used in the next tests.
# Modifying the definition order of these functions or adding other remote tests in between will alter results.

0
test_runner/regress/test_config.py Normal file → Executable file
View File

View File

@@ -1,3 +1,5 @@
import time
import pytest
from fixtures.log_helper import log
from fixtures.metrics import parse_metrics
@@ -20,9 +22,19 @@ def httpserver_listen_address(port_distributor: PortDistributor):
return ("localhost", port)
num_metrics_received = 0
initial_tenant = TenantId.generate()
remote_uploaded = 0
first_request = True
checks = {
"written_size": lambda value: value > 0,
"resident_size": lambda value: value >= 0,
# >= 0 check here is to avoid race condition when we receive metrics before
# remote_uploaded is updated
"remote_storage_size": lambda value: value > 0 if remote_uploaded > 0 else value >= 0,
# logical size may lag behind the actual size, so allow 0 here
"timeline_logical_size": lambda value: value >= 0,
}
metric_kinds_checked = set([])
#
@@ -36,38 +48,19 @@ def metrics_handler(request: Request) -> Response:
log.info("received events:")
log.info(events)
checks = {
"written_size": lambda value: value > 0,
"resident_size": lambda value: value >= 0,
# >= 0 check here is to avoid race condition when we receive metrics before
# remote_uploaded is updated
"remote_storage_size": lambda value: value > 0 if remote_uploaded > 0 else value >= 0,
# logical size may lag behind the actual size, so allow 0 here
"timeline_logical_size": lambda value: value >= 0,
}
events_received = 0
for event in events:
check = checks.get(event["metric"])
assert event["tenant_id"] == str(
initial_tenant
), "Expecting metrics only from the initial tenant"
metric_name = event["metric"]
check = checks.get(metric_name)
# calm down mypy
if check is not None:
assert check(event["value"]), f"{event['metric']} isn't valid"
events_received += 1
assert check(event["value"]), f"{metric_name} isn't valid"
global metric_kinds_checked
metric_kinds_checked.add(metric_name)
global first_request
# check that all checks were sent
# but only on the first request, because we don't send non-changed metrics
if first_request:
# we may receive more metrics than we check,
# because there are two timelines
# and we may receive per-timeline metrics from both
# if the test was slow enough for these metrics to be collected
# -1 because that is ok to not receive timeline_logical_size
assert events_received >= len(checks) - 1
first_request = False
global num_metrics_received
num_metrics_received += 1
return Response(status=200)
@@ -83,11 +76,14 @@ def test_metric_collection(
(host, port) = httpserver_listen_address
metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events"
# Require collecting metrics frequently, since we change
# the timeline and want something to be logged about it.
#
# Disable time-based pitr, we will use the manual GC calls
# to trigger remote storage operations in a controlled way
neon_env_builder.pageserver_config_override = (
f"""
metric_collection_interval="60s"
metric_collection_interval="1s"
metric_collection_endpoint="{metric_collection_endpoint}"
"""
+ "tenant_config={pitr_interval = '0 sec'}"
@@ -100,6 +96,9 @@ def test_metric_collection(
log.info(f"test_metric_collection endpoint is {metric_collection_endpoint}")
# Set initial tenant of the test, that we expect the logs from
global initial_tenant
initial_tenant = neon_env_builder.initial_tenant
# mock http server that returns OK for the metrics
httpserver.expect_request("/billing/api/v1/usage_events", method="POST").respond_with_handler(
metrics_handler
@@ -107,6 +106,9 @@ def test_metric_collection(
# spin up neon, after http server is ready
env = neon_env_builder.init_start()
# Order of fixtures shutdown is not specified, and if http server gets down
# before pageserver, pageserver log might contain such errors in the end.
env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*")
env.neon_cli.create_branch("test_metric_collection")
pg = env.postgres.create_start("test_metric_collection")
@@ -151,7 +153,11 @@ def test_metric_collection(
remote_uploaded = get_num_remote_ops("index", "upload")
assert remote_uploaded > 0
# check that all requests are served
# wait longer than collecting interval and check that all requests are served
time.sleep(3)
httpserver.check()
global num_metrics_received
assert num_metrics_received > 0, "no metrics were received"
global metric_kinds_checked, checks
expected_checks = set(checks.keys())
assert len(metric_kinds_checked) == len(
checks
), f"Expected to receive and check all kind of metrics, but {expected_checks - metric_kinds_checked} got uncovered"

View File

@@ -1,10 +1,17 @@
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.neon_fixtures import NeonEnvBuilder, PortDistributor
# Test that neon cli is able to start and stop all processes with the user defaults.
# def test_neon_cli_basics(neon_simple_env: NeonEnv):
def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init()
# Repeats the example from README.md as close as it can
def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder, port_distributor: PortDistributor):
env = neon_env_builder.init_configs()
# Skipping the init step that creates a local tenant in Pytest tests
try:
env.neon_cli.start()
env.neon_cli.create_tenant(tenant_id=env.initial_tenant, set_default=True)
env.neon_cli.pg_start(node_name="main", port=port_distributor.get_port())
env.neon_cli.start()
env.neon_cli.stop()
env.neon_cli.create_branch(new_branch_name="migration_check")
env.neon_cli.pg_start(node_name="migration_check", port=port_distributor.get_port())
finally:
env.neon_cli.stop()

View File

@@ -120,7 +120,7 @@ def test_ondemand_download_large_rel(
#
# If you have a relation with a long history of updates,the pageserver downloads the layer
# If you have a relation with a long history of updates, the pageserver downloads the layer
# files containing the history as needed by timetravel queries.
#
@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
@@ -189,13 +189,10 @@ def test_ondemand_download_timetravel(
# run checkpoint manually to be sure that data landed in remote storage
client.timeline_checkpoint(tenant_id, timeline_id)
# wait until pageserver successfully uploaded a checkpoint to remote storage
wait_for_upload(client, tenant_id, timeline_id, current_lsn)
log.info("uploads have finished")
##### Stop the first pageserver instance, erase all its data
env.postgres.stop_all()
# wait until pageserver has successfully uploaded all the data to remote storage
wait_for_sk_commit_lsn_to_reach_remote_storage(
tenant_id, timeline_id, env.safekeepers, env.pageserver
)
@@ -227,11 +224,15 @@ def test_ondemand_download_timetravel(
wait_until(10, 0.2, lambda: assert_tenant_status(client, tenant_id, "Active"))
# current_physical_size reports sum of layer file sizes, regardless of local or remote
# The current_physical_size reports the sum of layers loaded in the layer
# map, regardless of where the layer files are located. So even though we
# just removed the local files, they still count towards
# current_physical_size because they are loaded as `RemoteLayer`s.
assert filled_current_physical == get_api_current_physical_size()
# Run queries at different points in time
num_layers_downloaded = [0]
physical_size = [get_resident_physical_size()]
resident_size = [get_resident_physical_size()]
for (checkpoint_number, lsn) in lsns:
pg_old = env.postgres.create_start(
branch_name="main", node_name=f"test_old_lsn_{checkpoint_number}", lsn=lsn
@@ -268,13 +269,15 @@ def test_ondemand_download_timetravel(
if len(num_layers_downloaded) > 4:
assert after_downloads > num_layers_downloaded[len(num_layers_downloaded) - 4]
# Likewise, assert that the physical_size metric grows as layers are downloaded
physical_size.append(get_resident_physical_size())
log.info(f"physical_size[-1]={physical_size[-1]}")
if len(physical_size) > 4:
assert physical_size[-1] > physical_size[len(physical_size) - 4]
# Likewise, assert that the resident_physical_size metric grows as layers are downloaded
resident_size.append(get_resident_physical_size())
log.info(f"resident_size[-1]={resident_size[-1]}")
if len(resident_size) > 4:
assert resident_size[-1] > resident_size[len(resident_size) - 4]
# current_physical_size reports sum of layer file sizes, regardless of local or remote
# current_physical_size reports the total size of all layer files, whether
# they are present only in the remote storage, only locally, or both.
# It should not change.
assert filled_current_physical == get_api_current_physical_size()

View File

@@ -12,11 +12,9 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder):
# Override default checkpointer settings to run it more often
neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}"
env = neon_env_builder.init()
env = neon_env_builder.init_start()
env.pageserver.is_testing_enabled_or_skip()
neon_env_builder.start()
# These warnings are expected, when the pageserver is restarted abruptly
env.pageserver.allowed_errors.append(".*found future delta layer.*")
env.pageserver.allowed_errors.append(".*found future image layer.*")

View File

@@ -2,11 +2,11 @@
# env NEON_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ......
import os
import re
import shutil
import threading
import time
from pathlib import Path
from typing import Dict, List, Tuple
import pytest
from fixtures.log_helper import log
@@ -271,14 +271,15 @@ def test_remote_storage_upload_queue_retries(
wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
def get_queued_count(file_kind, op_kind):
metrics = client.get_metrics()
matches = re.search(
f'^pageserver_remote_upload_queue_unfinished_tasks{{file_kind="{file_kind}",op_kind="{op_kind}",tenant_id="{tenant_id}",timeline_id="{timeline_id}"}} (\\S+)$',
metrics,
re.MULTILINE,
val = client.get_remote_timeline_client_metric(
"pageserver_remote_timeline_client_calls_unfinished",
tenant_id,
timeline_id,
file_kind,
op_kind,
)
assert matches
return int(matches[1])
assert val is not None, "expecting metric to be present"
return int(val)
# create some layers & wait for uploads to finish
overwrite_data_and_wait_for_it_to_arrive_at_pageserver("a")
@@ -368,6 +369,168 @@ def test_remote_storage_upload_queue_retries(
assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 10000
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
def test_remote_timeline_client_calls_started_metric(
neon_env_builder: NeonEnvBuilder,
remote_storage_kind: RemoteStorageKind,
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_remote_timeline_client_metrics",
)
env = neon_env_builder.init_start()
# create tenant with config that will determinstically allow
# compaction and gc
tenant_id, timeline_id = env.neon_cli.create_tenant(
conf={
# small checkpointing and compaction targets to ensure we generate many upload operations
"checkpoint_distance": f"{128 * 1024}",
"compaction_threshold": "1",
"compaction_target_size": f"{128 * 1024}",
# no PITR horizon, we specify the horizon when we request on-demand GC
"pitr_interval": "0s",
# disable background compaction and GC. We invoke it manually when we want it to happen.
"gc_period": "0s",
"compaction_period": "0s",
# don't create image layers, that causes just noise
"image_creation_threshold": "10000",
}
)
client = env.pageserver.http_client()
pg = env.postgres.create_start("main", tenant_id=tenant_id)
pg.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
def overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data):
# create initial set of layers & upload them with failpoints configured
pg.safe_psql_many(
[
f"""
INSERT INTO foo (id, val)
SELECT g, '{data}'
FROM generate_series(1, 10000) g
ON CONFLICT (id) DO UPDATE
SET val = EXCLUDED.val
""",
# to ensure that GC can actually remove some layers
"VACUUM foo",
]
)
wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
def get_queued_count(file_kind, op_kind):
val = client.get_remote_timeline_client_metric(
"pageserver_remote_timeline_client_calls_unfinished",
tenant_id,
timeline_id,
file_kind,
op_kind,
)
if val is None:
return val
return int(val)
def wait_upload_queue_empty():
wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0)
wait_until(2, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0)
wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0)
calls_started: Dict[Tuple[str, str], List[int]] = {
("layer", "upload"): [0],
("index", "upload"): [0],
("layer", "delete"): [0],
}
def fetch_calls_started():
for (file_kind, op_kind), observations in calls_started.items():
val = client.get_remote_timeline_client_metric(
"pageserver_remote_timeline_client_calls_started_count",
tenant_id,
timeline_id,
file_kind,
op_kind,
)
assert val is not None, f"expecting metric to be present: {file_kind} {op_kind}"
val = int(val)
observations.append(val)
def ensure_calls_started_grew():
for (file_kind, op_kind), observations in calls_started.items():
log.info(f"ensure_calls_started_grew: {file_kind} {op_kind}: {observations}")
assert all(
x < y for x, y in zip(observations, observations[1:])
), f"observations for {file_kind} {op_kind} did not grow monotonically: {observations}"
def churn(data_pass1, data_pass2):
overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1)
client.timeline_checkpoint(tenant_id, timeline_id)
client.timeline_compact(tenant_id, timeline_id)
overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2)
client.timeline_checkpoint(tenant_id, timeline_id)
client.timeline_compact(tenant_id, timeline_id)
gc_result = client.timeline_gc(tenant_id, timeline_id, 0)
print_gc_result(gc_result)
assert gc_result["layers_removed"] > 0
# create some layers & wait for uploads to finish
churn("a", "b")
wait_upload_queue_empty()
# ensure that we updated the calls_started metric
fetch_calls_started()
ensure_calls_started_grew()
# more churn to cause more operations
churn("c", "d")
# ensure that the calls_started metric continued to be updated
fetch_calls_started()
ensure_calls_started_grew()
### now we exercise the download path
calls_started.clear()
calls_started.update(
{
("index", "download"): [0],
("layer", "download"): [0],
}
)
env.pageserver.stop(immediate=True)
env.postgres.stop_all()
dir_to_clear = Path(env.repo_dir) / "tenants"
shutil.rmtree(dir_to_clear)
os.mkdir(dir_to_clear)
env.pageserver.start()
client = env.pageserver.http_client()
client.tenant_attach(tenant_id)
def tenant_active():
all_states = client.tenant_list()
[tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id]
assert tenant["state"] == "Active"
wait_until(30, 1, tenant_active)
log.info("restarting postgres to validate")
pg = env.postgres.create_start("main", tenant_id=tenant_id)
with pg.cursor() as cur:
assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 10000
# ensure that we updated the calls_started download metric
fetch_calls_started()
ensure_calls_started_grew()
# Test that we correctly handle timeline with layers stuck in upload queue
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
def test_timeline_deletion_with_files_stuck_in_upload_queue(
@@ -401,15 +564,14 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
client = env.pageserver.http_client()
def get_queued_count(file_kind, op_kind):
metrics = client.get_metrics()
matches = re.search(
f'^pageserver_remote_upload_queue_unfinished_tasks{{file_kind="{file_kind}",op_kind="{op_kind}",tenant_id="{tenant_id}",timeline_id="{timeline_id}"}} (\\S+)$',
metrics,
re.MULTILINE,
val = client.get_remote_timeline_client_metric(
"pageserver_remote_timeline_client_calls_unfinished",
tenant_id,
timeline_id,
file_kind,
op_kind,
)
if matches is None:
return None
return int(matches[1])
return int(val) if val is not None else val
pg = env.postgres.create_start("main", tenant_id=tenant_id)

View File

@@ -59,7 +59,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
"gc_horizon": 67108864,
"gc_period": 100,
"image_creation_threshold": 3,
"pitr_interval": 2592000,
"pitr_interval": 604800, # 7 days
}.items()
)
@@ -79,7 +79,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
"gc_horizon": 67108864,
"gc_period": 30,
"image_creation_threshold": 3,
"pitr_interval": 2592000,
"pitr_interval": 604800,
}.items()
)
@@ -107,7 +107,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
"gc_horizon": 67108864,
"gc_period": 80,
"image_creation_threshold": 3,
"pitr_interval": 2592000,
"pitr_interval": 604800,
}.items()
)
@@ -130,7 +130,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
"gc_horizon": 67108864,
"gc_period": 80,
"image_creation_threshold": 3,
"pitr_interval": 2592000,
"pitr_interval": 604800,
}.items()
)

View File

@@ -1,9 +1,13 @@
import asyncio
import random
import time
from threading import Thread
import asyncpg
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
PageserverApiException,
PageserverHttpClient,
@@ -12,6 +16,7 @@ from fixtures.neon_fixtures import (
available_remote_storages,
wait_for_last_record_lsn,
wait_for_upload,
wait_until,
wait_until_tenant_state,
)
from fixtures.types import Lsn, TenantId, TimelineId
@@ -84,6 +89,150 @@ def test_tenant_reattach(
assert env.pageserver.log_contains(".*download.*failed, will retry.*")
num_connections = 10
num_rows = 100000
updates_to_perform = 0
updates_started = 0
updates_finished = 0
# Run random UPDATEs on test table. On failure, try again.
async def update_table(pg_conn: asyncpg.Connection):
global updates_started, updates_finished, updates_to_perform
while updates_started < updates_to_perform or updates_to_perform == 0:
updates_started += 1
id = random.randrange(1, num_rows)
# Loop to retry until the UPDATE succeeds
while True:
try:
await pg_conn.fetchrow(f"UPDATE t SET counter = counter + 1 WHERE id = {id}")
updates_finished += 1
if updates_finished % 1000 == 0:
log.info(f"update {updates_finished} / {updates_to_perform}")
break
except asyncpg.PostgresError as e:
# Received error from Postgres. Log it, sleep a little, and continue
log.info(f"UPDATE error: {e}")
await asyncio.sleep(0.1)
async def sleep_and_reattach(pageserver_http: PageserverHttpClient, tenant_id: TenantId):
global updates_started, updates_finished, updates_to_perform
# Wait until we have performed some updates
wait_until(20, 0.5, lambda: updates_finished > 500)
log.info("Detaching tenant")
pageserver_http.tenant_detach(tenant_id)
await asyncio.sleep(1)
log.info("Re-attaching tenant")
pageserver_http.tenant_attach(tenant_id)
log.info("Re-attach finished")
# Continue with 5000 more updates
updates_to_perform = updates_started + 5000
# async guts of test_tenant_reattach_while_bysy test
async def reattach_while_busy(
env: NeonEnv, pg: Postgres, pageserver_http: PageserverHttpClient, tenant_id: TenantId
):
workers = []
for worker_id in range(num_connections):
pg_conn = await pg.connect_async()
workers.append(asyncio.create_task(update_table(pg_conn)))
workers.append(asyncio.create_task(sleep_and_reattach(pageserver_http, tenant_id)))
await asyncio.gather(*workers)
assert updates_finished == updates_to_perform
# Detach and re-attach tenant, while compute is busy running queries.
#
# Some of the queries may fail, in the window that the tenant has been
# detached but not yet re-attached. But Postgres itself should keep
# running, and when we retry the queries, they should start working
# after the attach has finished.
# FIXME:
#
# This is pretty unstable at the moment. I've seen it fail with a warning like this:
#
# AssertionError: assert not ['2023-01-05T13:09:40.708303Z WARN remote_upload{tenant=c3fc41f6cf29a7626b90316e3518cd4b timeline=7978246f85faa71ab03...1282b/000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001716699-0000000001736681"\n']
#
# (https://neon-github-public-dev.s3.amazonaws.com/reports/pr-3232/debug/3846817847/index.html#suites/f9eba3cfdb71aa6e2b54f6466222829b/470fc62b5db7d7d7/)
# I believe that failure happened because there is a race condition
# between detach and starting remote upload tasks:
#
# 1. detach_timeline calls task_mgr::shutdown_tasks(), sending shutdown
# signal to all in-progress tasks associated with the tenant.
# 2. Just after shutdown_tasks() has collected the list of tasks,
# a new remote-upload task is spawned.
#
# See https://github.com/neondatabase/neon/issues/3273
#
#
# I also saw this failure:
#
# test_runner/regress/test_tenant_detach.py:194: in test_tenant_reattach_while_busy
# asyncio.run(reattach_while_busy(env, pg, pageserver_http, tenant_id))
# /home/nonroot/.pyenv/versions/3.9.2/lib/python3.9/asyncio/runners.py:44: in run
# return loop.run_until_complete(main)
# /home/nonroot/.pyenv/versions/3.9.2/lib/python3.9/asyncio/base_events.py:642: in run_until_complete
# return future.result()
# test_runner/regress/test_tenant_detach.py:151: in reattach_while_busy
# assert updates_finished == updates_to_perform
# E assert 5010 == 10010
# E +5010
# E -10010
#
# I don't know what's causing that...
@pytest.mark.skip(reason="fixme")
@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
def test_tenant_reattach_while_busy(
neon_env_builder: NeonEnvBuilder,
remote_storage_kind: RemoteStorageKind,
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_tenant_reattach_while_busy",
)
env = neon_env_builder.init_start()
# Attempts to connect from compute to pageserver while the tenant is
# temporarily detached produces these errors in the pageserver log.
env.pageserver.allowed_errors.append(".*Tenant .* not found in the local state.*")
env.pageserver.allowed_errors.append(
".*Tenant .* will not become active\\. Current state: Stopping.*"
)
pageserver_http = env.pageserver.http_client()
# create new nenant
tenant_id, timeline_id = env.neon_cli.create_tenant(
# Create layers aggressively
conf={"checkpoint_distance": "100000"}
)
pg = env.postgres.create_start("main", tenant_id=tenant_id)
cur = pg.connect().cursor()
cur.execute("CREATE TABLE t(id int primary key, counter int)")
cur.execute(f"INSERT INTO t SELECT generate_series(1,{num_rows}), 0")
# Run the test
asyncio.run(reattach_while_busy(env, pg, pageserver_http, tenant_id))
# Verify table contents
assert query_scalar(cur, "SELECT count(*) FROM t") == num_rows
assert query_scalar(cur, "SELECT sum(counter) FROM t") == updates_to_perform
def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()

View File

@@ -1105,7 +1105,6 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
env.pageserver.allowed_errors.extend(
[
".*Failed to process query for timeline .*: Timeline .* was not found in global map.*",
".*end streaming to Some.*",
]
)

View File

@@ -13,7 +13,6 @@ publish = false
### BEGIN HAKARI SECTION
[dependencies]
ahash = { version = "0.7", features = ["std"] }
anyhow = { version = "1", features = ["backtrace", "std"] }
bytes = { version = "1", features = ["serde", "std"] }
chrono = { version = "0.4", default-features = false, features = ["clock", "iana-time-zone", "serde", "std", "winapi"] }
@@ -37,13 +36,11 @@ prost = { version = "0.11", features = ["prost-derive", "std"] }
rand = { version = "0.8", features = ["alloc", "getrandom", "libc", "rand_chacha", "rand_hc", "small_rng", "std", "std_rng"] }
regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] }
regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] }
reqwest = { version = "0.11", features = ["__rustls", "__tls", "blocking", "default-tls", "hyper-rustls", "hyper-tls", "json", "native-tls-crate", "rustls", "rustls-pemfile", "rustls-tls", "rustls-tls-webpki-roots", "serde_json", "tokio-native-tls", "tokio-rustls", "webpki-roots"] }
scopeguard = { version = "1", features = ["use_std"] }
serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] }
serde_json = { version = "1", features = ["raw_value", "std"] }
socket2 = { version = "0.4", default-features = false, features = ["all"] }
stable_deref_trait = { version = "1", features = ["alloc", "std"] }
tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] }
tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] }
tokio-util = { version = "0.7", features = ["codec", "io", "io-util", "tracing"] }
tower = { version = "0.4", features = ["__common", "balance", "buffer", "discover", "futures-core", "futures-util", "indexmap", "limit", "load", "log", "make", "pin-project", "pin-project-lite", "rand", "ready-cache", "retry", "slab", "timeout", "tokio", "tokio-util", "tracing", "util"] }
tracing = { version = "0.1", features = ["attributes", "log", "std", "tracing-attributes"] }
@@ -51,7 +48,6 @@ tracing-core = { version = "0.1", features = ["once_cell", "std"] }
url = { version = "2", features = ["serde"] }
[build-dependencies]
ahash = { version = "0.7", features = ["std"] }
anyhow = { version = "1", features = ["backtrace", "std"] }
bytes = { version = "1", features = ["serde", "std"] }
either = { version = "1", features = ["use_std"] }