mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-01 04:20:39 +00:00
Compare commits
3 Commits
alek_targz
...
instrument
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0cb37311c2 | ||
|
|
1b687409be | ||
|
|
694bc1a481 |
@@ -105,7 +105,7 @@ runs:
|
||||
# Get previously uploaded data for this run
|
||||
ZSTD_NBTHREADS=0
|
||||
|
||||
S3_FILEPATHS=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/ | jq --raw-output '.Contents[]?.Key')
|
||||
S3_FILEPATHS=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/ | jq --raw-output '.Contents[].Key')
|
||||
if [ -z "$S3_FILEPATHS" ]; then
|
||||
# There's no previously uploaded data for this $GITHUB_RUN_ID
|
||||
exit 0
|
||||
|
||||
33
.github/workflows/build_and_test.yml
vendored
33
.github/workflows/build_and_test.yml
vendored
@@ -155,7 +155,7 @@ jobs:
|
||||
build_type: [ debug, release ]
|
||||
env:
|
||||
BUILD_TYPE: ${{ matrix.build_type }}
|
||||
GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
GIT_VERSION: ${{ github.sha }}
|
||||
|
||||
steps:
|
||||
- name: Fix git ownership
|
||||
@@ -174,27 +174,6 @@ jobs:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Check Postgres submodules revision
|
||||
shell: bash -euo pipefail {0}
|
||||
run: |
|
||||
# This is a temporary solution to ensure that the Postgres submodules revision is correct (i.e. the updated intentionally).
|
||||
# Eventually it will be replaced by a regression test https://github.com/neondatabase/neon/pull/4603
|
||||
|
||||
FAILED=false
|
||||
for postgres in postgres-v14 postgres-v15; do
|
||||
expected=$(cat vendor/revisions.json | jq --raw-output '."'"${postgres}"'"')
|
||||
actual=$(git rev-parse "HEAD:vendor/${postgres}")
|
||||
if [ "${expected}" != "${actual}" ]; then
|
||||
echo >&2 "Expected ${postgres} rev to be at '${expected}', but it is at '${actual}'"
|
||||
FAILED=true
|
||||
fi
|
||||
done
|
||||
|
||||
if [ "${FAILED}" = "true" ]; then
|
||||
echo >&2 "Please update vendors/revisions.json if these changes are intentional"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Set pg 14 revision for caching
|
||||
id: pg_v14_rev
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
|
||||
@@ -635,7 +614,7 @@ jobs:
|
||||
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
|
||||
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
|
||||
--context .
|
||||
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
--build-arg GIT_VERSION=${{ github.sha }}
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
|
||||
--destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
|
||||
@@ -679,7 +658,7 @@ jobs:
|
||||
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
|
||||
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
|
||||
--context .
|
||||
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
--build-arg GIT_VERSION=${{ github.sha }}
|
||||
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
--dockerfile Dockerfile.compute-tools
|
||||
@@ -736,7 +715,7 @@ jobs:
|
||||
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
|
||||
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
|
||||
--context .
|
||||
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
--build-arg GIT_VERSION=${{ github.sha }}
|
||||
--build-arg PG_VERSION=${{ matrix.version }}
|
||||
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
@@ -763,7 +742,7 @@ jobs:
|
||||
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true \
|
||||
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
|
||||
--context . \
|
||||
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} \
|
||||
--build-arg GIT_VERSION=${{ github.sha }} \
|
||||
--build-arg PG_VERSION=${{ matrix.version }} \
|
||||
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} \
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com \
|
||||
@@ -788,7 +767,7 @@ jobs:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
env:
|
||||
VM_BUILDER_VERSION: v0.12.1
|
||||
VM_BUILDER_VERSION: v0.11.1
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
|
||||
43
Cargo.lock
generated
43
Cargo.lock
generated
@@ -158,19 +158,6 @@ dependencies = [
|
||||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "async-compression"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5b0122885821398cc923ece939e24d1056a2384ee719432397fa9db87230ff11"
|
||||
dependencies = [
|
||||
"flate2",
|
||||
"futures-core",
|
||||
"memchr",
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "async-stream"
|
||||
version = "0.3.5"
|
||||
@@ -606,7 +593,7 @@ dependencies = [
|
||||
"cc",
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"miniz_oxide 0.6.2",
|
||||
"miniz_oxide",
|
||||
"object",
|
||||
"rustc-demangle",
|
||||
]
|
||||
@@ -895,11 +882,9 @@ name = "compute_tools"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-compression",
|
||||
"chrono",
|
||||
"clap",
|
||||
"compute_api",
|
||||
"flate2",
|
||||
"futures",
|
||||
"hyper",
|
||||
"notify",
|
||||
@@ -907,14 +892,12 @@ dependencies = [
|
||||
"opentelemetry",
|
||||
"postgres",
|
||||
"regex",
|
||||
"remote_storage",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tar",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"toml_edit",
|
||||
"tracing",
|
||||
"tracing-opentelemetry",
|
||||
"tracing-subscriber",
|
||||
@@ -982,7 +965,6 @@ dependencies = [
|
||||
"tar",
|
||||
"thiserror",
|
||||
"toml",
|
||||
"tracing",
|
||||
"url",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
@@ -1385,16 +1367,6 @@ version = "0.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80"
|
||||
|
||||
[[package]]
|
||||
name = "flate2"
|
||||
version = "1.0.26"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743"
|
||||
dependencies = [
|
||||
"crc32fast",
|
||||
"miniz_oxide 0.7.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fnv"
|
||||
version = "1.0.7"
|
||||
@@ -2179,15 +2151,6 @@ dependencies = [
|
||||
"adler",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "miniz_oxide"
|
||||
version = "0.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7"
|
||||
dependencies = [
|
||||
"adler",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "mio"
|
||||
version = "0.8.6"
|
||||
@@ -2519,7 +2482,6 @@ name = "pageserver"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-compression",
|
||||
"async-stream",
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
@@ -2536,7 +2498,6 @@ dependencies = [
|
||||
"enum-map",
|
||||
"enumset",
|
||||
"fail",
|
||||
"flate2",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hex",
|
||||
@@ -4705,6 +4666,7 @@ dependencies = [
|
||||
"opentelemetry",
|
||||
"opentelemetry-otlp",
|
||||
"opentelemetry-semantic-conventions",
|
||||
"pin-project-lite",
|
||||
"reqwest",
|
||||
"tokio",
|
||||
"tracing",
|
||||
@@ -4856,7 +4818,6 @@ dependencies = [
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"const_format",
|
||||
"criterion",
|
||||
"futures",
|
||||
"heapless",
|
||||
|
||||
@@ -32,8 +32,6 @@ license = "Apache-2.0"
|
||||
## All dependency versions, used in the project
|
||||
[workspace.dependencies]
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
|
||||
flate2 = "1.0.26"
|
||||
async-stream = "0.3"
|
||||
async-trait = "0.1"
|
||||
aws-config = { version = "0.55", default-features = false, features=["rustls"] }
|
||||
|
||||
@@ -6,10 +6,8 @@ license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
async-compression.workspace = true
|
||||
chrono.workspace = true
|
||||
clap.workspace = true
|
||||
flate2.workspace = true
|
||||
futures.workspace = true
|
||||
hyper = { workspace = true, features = ["full"] }
|
||||
notify.workspace = true
|
||||
@@ -32,5 +30,3 @@ url.workspace = true
|
||||
compute_api.workspace = true
|
||||
utils.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
toml_edit.workspace = true
|
||||
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
|
||||
|
||||
@@ -5,8 +5,6 @@
|
||||
//! - `compute_ctl` accepts cluster (compute node) specification as a JSON file.
|
||||
//! - Every start is a fresh start, so the data directory is removed and
|
||||
//! initialized again on each run.
|
||||
//! - If remote_extension_config is provided, it will be used to fetch extensions list
|
||||
//! and download `shared_preload_libraries` from the remote storage.
|
||||
//! - Next it will put configuration files into the `PGDATA` directory.
|
||||
//! - Sync safekeepers and get commit LSN.
|
||||
//! - Get `basebackup` from pageserver using the returned on the previous step LSN.
|
||||
@@ -29,8 +27,7 @@
|
||||
//! compute_ctl -D /var/db/postgres/compute \
|
||||
//! -C 'postgresql://cloud_admin@localhost/postgres' \
|
||||
//! -S /var/db/postgres/specs/current.json \
|
||||
//! -b /usr/local/bin/postgres \
|
||||
//! -r {"bucket": "my-bucket", "region": "eu-central-1", "endpoint": "http:://localhost:9000"} \
|
||||
//! -b /usr/local/bin/postgres
|
||||
//! ```
|
||||
//!
|
||||
use std::collections::HashMap;
|
||||
@@ -38,7 +35,7 @@ use std::fs::File;
|
||||
use std::panic;
|
||||
use std::path::Path;
|
||||
use std::process::exit;
|
||||
use std::sync::{mpsc, Arc, Condvar, Mutex, OnceLock};
|
||||
use std::sync::{mpsc, Arc, Condvar, Mutex};
|
||||
use std::{thread, time::Duration};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
@@ -51,8 +48,6 @@ use compute_api::responses::ComputeStatus;
|
||||
|
||||
use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
|
||||
use compute_tools::configurator::launch_configurator;
|
||||
use compute_tools::extension_server::launch_download_extensions;
|
||||
use compute_tools::extension_server::{get_pg_version, init_remote_storage};
|
||||
use compute_tools::http::api::launch_http_server;
|
||||
use compute_tools::logger::*;
|
||||
use compute_tools::monitor::launch_monitor;
|
||||
@@ -60,42 +55,15 @@ use compute_tools::params::*;
|
||||
use compute_tools::spec::*;
|
||||
|
||||
const BUILD_TAG_DEFAULT: &str = "local";
|
||||
const DEFAULT_REMOTE_EXT_CONFIG: &str = r#"{"bucket": "neon-dev-extensions", "region": "eu-central-1", "endpoint": null, "prefix": "5555"}"#;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
|
||||
|
||||
let build_tag = option_env!("BUILD_TAG").unwrap_or(BUILD_TAG_DEFAULT);
|
||||
|
||||
info!("build_tag: {build_tag}");
|
||||
|
||||
let matches = cli().get_matches();
|
||||
let pgbin_default = String::from("postgres");
|
||||
let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
|
||||
|
||||
let remote_ext_config = matches
|
||||
.get_one::<String>("remote-ext-config")
|
||||
.map(|x| x.to_string());
|
||||
// let remote_ext_config =
|
||||
// Some(remote_ext_config.unwrap_or(DEFAULT_REMOTE_EXT_CONFIG.to_string()));
|
||||
|
||||
let ext_remote_storage = remote_ext_config.map(|x| {
|
||||
init_remote_storage(&x, build_tag)
|
||||
.expect("cannot initialize remote extension storage from config")
|
||||
});
|
||||
// creds used to connect to remote extensions bucket
|
||||
// let aws_creds = matches.get_one::<String>("awscreds");
|
||||
// if let Some(aws_creds) = aws_creds {
|
||||
// // not sure if this is a bad idea?
|
||||
// let aws_creds_dict: serde_json::Value = serde_json::from_str(aws_creds)?;
|
||||
// std::env::set_var(
|
||||
// "AWS_ACCESS_KEY_ID",
|
||||
// aws_creds_dict["ID"].as_str().expect("config parse error"),
|
||||
// );
|
||||
// std::env::set_var(
|
||||
// "AWS_SECRET_ACCESS_KEY",
|
||||
// aws_creds_dict["key"].as_str().expect("config parse error"),
|
||||
// );
|
||||
// }
|
||||
|
||||
let http_port = *matches
|
||||
.get_one::<u16>("http-port")
|
||||
@@ -160,6 +128,9 @@ fn main() -> Result<()> {
|
||||
let compute_id = matches.get_one::<String>("compute-id");
|
||||
let control_plane_uri = matches.get_one::<String>("control-plane-uri");
|
||||
|
||||
// Try to use just 'postgres' if no path is provided
|
||||
let pgbin = matches.get_one::<String>("pgbin").unwrap();
|
||||
|
||||
let spec;
|
||||
let mut live_config_allowed = false;
|
||||
match spec_json {
|
||||
@@ -197,7 +168,6 @@ fn main() -> Result<()> {
|
||||
|
||||
let mut new_state = ComputeState::new();
|
||||
let spec_set;
|
||||
|
||||
if let Some(spec) = spec {
|
||||
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
|
||||
new_state.pspec = Some(pspec);
|
||||
@@ -209,12 +179,9 @@ fn main() -> Result<()> {
|
||||
connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
|
||||
pgdata: pgdata.to_string(),
|
||||
pgbin: pgbin.to_string(),
|
||||
pgversion: get_pg_version(pgbin),
|
||||
live_config_allowed,
|
||||
state: Mutex::new(new_state),
|
||||
state_changed: Condvar::new(),
|
||||
ext_remote_storage,
|
||||
available_extensions: OnceLock::new(),
|
||||
};
|
||||
let compute = Arc::new(compute_node);
|
||||
|
||||
@@ -223,8 +190,6 @@ fn main() -> Result<()> {
|
||||
let _http_handle =
|
||||
launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");
|
||||
|
||||
let extension_server_port: u16 = http_port;
|
||||
|
||||
if !spec_set {
|
||||
// No spec provided, hang waiting for it.
|
||||
info!("no compute spec provided, waiting");
|
||||
@@ -262,13 +227,10 @@ fn main() -> Result<()> {
|
||||
let _configurator_handle =
|
||||
launch_configurator(&compute).expect("cannot launch configurator thread");
|
||||
|
||||
let _download_extensions_handle =
|
||||
launch_download_extensions(&compute).expect("cannot launch download extensions thread");
|
||||
|
||||
// Start Postgres
|
||||
let mut delay_exit = false;
|
||||
let mut exit_code = None;
|
||||
let pg = match compute.start_compute(extension_server_port) {
|
||||
let pg = match compute.start_compute() {
|
||||
Ok(pg) => Some(pg),
|
||||
Err(err) => {
|
||||
error!("could not start the compute node: {:?}", err);
|
||||
@@ -397,18 +359,6 @@ fn cli() -> clap::Command {
|
||||
.long("control-plane-uri")
|
||||
.value_name("CONTROL_PLANE_API_BASE_URI"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("remote-ext-config")
|
||||
.short('r')
|
||||
.long("remote-ext-config")
|
||||
.value_name("REMOTE_EXT_CONFIG"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("awscreds")
|
||||
.short('k')
|
||||
.long("awscreds")
|
||||
.value_name("AWS_CREDENTIALS"),
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -1,17 +1,13 @@
|
||||
use std::collections::HashSet;
|
||||
use std::fs;
|
||||
use std::io::BufRead;
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
use std::path::Path;
|
||||
use std::process::{Command, Stdio};
|
||||
use std::str::FromStr;
|
||||
use std::sync::{Condvar, Mutex, OnceLock};
|
||||
use std::sync::{Condvar, Mutex};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::{DateTime, Utc};
|
||||
use futures::future::join_all;
|
||||
use postgres::{Client, NoTls};
|
||||
use tokio;
|
||||
use tokio_postgres;
|
||||
use tracing::{info, instrument, warn};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -19,13 +15,10 @@ use utils::lsn::Lsn;
|
||||
|
||||
use compute_api::responses::{ComputeMetrics, ComputeStatus};
|
||||
use compute_api::spec::{ComputeMode, ComputeSpec};
|
||||
use utils::measured_stream::MeasuredReader;
|
||||
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
|
||||
use crate::config;
|
||||
use crate::pg_helpers::*;
|
||||
use crate::spec::*;
|
||||
use crate::{config, extension_server};
|
||||
|
||||
/// Compute node info shared across several `compute_ctl` threads.
|
||||
pub struct ComputeNode {
|
||||
@@ -33,7 +26,6 @@ pub struct ComputeNode {
|
||||
pub connstr: url::Url,
|
||||
pub pgdata: String,
|
||||
pub pgbin: String,
|
||||
pub pgversion: String,
|
||||
/// We should only allow live re- / configuration of the compute node if
|
||||
/// it uses 'pull model', i.e. it can go to control-plane and fetch
|
||||
/// the latest configuration. Otherwise, there could be a case:
|
||||
@@ -53,10 +45,6 @@ pub struct ComputeNode {
|
||||
pub state: Mutex<ComputeState>,
|
||||
/// `Condvar` to allow notifying waiters about state changes.
|
||||
pub state_changed: Condvar,
|
||||
/// the S3 bucket that we search for extensions in
|
||||
pub ext_remote_storage: Option<GenericRemoteStorage>,
|
||||
// cached lists of available extensions and libraries
|
||||
pub available_extensions: OnceLock<HashSet<String>>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
@@ -152,14 +140,14 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
|
||||
.cluster
|
||||
.roles
|
||||
.iter()
|
||||
.map(|r| escape_literal(&r.name))
|
||||
.map(|r| format!("'{}'", escape_literal(&r.name)))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let dbs = spec
|
||||
.cluster
|
||||
.databases
|
||||
.iter()
|
||||
.map(|db| escape_literal(&db.name))
|
||||
.map(|db| format!("'{}'", escape_literal(&db.name)))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let roles_decl = if roles.is_empty() {
|
||||
@@ -265,52 +253,20 @@ impl ComputeNode {
|
||||
|
||||
let mut client = config.connect(NoTls)?;
|
||||
let basebackup_cmd = match lsn {
|
||||
// HACK We don't use compression on first start (Lsn(0)) because there's no API for it
|
||||
Lsn(0) => format!("basebackup {} {}", spec.tenant_id, spec.timeline_id),
|
||||
_ => format!(
|
||||
"basebackup {} {} {} --gzip",
|
||||
spec.tenant_id, spec.timeline_id, lsn
|
||||
),
|
||||
Lsn(0) => format!("basebackup {} {}", spec.tenant_id, spec.timeline_id), // First start of the compute
|
||||
_ => format!("basebackup {} {} {}", spec.tenant_id, spec.timeline_id, lsn),
|
||||
};
|
||||
|
||||
let copyreader = client.copy_out(basebackup_cmd.as_str())?;
|
||||
let mut measured_reader = MeasuredReader::new(copyreader);
|
||||
|
||||
// Check the magic number to see if it's a gzip or not. Even though
|
||||
// we might explicitly ask for gzip, an old pageserver with no implementation
|
||||
// of gzip compression might send us uncompressed data. After some time
|
||||
// passes we can assume all pageservers know how to compress and we can
|
||||
// delete this check.
|
||||
//
|
||||
// If the data is not gzip, it will be tar. It will not be mistakenly
|
||||
// recognized as gzip because tar starts with an ascii encoding of a filename,
|
||||
// and 0x1f and 0x8b are unlikely first characters for any filename. Moreover,
|
||||
// we send the "global" directory first from the pageserver, so it definitely
|
||||
// won't be recognized as gzip.
|
||||
let mut bufreader = std::io::BufReader::new(&mut measured_reader);
|
||||
let gzip = {
|
||||
let peek = bufreader.fill_buf().unwrap();
|
||||
peek[0] == 0x1f && peek[1] == 0x8b
|
||||
};
|
||||
|
||||
// Read the archive directly from the `CopyOutReader`
|
||||
//
|
||||
// Set `ignore_zeros` so that unpack() reads all the Copy data and
|
||||
// doesn't stop at the end-of-archive marker. Otherwise, if the server
|
||||
// sends an Error after finishing the tarball, we will not notice it.
|
||||
if gzip {
|
||||
let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut bufreader));
|
||||
ar.set_ignore_zeros(true);
|
||||
ar.unpack(&self.pgdata)?;
|
||||
} else {
|
||||
let mut ar = tar::Archive::new(&mut bufreader);
|
||||
ar.set_ignore_zeros(true);
|
||||
ar.unpack(&self.pgdata)?;
|
||||
};
|
||||
let mut ar = tar::Archive::new(copyreader);
|
||||
ar.set_ignore_zeros(true);
|
||||
ar.unpack(&self.pgdata)?;
|
||||
|
||||
// Report metrics
|
||||
self.state.lock().unwrap().metrics.basebackup_bytes =
|
||||
measured_reader.get_byte_count() as u64;
|
||||
self.state.lock().unwrap().metrics.basebackup_ms = Utc::now()
|
||||
.signed_duration_since(start_time)
|
||||
.to_std()
|
||||
@@ -367,22 +323,14 @@ impl ComputeNode {
|
||||
/// Do all the preparations like PGDATA directory creation, configuration,
|
||||
/// safekeepers sync, basebackup, etc.
|
||||
#[instrument(skip_all)]
|
||||
pub fn prepare_pgdata(
|
||||
&self,
|
||||
compute_state: &ComputeState,
|
||||
extension_server_port: u16,
|
||||
) -> Result<()> {
|
||||
pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
let spec = &pspec.spec;
|
||||
let pgdata_path = Path::new(&self.pgdata);
|
||||
|
||||
// Remove/create an empty pgdata directory and put configuration there.
|
||||
self.create_pgdata()?;
|
||||
config::write_postgres_conf(
|
||||
&pgdata_path.join("postgresql.conf"),
|
||||
&pspec.spec,
|
||||
Some(extension_server_port),
|
||||
)?;
|
||||
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &pspec.spec)?;
|
||||
|
||||
// Syncing safekeepers is only safe with primary nodes: if a primary
|
||||
// is already connected it will be kicked out, so a secondary (standby)
|
||||
@@ -524,7 +472,7 @@ impl ComputeNode {
|
||||
|
||||
// Write new config
|
||||
let pgdata_path = Path::new(&self.pgdata);
|
||||
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
|
||||
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec)?;
|
||||
|
||||
let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
|
||||
self.pg_reload_conf(&mut client)?;
|
||||
@@ -554,7 +502,7 @@ impl ComputeNode {
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
pub fn start_compute(&self, extension_server_port: u16) -> Result<std::process::Child> {
|
||||
pub fn start_compute(&self) -> Result<std::process::Child> {
|
||||
let compute_state = self.state.lock().unwrap().clone();
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
info!(
|
||||
@@ -565,26 +513,7 @@ impl ComputeNode {
|
||||
pspec.timeline_id,
|
||||
);
|
||||
|
||||
// This part is sync, because we need to download
|
||||
// remote shared_preload_libraries before postgres start (if any)
|
||||
{
|
||||
let library_load_start_time = Utc::now();
|
||||
self.prepare_preload_libraries(&compute_state)?;
|
||||
|
||||
let library_load_time = Utc::now()
|
||||
.signed_duration_since(library_load_start_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64;
|
||||
let mut state = self.state.lock().unwrap();
|
||||
state.metrics.load_libraries_ms = library_load_time;
|
||||
info!(
|
||||
"Loading shared_preload_libraries took {:?}ms",
|
||||
library_load_time
|
||||
);
|
||||
}
|
||||
|
||||
self.prepare_pgdata(&compute_state, extension_server_port)?;
|
||||
self.prepare_pgdata(&compute_state)?;
|
||||
|
||||
let start_time = Utc::now();
|
||||
let pg = self.start_postgres(pspec.storage_auth_token.clone())?;
|
||||
@@ -620,13 +549,6 @@ impl ComputeNode {
|
||||
pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None")
|
||||
);
|
||||
|
||||
// Log metrics so that we can search for slow operations in logs
|
||||
let metrics = {
|
||||
let state = self.state.lock().unwrap();
|
||||
state.metrics.clone()
|
||||
};
|
||||
info!(?metrics, "compute start finished");
|
||||
|
||||
Ok(pg)
|
||||
}
|
||||
|
||||
@@ -732,92 +654,4 @@ LIMIT 100",
|
||||
"{{\"pg_stat_statements\": []}}".to_string()
|
||||
}
|
||||
}
|
||||
|
||||
// If remote extension storage is configured,
|
||||
// download extension control files
|
||||
#[tokio::main]
|
||||
pub async fn prepare_external_extensions(&self, compute_state: &ComputeState) -> Result<()> {
|
||||
if let Some(ref ext_remote_storage) = self.ext_remote_storage {
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
let spec = &pspec.spec;
|
||||
let custom_ext_prefixes = spec.custom_extensions.clone().unwrap_or(Vec::new());
|
||||
info!("custom_ext_prefixes: {:?}", &custom_ext_prefixes);
|
||||
let available_extensions = extension_server::get_available_extensions(
|
||||
ext_remote_storage,
|
||||
&self.pgbin,
|
||||
&self.pgversion,
|
||||
&custom_ext_prefixes,
|
||||
)
|
||||
.await?;
|
||||
self.available_extensions
|
||||
.set(available_extensions)
|
||||
.expect("available_extensions.set error");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn download_extension(&self, ext_name: &str) -> Result<()> {
|
||||
match &self.ext_remote_storage {
|
||||
None => anyhow::bail!("No remote extension storage"),
|
||||
Some(remote_storage) => {
|
||||
extension_server::download_extension(
|
||||
ext_name,
|
||||
remote_storage,
|
||||
&self.pgbin,
|
||||
&self.pgversion,
|
||||
)
|
||||
.await
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
pub async fn prepare_preload_libraries(&self, compute_state: &ComputeState) -> Result<()> {
|
||||
if self.ext_remote_storage.is_none() {
|
||||
return Ok(());
|
||||
}
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
let spec = &pspec.spec;
|
||||
|
||||
info!("parse shared_preload_libraries from spec.cluster.settings");
|
||||
let mut libs_vec = Vec::new();
|
||||
if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
|
||||
libs_vec = libs
|
||||
.split(&[',', '\'', ' '])
|
||||
.filter(|s| *s != "neon" && !s.is_empty())
|
||||
.map(str::to_string)
|
||||
.collect();
|
||||
}
|
||||
info!("parse shared_preload_libraries from provided postgresql.conf");
|
||||
// that is used in neon_local and python tests
|
||||
if let Some(conf) = &spec.cluster.postgresql_conf {
|
||||
let conf_lines = conf.split('\n').collect::<Vec<&str>>();
|
||||
let mut shared_preload_libraries_line = "";
|
||||
for line in conf_lines {
|
||||
if line.starts_with("shared_preload_libraries") {
|
||||
shared_preload_libraries_line = line;
|
||||
}
|
||||
}
|
||||
let mut preload_libs_vec = Vec::new();
|
||||
if let Some(libs) = shared_preload_libraries_line.split("='").nth(1) {
|
||||
preload_libs_vec = libs
|
||||
.split(&[',', '\'', ' '])
|
||||
.filter(|s| *s != "neon" && !s.is_empty())
|
||||
.map(str::to_string)
|
||||
.collect();
|
||||
}
|
||||
libs_vec.extend(preload_libs_vec);
|
||||
}
|
||||
|
||||
info!("Downloading to shared preload libraries: {:?}", &libs_vec);
|
||||
let mut download_tasks = Vec::new();
|
||||
for library in &libs_vec {
|
||||
download_tasks.push(self.download_extension(library));
|
||||
}
|
||||
let results = join_all(download_tasks).await;
|
||||
for result in results {
|
||||
result?; // propogate any errors
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -33,11 +33,7 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
|
||||
}
|
||||
|
||||
/// Create or completely rewrite configuration file specified by `path`
|
||||
pub fn write_postgres_conf(
|
||||
path: &Path,
|
||||
spec: &ComputeSpec,
|
||||
extension_server_port: Option<u16>,
|
||||
) -> Result<()> {
|
||||
pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
|
||||
// File::create() destroys the file content if it exists.
|
||||
let mut file = File::create(path)?;
|
||||
|
||||
@@ -51,22 +47,30 @@ pub fn write_postgres_conf(
|
||||
// Add options for connecting to storage
|
||||
writeln!(file, "# Neon storage settings")?;
|
||||
if let Some(s) = &spec.pageserver_connstring {
|
||||
writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
|
||||
writeln!(
|
||||
file,
|
||||
"neon.pageserver_connstring='{}'",
|
||||
escape_conf_value(s)
|
||||
)?;
|
||||
}
|
||||
if !spec.safekeeper_connstrings.is_empty() {
|
||||
writeln!(
|
||||
file,
|
||||
"neon.safekeepers={}",
|
||||
"neon.safekeepers='{}'",
|
||||
escape_conf_value(&spec.safekeeper_connstrings.join(","))
|
||||
)?;
|
||||
}
|
||||
if let Some(s) = &spec.tenant_id {
|
||||
writeln!(file, "neon.tenant_id={}", escape_conf_value(&s.to_string()))?;
|
||||
writeln!(
|
||||
file,
|
||||
"neon.tenant_id='{}'",
|
||||
escape_conf_value(&s.to_string())
|
||||
)?;
|
||||
}
|
||||
if let Some(s) = &spec.timeline_id {
|
||||
writeln!(
|
||||
file,
|
||||
"neon.timeline_id={}",
|
||||
"neon.timeline_id='{}'",
|
||||
escape_conf_value(&s.to_string())
|
||||
)?;
|
||||
}
|
||||
@@ -91,9 +95,5 @@ pub fn write_postgres_conf(
|
||||
writeln!(file, "# Managed by compute_ctl: end")?;
|
||||
}
|
||||
|
||||
if let Some(port) = extension_server_port {
|
||||
writeln!(file, "neon.extension_server_port={}", port)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -42,15 +42,13 @@ fn configurator_main_loop(compute: &Arc<ComputeNode>) {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn launch_configurator(
|
||||
compute: &Arc<ComputeNode>,
|
||||
) -> Result<thread::JoinHandle<()>, std::io::Error> {
|
||||
pub fn launch_configurator(compute: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
|
||||
let compute = Arc::clone(compute);
|
||||
|
||||
thread::Builder::new()
|
||||
Ok(thread::Builder::new()
|
||||
.name("compute-configurator".into())
|
||||
.spawn(move || {
|
||||
configurator_main_loop(&compute);
|
||||
info!("configurator thread is exited");
|
||||
})
|
||||
})?)
|
||||
}
|
||||
|
||||
@@ -1,237 +0,0 @@
|
||||
// Download extension files from the extension store
|
||||
// and put them in the right place in the postgres directory
|
||||
/*
|
||||
The layout of the S3 bucket is as follows:
|
||||
|
||||
v14/ext_index.json
|
||||
-- this contains information necessary to create control files
|
||||
v14/extensions/test_ext1.tar.gz
|
||||
-- this contains the library files and sql files necessary to create this extension
|
||||
v14/extensions/custom_ext1.tar.gz
|
||||
|
||||
The difference between a private and public extensions is determined by who can
|
||||
load the extension this is specified in ext_index.json
|
||||
|
||||
Speicially, ext_index.json has a list of public extensions, and a list of
|
||||
extensions enabled for specific tenant-ids.
|
||||
*/
|
||||
use crate::compute::ComputeNode;
|
||||
use anyhow::Context;
|
||||
use anyhow::{self, Result};
|
||||
use flate2::read::GzDecoder;
|
||||
use remote_storage::*;
|
||||
use serde_json::{self, Value};
|
||||
use std::collections::HashSet;
|
||||
use std::num::{NonZeroU32, NonZeroUsize};
|
||||
use std::path::Path;
|
||||
use std::str;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
use tar::Archive;
|
||||
use tokio::io::AsyncReadExt;
|
||||
use tracing::info;
|
||||
|
||||
fn get_pg_config(argument: &str, pgbin: &str) -> String {
|
||||
// gives the result of `pg_config [argument]`
|
||||
// where argument is a flag like `--version` or `--sharedir`
|
||||
let pgconfig = pgbin
|
||||
.strip_suffix("postgres")
|
||||
.expect("bad pgbin")
|
||||
.to_owned()
|
||||
+ "/pg_config";
|
||||
let config_output = std::process::Command::new(pgconfig)
|
||||
.arg(argument)
|
||||
.output()
|
||||
.expect("pg_config error");
|
||||
std::str::from_utf8(&config_output.stdout)
|
||||
.expect("pg_config error")
|
||||
.trim()
|
||||
.to_string()
|
||||
}
|
||||
|
||||
pub fn get_pg_version(pgbin: &str) -> String {
|
||||
// pg_config --version returns a (platform specific) human readable string
|
||||
// such as "PostgreSQL 15.4". We parse this to v14/v15
|
||||
let human_version = get_pg_config("--version", pgbin);
|
||||
if human_version.contains("15") {
|
||||
return "v15".to_string();
|
||||
} else if human_version.contains("14") {
|
||||
return "v14".to_string();
|
||||
}
|
||||
panic!("Unsuported postgres version {human_version}");
|
||||
}
|
||||
|
||||
// download extension control files
|
||||
// if custom_ext_prefixes is provided - search also in custom extension paths
|
||||
pub async fn get_available_extensions(
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
pgbin: &str,
|
||||
pg_version: &str,
|
||||
custom_ext_prefixes: &[String],
|
||||
) -> Result<HashSet<String>> {
|
||||
let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
|
||||
let index_path = pg_version.to_owned() + "/ext_index.json";
|
||||
let index_path = RemotePath::new(Path::new(&index_path)).context("error forming path")?;
|
||||
info!("download ext_index.json: {:?}", &index_path);
|
||||
|
||||
// TODO: potential optimization: cache ext_index.json
|
||||
let mut download = remote_storage.download(&index_path).await?;
|
||||
let mut write_data_buffer = Vec::new();
|
||||
download
|
||||
.download_stream
|
||||
.read_to_end(&mut write_data_buffer)
|
||||
.await?;
|
||||
let ext_index_str = match str::from_utf8(&write_data_buffer) {
|
||||
Ok(v) => v,
|
||||
Err(e) => panic!("Invalid UTF-8 sequence: {}", e),
|
||||
};
|
||||
|
||||
let ext_index_full: Value = serde_json::from_str(ext_index_str)?;
|
||||
let ext_index_full = ext_index_full.as_object().context("error parsing json")?;
|
||||
let control_data = ext_index_full["control_data"]
|
||||
.as_object()
|
||||
.context("json parse error")?;
|
||||
let enabled_extensions = ext_index_full["enabled_extensions"]
|
||||
.as_object()
|
||||
.context("json parse error")?;
|
||||
info!("{:?}", control_data.clone());
|
||||
info!("{:?}", enabled_extensions.clone());
|
||||
|
||||
let mut prefixes = vec!["public".to_string()];
|
||||
prefixes.extend(custom_ext_prefixes.to_owned());
|
||||
info!("{:?}", &prefixes);
|
||||
let mut all_extensions = HashSet::new();
|
||||
for prefix in prefixes {
|
||||
let prefix_extensions = match enabled_extensions.get(&prefix) {
|
||||
Some(Value::Array(ext_name)) => ext_name,
|
||||
_ => {
|
||||
info!("prefix {} has no extensions", prefix);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
info!("{:?}", prefix_extensions);
|
||||
for ext_name in prefix_extensions {
|
||||
all_extensions.insert(ext_name.as_str().context("json parse error")?.to_string());
|
||||
}
|
||||
}
|
||||
|
||||
for prefix in &all_extensions {
|
||||
let control_contents = control_data[prefix].as_str().context("json parse error")?;
|
||||
let control_path = local_sharedir.join(prefix.to_owned() + ".control");
|
||||
|
||||
info!("WRITING FILE {:?}{:?}", control_path, control_contents);
|
||||
std::fs::write(control_path, control_contents)?;
|
||||
}
|
||||
|
||||
Ok(all_extensions.into_iter().collect())
|
||||
}
|
||||
|
||||
// download all sqlfiles (and possibly data files) for a given extension name
|
||||
pub async fn download_extension(
|
||||
ext_name: &str,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
pgbin: &str,
|
||||
pg_version: &str,
|
||||
) -> Result<()> {
|
||||
// TODO: potential optimization: only download the extension if it doesn't exist
|
||||
// problem: how would we tell if it exists?
|
||||
let ext_name = ext_name.replace(".so", "");
|
||||
let ext_name_targz = ext_name.to_owned() + ".tar.gz";
|
||||
if Path::new(&ext_name_targz).exists() {
|
||||
info!("extension {:?} already exists", ext_name_targz);
|
||||
return Ok(());
|
||||
}
|
||||
let ext_path = RemotePath::new(
|
||||
&Path::new(pg_version)
|
||||
.join("extensions")
|
||||
.join(ext_name_targz.clone()),
|
||||
)?;
|
||||
info!(
|
||||
"Start downloading extension {:?} from {:?}",
|
||||
ext_name, ext_path
|
||||
);
|
||||
let mut download = remote_storage.download(&ext_path).await?;
|
||||
let mut write_data_buffer = Vec::new();
|
||||
download
|
||||
.download_stream
|
||||
.read_to_end(&mut write_data_buffer)
|
||||
.await?;
|
||||
let unzip_dest = pgbin.strip_suffix("/bin/postgres").expect("bad pgbin");
|
||||
let tar = GzDecoder::new(write_data_buffer.as_slice());
|
||||
let mut archive = Archive::new(tar);
|
||||
archive.unpack(unzip_dest)?;
|
||||
info!("Download + unzip {:?} completed successfully", &ext_path);
|
||||
|
||||
let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
|
||||
let zip_sharedir = format!("{unzip_dest}/extensions/{ext_name}/share/extension");
|
||||
info!("mv {zip_sharedir:?}/* {local_sharedir:?}");
|
||||
for file in std::fs::read_dir(zip_sharedir)? {
|
||||
let old_file = file?.path();
|
||||
let new_file =
|
||||
Path::new(&local_sharedir).join(old_file.file_name().context("error parsing file")?);
|
||||
std::fs::rename(old_file, new_file)?;
|
||||
}
|
||||
let local_libdir = Path::new(&get_pg_config("--libdir", pgbin)).join("postgresql");
|
||||
let zip_libdir = format!("{unzip_dest}/extensions/{ext_name}/lib");
|
||||
info!("mv {zip_libdir:?}/* {local_libdir:?}");
|
||||
for file in std::fs::read_dir(zip_libdir)? {
|
||||
let old_file = file?.path();
|
||||
let new_file =
|
||||
Path::new(&local_libdir).join(old_file.file_name().context("error parsing file")?);
|
||||
std::fs::rename(old_file, new_file)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// This function initializes the necessary structs to use remmote storage (should be fairly cheap)
|
||||
pub fn init_remote_storage(
|
||||
remote_ext_config: &str,
|
||||
default_prefix: &str,
|
||||
) -> anyhow::Result<GenericRemoteStorage> {
|
||||
let remote_ext_config: serde_json::Value = serde_json::from_str(remote_ext_config)?;
|
||||
|
||||
let remote_ext_bucket = remote_ext_config["bucket"]
|
||||
.as_str()
|
||||
.context("config parse error")?;
|
||||
let remote_ext_region = remote_ext_config["region"]
|
||||
.as_str()
|
||||
.context("config parse error")?;
|
||||
let remote_ext_endpoint = remote_ext_config["endpoint"].as_str();
|
||||
let remote_ext_prefix = remote_ext_config["prefix"]
|
||||
.as_str()
|
||||
.unwrap_or(default_prefix)
|
||||
.to_string();
|
||||
|
||||
// TODO: potentially allow modification of other parameters
|
||||
// however, default values should be fine for now
|
||||
let config = S3Config {
|
||||
bucket_name: remote_ext_bucket.to_string(),
|
||||
bucket_region: remote_ext_region.to_string(),
|
||||
prefix_in_bucket: Some(remote_ext_prefix),
|
||||
endpoint: remote_ext_endpoint.map(|x| x.to_string()),
|
||||
concurrency_limit: NonZeroUsize::new(100).expect("100 != 0"),
|
||||
max_keys_per_list_response: None,
|
||||
};
|
||||
let config = RemoteStorageConfig {
|
||||
max_concurrent_syncs: NonZeroUsize::new(100).expect("100 != 0"),
|
||||
max_sync_errors: NonZeroU32::new(100).expect("100 != 0"),
|
||||
storage: RemoteStorageKind::AwsS3(config),
|
||||
};
|
||||
GenericRemoteStorage::from_config(&config)
|
||||
}
|
||||
|
||||
pub fn launch_download_extensions(
|
||||
compute: &Arc<ComputeNode>,
|
||||
) -> Result<thread::JoinHandle<()>, std::io::Error> {
|
||||
let compute = Arc::clone(compute);
|
||||
thread::Builder::new()
|
||||
.name("download-extensions".into())
|
||||
.spawn(move || {
|
||||
info!("start download_extension_files");
|
||||
let compute_state = compute.state.lock().expect("error unlocking compute.state");
|
||||
compute
|
||||
.prepare_external_extensions(&compute_state)
|
||||
.expect("error preparing extensions");
|
||||
info!("download_extension_files done, exiting thread");
|
||||
})
|
||||
}
|
||||
@@ -121,27 +121,6 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
|
||||
}
|
||||
}
|
||||
|
||||
// download extension files from S3 on demand
|
||||
(&Method::POST, route) if route.starts_with("/extension_server/") => {
|
||||
info!("serving {:?} POST request", route);
|
||||
info!("req.uri {:?}", req.uri());
|
||||
let filename = route.split('/').last().unwrap().to_string();
|
||||
info!(
|
||||
"serving /extension_server POST request, filename: {:?}",
|
||||
&filename
|
||||
);
|
||||
|
||||
match compute.download_extension(&filename).await {
|
||||
Ok(_) => Response::new(Body::from("OK")),
|
||||
Err(e) => {
|
||||
error!("extension download failed: {}", e);
|
||||
let mut resp = Response::new(Body::from(e.to_string()));
|
||||
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
|
||||
resp
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Return the `404 Not Found` for any other routes.
|
||||
_ => {
|
||||
let mut not_found = Response::new(Body::from("404 Not Found"));
|
||||
|
||||
@@ -139,34 +139,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
/extension_server:
|
||||
post:
|
||||
tags:
|
||||
- Extension
|
||||
summary: Download extension from S3 to local folder.
|
||||
description: ""
|
||||
operationId: downloadExtension
|
||||
responses:
|
||||
200:
|
||||
description: Extension downloaded
|
||||
content:
|
||||
text/plain:
|
||||
schema:
|
||||
type: string
|
||||
description: Error text or 'OK' if download succeeded.
|
||||
example: "OK"
|
||||
400:
|
||||
description: Request is invalid.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
500:
|
||||
description: Extension download request failed.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
|
||||
components:
|
||||
securitySchemes:
|
||||
|
||||
@@ -9,7 +9,6 @@ pub mod http;
|
||||
#[macro_use]
|
||||
pub mod logger;
|
||||
pub mod compute;
|
||||
pub mod extension_server;
|
||||
pub mod monitor;
|
||||
pub mod params;
|
||||
pub mod pg_helpers;
|
||||
|
||||
@@ -105,10 +105,10 @@ fn watch_compute_activity(compute: &ComputeNode) {
|
||||
}
|
||||
|
||||
/// Launch a separate compute monitor thread and return its `JoinHandle`.
|
||||
pub fn launch_monitor(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>, std::io::Error> {
|
||||
pub fn launch_monitor(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
|
||||
let state = Arc::clone(state);
|
||||
|
||||
thread::Builder::new()
|
||||
Ok(thread::Builder::new()
|
||||
.name("compute-monitor".into())
|
||||
.spawn(move || watch_compute_activity(&state))
|
||||
.spawn(move || watch_compute_activity(&state))?)
|
||||
}
|
||||
|
||||
@@ -16,26 +16,15 @@ use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
|
||||
|
||||
const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds
|
||||
|
||||
/// Escape a string for including it in a SQL literal. Wrapping the result
|
||||
/// with `E'{}'` or `'{}'` is not required, as it returns a ready-to-use
|
||||
/// SQL string literal, e.g. `'db'''` or `E'db\\'`.
|
||||
/// See https://github.com/postgres/postgres/blob/da98d005cdbcd45af563d0c4ac86d0e9772cd15f/src/backend/utils/adt/quote.c#L47
|
||||
/// for the original implementation.
|
||||
/// Escape a string for including it in a SQL literal
|
||||
pub fn escape_literal(s: &str) -> String {
|
||||
let res = s.replace('\'', "''").replace('\\', "\\\\");
|
||||
|
||||
if res.contains('\\') {
|
||||
format!("E'{}'", res)
|
||||
} else {
|
||||
format!("'{}'", res)
|
||||
}
|
||||
s.replace('\'', "''").replace('\\', "\\\\")
|
||||
}
|
||||
|
||||
/// Escape a string so that it can be used in postgresql.conf. Wrapping the result
|
||||
/// with `'{}'` is not required, as it returns a ready-to-use config string.
|
||||
/// Escape a string so that it can be used in postgresql.conf.
|
||||
/// Same as escape_literal, currently.
|
||||
pub fn escape_conf_value(s: &str) -> String {
|
||||
let res = s.replace('\'', "''").replace('\\', "\\\\");
|
||||
format!("'{}'", res)
|
||||
s.replace('\'', "''").replace('\\', "\\\\")
|
||||
}
|
||||
|
||||
trait GenericOptionExt {
|
||||
@@ -48,7 +37,7 @@ impl GenericOptionExt for GenericOption {
|
||||
fn to_pg_option(&self) -> String {
|
||||
if let Some(val) = &self.value {
|
||||
match self.vartype.as_ref() {
|
||||
"string" => format!("{} {}", self.name, escape_literal(val)),
|
||||
"string" => format!("{} '{}'", self.name, escape_literal(val)),
|
||||
_ => format!("{} {}", self.name, val),
|
||||
}
|
||||
} else {
|
||||
@@ -60,7 +49,7 @@ impl GenericOptionExt for GenericOption {
|
||||
fn to_pg_setting(&self) -> String {
|
||||
if let Some(val) = &self.value {
|
||||
match self.vartype.as_ref() {
|
||||
"string" => format!("{} = {}", self.name, escape_conf_value(val)),
|
||||
"string" => format!("{} = '{}'", self.name, escape_conf_value(val)),
|
||||
_ => format!("{} = {}", self.name, val),
|
||||
}
|
||||
} else {
|
||||
|
||||
@@ -124,7 +124,7 @@ pub fn get_spec_from_control_plane(
|
||||
pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
|
||||
// File `postgresql.conf` is no longer included into `basebackup`, so just
|
||||
// always write all config into it creating new file.
|
||||
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec, None)?;
|
||||
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
|
||||
|
||||
update_pg_hba(pgdata_path)?;
|
||||
|
||||
@@ -397,44 +397,10 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
// We do not check either DB exists or not,
|
||||
// Postgres will take care of it for us
|
||||
"delete_db" => {
|
||||
// In Postgres we can't drop a database if it is a template.
|
||||
// So we need to unset the template flag first, but it could
|
||||
// be a retry, so we could've already dropped the database.
|
||||
// Check that database exists first to make it idempotent.
|
||||
let unset_template_query: String = format!(
|
||||
"
|
||||
DO $$
|
||||
BEGIN
|
||||
IF EXISTS(
|
||||
SELECT 1
|
||||
FROM pg_catalog.pg_database
|
||||
WHERE datname = {}
|
||||
)
|
||||
THEN
|
||||
ALTER DATABASE {} is_template false;
|
||||
END IF;
|
||||
END
|
||||
$$;",
|
||||
escape_literal(&op.name),
|
||||
&op.name.pg_quote()
|
||||
);
|
||||
// Use FORCE to drop database even if there are active connections.
|
||||
// We run this from `cloud_admin`, so it should have enough privileges.
|
||||
// NB: there could be other db states, which prevent us from dropping
|
||||
// the database. For example, if db is used by any active subscription
|
||||
// or replication slot.
|
||||
// TODO: deal with it once we allow logical replication. Proper fix should
|
||||
// involve returning an error code to the control plane, so it could
|
||||
// figure out that this is a non-retryable error, return it to the user
|
||||
// and fail operation permanently.
|
||||
let drop_db_query: String = format!(
|
||||
"DROP DATABASE IF EXISTS {} WITH (FORCE)",
|
||||
&op.name.pg_quote()
|
||||
);
|
||||
let query: String = format!("DROP DATABASE IF EXISTS {}", &op.name.pg_quote());
|
||||
|
||||
warn!("deleting database '{}'", &op.name);
|
||||
client.execute(unset_template_query.as_str(), &[])?;
|
||||
client.execute(drop_db_query.as_str(), &[])?;
|
||||
client.execute(query.as_str(), &[])?;
|
||||
}
|
||||
"rename_db" => {
|
||||
let new_name = op.new_name.as_ref().unwrap();
|
||||
|
||||
@@ -89,12 +89,4 @@ test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hoor
|
||||
assert_eq!(none_generic_options.find("missed_value"), None);
|
||||
assert_eq!(none_generic_options.find("invalid_value"), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_escape_literal() {
|
||||
assert_eq!(escape_literal("test"), "'test'");
|
||||
assert_eq!(escape_literal("test'"), "'test'''");
|
||||
assert_eq!(escape_literal("test\\'"), "E'test\\\\'''");
|
||||
assert_eq!(escape_literal("test\\'\\'"), "E'test\\\\''\\\\'''");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -32,4 +32,3 @@ utils.workspace = true
|
||||
|
||||
compute_api.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
tracing.workspace = true
|
||||
|
||||
@@ -658,8 +658,6 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
|
||||
.get_one::<String>("endpoint_id")
|
||||
.ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;
|
||||
|
||||
let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");
|
||||
|
||||
// If --safekeepers argument is given, use only the listed safekeeper nodes.
|
||||
let safekeepers =
|
||||
if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
|
||||
@@ -701,7 +699,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
|
||||
_ => {}
|
||||
}
|
||||
println!("Starting existing endpoint {endpoint_id}...");
|
||||
endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
|
||||
endpoint.start(&auth_token, safekeepers)?;
|
||||
} else {
|
||||
let branch_name = sub_args
|
||||
.get_one::<String>("branch-name")
|
||||
@@ -745,7 +743,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
|
||||
pg_version,
|
||||
mode,
|
||||
)?;
|
||||
ep.start(&auth_token, safekeepers, remote_ext_config)?;
|
||||
ep.start(&auth_token, safekeepers)?;
|
||||
}
|
||||
}
|
||||
"stop" => {
|
||||
@@ -1005,12 +1003,6 @@ fn cli() -> Command {
|
||||
.help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
|
||||
.required(false);
|
||||
|
||||
let remote_ext_config_args = Arg::new("remote-ext-config")
|
||||
.long("remote-ext-config")
|
||||
.num_args(1)
|
||||
.help("Configure the S3 bucket that we search for extensions in.")
|
||||
.required(false);
|
||||
|
||||
let lsn_arg = Arg::new("lsn")
|
||||
.long("lsn")
|
||||
.help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.")
|
||||
@@ -1169,7 +1161,6 @@ fn cli() -> Command {
|
||||
.arg(pg_version_arg)
|
||||
.arg(hot_standby_arg)
|
||||
.arg(safekeepers_arg)
|
||||
.arg(remote_ext_config_args)
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("stop")
|
||||
|
||||
@@ -311,7 +311,7 @@ impl Endpoint {
|
||||
|
||||
// TODO: use future host field from safekeeper spec
|
||||
// Pass the list of safekeepers to the replica so that it can connect to any of them,
|
||||
// whichever is available.
|
||||
// whichever is availiable.
|
||||
let sk_ports = self
|
||||
.env
|
||||
.safekeepers
|
||||
@@ -418,12 +418,7 @@ impl Endpoint {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn start(
|
||||
&self,
|
||||
auth_token: &Option<String>,
|
||||
safekeepers: Vec<NodeId>,
|
||||
remote_ext_config: Option<&String>,
|
||||
) -> Result<()> {
|
||||
pub fn start(&self, auth_token: &Option<String>, safekeepers: Vec<NodeId>) -> Result<()> {
|
||||
if self.status() == "running" {
|
||||
anyhow::bail!("The endpoint is already running");
|
||||
}
|
||||
@@ -491,15 +486,6 @@ impl Endpoint {
|
||||
pageserver_connstring: Some(pageserver_connstring),
|
||||
safekeeper_connstrings,
|
||||
storage_auth_token: auth_token.clone(),
|
||||
// TODO FIXME: This is a hack to test custom extensions locally.
|
||||
// In test_download_extensions, we assume that the custom extension
|
||||
// prefix is the tenant ID. So we set it here.
|
||||
//
|
||||
// The proper way to implement this is to pass the custom extension
|
||||
// in spec, but we don't have a way to do that yet in the python tests.
|
||||
// NEW HACK: we enable the anon custom extension for everyone! this is of course just for testing
|
||||
// how will we do it for real?
|
||||
custom_extensions: Some(vec!["123454321".to_string(), self.tenant_id.to_string()]),
|
||||
};
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
|
||||
@@ -531,11 +517,6 @@ impl Endpoint {
|
||||
.stdin(std::process::Stdio::null())
|
||||
.stderr(logfile.try_clone()?)
|
||||
.stdout(logfile);
|
||||
|
||||
if let Some(remote_ext_config) = remote_ext_config {
|
||||
cmd.args(["--remote-ext-config", remote_ext_config]);
|
||||
}
|
||||
|
||||
let child = cmd.spawn()?;
|
||||
|
||||
// Write down the pid so we can wait for it when we want to stop
|
||||
|
||||
@@ -1,187 +0,0 @@
|
||||
# Supporting custom user Extensions (Dynamic Extension Loading)
|
||||
Created 2023-05-03
|
||||
|
||||
## Motivation
|
||||
|
||||
There are many extensions in the PostgreSQL ecosystem, and not all extensions
|
||||
are of a quality that we can confidently support them. Additionally, our
|
||||
current extension inclusion mechanism has several problems because we build all
|
||||
extensions into the primary Compute image: We build the extensions every time
|
||||
we build the compute image regardless of whether we actually need to rebuild
|
||||
the image, and the inclusion of these extensions in the image adds a hard
|
||||
dependency on all supported extensions - thus increasing the image size, and
|
||||
with it the time it takes to download that image - increasing first start
|
||||
latency.
|
||||
|
||||
This RFC proposes a dynamic loading mechanism that solves most of these
|
||||
problems.
|
||||
|
||||
## Summary
|
||||
|
||||
`compute_ctl` is made responsible for loading extensions on-demand into
|
||||
the container's file system for dynamically loaded extensions, and will also
|
||||
make sure that the extensions in `shared_preload_libraries` are downloaded
|
||||
before the compute node starts.
|
||||
|
||||
## Components
|
||||
|
||||
compute_ctl, PostgreSQL, neon (extension), Compute Host Node, Extension Store
|
||||
|
||||
## Requirements
|
||||
|
||||
Compute nodes with no extra extensions should not be negatively impacted by
|
||||
the existence of support for many extensions.
|
||||
|
||||
Installing an extension into PostgreSQL should be easy.
|
||||
|
||||
Non-preloaded extensions shouldn't impact startup latency.
|
||||
|
||||
Uninstalled extensions shouldn't impact query latency.
|
||||
|
||||
A small latency penalty for dynamically loaded extensions is acceptable in
|
||||
the first seconds of compute startup, but not in steady-state operations.
|
||||
|
||||
## Proposed implementation
|
||||
|
||||
### On-demand, JIT-loading of extensions
|
||||
|
||||
Before postgres starts we download
|
||||
- control files for all extensions available to that compute node;
|
||||
- all `shared_preload_libraries`;
|
||||
|
||||
After postgres is running, `compute_ctl` listens for requests to load files.
|
||||
When PostgreSQL requests a file, `compute_ctl` downloads it.
|
||||
|
||||
PostgreSQL requests files in the following cases:
|
||||
- When loading a preload library set in `local_preload_libraries`
|
||||
- When explicitly loading a library with `LOAD`
|
||||
- Wnen creating extension with `CREATE EXTENSION` (download sql scripts, (optional) extension data files and (optional) library files)))
|
||||
|
||||
|
||||
#### Summary
|
||||
|
||||
Pros:
|
||||
- Startup is only as slow as it takes to load all (shared_)preload_libraries
|
||||
- Supports BYO Extension
|
||||
|
||||
Cons:
|
||||
- O(sizeof(extensions)) IO requirement for loading all extensions.
|
||||
|
||||
### Alternative solutions
|
||||
|
||||
1. Allow users to add their extensions to the base image
|
||||
|
||||
Pros:
|
||||
- Easy to deploy
|
||||
|
||||
Cons:
|
||||
- Doesn't scale - first start size is dependent on image size;
|
||||
- All extensions are shared across all users: It doesn't allow users to
|
||||
bring their own restrictive-licensed extensions
|
||||
|
||||
2. Bring Your Own compute image
|
||||
|
||||
Pros:
|
||||
- Still easy to deploy
|
||||
- User can bring own patched version of PostgreSQL
|
||||
|
||||
Cons:
|
||||
- First start latency is O(sizeof(extensions image))
|
||||
- Warm instance pool for skipping pod schedule latency is not feasible with
|
||||
O(n) custom images
|
||||
- Support channels are difficult to manage
|
||||
|
||||
3. Download all user extensions in bulk on compute start
|
||||
|
||||
Pros:
|
||||
- Easy to deploy
|
||||
- No startup latency issues for "clean" users.
|
||||
- Warm instance pool for skipping pod schedule latency is possible
|
||||
|
||||
Cons:
|
||||
- Downloading all extensions in advance takes a lot of time, thus startup
|
||||
latency issues
|
||||
|
||||
4. Store user's extensions in persistent storage
|
||||
|
||||
Pros:
|
||||
- Easy to deploy
|
||||
- No startup latency issues
|
||||
- Warm instance pool for skipping pod schedule latency is possible
|
||||
|
||||
Cons:
|
||||
- EC2 instances have only limited number of attachments shared between EBS
|
||||
volumes, direct-attached NVMe drives, and ENIs.
|
||||
- Compute instance migration isn't trivially solved for EBS mounts (e.g.
|
||||
the device is unavailable whilst moving the mount between instances).
|
||||
- EBS can only mount on one instance at a time (except the expensive IO2
|
||||
device type).
|
||||
|
||||
5. Store user's extensions in network drive
|
||||
|
||||
Pros:
|
||||
- Easy to deploy
|
||||
- Few startup latency issues
|
||||
- Warm instance pool for skipping pod schedule latency is possible
|
||||
|
||||
Cons:
|
||||
- We'd need networked drives, and a lot of them, which would store many
|
||||
duplicate extensions.
|
||||
- **UNCHECKED:** Compute instance migration may not work nicely with
|
||||
networked IOs
|
||||
|
||||
|
||||
### Idea extensions
|
||||
|
||||
The extension store does not have to be S3 directly, but could be a Node-local
|
||||
caching service on top of S3. This would reduce the load on the network for
|
||||
popular extensions.
|
||||
|
||||
## Extension Storage implementation
|
||||
|
||||
The layout of the S3 bucket is as follows:
|
||||
```
|
||||
v14/ext_index.json
|
||||
-- this contains information necessary to create control files
|
||||
v14/extensions/test_ext1.tar.gz
|
||||
-- this contains the library files and sql files necessary to create this extension
|
||||
v14/extensions/custom_ext1.tar.gz
|
||||
```
|
||||
The difference between private and public extensions is determined by who can
|
||||
load the extension. This is specified in `ext_index.json`.
|
||||
Speicially, `ext_index.json` has a list of public extensions, and a list of
|
||||
extensions enabled for specific tenant-ids. Here is an example `ext_index.json`:
|
||||
```
|
||||
{
|
||||
"enabled_extensions": {
|
||||
"123454321": [
|
||||
"anon"
|
||||
],
|
||||
"public": [
|
||||
"embedding"
|
||||
]
|
||||
},
|
||||
"control_data": {
|
||||
"embedding": "comment = 'hnsw index' \ndefault_version = '0.1.0' \nmodule_pathname = '$libdir/embedding' \nrelocatable = true \ntrusted = true",
|
||||
"anon": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### How to add new extension to the Extension Storage?
|
||||
|
||||
Simply upload build artifacts to the S3 bucket.
|
||||
Implement a CI step for that. Splitting it from ompute-node-image build.
|
||||
|
||||
### How do we deal with extension versions and updates?
|
||||
|
||||
Currently, we rebuild extensions on every compute-node-image build and store them in the <build-version> prefix.
|
||||
This is needed to ensure that `/share` and `/lib` files are in sync.
|
||||
|
||||
For extension updates, we rely on the PostgreSQL extension versioning mechanism (sql update scripts) and extension authors to not break backwards compatibility within one major version of PostgreSQL.
|
||||
|
||||
### Alternatives
|
||||
|
||||
For extensions written on trusted languages we can also adopt
|
||||
`dbdev` PostgreSQL Package Manager based on `pg_tle` by Supabase.
|
||||
This will increase the amount supported extensions and decrease the amount of work required to support them.
|
||||
@@ -1,84 +0,0 @@
|
||||
# Postgres user and database management
|
||||
|
||||
(This supersedes the previous proposal that looked too complicated and desynchronization-prone)
|
||||
|
||||
We've accumulated a bunch of problems with our approach to role and database management, namely:
|
||||
|
||||
1. we don't allow role and database creation from Postgres, and users are complaining about that
|
||||
2. fine-grained role management is not possible both from Postgres and console
|
||||
|
||||
Right now, we do store users and databases both in console and Postgres, and there are two main reasons for
|
||||
that:
|
||||
|
||||
* we want to be able to authenticate users in proxy against the console without Postgres' involvement. Otherwise,
|
||||
malicious brute force attempts will wake up Postgres (expensive) and may exhaust the Postgres connections limit (deny of service).
|
||||
* it is handy when we can render console UI without waking up compute (e.g., show database list)
|
||||
|
||||
This RFC doesn't talk about giving root access to the database, which is blocked by a secure runtime setup.
|
||||
|
||||
## Overview
|
||||
|
||||
* Add Postgres extension that sends an HTTP request each time transaction that modifies users/databases is about to commit.
|
||||
* Add user management API to internal console API. Also, the console should put a JWT token into the compute so that it can access management API.
|
||||
|
||||
## Postgres behavior
|
||||
|
||||
The default user role (@username) should have `CREATE ROLE`, `CREATE DB`, and `BYPASSRLS` privileges. We expose the Postgres port
|
||||
to the open internet, so we need to check password strength. Now console generates strong passwords, so there is no risk of having dumb passwords. With user-provided passwords, such risks exist.
|
||||
|
||||
Since we store passwords in the console we should also send unencrypted password when role is created/changed. Hence communication with the console must be encrypted. Postgres also supports creating roles using hashes, in that case, we will not be able to get a raw password. So I can see the following options here:
|
||||
* roles created via SQL will *not* have raw passwords in the console
|
||||
* roles created via SQL will have raw passwords in the console, except ones that were created using hashes
|
||||
|
||||
I'm leaning towards the second option here as it is a bit more consistent one -- if raw password storage is enabled then we store passwords in all cases where we can store them.
|
||||
|
||||
To send data about roles and databases from Postgres to the console we can create the following Postgres extension:
|
||||
|
||||
* Intercept role/database changes in `ProcessUtility_hook`. Here we have access to the query statement with the raw password. The hook handler itself should not dial the console immediately and rather stash info in some hashmap for later use.
|
||||
* When the transaction is about to commit we execute collected role modifications (all as one -- console should either accept all or reject all, and hence API shouldn't be REST-like). If the console request fails we can roll back the transaction. This way if the transaction is committed we know for sure that console has this information. We can use `XACT_EVENT_PRE_COMMIT` and `XACT_EVENT_PARALLEL_PRE_COMMIT` for that.
|
||||
* Extension should be mindful of the fact that it is possible to create and delete roles within the transaction.
|
||||
* We also need to track who is database owner, some coding around may be needed to get the current user when the database is created.
|
||||
|
||||
## Console user management API
|
||||
|
||||
The current public API has REST API for role management. We need to have some analog for the internal API (called mgmt API in the console code). But unlike public API here we want to have an atomic way to create several roles/databases (in cases when several roles were created in the same transaction). So something like that may work:
|
||||
|
||||
```
|
||||
curl -X PATCH /api/v1/roles_and_databases -d '
|
||||
[
|
||||
{"op":"create", "type":"role", "name": "kurt", "password":"lYgT3BlbkFJ2vBZrqv"},
|
||||
{"op":"drop", "type":"role", "name": "trout"},
|
||||
{"op":"alter", "type":"role", "name": "kilgore", "password":"3BlbkFJ2vB"},
|
||||
{"op":"create", "type":"database", "name": "db2", "owner": "eliot"},
|
||||
]
|
||||
'
|
||||
```
|
||||
|
||||
Makes sense not to error out on duplicated create/delete operations (see failure modes)
|
||||
|
||||
## Managing users from the console
|
||||
|
||||
Now console puts a spec file with the list of databases/roles and delta operations in all the compute pods. `compute_ctl` then picks up that file and stubbornly executes deltas and checks data in the spec file is the same as in the Postgres. This way if the user creates a role in the UI we restart compute with a new spec file and during the start databases/roles are created. So if Postgres send an HTTP call each time role is created we need to break recursion in that case. We can do that based on application_name or some GUC or user (local == no HTTP hook).
|
||||
|
||||
Generally, we have several options when we are creating users via console:
|
||||
|
||||
1. restart compute with a new spec file, execute local SQL command; cut recursion in the extension
|
||||
2. "push" spec files into running compute, execute local SQL command; cut recursion in the extension
|
||||
3. "push" spec files into running compute, execute local SQL command; let extension create those roles in the console
|
||||
4. avoid managing roles via spec files, send SQL commands to compute; let extension create those roles in the console
|
||||
|
||||
The last option is the most straightforward one, but with the raw password storage opt-out, we will not have the password to establish an SQL connection. Also, we need a spec for provisioning purposes and to address potential desync (but that is quite unlikely). So I think the easiest approach would be:
|
||||
|
||||
1. keep role management like it is now and cut the recursion in the extension when SQL is executed by compute_ctl
|
||||
2. add "push" endpoint to the compute_ctl to avoid compute restart during the `apply_config` operation -- that can be done as a follow up to avoid increasing scope too much
|
||||
|
||||
## Failure modes
|
||||
|
||||
* during role creation via SQL role was created in the console but the connection was dropped before Postgres got acknowledgment or some error happened after acknowledgment (out of disk space, deadlock, etc):
|
||||
|
||||
in that case, Postgres won't have a role that exists in the console. Compute restart will heal it (due to the spec file). Also if the console allows repeated creation/deletion user can repeat the transaction.
|
||||
|
||||
|
||||
# Scalability
|
||||
|
||||
On my laptop, I can create 4200 roles per second. That corresponds to 363 million roles per day. Since each role creation ends up in the console database we can add some limit to the number of roles (could be reasonably big to not run into it often -- like 1k or 10k).
|
||||
@@ -71,11 +71,9 @@ pub struct ComputeMetrics {
|
||||
pub wait_for_spec_ms: u64,
|
||||
pub sync_safekeepers_ms: u64,
|
||||
pub basebackup_ms: u64,
|
||||
pub basebackup_bytes: u64,
|
||||
pub start_postgres_ms: u64,
|
||||
pub config_ms: u64,
|
||||
pub total_startup_ms: u64,
|
||||
pub load_libraries_ms: u64,
|
||||
}
|
||||
|
||||
/// Response of the `/computes/{compute_id}/spec` control-plane API.
|
||||
|
||||
@@ -60,9 +60,6 @@ pub struct ComputeSpec {
|
||||
/// If set, 'storage_auth_token' is used as the password to authenticate to
|
||||
/// the pageserver and safekeepers.
|
||||
pub storage_auth_token: Option<String>,
|
||||
|
||||
// list of prefixes to search for custom extensions in remote extension storage
|
||||
pub custom_extensions: Option<Vec<String>>,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
|
||||
@@ -184,20 +184,6 @@ pub enum GenericRemoteStorage {
|
||||
}
|
||||
|
||||
impl GenericRemoteStorage {
|
||||
// A function for listing all the files in a "directory"
|
||||
// Example:
|
||||
// list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
|
||||
pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.list_files(folder).await,
|
||||
Self::AwsS3(s) => s.list_files(folder).await,
|
||||
Self::Unreliable(s) => s.list_files(folder).await,
|
||||
}
|
||||
}
|
||||
|
||||
// lists common *prefixes*, if any of files
|
||||
// Example:
|
||||
// list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
|
||||
pub async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
@@ -209,6 +195,14 @@ impl GenericRemoteStorage {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.list_files(folder).await,
|
||||
Self::AwsS3(s) => s.list_files(folder).await,
|
||||
Self::Unreliable(s) => s.list_files(folder).await,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn upload(
|
||||
&self,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
future::Future,
|
||||
io::ErrorKind,
|
||||
path::{Path, PathBuf},
|
||||
pin::Pin,
|
||||
};
|
||||
@@ -344,14 +343,18 @@ impl RemoteStorage for LocalFs {
|
||||
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
||||
let file_path = path.with_base(&self.storage_root);
|
||||
match fs::remove_file(&file_path).await {
|
||||
Ok(()) => Ok(()),
|
||||
// The file doesn't exist. This shouldn't yield an error to mirror S3's behaviour.
|
||||
if !file_path.exists() {
|
||||
// See https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObject.html
|
||||
// > If there isn't a null version, Amazon S3 does not remove any objects but will still respond that the command was successful.
|
||||
Err(e) if e.kind() == ErrorKind::NotFound => Ok(()),
|
||||
Err(e) => Err(anyhow::anyhow!(e)),
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if !file_path.is_file() {
|
||||
anyhow::bail!("{file_path:?} is not a file");
|
||||
}
|
||||
Ok(fs::remove_file(file_path)
|
||||
.await
|
||||
.map_err(|e| anyhow::anyhow!(e))?)
|
||||
}
|
||||
|
||||
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
|
||||
|
||||
@@ -349,17 +349,10 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
/// See the doc for `RemoteStorage::list_files`
|
||||
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
let mut folder_name = folder
|
||||
let folder_name = folder
|
||||
.map(|p| self.relative_path_to_s3_object(p))
|
||||
.or_else(|| self.prefix_in_bucket.clone());
|
||||
|
||||
// remove leading "/" if one exists
|
||||
if let Some(folder_name_slash) = folder_name.clone() {
|
||||
if folder_name_slash.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
|
||||
folder_name = Some(folder_name_slash[1..].to_string());
|
||||
}
|
||||
}
|
||||
|
||||
// AWS may need to break the response into several parts
|
||||
let mut continuation_token = None;
|
||||
let mut all_files = vec![];
|
||||
|
||||
@@ -16,3 +16,4 @@ tracing-opentelemetry.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
pin-project-lite = "0.2"
|
||||
|
||||
72
libs/tracing-utils/src/instrument.rs
Normal file
72
libs/tracing-utils/src/instrument.rs
Normal file
@@ -0,0 +1,72 @@
|
||||
use std::{
|
||||
future::Future,
|
||||
pin::Pin,
|
||||
task::{Context, Poll},
|
||||
};
|
||||
|
||||
use pin_project_lite::pin_project;
|
||||
use tracing::{instrument::Instrumented, Level};
|
||||
|
||||
pub trait InstrumentCancel: Sized {
|
||||
type Inner;
|
||||
fn with_cancel_info(self) -> CancelLog<Self::Inner> {
|
||||
self.with_cancel_log(Level::INFO)
|
||||
}
|
||||
fn with_cancel_log(self, level: Level) -> CancelLog<Self::Inner>;
|
||||
}
|
||||
|
||||
impl<T: Sized> InstrumentCancel for Instrumented<T> {
|
||||
type Inner = T;
|
||||
fn with_cancel_log(self, level: Level) -> CancelLog<T> {
|
||||
CancelLog {
|
||||
inner: self,
|
||||
cancel_level: Some(level),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pin_project! {
|
||||
/// A [`Future`] that has been instrumented with a `tracing` [`Span`]. It will emit a log on cancellation
|
||||
///
|
||||
/// This type is returned by the [`Instrument`] extension trait. See that
|
||||
/// trait's documentation for details.
|
||||
///
|
||||
/// [`Future`]: std::future::Future
|
||||
/// [`Span`]: crate::Span
|
||||
#[derive(Debug, Clone)]
|
||||
#[must_use = "futures do nothing unless you `.await` or poll them"]
|
||||
pub struct CancelLog<T> {
|
||||
#[pin]
|
||||
inner: Instrumented<T>,
|
||||
cancel_level: Option<Level>,
|
||||
}
|
||||
|
||||
impl<T> PinnedDrop for CancelLog<T> {
|
||||
fn drop(this: Pin<&mut Self>) {
|
||||
let this = this.project();
|
||||
if let Some(level) = this.cancel_level.take() {
|
||||
let _enter = this.inner.span().enter();
|
||||
match level {
|
||||
Level::TRACE => tracing::event!(Level::TRACE, "task was cancelled"),
|
||||
Level::DEBUG => tracing::event!(Level::DEBUG, "task was cancelled"),
|
||||
Level::INFO => tracing::event!(Level::INFO, "task was cancelled"),
|
||||
Level::WARN => tracing::event!(Level::WARN, "task was cancelled"),
|
||||
Level::ERROR => tracing::event!(Level::ERROR, "task was cancelled"),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Future> Future for CancelLog<T> {
|
||||
type Output = T::Output;
|
||||
|
||||
fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
|
||||
let this = self.project();
|
||||
let res = this.inner.poll(cx);
|
||||
if res.is_ready() {
|
||||
*this.cancel_level = None;
|
||||
}
|
||||
res
|
||||
}
|
||||
}
|
||||
@@ -41,6 +41,7 @@ use opentelemetry_otlp::{OTEL_EXPORTER_OTLP_ENDPOINT, OTEL_EXPORTER_OTLP_TRACES_
|
||||
pub use tracing_opentelemetry::OpenTelemetryLayer;
|
||||
|
||||
pub mod http;
|
||||
pub mod instrument;
|
||||
|
||||
/// Set up OpenTelemetry exporter, using configuration from environment variables.
|
||||
///
|
||||
|
||||
@@ -40,8 +40,6 @@ pq_proto.workspace = true
|
||||
metrics.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
const_format.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
byteorder.workspace = true
|
||||
bytes.workspace = true
|
||||
|
||||
@@ -109,16 +109,10 @@ pub use failpoint_macro_helpers::failpoint_sleep_helper;
|
||||
/// * building in docker (either in CI or locally)
|
||||
///
|
||||
/// One thing to note is that .git is not available in docker (and it is bad to include it there).
|
||||
/// When building locally, the `git_version` is used to query .git. When building on CI and docker,
|
||||
/// we don't build the actual PR branch commits, but always a "phantom" would be merge commit to
|
||||
/// the target branch -- the actual PR commit from which we build from is supplied as GIT_VERSION
|
||||
/// environment variable.
|
||||
///
|
||||
/// We ended up with this compromise between phantom would be merge commits vs. pull request branch
|
||||
/// heads due to old logs becoming more reliable (github could gc the phantom merge commit
|
||||
/// anytime) in #4641.
|
||||
///
|
||||
/// To avoid running buildscript every recompilation, we use rerun-if-env-changed option.
|
||||
/// So everything becides docker build is covered by git_version crate, and docker uses a `GIT_VERSION` argument to get the value required.
|
||||
/// It takes variable from build process env and puts it to the rustc env. And then we can retrieve it here by using env! macro.
|
||||
/// Git version received from environment variable used as a fallback in git_version invocation.
|
||||
/// And to avoid running buildscript every recompilation, we use rerun-if-env-changed option.
|
||||
/// So the build script will be run only when GIT_VERSION envvar has changed.
|
||||
///
|
||||
/// Why not to use buildscript to get git commit sha directly without procmacro from different crate?
|
||||
@@ -138,28 +132,17 @@ pub use failpoint_macro_helpers::failpoint_sleep_helper;
|
||||
#[macro_export]
|
||||
macro_rules! project_git_version {
|
||||
($const_identifier:ident) => {
|
||||
// this should try GIT_VERSION first only then git_version::git_version!
|
||||
const $const_identifier: &::core::primitive::str = {
|
||||
const __COMMIT_FROM_GIT: &::core::primitive::str = git_version::git_version! {
|
||||
prefix = "",
|
||||
fallback = "unknown",
|
||||
args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha
|
||||
};
|
||||
|
||||
const __ARG: &[&::core::primitive::str; 2] = &match ::core::option_env!("GIT_VERSION") {
|
||||
::core::option::Option::Some(x) => ["git-env:", x],
|
||||
::core::option::Option::None => ["git:", __COMMIT_FROM_GIT],
|
||||
};
|
||||
|
||||
$crate::__const_format::concatcp!(__ARG[0], __ARG[1])
|
||||
};
|
||||
const $const_identifier: &str = git_version::git_version!(
|
||||
prefix = "git:",
|
||||
fallback = concat!(
|
||||
"git-env:",
|
||||
env!("GIT_VERSION", "Missing GIT_VERSION envvar")
|
||||
),
|
||||
args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha
|
||||
);
|
||||
};
|
||||
}
|
||||
|
||||
/// Re-export for `project_git_version` macro
|
||||
#[doc(hidden)]
|
||||
pub use const_format as __const_format;
|
||||
|
||||
/// Same as `assert!`, but evaluated during compilation and gets optimized out in runtime.
|
||||
#[macro_export]
|
||||
macro_rules! const_assert {
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use pin_project_lite::pin_project;
|
||||
use std::io::Read;
|
||||
use std::pin::Pin;
|
||||
use std::{io, task};
|
||||
use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};
|
||||
@@ -76,34 +75,3 @@ impl<S: AsyncWrite + Unpin, R, W: FnMut(usize)> AsyncWrite for MeasuredStream<S,
|
||||
self.project().stream.poll_shutdown(context)
|
||||
}
|
||||
}
|
||||
|
||||
/// Wrapper for a reader that counts bytes read.
|
||||
///
|
||||
/// Similar to MeasuredStream but it's one way and it's sync
|
||||
pub struct MeasuredReader<R: Read> {
|
||||
inner: R,
|
||||
byte_count: usize,
|
||||
}
|
||||
|
||||
impl<R: Read> MeasuredReader<R> {
|
||||
pub fn new(reader: R) -> Self {
|
||||
Self {
|
||||
inner: reader,
|
||||
byte_count: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_byte_count(&self) -> usize {
|
||||
self.byte_count
|
||||
}
|
||||
}
|
||||
|
||||
impl<R: Read> Read for MeasuredReader<R> {
|
||||
fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
|
||||
let result = self.inner.read(buf);
|
||||
if let Ok(n_bytes) = result {
|
||||
self.byte_count += n_bytes
|
||||
}
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,7 +12,6 @@ testing = ["fail/failpoints"]
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
async-compression.workspace = true
|
||||
async-stream.workspace = true
|
||||
async-trait.workspace = true
|
||||
byteorder.workspace = true
|
||||
@@ -25,7 +24,6 @@ consumption_metrics.workspace = true
|
||||
crc32c.workspace = true
|
||||
crossbeam-utils.workspace = true
|
||||
either.workspace = true
|
||||
flate2.workspace = true
|
||||
fail.workspace = true
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use pageserver::keyspace::{KeyPartitioning, KeySpace};
|
||||
use pageserver::repository::Key;
|
||||
use pageserver::tenant::layer_map::LayerMap;
|
||||
use pageserver::tenant::storage_layer::LayerFileName;
|
||||
use pageserver::tenant::storage_layer::PersistentLayerDesc;
|
||||
use pageserver::tenant::storage_layer::{tests::LayerDescriptor, Layer, LayerFileName};
|
||||
use pageserver::tenant::storage_layer::{PersistentLayer, PersistentLayerDesc};
|
||||
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
|
||||
use std::cmp::{max, min};
|
||||
use std::fs::File;
|
||||
@@ -28,13 +28,13 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
|
||||
for fname in filenames {
|
||||
let fname = fname.unwrap();
|
||||
let fname = LayerFileName::from_str(&fname).unwrap();
|
||||
let layer = PersistentLayerDesc::from(fname);
|
||||
let layer = LayerDescriptor::from(fname);
|
||||
|
||||
let lsn_range = layer.get_lsn_range();
|
||||
min_lsn = min(min_lsn, lsn_range.start);
|
||||
max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1));
|
||||
|
||||
updates.insert_historic(layer);
|
||||
updates.insert_historic(layer.layer_desc().clone());
|
||||
}
|
||||
|
||||
println!("min: {min_lsn}, max: {max_lsn}");
|
||||
@@ -210,15 +210,15 @@ fn bench_sequential(c: &mut Criterion) {
|
||||
for i in 0..100_000 {
|
||||
let i32 = (i as u32) % 100;
|
||||
let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
|
||||
let layer = PersistentLayerDesc::new_img(
|
||||
let layer = LayerDescriptor::from(PersistentLayerDesc::new_img(
|
||||
TenantId::generate(),
|
||||
TimelineId::generate(),
|
||||
zero.add(10 * i32)..zero.add(10 * i32 + 1),
|
||||
Lsn(i),
|
||||
false,
|
||||
0,
|
||||
);
|
||||
updates.insert_historic(layer);
|
||||
));
|
||||
updates.insert_historic(layer.layer_desc().clone());
|
||||
}
|
||||
updates.flush();
|
||||
println!("Finished layer map init in {:?}", now.elapsed());
|
||||
|
||||
@@ -117,8 +117,7 @@ pub fn main() -> Result<()> {
|
||||
|
||||
let mut lsn_diff = (lsn_end - lsn_start) as f32;
|
||||
let mut fill = Fill::None;
|
||||
let mut ymargin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas
|
||||
let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas
|
||||
let mut margin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas
|
||||
let mut lsn_offset = 0.0;
|
||||
|
||||
// Fill in and thicken rectangle if it's an
|
||||
@@ -129,7 +128,7 @@ pub fn main() -> Result<()> {
|
||||
num_images += 1;
|
||||
lsn_diff = 0.3;
|
||||
lsn_offset = -lsn_diff / 2.0;
|
||||
ymargin = 0.05;
|
||||
margin = 0.05;
|
||||
fill = Fill::Color(rgb(0, 0, 0));
|
||||
}
|
||||
Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end),
|
||||
@@ -138,10 +137,10 @@ pub fn main() -> Result<()> {
|
||||
println!(
|
||||
" {}",
|
||||
rectangle(
|
||||
key_start as f32 + stretch * xmargin,
|
||||
stretch * (lsn_max as f32 - (lsn_end as f32 - ymargin - lsn_offset)),
|
||||
key_diff as f32 - stretch * 2.0 * xmargin,
|
||||
stretch * (lsn_diff - 2.0 * ymargin)
|
||||
key_start as f32 + stretch * margin,
|
||||
stretch * (lsn_max as f32 - (lsn_end as f32 - margin - lsn_offset)),
|
||||
key_diff as f32 - stretch * 2.0 * margin,
|
||||
stretch * (lsn_diff - 2.0 * margin)
|
||||
)
|
||||
.fill(fill)
|
||||
.stroke(Stroke::Color(rgb(0, 0, 0), 0.1))
|
||||
|
||||
@@ -305,7 +305,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
let now = SystemTime::now();
|
||||
for (i, (partition, candidate)) in candidates.iter().enumerate() {
|
||||
debug!(
|
||||
"cand {}/{}: size={}, no_access_for={}us, partition={:?}, {}/{}/{}",
|
||||
"cand {}/{}: size={}, no_access_for={}us, parition={:?}, tenant={} timeline={} layer={}",
|
||||
i + 1,
|
||||
candidates.len(),
|
||||
candidate.layer.file_size(),
|
||||
|
||||
@@ -346,7 +346,7 @@ async fn timeline_create_handler(
|
||||
Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
|
||||
}
|
||||
}
|
||||
.instrument(info_span!("timeline_create", %tenant_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
|
||||
.instrument(info_span!("timeline_create", tenant = %tenant_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -381,7 +381,7 @@ async fn timeline_list_handler(
|
||||
}
|
||||
Ok::<Vec<TimelineInfo>, ApiError>(response_data)
|
||||
}
|
||||
.instrument(info_span!("timeline_list", %tenant_id))
|
||||
.instrument(info_span!("timeline_list", tenant = %tenant_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, response_data)
|
||||
@@ -418,7 +418,7 @@ async fn timeline_detail_handler(
|
||||
|
||||
Ok::<_, ApiError>(timeline_info)
|
||||
}
|
||||
.instrument(info_span!("timeline_detail", %tenant_id, %timeline_id))
|
||||
.instrument(info_span!("timeline_detail", tenant = %tenant_id, timeline = %timeline_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, timeline_info)
|
||||
@@ -479,7 +479,7 @@ async fn tenant_attach_handler(
|
||||
remote_storage.clone(),
|
||||
&ctx,
|
||||
)
|
||||
.instrument(info_span!("tenant_attach", %tenant_id))
|
||||
.instrument(info_span!("tenant_attach", tenant = %tenant_id))
|
||||
.await?;
|
||||
} else {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
@@ -501,7 +501,7 @@ async fn timeline_delete_handler(
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||
|
||||
mgr::delete_timeline(tenant_id, timeline_id, &ctx)
|
||||
.instrument(info_span!("timeline_delete", %tenant_id, %timeline_id))
|
||||
.instrument(info_span!("timeline_delete", tenant = %tenant_id, timeline = %timeline_id))
|
||||
.await?;
|
||||
|
||||
// FIXME: needs to be an error for console to retry it. Ideally Accepted should be used and retried until 404.
|
||||
@@ -519,7 +519,7 @@ async fn tenant_detach_handler(
|
||||
let state = get_state(&request);
|
||||
let conf = state.conf;
|
||||
mgr::detach_tenant(conf, tenant_id, detach_ignored.unwrap_or(false))
|
||||
.instrument(info_span!("tenant_detach", %tenant_id))
|
||||
.instrument(info_span!("tenant_detach", tenant = %tenant_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
@@ -542,7 +542,7 @@ async fn tenant_load_handler(
|
||||
state.remote_storage.clone(),
|
||||
&ctx,
|
||||
)
|
||||
.instrument(info_span!("load", %tenant_id))
|
||||
.instrument(info_span!("load", tenant = %tenant_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
@@ -558,7 +558,7 @@ async fn tenant_ignore_handler(
|
||||
let state = get_state(&request);
|
||||
let conf = state.conf;
|
||||
mgr::ignore_tenant(conf, tenant_id)
|
||||
.instrument(info_span!("ignore_tenant", %tenant_id))
|
||||
.instrument(info_span!("ignore_tenant", tenant = %tenant_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
@@ -611,7 +611,7 @@ async fn tenant_status(
|
||||
attachment_status: state.attachment_status(),
|
||||
})
|
||||
}
|
||||
.instrument(info_span!("tenant_status_handler", %tenant_id))
|
||||
.instrument(info_span!("tenant_status_handler", tenant = %tenant_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, tenant_info)
|
||||
@@ -850,7 +850,7 @@ async fn tenant_create_handler(
|
||||
state.remote_storage.clone(),
|
||||
&ctx,
|
||||
)
|
||||
.instrument(info_span!("tenant_create", tenant_id = %target_tenant_id))
|
||||
.instrument(info_span!("tenant_create", tenant = ?target_tenant_id))
|
||||
.await?;
|
||||
|
||||
// We created the tenant. Existing API semantics are that the tenant
|
||||
@@ -912,7 +912,7 @@ async fn update_tenant_config_handler(
|
||||
|
||||
let state = get_state(&request);
|
||||
mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id)
|
||||
.instrument(info_span!("tenant_config", %tenant_id))
|
||||
.instrument(info_span!("tenant_config", tenant = ?tenant_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
|
||||
@@ -10,7 +10,6 @@
|
||||
//
|
||||
|
||||
use anyhow::Context;
|
||||
use async_compression::tokio::write::GzipEncoder;
|
||||
use bytes::Buf;
|
||||
use bytes::Bytes;
|
||||
use futures::Stream;
|
||||
@@ -32,10 +31,8 @@ use std::str;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio_util::io::StreamReader;
|
||||
use tracing::field;
|
||||
use tracing::*;
|
||||
use utils::id::ConnectionId;
|
||||
use utils::{
|
||||
@@ -54,7 +51,6 @@ use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant;
|
||||
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::mgr;
|
||||
use crate::tenant::mgr::GetTenantError;
|
||||
use crate::tenant::{Tenant, Timeline};
|
||||
@@ -242,7 +238,6 @@ pub async fn libpq_listener_main(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(peer_addr))]
|
||||
async fn page_service_conn_main(
|
||||
conf: &'static PageServerConf,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
@@ -265,7 +260,6 @@ async fn page_service_conn_main(
|
||||
.context("could not set TCP_NODELAY")?;
|
||||
|
||||
let peer_addr = socket.peer_addr().context("get peer address")?;
|
||||
tracing::Span::current().record("peer_addr", field::display(peer_addr));
|
||||
|
||||
// setup read timeout of 10 minutes. the timeout is rather arbitrary for requirements:
|
||||
// - long enough for most valid compute connections
|
||||
@@ -368,7 +362,7 @@ impl PageServerHandler {
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
#[instrument(skip(self, pgb, ctx))]
|
||||
async fn handle_pagerequests<IO>(
|
||||
&self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
@@ -379,8 +373,6 @@ impl PageServerHandler {
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
// NOTE: pagerequests handler exits when connection is closed,
|
||||
// so there is no need to reset the association
|
||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
||||
@@ -481,7 +473,7 @@ impl PageServerHandler {
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[instrument(skip_all, fields(%base_lsn, end_lsn=%_end_lsn, %pg_version))]
|
||||
#[instrument(skip(self, pgb, ctx))]
|
||||
async fn handle_import_basebackup<IO>(
|
||||
&self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
@@ -495,8 +487,6 @@ impl PageServerHandler {
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
||||
// Create empty timeline
|
||||
info!("creating new timeline");
|
||||
@@ -541,7 +531,7 @@ impl PageServerHandler {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(%start_lsn, %end_lsn))]
|
||||
#[instrument(skip(self, pgb, ctx))]
|
||||
async fn handle_import_wal<IO>(
|
||||
&self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
@@ -554,7 +544,6 @@ impl PageServerHandler {
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
||||
|
||||
let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
|
||||
@@ -749,7 +738,7 @@ impl PageServerHandler {
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[instrument(skip_all, fields(?lsn, ?prev_lsn, %full_backup))]
|
||||
#[instrument(skip(self, pgb, ctx))]
|
||||
async fn handle_basebackup_request<IO>(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
@@ -758,14 +747,11 @@ impl PageServerHandler {
|
||||
lsn: Option<Lsn>,
|
||||
prev_lsn: Option<Lsn>,
|
||||
full_backup: bool,
|
||||
gzip: bool,
|
||||
ctx: RequestContext,
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
let started = std::time::Instant::now();
|
||||
|
||||
// check that the timeline exists
|
||||
@@ -786,9 +772,8 @@ impl PageServerHandler {
|
||||
pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
|
||||
pgb.flush().await?;
|
||||
|
||||
// Send a tarball of the latest layer on the timeline. Compress if not
|
||||
// fullbackup. TODO Compress in that case too (tests need to be updated)
|
||||
if full_backup {
|
||||
// Send a tarball of the latest layer on the timeline
|
||||
{
|
||||
let mut writer = pgb.copyout_writer();
|
||||
basebackup::send_basebackup_tarball(
|
||||
&mut writer,
|
||||
@@ -799,40 +784,6 @@ impl PageServerHandler {
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
} else {
|
||||
let mut writer = pgb.copyout_writer();
|
||||
if gzip {
|
||||
let mut encoder = GzipEncoder::with_quality(
|
||||
writer,
|
||||
// NOTE using fast compression because it's on the critical path
|
||||
// for compute startup. For an empty database, we get
|
||||
// <100KB with this method. The Level::Best compression method
|
||||
// gives us <20KB, but maybe we should add basebackup caching
|
||||
// on compute shutdown first.
|
||||
async_compression::Level::Fastest,
|
||||
);
|
||||
basebackup::send_basebackup_tarball(
|
||||
&mut encoder,
|
||||
&timeline,
|
||||
lsn,
|
||||
prev_lsn,
|
||||
full_backup,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
// shutdown the encoder to ensure the gzip footer is written
|
||||
encoder.shutdown().await?;
|
||||
} else {
|
||||
basebackup::send_basebackup_tarball(
|
||||
&mut writer,
|
||||
&timeline,
|
||||
lsn,
|
||||
prev_lsn,
|
||||
full_backup,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::CopyDone)?;
|
||||
@@ -911,7 +862,6 @@ where
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id, timeline_id))]
|
||||
async fn process_query(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
@@ -933,10 +883,6 @@ where
|
||||
let timeline_id = TimelineId::from_str(params[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
self.handle_pagerequests(pgb, tenant_id, timeline_id, ctx)
|
||||
@@ -956,10 +902,6 @@ where
|
||||
let timeline_id = TimelineId::from_str(params[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
let lsn = if params.len() >= 3 {
|
||||
@@ -971,19 +913,6 @@ where
|
||||
None
|
||||
};
|
||||
|
||||
let gzip = if params.len() >= 4 {
|
||||
if params[3] == "--gzip" {
|
||||
true
|
||||
} else {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"Parameter in position 3 unknown {}",
|
||||
params[3],
|
||||
)));
|
||||
}
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
metrics::metric_vec_duration::observe_async_block_duration_by_result(
|
||||
&*crate::metrics::BASEBACKUP_QUERY_TIME,
|
||||
async move {
|
||||
@@ -994,7 +923,6 @@ where
|
||||
lsn,
|
||||
None,
|
||||
false,
|
||||
gzip,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -1020,10 +948,6 @@ where
|
||||
let timeline_id = TimelineId::from_str(params[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
|
||||
|
||||
@@ -1055,10 +979,6 @@ where
|
||||
let timeline_id = TimelineId::from_str(params[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
// The caller is responsible for providing correct lsn and prev_lsn.
|
||||
let lsn = if params.len() > 2 {
|
||||
Some(
|
||||
@@ -1080,17 +1000,8 @@ where
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
// Check that the timeline exists
|
||||
self.handle_basebackup_request(
|
||||
pgb,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
prev_lsn,
|
||||
true,
|
||||
false,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, prev_lsn, true, ctx)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.starts_with("import basebackup ") {
|
||||
// Import the `base` section (everything but the wal) of a basebackup.
|
||||
@@ -1122,10 +1033,6 @@ where
|
||||
let pg_version = u32::from_str(params[4])
|
||||
.with_context(|| format!("Failed to parse pg_version from {}", params[4]))?;
|
||||
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
match self
|
||||
@@ -1170,10 +1077,6 @@ where
|
||||
let end_lsn = Lsn::from_str(params[3])
|
||||
.with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
|
||||
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
match self
|
||||
@@ -1205,8 +1108,6 @@ where
|
||||
let tenant_id = TenantId::from_str(params[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
|
||||
|
||||
tracing::Span::current().record("tenant_id", field::display(tenant_id));
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
|
||||
|
||||
@@ -84,25 +84,6 @@ use utils::{
|
||||
lsn::{Lsn, RecordLsn},
|
||||
};
|
||||
|
||||
/// Declare a failpoint that can use the `pause` failpoint action.
|
||||
/// We don't want to block the executor thread, hence, spawn_blocking + await.
|
||||
macro_rules! pausable_failpoint {
|
||||
($name:literal) => {
|
||||
if cfg!(feature = "testing") {
|
||||
tokio::task::spawn_blocking({
|
||||
let current = tracing::Span::current();
|
||||
move || {
|
||||
let _entered = current.entered();
|
||||
tracing::info!("at failpoint {}", $name);
|
||||
fail::fail_point!($name);
|
||||
}
|
||||
})
|
||||
.await
|
||||
.expect("spawn_blocking");
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
pub mod blob_io;
|
||||
pub mod block_io;
|
||||
pub mod disk_btree;
|
||||
@@ -429,7 +410,7 @@ impl Tenant {
|
||||
.layers
|
||||
.read()
|
||||
.await
|
||||
.layer_map()
|
||||
.0
|
||||
.iter_historic_layers()
|
||||
.next()
|
||||
.is_some(),
|
||||
@@ -579,7 +560,7 @@ impl Tenant {
|
||||
.map(move |res| {
|
||||
res.with_context(|| format!("download index part for timeline {timeline_id}"))
|
||||
})
|
||||
.instrument(info_span!("download_index_part", %timeline_id)),
|
||||
.instrument(info_span!("download_index_part", timeline=%timeline_id)),
|
||||
);
|
||||
}
|
||||
// Wait for all the download tasks to complete & collect results.
|
||||
@@ -1368,7 +1349,7 @@ impl Tenant {
|
||||
for (timeline_id, timeline) in &timelines_to_compact {
|
||||
timeline
|
||||
.compact(ctx)
|
||||
.instrument(info_span!("compact_timeline", %timeline_id))
|
||||
.instrument(info_span!("compact_timeline", timeline = %timeline_id))
|
||||
.await?;
|
||||
}
|
||||
|
||||
@@ -1517,7 +1498,20 @@ impl Tenant {
|
||||
remote_client.delete_all().await.context("delete_all")?
|
||||
};
|
||||
|
||||
pausable_failpoint!("in_progress_delete");
|
||||
// Have a failpoint that can use the `pause` failpoint action.
|
||||
// We don't want to block the executor thread, hence, spawn_blocking + await.
|
||||
if cfg!(feature = "testing") {
|
||||
tokio::task::spawn_blocking({
|
||||
let current = tracing::Span::current();
|
||||
move || {
|
||||
let _entered = current.entered();
|
||||
tracing::info!("at failpoint in_progress_delete");
|
||||
fail::fail_point!("in_progress_delete");
|
||||
}
|
||||
})
|
||||
.await
|
||||
.expect("spawn_blocking");
|
||||
}
|
||||
|
||||
{
|
||||
// Remove the timeline from the map.
|
||||
|
||||
@@ -651,35 +651,19 @@ impl LayerMap {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::LayerMap;
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
use crate::tenant::storage_layer::{tests::LayerDescriptor, LayerFileName};
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
|
||||
mod l0_delta_layers_updated {
|
||||
|
||||
use crate::tenant::{
|
||||
storage_layer::{AsLayerDesc, PersistentLayerDesc},
|
||||
timeline::layer_manager::LayerFileManager,
|
||||
storage_layer::{PersistentLayer, PersistentLayerDesc},
|
||||
timeline::LayerFileManager,
|
||||
};
|
||||
|
||||
use super::*;
|
||||
|
||||
struct LayerObject(PersistentLayerDesc);
|
||||
|
||||
impl AsLayerDesc for LayerObject {
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl LayerObject {
|
||||
fn new(desc: PersistentLayerDesc) -> Self {
|
||||
LayerObject(desc)
|
||||
}
|
||||
}
|
||||
|
||||
type TestLayerFileManager = LayerFileManager<LayerObject>;
|
||||
|
||||
#[test]
|
||||
fn for_full_range_delta() {
|
||||
// l0_delta_layers are used by compaction, and should observe all buffered updates
|
||||
@@ -716,18 +700,18 @@ mod tests {
|
||||
|
||||
let layer = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69";
|
||||
let layer = LayerFileName::from_str(layer).unwrap();
|
||||
let layer = PersistentLayerDesc::from(layer);
|
||||
let layer = LayerDescriptor::from(layer);
|
||||
|
||||
// same skeletan construction; see scenario below
|
||||
let not_found = Arc::new(LayerObject::new(layer.clone()));
|
||||
let new_version = Arc::new(LayerObject::new(layer));
|
||||
let not_found = Arc::new(layer.clone());
|
||||
let new_version = Arc::new(layer);
|
||||
|
||||
// after the immutable storage state refactor, the replace operation
|
||||
// will not use layer map any more. We keep it here for consistency in test cases
|
||||
// and can remove it in the future.
|
||||
let _map = LayerMap::default();
|
||||
|
||||
let mut mapping = TestLayerFileManager::new();
|
||||
let mut mapping = LayerFileManager::new();
|
||||
|
||||
mapping
|
||||
.replace_and_verify(not_found, new_version)
|
||||
@@ -736,10 +720,10 @@ mod tests {
|
||||
|
||||
fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) {
|
||||
let name = LayerFileName::from_str(layer_name).unwrap();
|
||||
let skeleton = PersistentLayerDesc::from(name);
|
||||
let skeleton = LayerDescriptor::from(name);
|
||||
|
||||
let remote = Arc::new(LayerObject::new(skeleton.clone()));
|
||||
let downloaded = Arc::new(LayerObject::new(skeleton));
|
||||
let remote = Arc::new(skeleton.clone());
|
||||
let downloaded = Arc::new(skeleton);
|
||||
|
||||
let mut map = LayerMap::default();
|
||||
let mut mapping = LayerFileManager::new();
|
||||
|
||||
@@ -690,7 +690,7 @@ pub async fn immediate_gc(
|
||||
fail::fail_point!("immediate_gc_task_pre");
|
||||
let result = tenant
|
||||
.gc_iteration(Some(timeline_id), gc_horizon, pitr, &ctx)
|
||||
.instrument(info_span!("manual_gc", %tenant_id, %timeline_id))
|
||||
.instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))
|
||||
.await;
|
||||
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
|
||||
// better once the types support it.
|
||||
@@ -740,7 +740,9 @@ pub async fn immediate_compact(
|
||||
async move {
|
||||
let result = timeline
|
||||
.compact(&ctx)
|
||||
.instrument(info_span!("manual_compact", %tenant_id, %timeline_id))
|
||||
.instrument(
|
||||
info_span!("manual_compact", tenant = %tenant_id, timeline = %timeline_id),
|
||||
)
|
||||
.await;
|
||||
|
||||
match task_done.send(result) {
|
||||
|
||||
@@ -748,8 +748,20 @@ impl RemoteTimelineClient {
|
||||
stopped.deleted_at = SetDeletedFlagProgress::NotRunning;
|
||||
});
|
||||
|
||||
pausable_failpoint!("persist_deleted_index_part");
|
||||
|
||||
// Have a failpoint that can use the `pause` failpoint action.
|
||||
// We don't want to block the executor thread, hence, spawn_blocking + await.
|
||||
if cfg!(feature = "testing") {
|
||||
tokio::task::spawn_blocking({
|
||||
let current = tracing::Span::current();
|
||||
move || {
|
||||
let _entered = current.entered();
|
||||
tracing::info!("at failpoint persist_deleted_index_part");
|
||||
fail::fail_point!("persist_deleted_index_part");
|
||||
}
|
||||
})
|
||||
.await
|
||||
.expect("spawn_blocking");
|
||||
}
|
||||
upload::upload_index_part(
|
||||
self.conf,
|
||||
&self.storage_impl,
|
||||
@@ -921,11 +933,11 @@ impl RemoteTimelineClient {
|
||||
|
||||
// Assign unique ID to this task
|
||||
upload_queue.task_counter += 1;
|
||||
let upload_task_id = upload_queue.task_counter;
|
||||
let task_id = upload_queue.task_counter;
|
||||
|
||||
// Add it to the in-progress map
|
||||
let task = Arc::new(UploadTask {
|
||||
task_id: upload_task_id,
|
||||
task_id,
|
||||
op: next_op,
|
||||
retries: AtomicU32::new(0),
|
||||
});
|
||||
@@ -935,8 +947,6 @@ impl RemoteTimelineClient {
|
||||
|
||||
// Spawn task to perform the task
|
||||
let self_rc = Arc::clone(self);
|
||||
let tenant_id = self.tenant_id;
|
||||
let timeline_id = self.timeline_id;
|
||||
task_mgr::spawn(
|
||||
self.runtime.handle(),
|
||||
TaskKind::RemoteUploadTask,
|
||||
@@ -948,7 +958,7 @@ impl RemoteTimelineClient {
|
||||
self_rc.perform_upload_task(task).await;
|
||||
Ok(())
|
||||
}
|
||||
.instrument(info_span!(parent: None, "remote_upload", %tenant_id, %timeline_id, %upload_task_id)),
|
||||
.instrument(info_span!(parent: None, "remote_upload", tenant = %self.tenant_id, timeline = %self.timeline_id, upload_task_id = %task_id)),
|
||||
);
|
||||
|
||||
// Loop back to process next task
|
||||
|
||||
@@ -19,10 +19,9 @@ pub(super) async fn delete_layer<'a>(
|
||||
|
||||
let path_to_delete = conf.remote_path(local_layer_path)?;
|
||||
|
||||
// We don't want to print an error if the delete failed if the file has
|
||||
// already been deleted. Thankfully, in this situation S3 already
|
||||
// does not yield an error. While OS-provided local file system APIs do yield
|
||||
// errors, we avoid them in the `LocalFs` wrapper.
|
||||
// XXX: If the deletion fails because the object already didn't exist,
|
||||
// it would be good to just issue a warning but consider it success.
|
||||
// https://github.com/neondatabase/neon/issues/2934
|
||||
storage.delete(&path_to_delete).await.with_context(|| {
|
||||
format!("Failed to delete remote layer from storage at {path_to_delete:?}")
|
||||
})
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use fail::fail_point;
|
||||
use std::{io::ErrorKind, path::Path};
|
||||
use std::path::Path;
|
||||
use tokio::fs;
|
||||
|
||||
use crate::{config::PageServerConf, tenant::remote_timeline_client::index::IndexPart};
|
||||
@@ -11,8 +11,6 @@ use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::index::LayerFileMetadata;
|
||||
|
||||
use tracing::info;
|
||||
|
||||
/// Serializes and uploads the given index part data to the remote storage.
|
||||
pub(super) async fn upload_index_part<'a>(
|
||||
conf: &'static PageServerConf,
|
||||
@@ -58,22 +56,9 @@ pub(super) async fn upload_timeline_layer<'a>(
|
||||
});
|
||||
let storage_path = conf.remote_path(source_path)?;
|
||||
|
||||
let source_file_res = fs::File::open(&source_path).await;
|
||||
let source_file = match source_file_res {
|
||||
Ok(source_file) => source_file,
|
||||
Err(e) if e.kind() == ErrorKind::NotFound => {
|
||||
// In some situations we might run into the underlying file being deleted by
|
||||
// e.g. compaction before the uploader gets to it. In that instance, we don't
|
||||
// want to retry the error: a deleted file won't come back. In theory, the
|
||||
// file might not have been written in the first place, which also indicates
|
||||
// a bug. Still log the situation so that we can keep an eye on it.
|
||||
// See https://github.com/neondatabase/neon/issues/4526
|
||||
info!(path = %source_path.display(), "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
|
||||
return Ok(());
|
||||
}
|
||||
Err(e) => Err(e)
|
||||
.with_context(|| format!("Failed to open a source file for layer {source_path:?}"))?,
|
||||
};
|
||||
let source_file = fs::File::open(&source_path)
|
||||
.await
|
||||
.with_context(|| format!("Failed to open a source file for layer {source_path:?}"))?;
|
||||
|
||||
let fs_size = source_file
|
||||
.metadata()
|
||||
|
||||
@@ -5,8 +5,8 @@ use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};
|
||||
pub(crate) fn debug_assert_current_span_has_tenant_id() {}
|
||||
|
||||
#[cfg(debug_assertions)]
|
||||
pub(crate) static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<MultiNameExtractor<1>> =
|
||||
once_cell::sync::Lazy::new(|| MultiNameExtractor::new("TenantId", ["tenant_id"]));
|
||||
pub(crate) static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<MultiNameExtractor<2>> =
|
||||
once_cell::sync::Lazy::new(|| MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"]));
|
||||
|
||||
#[cfg(debug_assertions)]
|
||||
#[track_caller]
|
||||
|
||||
@@ -41,7 +41,7 @@ pub use inmemory_layer::InMemoryLayer;
|
||||
pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
|
||||
pub use remote_layer::RemoteLayer;
|
||||
|
||||
use super::timeline::layer_manager::LayerManager;
|
||||
use super::layer_map::BatchedUpdates;
|
||||
|
||||
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
|
||||
where
|
||||
@@ -170,7 +170,7 @@ impl LayerAccessStats {
|
||||
///
|
||||
/// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
|
||||
pub(crate) fn for_loading_layer(
|
||||
layer_map_lock_held_witness: &LayerManager,
|
||||
layer_map_lock_held_witness: &BatchedUpdates<'_>,
|
||||
status: LayerResidenceStatus,
|
||||
) -> Self {
|
||||
let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
|
||||
@@ -189,7 +189,7 @@ impl LayerAccessStats {
|
||||
/// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
|
||||
pub(crate) fn clone_for_residence_change(
|
||||
&self,
|
||||
layer_map_lock_held_witness: &LayerManager,
|
||||
layer_map_lock_held_witness: &BatchedUpdates<'_>,
|
||||
new_status: LayerResidenceStatus,
|
||||
) -> LayerAccessStats {
|
||||
let clone = {
|
||||
@@ -221,7 +221,7 @@ impl LayerAccessStats {
|
||||
///
|
||||
pub(crate) fn record_residence_event(
|
||||
&self,
|
||||
_layer_map_lock_held_witness: &LayerManager,
|
||||
_layer_map_lock_held_witness: &BatchedUpdates<'_>,
|
||||
status: LayerResidenceStatus,
|
||||
reason: LayerResidenceEventReason,
|
||||
) {
|
||||
@@ -376,12 +376,6 @@ pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i
|
||||
/// Returned by [`Layer::key_iter`]
|
||||
pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i + Send>;
|
||||
|
||||
/// Get a layer descriptor from a layer.
|
||||
pub trait AsLayerDesc {
|
||||
/// Get the layer descriptor.
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc;
|
||||
}
|
||||
|
||||
/// A Layer contains all data in a "rectangle" consisting of a range of keys and
|
||||
/// range of LSNs.
|
||||
///
|
||||
@@ -395,8 +389,10 @@ pub trait AsLayerDesc {
|
||||
/// A delta layer contains all modifications within a range of LSNs and keys.
|
||||
/// An image layer is a snapshot of all the data in a key-range, at a single
|
||||
/// LSN.
|
||||
pub trait PersistentLayer: Layer + AsLayerDesc {
|
||||
/// Identify the tenant this layer belongs to
|
||||
pub trait PersistentLayer: Layer {
|
||||
/// Get the layer descriptor.
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc;
|
||||
|
||||
fn get_tenant_id(&self) -> TenantId {
|
||||
self.layer_desc().tenant_id
|
||||
}
|
||||
@@ -462,32 +458,119 @@ pub fn downcast_remote_layer(
|
||||
pub mod tests {
|
||||
use super::*;
|
||||
|
||||
impl From<DeltaFileName> for PersistentLayerDesc {
|
||||
/// Holds metadata about a layer without any content. Used mostly for testing.
|
||||
///
|
||||
/// To use filenames as fixtures, parse them as [`LayerFileName`] then convert from that to a
|
||||
/// LayerDescriptor.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct LayerDescriptor {
|
||||
base: PersistentLayerDesc,
|
||||
}
|
||||
|
||||
impl From<PersistentLayerDesc> for LayerDescriptor {
|
||||
fn from(base: PersistentLayerDesc) -> Self {
|
||||
Self { base }
|
||||
}
|
||||
}
|
||||
|
||||
impl Layer for LayerDescriptor {
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
_key: Key,
|
||||
_lsn_range: Range<Lsn>,
|
||||
_reconstruct_data: &mut ValueReconstructState,
|
||||
_ctx: &RequestContext,
|
||||
) -> Result<ValueReconstructResult> {
|
||||
todo!("This method shouldn't be part of the Layer trait")
|
||||
}
|
||||
|
||||
fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
todo!()
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
self.layer_desc().key_range.clone()
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
self.layer_desc().lsn_range.clone()
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn is_incremental(&self) -> bool {
|
||||
self.layer_desc().is_incremental
|
||||
}
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
impl std::fmt::Display for LayerDescriptor {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.layer_desc().short_id())
|
||||
}
|
||||
}
|
||||
|
||||
impl PersistentLayer for LayerDescriptor {
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc {
|
||||
&self.base
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
fn iter(&self, _: &RequestContext) -> Result<LayerIter<'_>> {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
fn key_iter(&self, _: &RequestContext) -> Result<LayerKeyIter<'_>> {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
fn info(&self, _: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
fn access_stats(&self) -> &LayerAccessStats {
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<DeltaFileName> for LayerDescriptor {
|
||||
fn from(value: DeltaFileName) -> Self {
|
||||
PersistentLayerDesc::new_delta(
|
||||
TenantId::from_array([0; 16]),
|
||||
TimelineId::from_array([0; 16]),
|
||||
value.key_range,
|
||||
value.lsn_range,
|
||||
233,
|
||||
)
|
||||
LayerDescriptor {
|
||||
base: PersistentLayerDesc::new_delta(
|
||||
TenantId::from_array([0; 16]),
|
||||
TimelineId::from_array([0; 16]),
|
||||
value.key_range,
|
||||
value.lsn_range,
|
||||
233,
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<ImageFileName> for PersistentLayerDesc {
|
||||
impl From<ImageFileName> for LayerDescriptor {
|
||||
fn from(value: ImageFileName) -> Self {
|
||||
PersistentLayerDesc::new_img(
|
||||
TenantId::from_array([0; 16]),
|
||||
TimelineId::from_array([0; 16]),
|
||||
value.key_range,
|
||||
value.lsn,
|
||||
false,
|
||||
233,
|
||||
)
|
||||
LayerDescriptor {
|
||||
base: PersistentLayerDesc::new_img(
|
||||
TenantId::from_array([0; 16]),
|
||||
TimelineId::from_array([0; 16]),
|
||||
value.key_range,
|
||||
value.lsn,
|
||||
false,
|
||||
233,
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<LayerFileName> for PersistentLayerDesc {
|
||||
impl From<LayerFileName> for LayerDescriptor {
|
||||
fn from(value: LayerFileName) -> Self {
|
||||
match value {
|
||||
LayerFileName::Delta(d) => Self::from(d),
|
||||
|
||||
@@ -56,8 +56,8 @@ use utils::{
|
||||
};
|
||||
|
||||
use super::{
|
||||
AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerIter,
|
||||
LayerKeyIter, PathOrConf, PersistentLayerDesc,
|
||||
DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter,
|
||||
PathOrConf, PersistentLayerDesc,
|
||||
};
|
||||
|
||||
///
|
||||
@@ -403,13 +403,11 @@ impl std::fmt::Display for DeltaLayer {
|
||||
}
|
||||
}
|
||||
|
||||
impl AsLayerDesc for DeltaLayer {
|
||||
impl PersistentLayer for DeltaLayer {
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc {
|
||||
&self.desc
|
||||
}
|
||||
}
|
||||
|
||||
impl PersistentLayer for DeltaLayer {
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
Some(self.path())
|
||||
}
|
||||
|
||||
@@ -53,9 +53,7 @@ use utils::{
|
||||
};
|
||||
|
||||
use super::filename::ImageFileName;
|
||||
use super::{
|
||||
AsLayerDesc, Layer, LayerAccessStatsReset, LayerIter, PathOrConf, PersistentLayerDesc,
|
||||
};
|
||||
use super::{Layer, LayerAccessStatsReset, LayerIter, PathOrConf, PersistentLayerDesc};
|
||||
|
||||
///
|
||||
/// Header stored in the beginning of the file
|
||||
@@ -243,13 +241,11 @@ impl std::fmt::Display for ImageLayer {
|
||||
}
|
||||
}
|
||||
|
||||
impl AsLayerDesc for ImageLayer {
|
||||
impl PersistentLayer for ImageLayer {
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc {
|
||||
&self.desc
|
||||
}
|
||||
}
|
||||
|
||||
impl PersistentLayer for ImageLayer {
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
Some(self.path())
|
||||
}
|
||||
|
||||
@@ -4,9 +4,9 @@
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::repository::Key;
|
||||
use crate::tenant::layer_map::BatchedUpdates;
|
||||
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::tenant::timeline::layer_manager::LayerManager;
|
||||
use anyhow::{bail, Result};
|
||||
use pageserver_api::models::HistoricLayerInfo;
|
||||
use std::ops::Range;
|
||||
@@ -20,8 +20,8 @@ use utils::{
|
||||
|
||||
use super::filename::{DeltaFileName, ImageFileName};
|
||||
use super::{
|
||||
AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter,
|
||||
LayerKeyIter, LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
|
||||
DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter,
|
||||
LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
|
||||
};
|
||||
|
||||
/// RemoteLayer is a not yet downloaded [`ImageLayer`] or
|
||||
@@ -115,13 +115,11 @@ impl std::fmt::Display for RemoteLayer {
|
||||
}
|
||||
}
|
||||
|
||||
impl AsLayerDesc for RemoteLayer {
|
||||
impl PersistentLayer for RemoteLayer {
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc {
|
||||
&self.desc
|
||||
}
|
||||
}
|
||||
|
||||
impl PersistentLayer for RemoteLayer {
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
None
|
||||
}
|
||||
@@ -224,7 +222,7 @@ impl RemoteLayer {
|
||||
/// Create a Layer struct representing this layer, after it has been downloaded.
|
||||
pub fn create_downloaded_layer(
|
||||
&self,
|
||||
layer_map_lock_held_witness: &LayerManager,
|
||||
layer_map_lock_held_witness: &BatchedUpdates<'_>,
|
||||
conf: &'static PageServerConf,
|
||||
file_size: u64,
|
||||
) -> Arc<dyn PersistentLayer> {
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
//!
|
||||
|
||||
mod eviction_task;
|
||||
pub mod layer_manager;
|
||||
mod logical_size;
|
||||
pub mod span;
|
||||
pub mod uninit;
|
||||
@@ -81,15 +82,15 @@ use crate::{is_temporary, task_mgr};
|
||||
|
||||
pub(super) use self::eviction_task::EvictionTaskTenantState;
|
||||
use self::eviction_task::EvictionTaskTimelineState;
|
||||
use self::layer_manager::LayerManager;
|
||||
use self::logical_size::LogicalSize;
|
||||
use self::walreceiver::{WalReceiver, WalReceiverConf};
|
||||
|
||||
use super::config::TenantConf;
|
||||
use super::layer_map::BatchedUpdates;
|
||||
use super::remote_timeline_client::index::IndexPart;
|
||||
use super::remote_timeline_client::RemoteTimelineClient;
|
||||
use super::storage_layer::{
|
||||
AsLayerDesc, DeltaLayer, ImageLayer, Layer, LayerAccessStatsReset, PersistentLayerDesc,
|
||||
DeltaLayer, ImageLayer, Layer, LayerAccessStatsReset, PersistentLayerDesc, PersistentLayerKey,
|
||||
};
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
@@ -123,6 +124,80 @@ impl PartialOrd for Hole {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LayerFileManager(HashMap<PersistentLayerKey, Arc<dyn PersistentLayer>>);
|
||||
|
||||
impl LayerFileManager {
|
||||
fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<dyn PersistentLayer> {
|
||||
// The assumption for the `expect()` is that all code maintains the following invariant:
|
||||
// A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
|
||||
self.0
|
||||
.get(&desc.key())
|
||||
.with_context(|| format!("get layer from desc: {}", desc.filename()))
|
||||
.expect("not found")
|
||||
.clone()
|
||||
}
|
||||
|
||||
pub(crate) fn insert(&mut self, layer: Arc<dyn PersistentLayer>) {
|
||||
let present = self.0.insert(layer.layer_desc().key(), layer.clone());
|
||||
if present.is_some() && cfg!(debug_assertions) {
|
||||
panic!("overwriting a layer: {:?}", layer.layer_desc())
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn new() -> Self {
|
||||
Self(HashMap::new())
|
||||
}
|
||||
|
||||
pub(crate) fn remove(&mut self, layer: Arc<dyn PersistentLayer>) {
|
||||
let present = self.0.remove(&layer.layer_desc().key());
|
||||
if present.is_none() && cfg!(debug_assertions) {
|
||||
panic!(
|
||||
"removing layer that is not present in layer mapping: {:?}",
|
||||
layer.layer_desc()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn replace_and_verify(
|
||||
&mut self,
|
||||
expected: Arc<dyn PersistentLayer>,
|
||||
new: Arc<dyn PersistentLayer>,
|
||||
) -> Result<()> {
|
||||
let key = expected.layer_desc().key();
|
||||
let other = new.layer_desc().key();
|
||||
|
||||
let expected_l0 = LayerMap::is_l0(expected.layer_desc());
|
||||
let new_l0 = LayerMap::is_l0(new.layer_desc());
|
||||
|
||||
fail::fail_point!("layermap-replace-notfound", |_| anyhow::bail!(
|
||||
"layermap-replace-notfound"
|
||||
));
|
||||
|
||||
anyhow::ensure!(
|
||||
key == other,
|
||||
"expected and new layer have different keys: {key:?} != {other:?}"
|
||||
);
|
||||
|
||||
anyhow::ensure!(
|
||||
expected_l0 == new_l0,
|
||||
"one layer is l0 while the other is not: {expected_l0} != {new_l0}"
|
||||
);
|
||||
|
||||
if let Some(layer) = self.0.get_mut(&expected.layer_desc().key()) {
|
||||
anyhow::ensure!(
|
||||
compare_arced_layers(&expected, layer),
|
||||
"another layer was found instead of expected, expected={expected:?}, new={new:?}",
|
||||
expected = Arc::as_ptr(&expected),
|
||||
new = Arc::as_ptr(layer),
|
||||
);
|
||||
*layer = new;
|
||||
Ok(())
|
||||
} else {
|
||||
anyhow::bail!("layer was not found");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things.
|
||||
/// Can be removed after all refactors are done.
|
||||
fn drop_rlock<T>(rlock: tokio::sync::OwnedRwLockReadGuard<T>) {
|
||||
@@ -134,6 +209,7 @@ fn drop_rlock<T>(rlock: tokio::sync::OwnedRwLockReadGuard<T>) {
|
||||
fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
|
||||
drop(rlock)
|
||||
}
|
||||
|
||||
pub struct Timeline {
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: Arc<RwLock<TenantConfOpt>>,
|
||||
@@ -162,7 +238,7 @@ pub struct Timeline {
|
||||
///
|
||||
/// In the future, we'll be able to split up the tuple of LayerMap and `LayerFileManager`,
|
||||
/// so that e.g. on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`.
|
||||
pub(crate) layers: Arc<tokio::sync::RwLock<LayerManager>>,
|
||||
pub(crate) layers: Arc<tokio::sync::RwLock<(LayerMap, LayerFileManager)>>,
|
||||
|
||||
/// Set of key ranges which should be covered by image layers to
|
||||
/// allow GC to remove old layers. This set is created by GC and its cutoff LSN is also stored.
|
||||
@@ -513,7 +589,7 @@ impl Timeline {
|
||||
/// Hence, the result **does not represent local filesystem usage**.
|
||||
pub async fn layer_size_sum(&self) -> u64 {
|
||||
let guard = self.layers.read().await;
|
||||
let layer_map = guard.layer_map();
|
||||
let (layer_map, _) = &*guard;
|
||||
let mut size = 0;
|
||||
for l in layer_map.iter_historic_layers() {
|
||||
size += l.file_size();
|
||||
@@ -824,7 +900,7 @@ impl Timeline {
|
||||
let last_lsn = self.get_last_record_lsn();
|
||||
let open_layer_size = {
|
||||
let guard = self.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
let (layers, _) = &*guard;
|
||||
let Some(open_layer) = layers.open_layer.as_ref() else {
|
||||
return Ok(());
|
||||
};
|
||||
@@ -956,7 +1032,7 @@ impl Timeline {
|
||||
|
||||
pub async fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo {
|
||||
let guard = self.layers.read().await;
|
||||
let layer_map = guard.layer_map();
|
||||
let (layer_map, mapping) = &*guard;
|
||||
let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1);
|
||||
if let Some(open_layer) = &layer_map.open_layer {
|
||||
in_memory_layers.push(open_layer.info());
|
||||
@@ -967,7 +1043,7 @@ impl Timeline {
|
||||
|
||||
let mut historic_layers = Vec::new();
|
||||
for historic_layer in layer_map.iter_historic_layers() {
|
||||
let historic_layer = guard.get_from_desc(&historic_layer);
|
||||
let historic_layer = mapping.get_from_desc(&historic_layer);
|
||||
historic_layers.push(historic_layer.info(reset));
|
||||
}
|
||||
|
||||
@@ -977,7 +1053,7 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))]
|
||||
#[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))]
|
||||
pub async fn download_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
|
||||
let Some(layer) = self.find_layer(layer_file_name).await else { return Ok(None) };
|
||||
let Some(remote_layer) = layer.downcast_remote_layer() else { return Ok(Some(false)) };
|
||||
@@ -1078,18 +1154,27 @@ impl Timeline {
|
||||
|
||||
// start the batch update
|
||||
let mut guard = self.layers.write().await;
|
||||
let (layer_map, mapping) = &mut *guard;
|
||||
let mut batch_updates = layer_map.batch_update();
|
||||
|
||||
let mut results = Vec::with_capacity(layers_to_evict.len());
|
||||
|
||||
for l in layers_to_evict.iter() {
|
||||
let res = if cancel.is_cancelled() {
|
||||
None
|
||||
} else {
|
||||
Some(self.evict_layer_batch_impl(&layer_removal_guard, l, &mut guard))
|
||||
Some(self.evict_layer_batch_impl(
|
||||
&layer_removal_guard,
|
||||
l,
|
||||
&mut batch_updates,
|
||||
mapping,
|
||||
))
|
||||
};
|
||||
results.push(res);
|
||||
}
|
||||
|
||||
// commit the updates & release locks
|
||||
batch_updates.flush();
|
||||
drop_wlock(guard);
|
||||
drop(layer_removal_guard);
|
||||
|
||||
@@ -1101,7 +1186,8 @@ impl Timeline {
|
||||
&self,
|
||||
_layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
|
||||
local_layer: &Arc<dyn PersistentLayer>,
|
||||
layer_mgr: &mut LayerManager,
|
||||
batch_updates: &mut BatchedUpdates<'_>,
|
||||
mapping: &mut LayerFileManager,
|
||||
) -> anyhow::Result<bool> {
|
||||
if local_layer.is_remote_layer() {
|
||||
// TODO(issue #3851): consider returning an err here instead of false,
|
||||
@@ -1137,7 +1223,7 @@ impl Timeline {
|
||||
&layer_metadata,
|
||||
local_layer
|
||||
.access_stats()
|
||||
.clone_for_residence_change(layer_mgr, LayerResidenceStatus::Evicted),
|
||||
.clone_for_residence_change(batch_updates, LayerResidenceStatus::Evicted),
|
||||
),
|
||||
LayerFileName::Delta(delta_name) => RemoteLayer::new_delta(
|
||||
self.tenant_id,
|
||||
@@ -1146,13 +1232,13 @@ impl Timeline {
|
||||
&layer_metadata,
|
||||
local_layer
|
||||
.access_stats()
|
||||
.clone_for_residence_change(layer_mgr, LayerResidenceStatus::Evicted),
|
||||
.clone_for_residence_change(batch_updates, LayerResidenceStatus::Evicted),
|
||||
),
|
||||
});
|
||||
|
||||
assert_eq!(local_layer.layer_desc(), new_remote_layer.layer_desc());
|
||||
|
||||
let succeed = match layer_mgr.replace_and_verify(local_layer.clone(), new_remote_layer) {
|
||||
let succeed = match mapping.replace_and_verify(local_layer.clone(), new_remote_layer) {
|
||||
Ok(()) => {
|
||||
if let Err(e) = local_layer.delete_resident_layer_file() {
|
||||
error!("failed to remove layer file on evict after replacement: {e:#?}");
|
||||
@@ -1323,7 +1409,10 @@ impl Timeline {
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
pg_version,
|
||||
layers: Arc::new(tokio::sync::RwLock::new(LayerManager::create())),
|
||||
layers: Arc::new(tokio::sync::RwLock::new((
|
||||
LayerMap::default(),
|
||||
LayerFileManager::new(),
|
||||
))),
|
||||
wanted_image_layers: Mutex::new(None),
|
||||
|
||||
walredo_mgr,
|
||||
@@ -1452,7 +1541,7 @@ impl Timeline {
|
||||
*flush_loop_state = FlushLoopState::Exited;
|
||||
Ok(())
|
||||
}
|
||||
.instrument(info_span!(parent: None, "layer flush task", tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))
|
||||
.instrument(info_span!(parent: None, "layer flush task", tenant = %self.tenant_id, timeline = %self.timeline_id))
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1508,7 +1597,7 @@ impl Timeline {
|
||||
let mut layers = self.layers.try_write().expect(
|
||||
"in the context where we call this function, no other task has access to the object",
|
||||
);
|
||||
layers.initialize_empty(Lsn(start_lsn.0));
|
||||
layers.0.next_open_layer_at = Some(Lsn(start_lsn.0));
|
||||
}
|
||||
|
||||
///
|
||||
@@ -1516,6 +1605,8 @@ impl Timeline {
|
||||
///
|
||||
pub(super) async fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
|
||||
let mut guard = self.layers.write().await;
|
||||
let (layers, mapping) = &mut *guard;
|
||||
let mut updates = layers.batch_update();
|
||||
let mut num_layers = 0;
|
||||
|
||||
let timer = self.metrics.load_layer_map_histo.start_timer();
|
||||
@@ -1526,8 +1617,6 @@ impl Timeline {
|
||||
// total size of layer files in the current timeline directory
|
||||
let mut total_physical_size = 0;
|
||||
|
||||
let mut loaded_layers = Vec::<Arc<dyn PersistentLayer>>::new();
|
||||
|
||||
for direntry in fs::read_dir(timeline_path)? {
|
||||
let direntry = direntry?;
|
||||
let direntry_path = direntry.path();
|
||||
@@ -1554,12 +1643,12 @@ impl Timeline {
|
||||
self.tenant_id,
|
||||
&imgfilename,
|
||||
file_size,
|
||||
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident),
|
||||
LayerAccessStats::for_loading_layer(&updates, LayerResidenceStatus::Resident),
|
||||
);
|
||||
|
||||
trace!("found layer {}", layer.path().display());
|
||||
total_physical_size += file_size;
|
||||
loaded_layers.push(Arc::new(layer));
|
||||
self.insert_historic_layer(Arc::new(layer), &mut updates, mapping);
|
||||
num_layers += 1;
|
||||
} else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) {
|
||||
// Create a DeltaLayer struct for each delta file.
|
||||
@@ -1586,12 +1675,12 @@ impl Timeline {
|
||||
self.tenant_id,
|
||||
&deltafilename,
|
||||
file_size,
|
||||
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident),
|
||||
LayerAccessStats::for_loading_layer(&updates, LayerResidenceStatus::Resident),
|
||||
);
|
||||
|
||||
trace!("found layer {}", layer.path().display());
|
||||
total_physical_size += file_size;
|
||||
loaded_layers.push(Arc::new(layer));
|
||||
self.insert_historic_layer(Arc::new(layer), &mut updates, mapping);
|
||||
num_layers += 1;
|
||||
} else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
|
||||
// ignore these
|
||||
@@ -1617,7 +1706,8 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
guard.initialize_local_layers(loaded_layers, Lsn(disk_consistent_lsn.0) + 1);
|
||||
updates.flush();
|
||||
layers.next_open_layer_at = Some(Lsn(disk_consistent_lsn.0) + 1);
|
||||
|
||||
info!(
|
||||
"loaded layer map with {} layers at {}, total physical size: {}",
|
||||
@@ -1645,9 +1735,8 @@ impl Timeline {
|
||||
// We're holding a layer map lock for a while but this
|
||||
// method is only called during init so it's fine.
|
||||
let mut guard = self.layers.write().await;
|
||||
|
||||
let mut corrupted_local_layers = Vec::new();
|
||||
let mut added_remote_layers = Vec::new();
|
||||
let (layer_map, mapping) = &mut *guard;
|
||||
let mut updates = layer_map.batch_update();
|
||||
for remote_layer_name in &index_part.timeline_layers {
|
||||
let local_layer = local_only_layers.remove(remote_layer_name);
|
||||
|
||||
@@ -1691,7 +1780,7 @@ impl Timeline {
|
||||
anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}");
|
||||
} else {
|
||||
self.metrics.resident_physical_size_gauge.sub(local_size);
|
||||
corrupted_local_layers.push(local_layer);
|
||||
self.remove_historic_layer(local_layer, &mut updates, mapping);
|
||||
// fall-through to adding the remote layer
|
||||
}
|
||||
} else {
|
||||
@@ -1723,10 +1812,14 @@ impl Timeline {
|
||||
self.timeline_id,
|
||||
imgfilename,
|
||||
&remote_layer_metadata,
|
||||
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted),
|
||||
LayerAccessStats::for_loading_layer(
|
||||
&updates,
|
||||
LayerResidenceStatus::Evicted,
|
||||
),
|
||||
);
|
||||
let remote_layer = Arc::new(remote_layer);
|
||||
added_remote_layers.push(remote_layer);
|
||||
|
||||
self.insert_historic_layer(remote_layer, &mut updates, mapping);
|
||||
}
|
||||
LayerFileName::Delta(deltafilename) => {
|
||||
// Create a RemoteLayer for the delta file.
|
||||
@@ -1747,14 +1840,18 @@ impl Timeline {
|
||||
self.timeline_id,
|
||||
deltafilename,
|
||||
&remote_layer_metadata,
|
||||
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted),
|
||||
LayerAccessStats::for_loading_layer(
|
||||
&updates,
|
||||
LayerResidenceStatus::Evicted,
|
||||
),
|
||||
);
|
||||
let remote_layer = Arc::new(remote_layer);
|
||||
added_remote_layers.push(remote_layer);
|
||||
self.insert_historic_layer(remote_layer, &mut updates, mapping);
|
||||
}
|
||||
}
|
||||
}
|
||||
guard.initialize_remote_layers(corrupted_local_layers, added_remote_layers);
|
||||
|
||||
updates.flush();
|
||||
Ok(local_only_layers)
|
||||
}
|
||||
|
||||
@@ -1790,10 +1887,10 @@ impl Timeline {
|
||||
|
||||
let local_layers = {
|
||||
let guard = self.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
let (layers, mapping) = &*guard;
|
||||
layers
|
||||
.iter_historic_layers()
|
||||
.map(|l| (l.filename(), guard.get_from_desc(&l)))
|
||||
.map(|l| (l.filename(), mapping.get_from_desc(&l)))
|
||||
.collect::<HashMap<_, _>>()
|
||||
};
|
||||
|
||||
@@ -2167,15 +2264,70 @@ impl Timeline {
|
||||
|
||||
async fn find_layer(&self, layer_file_name: &str) -> Option<Arc<dyn PersistentLayer>> {
|
||||
let guard = self.layers.read().await;
|
||||
for historic_layer in guard.layer_map().iter_historic_layers() {
|
||||
let (layers, mapping) = &*guard;
|
||||
for historic_layer in layers.iter_historic_layers() {
|
||||
let historic_layer_name = historic_layer.filename().file_name();
|
||||
if layer_file_name == historic_layer_name {
|
||||
return Some(guard.get_from_desc(&historic_layer));
|
||||
return Some(mapping.get_from_desc(&historic_layer));
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
/// Helper function to insert a layer from both layer map and layer file manager. Will be removed in the future
|
||||
/// after we introduce `LayerMapManager`.
|
||||
fn insert_historic_layer(
|
||||
&self,
|
||||
layer: Arc<dyn PersistentLayer>,
|
||||
updates: &mut BatchedUpdates<'_>,
|
||||
mapping: &mut LayerFileManager,
|
||||
) {
|
||||
updates.insert_historic(layer.layer_desc().clone());
|
||||
mapping.insert(layer);
|
||||
}
|
||||
|
||||
/// Helper function to remove a layer from both layer map and layer file manager. Will be removed in the future
|
||||
/// after we introduce `LayerMapManager`.
|
||||
fn remove_historic_layer(
|
||||
&self,
|
||||
layer: Arc<dyn PersistentLayer>,
|
||||
updates: &mut BatchedUpdates<'_>,
|
||||
mapping: &mut LayerFileManager,
|
||||
) {
|
||||
updates.remove_historic(layer.layer_desc().clone());
|
||||
mapping.remove(layer);
|
||||
}
|
||||
|
||||
/// Removes the layer from local FS (if present) and from memory.
|
||||
/// Remote storage is not affected by this operation.
|
||||
fn delete_historic_layer(
|
||||
&self,
|
||||
// we cannot remove layers otherwise, since gc and compaction will race
|
||||
_layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
layer: Arc<PersistentLayerDesc>,
|
||||
updates: &mut BatchedUpdates<'_>,
|
||||
mapping: &mut LayerFileManager,
|
||||
) -> anyhow::Result<()> {
|
||||
let layer = mapping.get_from_desc(&layer);
|
||||
if !layer.is_remote_layer() {
|
||||
layer.delete_resident_layer_file()?;
|
||||
let layer_file_size = layer.file_size();
|
||||
self.metrics
|
||||
.resident_physical_size_gauge
|
||||
.sub(layer_file_size);
|
||||
}
|
||||
|
||||
// TODO Removing from the bottom of the layer map is expensive.
|
||||
// Maybe instead discard all layer map historic versions that
|
||||
// won't be needed for page reconstruction for this timeline,
|
||||
// and mark what we can't delete yet as deleted from the layer
|
||||
// map index without actually rebuilding the index.
|
||||
updates.remove_historic(layer.layer_desc().clone());
|
||||
mapping.remove(layer);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
type TraversalId = String;
|
||||
@@ -2350,7 +2502,7 @@ impl Timeline {
|
||||
'layer_map_search: loop {
|
||||
let remote_layer = {
|
||||
let guard = timeline.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
let (layers, mapping) = &*guard;
|
||||
|
||||
// Check the open and frozen in-memory layers first, in order from newest
|
||||
// to oldest.
|
||||
@@ -2412,7 +2564,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
|
||||
let layer = guard.get_from_desc(&layer);
|
||||
let layer = mapping.get_from_desc(&layer);
|
||||
// If it's a remote layer, download it and retry.
|
||||
if let Some(remote_layer) =
|
||||
super::storage_layer::downcast_remote_layer(&layer)
|
||||
@@ -2535,13 +2687,52 @@ impl Timeline {
|
||||
///
|
||||
async fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result<Arc<InMemoryLayer>> {
|
||||
let mut guard = self.layers.write().await;
|
||||
let layer = guard.get_layer_for_write(
|
||||
let (layers, _) = &mut *guard;
|
||||
|
||||
ensure!(lsn.is_aligned());
|
||||
|
||||
let last_record_lsn = self.get_last_record_lsn();
|
||||
ensure!(
|
||||
lsn > last_record_lsn,
|
||||
"cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})\n{}",
|
||||
lsn,
|
||||
self.get_last_record_lsn(),
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
)?;
|
||||
last_record_lsn,
|
||||
std::backtrace::Backtrace::force_capture(),
|
||||
);
|
||||
|
||||
// Do we have a layer open for writing already?
|
||||
let layer;
|
||||
if let Some(open_layer) = &layers.open_layer {
|
||||
if open_layer.get_lsn_range().start > lsn {
|
||||
bail!(
|
||||
"unexpected open layer in the future: open layers starts at {}, write lsn {}",
|
||||
open_layer.get_lsn_range().start,
|
||||
lsn
|
||||
);
|
||||
}
|
||||
|
||||
layer = Arc::clone(open_layer);
|
||||
} else {
|
||||
// No writeable layer yet. Create one.
|
||||
let start_lsn = layers
|
||||
.next_open_layer_at
|
||||
.context("No next open layer found")?;
|
||||
|
||||
trace!(
|
||||
"creating layer for write at {}/{} for record at {}",
|
||||
self.timeline_id,
|
||||
start_lsn,
|
||||
lsn
|
||||
);
|
||||
let new_layer =
|
||||
InMemoryLayer::create(self.conf, self.timeline_id, self.tenant_id, start_lsn)?;
|
||||
let layer_rc = Arc::new(new_layer);
|
||||
|
||||
layers.open_layer = Some(Arc::clone(&layer_rc));
|
||||
layers.next_open_layer_at = None;
|
||||
|
||||
layer = layer_rc;
|
||||
}
|
||||
Ok(layer)
|
||||
}
|
||||
|
||||
@@ -2574,7 +2765,21 @@ impl Timeline {
|
||||
Some(self.write_lock.lock().await)
|
||||
};
|
||||
let mut guard = self.layers.write().await;
|
||||
guard.try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at);
|
||||
let (layers, _) = &mut *guard;
|
||||
if let Some(open_layer) = &layers.open_layer {
|
||||
let open_layer_rc = Arc::clone(open_layer);
|
||||
// Does this layer need freezing?
|
||||
let end_lsn = Lsn(self.get_last_record_lsn().0 + 1);
|
||||
open_layer.freeze(end_lsn);
|
||||
|
||||
// The layer is no longer open, update the layer map to reflect this.
|
||||
// We will replace it with on-disk historics below.
|
||||
layers.frozen_layers.push_back(open_layer_rc);
|
||||
layers.open_layer = None;
|
||||
layers.next_open_layer_at = Some(end_lsn);
|
||||
self.last_freeze_at.store(end_lsn);
|
||||
}
|
||||
drop_wlock(guard);
|
||||
}
|
||||
|
||||
/// Layer flusher task's main loop.
|
||||
@@ -2599,7 +2804,8 @@ impl Timeline {
|
||||
let result = loop {
|
||||
let layer_to_flush = {
|
||||
let guard = self.layers.read().await;
|
||||
guard.layer_map().frozen_layers.front().cloned()
|
||||
let (layers, _) = &*guard;
|
||||
layers.frozen_layers.front().cloned()
|
||||
// drop 'layers' lock to allow concurrent reads and writes
|
||||
};
|
||||
let Some(layer_to_flush) = layer_to_flush else { break Ok(()) };
|
||||
@@ -2714,14 +2920,15 @@ impl Timeline {
|
||||
HashMap::from([(delta_path, metadata)])
|
||||
};
|
||||
|
||||
pausable_failpoint!("flush-frozen-before-sync");
|
||||
fail_point!("flush-frozen-before-sync");
|
||||
|
||||
// The new on-disk layers are now in the layer map. We can remove the
|
||||
// in-memory layer from the map now. The flushed layer is stored in
|
||||
// in-memory layer from the map now. We do not modify `LayerFileManager` because
|
||||
// it only contains persistent layers. The flushed layer is stored in
|
||||
// the mapping in `create_delta_layer`.
|
||||
{
|
||||
let mut guard = self.layers.write().await;
|
||||
let l = guard.layer_map_mut().frozen_layers.pop_front();
|
||||
let mut layers = self.layers.write().await;
|
||||
let l = layers.0.frozen_layers.pop_front();
|
||||
|
||||
// Only one thread may call this function at a time (for this
|
||||
// timeline). If two threads tried to flush the same frozen
|
||||
@@ -2860,12 +3067,15 @@ impl Timeline {
|
||||
// Add it to the layer map
|
||||
let l = Arc::new(new_delta);
|
||||
let mut guard = self.layers.write().await;
|
||||
let (layers, mapping) = &mut *guard;
|
||||
let mut batch_updates = layers.batch_update();
|
||||
l.access_stats().record_residence_event(
|
||||
&guard,
|
||||
&batch_updates,
|
||||
LayerResidenceStatus::Resident,
|
||||
LayerResidenceEventReason::LayerCreate,
|
||||
);
|
||||
guard.track_new_l0_delta_layer(l);
|
||||
self.insert_historic_layer(l, &mut batch_updates, mapping);
|
||||
batch_updates.flush();
|
||||
|
||||
// update metrics
|
||||
self.metrics.resident_physical_size_gauge.add(sz);
|
||||
@@ -2914,7 +3124,7 @@ impl Timeline {
|
||||
let threshold = self.get_image_creation_threshold();
|
||||
|
||||
let guard = self.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
let (layers, _) = &*guard;
|
||||
|
||||
let mut max_deltas = 0;
|
||||
{
|
||||
@@ -3093,9 +3303,11 @@ impl Timeline {
|
||||
let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len());
|
||||
|
||||
let mut guard = self.layers.write().await;
|
||||
let (layers, mapping) = &mut *guard;
|
||||
let mut updates = layers.batch_update();
|
||||
let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id);
|
||||
|
||||
for l in &image_layers {
|
||||
for l in image_layers {
|
||||
let path = l.filename();
|
||||
let metadata = timeline_path
|
||||
.join(path.file_name())
|
||||
@@ -3109,12 +3321,13 @@ impl Timeline {
|
||||
.add(metadata.len());
|
||||
let l = Arc::new(l);
|
||||
l.access_stats().record_residence_event(
|
||||
&guard,
|
||||
&updates,
|
||||
LayerResidenceStatus::Resident,
|
||||
LayerResidenceEventReason::LayerCreate,
|
||||
);
|
||||
self.insert_historic_layer(l, &mut updates, mapping);
|
||||
}
|
||||
guard.track_new_image_layers(image_layers);
|
||||
updates.flush();
|
||||
drop_wlock(guard);
|
||||
timer.stop_and_record();
|
||||
|
||||
@@ -3276,18 +3489,18 @@ impl Timeline {
|
||||
fn compact_level0_phase1(
|
||||
self: Arc<Self>,
|
||||
_layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
guard: tokio::sync::OwnedRwLockReadGuard<LayerManager>,
|
||||
guard: tokio::sync::OwnedRwLockReadGuard<(LayerMap, LayerFileManager)>,
|
||||
mut stats: CompactLevel0Phase1StatsBuilder,
|
||||
target_file_size: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<CompactLevel0Phase1Result, CompactionError> {
|
||||
stats.read_lock_held_spawn_blocking_startup_micros =
|
||||
stats.read_lock_acquisition_micros.till_now(); // set by caller
|
||||
let layers = guard.layer_map();
|
||||
let (layers, mapping) = &*guard;
|
||||
let level0_deltas = layers.get_level0_deltas()?;
|
||||
let mut level0_deltas = level0_deltas
|
||||
.into_iter()
|
||||
.map(|x| guard.get_from_desc(&x))
|
||||
.map(|x| mapping.get_from_desc(&x))
|
||||
.collect_vec();
|
||||
stats.level0_deltas_count = Some(level0_deltas.len());
|
||||
// Only compact if enough layers have accumulated.
|
||||
@@ -3703,11 +3916,9 @@ impl Timeline {
|
||||
}
|
||||
|
||||
let mut guard = self.layers.write().await;
|
||||
let (layers, mapping) = &mut *guard;
|
||||
let mut updates = layers.batch_update();
|
||||
let mut new_layer_paths = HashMap::with_capacity(new_layers.len());
|
||||
|
||||
let mut insert_layers = Vec::new();
|
||||
let mut remove_layers = Vec::new();
|
||||
|
||||
for l in new_layers {
|
||||
let new_delta_path = l.path();
|
||||
|
||||
@@ -3733,11 +3944,11 @@ impl Timeline {
|
||||
new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len()));
|
||||
let x: Arc<dyn PersistentLayer + 'static> = Arc::new(l);
|
||||
x.access_stats().record_residence_event(
|
||||
&guard,
|
||||
&updates,
|
||||
LayerResidenceStatus::Resident,
|
||||
LayerResidenceEventReason::LayerCreate,
|
||||
);
|
||||
insert_layers.push(x);
|
||||
self.insert_historic_layer(x, &mut updates, mapping);
|
||||
}
|
||||
|
||||
// Now that we have reshuffled the data to set of new delta layers, we can
|
||||
@@ -3745,16 +3956,12 @@ impl Timeline {
|
||||
let mut layer_names_to_delete = Vec::with_capacity(deltas_to_compact.len());
|
||||
for l in deltas_to_compact {
|
||||
layer_names_to_delete.push(l.filename());
|
||||
remove_layers.push(guard.get_from_desc(&l));
|
||||
// NB: the layer file identified by descriptor `l` is guaranteed to be present
|
||||
// in the LayerFileManager because we kept holding `layer_removal_cs` the entire
|
||||
// time, even though we dropped `Timeline::layers` inbetween.
|
||||
self.delete_historic_layer(layer_removal_cs.clone(), l, &mut updates, mapping)?;
|
||||
}
|
||||
|
||||
guard.finish_compact_l0(
|
||||
layer_removal_cs,
|
||||
remove_layers,
|
||||
insert_layers,
|
||||
&self.metrics,
|
||||
)?;
|
||||
|
||||
updates.flush();
|
||||
drop_wlock(guard);
|
||||
|
||||
// Also schedule the deletions in remote storage
|
||||
@@ -3899,7 +4106,7 @@ impl Timeline {
|
||||
new_gc_cutoff,
|
||||
)
|
||||
.instrument(
|
||||
info_span!("gc_timeline", timeline_id = %self.timeline_id, cutoff = %new_gc_cutoff),
|
||||
info_span!("gc_timeline", timeline = %self.timeline_id, cutoff = %new_gc_cutoff),
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -3973,7 +4180,7 @@ impl Timeline {
|
||||
//
|
||||
// TODO holding a write lock is too agressive and avoidable
|
||||
let mut guard = self.layers.write().await;
|
||||
let layers = guard.layer_map();
|
||||
let (layers, mapping) = &mut *guard;
|
||||
'outer: for l in layers.iter_historic_layers() {
|
||||
result.layers_total += 1;
|
||||
|
||||
@@ -4069,6 +4276,7 @@ impl Timeline {
|
||||
.unwrap()
|
||||
.replace((new_gc_cutoff, wanted_image_layers.to_keyspace()));
|
||||
|
||||
let mut updates = layers.batch_update();
|
||||
if !layers_to_remove.is_empty() {
|
||||
// Persist the new GC cutoff value in the metadata file, before
|
||||
// we actually remove anything.
|
||||
@@ -4078,15 +4286,18 @@ impl Timeline {
|
||||
// (couldn't do this in the loop above, because you cannot modify a collection
|
||||
// while iterating it. BTreeMap::retain() would be another option)
|
||||
let mut layer_names_to_delete = Vec::with_capacity(layers_to_remove.len());
|
||||
let gc_layers = layers_to_remove
|
||||
.iter()
|
||||
.map(|x| guard.get_from_desc(x))
|
||||
.collect();
|
||||
for doomed_layer in layers_to_remove {
|
||||
layer_names_to_delete.push(doomed_layer.filename());
|
||||
result.layers_removed += 1;
|
||||
{
|
||||
for doomed_layer in layers_to_remove {
|
||||
layer_names_to_delete.push(doomed_layer.filename());
|
||||
self.delete_historic_layer(
|
||||
layer_removal_cs.clone(),
|
||||
doomed_layer,
|
||||
&mut updates,
|
||||
mapping,
|
||||
)?; // FIXME: schedule succeeded deletions before returning?
|
||||
result.layers_removed += 1;
|
||||
}
|
||||
}
|
||||
let apply = guard.finish_gc_timeline(layer_removal_cs, gc_layers, &self.metrics)?;
|
||||
|
||||
if result.layers_removed != 0 {
|
||||
fail_point!("after-timeline-gc-removed-layers");
|
||||
@@ -4095,9 +4306,8 @@ impl Timeline {
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
|
||||
}
|
||||
|
||||
apply.flush();
|
||||
}
|
||||
updates.flush();
|
||||
|
||||
info!(
|
||||
"GC completed removing {} layers, cutoff {}",
|
||||
@@ -4269,11 +4479,13 @@ impl Timeline {
|
||||
// Download complete. Replace the RemoteLayer with the corresponding
|
||||
// Delta- or ImageLayer in the layer map.
|
||||
let mut guard = self_clone.layers.write().await;
|
||||
let (layers, mapping) = &mut *guard;
|
||||
let updates = layers.batch_update();
|
||||
let new_layer =
|
||||
remote_layer.create_downloaded_layer(&guard, self_clone.conf, *size);
|
||||
remote_layer.create_downloaded_layer(&updates, self_clone.conf, *size);
|
||||
{
|
||||
let l: Arc<dyn PersistentLayer> = remote_layer.clone();
|
||||
let failure = match guard.replace_and_verify(l, new_layer) {
|
||||
let failure = match mapping.replace_and_verify(l, new_layer) {
|
||||
Ok(()) => false,
|
||||
Err(e) => {
|
||||
// this is a precondition failure, the layer filename derived
|
||||
@@ -4301,6 +4513,7 @@ impl Timeline {
|
||||
.store(true, Relaxed);
|
||||
}
|
||||
}
|
||||
updates.flush();
|
||||
drop_wlock(guard);
|
||||
|
||||
info!("on-demand download successful");
|
||||
@@ -4379,7 +4592,7 @@ impl Timeline {
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
.instrument(info_span!(parent: None, "download_all_remote_layers", tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))
|
||||
.instrument(info_span!(parent: None, "download_all_remote_layers", tenant = %self.tenant_id, timeline = %self.timeline_id))
|
||||
);
|
||||
|
||||
let initial_info = DownloadRemoteLayersTaskInfo {
|
||||
@@ -4401,10 +4614,10 @@ impl Timeline {
|
||||
let mut downloads = Vec::new();
|
||||
{
|
||||
let guard = self.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
let (layers, mapping) = &*guard;
|
||||
layers
|
||||
.iter_historic_layers()
|
||||
.map(|l| guard.get_from_desc(&l))
|
||||
.map(|l| mapping.get_from_desc(&l))
|
||||
.filter_map(|l| l.downcast_remote_layer())
|
||||
.map(|l| self.download_remote_layer(l))
|
||||
.for_each(|dl| downloads.push(dl))
|
||||
@@ -4506,7 +4719,7 @@ impl LocalLayerInfoForDiskUsageEviction {
|
||||
impl Timeline {
|
||||
pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo {
|
||||
let guard = self.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
let (layers, mapping) = &*guard;
|
||||
|
||||
let mut max_layer_size: Option<u64> = None;
|
||||
let mut resident_layers = Vec::new();
|
||||
@@ -4515,7 +4728,7 @@ impl Timeline {
|
||||
let file_size = l.file_size();
|
||||
max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));
|
||||
|
||||
let l = guard.get_from_desc(&l);
|
||||
let l = mapping.get_from_desc(&l);
|
||||
|
||||
if l.is_remote_layer() {
|
||||
continue;
|
||||
|
||||
@@ -198,10 +198,10 @@ impl Timeline {
|
||||
// So, we just need to deal with this.
|
||||
let candidates: Vec<Arc<dyn PersistentLayer>> = {
|
||||
let guard = self.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
let (layers, mapping) = &*guard;
|
||||
let mut candidates = Vec::new();
|
||||
for hist_layer in layers.iter_historic_layers() {
|
||||
let hist_layer = guard.get_from_desc(&hist_layer);
|
||||
let hist_layer = mapping.get_from_desc(&hist_layer);
|
||||
if hist_layer.is_remote_layer() {
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -1,370 +0,0 @@
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
use tracing::trace;
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::{AtomicLsn, Lsn},
|
||||
};
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
metrics::TimelineMetrics,
|
||||
tenant::{
|
||||
layer_map::{BatchedUpdates, LayerMap},
|
||||
storage_layer::{
|
||||
AsLayerDesc, DeltaLayer, ImageLayer, InMemoryLayer, Layer, PersistentLayer,
|
||||
PersistentLayerDesc, PersistentLayerKey, RemoteLayer,
|
||||
},
|
||||
timeline::compare_arced_layers,
|
||||
},
|
||||
};
|
||||
|
||||
/// Provides semantic APIs to manipulate the layer map.
|
||||
pub struct LayerManager {
|
||||
layer_map: LayerMap,
|
||||
layer_fmgr: LayerFileManager,
|
||||
}
|
||||
|
||||
/// After GC, the layer map changes will not be applied immediately. Users should manually apply the changes after
|
||||
/// scheduling deletes in remote client.
|
||||
pub struct ApplyGcResultGuard<'a>(BatchedUpdates<'a>);
|
||||
|
||||
impl ApplyGcResultGuard<'_> {
|
||||
pub fn flush(self) {
|
||||
self.0.flush();
|
||||
}
|
||||
}
|
||||
|
||||
impl LayerManager {
|
||||
pub fn create() -> Self {
|
||||
Self {
|
||||
layer_map: LayerMap::default(),
|
||||
layer_fmgr: LayerFileManager::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<dyn PersistentLayer> {
|
||||
self.layer_fmgr.get_from_desc(desc)
|
||||
}
|
||||
|
||||
/// Get an immutable reference to the layer map.
|
||||
///
|
||||
/// We expect users only to be able to get an immutable layer map. If users want to make modifications,
|
||||
/// they should use the below semantic APIs. This design makes us step closer to immutable storage state.
|
||||
pub fn layer_map(&self) -> &LayerMap {
|
||||
&self.layer_map
|
||||
}
|
||||
|
||||
/// Get a mutable reference to the layer map. This function will be removed once `flush_frozen_layer`
|
||||
/// gets a refactor.
|
||||
pub fn layer_map_mut(&mut self) -> &mut LayerMap {
|
||||
&mut self.layer_map
|
||||
}
|
||||
|
||||
/// Replace layers in the layer file manager, used in evictions and layer downloads.
|
||||
pub fn replace_and_verify(
|
||||
&mut self,
|
||||
expected: Arc<dyn PersistentLayer>,
|
||||
new: Arc<dyn PersistentLayer>,
|
||||
) -> Result<()> {
|
||||
self.layer_fmgr.replace_and_verify(expected, new)
|
||||
}
|
||||
|
||||
/// Called from `load_layer_map`. Initialize the layer manager with:
|
||||
/// 1. all on-disk layers
|
||||
/// 2. next open layer (with disk disk_consistent_lsn LSN)
|
||||
pub fn initialize_local_layers(
|
||||
&mut self,
|
||||
on_disk_layers: Vec<Arc<dyn PersistentLayer>>,
|
||||
next_open_layer_at: Lsn,
|
||||
) {
|
||||
let mut updates = self.layer_map.batch_update();
|
||||
for layer in on_disk_layers {
|
||||
Self::insert_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
|
||||
}
|
||||
updates.flush();
|
||||
self.layer_map.next_open_layer_at = Some(next_open_layer_at);
|
||||
}
|
||||
|
||||
/// Initialize when creating a new timeline, called in `init_empty_layer_map`.
|
||||
pub fn initialize_empty(&mut self, next_open_layer_at: Lsn) {
|
||||
self.layer_map.next_open_layer_at = Some(next_open_layer_at);
|
||||
}
|
||||
|
||||
pub fn initialize_remote_layers(
|
||||
&mut self,
|
||||
corrupted_local_layers: Vec<Arc<dyn PersistentLayer>>,
|
||||
remote_layers: Vec<Arc<RemoteLayer>>,
|
||||
) {
|
||||
let mut updates = self.layer_map.batch_update();
|
||||
for layer in corrupted_local_layers {
|
||||
Self::remove_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
|
||||
}
|
||||
for layer in remote_layers {
|
||||
Self::insert_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
|
||||
}
|
||||
updates.flush();
|
||||
}
|
||||
|
||||
/// Open a new writable layer to append data if there is no open layer, otherwise return the current open layer,
|
||||
/// called within `get_layer_for_write`.
|
||||
pub fn get_layer_for_write(
|
||||
&mut self,
|
||||
lsn: Lsn,
|
||||
last_record_lsn: Lsn,
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<Arc<InMemoryLayer>> {
|
||||
ensure!(lsn.is_aligned());
|
||||
|
||||
ensure!(
|
||||
lsn > last_record_lsn,
|
||||
"cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})\n{}",
|
||||
lsn,
|
||||
last_record_lsn,
|
||||
std::backtrace::Backtrace::force_capture(),
|
||||
);
|
||||
|
||||
// Do we have a layer open for writing already?
|
||||
let layer = if let Some(open_layer) = &self.layer_map.open_layer {
|
||||
if open_layer.get_lsn_range().start > lsn {
|
||||
bail!(
|
||||
"unexpected open layer in the future: open layers starts at {}, write lsn {}",
|
||||
open_layer.get_lsn_range().start,
|
||||
lsn
|
||||
);
|
||||
}
|
||||
|
||||
Arc::clone(open_layer)
|
||||
} else {
|
||||
// No writeable layer yet. Create one.
|
||||
let start_lsn = self
|
||||
.layer_map
|
||||
.next_open_layer_at
|
||||
.context("No next open layer found")?;
|
||||
|
||||
trace!(
|
||||
"creating in-memory layer at {}/{} for record at {}",
|
||||
timeline_id,
|
||||
start_lsn,
|
||||
lsn
|
||||
);
|
||||
|
||||
let new_layer = InMemoryLayer::create(conf, timeline_id, tenant_id, start_lsn)?;
|
||||
let layer = Arc::new(new_layer);
|
||||
|
||||
self.layer_map.open_layer = Some(layer.clone());
|
||||
self.layer_map.next_open_layer_at = None;
|
||||
|
||||
layer
|
||||
};
|
||||
|
||||
Ok(layer)
|
||||
}
|
||||
|
||||
/// Called from `freeze_inmem_layer`, returns true if successfully frozen.
|
||||
pub fn try_freeze_in_memory_layer(
|
||||
&mut self,
|
||||
Lsn(last_record_lsn): Lsn,
|
||||
last_freeze_at: &AtomicLsn,
|
||||
) {
|
||||
let end_lsn = Lsn(last_record_lsn + 1);
|
||||
|
||||
if let Some(open_layer) = &self.layer_map.open_layer {
|
||||
let open_layer_rc = Arc::clone(open_layer);
|
||||
// Does this layer need freezing?
|
||||
open_layer.freeze(end_lsn);
|
||||
|
||||
// The layer is no longer open, update the layer map to reflect this.
|
||||
// We will replace it with on-disk historics below.
|
||||
self.layer_map.frozen_layers.push_back(open_layer_rc);
|
||||
self.layer_map.open_layer = None;
|
||||
self.layer_map.next_open_layer_at = Some(end_lsn);
|
||||
last_freeze_at.store(end_lsn);
|
||||
}
|
||||
}
|
||||
|
||||
/// Add image layers to the layer map, called from `create_image_layers`.
|
||||
pub fn track_new_image_layers(&mut self, image_layers: Vec<ImageLayer>) {
|
||||
let mut updates = self.layer_map.batch_update();
|
||||
for layer in image_layers {
|
||||
Self::insert_historic_layer(Arc::new(layer), &mut updates, &mut self.layer_fmgr);
|
||||
}
|
||||
updates.flush();
|
||||
}
|
||||
|
||||
/// Insert into the layer map when a new delta layer is created, called from `create_delta_layer`.
|
||||
pub fn track_new_l0_delta_layer(&mut self, delta_layer: Arc<DeltaLayer>) {
|
||||
let mut updates = self.layer_map.batch_update();
|
||||
Self::insert_historic_layer(delta_layer, &mut updates, &mut self.layer_fmgr);
|
||||
updates.flush();
|
||||
}
|
||||
|
||||
/// Called when compaction is completed.
|
||||
pub fn finish_compact_l0(
|
||||
&mut self,
|
||||
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
compact_from: Vec<Arc<dyn PersistentLayer>>,
|
||||
compact_to: Vec<Arc<dyn PersistentLayer>>,
|
||||
metrics: &TimelineMetrics,
|
||||
) -> Result<()> {
|
||||
let mut updates = self.layer_map.batch_update();
|
||||
for l in compact_to {
|
||||
Self::insert_historic_layer(l, &mut updates, &mut self.layer_fmgr);
|
||||
}
|
||||
for l in compact_from {
|
||||
// NB: the layer file identified by descriptor `l` is guaranteed to be present
|
||||
// in the LayerFileManager because compaction kept holding `layer_removal_cs` the entire
|
||||
// time, even though we dropped `Timeline::layers` inbetween.
|
||||
Self::delete_historic_layer(
|
||||
layer_removal_cs.clone(),
|
||||
l,
|
||||
&mut updates,
|
||||
metrics,
|
||||
&mut self.layer_fmgr,
|
||||
)?;
|
||||
}
|
||||
updates.flush();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Called when garbage collect the timeline. Returns a guard that will apply the updates to the layer map.
|
||||
pub fn finish_gc_timeline(
|
||||
&mut self,
|
||||
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
gc_layers: Vec<Arc<dyn PersistentLayer>>,
|
||||
metrics: &TimelineMetrics,
|
||||
) -> Result<ApplyGcResultGuard> {
|
||||
let mut updates = self.layer_map.batch_update();
|
||||
for doomed_layer in gc_layers {
|
||||
Self::delete_historic_layer(
|
||||
layer_removal_cs.clone(),
|
||||
doomed_layer,
|
||||
&mut updates,
|
||||
metrics,
|
||||
&mut self.layer_fmgr,
|
||||
)?; // FIXME: schedule succeeded deletions in timeline.rs `gc_timeline` instead of in batch?
|
||||
}
|
||||
Ok(ApplyGcResultGuard(updates))
|
||||
}
|
||||
|
||||
/// Helper function to insert a layer into the layer map and file manager.
|
||||
fn insert_historic_layer(
|
||||
layer: Arc<dyn PersistentLayer>,
|
||||
updates: &mut BatchedUpdates<'_>,
|
||||
mapping: &mut LayerFileManager,
|
||||
) {
|
||||
updates.insert_historic(layer.layer_desc().clone());
|
||||
mapping.insert(layer);
|
||||
}
|
||||
|
||||
/// Helper function to remove a layer into the layer map and file manager
|
||||
fn remove_historic_layer(
|
||||
layer: Arc<dyn PersistentLayer>,
|
||||
updates: &mut BatchedUpdates<'_>,
|
||||
mapping: &mut LayerFileManager,
|
||||
) {
|
||||
updates.remove_historic(layer.layer_desc().clone());
|
||||
mapping.remove(layer);
|
||||
}
|
||||
|
||||
/// Removes the layer from local FS (if present) and from memory.
|
||||
/// Remote storage is not affected by this operation.
|
||||
fn delete_historic_layer(
|
||||
// we cannot remove layers otherwise, since gc and compaction will race
|
||||
_layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
layer: Arc<dyn PersistentLayer>,
|
||||
updates: &mut BatchedUpdates<'_>,
|
||||
metrics: &TimelineMetrics,
|
||||
mapping: &mut LayerFileManager,
|
||||
) -> anyhow::Result<()> {
|
||||
if !layer.is_remote_layer() {
|
||||
layer.delete_resident_layer_file()?;
|
||||
let layer_file_size = layer.file_size();
|
||||
metrics.resident_physical_size_gauge.sub(layer_file_size);
|
||||
}
|
||||
|
||||
// TODO Removing from the bottom of the layer map is expensive.
|
||||
// Maybe instead discard all layer map historic versions that
|
||||
// won't be needed for page reconstruction for this timeline,
|
||||
// and mark what we can't delete yet as deleted from the layer
|
||||
// map index without actually rebuilding the index.
|
||||
updates.remove_historic(layer.layer_desc().clone());
|
||||
mapping.remove(layer);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LayerFileManager<T: AsLayerDesc + ?Sized = dyn PersistentLayer>(
|
||||
HashMap<PersistentLayerKey, Arc<T>>,
|
||||
);
|
||||
|
||||
impl<T: AsLayerDesc + ?Sized> LayerFileManager<T> {
|
||||
fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<T> {
|
||||
// The assumption for the `expect()` is that all code maintains the following invariant:
|
||||
// A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
|
||||
self.0
|
||||
.get(&desc.key())
|
||||
.with_context(|| format!("get layer from desc: {}", desc.filename()))
|
||||
.expect("not found")
|
||||
.clone()
|
||||
}
|
||||
|
||||
pub(crate) fn insert(&mut self, layer: Arc<T>) {
|
||||
let present = self.0.insert(layer.layer_desc().key(), layer.clone());
|
||||
if present.is_some() && cfg!(debug_assertions) {
|
||||
panic!("overwriting a layer: {:?}", layer.layer_desc())
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn new() -> Self {
|
||||
Self(HashMap::new())
|
||||
}
|
||||
|
||||
pub(crate) fn remove(&mut self, layer: Arc<T>) {
|
||||
let present = self.0.remove(&layer.layer_desc().key());
|
||||
if present.is_none() && cfg!(debug_assertions) {
|
||||
panic!(
|
||||
"removing layer that is not present in layer mapping: {:?}",
|
||||
layer.layer_desc()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn replace_and_verify(&mut self, expected: Arc<T>, new: Arc<T>) -> Result<()> {
|
||||
let key = expected.layer_desc().key();
|
||||
let other = new.layer_desc().key();
|
||||
|
||||
let expected_l0 = LayerMap::is_l0(expected.layer_desc());
|
||||
let new_l0 = LayerMap::is_l0(new.layer_desc());
|
||||
|
||||
fail::fail_point!("layermap-replace-notfound", |_| anyhow::bail!(
|
||||
"layermap-replace-notfound"
|
||||
));
|
||||
|
||||
anyhow::ensure!(
|
||||
key == other,
|
||||
"expected and new layer have different keys: {key:?} != {other:?}"
|
||||
);
|
||||
|
||||
anyhow::ensure!(
|
||||
expected_l0 == new_l0,
|
||||
"one layer is l0 while the other is not: {expected_l0} != {new_l0}"
|
||||
);
|
||||
|
||||
if let Some(layer) = self.0.get_mut(&key) {
|
||||
anyhow::ensure!(
|
||||
compare_arced_layers(&expected, layer),
|
||||
"another layer was found instead of expected, expected={expected:?}, new={new:?}",
|
||||
expected = Arc::as_ptr(&expected),
|
||||
new = Arc::as_ptr(layer),
|
||||
);
|
||||
*layer = new;
|
||||
Ok(())
|
||||
} else {
|
||||
anyhow::bail!("layer was not found");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -7,8 +7,10 @@ pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {}
|
||||
#[cfg(debug_assertions)]
|
||||
#[track_caller]
|
||||
pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
|
||||
static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy<MultiNameExtractor<1>> =
|
||||
once_cell::sync::Lazy::new(|| MultiNameExtractor::new("TimelineId", ["timeline_id"]));
|
||||
static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy<MultiNameExtractor<2>> =
|
||||
once_cell::sync::Lazy::new(|| {
|
||||
MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"])
|
||||
});
|
||||
|
||||
let fields: [&dyn Extractor; 2] = [
|
||||
&*crate::tenant::span::TENANT_ID_EXTRACTOR,
|
||||
|
||||
@@ -132,7 +132,7 @@ impl<'t> UninitializedTimeline<'t> {
|
||||
impl Drop for UninitializedTimeline<'_> {
|
||||
fn drop(&mut self) {
|
||||
if let Some((_, uninit_mark)) = self.raw_timeline.take() {
|
||||
let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_id, timeline_id = %self.timeline_id).entered();
|
||||
let _entered = info_span!("drop_uninitialized_timeline", tenant = %self.owning_tenant.tenant_id, timeline = %self.timeline_id).entered();
|
||||
error!("Timeline got dropped without initializing, cleaning its files");
|
||||
cleanup_timeline_directory(uninit_mark);
|
||||
}
|
||||
|
||||
@@ -266,7 +266,7 @@ pub struct ConnectionManagerStatus {
|
||||
impl ConnectionManagerStatus {
|
||||
/// Generates a string, describing current connection status in a form, suitable for logging.
|
||||
pub fn to_human_readable_string(&self) -> String {
|
||||
let mut resulting_string = String::new();
|
||||
let mut resulting_string = "WalReceiver status".to_string();
|
||||
match &self.existing_connection {
|
||||
Some(connection) => {
|
||||
if connection.has_processed_wal {
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
MODULE_big = neon
|
||||
OBJS = \
|
||||
$(WIN32RES) \
|
||||
extension_server.o \
|
||||
file_cache.o \
|
||||
libpagestore.o \
|
||||
libpqwalproposer.o \
|
||||
|
||||
@@ -1,104 +0,0 @@
|
||||
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* extension_server.c
|
||||
* Request compute_ctl to download extension files.
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* contrib/neon/extension_server.c
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
#include "tcop/pquery.h"
|
||||
#include "tcop/utility.h"
|
||||
#include "access/xact.h"
|
||||
#include "utils/hsearch.h"
|
||||
#include "utils/memutils.h"
|
||||
#include "commands/defrem.h"
|
||||
#include "miscadmin.h"
|
||||
#include "utils/acl.h"
|
||||
#include "fmgr.h"
|
||||
#include "utils/guc.h"
|
||||
#include "port.h"
|
||||
#include "fmgr.h"
|
||||
|
||||
#include <curl/curl.h>
|
||||
|
||||
static int extension_server_port = 0;
|
||||
|
||||
static download_extension_file_hook_type prev_download_extension_file_hook = NULL;
|
||||
|
||||
// to download all SQL (and data) files for an extension:
|
||||
// curl -X POST http://localhost:8080/extension_server/postgis
|
||||
// it covers two possible extension files layouts:
|
||||
// 1. extension_name--version--platform.sql
|
||||
// 2. extension_name/extension_name--version.sql
|
||||
// extension_name/extra_files.csv
|
||||
//
|
||||
// to download specific library file:
|
||||
// curl -X POST http://localhost:8080/extension_server/postgis-3.so?is_library=true
|
||||
static bool
|
||||
neon_download_extension_file_http(const char *filename, bool is_library)
|
||||
{
|
||||
CURL *curl;
|
||||
CURLcode res;
|
||||
char *compute_ctl_url;
|
||||
char *postdata;
|
||||
bool ret = false;
|
||||
|
||||
if ((curl = curl_easy_init()) == NULL)
|
||||
{
|
||||
elog(ERROR, "Failed to initialize curl handle");
|
||||
}
|
||||
|
||||
compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
|
||||
extension_server_port, filename, is_library ? "?is_library=true" : "");
|
||||
|
||||
elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url);
|
||||
|
||||
curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST");
|
||||
curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url);
|
||||
// NOTE: 15L may be insufficient time for large extensions like postgis
|
||||
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 15L /* seconds */);
|
||||
|
||||
if (curl)
|
||||
{
|
||||
/* Perform the request, res will get the return code */
|
||||
res = curl_easy_perform(curl);
|
||||
/* Check for errors */
|
||||
if (res == CURLE_OK)
|
||||
{
|
||||
ret = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Don't error here because postgres will try to find the file
|
||||
// and will fail with some proper error message if it's not found.
|
||||
elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res));
|
||||
}
|
||||
|
||||
/* always cleanup */
|
||||
curl_easy_cleanup(curl);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void pg_init_extension_server()
|
||||
{
|
||||
// Port to connect to compute_ctl on localhost
|
||||
// to request extension files.
|
||||
DefineCustomIntVariable("neon.extension_server_port",
|
||||
"connection string to the compute_ctl",
|
||||
NULL,
|
||||
&extension_server_port,
|
||||
0, 0, INT_MAX,
|
||||
PGC_POSTMASTER,
|
||||
0, /* no flags required */
|
||||
NULL, NULL, NULL);
|
||||
|
||||
// set download_extension_file_hook
|
||||
prev_download_extension_file_hook = download_extension_file_hook;
|
||||
download_extension_file_hook = neon_download_extension_file_http;
|
||||
}
|
||||
@@ -1 +0,0 @@
|
||||
|
||||
@@ -35,11 +35,8 @@ _PG_init(void)
|
||||
{
|
||||
pg_init_libpagestore();
|
||||
pg_init_walproposer();
|
||||
|
||||
InitControlPlaneConnector();
|
||||
|
||||
pg_init_extension_server();
|
||||
|
||||
// Important: This must happen after other parts of the extension
|
||||
// are loaded, otherwise any settings to GUCs that were set before
|
||||
// the extension was loaded will be removed.
|
||||
|
||||
@@ -21,8 +21,6 @@ extern char *neon_tenant;
|
||||
extern void pg_init_libpagestore(void);
|
||||
extern void pg_init_walproposer(void);
|
||||
|
||||
extern void pg_init_extension_server(void);
|
||||
|
||||
/*
|
||||
* Returns true if we shouldn't do REDO on that block in record indicated by
|
||||
* block_id; false otherwise.
|
||||
|
||||
@@ -2231,18 +2231,6 @@ HandleSafekeeperResponse(void)
|
||||
if (n_synced >= quorum)
|
||||
{
|
||||
/* All safekeepers synced! */
|
||||
|
||||
/*
|
||||
* Send empty message to broadcast latest truncateLsn to all safekeepers.
|
||||
* This helps to finish next sync-safekeepers eailier, by skipping recovery
|
||||
* step.
|
||||
*
|
||||
* We don't need to wait for response because it doesn't affect correctness,
|
||||
* and TCP should be able to deliver the message to safekeepers in case of
|
||||
* network working properly.
|
||||
*/
|
||||
BroadcastAppendRequest();
|
||||
|
||||
fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn));
|
||||
exit(0);
|
||||
}
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
/// the outside. Similar to an ingress controller for HTTPS.
|
||||
use std::{net::SocketAddr, sync::Arc};
|
||||
|
||||
use futures::future::Either;
|
||||
use tokio::net::TcpListener;
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context};
|
||||
@@ -110,25 +109,20 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
let cancellation_token = CancellationToken::new();
|
||||
|
||||
let main = tokio::spawn(task_main(
|
||||
let main = proxy::flatten_err(tokio::spawn(task_main(
|
||||
Arc::new(destination),
|
||||
tls_config,
|
||||
proxy_listener,
|
||||
cancellation_token.clone(),
|
||||
));
|
||||
let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token));
|
||||
)));
|
||||
let signals_task = proxy::flatten_err(tokio::spawn(proxy::handle_signals(cancellation_token)));
|
||||
|
||||
// the signal task cant ever succeed.
|
||||
// the main task can error, or can succeed on cancellation.
|
||||
// we want to immediately exit on either of these cases
|
||||
let signal = match futures::future::select(signals_task, main).await {
|
||||
Either::Left((res, _)) => proxy::flatten_err(res)?,
|
||||
Either::Right((res, _)) => return proxy::flatten_err(res),
|
||||
};
|
||||
tokio::select! {
|
||||
res = main => { res?; },
|
||||
res = signals_task => { res?; },
|
||||
}
|
||||
|
||||
// maintenance tasks return `Infallible` success values, this is an impossible value
|
||||
// so this match statically ensures that there are no possibilities for that value
|
||||
match signal {}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn task_main(
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
use futures::future::Either;
|
||||
use proxy::auth;
|
||||
use proxy::console;
|
||||
use proxy::http;
|
||||
@@ -7,10 +6,8 @@ use proxy::metrics;
|
||||
use anyhow::bail;
|
||||
use clap::{self, Arg};
|
||||
use proxy::config::{self, ProxyConfig};
|
||||
use std::pin::pin;
|
||||
use std::{borrow::Cow, net::SocketAddr};
|
||||
use tokio::net::TcpListener;
|
||||
use tokio::task::JoinSet;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::info;
|
||||
use tracing::warn;
|
||||
@@ -46,59 +43,43 @@ async fn main() -> anyhow::Result<()> {
|
||||
let proxy_listener = TcpListener::bind(proxy_address).await?;
|
||||
let cancellation_token = CancellationToken::new();
|
||||
|
||||
// client facing tasks. these will exit on error or on cancellation
|
||||
// cancellation returns Ok(())
|
||||
let mut client_tasks = JoinSet::new();
|
||||
client_tasks.spawn(proxy::proxy::task_main(
|
||||
let mut client_tasks = vec![tokio::spawn(proxy::proxy::task_main(
|
||||
config,
|
||||
proxy_listener,
|
||||
cancellation_token.clone(),
|
||||
));
|
||||
))];
|
||||
|
||||
if let Some(wss_address) = args.get_one::<String>("wss") {
|
||||
let wss_address: SocketAddr = wss_address.parse()?;
|
||||
info!("Starting wss on {wss_address}");
|
||||
let wss_listener = TcpListener::bind(wss_address).await?;
|
||||
|
||||
client_tasks.spawn(http::websocket::task_main(
|
||||
client_tasks.push(tokio::spawn(http::websocket::task_main(
|
||||
config,
|
||||
wss_listener,
|
||||
cancellation_token.clone(),
|
||||
));
|
||||
)));
|
||||
}
|
||||
|
||||
// maintenance tasks. these never return unless there's an error
|
||||
let mut maintenance_tasks = JoinSet::new();
|
||||
maintenance_tasks.spawn(proxy::handle_signals(cancellation_token));
|
||||
maintenance_tasks.spawn(http::server::task_main(http_listener));
|
||||
maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener));
|
||||
let mut tasks = vec![
|
||||
tokio::spawn(proxy::handle_signals(cancellation_token)),
|
||||
tokio::spawn(http::server::task_main(http_listener)),
|
||||
tokio::spawn(console::mgmt::task_main(mgmt_listener)),
|
||||
];
|
||||
|
||||
if let Some(metrics_config) = &config.metric_collection {
|
||||
maintenance_tasks.spawn(metrics::task_main(metrics_config));
|
||||
tasks.push(tokio::spawn(metrics::task_main(metrics_config)));
|
||||
}
|
||||
|
||||
let maintenance = loop {
|
||||
// get one complete task
|
||||
match futures::future::select(
|
||||
pin!(maintenance_tasks.join_next()),
|
||||
pin!(client_tasks.join_next()),
|
||||
)
|
||||
.await
|
||||
{
|
||||
// exit immediately on maintenance task completion
|
||||
Either::Left((Some(res), _)) => break proxy::flatten_err(res)?,
|
||||
// exit with error immediately if all maintenance tasks have ceased (should be caught by branch above)
|
||||
Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"),
|
||||
// exit immediately on client task error
|
||||
Either::Right((Some(res), _)) => proxy::flatten_err(res)?,
|
||||
// exit if all our client tasks have shutdown gracefully
|
||||
Either::Right((None, _)) => return Ok(()),
|
||||
}
|
||||
};
|
||||
|
||||
// maintenance tasks return Infallible success values, this is an impossible value
|
||||
// so this match statically ensures that there are no possibilities for that value
|
||||
match maintenance {}
|
||||
let tasks = futures::future::try_join_all(tasks.into_iter().map(proxy::flatten_err));
|
||||
let client_tasks =
|
||||
futures::future::try_join_all(client_tasks.into_iter().map(proxy::flatten_err));
|
||||
tokio::select! {
|
||||
// We are only expecting an error from these forever tasks
|
||||
res = tasks => { res?; },
|
||||
res = client_tasks => { res?; },
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// ProxyConfig is created at proxy startup, and lives forever.
|
||||
|
||||
@@ -6,9 +6,10 @@ use anyhow::Context;
|
||||
use once_cell::sync::Lazy;
|
||||
use postgres_backend::{self, AuthType, PostgresBackend, PostgresBackendTCP, QueryError};
|
||||
use pq_proto::{BeMessage, SINGLE_COL_ROWDESC};
|
||||
use std::{convert::Infallible, future};
|
||||
use std::future;
|
||||
use tokio::net::{TcpListener, TcpStream};
|
||||
use tracing::{error, info, info_span, Instrument};
|
||||
use tracing_utils::instrument::InstrumentCancel;
|
||||
|
||||
static CPLANE_WAITERS: Lazy<Waiters<ComputeReady>> = Lazy::new(Default::default);
|
||||
|
||||
@@ -31,7 +32,7 @@ pub fn notify(psql_session_id: &str, msg: ComputeReady) -> Result<(), waiters::N
|
||||
|
||||
/// Console management API listener task.
|
||||
/// It spawns console response handlers needed for the link auth.
|
||||
pub async fn task_main(listener: TcpListener) -> anyhow::Result<Infallible> {
|
||||
pub async fn task_main(listener: TcpListener) -> anyhow::Result<()> {
|
||||
scopeguard::defer! {
|
||||
info!("mgmt has shut down");
|
||||
}
|
||||
@@ -50,23 +51,14 @@ pub async fn task_main(listener: TcpListener) -> anyhow::Result<Infallible> {
|
||||
async move {
|
||||
info!("serving a new console management API connection");
|
||||
|
||||
// these might be long running connections, have a separate logging for cancelling
|
||||
// on shutdown and other ways of stopping.
|
||||
let cancelled = scopeguard::guard(tracing::Span::current(), |span| {
|
||||
let _e = span.entered();
|
||||
info!("console management API task cancelled");
|
||||
});
|
||||
|
||||
if let Err(e) = handle_connection(socket).await {
|
||||
error!("serving failed with an error: {e}");
|
||||
} else {
|
||||
info!("serving completed");
|
||||
}
|
||||
|
||||
// we can no longer get dropped
|
||||
scopeguard::ScopeGuard::into_inner(cancelled);
|
||||
}
|
||||
.instrument(span),
|
||||
.instrument(span)
|
||||
.with_cancel_info(),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,10 +10,7 @@ use crate::{auth, console};
|
||||
|
||||
use super::sql_over_http::MAX_RESPONSE_SIZE;
|
||||
|
||||
use crate::proxy::{
|
||||
can_retry_tokio_postgres_error, invalidate_cache, retry_after, try_wake,
|
||||
NUM_RETRIES_WAKE_COMPUTE,
|
||||
};
|
||||
use crate::proxy::{invalidate_cache, retry_after, try_wake, NUM_RETRIES_WAKE_COMPUTE};
|
||||
|
||||
use tracing::error;
|
||||
use tracing::info;
|
||||
@@ -275,11 +272,19 @@ async fn connect_to_compute(
|
||||
}
|
||||
|
||||
fn can_retry_error(err: &tokio_postgres::Error, num_retries: u32) -> bool {
|
||||
match err {
|
||||
use tokio_postgres::error::SqlState;
|
||||
match err.code() {
|
||||
// retry all errors at least once
|
||||
_ if num_retries == 0 => true,
|
||||
_ if num_retries >= NUM_RETRIES_WAKE_COMPUTE => false,
|
||||
err => can_retry_tokio_postgres_error(err),
|
||||
// keep retrying connection errors
|
||||
Some(
|
||||
&SqlState::CONNECTION_FAILURE
|
||||
| &SqlState::CONNECTION_EXCEPTION
|
||||
| &SqlState::CONNECTION_DOES_NOT_EXIST
|
||||
| &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION,
|
||||
) if num_retries < NUM_RETRIES_WAKE_COMPUTE => true,
|
||||
// otherwise, don't retry
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use anyhow::{anyhow, bail};
|
||||
use anyhow::anyhow;
|
||||
use hyper::{Body, Request, Response, StatusCode};
|
||||
use std::{convert::Infallible, net::TcpListener};
|
||||
use std::net::TcpListener;
|
||||
use tracing::info;
|
||||
use utils::http::{endpoint, error::ApiError, json::json_response, RouterBuilder, RouterService};
|
||||
|
||||
@@ -12,7 +12,7 @@ fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
|
||||
endpoint::make_router().get("/v1/status", status_handler)
|
||||
}
|
||||
|
||||
pub async fn task_main(http_listener: TcpListener) -> anyhow::Result<Infallible> {
|
||||
pub async fn task_main(http_listener: TcpListener) -> anyhow::Result<()> {
|
||||
scopeguard::defer! {
|
||||
info!("http has shut down");
|
||||
}
|
||||
@@ -23,5 +23,5 @@ pub async fn task_main(http_listener: TcpListener) -> anyhow::Result<Infallible>
|
||||
.serve(service().map_err(|e| anyhow!(e))?)
|
||||
.await?;
|
||||
|
||||
bail!("hyper server without shutdown handling cannot shutdown successfully");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use std::convert::Infallible;
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use futures::{Future, FutureExt};
|
||||
use tokio::task::JoinError;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::warn;
|
||||
@@ -24,7 +23,7 @@ pub mod url;
|
||||
pub mod waiters;
|
||||
|
||||
/// Handle unix signals appropriately.
|
||||
pub async fn handle_signals(token: CancellationToken) -> anyhow::Result<Infallible> {
|
||||
pub async fn handle_signals(token: CancellationToken) -> anyhow::Result<()> {
|
||||
use tokio::signal::unix::{signal, SignalKind};
|
||||
|
||||
let mut hangup = signal(SignalKind::hangup())?;
|
||||
@@ -51,6 +50,8 @@ pub async fn handle_signals(token: CancellationToken) -> anyhow::Result<Infallib
|
||||
}
|
||||
|
||||
/// Flattens `Result<Result<T>>` into `Result<T>`.
|
||||
pub fn flatten_err<T>(r: Result<anyhow::Result<T>, JoinError>) -> anyhow::Result<T> {
|
||||
r.context("join error").and_then(|x| x)
|
||||
pub async fn flatten_err(
|
||||
f: impl Future<Output = Result<anyhow::Result<()>, JoinError>>,
|
||||
) -> anyhow::Result<()> {
|
||||
f.map(|r| r.context("join error").and_then(|x| x)).await
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@ use crate::{config::MetricCollectionConfig, http};
|
||||
use chrono::{DateTime, Utc};
|
||||
use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
|
||||
use serde::Serialize;
|
||||
use std::{collections::HashMap, convert::Infallible, time::Duration};
|
||||
use std::{collections::HashMap, time::Duration};
|
||||
use tracing::{error, info, instrument, trace, warn};
|
||||
|
||||
const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
|
||||
@@ -26,7 +26,7 @@ pub struct Ids {
|
||||
pub branch_id: String,
|
||||
}
|
||||
|
||||
pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<Infallible> {
|
||||
pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<()> {
|
||||
info!("metrics collector config: {config:?}");
|
||||
scopeguard::defer! {
|
||||
info!("metrics collector has shut down");
|
||||
|
||||
@@ -20,7 +20,7 @@ use hyper::StatusCode;
|
||||
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
|
||||
use once_cell::sync::Lazy;
|
||||
use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
|
||||
use std::{error::Error, ops::ControlFlow, sync::Arc};
|
||||
use std::{ops::ControlFlow, sync::Arc};
|
||||
use tokio::{
|
||||
io::{AsyncRead, AsyncWrite, AsyncWriteExt},
|
||||
time,
|
||||
@@ -162,10 +162,7 @@ pub async fn handle_ws_client(
|
||||
.map(|_| auth::ClientCredentials::parse(¶ms, hostname, common_names))
|
||||
.transpose();
|
||||
|
||||
match result {
|
||||
Ok(creds) => creds,
|
||||
Err(e) => stream.throw_error(e).await?,
|
||||
}
|
||||
async { result }.or_else(|e| stream.throw_error(e)).await?
|
||||
};
|
||||
|
||||
let client = Client::new(stream, creds, ¶ms, session_id, false);
|
||||
@@ -204,10 +201,7 @@ async fn handle_client(
|
||||
.map(|_| auth::ClientCredentials::parse(¶ms, sni, common_names))
|
||||
.transpose();
|
||||
|
||||
match result {
|
||||
Ok(creds) => creds,
|
||||
Err(e) => stream.throw_error(e).await?,
|
||||
}
|
||||
async { result }.or_else(|e| stream.throw_error(e)).await?
|
||||
};
|
||||
|
||||
let allow_self_signed_compute = config.allow_self_signed_compute;
|
||||
@@ -391,9 +385,6 @@ async fn connect_to_compute(
|
||||
time::Duration::from_secs(10)
|
||||
};
|
||||
|
||||
// do this again to ensure we have username?
|
||||
node_info.config.set_startup_params(params);
|
||||
|
||||
match connect_to_compute_once(node_info, timeout).await {
|
||||
Ok(res) => return Ok(res),
|
||||
Err(e) => {
|
||||
@@ -445,45 +436,24 @@ pub async fn try_wake(
|
||||
}
|
||||
|
||||
fn can_retry_error(err: &compute::ConnectionError, num_retries: u32) -> bool {
|
||||
use std::io::ErrorKind;
|
||||
match err {
|
||||
// retry all errors at least once
|
||||
_ if num_retries == 0 => true,
|
||||
_ if num_retries >= NUM_RETRIES_WAKE_COMPUTE => false,
|
||||
compute::ConnectionError::Postgres(err) => can_retry_tokio_postgres_error(err),
|
||||
compute::ConnectionError::CouldNotConnect(err) => is_io_connection_err(err),
|
||||
// keep retrying connection errors
|
||||
compute::ConnectionError::CouldNotConnect(io_err)
|
||||
if num_retries < NUM_RETRIES_WAKE_COMPUTE =>
|
||||
{
|
||||
matches!(
|
||||
io_err.kind(),
|
||||
ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable
|
||||
)
|
||||
}
|
||||
// otherwise, don't retry
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn can_retry_tokio_postgres_error(err: &tokio_postgres::Error) -> bool {
|
||||
if let Some(io_err) = err.source().and_then(|x| x.downcast_ref()) {
|
||||
is_io_connection_err(io_err)
|
||||
} else if let Some(db_err) = err.source().and_then(|x| x.downcast_ref()) {
|
||||
is_sql_connection_err(db_err)
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
fn is_sql_connection_err(err: &tokio_postgres::error::DbError) -> bool {
|
||||
use tokio_postgres::error::SqlState;
|
||||
matches!(
|
||||
err.code(),
|
||||
&SqlState::CONNECTION_FAILURE
|
||||
| &SqlState::CONNECTION_EXCEPTION
|
||||
| &SqlState::CONNECTION_DOES_NOT_EXIST
|
||||
| &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION,
|
||||
)
|
||||
}
|
||||
|
||||
fn is_io_connection_err(err: &std::io::Error) -> bool {
|
||||
use std::io::ErrorKind;
|
||||
matches!(
|
||||
err.kind(),
|
||||
ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable | ErrorKind::TimedOut
|
||||
)
|
||||
}
|
||||
|
||||
pub fn retry_after(num_retries: u32) -> time::Duration {
|
||||
match num_retries {
|
||||
0 => time::Duration::ZERO,
|
||||
@@ -622,13 +592,15 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
|
||||
application_name: params.get("application_name"),
|
||||
};
|
||||
|
||||
let auth_result = match creds
|
||||
.authenticate(&extra, &mut stream, allow_cleartext)
|
||||
.await
|
||||
{
|
||||
Ok(auth_result) => auth_result,
|
||||
Err(e) => return stream.throw_error(e).await,
|
||||
};
|
||||
let auth_result = async {
|
||||
// `&mut stream` doesn't let us merge those 2 lines.
|
||||
let res = creds
|
||||
.authenticate(&extra, &mut stream, allow_cleartext)
|
||||
.await;
|
||||
|
||||
async { res }.or_else(|e| stream.throw_error(e)).await
|
||||
}
|
||||
.await?;
|
||||
|
||||
let AuthSuccess {
|
||||
reported_auth_ok,
|
||||
|
||||
@@ -205,25 +205,29 @@ module.exports = async ({ github, context, fetch, report }) => {
|
||||
|
||||
const {reportUrl, reportJsonUrl} = report
|
||||
|
||||
if (reportUrl && reportJsonUrl) {
|
||||
try {
|
||||
const parsed = await parseReportJson({ reportJsonUrl, fetch })
|
||||
commentBody += await reportSummary({ ...parsed, reportUrl })
|
||||
} catch (error) {
|
||||
commentBody += `### [full report](${reportUrl})\n___\n`
|
||||
commentBody += `#### Failed to create a summary for the test run: \n`
|
||||
commentBody += "```\n"
|
||||
commentBody += `${error.stack}\n`
|
||||
commentBody += "```\n"
|
||||
commentBody += "\nTo reproduce and debug the error locally run:\n"
|
||||
commentBody += "```\n"
|
||||
commentBody += `scripts/comment-test-report.js ${reportJsonUrl}`
|
||||
commentBody += "\n```\n"
|
||||
}
|
||||
} else {
|
||||
if (!reportUrl || !reportJsonUrl) {
|
||||
commentBody += `#### No tests were run or test report is not available\n`
|
||||
commentBody += autoupdateNotice
|
||||
return
|
||||
}
|
||||
|
||||
try {
|
||||
const parsed = await parseReportJson({ reportJsonUrl, fetch })
|
||||
commentBody += await reportSummary({ ...parsed, reportUrl })
|
||||
} catch (error) {
|
||||
commentBody += `### [full report](${reportUrl})\n___\n`
|
||||
commentBody += `#### Failed to create a summary for the test run: \n`
|
||||
commentBody += "```\n"
|
||||
commentBody += `${error.stack}\n`
|
||||
commentBody += "```\n"
|
||||
commentBody += "\nTo reproduce and debug the error locally run:\n"
|
||||
commentBody += "```\n"
|
||||
commentBody += `scripts/comment-test-report.js ${reportJsonUrl}`
|
||||
commentBody += "\n```\n"
|
||||
}
|
||||
|
||||
commentBody += autoupdateNotice
|
||||
|
||||
let createCommentFn, listCommentsFn, updateCommentFn, issueNumberOrSha
|
||||
if (isPullRequest) {
|
||||
createCommentFn = github.rest.issues.createComment
|
||||
|
||||
@@ -529,16 +529,6 @@ def available_remote_storages() -> List[RemoteStorageKind]:
|
||||
return remote_storages
|
||||
|
||||
|
||||
def available_s3_storages() -> List[RemoteStorageKind]:
|
||||
remote_storages = [RemoteStorageKind.MOCK_S3]
|
||||
if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE") is not None:
|
||||
remote_storages.append(RemoteStorageKind.REAL_S3)
|
||||
log.info("Enabling real s3 storage for tests")
|
||||
else:
|
||||
log.info("Using mock implementations to test remote storage")
|
||||
return remote_storages
|
||||
|
||||
|
||||
@dataclass
|
||||
class LocalFsStorage:
|
||||
root: Path
|
||||
@@ -559,16 +549,6 @@ class S3Storage:
|
||||
"AWS_SECRET_ACCESS_KEY": self.secret_key,
|
||||
}
|
||||
|
||||
def to_string(self) -> str:
|
||||
return json.dumps(
|
||||
{
|
||||
"bucket": self.bucket_name,
|
||||
"region": self.bucket_region,
|
||||
"endpoint": self.endpoint,
|
||||
"prefix": self.prefix_in_bucket,
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
RemoteStorage = Union[LocalFsStorage, S3Storage]
|
||||
|
||||
@@ -635,12 +615,10 @@ class NeonEnvBuilder:
|
||||
self.rust_log_override = rust_log_override
|
||||
self.port_distributor = port_distributor
|
||||
self.remote_storage = remote_storage
|
||||
self.ext_remote_storage: Optional[S3Storage] = None
|
||||
self.remote_storage_client: Optional[Any] = None
|
||||
self.remote_storage_users = remote_storage_users
|
||||
self.broker = broker
|
||||
self.run_id = run_id
|
||||
self.mock_s3_server: MockS3Server = mock_s3_server
|
||||
self.mock_s3_server = mock_s3_server
|
||||
self.pageserver_config_override = pageserver_config_override
|
||||
self.num_safekeepers = num_safekeepers
|
||||
self.safekeepers_id_start = safekeepers_id_start
|
||||
@@ -688,24 +666,15 @@ class NeonEnvBuilder:
|
||||
remote_storage_kind: RemoteStorageKind,
|
||||
test_name: str,
|
||||
force_enable: bool = True,
|
||||
enable_remote_extensions: bool = False,
|
||||
):
|
||||
if remote_storage_kind == RemoteStorageKind.NOOP:
|
||||
return
|
||||
elif remote_storage_kind == RemoteStorageKind.LOCAL_FS:
|
||||
self.enable_local_fs_remote_storage(force_enable=force_enable)
|
||||
elif remote_storage_kind == RemoteStorageKind.MOCK_S3:
|
||||
self.enable_mock_s3_remote_storage(
|
||||
bucket_name=test_name,
|
||||
force_enable=force_enable,
|
||||
enable_remote_extensions=enable_remote_extensions,
|
||||
)
|
||||
self.enable_mock_s3_remote_storage(bucket_name=test_name, force_enable=force_enable)
|
||||
elif remote_storage_kind == RemoteStorageKind.REAL_S3:
|
||||
self.enable_real_s3_remote_storage(
|
||||
test_name=test_name,
|
||||
force_enable=force_enable,
|
||||
enable_remote_extensions=enable_remote_extensions,
|
||||
)
|
||||
self.enable_real_s3_remote_storage(test_name=test_name, force_enable=force_enable)
|
||||
else:
|
||||
raise RuntimeError(f"Unknown storage type: {remote_storage_kind}")
|
||||
|
||||
@@ -719,15 +688,11 @@ class NeonEnvBuilder:
|
||||
assert force_enable or self.remote_storage is None, "remote storage is enabled already"
|
||||
self.remote_storage = LocalFsStorage(Path(self.repo_dir / "local_fs_remote_storage"))
|
||||
|
||||
def enable_mock_s3_remote_storage(
|
||||
self, bucket_name: str, force_enable: bool = True, enable_remote_extensions: bool = False
|
||||
):
|
||||
def enable_mock_s3_remote_storage(self, bucket_name: str, force_enable: bool = True):
|
||||
"""
|
||||
Sets up the pageserver to use the S3 mock server, creates the bucket, if it's not present already.
|
||||
Starts up the mock server, if that does not run yet.
|
||||
Errors, if the pageserver has some remote storage configuration already, unless `force_enable` is not set to `True`.
|
||||
|
||||
Also creates the bucket for extensions, self.ext_remote_storage bucket
|
||||
"""
|
||||
assert force_enable or self.remote_storage is None, "remote storage is enabled already"
|
||||
mock_endpoint = self.mock_s3_server.endpoint()
|
||||
@@ -748,22 +713,9 @@ class NeonEnvBuilder:
|
||||
bucket_region=mock_region,
|
||||
access_key=self.mock_s3_server.access_key(),
|
||||
secret_key=self.mock_s3_server.secret_key(),
|
||||
prefix_in_bucket="pageserver",
|
||||
)
|
||||
|
||||
if enable_remote_extensions:
|
||||
self.ext_remote_storage = S3Storage(
|
||||
bucket_name=bucket_name,
|
||||
endpoint=mock_endpoint,
|
||||
bucket_region=mock_region,
|
||||
access_key=self.mock_s3_server.access_key(),
|
||||
secret_key=self.mock_s3_server.secret_key(),
|
||||
prefix_in_bucket="ext",
|
||||
)
|
||||
|
||||
def enable_real_s3_remote_storage(
|
||||
self, test_name: str, force_enable: bool = True, enable_remote_extensions: bool = False
|
||||
):
|
||||
def enable_real_s3_remote_storage(self, test_name: str, force_enable: bool = True):
|
||||
"""
|
||||
Sets up configuration to use real s3 endpoint without mock server
|
||||
"""
|
||||
@@ -803,15 +755,6 @@ class NeonEnvBuilder:
|
||||
prefix_in_bucket=self.remote_storage_prefix,
|
||||
)
|
||||
|
||||
if enable_remote_extensions:
|
||||
self.ext_remote_storage = S3Storage(
|
||||
bucket_name="neon-dev-extensions",
|
||||
bucket_region="eu-central-1",
|
||||
access_key=access_key,
|
||||
secret_key=secret_key,
|
||||
prefix_in_bucket="5555",
|
||||
)
|
||||
|
||||
def cleanup_local_storage(self):
|
||||
if self.preserve_database_files:
|
||||
return
|
||||
@@ -845,7 +788,6 @@ class NeonEnvBuilder:
|
||||
# `self.remote_storage_prefix` is coupled with `S3Storage` storage type,
|
||||
# so this line effectively a no-op
|
||||
assert isinstance(self.remote_storage, S3Storage)
|
||||
assert self.remote_storage_client is not None
|
||||
|
||||
if self.keep_remote_storage_contents:
|
||||
log.info("keep_remote_storage_contents skipping remote storage cleanup")
|
||||
@@ -975,8 +917,6 @@ class NeonEnv:
|
||||
self.neon_binpath = config.neon_binpath
|
||||
self.pg_distrib_dir = config.pg_distrib_dir
|
||||
self.endpoint_counter = 0
|
||||
self.remote_storage_client = config.remote_storage_client
|
||||
self.ext_remote_storage = config.ext_remote_storage
|
||||
|
||||
# generate initial tenant ID here instead of letting 'neon init' generate it,
|
||||
# so that we don't need to dig it out of the config file afterwards.
|
||||
@@ -1563,7 +1503,6 @@ class NeonCli(AbstractNeonCli):
|
||||
safekeepers: Optional[List[int]] = None,
|
||||
tenant_id: Optional[TenantId] = None,
|
||||
lsn: Optional[Lsn] = None,
|
||||
remote_ext_config: Optional[str] = None,
|
||||
) -> "subprocess.CompletedProcess[str]":
|
||||
args = [
|
||||
"endpoint",
|
||||
@@ -1573,8 +1512,6 @@ class NeonCli(AbstractNeonCli):
|
||||
"--pg-version",
|
||||
self.env.pg_version,
|
||||
]
|
||||
if remote_ext_config is not None:
|
||||
args.extend(["--remote-ext-config", remote_ext_config])
|
||||
if lsn is not None:
|
||||
args.append(f"--lsn={lsn}")
|
||||
args.extend(["--pg-port", str(pg_port)])
|
||||
@@ -2436,7 +2373,7 @@ class Endpoint(PgProtocol):
|
||||
|
||||
return self
|
||||
|
||||
def start(self, remote_ext_config: Optional[str] = None) -> "Endpoint":
|
||||
def start(self) -> "Endpoint":
|
||||
"""
|
||||
Start the Postgres instance.
|
||||
Returns self.
|
||||
@@ -2452,7 +2389,6 @@ class Endpoint(PgProtocol):
|
||||
http_port=self.http_port,
|
||||
tenant_id=self.tenant_id,
|
||||
safekeepers=self.active_safekeepers,
|
||||
remote_ext_config=remote_ext_config,
|
||||
)
|
||||
self.running = True
|
||||
|
||||
@@ -2542,7 +2478,6 @@ class Endpoint(PgProtocol):
|
||||
hot_standby: bool = False,
|
||||
lsn: Optional[Lsn] = None,
|
||||
config_lines: Optional[List[str]] = None,
|
||||
remote_ext_config: Optional[str] = None,
|
||||
) -> "Endpoint":
|
||||
"""
|
||||
Create an endpoint, apply config, and start Postgres.
|
||||
@@ -2557,7 +2492,7 @@ class Endpoint(PgProtocol):
|
||||
config_lines=config_lines,
|
||||
hot_standby=hot_standby,
|
||||
lsn=lsn,
|
||||
).start(remote_ext_config=remote_ext_config)
|
||||
).start()
|
||||
|
||||
log.info(f"Postgres startup took {time.time() - started_at} seconds")
|
||||
|
||||
@@ -2591,7 +2526,6 @@ class EndpointFactory:
|
||||
lsn: Optional[Lsn] = None,
|
||||
hot_standby: bool = False,
|
||||
config_lines: Optional[List[str]] = None,
|
||||
remote_ext_config: Optional[str] = None,
|
||||
) -> Endpoint:
|
||||
ep = Endpoint(
|
||||
self.env,
|
||||
@@ -2608,7 +2542,6 @@ class EndpointFactory:
|
||||
hot_standby=hot_standby,
|
||||
config_lines=config_lines,
|
||||
lsn=lsn,
|
||||
remote_ext_config=remote_ext_config,
|
||||
)
|
||||
|
||||
def create(
|
||||
@@ -3176,18 +3109,3 @@ def last_flush_lsn_upload(
|
||||
ps_http.timeline_checkpoint(tenant_id, timeline_id)
|
||||
wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn)
|
||||
return last_flush_lsn
|
||||
|
||||
|
||||
def parse_project_git_version_output(s: str) -> str:
|
||||
"""
|
||||
Parses the git commit hash out of the --version output supported at least by neon_local.
|
||||
|
||||
The information is generated by utils::project_git_version!
|
||||
"""
|
||||
import re
|
||||
|
||||
res = re.search(r"git(-env)?:([0-9a-fA-F]{8,40})(-\S+)?", s)
|
||||
if res and (commit := res.group(2)):
|
||||
return commit
|
||||
|
||||
raise ValueError(f"unable to parse --version output: '{s}'")
|
||||
|
||||
@@ -89,9 +89,6 @@ class TenantId(Id):
|
||||
def __repr__(self) -> str:
|
||||
return f'`TenantId("{self.id.hex()}")'
|
||||
|
||||
def __str__(self) -> str:
|
||||
return self.id.hex()
|
||||
|
||||
|
||||
class TimelineId(Id):
|
||||
def __repr__(self) -> str:
|
||||
|
||||
@@ -60,11 +60,6 @@ def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenc
|
||||
value = metrics[key]
|
||||
zenbenchmark.record(name, value, "ms", report=MetricReport.LOWER_IS_BETTER)
|
||||
|
||||
# Check basebackup size makes sense
|
||||
basebackup_bytes = metrics["basebackup_bytes"]
|
||||
if i > 0:
|
||||
assert basebackup_bytes < 100 * 1024
|
||||
|
||||
# Stop so we can restart
|
||||
endpoint.stop()
|
||||
|
||||
|
||||
Binary file not shown.
Binary file not shown.
@@ -1,14 +0,0 @@
|
||||
{
|
||||
"enabled_extensions": {
|
||||
"123454321": [
|
||||
"anon"
|
||||
],
|
||||
"public": [
|
||||
"embedding"
|
||||
]
|
||||
},
|
||||
"control_data": {
|
||||
"embedding": "comment = 'hnsw index' \ndefault_version = '0.1.0' \nmodule_pathname = '$libdir/embedding' \nrelocatable = true \ntrusted = true",
|
||||
"anon": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
|
||||
}
|
||||
}
|
||||
Binary file not shown.
Binary file not shown.
@@ -1,14 +0,0 @@
|
||||
{
|
||||
"enabled_extensions": {
|
||||
"123454321": [
|
||||
"anon"
|
||||
],
|
||||
"public": [
|
||||
"embedding"
|
||||
]
|
||||
},
|
||||
"control_data": {
|
||||
"embedding": "comment = 'hnsw index' \ndefault_version = '0.1.0' \nmodule_pathname = '$libdir/embedding' \nrelocatable = true \ntrusted = true",
|
||||
"anon": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
|
||||
}
|
||||
}
|
||||
@@ -14,7 +14,6 @@ from fixtures.neon_fixtures import (
|
||||
NeonEnvBuilder,
|
||||
PgBin,
|
||||
PortDistributor,
|
||||
parse_project_git_version_output,
|
||||
)
|
||||
from fixtures.pageserver.http import PageserverHttpClient
|
||||
from fixtures.pageserver.utils import (
|
||||
@@ -73,9 +72,9 @@ def test_create_snapshot(
|
||||
".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
|
||||
)
|
||||
|
||||
pg_bin.run_capture(["pgbench", "--initialize", "--scale=10", endpoint.connstr()])
|
||||
pg_bin.run_capture(["pgbench", "--time=60", "--progress=2", endpoint.connstr()])
|
||||
pg_bin.run_capture(
|
||||
pg_bin.run(["pgbench", "--initialize", "--scale=10", endpoint.connstr()])
|
||||
pg_bin.run(["pgbench", "--time=60", "--progress=2", endpoint.connstr()])
|
||||
pg_bin.run(
|
||||
["pg_dumpall", f"--dbname={endpoint.connstr()}", f"--file={test_output_dir / 'dump.sql'}"]
|
||||
)
|
||||
|
||||
@@ -353,7 +352,7 @@ def prepare_snapshot(
|
||||
# get git SHA of neon binary
|
||||
def get_neon_version(neon_binpath: Path):
|
||||
out = subprocess.check_output([neon_binpath / "neon_local", "--version"]).decode("utf-8")
|
||||
return parse_project_git_version_output(out)
|
||||
return out.split("git:", 1)[1].rstrip()
|
||||
|
||||
|
||||
def check_neon_works(
|
||||
@@ -405,9 +404,7 @@ def check_neon_works(
|
||||
request.addfinalizer(lambda: cli_current.endpoint_stop("main"))
|
||||
|
||||
connstr = f"host=127.0.0.1 port={pg_port} user=cloud_admin dbname=postgres"
|
||||
pg_bin.run_capture(
|
||||
["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump.sql'}"]
|
||||
)
|
||||
pg_bin.run(["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump.sql'}"])
|
||||
initial_dump_differs = dump_differs(
|
||||
repo_dir.parent / "dump.sql",
|
||||
test_output_dir / "dump.sql",
|
||||
@@ -427,7 +424,7 @@ def check_neon_works(
|
||||
shutil.rmtree(repo_dir / "local_fs_remote_storage")
|
||||
timeline_delete_wait_completed(pageserver_http, tenant_id, timeline_id)
|
||||
pageserver_http.timeline_create(pg_version, tenant_id, timeline_id)
|
||||
pg_bin.run_capture(
|
||||
pg_bin.run(
|
||||
["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump-from-wal.sql'}"]
|
||||
)
|
||||
# The assert itself deferred to the end of the test
|
||||
@@ -439,7 +436,7 @@ def check_neon_works(
|
||||
)
|
||||
|
||||
# Check that we can interract with the data
|
||||
pg_bin.run_capture(["pgbench", "--time=10", "--progress=2", connstr])
|
||||
pg_bin.run(["pgbench", "--time=10", "--progress=2", connstr])
|
||||
|
||||
assert not dump_from_wal_differs, "dump from WAL differs"
|
||||
assert not initial_dump_differs, "initial dump differs"
|
||||
|
||||
@@ -1,122 +0,0 @@
|
||||
import os
|
||||
import shutil
|
||||
from contextlib import closing
|
||||
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnvBuilder,
|
||||
RemoteStorageKind,
|
||||
available_s3_storages,
|
||||
)
|
||||
from fixtures.pg_version import PgVersion
|
||||
|
||||
# Generate mock extension files and upload them to the mock bucket.
|
||||
#
|
||||
# NOTE: You must have appropriate AWS credentials to run REAL_S3 test.
|
||||
# It may also be necessary to set the following environment variables for MOCK_S3 test:
|
||||
# export AWS_ACCESS_KEY_ID='test' # export AWS_SECRET_ACCESS_KEY='test'
|
||||
# export AWS_SECURITY_TOKEN='test' # export AWS_SESSION_TOKEN='test'
|
||||
# export AWS_DEFAULT_REGION='us-east-1'
|
||||
|
||||
|
||||
@pytest.mark.parametrize("remote_storage_kind", available_s3_storages())
|
||||
def test_remote_extensions(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
remote_storage_kind: RemoteStorageKind,
|
||||
pg_version: PgVersion,
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_remote_extensions",
|
||||
enable_remote_extensions=True,
|
||||
)
|
||||
neon_env_builder.num_safekeepers = 3
|
||||
env = neon_env_builder.init_start()
|
||||
tenant_id, _ = env.neon_cli.create_tenant()
|
||||
env.neon_cli.create_timeline("test_remote_extensions", tenant_id=tenant_id)
|
||||
|
||||
# For MOCK_S3 we upload test files.
|
||||
# For REAL_S3 we use the files already in the bucket
|
||||
if remote_storage_kind == RemoteStorageKind.MOCK_S3:
|
||||
log.info("Uploading test files to mock bucket")
|
||||
|
||||
def upload_test_file(from_path, to_path):
|
||||
assert env.ext_remote_storage is not None # satisfy mypy
|
||||
assert env.remote_storage_client is not None # satisfy mypy
|
||||
with open(
|
||||
f"test_runner/regress/data/extension_test/v{pg_version}/{from_path}", "rb"
|
||||
) as f:
|
||||
env.remote_storage_client.upload_fileobj(
|
||||
f,
|
||||
env.ext_remote_storage.bucket_name,
|
||||
f"ext/v{pg_version}/{to_path}",
|
||||
)
|
||||
|
||||
upload_test_file("ext_index.json", "ext_index.json")
|
||||
upload_test_file("anon.tar.gz", "extensions/anon.tar.gz")
|
||||
upload_test_file("embedding.tar.gz", "extensions/embedding.tar.gz")
|
||||
|
||||
assert env.ext_remote_storage is not None # satisfy mypy
|
||||
assert env.remote_storage_client is not None # satisfy mypy
|
||||
try:
|
||||
# Start a compute node and check that it can download the extensions
|
||||
# and use them to CREATE EXTENSION and LOAD
|
||||
endpoint = env.endpoints.create_start(
|
||||
"test_remote_extensions",
|
||||
tenant_id=tenant_id,
|
||||
remote_ext_config=env.ext_remote_storage.to_string(),
|
||||
# config_lines=["log_min_messages=debug3"],
|
||||
)
|
||||
with closing(endpoint.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
# Check that appropriate control files were downloaded
|
||||
cur.execute("SELECT * FROM pg_available_extensions")
|
||||
all_extensions = [x[0] for x in cur.fetchall()]
|
||||
log.info(all_extensions)
|
||||
assert "anon" in all_extensions
|
||||
assert "embedding" in all_extensions
|
||||
# TODO: check that we cant't download custom extensions for other tenant ids
|
||||
|
||||
# check that we can download public extension
|
||||
cur.execute("CREATE EXTENSION embedding")
|
||||
cur.execute("SELECT extname FROM pg_extension")
|
||||
assert "embedding" in [x[0] for x in cur.fetchall()]
|
||||
|
||||
# check that we can download private extension
|
||||
try:
|
||||
cur.execute("CREATE EXTENSION anon")
|
||||
except Exception as err:
|
||||
log.info("error creating anon extension")
|
||||
assert "pgcrypto" in str(err), "unexpected error creating anon extension"
|
||||
|
||||
# TODO: try to load libraries as well
|
||||
|
||||
finally:
|
||||
# Cleaning up downloaded files is important for local tests
|
||||
# or else one test could reuse the files from another test or another test run
|
||||
cleanup_files = [
|
||||
"lib/postgresql/anon.so",
|
||||
"lib/postgresql/embedding.so",
|
||||
"share/postgresql/extension/anon.control",
|
||||
"share/postgresql/extension/embedding--0.1.0.sql",
|
||||
"share/postgresql/extension/embedding.control",
|
||||
]
|
||||
cleanup_files = [f"pg_install/v{pg_version}/" + x for x in cleanup_files]
|
||||
cleanup_folders = [
|
||||
"extensions",
|
||||
f"pg_install/v{pg_version}/share/postgresql/extension/anon",
|
||||
f"pg_install/v{pg_version}/extensions",
|
||||
]
|
||||
for file in cleanup_files:
|
||||
try:
|
||||
os.remove(file)
|
||||
log.info(f"removed file {file}")
|
||||
except Exception as err:
|
||||
log.info(f"error removing file {file}: {err}")
|
||||
for folder in cleanup_folders:
|
||||
try:
|
||||
shutil.rmtree(folder)
|
||||
log.info(f"removed folder {folder}")
|
||||
except Exception as err:
|
||||
log.info(f"error removing file {file}: {err}")
|
||||
@@ -1,18 +1,12 @@
|
||||
import os
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import cast
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
from fixtures.neon_fixtures import (
|
||||
DEFAULT_BRANCH_NAME,
|
||||
NeonEnv,
|
||||
NeonEnvBuilder,
|
||||
parse_project_git_version_output,
|
||||
)
|
||||
from fixtures.pageserver.http import PageserverHttpClient
|
||||
from fixtures.pg_version import PgVersion, skip_on_postgres
|
||||
from fixtures.types import TenantId, TimelineId
|
||||
|
||||
|
||||
@@ -137,66 +131,3 @@ def test_cli_start_stop(neon_env_builder: NeonEnvBuilder):
|
||||
# Default stop
|
||||
res = env.neon_cli.raw_cli(["stop"])
|
||||
res.check_returncode()
|
||||
|
||||
|
||||
@skip_on_postgres(PgVersion.V14, reason="does not use postgres")
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("BUILD_TYPE") == "debug", reason="unit test for test support, either build works"
|
||||
)
|
||||
def test_parse_project_git_version_output_positive():
|
||||
commit = "b6f77b5816cf1dba12a3bc8747941182ce220846"
|
||||
|
||||
positive = [
|
||||
# most likely when developing locally
|
||||
f"Neon CLI git:{commit}-modified",
|
||||
# when developing locally
|
||||
f"Neon CLI git:{commit}",
|
||||
# this is not produced in practice, but the impl supports it
|
||||
f"Neon CLI git-env:{commit}-modified",
|
||||
# most likely from CI or docker build
|
||||
f"Neon CLI git-env:{commit}",
|
||||
]
|
||||
|
||||
for example in positive:
|
||||
assert parse_project_git_version_output(example) == commit
|
||||
|
||||
|
||||
@skip_on_postgres(PgVersion.V14, reason="does not use postgres")
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("BUILD_TYPE") == "debug", reason="unit test for test support, either build works"
|
||||
)
|
||||
def test_parse_project_git_version_output_local_docker():
|
||||
"""
|
||||
Makes sure the tests don't accept the default version in Dockerfile one gets without providing
|
||||
a commit lookalike in --build-arg GIT_VERSION=XXX
|
||||
"""
|
||||
input = "Neon CLI git-env:local"
|
||||
|
||||
with pytest.raises(ValueError) as e:
|
||||
parse_project_git_version_output(input)
|
||||
|
||||
assert input in str(e)
|
||||
|
||||
|
||||
@skip_on_postgres(PgVersion.V14, reason="does not use postgres")
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("BUILD_TYPE") == "debug", reason="cli api sanity, either build works"
|
||||
)
|
||||
def test_binaries_version_parses(neon_binpath: Path):
|
||||
"""
|
||||
Ensures that we can parse the actual outputs of --version from a set of binaries.
|
||||
|
||||
The list is not meant to be exhaustive, and compute_ctl has a different way for example.
|
||||
"""
|
||||
|
||||
binaries = [
|
||||
"neon_local",
|
||||
"pageserver",
|
||||
"safekeeper",
|
||||
"proxy",
|
||||
"pg_sni_router",
|
||||
"storage_broker",
|
||||
]
|
||||
for bin in binaries:
|
||||
out = subprocess.check_output([neon_binpath / bin, "--version"]).decode("utf-8")
|
||||
parse_project_git_version_output(out)
|
||||
|
||||
@@ -777,95 +777,6 @@ def test_empty_branch_remote_storage_upload_on_restart(
|
||||
create_thread.join()
|
||||
|
||||
|
||||
# Regression test for a race condition where files are compactified before the upload,
|
||||
# resulting in the uploading complaining about the file not being found
|
||||
# https://github.com/neondatabase/neon/issues/4526
|
||||
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
|
||||
def test_compaction_delete_before_upload(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
remote_storage_kind: RemoteStorageKind,
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_compaction_delete_before_upload",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
# create tenant with config that will determinstically allow
|
||||
# compaction and disables gc
|
||||
tenant_id, timeline_id = env.neon_cli.create_tenant(
|
||||
conf={
|
||||
# Set a small compaction threshold
|
||||
"compaction_threshold": "3",
|
||||
# Disable GC
|
||||
"gc_period": "0s",
|
||||
# disable PITR
|
||||
"pitr_interval": "0s",
|
||||
}
|
||||
)
|
||||
|
||||
client = env.pageserver.http_client()
|
||||
|
||||
with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
|
||||
# Build two tables with some data inside
|
||||
endpoint.safe_psql("CREATE TABLE foo AS SELECT x FROM generate_series(1, 10000) g(x)")
|
||||
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
|
||||
|
||||
client.timeline_checkpoint(tenant_id, timeline_id)
|
||||
|
||||
endpoint.safe_psql("CREATE TABLE bar AS SELECT x FROM generate_series(1, 10000) g(x)")
|
||||
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
|
||||
|
||||
# Now make the flushing hang and update one small piece of data
|
||||
client.configure_failpoints(("flush-frozen-before-sync", "pause"))
|
||||
|
||||
endpoint.safe_psql("UPDATE foo SET x = 0 WHERE x = 1")
|
||||
|
||||
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
|
||||
|
||||
q: queue.Queue[Optional[PageserverApiException]] = queue.Queue()
|
||||
barrier = threading.Barrier(2)
|
||||
|
||||
def checkpoint_in_background():
|
||||
barrier.wait()
|
||||
try:
|
||||
client.timeline_checkpoint(tenant_id, timeline_id)
|
||||
q.put(None)
|
||||
except PageserverApiException as e:
|
||||
q.put(e)
|
||||
|
||||
create_thread = threading.Thread(target=checkpoint_in_background)
|
||||
create_thread.start()
|
||||
|
||||
try:
|
||||
barrier.wait()
|
||||
|
||||
time.sleep(4)
|
||||
client.timeline_compact(tenant_id, timeline_id)
|
||||
|
||||
client.configure_failpoints(("flush-frozen-before-sync", "off"))
|
||||
|
||||
conflict = q.get()
|
||||
|
||||
assert conflict is None
|
||||
finally:
|
||||
create_thread.join()
|
||||
|
||||
# Add a delay for the uploads to run into either the file not found or the
|
||||
time.sleep(4)
|
||||
|
||||
# Ensure that this actually terminates
|
||||
wait_upload_queue_empty(client, tenant_id, timeline_id)
|
||||
|
||||
# For now we are hitting this message.
|
||||
# Maybe in the future the underlying race condition will be fixed,
|
||||
# but until then, ensure that this message is hit instead.
|
||||
assert env.pageserver.log_contains(
|
||||
"File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more."
|
||||
)
|
||||
|
||||
|
||||
def wait_upload_queue_empty(
|
||||
client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId
|
||||
):
|
||||
|
||||
@@ -275,7 +275,6 @@ def assert_prefix_empty(neon_env_builder: NeonEnvBuilder, prefix: Optional[str]
|
||||
assert isinstance(neon_env_builder.remote_storage, S3Storage)
|
||||
|
||||
# Note that this doesnt use pagination, so list is not guaranteed to be exhaustive.
|
||||
assert neon_env_builder.remote_storage_client is not None
|
||||
response = neon_env_builder.remote_storage_client.list_objects_v2(
|
||||
Bucket=neon_env_builder.remote_storage.bucket_name,
|
||||
Prefix=prefix or neon_env_builder.remote_storage.prefix_in_bucket or "",
|
||||
@@ -629,7 +628,7 @@ def test_timeline_delete_works_for_remote_smoke(
|
||||
)
|
||||
|
||||
# for some reason the check above doesnt immediately take effect for the below.
|
||||
# Assume it is mock server inconsistency and check twice.
|
||||
# Assume it is mock server incosistency and check twice.
|
||||
wait_until(
|
||||
2,
|
||||
0.5,
|
||||
|
||||
2
vendor/postgres-v14
vendored
2
vendor/postgres-v14
vendored
Submodule vendor/postgres-v14 updated: 93a5ee7749...1144aee166
2
vendor/postgres-v15
vendored
2
vendor/postgres-v15
vendored
Submodule vendor/postgres-v15 updated: 293a06e5e1...1984832c74
4
vendor/revisions.json
vendored
4
vendor/revisions.json
vendored
@@ -1,4 +0,0 @@
|
||||
{
|
||||
"postgres-v15": "293a06e5e14ed9be3f5002c63b4fac391491ec17",
|
||||
"postgres-v14": "93a5ee7749f109ecb9e5481be485c8cb17fe72ce"
|
||||
}
|
||||
Reference in New Issue
Block a user