Compare commits

..

1 Commits

Author SHA1 Message Date
John Spray
1c7df523e2 pageserver: fix deletion/creation race
Closes: ?https://github.com/neondatabase/neon/issues/5878
2023-11-17 23:41:03 +00:00
105 changed files with 1218 additions and 3595 deletions

View File

@@ -1,3 +1,17 @@
# The binaries are really slow, if you compile them in 'dev' mode with the defaults.
# Enable some optimizations even in 'dev' mode, to make tests faster. The basic
# optimizations enabled by "opt-level=1" don't affect debuggability too much.
#
# See https://www.reddit.com/r/rust/comments/gvrgca/this_is_a_neat_trick_for_getting_good_runtime/
#
[profile.dev.package."*"]
# Set the default for dependencies in Development mode.
opt-level = 3
[profile.dev]
# Turn on a small amount of optimization in Development mode.
opt-level = 1
[build]
# This is only present for local builds, as it will be overridden
# by the RUSTDOCFLAGS env var in CI.

View File

@@ -404,7 +404,7 @@ jobs:
uses: ./.github/actions/save-coverage-data
regress-tests:
needs: [ check-permissions, build-neon, tag ]
needs: [ check-permissions, build-neon ]
runs-on: [ self-hosted, gen3, large ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -436,7 +436,6 @@ jobs:
env:
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
- name: Merge and upload coverage data
if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'

View File

@@ -2,7 +2,7 @@ name: Create Release Branch
on:
schedule:
- cron: '0 6 * * 1'
- cron: '0 7 * * 5'
workflow_dispatch:
jobs:

View File

@@ -9,24 +9,6 @@ refactoring, additional comments, and so forth. Let's try to raise the
bar, and clean things up as we go. Try to leave code in a better shape
than it was before.
## Pre-commit hook
We have a sample pre-commit hook in `pre-commit.py`.
To set it up, run:
```bash
ln -s ../../pre-commit.py .git/hooks/pre-commit
```
This will run following checks on staged files before each commit:
- `rustfmt`
- checks for python files, see [obligatory checks](/docs/sourcetree.md#obligatory-checks).
There is also a separate script `./run_clippy.sh` that runs `cargo clippy` on the whole project
and `./scripts/reformat` that runs all formatting tools to ensure the project is up to date.
If you want to skip the hook, run `git commit` with `--no-verify` option.
## Submitting changes
1. Get at least one +1 on your PR before you push.

20
Cargo.lock generated
View File

@@ -193,8 +193,6 @@ dependencies = [
"memchr",
"pin-project-lite",
"tokio",
"zstd",
"zstd-safe",
]
[[package]]
@@ -1126,7 +1124,6 @@ version = "0.1.0"
dependencies = [
"anyhow",
"async-compression",
"bytes",
"cfg-if",
"chrono",
"clap",
@@ -2908,8 +2905,6 @@ dependencies = [
"git-version",
"pageserver",
"postgres_ffi",
"serde",
"serde_json",
"svg_fmt",
"tokio",
"utils",
@@ -3011,7 +3006,6 @@ dependencies = [
"serde_with",
"strum",
"strum_macros",
"thiserror",
"utils",
"workspace_hack",
]
@@ -3227,7 +3221,7 @@ dependencies = [
[[package]]
name = "postgres"
version = "0.19.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=6ce32f791526e27533cab0232a6bb243b2c32584#6ce32f791526e27533cab0232a6bb243b2c32584"
dependencies = [
"bytes",
"fallible-iterator",
@@ -3240,7 +3234,7 @@ dependencies = [
[[package]]
name = "postgres-native-tls"
version = "0.5.0"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=6ce32f791526e27533cab0232a6bb243b2c32584#6ce32f791526e27533cab0232a6bb243b2c32584"
dependencies = [
"native-tls",
"tokio",
@@ -3251,7 +3245,7 @@ dependencies = [
[[package]]
name = "postgres-protocol"
version = "0.6.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=6ce32f791526e27533cab0232a6bb243b2c32584#6ce32f791526e27533cab0232a6bb243b2c32584"
dependencies = [
"base64 0.20.0",
"byteorder",
@@ -3269,7 +3263,7 @@ dependencies = [
[[package]]
name = "postgres-types"
version = "0.2.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=6ce32f791526e27533cab0232a6bb243b2c32584#6ce32f791526e27533cab0232a6bb243b2c32584"
dependencies = [
"bytes",
"fallible-iterator",
@@ -3506,7 +3500,6 @@ dependencies = [
"pbkdf2",
"pin-project-lite",
"postgres-native-tls",
"postgres-protocol",
"postgres_backend",
"pq_proto",
"prometheus",
@@ -4940,7 +4933,7 @@ dependencies = [
[[package]]
name = "tokio-postgres"
version = "0.7.7"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=6ce32f791526e27533cab0232a6bb243b2c32584#6ce32f791526e27533cab0232a6bb243b2c32584"
dependencies = [
"async-trait",
"byteorder",
@@ -6038,9 +6031,6 @@ dependencies = [
"tungstenite",
"url",
"uuid",
"zstd",
"zstd-safe",
"zstd-sys",
]
[[package]]

View File

@@ -37,7 +37,7 @@ license = "Apache-2.0"
[workspace.dependencies]
anyhow = { version = "1.0", features = ["backtrace"] }
arc-swap = "1.6"
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
azure_core = "0.16"
azure_identity = "0.16"
azure_storage = "0.16"
@@ -165,11 +165,11 @@ env_logger = "0.10"
log = "0.4"
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
## Other git libraries
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -206,7 +206,7 @@ tonic-build = "0.9"
# This is only needed for proxy's tests.
# TODO: we should probably fork `tokio-postgres-rustls` instead.
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
################# Binary contents sections

View File

@@ -714,24 +714,6 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -
cargo pgrx install --release && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control
#########################################################################################
#
# Layer "wal2json-build"
# Compile "wal2json" extension
#
#########################################################################################
FROM build-deps AS wal2json-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \
echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \
mkdir wal2json-src && cd wal2json-src && tar xvzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/wal2json.control
#########################################################################################
#
# Layer "neon-pg-ext-build"
@@ -768,7 +750,6 @@ COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
COPY pgxn/ pgxn/
RUN make -j $(getconf _NPROCESSORS_ONLN) \

View File

@@ -38,4 +38,3 @@ toml_edit.workspace = true
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
zstd = "0.12.4"
bytes = "1.0"

View File

@@ -31,7 +31,7 @@
//! -C 'postgresql://cloud_admin@localhost/postgres' \
//! -S /var/db/postgres/specs/current.json \
//! -b /usr/local/bin/postgres \
//! -r http://pg-ext-s3-gateway
//! -r {"bucket": "neon-dev-extensions-eu-central-1", "region": "eu-central-1"}
//! ```
//!
use std::collections::HashMap;
@@ -51,7 +51,7 @@ use compute_api::responses::ComputeStatus;
use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
use compute_tools::configurator::launch_configurator;
use compute_tools::extension_server::get_pg_version;
use compute_tools::extension_server::{get_pg_version, init_remote_storage};
use compute_tools::http::api::launch_http_server;
use compute_tools::logger::*;
use compute_tools::monitor::launch_monitor;
@@ -60,7 +60,7 @@ use compute_tools::spec::*;
// this is an arbitrary build tag. Fine as a default / for testing purposes
// in-case of not-set environment var
const BUILD_TAG_DEFAULT: &str = "latest";
const BUILD_TAG_DEFAULT: &str = "5670669815";
fn main() -> Result<()> {
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
@@ -74,18 +74,10 @@ fn main() -> Result<()> {
let pgbin_default = String::from("postgres");
let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
let ext_remote_storage = matches
.get_one::<String>("remote-ext-config")
// Compatibility hack: if the control plane specified any remote-ext-config
// use the default value for extension storage proxy gateway.
// Remove this once the control plane is updated to pass the gateway URL
.map(|conf| {
if conf.starts_with("http") {
conf.trim_end_matches('/')
} else {
"http://pg-ext-s3-gateway"
}
});
let remote_ext_config = matches.get_one::<String>("remote-ext-config");
let ext_remote_storage = remote_ext_config.map(|x| {
init_remote_storage(x).expect("cannot initialize remote extension storage from config")
});
let http_port = *matches
.get_one::<u16>("http-port")
@@ -206,7 +198,7 @@ fn main() -> Result<()> {
live_config_allowed,
state: Mutex::new(new_state),
state_changed: Condvar::new(),
ext_remote_storage: ext_remote_storage.map(|s| s.to_string()),
ext_remote_storage,
ext_download_progress: RwLock::new(HashMap::new()),
build_tag,
};
@@ -487,6 +479,13 @@ fn cli() -> clap::Command {
)
.value_name("FILECACHE_CONNSTR"),
)
.arg(
// DEPRECATED, NO LONGER DOES ANYTHING.
// See https://github.com/neondatabase/cloud/issues/7516
Arg::new("file-cache-on-disk")
.long("file-cache-on-disk")
.action(clap::ArgAction::SetTrue),
)
}
#[test]

View File

@@ -25,7 +25,7 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus};
use compute_api::spec::{ComputeMode, ComputeSpec};
use utils::measured_stream::MeasuredReader;
use remote_storage::{DownloadError, RemotePath};
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
use crate::checker::create_availability_check_data;
use crate::pg_helpers::*;
@@ -59,8 +59,8 @@ pub struct ComputeNode {
pub state: Mutex<ComputeState>,
/// `Condvar` to allow notifying waiters about state changes.
pub state_changed: Condvar,
/// the address of extension storage proxy gateway
pub ext_remote_storage: Option<String>,
/// the S3 bucket that we search for extensions in
pub ext_remote_storage: Option<GenericRemoteStorage>,
// key: ext_archive_name, value: started download time, download_completed?
pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
pub build_tag: String,
@@ -698,7 +698,6 @@ impl ComputeNode {
handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
handle_grants(spec, &mut client, self.connstr.as_str())?;
handle_extensions(spec, &mut client)?;
handle_extension_neon(&mut client)?;
create_availability_check_data(&mut client)?;
// 'Close' connection
@@ -743,7 +742,6 @@ impl ComputeNode {
handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
handle_grants(&spec, &mut client, self.connstr.as_str())?;
handle_extensions(&spec, &mut client)?;
handle_extension_neon(&mut client)?;
}
// 'Close' connection
@@ -957,12 +955,12 @@ LIMIT 100",
real_ext_name: String,
ext_path: RemotePath,
) -> Result<u64, DownloadError> {
let ext_remote_storage =
self.ext_remote_storage
.as_ref()
.ok_or(DownloadError::BadInput(anyhow::anyhow!(
"Remote extensions storage is not configured",
)))?;
let remote_storage = self
.ext_remote_storage
.as_ref()
.ok_or(DownloadError::BadInput(anyhow::anyhow!(
"Remote extensions storage is not configured",
)))?;
let ext_archive_name = ext_path.object_name().expect("bad path");
@@ -1018,7 +1016,7 @@ LIMIT 100",
let download_size = extension_server::download_extension(
&real_ext_name,
&ext_path,
ext_remote_storage,
remote_storage,
&self.pgbin,
)
.await

View File

@@ -71,16 +71,18 @@ More specifically, here is an example ext_index.json
}
}
*/
use anyhow::Context;
use anyhow::{self, Result};
use anyhow::{bail, Context};
use bytes::Bytes;
use compute_api::spec::RemoteExtSpec;
use regex::Regex;
use remote_storage::*;
use reqwest::StatusCode;
use serde_json;
use std::io::Read;
use std::num::NonZeroUsize;
use std::path::Path;
use std::str;
use tar::Archive;
use tokio::io::AsyncReadExt;
use tracing::info;
use tracing::log::warn;
use zstd::stream::read::Decoder;
@@ -136,31 +138,23 @@ fn parse_pg_version(human_version: &str) -> &str {
pub async fn download_extension(
ext_name: &str,
ext_path: &RemotePath,
ext_remote_storage: &str,
remote_storage: &GenericRemoteStorage,
pgbin: &str,
) -> Result<u64> {
info!("Download extension {:?} from {:?}", ext_name, ext_path);
// TODO add retry logic
let download_buffer =
match download_extension_tar(ext_remote_storage, &ext_path.to_string()).await {
Ok(buffer) => buffer,
Err(error_message) => {
return Err(anyhow::anyhow!(
"error downloading extension {:?}: {:?}",
ext_name,
error_message
));
}
};
let mut download = remote_storage.download(ext_path).await?;
let mut download_buffer = Vec::new();
download
.download_stream
.read_to_end(&mut download_buffer)
.await?;
let download_size = download_buffer.len() as u64;
info!("Download size {:?}", download_size);
// it's unclear whether it is more performant to decompress into memory or not
// TODO: decompressing into memory can be avoided
let decoder = Decoder::new(download_buffer.as_ref())?;
let mut archive = Archive::new(decoder);
let mut decoder = Decoder::new(download_buffer.as_slice())?;
let mut decompress_buffer = Vec::new();
decoder.read_to_end(&mut decompress_buffer)?;
let mut archive = Archive::new(decompress_buffer.as_slice());
let unzip_dest = pgbin
.strip_suffix("/bin/postgres")
.expect("bad pgbin")
@@ -228,32 +222,29 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
}
}
// Do request to extension storage proxy, i.e.
// curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst
// using HHTP GET
// and return the response body as bytes
//
async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result<Bytes> {
let uri = format!("{}/{}", ext_remote_storage, ext_path);
info!("Download extension {:?} from uri {:?}", ext_path, uri);
let resp = reqwest::get(uri).await?;
match resp.status() {
StatusCode::OK => match resp.bytes().await {
Ok(resp) => {
info!("Download extension {:?} completed successfully", ext_path);
Ok(resp)
}
Err(e) => bail!("could not deserialize remote extension response: {}", e),
},
StatusCode::SERVICE_UNAVAILABLE => bail!("remote extension is temporarily unavailable"),
_ => bail!(
"unexpected remote extension response status code: {}",
resp.status()
),
// This function initializes the necessary structs to use remote storage
pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRemoteStorage> {
#[derive(Debug, serde::Deserialize)]
struct RemoteExtJson {
bucket: String,
region: String,
endpoint: Option<String>,
prefix: Option<String>,
}
let remote_ext_json = serde_json::from_str::<RemoteExtJson>(remote_ext_config)?;
let config = S3Config {
bucket_name: remote_ext_json.bucket,
bucket_region: remote_ext_json.region,
prefix_in_bucket: remote_ext_json.prefix,
endpoint: remote_ext_json.endpoint,
concurrency_limit: NonZeroUsize::new(100).expect("100 != 0"),
max_keys_per_list_response: None,
};
let config = RemoteStorageConfig {
storage: RemoteStorageKind::AwsS3(config),
};
GenericRemoteStorage::from_config(&config)
}
#[cfg(test)]

View File

@@ -123,7 +123,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
}
}
// download extension files from remote extension storage on demand
// download extension files from S3 on demand
(&Method::POST, route) if route.starts_with("/extension_server/") => {
info!("serving {:?} POST request", route);
info!("req.uri {:?}", req.uri());

View File

@@ -674,33 +674,3 @@ pub fn handle_extensions(spec: &ComputeSpec, client: &mut Client) -> Result<()>
Ok(())
}
/// Run CREATE and ALTER EXTENSION neon UPDATE for postgres database
#[instrument(skip_all)]
pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
info!("handle extension neon");
let mut query = "CREATE SCHEMA IF NOT EXISTS neon";
client.simple_query(query)?;
query = "CREATE EXTENSION IF NOT EXISTS neon WITH SCHEMA neon";
info!("create neon extension with query: {}", query);
client.simple_query(query)?;
query = "UPDATE pg_extension SET extrelocatable = true WHERE extname = 'neon'";
client.simple_query(query)?;
query = "ALTER EXTENSION neon SET SCHEMA neon";
info!("alter neon extension schema with query: {}", query);
client.simple_query(query)?;
// this will be a no-op if extension is already up to date,
// which may happen in two cases:
// - extension was just installed
// - extension was already installed and is up to date
let query = "ALTER EXTENSION neon UPDATE";
info!("update neon extension schema with query: {}", query);
client.simple_query(query)?;
Ok(())
}

View File

@@ -286,7 +286,6 @@ async fn main() -> anyhow::Result<()> {
logging::init(
LogFormat::Plain,
logging::TracingErrorLayerEnablement::Disabled,
logging::Output::Stdout,
)?;
let args = Cli::parse();

View File

@@ -487,15 +487,8 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
.copied()
.context("Failed to parse postgres version from the argument string")?;
let new_timeline_id_opt = parse_timeline_id(create_match)?;
let timeline_info = pageserver.timeline_create(
tenant_id,
new_timeline_id_opt,
None,
None,
Some(pg_version),
)?;
let timeline_info =
pageserver.timeline_create(tenant_id, None, None, None, Some(pg_version))?;
let new_timeline_id = timeline_info.timeline_id;
let last_record_lsn = timeline_info.last_record_lsn;
@@ -1252,7 +1245,7 @@ fn cli() -> Command {
let remote_ext_config_args = Arg::new("remote-ext-config")
.long("remote-ext-config")
.num_args(1)
.help("Configure the remote extensions storage proxy gateway to request for extensions.")
.help("Configure the S3 bucket that we search for extensions in.")
.required(false);
let lsn_arg = Arg::new("lsn")
@@ -1315,7 +1308,6 @@ fn cli() -> Command {
.subcommand(Command::new("create")
.about("Create a new blank timeline")
.arg(tenant_id_arg.clone())
.arg(timeline_id_arg.clone())
.arg(branch_name_arg.clone())
.arg(pg_version_arg.clone())
)

View File

@@ -45,7 +45,6 @@ use std::sync::Arc;
use std::time::Duration;
use anyhow::{anyhow, bail, Context, Result};
use compute_api::spec::RemoteExtSpec;
use serde::{Deserialize, Serialize};
use utils::id::{NodeId, TenantId, TimelineId};
@@ -477,18 +476,6 @@ impl Endpoint {
}
}
// check for file remote_extensions_spec.json
// if it is present, read it and pass to compute_ctl
let remote_extensions_spec_path = self.endpoint_path().join("remote_extensions_spec.json");
let remote_extensions_spec = std::fs::File::open(remote_extensions_spec_path);
let remote_extensions: Option<RemoteExtSpec>;
if let Ok(spec_file) = remote_extensions_spec {
remote_extensions = serde_json::from_reader(spec_file).ok();
} else {
remote_extensions = None;
};
// Create spec file
let spec = ComputeSpec {
skip_pg_catalog_updates: self.skip_pg_catalog_updates,
@@ -510,7 +497,7 @@ impl Endpoint {
pageserver_connstring: Some(pageserver_connstring),
safekeeper_connstrings,
storage_auth_token: auth_token.clone(),
remote_extensions,
remote_extensions: None,
};
let spec_path = self.endpoint_path().join("spec.json");
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;

View File

@@ -14,6 +14,7 @@ use pageserver_api::models::{
use std::collections::HashMap;
use std::time::Duration;
use utils::{
generation::Generation,
id::{TenantId, TimelineId},
lsn::Lsn,
};
@@ -92,22 +93,6 @@ pub fn migrate_tenant(
// Get a new generation
let attachment_service = AttachmentService::from_env(env);
fn build_location_config(
mode: LocationConfigMode,
generation: Option<u32>,
secondary_conf: Option<LocationConfigSecondary>,
) -> LocationConfig {
LocationConfig {
mode,
generation,
secondary_conf,
tenant_conf: TenantConfig::default(),
shard_number: 0,
shard_count: 0,
shard_stripe_size: 0,
}
}
let previous = attachment_service.inspect(tenant_id)?;
let mut baseline_lsns = None;
if let Some((generation, origin_ps_id)) = &previous {
@@ -116,7 +101,12 @@ pub fn migrate_tenant(
if origin_ps_id == &dest_ps.conf.id {
println!("🔁 Already attached to {origin_ps_id}, freshening...");
let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
let dest_conf = LocationConfig {
mode: LocationConfigMode::AttachedSingle,
generation: gen.map(Generation::new),
secondary_conf: None,
tenant_conf: TenantConfig::default(),
};
dest_ps.location_config(tenant_id, dest_conf)?;
println!("✅ Migration complete");
return Ok(());
@@ -124,15 +114,24 @@ pub fn migrate_tenant(
println!("🔁 Switching origin pageserver {origin_ps_id} to stale mode");
let stale_conf =
build_location_config(LocationConfigMode::AttachedStale, Some(*generation), None);
let stale_conf = LocationConfig {
mode: LocationConfigMode::AttachedStale,
generation: Some(Generation::new(*generation)),
secondary_conf: None,
tenant_conf: TenantConfig::default(),
};
origin_ps.location_config(tenant_id, stale_conf)?;
baseline_lsns = Some(get_lsns(tenant_id, &origin_ps)?);
}
let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
let dest_conf = build_location_config(LocationConfigMode::AttachedMulti, gen, None);
let dest_conf = LocationConfig {
mode: LocationConfigMode::AttachedMulti,
generation: gen.map(Generation::new),
secondary_conf: None,
tenant_conf: TenantConfig::default(),
};
println!("🔁 Attaching to pageserver {}", dest_ps.conf.id);
dest_ps.location_config(tenant_id, dest_conf)?;
@@ -171,11 +170,12 @@ pub fn migrate_tenant(
}
// Downgrade to a secondary location
let secondary_conf = build_location_config(
LocationConfigMode::Secondary,
None,
Some(LocationConfigSecondary { warm: true }),
);
let secondary_conf = LocationConfig {
mode: LocationConfigMode::Secondary,
generation: None,
secondary_conf: Some(LocationConfigSecondary { warm: true }),
tenant_conf: TenantConfig::default(),
};
println!(
"💤 Switching to secondary mode on pageserver {}",
@@ -188,7 +188,12 @@ pub fn migrate_tenant(
"🔁 Switching to AttachedSingle mode on pageserver {}",
dest_ps.conf.id
);
let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
let dest_conf = LocationConfig {
mode: LocationConfigMode::AttachedSingle,
generation: gen.map(Generation::new),
secondary_conf: None,
tenant_conf: TenantConfig::default(),
};
dest_ps.location_config(tenant_id, dest_conf)?;
println!("✅ Migration complete");

View File

@@ -18,7 +18,6 @@ enum-map.workspace = true
strum.workspace = true
strum_macros.workspace = true
hex.workspace = true
thiserror.workspace = true
workspace_hack.workspace = true

View File

@@ -10,6 +10,7 @@ use serde_with::serde_as;
use strum_macros;
use utils::{
completion,
generation::Generation,
history_buffer::HistoryBufferWithDropCounter,
id::{NodeId, TenantId, TimelineId},
lsn::Lsn,
@@ -261,19 +262,10 @@ pub struct LocationConfig {
pub mode: LocationConfigMode,
/// If attaching, in what generation?
#[serde(default)]
pub generation: Option<u32>,
pub generation: Option<Generation>,
#[serde(default)]
pub secondary_conf: Option<LocationConfigSecondary>,
// Shard parameters: if shard_count is nonzero, then other shard_* fields
// must be set accurately.
#[serde(default)]
pub shard_number: u8,
#[serde(default)]
pub shard_count: u8,
#[serde(default)]
pub shard_stripe_size: u32,
// If requesting mode `Secondary`, configuration for that.
// Custom storage configuration for the tenant, if any
pub tenant_conf: TenantConfig,

View File

@@ -2,7 +2,6 @@ use std::{ops::RangeInclusive, str::FromStr};
use hex::FromHex;
use serde::{Deserialize, Serialize};
use thiserror;
use utils::id::TenantId;
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug)]
@@ -140,89 +139,6 @@ impl From<[u8; 18]> for TenantShardId {
}
}
/// For use within the context of a particular tenant, when we need to know which
/// shard we're dealing with, but do not need to know the full ShardIdentity (because
/// we won't be doing any page->shard mapping), and do not need to know the fully qualified
/// TenantShardId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy)]
pub struct ShardIndex {
pub shard_number: ShardNumber,
pub shard_count: ShardCount,
}
impl ShardIndex {
pub fn new(number: ShardNumber, count: ShardCount) -> Self {
Self {
shard_number: number,
shard_count: count,
}
}
pub fn unsharded() -> Self {
Self {
shard_number: ShardNumber(0),
shard_count: ShardCount(0),
}
}
pub fn is_unsharded(&self) -> bool {
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
}
/// For use in constructing remote storage paths: concatenate this with a TenantId
/// to get a fully qualified TenantShardId.
///
/// Backward compat: this function returns an empty string if Self::is_unsharded, such
/// that the legacy pre-sharding remote key format is preserved.
pub fn get_suffix(&self) -> String {
if self.is_unsharded() {
"".to_string()
} else {
format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
}
}
}
impl std::fmt::Display for ShardIndex {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
}
}
impl std::fmt::Debug for ShardIndex {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// Debug is the same as Display: the compact hex representation
write!(f, "{}", self)
}
}
impl std::str::FromStr for ShardIndex {
type Err = hex::FromHexError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
// Expect format: 1 byte shard number, 1 byte shard count
if s.len() == 4 {
let bytes = s.as_bytes();
let mut shard_parts: [u8; 2] = [0u8; 2];
hex::decode_to_slice(bytes, &mut shard_parts)?;
Ok(Self {
shard_number: ShardNumber(shard_parts[0]),
shard_count: ShardCount(shard_parts[1]),
})
} else {
Err(hex::FromHexError::InvalidStringLength)
}
}
}
impl From<[u8; 2]> for ShardIndex {
fn from(b: [u8; 2]) -> Self {
Self {
shard_number: ShardNumber(b[0]),
shard_count: ShardCount(b[1]),
}
}
}
impl Serialize for TenantShardId {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
@@ -293,151 +209,6 @@ impl<'de> Deserialize<'de> for TenantShardId {
}
}
/// Stripe size in number of pages
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardStripeSize(pub u32);
/// Layout version: for future upgrades where we might change how the key->shard mapping works
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardLayout(u8);
const LAYOUT_V1: ShardLayout = ShardLayout(1);
/// Default stripe size in pages: 256MiB divided by 8kiB page size.
const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
/// The ShardIdentity contains the information needed for one member of map
/// to resolve a key to a shard, and then check whether that shard is ==self.
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardIdentity {
pub layout: ShardLayout,
pub number: ShardNumber,
pub count: ShardCount,
pub stripe_size: ShardStripeSize,
}
#[derive(thiserror::Error, Debug, PartialEq, Eq)]
pub enum ShardConfigError {
#[error("Invalid shard count")]
InvalidCount,
#[error("Invalid shard number")]
InvalidNumber,
#[error("Invalid stripe size")]
InvalidStripeSize,
}
impl ShardIdentity {
/// An identity with number=0 count=0 is a "none" identity, which represents legacy
/// tenants. Modern single-shard tenants should not use this: they should
/// have number=0 count=1.
pub fn unsharded() -> Self {
Self {
number: ShardNumber(0),
count: ShardCount(0),
layout: LAYOUT_V1,
stripe_size: DEFAULT_STRIPE_SIZE,
}
}
pub fn is_unsharded(&self) -> bool {
self.number == ShardNumber(0) && self.count == ShardCount(0)
}
/// Count must be nonzero, and number must be < count. To construct
/// the legacy case (count==0), use Self::unsharded instead.
pub fn new(
number: ShardNumber,
count: ShardCount,
stripe_size: ShardStripeSize,
) -> Result<Self, ShardConfigError> {
if count.0 == 0 {
Err(ShardConfigError::InvalidCount)
} else if number.0 > count.0 - 1 {
Err(ShardConfigError::InvalidNumber)
} else if stripe_size.0 == 0 {
Err(ShardConfigError::InvalidStripeSize)
} else {
Ok(Self {
number,
count,
layout: LAYOUT_V1,
stripe_size,
})
}
}
}
impl Serialize for ShardIndex {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
if serializer.is_human_readable() {
serializer.collect_str(self)
} else {
// Binary encoding is not used in index_part.json, but is included in anticipation of
// switching various structures (e.g. inter-process communication, remote metadata) to more
// compact binary encodings in future.
let mut packed: [u8; 2] = [0; 2];
packed[0] = self.shard_number.0;
packed[1] = self.shard_count.0;
packed.serialize(serializer)
}
}
}
impl<'de> Deserialize<'de> for ShardIndex {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
struct IdVisitor {
is_human_readable_deserializer: bool,
}
impl<'de> serde::de::Visitor<'de> for IdVisitor {
type Value = ShardIndex;
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
if self.is_human_readable_deserializer {
formatter.write_str("value in form of hex string")
} else {
formatter.write_str("value in form of integer array([u8; 2])")
}
}
fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
where
A: serde::de::SeqAccess<'de>,
{
let s = serde::de::value::SeqAccessDeserializer::new(seq);
let id: [u8; 2] = Deserialize::deserialize(s)?;
Ok(ShardIndex::from(id))
}
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
ShardIndex::from_str(v).map_err(E::custom)
}
}
if deserializer.is_human_readable() {
deserializer.deserialize_str(IdVisitor {
is_human_readable_deserializer: true,
})
} else {
deserializer.deserialize_tuple(
2,
IdVisitor {
is_human_readable_deserializer: false,
},
)
}
}
}
#[cfg(test)]
mod tests {
use std::str::FromStr;
@@ -547,66 +318,4 @@ mod tests {
Ok(())
}
#[test]
fn shard_identity_validation() -> Result<(), ShardConfigError> {
// Happy cases
ShardIdentity::new(ShardNumber(0), ShardCount(1), DEFAULT_STRIPE_SIZE)?;
ShardIdentity::new(ShardNumber(0), ShardCount(1), ShardStripeSize(1))?;
ShardIdentity::new(ShardNumber(254), ShardCount(255), ShardStripeSize(1))?;
assert_eq!(
ShardIdentity::new(ShardNumber(0), ShardCount(0), DEFAULT_STRIPE_SIZE),
Err(ShardConfigError::InvalidCount)
);
assert_eq!(
ShardIdentity::new(ShardNumber(10), ShardCount(10), DEFAULT_STRIPE_SIZE),
Err(ShardConfigError::InvalidNumber)
);
assert_eq!(
ShardIdentity::new(ShardNumber(11), ShardCount(10), DEFAULT_STRIPE_SIZE),
Err(ShardConfigError::InvalidNumber)
);
assert_eq!(
ShardIdentity::new(ShardNumber(255), ShardCount(255), DEFAULT_STRIPE_SIZE),
Err(ShardConfigError::InvalidNumber)
);
assert_eq!(
ShardIdentity::new(ShardNumber(0), ShardCount(1), ShardStripeSize(0)),
Err(ShardConfigError::InvalidStripeSize)
);
Ok(())
}
#[test]
fn shard_index_human_encoding() -> Result<(), hex::FromHexError> {
let example = ShardIndex {
shard_number: ShardNumber(13),
shard_count: ShardCount(17),
};
let expected: String = "0d11".to_string();
let encoded = format!("{example}");
assert_eq!(&encoded, &expected);
let decoded = ShardIndex::from_str(&encoded)?;
assert_eq!(example, decoded);
Ok(())
}
#[test]
fn shard_index_binary_encoding() -> Result<(), hex::FromHexError> {
let example = ShardIndex {
shard_number: ShardNumber(13),
shard_count: ShardCount(17),
};
let expected: [u8; 2] = [0x0d, 0x11];
let encoded = bincode::serialize(&example).unwrap();
assert_eq!(Hex(&encoded), Hex(&expected));
let decoded = bincode::deserialize(&encoded).unwrap();
assert_eq!(example, decoded);
Ok(())
}
}

View File

@@ -281,7 +281,6 @@ fn ensure_logging_ready() {
utils::logging::init(
utils::logging::LogFormat::Test,
utils::logging::TracingErrorLayerEnablement::Disabled,
utils::logging::Output::Stdout,
)
.expect("logging init failed");
});

View File

@@ -210,7 +210,6 @@ fn ensure_logging_ready() {
utils::logging::init(
utils::logging::LogFormat::Test,
utils::logging::TracingErrorLayerEnablement::Disabled,
utils::logging::Output::Stdout,
)
.expect("logging init failed");
});

View File

@@ -1,21 +0,0 @@
#!/bin/bash
# like restore_from_wal.sh, but takes existing initdb.tar.zst
set -euxo pipefail
PG_BIN=$1
WAL_PATH=$2
DATA_DIR=$3
PORT=$4
echo "port=$PORT" >> "$DATA_DIR"/postgresql.conf
echo "shared_preload_libraries='\$libdir/neon_rmgr.so'" >> "$DATA_DIR"/postgresql.conf
REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-)
declare -i WAL_SIZE=$REDO_POS+114
"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" start
"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" stop -m immediate
cp "$DATA_DIR"/pg_wal/000000010000000000000001 .
cp "$WAL_PATH"/* "$DATA_DIR"/pg_wal/
for partial in "$DATA_DIR"/pg_wal/*.partial ; do mv "$partial" "${partial%.partial}" ; done
dd if=000000010000000000000001 of="$DATA_DIR"/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc
rm -f 000000010000000000000001

View File

@@ -66,17 +66,9 @@ pub enum TracingErrorLayerEnablement {
EnableWithRustLogFilter,
}
/// Where the logging should output to.
#[derive(Clone, Copy)]
pub enum Output {
Stdout,
Stderr,
}
pub fn init(
log_format: LogFormat,
tracing_error_layer_enablement: TracingErrorLayerEnablement,
output: Output,
) -> anyhow::Result<()> {
// We fall back to printing all spans at info-level or above if
// the RUST_LOG environment variable is not set.
@@ -93,12 +85,7 @@ pub fn init(
let log_layer = tracing_subscriber::fmt::layer()
.with_target(false)
.with_ansi(false)
.with_writer(move || -> Box<dyn std::io::Write> {
match output {
Output::Stdout => Box::new(std::io::stdout()),
Output::Stderr => Box::new(std::io::stderr()),
}
});
.with_writer(std::io::stdout);
let log_layer = match log_format {
LogFormat::Json => log_layer.json().boxed(),
LogFormat::Plain => log_layer.boxed(),

View File

@@ -18,5 +18,3 @@ tokio.workspace = true
utils.workspace = true
svg_fmt.workspace = true
workspace_hack.workspace = true
serde.workspace = true
serde_json.workspace = true

View File

@@ -1,38 +0,0 @@
use std::collections::HashMap;
use anyhow::Context;
use camino::Utf8PathBuf;
use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
use pageserver::tenant::storage_layer::LayerFileName;
use pageserver::tenant::{metadata::TimelineMetadata, IndexPart};
use utils::lsn::Lsn;
#[derive(clap::Subcommand)]
pub(crate) enum IndexPartCmd {
Dump { path: Utf8PathBuf },
}
pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
match cmd {
IndexPartCmd::Dump { path } => {
let bytes = tokio::fs::read(path).await.context("read file")?;
let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
#[derive(serde::Serialize)]
struct Output<'a> {
layer_metadata: &'a HashMap<LayerFileName, IndexLayerMetadata>,
disk_consistent_lsn: Lsn,
timeline_metadata: &'a TimelineMetadata,
}
let output = Output {
layer_metadata: &des.layer_metadata,
disk_consistent_lsn: des.get_disk_consistent_lsn(),
timeline_metadata: &des.metadata,
};
let output = serde_json::to_string_pretty(&output).context("serialize output")?;
println!("{output}");
Ok(())
}
}
}

View File

@@ -5,13 +5,11 @@
//! Separate, `metadata` subcommand allows to print and update pageserver's metadata file.
mod draw_timeline_dir;
mod index_part;
mod layer_map_analyzer;
mod layers;
use camino::{Utf8Path, Utf8PathBuf};
use clap::{Parser, Subcommand};
use index_part::IndexPartCmd;
use layers::LayerCmd;
use pageserver::{
context::{DownloadBehavior, RequestContext},
@@ -40,8 +38,6 @@ struct CliOpts {
#[derive(Subcommand)]
enum Commands {
Metadata(MetadataCmd),
#[command(subcommand)]
IndexPart(IndexPartCmd),
PrintLayerFile(PrintLayerFileCmd),
DrawTimeline {},
AnalyzeLayerMap(AnalyzeLayerMapCmd),
@@ -87,9 +83,6 @@ async fn main() -> anyhow::Result<()> {
Commands::Metadata(cmd) => {
handle_metadata(&cmd)?;
}
Commands::IndexPart(cmd) => {
index_part::main(&cmd).await?;
}
Commands::DrawTimeline {} => {
draw_timeline_dir::main()?;
}

View File

@@ -103,11 +103,7 @@ fn main() -> anyhow::Result<()> {
} else {
TracingErrorLayerEnablement::Disabled
};
logging::init(
conf.log_format,
tracing_error_layer_enablement,
logging::Output::Stdout,
)?;
logging::init(conf.log_format, tracing_error_layer_enablement)?;
// mind the order required here: 1. logging, 2. panic_hook, 3. sentry.
// disarming this hook on pageserver, because we never tear down tracing.
@@ -625,7 +621,6 @@ fn start_pageserver(
conf.synthetic_size_calculation_interval,
conf.id,
local_disk_storage,
cancel,
metrics_ctx,
)
.instrument(info_span!("metrics_collection"))

View File

@@ -3,7 +3,7 @@
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::tasks::BackgroundLoopKind;
use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError};
use crate::tenant::{mgr, LogicalSizeCalculationCause};
use camino::Utf8PathBuf;
use consumption_metrics::EventType;
use pageserver_api::models::TenantState;
@@ -12,7 +12,6 @@ use std::collections::HashMap;
use std::sync::Arc;
use std::time::{Duration, SystemTime};
use tokio::time::Instant;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::id::NodeId;
@@ -38,7 +37,6 @@ type RawMetric = (MetricsKey, (EventType, u64));
type Cache = HashMap<MetricsKey, (EventType, u64)>;
/// Main thread that serves metrics collection
#[allow(clippy::too_many_arguments)]
pub async fn collect_metrics(
metric_collection_endpoint: &Url,
metric_collection_interval: Duration,
@@ -46,7 +44,6 @@ pub async fn collect_metrics(
synthetic_size_calculation_interval: Duration,
node_id: NodeId,
local_disk_storage: Utf8PathBuf,
cancel: CancellationToken,
ctx: RequestContext,
) -> anyhow::Result<()> {
if _cached_metric_collection_interval != Duration::ZERO {
@@ -66,13 +63,9 @@ pub async fn collect_metrics(
"synthetic size calculation",
false,
async move {
calculate_synthetic_size_worker(
synthetic_size_calculation_interval,
&cancel,
&worker_ctx,
)
.instrument(info_span!("synthetic_size_worker"))
.await?;
calculate_synthetic_size_worker(synthetic_size_calculation_interval, &worker_ctx)
.instrument(info_span!("synthetic_size_worker"))
.await?;
Ok(())
},
);
@@ -248,7 +241,6 @@ async fn reschedule(
/// Caclculate synthetic size for each active tenant
async fn calculate_synthetic_size_worker(
synthetic_size_calculation_interval: Duration,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<()> {
info!("starting calculate_synthetic_size_worker");
@@ -280,12 +272,7 @@ async fn calculate_synthetic_size_worker(
// Same for the loop that fetches computed metrics.
// By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
// which turns out is really handy to understand the system.
if let Err(e) = tenant.calculate_synthetic_size(cause, cancel, ctx).await {
if let Some(PageReconstructError::Cancelled) =
e.downcast_ref::<PageReconstructError>()
{
return Ok(());
}
if let Err(e) = tenant.calculate_synthetic_size(cause, ctx).await {
error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}");
}
}

View File

@@ -10,7 +10,6 @@ use crate::control_plane_client::ControlPlaneGenerationsApi;
use crate::metrics;
use crate::tenant::remote_timeline_client::remote_layer_path;
use crate::tenant::remote_timeline_client::remote_timeline_path;
use crate::tenant::remote_timeline_client::LayerFileMetadata;
use crate::virtual_file::MaybeFatalIo;
use crate::virtual_file::VirtualFile;
use anyhow::Context;
@@ -510,19 +509,17 @@ impl DeletionQueueClient {
tenant_id: TenantId,
timeline_id: TimelineId,
current_generation: Generation,
layers: Vec<(LayerFileName, LayerFileMetadata)>,
layers: Vec<(LayerFileName, Generation)>,
) -> Result<(), DeletionQueueError> {
if current_generation.is_none() {
debug!("Enqueuing deletions in legacy mode, skipping queue");
let mut layer_paths = Vec::new();
for (layer, meta) in layers {
for (layer, generation) in layers {
layer_paths.push(remote_layer_path(
&tenant_id,
&timeline_id,
meta.shard,
&layer,
meta.generation,
generation,
));
}
self.push_immediate(layer_paths).await?;
@@ -542,7 +539,7 @@ impl DeletionQueueClient {
tenant_id: TenantId,
timeline_id: TimelineId,
current_generation: Generation,
layers: Vec<(LayerFileName, LayerFileMetadata)>,
layers: Vec<(LayerFileName, Generation)>,
) -> Result<(), DeletionQueueError> {
metrics::DELETION_QUEUE
.keys_submitted
@@ -753,7 +750,6 @@ impl DeletionQueue {
mod test {
use camino::Utf8Path;
use hex_literal::hex;
use pageserver_api::shard::ShardIndex;
use std::{io::ErrorKind, time::Duration};
use tracing::info;
@@ -993,8 +989,6 @@ mod test {
// we delete, and the generation of the running Tenant.
let layer_generation = Generation::new(0xdeadbeef);
let now_generation = Generation::new(0xfeedbeef);
let layer_metadata =
LayerFileMetadata::new(0xf00, layer_generation, ShardIndex::unsharded());
let remote_layer_file_name_1 =
format!("{}{}", layer_file_name_1, layer_generation.get_suffix());
@@ -1018,7 +1012,7 @@ mod test {
tenant_id,
TIMELINE_ID,
now_generation,
[(layer_file_name_1.clone(), layer_metadata)].to_vec(),
[(layer_file_name_1.clone(), layer_generation)].to_vec(),
)
.await?;
assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
@@ -1057,8 +1051,6 @@ mod test {
let stale_generation = latest_generation.previous();
// Generation that our example layer file was written with
let layer_generation = stale_generation.previous();
let layer_metadata =
LayerFileMetadata::new(0xf00, layer_generation, ShardIndex::unsharded());
ctx.set_latest_generation(latest_generation);
@@ -1076,7 +1068,7 @@ mod test {
tenant_id,
TIMELINE_ID,
stale_generation,
[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
[(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(),
)
.await?;
@@ -1091,7 +1083,7 @@ mod test {
tenant_id,
TIMELINE_ID,
latest_generation,
[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
[(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(),
)
.await?;
@@ -1118,8 +1110,6 @@ mod test {
let layer_generation = Generation::new(0xdeadbeef);
let now_generation = Generation::new(0xfeedbeef);
let layer_metadata =
LayerFileMetadata::new(0xf00, layer_generation, ShardIndex::unsharded());
// Inject a deletion in the generation before generation_now: after restart,
// this deletion should _not_ get executed (only the immediately previous
@@ -1131,7 +1121,7 @@ mod test {
tenant_id,
TIMELINE_ID,
now_generation.previous(),
[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
[(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(),
)
.await?;
@@ -1145,7 +1135,7 @@ mod test {
tenant_id,
TIMELINE_ID,
now_generation,
[(EXAMPLE_LAYER_NAME_ALT.clone(), layer_metadata.clone())].to_vec(),
[(EXAMPLE_LAYER_NAME_ALT.clone(), layer_generation)].to_vec(),
)
.await?;
@@ -1235,13 +1225,12 @@ pub(crate) mod mock {
match msg {
ListWriterQueueMessage::Delete(op) => {
let mut objects = op.objects;
for (layer, meta) in op.layers {
for (layer, generation) in op.layers {
objects.push(remote_layer_path(
&op.tenant_id,
&op.timeline_id,
meta.shard,
&layer,
meta.generation,
generation,
));
}

View File

@@ -33,7 +33,6 @@ use crate::config::PageServerConf;
use crate::deletion_queue::TEMP_SUFFIX;
use crate::metrics;
use crate::tenant::remote_timeline_client::remote_layer_path;
use crate::tenant::remote_timeline_client::LayerFileMetadata;
use crate::tenant::storage_layer::LayerFileName;
use crate::virtual_file::on_fatal_io_error;
use crate::virtual_file::MaybeFatalIo;
@@ -59,7 +58,7 @@ pub(super) struct DeletionOp {
// `layers` and `objects` are both just lists of objects. `layers` is used if you do not
// have a config object handy to project it to a remote key, and need the consuming worker
// to do it for you.
pub(super) layers: Vec<(LayerFileName, LayerFileMetadata)>,
pub(super) layers: Vec<(LayerFileName, Generation)>,
pub(super) objects: Vec<RemotePath>,
/// The _current_ generation of the Tenant attachment in which we are enqueuing
@@ -388,13 +387,12 @@ impl ListWriter {
);
let mut layer_paths = Vec::new();
for (layer, meta) in op.layers {
for (layer, generation) in op.layers {
layer_paths.push(remote_layer_path(
&op.tenant_id,
&op.timeline_id,
meta.shard,
&layer,
meta.generation,
generation,
));
}
layer_paths.extend(op.objects);

View File

@@ -6,7 +6,6 @@ use std::str::FromStr;
use std::sync::Arc;
use anyhow::{anyhow, Context, Result};
use enumset::EnumSet;
use futures::TryFutureExt;
use humantime::format_rfc3339;
use hyper::header;
@@ -43,7 +42,6 @@ use crate::tenant::mgr::{
};
use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset;
use crate::tenant::timeline::CompactFlags;
use crate::tenant::timeline::Timeline;
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources};
use crate::{config::PageServerConf, tenant::mgr};
@@ -550,7 +548,7 @@ async fn timeline_detail_handler(
async fn get_lsn_by_timestamp_handler(
request: Request<Body>,
cancel: CancellationToken,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
@@ -566,9 +564,7 @@ async fn get_lsn_by_timestamp_handler(
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
let result = timeline
.find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx)
.await?;
let result = timeline.find_lsn_for_timestamp(timestamp_pg, &ctx).await?;
if version.unwrap_or(0) > 1 {
#[derive(serde::Serialize)]
@@ -844,7 +840,7 @@ async fn tenant_delete_handler(
/// without modifying anything anyway.
async fn tenant_size_handler(
request: Request<Body>,
cancel: CancellationToken,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
@@ -860,7 +856,6 @@ async fn tenant_size_handler(
.gather_size_inputs(
retention_period,
LogicalSizeCalculationCause::TenantSizeHandler,
&cancel,
&ctx,
)
.await
@@ -1245,7 +1240,7 @@ async fn failpoints_handler(
// Run GC immediately on given timeline.
async fn timeline_gc_handler(
mut request: Request<Body>,
cancel: CancellationToken,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -1254,7 +1249,7 @@ async fn timeline_gc_handler(
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let wait_task_done = mgr::immediate_gc(tenant_id, timeline_id, gc_req, cancel, &ctx).await?;
let wait_task_done = mgr::immediate_gc(tenant_id, timeline_id, gc_req, &ctx).await?;
let gc_result = wait_task_done
.await
.context("wait for gc task")
@@ -1273,15 +1268,11 @@ async fn timeline_compact_handler(
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
let mut flags = EnumSet::empty();
if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
flags |= CompactFlags::ForceRepartition;
}
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
timeline
.compact(&cancel, flags, &ctx)
.compact(&cancel, &ctx)
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
json_response(StatusCode::OK, ())
@@ -1298,11 +1289,6 @@ async fn timeline_checkpoint_handler(
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
let mut flags = EnumSet::empty();
if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
flags |= CompactFlags::ForceRepartition;
}
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
@@ -1311,7 +1297,7 @@ async fn timeline_checkpoint_handler(
.await
.map_err(ApiError::InternalServerError)?;
timeline
.compact(&cancel, flags, &ctx)
.compact(&cancel, &ctx)
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
@@ -1689,24 +1675,8 @@ where
let token_cloned = token.clone();
let result = handler(r, token).await;
if token_cloned.is_cancelled() {
// dropguard has executed: we will never turn this result into response.
//
// at least temporarily do {:?} logging; these failures are rare enough but
// could hide difficult errors.
match &result {
Ok(response) => {
let status = response.status();
info!(%status, "Cancelled request finished successfully")
}
Err(e) => error!("Cancelled request finished with an error: {e:?}"),
}
info!("Cancelled request finished");
}
// only logging for cancelled panicked request handlers is the tracing_panic_hook,
// which should suffice.
//
// there is still a chance to lose the result due to race between
// returning from here and the actual connection closing happening
// before outer task gets to execute. leaving that up for #5815.
result
}
.in_current_span(),

View File

@@ -3,25 +3,18 @@
//! a neon Timeline.
//!
use std::path::{Path, PathBuf};
use std::pin::Pin;
use std::task::{self, Poll};
use anyhow::{bail, ensure, Context, Result};
use async_compression::{tokio::write::ZstdEncoder, zstd::CParameter, Level};
use bytes::Bytes;
use camino::Utf8Path;
use futures::StreamExt;
use nix::NixPath;
use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt};
use tokio::io::{AsyncRead, AsyncReadExt};
use tokio_tar::Archive;
use tokio_tar::Builder;
use tokio_tar::HeaderMode;
use tracing::*;
use walkdir::WalkDir;
use crate::context::RequestContext;
use crate::pgdatadir_mapping::*;
use crate::tenant::remote_timeline_client::INITDB_PATH;
use crate::tenant::Timeline;
use crate::walingest::WalIngest;
use crate::walrecord::DecodedWALRecord;
@@ -40,9 +33,7 @@ use utils::lsn::Lsn;
pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result<Lsn> {
// Read control file to extract the LSN
let controlfile_path = path.join("global").join("pg_control");
let controlfile_buf = std::fs::read(&controlfile_path)
.with_context(|| format!("reading controlfile: {controlfile_path}"))?;
let controlfile = ControlFileData::decode(&controlfile_buf)?;
let controlfile = ControlFileData::decode(&std::fs::read(controlfile_path)?)?;
let lsn = controlfile.checkPoint;
Ok(Lsn(lsn))
@@ -627,108 +618,3 @@ async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result<Bytes>
reader.read_to_end(&mut buf).await?;
Ok(Bytes::from(buf))
}
/// An in-memory buffer implementing `AsyncWrite`, inserting yields every now and then
///
/// The number of yields is bounded by above by the number of times poll_write is called,
/// so calling it with 8 KB chunks and 8 MB chunks gives the same number of yields in total.
/// This is an explicit choice as the `YieldingVec` is meant to give the async executor
/// breathing room between units of CPU intensive preparation of buffers to be written.
/// Once a write call is issued, the whole buffer has been prepared already, so there is no
/// gain in splitting up the memcopy further.
struct YieldingVec {
yield_budget: usize,
// the buffer written into
buf: Vec<u8>,
}
impl YieldingVec {
fn new() -> Self {
Self {
yield_budget: 0,
buf: Vec::new(),
}
}
// Whether we should yield for a read operation of given size
fn should_yield(&mut self, add_buf_len: usize) -> bool {
// Set this limit to a small value so that we are a
// good async citizen and yield repeatedly (but not
// too often for many small writes to cause many yields)
const YIELD_DIST: usize = 1024;
let target_buf_len = self.buf.len() + add_buf_len;
let ret = self.yield_budget / YIELD_DIST < target_buf_len / YIELD_DIST;
if self.yield_budget < target_buf_len {
self.yield_budget += add_buf_len;
}
ret
}
}
impl AsyncWrite for YieldingVec {
fn poll_write(
mut self: Pin<&mut Self>,
cx: &mut task::Context<'_>,
buf: &[u8],
) -> Poll<std::io::Result<usize>> {
if self.should_yield(buf.len()) {
cx.waker().wake_by_ref();
return Poll::Pending;
}
self.get_mut().buf.extend_from_slice(buf);
Poll::Ready(Ok(buf.len()))
}
fn poll_flush(self: Pin<&mut Self>, _cx: &mut task::Context<'_>) -> Poll<std::io::Result<()>> {
Poll::Ready(Ok(()))
}
fn poll_shutdown(
self: Pin<&mut Self>,
_cx: &mut task::Context<'_>,
) -> Poll<std::io::Result<()>> {
Poll::Ready(Ok(()))
}
}
pub async fn create_tar_zst(pgdata_path: &Utf8Path) -> Result<Vec<u8>> {
let mut paths = Vec::new();
for entry in WalkDir::new(pgdata_path) {
let entry = entry?;
let metadata = entry.metadata().expect("error getting dir entry metadata");
// Also allow directories so that we also get empty directories
if !(metadata.is_file() || metadata.is_dir()) {
continue;
}
let path = entry.into_path();
paths.push(path);
}
// Do a sort to get a more consistent listing
paths.sort_unstable();
let zstd = ZstdEncoder::with_quality_and_params(
YieldingVec::new(),
Level::Default,
&[CParameter::enable_long_distance_matching(true)],
);
let mut builder = Builder::new(zstd);
// Use reproducible header mode
builder.mode(HeaderMode::Deterministic);
for path in paths {
let rel_path = path.strip_prefix(pgdata_path)?;
if rel_path.is_empty() {
// The top directory should not be compressed,
// the tar crate doesn't like that
continue;
}
builder.append_path_with_name(&path, rel_path).await?;
}
let mut zstd = builder.into_inner().await?;
zstd.shutdown().await?;
let compressed = zstd.into_inner();
let compressed_len = compressed.buf.len();
const INITDB_TAR_ZST_WARN_LIMIT: usize = 2_000_000;
if compressed_len > INITDB_TAR_ZST_WARN_LIMIT {
warn!("compressed {INITDB_PATH} size of {compressed_len} is above limit {INITDB_TAR_ZST_WARN_LIMIT}.");
}
Ok(compressed.buf)
}

View File

@@ -638,7 +638,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
///
/// Operations:
/// - open ([`std::fs::OpenOptions::open`])
/// - close (dropping [`crate::virtual_file::VirtualFile`])
/// - close (dropping [`std::fs::File`])
/// - close-by-replace (close by replacement algorithm)
/// - read (`read_at`)
/// - write (`write_at`)

View File

@@ -21,7 +21,6 @@ use serde::{Deserialize, Serialize};
use std::collections::{hash_map, HashMap, HashSet};
use std::ops::ControlFlow;
use std::ops::Range;
use tokio_util::sync::CancellationToken;
use tracing::{debug, trace, warn};
use utils::bin_ser::DeserializeError;
use utils::{bin_ser::BeSer, lsn::Lsn};
@@ -366,7 +365,6 @@ impl Timeline {
pub async fn find_lsn_for_timestamp(
&self,
search_timestamp: TimestampTz,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<LsnForTimestamp, PageReconstructError> {
let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
@@ -385,9 +383,6 @@ impl Timeline {
let mut found_smaller = false;
let mut found_larger = false;
while low < high {
if cancel.is_cancelled() {
return Err(PageReconstructError::Cancelled);
}
// cannot overflow, high and low are both smaller than u64::MAX / 2
let mid = (high + low) / 2;

View File

@@ -12,9 +12,7 @@
//!
use anyhow::{bail, Context};
use bytes::Bytes;
use camino::{Utf8Path, Utf8PathBuf};
use enumset::EnumSet;
use futures::FutureExt;
use pageserver_api::models::TimelineState;
use remote_storage::DownloadError;
@@ -25,7 +23,6 @@ use tokio::sync::watch;
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::backoff;
use utils::completion;
use utils::crashsafe::path_with_suffix_extension;
use utils::fs_ext;
@@ -294,16 +291,6 @@ impl From<harness::TestRedoManager> for WalRedoManager {
}
impl WalRedoManager {
pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) {
match self {
Self::Prod(mgr) => mgr.maybe_quiesce(idle_timeout),
#[cfg(test)]
Self::Test(_) => {
// Not applicable to test redo manager
}
}
}
pub async fn request_redo(
&self,
key: crate::repository::Key,
@@ -733,7 +720,7 @@ impl Tenant {
///
async fn attach(
self: &Arc<Tenant>,
init_order: Option<InitializationOrder>,
mut init_order: Option<InitializationOrder>,
preload: Option<TenantPreload>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
@@ -750,6 +737,11 @@ impl Tenant {
}
};
// Signal that we have completed remote phase
init_order
.as_mut()
.and_then(|x| x.initial_tenant_load_remote.take());
let mut timelines_to_resume_deletions = vec![];
let mut remote_index_and_client = HashMap::new();
@@ -1627,7 +1619,6 @@ impl Tenant {
target_timeline_id: Option<TimelineId>,
horizon: u64,
pitr: Duration,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<GcResult> {
// Don't start doing work during shutdown
@@ -1650,7 +1641,7 @@ impl Tenant {
}
}
self.gc_iteration_internal(target_timeline_id, horizon, pitr, cancel, ctx)
self.gc_iteration_internal(target_timeline_id, horizon, pitr, ctx)
.await
}
@@ -1658,16 +1649,22 @@ impl Tenant {
/// This function is periodically called by compactor task.
/// Also it can be explicitly requested per timeline through page server
/// api's 'compact' command.
async fn compaction_iteration(
pub async fn compaction_iteration(
&self,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<(), timeline::CompactionError> {
// Don't start doing work during shutdown, or when broken, we do not need those in the logs
if !self.is_active() {
) -> anyhow::Result<()> {
// Don't start doing work during shutdown
if let TenantState::Stopping { .. } = self.current_state() {
return Ok(());
}
// We should only be called once the tenant has activated.
anyhow::ensure!(
self.is_active(),
"Cannot run compaction iteration on inactive tenant"
);
{
let conf = self.tenant_conf.read().unwrap();
if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() {
@@ -1698,7 +1695,7 @@ impl Tenant {
for (timeline_id, timeline) in &timelines_to_compact {
timeline
.compact(cancel, EnumSet::empty(), ctx)
.compact(cancel, ctx)
.instrument(info_span!("compact_timeline", %timeline_id))
.await?;
}
@@ -1853,7 +1850,6 @@ impl Tenant {
});
})
};
// test_long_timeline_create_then_tenant_delete is leaning on this message
tracing::info!("Waiting for timelines...");
while let Some(res) = js.join_next().await {
match res {
@@ -2568,30 +2564,14 @@ impl Tenant {
target_timeline_id: Option<TimelineId>,
horizon: u64,
pitr: Duration,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<GcResult> {
let mut totals: GcResult = Default::default();
let now = Instant::now();
let gc_timelines = match self
.refresh_gc_info_internal(target_timeline_id, horizon, pitr, cancel, ctx)
.await
{
Ok(result) => result,
Err(e) => {
if let Some(PageReconstructError::Cancelled) =
e.downcast_ref::<PageReconstructError>()
{
// Handle cancellation
totals.elapsed = now.elapsed();
return Ok(totals);
} else {
// Propagate other errors
return Err(e);
}
}
};
let gc_timelines = self
.refresh_gc_info_internal(target_timeline_id, horizon, pitr, ctx)
.await?;
crate::failpoint_support::sleep_millis_async!(
"gc_iteration_internal_after_getting_gc_timelines"
@@ -2615,7 +2595,7 @@ impl Tenant {
// See comments in [`Tenant::branch_timeline`] for more information
// about why branch creation task can run concurrently with timeline's GC iteration.
for timeline in gc_timelines {
if task_mgr::is_shutdown_requested() || cancel.is_cancelled() {
if task_mgr::is_shutdown_requested() {
// We were requested to shut down. Stop and return with the progress we
// made.
break;
@@ -2635,7 +2615,6 @@ impl Tenant {
/// This is usually executed as part of periodic gc, but can now be triggered more often.
pub async fn refresh_gc_info(
&self,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<Vec<Arc<Timeline>>> {
// since this method can now be called at different rates than the configured gc loop, it
@@ -2647,7 +2626,7 @@ impl Tenant {
// refresh all timelines
let target_timeline_id = None;
self.refresh_gc_info_internal(target_timeline_id, horizon, pitr, cancel, ctx)
self.refresh_gc_info_internal(target_timeline_id, horizon, pitr, ctx)
.await
}
@@ -2656,7 +2635,6 @@ impl Tenant {
target_timeline_id: Option<TimelineId>,
horizon: u64,
pitr: Duration,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<Vec<Arc<Timeline>>> {
// grab mutex to prevent new timelines from being created here.
@@ -2730,7 +2708,7 @@ impl Tenant {
.map(|&x| x.1)
.collect();
timeline
.update_gc_info(branchpoints, cutoff, pitr, cancel, ctx)
.update_gc_info(branchpoints, cutoff, pitr, ctx)
.await?;
gc_timelines.push(timeline);
@@ -2893,7 +2871,7 @@ impl Tenant {
}
/// - run initdb to init temporary instance and get bootstrap data
/// - after initialization completes, tar up the temp dir and upload it to S3.
/// - after initialization complete, remove the temp dir.
///
/// The caller is responsible for activating the returned timeline.
async fn bootstrap_timeline(
@@ -2908,7 +2886,7 @@ impl Tenant {
};
// create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
// temporary directory for basebackup files for the given timeline.
let pgdata_path = path_with_suffix_extension(
let initdb_path = path_with_suffix_extension(
self.conf
.timelines_path(&self.tenant_id)
.join(format!("basebackup-{timeline_id}")),
@@ -2917,45 +2895,22 @@ impl Tenant {
// an uninit mark was placed before, nothing else can access this timeline files
// current initdb was not run yet, so remove whatever was left from the previous runs
if pgdata_path.exists() {
fs::remove_dir_all(&pgdata_path).with_context(|| {
format!("Failed to remove already existing initdb directory: {pgdata_path}")
if initdb_path.exists() {
fs::remove_dir_all(&initdb_path).with_context(|| {
format!("Failed to remove already existing initdb directory: {initdb_path}")
})?;
}
// Init temporarily repo to get bootstrap data, this creates a directory in the `pgdata_path` path
run_initdb(self.conf, &pgdata_path, pg_version)?;
// Init temporarily repo to get bootstrap data, this creates a directory in the `initdb_path` path
run_initdb(self.conf, &initdb_path, pg_version)?;
// this new directory is very temporary, set to remove it immediately after bootstrap, we don't need it
scopeguard::defer! {
if let Err(e) = fs::remove_dir_all(&pgdata_path) {
if let Err(e) = fs::remove_dir_all(&initdb_path) {
// this is unlikely, but we will remove the directory on pageserver restart or another bootstrap call
error!("Failed to remove temporary initdb directory '{pgdata_path}': {e}");
error!("Failed to remove temporary initdb directory '{initdb_path}': {e}");
}
}
let pgdata_lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align();
// Upload the created data dir to S3
if let Some(storage) = &self.remote_storage {
let pgdata_zstd = import_datadir::create_tar_zst(&pgdata_path).await?;
let pgdata_zstd = Bytes::from(pgdata_zstd);
backoff::retry(
|| async {
self::remote_timeline_client::upload_initdb_dir(
storage,
&self.tenant_id,
&timeline_id,
pgdata_zstd.clone(),
)
.await
},
|_| false,
3,
u32::MAX,
"persist_initdb_tar_zst",
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
)
.await?;
}
let pgdata_path = &initdb_path;
let pgdata_lsn = import_datadir::get_lsn_from_controlfile(pgdata_path)?.align();
// Import the contents of the data directory at the initial checkpoint
// LSN, and any WAL after that.
@@ -2985,7 +2940,7 @@ impl Tenant {
import_datadir::import_timeline_from_postgres_datadir(
unfinished_timeline,
&pgdata_path,
pgdata_path,
pgdata_lsn,
ctx,
)
@@ -3166,7 +3121,6 @@ impl Tenant {
// (only if it is shorter than the real cutoff).
max_retention_period: Option<u64>,
cause: LogicalSizeCalculationCause,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<size::ModelInputs> {
let logical_sizes_at_once = self
@@ -3189,7 +3143,6 @@ impl Tenant {
max_retention_period,
&mut shared_cache,
cause,
cancel,
ctx,
)
.await
@@ -3202,10 +3155,9 @@ impl Tenant {
pub async fn calculate_synthetic_size(
&self,
cause: LogicalSizeCalculationCause,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<u64> {
let inputs = self.gather_size_inputs(None, cause, cancel, ctx).await?;
let inputs = self.gather_size_inputs(None, cause, ctx).await?;
let size = inputs.calculate()?;
@@ -3468,7 +3420,6 @@ pub async fn dump_layerfile_from_path(
pub(crate) mod harness {
use bytes::{Bytes, BytesMut};
use once_cell::sync::OnceCell;
use pageserver_api::shard::ShardIndex;
use std::fs;
use std::sync::Arc;
use utils::logging;
@@ -3535,7 +3486,6 @@ pub(crate) mod harness {
pub tenant_conf: TenantConf,
pub tenant_id: TenantId,
pub generation: Generation,
pub shard: ShardIndex,
pub remote_storage: GenericRemoteStorage,
pub remote_fs_dir: Utf8PathBuf,
pub deletion_queue: MockDeletionQueue,
@@ -3550,7 +3500,6 @@ pub(crate) mod harness {
// enable it in case the tests exercise code paths that use
// debug_assert_current_span_has_tenant_and_timeline_id
logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
logging::Output::Stdout,
)
.expect("Failed to init test logging")
});
@@ -3595,7 +3544,6 @@ pub(crate) mod harness {
tenant_conf,
tenant_id,
generation: Generation::new(0xdeadbeef),
shard: ShardIndex::unsharded(),
remote_storage,
remote_fs_dir,
deletion_queue,
@@ -3980,13 +3928,7 @@ mod tests {
// and compaction works. But it does set the 'cutoff' point so that the cross check
// below should fail.
tenant
.gc_iteration(
Some(TIMELINE_ID),
0x10,
Duration::ZERO,
&CancellationToken::new(),
&ctx,
)
.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx)
.await?;
// try to branch at lsn 25, should fail because we already garbage collected the data
@@ -4089,13 +4031,7 @@ mod tests {
tline.set_broken("test".to_owned());
tenant
.gc_iteration(
Some(TIMELINE_ID),
0x10,
Duration::ZERO,
&CancellationToken::new(),
&ctx,
)
.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx)
.await?;
// The branchpoints should contain all timelines, even ones marked
@@ -4141,13 +4077,7 @@ mod tests {
.expect("Should have a local timeline");
// this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
tenant
.gc_iteration(
Some(TIMELINE_ID),
0x10,
Duration::ZERO,
&CancellationToken::new(),
&ctx,
)
.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx)
.await?;
assert!(newtline.get(*TEST_KEY, Lsn(0x25), &ctx).await.is_ok());
@@ -4175,13 +4105,7 @@ mod tests {
// run gc on parent
tenant
.gc_iteration(
Some(TIMELINE_ID),
0x10,
Duration::ZERO,
&CancellationToken::new(),
&ctx,
)
.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx)
.await?;
// Check that the data is still accessible on the branch.
@@ -4370,9 +4294,7 @@ mod tests {
drop(writer);
tline.freeze_and_flush().await?;
tline
.compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
.await?;
tline.compact(&CancellationToken::new(), &ctx).await?;
let writer = tline.writer().await;
writer
@@ -4387,9 +4309,7 @@ mod tests {
drop(writer);
tline.freeze_and_flush().await?;
tline
.compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
.await?;
tline.compact(&CancellationToken::new(), &ctx).await?;
let writer = tline.writer().await;
writer
@@ -4404,9 +4324,7 @@ mod tests {
drop(writer);
tline.freeze_and_flush().await?;
tline
.compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
.await?;
tline.compact(&CancellationToken::new(), &ctx).await?;
let writer = tline.writer().await;
writer
@@ -4421,9 +4339,7 @@ mod tests {
drop(writer);
tline.freeze_and_flush().await?;
tline
.compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
.await?;
tline.compact(&CancellationToken::new(), &ctx).await?;
assert_eq!(
tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?,
@@ -4491,18 +4407,10 @@ mod tests {
let cutoff = tline.get_last_record_lsn();
tline
.update_gc_info(
Vec::new(),
cutoff,
Duration::ZERO,
&CancellationToken::new(),
&ctx,
)
.update_gc_info(Vec::new(), cutoff, Duration::ZERO, &ctx)
.await?;
tline.freeze_and_flush().await?;
tline
.compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
.await?;
tline.compact(&CancellationToken::new(), &ctx).await?;
tline.gc().await?;
}
@@ -4579,18 +4487,10 @@ mod tests {
// Perform a cycle of flush, compact, and GC
let cutoff = tline.get_last_record_lsn();
tline
.update_gc_info(
Vec::new(),
cutoff,
Duration::ZERO,
&CancellationToken::new(),
&ctx,
)
.update_gc_info(Vec::new(), cutoff, Duration::ZERO, &ctx)
.await?;
tline.freeze_and_flush().await?;
tline
.compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
.await?;
tline.compact(&CancellationToken::new(), &ctx).await?;
tline.gc().await?;
}
@@ -4677,18 +4577,10 @@ mod tests {
// Perform a cycle of flush, compact, and GC
let cutoff = tline.get_last_record_lsn();
tline
.update_gc_info(
Vec::new(),
cutoff,
Duration::ZERO,
&CancellationToken::new(),
&ctx,
)
.update_gc_info(Vec::new(), cutoff, Duration::ZERO, &ctx)
.await?;
tline.freeze_and_flush().await?;
tline
.compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
.await?;
tline.compact(&CancellationToken::new(), &ctx).await?;
tline.gc().await?;
}

View File

@@ -10,7 +10,6 @@
//!
use anyhow::Context;
use pageserver_api::models;
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
use serde::{Deserialize, Serialize};
use std::num::NonZeroU64;
use std::time::Duration;
@@ -89,14 +88,6 @@ pub(crate) struct LocationConf {
/// The location-specific part of the configuration, describes the operating
/// mode of this pageserver for this tenant.
pub(crate) mode: LocationMode,
/// The detailed shard identity. This structure is already scoped within
/// a TenantShardId, but we need the full ShardIdentity to enable calculating
/// key->shard mappings.
#[serde(default = "ShardIdentity::unsharded")]
#[serde(skip_serializing_if = "ShardIdentity::is_unsharded")]
pub(crate) shard: ShardIdentity,
/// The pan-cluster tenant configuration, the same on all locations
pub(crate) tenant_conf: TenantConfOpt,
}
@@ -169,8 +160,6 @@ impl LocationConf {
generation,
attach_mode: AttachmentMode::Single,
}),
// Legacy configuration loads are always from tenants created before sharding existed.
shard: ShardIdentity::unsharded(),
tenant_conf,
}
}
@@ -198,7 +187,6 @@ impl LocationConf {
fn get_generation(conf: &'_ models::LocationConfig) -> Result<Generation, anyhow::Error> {
conf.generation
.map(Generation::new)
.ok_or_else(|| anyhow::anyhow!("Generation must be set when attaching"))
}
@@ -238,21 +226,7 @@ impl LocationConf {
}
};
let shard = if conf.shard_count == 0 {
ShardIdentity::unsharded()
} else {
ShardIdentity::new(
ShardNumber(conf.shard_number),
ShardCount(conf.shard_count),
ShardStripeSize(conf.shard_stripe_size),
)?
};
Ok(Self {
shard,
mode,
tenant_conf,
})
Ok(Self { mode, tenant_conf })
}
}
@@ -267,7 +241,6 @@ impl Default for LocationConf {
attach_mode: AttachmentMode::Single,
}),
tenant_conf: TenantConfOpt::default(),
shard: ShardIdentity::unsharded(),
}
}
}

View File

@@ -1944,7 +1944,6 @@ pub(crate) async fn immediate_gc(
tenant_id: TenantId,
timeline_id: TimelineId,
gc_req: TimelineGcRequest,
cancel: CancellationToken,
ctx: &RequestContext,
) -> Result<tokio::sync::oneshot::Receiver<Result<GcResult, anyhow::Error>>, ApiError> {
let guard = TENANTS.read().unwrap();
@@ -1971,7 +1970,7 @@ pub(crate) async fn immediate_gc(
async move {
fail::fail_point!("immediate_gc_task_pre");
let result = tenant
.gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx)
.gc_iteration(Some(timeline_id), gc_horizon, pitr, &ctx)
.instrument(info_span!("manual_gc", %tenant_id, %timeline_id))
.await;
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it

View File

@@ -188,10 +188,8 @@ use anyhow::Context;
use camino::Utf8Path;
use chrono::{NaiveDateTime, Utc};
use pageserver_api::shard::ShardIndex;
use scopeguard::ScopeGuard;
use tokio_util::sync::CancellationToken;
pub(crate) use upload::upload_initdb_dir;
use utils::backoff::{
self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
};
@@ -251,8 +249,6 @@ pub(crate) const FAILED_REMOTE_OP_RETRIES: u32 = 10;
// retries. Uploads and deletions are retried forever, though.
pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
pub(crate) const INITDB_PATH: &str = "initdb.tar.zst";
pub enum MaybeDeletedIndexPart {
IndexPart(IndexPart),
Deleted(IndexPart),
@@ -403,11 +399,6 @@ impl RemoteTimelineClient {
Ok(())
}
pub(crate) fn get_shard_index(&self) -> ShardIndex {
// TODO: carry this on the struct
ShardIndex::unsharded()
}
pub fn remote_consistent_lsn_projected(&self) -> Option<Lsn> {
match &mut *self.upload_queue.lock().unwrap() {
UploadQueue::Uninitialized => None,
@@ -471,7 +462,6 @@ impl RemoteTimelineClient {
&self.storage_impl,
&self.tenant_id,
&self.timeline_id,
self.get_shard_index(),
self.generation,
cancel,
)
@@ -657,20 +647,52 @@ impl RemoteTimelineClient {
/// deletion won't actually be performed, until all previously scheduled
/// upload operations, and the index file upload, have completed
/// successfully.
///
/// No work is done if the layers are not present in the remote index. Returns
/// false if no work was done.
pub fn schedule_layer_file_deletion(
self: &Arc<Self>,
names: &[LayerFileName],
) -> anyhow::Result<()> {
) -> anyhow::Result<bool> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
let with_metadata =
let with_generations =
self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned());
self.schedule_deletion_of_unlinked0(upload_queue, with_metadata);
if with_generations.is_empty() {
// No-op.
Ok(false)
} else {
self.schedule_deletion_of_unlinked0(upload_queue, with_generations);
// Launch the tasks immediately, if possible
self.launch_queued_tasks(upload_queue);
// Launch the tasks immediately, if possible
self.launch_queued_tasks(upload_queue);
Ok(true)
}
}
/// Schedule layer deletions and wait for them to fully execute.
///
/// This is not the normal way to delete layers: usually deletion is scheduled and
/// left to run in the background. However, during startup in [`crate::tenant::Timeline::load_layer_map`]
/// we may find that there are some layers in the future wrt disk_consistent_lsn,
/// and drop them. This is different to a normal deletion, because we are deleting layers that
/// we may soon re-upload with the same name: it's important that the deletions do not race with
/// those later uploads. So this function includes a full flush of the deletion queue.
///
/// TODO: remote, as we will no longer need this function when we are always running pageservers with
/// generations enabled, because layer keys after a restart will always differ to layers before
/// the restart by their generation suffix.
pub async fn flushing_delete_layers(
self: &Arc<Self>,
names: &[LayerFileName],
) -> anyhow::Result<()> {
if self.schedule_layer_file_deletion(names)? {
self.wait_completion().await?;
self.deletion_queue_client.flush_execute().await?;
}
Ok(())
}
@@ -702,7 +724,7 @@ impl RemoteTimelineClient {
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
names: I,
) -> Vec<(LayerFileName, LayerFileMetadata)>
) -> Vec<(LayerFileName, Generation)>
where
I: IntoIterator<Item = LayerFileName>,
{
@@ -710,17 +732,16 @@ impl RemoteTimelineClient {
// so we don't need update it. Just serialize it.
let metadata = upload_queue.latest_metadata.clone();
// Decorate our list of names with each name's metadata, dropping
// names that are unexpectedly missing from our metadata. This metadata
// is later used when physically deleting layers, to construct key paths.
let with_metadata: Vec<_> = names
// Decorate our list of names with each name's generation, dropping
// names that are unexpectedly missing from our metadata.
let with_generations: Vec<_> = names
.into_iter()
.filter_map(|name| {
let meta = upload_queue.latest_files.remove(&name);
if let Some(meta) = meta {
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
Some((name, meta))
Some((name, meta.generation))
} else {
// This can only happen if we forgot to to schedule the file upload
// before scheduling the delete. Log it because it is a rare/strange
@@ -733,10 +754,9 @@ impl RemoteTimelineClient {
.collect();
#[cfg(feature = "testing")]
for (name, metadata) in &with_metadata {
let gen = metadata.generation;
if let Some(unexpected) = upload_queue.dangling_files.insert(name.to_owned(), gen) {
if unexpected == gen {
for (name, gen) in &with_generations {
if let Some(unexpected) = upload_queue.dangling_files.insert(name.to_owned(), *gen) {
if &unexpected == gen {
tracing::error!("{name} was unlinked twice with same generation");
} else {
tracing::error!("{name} was unlinked twice with different generations {gen:?} and {unexpected:?}");
@@ -751,14 +771,14 @@ impl RemoteTimelineClient {
self.schedule_index_upload(upload_queue, metadata);
}
with_metadata
with_generations
}
/// Schedules deletion for layer files which have previously been unlinked from the
/// `index_part.json` with [`Self::schedule_gc_update`] or [`Self::schedule_compaction_update`].
pub(crate) fn schedule_deletion_of_unlinked(
self: &Arc<Self>,
layers: Vec<(LayerFileName, LayerFileMetadata)>,
layers: Vec<(LayerFileName, Generation)>,
) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
@@ -771,22 +791,16 @@ impl RemoteTimelineClient {
fn schedule_deletion_of_unlinked0(
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
with_generations: Vec<(LayerFileName, Generation)>,
) {
for (name, meta) in &with_metadata {
info!(
"scheduling deletion of layer {}{} (shard {})",
name,
meta.generation.get_suffix(),
meta.shard
);
for (name, gen) in &with_generations {
info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
}
#[cfg(feature = "testing")]
for (name, meta) in &with_metadata {
let gen = meta.generation;
for (name, gen) in &with_generations {
match upload_queue.dangling_files.remove(name) {
Some(same) if same == gen => { /* expected */ }
Some(same) if &same == gen => { /* expected */ }
Some(other) => {
tracing::error!("{name} was unlinked with {other:?} but deleted with {gen:?}");
}
@@ -798,7 +812,7 @@ impl RemoteTimelineClient {
// schedule the actual deletions
let op = UploadOp::Delete(Delete {
layers: with_metadata,
layers: with_generations,
});
self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op);
@@ -834,7 +848,7 @@ impl RemoteTimelineClient {
let mut receiver = {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
self.schedule_barrier0(upload_queue)
self.schedule_barrier(upload_queue)
};
if receiver.changed().await.is_err() {
@@ -843,14 +857,7 @@ impl RemoteTimelineClient {
Ok(())
}
pub(crate) fn schedule_barrier(self: &Arc<Self>) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
self.schedule_barrier0(upload_queue);
Ok(())
}
fn schedule_barrier0(
fn schedule_barrier(
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
) -> tokio::sync::watch::Receiver<()> {
@@ -919,7 +926,6 @@ impl RemoteTimelineClient {
&self.storage_impl,
&self.tenant_id,
&self.timeline_id,
self.get_shard_index(),
self.generation,
&index_part_with_deleted_at,
)
@@ -978,7 +984,6 @@ impl RemoteTimelineClient {
remote_layer_path(
&self.tenant_id,
&self.timeline_id,
meta.shard,
&file_name,
meta.generation,
)
@@ -1027,12 +1032,7 @@ impl RemoteTimelineClient {
.unwrap_or(
// No generation-suffixed indices, assume we are dealing with
// a legacy index.
remote_index_path(
&self.tenant_id,
&self.timeline_id,
self.get_shard_index(),
Generation::none(),
),
remote_index_path(&self.tenant_id, &self.timeline_id, Generation::none()),
);
let remaining_layers: Vec<RemotePath> = remaining
@@ -1241,7 +1241,6 @@ impl RemoteTimelineClient {
&self.storage_impl,
&self.tenant_id,
&self.timeline_id,
self.get_shard_index(),
self.generation,
index_part,
)
@@ -1262,18 +1261,16 @@ impl RemoteTimelineClient {
}
res
}
UploadOp::Delete(delete) => {
pausable_failpoint!("before-delete-layer-pausable");
self.deletion_queue_client
.push_layers(
self.tenant_id,
self.timeline_id,
self.generation,
delete.layers.clone(),
)
.await
.map_err(|e| anyhow::anyhow!(e))
}
UploadOp::Delete(delete) => self
.deletion_queue_client
.push_layers(
self.tenant_id,
self.timeline_id,
self.generation,
delete.layers.clone(),
)
.await
.map_err(|e| anyhow::anyhow!(e)),
UploadOp::Barrier(_) => {
// unreachable. Barrier operations are handled synchronously in
// launch_queued_tasks
@@ -1550,14 +1547,12 @@ pub fn remote_timeline_path(tenant_id: &TenantId, timeline_id: &TimelineId) -> R
pub fn remote_layer_path(
tenant_id: &TenantId,
timeline_id: &TimelineId,
shard: ShardIndex,
layer_file_name: &LayerFileName,
generation: Generation,
) -> RemotePath {
// Generation-aware key format
let path = format!(
"tenants/{tenant_id}{0}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{1}{2}",
shard.get_suffix(),
"tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
layer_file_name.file_name(),
generation.get_suffix()
);
@@ -1565,22 +1560,13 @@ pub fn remote_layer_path(
RemotePath::from_string(&path).expect("Failed to construct path")
}
pub fn remote_initdb_archive_path(tenant_id: &TenantId, timeline_id: &TimelineId) -> RemotePath {
RemotePath::from_string(&format!(
"tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{INITDB_PATH}"
))
.expect("Failed to construct path")
}
pub fn remote_index_path(
tenant_id: &TenantId,
timeline_id: &TimelineId,
shard: ShardIndex,
generation: Generation,
) -> RemotePath {
RemotePath::from_string(&format!(
"tenants/{tenant_id}{0}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{1}{2}",
shard.get_suffix(),
"tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
IndexPart::FILE_NAME,
generation.get_suffix()
))
@@ -1805,7 +1791,6 @@ mod tests {
println!("remote_timeline_dir: {remote_timeline_dir}");
let generation = harness.generation;
let shard = harness.shard;
// Create a couple of dummy files, schedule upload for them
@@ -1822,7 +1807,7 @@ mod tests {
harness.conf,
&timeline,
name,
LayerFileMetadata::new(contents.len() as u64, generation, shard),
LayerFileMetadata::new(contents.len() as u64, generation),
)
}).collect::<Vec<_>>();
@@ -1971,7 +1956,7 @@ mod tests {
harness.conf,
&timeline,
layer_file_name_1.clone(),
LayerFileMetadata::new(content_1.len() as u64, harness.generation, harness.shard),
LayerFileMetadata::new(content_1.len() as u64, harness.generation),
);
#[derive(Debug, PartialEq, Clone, Copy)]
@@ -2036,11 +2021,7 @@ mod tests {
assert_eq!(actual_c, expected_c);
}
async fn inject_index_part(
test_state: &TestSetup,
generation: Generation,
shard: ShardIndex,
) -> IndexPart {
async fn inject_index_part(test_state: &TestSetup, generation: Generation) -> IndexPart {
// An empty IndexPart, just sufficient to ensure deserialization will succeed
let example_metadata = TimelineMetadata::example();
let example_index_part = IndexPart::new(
@@ -2061,13 +2042,7 @@ mod tests {
std::fs::create_dir_all(remote_timeline_dir).expect("creating test dir should work");
let index_path = test_state.harness.remote_fs_dir.join(
remote_index_path(
&test_state.harness.tenant_id,
&TIMELINE_ID,
shard,
generation,
)
.get_path(),
remote_index_path(&test_state.harness.tenant_id, &TIMELINE_ID, generation).get_path(),
);
eprintln!("Writing {index_path}");
std::fs::write(&index_path, index_part_bytes).unwrap();
@@ -2104,12 +2079,7 @@ mod tests {
// Simple case: we are in generation N, load the index from generation N - 1
let generation_n = 5;
let injected = inject_index_part(
&test_state,
Generation::new(generation_n - 1),
ShardIndex::unsharded(),
)
.await;
let injected = inject_index_part(&test_state, Generation::new(generation_n - 1)).await;
assert_got_index_part(&test_state, Generation::new(generation_n), &injected).await;
@@ -2127,34 +2097,22 @@ mod tests {
// A generation-less IndexPart exists in the bucket, we should find it
let generation_n = 5;
let injected_none =
inject_index_part(&test_state, Generation::none(), ShardIndex::unsharded()).await;
let injected_none = inject_index_part(&test_state, Generation::none()).await;
assert_got_index_part(&test_state, Generation::new(generation_n), &injected_none).await;
// If a more recent-than-none generation exists, we should prefer to load that
let injected_1 =
inject_index_part(&test_state, Generation::new(1), ShardIndex::unsharded()).await;
let injected_1 = inject_index_part(&test_state, Generation::new(1)).await;
assert_got_index_part(&test_state, Generation::new(generation_n), &injected_1).await;
// If a more-recent-than-me generation exists, we should ignore it.
let _injected_10 =
inject_index_part(&test_state, Generation::new(10), ShardIndex::unsharded()).await;
let _injected_10 = inject_index_part(&test_state, Generation::new(10)).await;
assert_got_index_part(&test_state, Generation::new(generation_n), &injected_1).await;
// If a directly previous generation exists, _and_ an index exists in my own
// generation, I should prefer my own generation.
let _injected_prev = inject_index_part(
&test_state,
Generation::new(generation_n - 1),
ShardIndex::unsharded(),
)
.await;
let injected_current = inject_index_part(
&test_state,
Generation::new(generation_n),
ShardIndex::unsharded(),
)
.await;
let _injected_prev =
inject_index_part(&test_state, Generation::new(generation_n - 1)).await;
let injected_current = inject_index_part(&test_state, Generation::new(generation_n)).await;
assert_got_index_part(
&test_state,
Generation::new(generation_n),

View File

@@ -9,7 +9,6 @@ use std::time::Duration;
use anyhow::{anyhow, Context};
use camino::Utf8Path;
use pageserver_api::shard::ShardIndex;
use tokio::fs;
use tokio::io::AsyncWriteExt;
use tokio_util::sync::CancellationToken;
@@ -54,7 +53,6 @@ pub async fn download_layer_file<'a>(
let remote_path = remote_layer_path(
&tenant_id,
&timeline_id,
layer_metadata.shard,
layer_file_name,
layer_metadata.generation,
);
@@ -215,11 +213,10 @@ async fn do_download_index_part(
storage: &GenericRemoteStorage,
tenant_id: &TenantId,
timeline_id: &TimelineId,
shard: ShardIndex,
index_generation: Generation,
cancel: CancellationToken,
) -> Result<IndexPart, DownloadError> {
let remote_path = remote_index_path(tenant_id, timeline_id, shard, index_generation);
let remote_path = remote_index_path(tenant_id, timeline_id, index_generation);
let index_part_bytes = download_retry_forever(
|| async {
@@ -257,7 +254,6 @@ pub(super) async fn download_index_part(
storage: &GenericRemoteStorage,
tenant_id: &TenantId,
timeline_id: &TimelineId,
shard: ShardIndex,
my_generation: Generation,
cancel: CancellationToken,
) -> Result<IndexPart, DownloadError> {
@@ -265,15 +261,8 @@ pub(super) async fn download_index_part(
if my_generation.is_none() {
// Operating without generations: just fetch the generation-less path
return do_download_index_part(
storage,
tenant_id,
timeline_id,
shard,
my_generation,
cancel,
)
.await;
return do_download_index_part(storage, tenant_id, timeline_id, my_generation, cancel)
.await;
}
// Stale case: If we were intentionally attached in a stale generation, there may already be a remote
@@ -284,7 +273,6 @@ pub(super) async fn download_index_part(
storage,
tenant_id,
timeline_id,
shard,
my_generation,
cancel.clone(),
)
@@ -312,7 +300,6 @@ pub(super) async fn download_index_part(
storage,
tenant_id,
timeline_id,
shard,
my_generation.previous(),
cancel.clone(),
)
@@ -333,9 +320,8 @@ pub(super) async fn download_index_part(
}
// General case/fallback: if there is no index at my_generation or prev_generation, then list all index_part.json
// objects, and select the highest one with a generation <= my_generation. Constructing the prefix is equivalent
// to constructing a full index path with no generation, because the generation is a suffix.
let index_prefix = remote_index_path(tenant_id, timeline_id, shard, Generation::none());
// objects, and select the highest one with a generation <= my_generation.
let index_prefix = remote_index_path(tenant_id, timeline_id, Generation::none());
let indices = backoff::retry(
|| async { storage.list_files(Some(&index_prefix)).await },
|_| false,
@@ -361,21 +347,14 @@ pub(super) async fn download_index_part(
match max_previous_generation {
Some(g) => {
tracing::debug!("Found index_part in generation {g:?}");
do_download_index_part(storage, tenant_id, timeline_id, shard, g, cancel).await
do_download_index_part(storage, tenant_id, timeline_id, g, cancel).await
}
None => {
// Migration from legacy pre-generation state: we have a generation but no prior
// attached pageservers did. Try to load from a no-generation path.
tracing::info!("No index_part.json* found");
do_download_index_part(
storage,
tenant_id,
timeline_id,
shard,
Generation::none(),
cancel,
)
.await
do_download_index_part(storage, tenant_id, timeline_id, Generation::none(), cancel)
.await
}
}
}

View File

@@ -12,7 +12,6 @@ use crate::tenant::metadata::TimelineMetadata;
use crate::tenant::storage_layer::LayerFileName;
use crate::tenant::upload_queue::UploadQueueInitialized;
use crate::tenant::Generation;
use pageserver_api::shard::ShardIndex;
use utils::lsn::Lsn;
@@ -26,8 +25,6 @@ pub struct LayerFileMetadata {
file_size: u64,
pub(crate) generation: Generation,
pub(crate) shard: ShardIndex,
}
impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
@@ -35,17 +32,15 @@ impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
LayerFileMetadata {
file_size: other.file_size,
generation: other.generation,
shard: other.shard,
}
}
}
impl LayerFileMetadata {
pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self {
pub fn new(file_size: u64, generation: Generation) -> Self {
LayerFileMetadata {
file_size,
generation,
shard,
}
}
@@ -133,14 +128,6 @@ impl IndexPart {
pub fn get_disk_consistent_lsn(&self) -> Lsn {
self.disk_consistent_lsn
}
pub fn from_s3_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
serde_json::from_slice::<IndexPart>(bytes)
}
pub fn to_s3_bytes(&self) -> serde_json::Result<Vec<u8>> {
serde_json::to_vec(self)
}
}
impl TryFrom<&UploadQueueInitialized> for IndexPart {
@@ -166,10 +153,6 @@ pub struct IndexLayerMetadata {
#[serde(default = "Generation::none")]
#[serde(skip_serializing_if = "Generation::is_none")]
pub generation: Generation,
#[serde(default = "ShardIndex::unsharded")]
#[serde(skip_serializing_if = "ShardIndex::is_unsharded")]
pub shard: ShardIndex,
}
impl From<LayerFileMetadata> for IndexLayerMetadata {
@@ -177,7 +160,6 @@ impl From<LayerFileMetadata> for IndexLayerMetadata {
IndexLayerMetadata {
file_size: other.file_size,
generation: other.generation,
shard: other.shard,
}
}
}
@@ -205,15 +187,13 @@ mod tests {
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
generation: Generation::none(),
shard: ShardIndex::unsharded()
generation: Generation::none()
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
// serde_json should always parse this but this might be a double with jq for
// example.
file_size: 9007199254741001,
generation: Generation::none(),
shard: ShardIndex::unsharded()
generation: Generation::none()
})
]),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
@@ -221,7 +201,7 @@ mod tests {
deleted_at: None,
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
let part = serde_json::from_str::<IndexPart>(example).unwrap();
assert_eq!(part, expected);
}
@@ -245,15 +225,13 @@ mod tests {
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
generation: Generation::none(),
shard: ShardIndex::unsharded()
generation: Generation::none()
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
// serde_json should always parse this but this might be a double with jq for
// example.
file_size: 9007199254741001,
generation: Generation::none(),
shard: ShardIndex::unsharded()
generation: Generation::none()
})
]),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
@@ -261,7 +239,7 @@ mod tests {
deleted_at: None,
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
let part = serde_json::from_str::<IndexPart>(example).unwrap();
assert_eq!(part, expected);
}
@@ -286,15 +264,13 @@ mod tests {
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
generation: Generation::none(),
shard: ShardIndex::unsharded()
generation: Generation::none()
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
// serde_json should always parse this but this might be a double with jq for
// example.
file_size: 9007199254741001,
generation: Generation::none(),
shard: ShardIndex::unsharded()
generation: Generation::none()
})
]),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
@@ -303,7 +279,7 @@ mod tests {
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
let part = serde_json::from_str::<IndexPart>(example).unwrap();
assert_eq!(part, expected);
}
@@ -347,7 +323,7 @@ mod tests {
deleted_at: None,
};
let empty_layers_parsed = IndexPart::from_s3_bytes(empty_layers_json.as_bytes()).unwrap();
let empty_layers_parsed = serde_json::from_str::<IndexPart>(empty_layers_json).unwrap();
assert_eq!(empty_layers_parsed, expected);
}
@@ -370,24 +346,22 @@ mod tests {
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
generation: Generation::none(),
shard: ShardIndex::unsharded()
generation: Generation::none()
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
// serde_json should always parse this but this might be a double with jq for
// example.
file_size: 9007199254741001,
generation: Generation::none(),
shard: ShardIndex::unsharded()
generation: Generation::none()
})
]),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
let part = serde_json::from_str::<IndexPart>(example).unwrap();
assert_eq!(part, expected);
}
}

View File

@@ -1,19 +1,15 @@
//! Helper functions to upload files to remote storage with a RemoteStorage
use anyhow::{bail, Context};
use bytes::Bytes;
use camino::Utf8Path;
use fail::fail_point;
use pageserver_api::shard::ShardIndex;
use std::io::ErrorKind;
use tokio::fs;
use super::Generation;
use crate::{
config::PageServerConf,
tenant::remote_timeline_client::{
index::IndexPart, remote_index_path, remote_initdb_archive_path, remote_path,
},
tenant::remote_timeline_client::{index::IndexPart, remote_index_path, remote_path},
};
use remote_storage::GenericRemoteStorage;
use utils::id::{TenantId, TimelineId};
@@ -27,7 +23,6 @@ pub(super) async fn upload_index_part<'a>(
storage: &'a GenericRemoteStorage,
tenant_id: &TenantId,
timeline_id: &TimelineId,
shard: ShardIndex,
generation: Generation,
index_part: &'a IndexPart,
) -> anyhow::Result<()> {
@@ -38,13 +33,12 @@ pub(super) async fn upload_index_part<'a>(
});
pausable_failpoint!("before-upload-index-pausable");
let index_part_bytes = index_part
.to_s3_bytes()
.context("serialize index part file into bytes")?;
let index_part_bytes =
serde_json::to_vec(&index_part).context("serialize index part file into bytes")?;
let index_part_size = index_part_bytes.len();
let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes));
let remote_path = remote_index_path(tenant_id, timeline_id, shard, generation);
let remote_path = remote_index_path(tenant_id, timeline_id, generation);
storage
.upload_storage_object(Box::new(index_part_bytes), index_part_size, &remote_path)
.await
@@ -109,22 +103,3 @@ pub(super) async fn upload_timeline_layer<'a>(
Ok(())
}
/// Uploads the given `initdb` data to the remote storage.
pub(crate) async fn upload_initdb_dir(
storage: &GenericRemoteStorage,
tenant_id: &TenantId,
timeline_id: &TimelineId,
initdb_dir: Bytes,
) -> anyhow::Result<()> {
tracing::trace!("uploading initdb dir");
let size = initdb_dir.len();
let bytes = tokio::io::BufReader::new(std::io::Cursor::new(initdb_dir));
let remote_path = remote_initdb_archive_path(tenant_id, timeline_id);
storage
.upload_storage_object(bytes, size, &remote_path)
.await
.with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
}

View File

@@ -6,7 +6,6 @@ use std::sync::Arc;
use anyhow::{bail, Context};
use tokio::sync::oneshot::error::RecvError;
use tokio::sync::Semaphore;
use tokio_util::sync::CancellationToken;
use crate::context::RequestContext;
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
@@ -114,12 +113,11 @@ pub(super) async fn gather_inputs(
max_retention_period: Option<u64>,
logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
cause: LogicalSizeCalculationCause,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<ModelInputs> {
// refresh is needed to update gc related pitr_cutoff and horizon_cutoff
tenant
.refresh_gc_info(cancel, ctx)
.refresh_gc_info(ctx)
.await
.context("Failed to refresh gc_info before gathering inputs")?;

View File

@@ -289,9 +289,7 @@ impl DeltaLayer {
async fn load_inner(&self, ctx: &RequestContext) -> Result<Arc<DeltaLayerInner>> {
let path = self.path();
let loaded = DeltaLayerInner::load(&path, None, ctx)
.await
.and_then(|res| res)?;
let loaded = DeltaLayerInner::load(&path, None, ctx).await?;
// not production code
let actual_filename = path.file_name().unwrap().to_owned();
@@ -612,28 +610,18 @@ impl Drop for DeltaLayerWriter {
}
impl DeltaLayerInner {
/// Returns nested result following Result<Result<_, OpErr>, Critical>:
/// - inner has the success or transient failure
/// - outer has the permanent failure
pub(super) async fn load(
path: &Utf8Path,
summary: Option<Summary>,
ctx: &RequestContext,
) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
let file = match VirtualFile::open(path).await {
Ok(file) => file,
Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
};
) -> anyhow::Result<Self> {
let file = VirtualFile::open(path)
.await
.with_context(|| format!("Failed to open file '{path}'"))?;
let file = FileBlockReader::new(file);
let summary_blk = match file.read_blk(0, ctx).await {
Ok(blk) => blk,
Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
};
// TODO: this should be an assertion instead; see ImageLayerInner::load
let actual_summary =
Summary::des_prefix(summary_blk.as_ref()).context("deserialize first block")?;
let summary_blk = file.read_blk(0, ctx).await?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
if let Some(mut expected_summary) = summary {
// production code path
@@ -648,11 +636,11 @@ impl DeltaLayerInner {
}
}
Ok(Ok(DeltaLayerInner {
Ok(DeltaLayerInner {
file,
index_start_blk: actual_summary.index_start_blk,
index_root_blk: actual_summary.index_root_blk,
}))
})
}
pub(super) async fn get_value_reconstruct_data(

View File

@@ -249,9 +249,7 @@ impl ImageLayer {
async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
let path = self.path();
let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx)
.await
.and_then(|res| res)?;
let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx).await?;
// not production code
let actual_filename = path.file_name().unwrap().to_owned();
@@ -297,31 +295,18 @@ impl ImageLayer {
}
impl ImageLayerInner {
/// Returns nested result following Result<Result<_, OpErr>, Critical>:
/// - inner has the success or transient failure
/// - outer has the permanent failure
pub(super) async fn load(
path: &Utf8Path,
lsn: Lsn,
summary: Option<Summary>,
ctx: &RequestContext,
) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
let file = match VirtualFile::open(path).await {
Ok(file) => file,
Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
};
) -> anyhow::Result<Self> {
let file = VirtualFile::open(path)
.await
.with_context(|| format!("Failed to open file '{}'", path))?;
let file = FileBlockReader::new(file);
let summary_blk = match file.read_blk(0, ctx).await {
Ok(blk) => blk,
Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
};
// length is the only way how this could fail, so it's not actually likely at all unless
// read_blk returns wrong sized block.
//
// TODO: confirm and make this into assertion
let actual_summary =
Summary::des_prefix(summary_blk.as_ref()).context("deserialize first block")?;
let summary_blk = file.read_blk(0, ctx).await?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
if let Some(mut expected_summary) = summary {
// production code path
@@ -337,12 +322,12 @@ impl ImageLayerInner {
}
}
Ok(Ok(ImageLayerInner {
Ok(ImageLayerInner {
index_start_blk: actual_summary.index_start_blk,
index_root_blk: actual_summary.index_root_blk,
lsn,
file,
}))
})
}
pub(super) async fn get_value_reconstruct_data(

View File

@@ -3,7 +3,6 @@ use camino::{Utf8Path, Utf8PathBuf};
use pageserver_api::models::{
HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
};
use pageserver_api::shard::ShardIndex;
use std::ops::Range;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::{Arc, Weak};
@@ -97,7 +96,6 @@ impl Layer {
desc,
None,
metadata.generation,
metadata.shard,
)));
debug_assert!(owner.0.needs_download_blocking().unwrap().is_some());
@@ -138,7 +136,6 @@ impl Layer {
desc,
Some(inner),
metadata.generation,
metadata.shard,
)
}));
@@ -182,7 +179,6 @@ impl Layer {
desc,
Some(inner),
timeline.generation,
timeline.get_shard_index(),
)
}));
@@ -430,15 +426,6 @@ struct LayerInner {
/// For loaded layers (resident or evicted) this comes from [`LayerFileMetadata::generation`],
/// for created layers from [`Timeline::generation`].
generation: Generation,
/// The shard of this Layer.
///
/// For layers created in this process, this will always be the [`ShardIndex`] of the
/// current `ShardIdentity`` (TODO: add link once it's introduced).
///
/// For loaded layers, this may be some other value if the tenant has undergone
/// a shard split since the layer was originally written.
shard: ShardIndex,
}
impl std::fmt::Display for LayerInner {
@@ -472,9 +459,9 @@ impl Drop for LayerInner {
let path = std::mem::take(&mut self.path);
let file_name = self.layer_desc().filename();
let gen = self.generation;
let file_size = self.layer_desc().file_size;
let timeline = self.timeline.clone();
let meta = self.metadata();
crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
let _g = span.entered();
@@ -502,7 +489,7 @@ impl Drop for LayerInner {
timeline.metrics.resident_physical_size_sub(file_size);
}
if let Some(remote_client) = timeline.remote_client.as_ref() {
let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]);
let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, gen)]);
if let Err(e) = res {
// test_timeline_deletion_with_files_stuck_in_upload_queue is good at
@@ -536,7 +523,6 @@ impl LayerInner {
desc: PersistentLayerDesc,
downloaded: Option<Arc<DownloadedLayer>>,
generation: Generation,
shard: ShardIndex,
) -> Self {
let path = conf
.timeline_path(&timeline.tenant_id, &timeline.timeline_id)
@@ -564,7 +550,6 @@ impl LayerInner {
status: tokio::sync::broadcast::channel(1).0,
consecutive_failures: AtomicUsize::new(0),
generation,
shard,
}
}
@@ -883,9 +868,6 @@ impl LayerInner {
}
Ok((Err(e), _permit)) => {
// FIXME: this should be with the spawned task and be cancellation sensitive
//
// while we should not need this, this backoff has turned out to be useful with
// a bug of unexpectedly deleted remote layer file (#5787).
let consecutive_failures =
self.consecutive_failures.fetch_add(1, Ordering::Relaxed);
tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
@@ -1092,7 +1074,7 @@ impl LayerInner {
}
fn metadata(&self) -> LayerFileMetadata {
LayerFileMetadata::new(self.desc.file_size, self.generation, self.shard)
LayerFileMetadata::new(self.desc.file_size, self.generation)
}
}
@@ -1214,7 +1196,7 @@ impl DownloadedLayer {
));
delta_layer::DeltaLayerInner::load(&owner.path, summary, ctx)
.await
.map(|res| res.map(LayerKind::Delta))
.map(LayerKind::Delta)
} else {
let lsn = owner.desc.image_layer_lsn();
let summary = Some(image_layer::Summary::expected(
@@ -1225,32 +1207,23 @@ impl DownloadedLayer {
));
image_layer::ImageLayerInner::load(&owner.path, lsn, summary, ctx)
.await
.map(|res| res.map(LayerKind::Image))
};
match res {
Ok(Ok(layer)) => Ok(Ok(layer)),
Ok(Err(transient)) => Err(transient),
Err(permanent) => {
LAYER_IMPL_METRICS.inc_permanent_loading_failures();
// TODO(#5815): we are not logging all errors, so temporarily log them **once**
// here as well
let permanent = permanent.context("load layer");
tracing::error!("layer loading failed permanently: {permanent:#}");
Ok(Err(permanent))
}
.map(LayerKind::Image)
}
// this will be a permanent failure
.context("load layer");
if let Err(e) = res.as_ref() {
LAYER_IMPL_METRICS.inc_permanent_loading_failures();
// TODO(#5815): we are not logging all errors, so temporarily log them here as well
tracing::error!("layer loading failed permanently: {e:#}");
}
res
};
self.kind
.get_or_try_init(init)
// return transient errors using `?`
.await?
.as_ref()
.map_err(|e| {
// errors are not clonabled, cannot but stringify
// test_broken_timeline matches this string
anyhow::anyhow!("layer loading failed: {e:#}")
})
self.kind.get_or_init(init).await.as_ref().map_err(|e| {
// errors are not clonabled, cannot but stringify
// test_broken_timeline matches this string
anyhow::anyhow!("layer loading failed: {e:#}")
})
}
async fn get_value_reconstruct_data(

View File

@@ -180,16 +180,16 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
// Run compaction
if let Err(e) = tenant.compaction_iteration(&cancel, &ctx).await {
let wait_duration = backoff::exponential_backoff_duration_seconds(
error_run_count + 1,
error_run_count,
1.0,
MAX_BACKOFF_SECS,
);
error_run_count += 1;
let wait_duration = Duration::from_secs_f64(wait_duration);
error!(
"Compaction failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
"Compaction failed {error_run_count} times, retrying in {:?}: {e:?}",
wait_duration
);
wait_duration
Duration::from_secs_f64(wait_duration)
} else {
error_run_count = 0;
period
@@ -198,10 +198,6 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction);
// Perhaps we did no work and the walredo process has been idle for some time:
// give it a chance to shut down to avoid leaving walredo process running indefinitely.
tenant.walredo_mgr.maybe_quiesce(period * 10);
// Sleep
if tokio::time::timeout(sleep_duration, cancel.cancelled())
.await
@@ -261,20 +257,20 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
} else {
// Run gc
let res = tenant
.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx)
.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx)
.await;
if let Err(e) = res {
let wait_duration = backoff::exponential_backoff_duration_seconds(
error_run_count + 1,
error_run_count,
1.0,
MAX_BACKOFF_SECS,
);
error_run_count += 1;
let wait_duration = Duration::from_secs_f64(wait_duration);
error!(
"Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
"Gc failed {error_run_count} times, retrying in {:?}: {e:?}",
wait_duration
);
wait_duration
Duration::from_secs_f64(wait_duration)
} else {
error_run_count = 0;
period

View File

@@ -10,7 +10,6 @@ mod walreceiver;
use anyhow::{anyhow, bail, ensure, Context, Result};
use bytes::Bytes;
use camino::{Utf8Path, Utf8PathBuf};
use enumset::EnumSet;
use fail::fail_point;
use itertools::Itertools;
use pageserver_api::models::{
@@ -62,7 +61,6 @@ use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError};
use crate::tenant::config::{EvictionPolicy, TenantConfOpt};
use pageserver_api::reltag::RelTag;
use pageserver_api::shard::ShardIndex;
use postgres_connection::PgConnectionConfig;
use postgres_ffi::to_pg_timestamp;
@@ -439,11 +437,6 @@ pub enum LogicalSizeCalculationCause {
TenantSizeHandler,
}
#[derive(enumset::EnumSetType)]
pub(crate) enum CompactFlags {
ForceRepartition,
}
/// Public interface functions
impl Timeline {
/// Get the LSN where this branch was created
@@ -701,7 +694,6 @@ impl Timeline {
pub(crate) async fn compact(
self: &Arc<Self>,
cancel: &CancellationToken,
flags: EnumSet<CompactFlags>,
ctx: &RequestContext,
) -> Result<(), CompactionError> {
// this wait probably never needs any "long time spent" logging, because we already nag if
@@ -774,7 +766,6 @@ impl Timeline {
.repartition(
self.get_last_record_lsn(),
self.get_compaction_target_size(),
flags,
ctx,
)
.await
@@ -1598,7 +1589,6 @@ impl Timeline {
// Copy to move into the task we're about to spawn
let generation = self.generation;
let shard = self.get_shard_index();
let this = self.myself.upgrade().expect("&self method holds the arc");
let (loaded_layers, needs_cleanup, total_physical_size) = tokio::task::spawn_blocking({
@@ -1647,7 +1637,6 @@ impl Timeline {
index_part.as_ref(),
disk_consistent_lsn,
generation,
shard,
);
let mut loaded_layers = Vec::new();
@@ -1720,34 +1709,7 @@ impl Timeline {
guard.initialize_local_layers(loaded_layers, disk_consistent_lsn + 1);
if let Some(rtc) = self.remote_client.as_ref() {
rtc.schedule_layer_file_deletion(&needs_cleanup)?;
rtc.schedule_index_upload_for_file_changes()?;
// This barrier orders above DELETEs before any later operations.
// This is critical because code executing after the barrier might
// create again objects with the same key that we just scheduled for deletion.
// For example, if we just scheduled deletion of an image layer "from the future",
// later compaction might run again and re-create the same image layer.
// "from the future" here means an image layer whose LSN is > IndexPart::disk_consistent_lsn.
// "same" here means same key range and LSN.
//
// Without a barrier between above DELETEs and the re-creation's PUTs,
// the upload queue may execute the PUT first, then the DELETE.
// In our example, we will end up with an IndexPart referencing a non-existent object.
//
// 1. a future image layer is created and uploaded
// 2. ps restart
// 3. the future layer from (1) is deleted during load layer map
// 4. image layer is re-created and uploaded
// 5. deletion queue would like to delete (1) but actually deletes (4)
// 6. delete by name works as expected, but it now deletes the wrong (later) version
//
// See https://github.com/neondatabase/neon/issues/5878
//
// NB: generation numbers naturally protect against this because they disambiguate
// (1) and (4)
rtc.schedule_barrier()?;
// Tenant::create_timeline will wait for these uploads to happen before returning, or
// on retry.
rtc.flushing_delete_layers(&needs_cleanup).await?;
}
info!(
@@ -2560,12 +2522,7 @@ impl Timeline {
// Note: The 'ctx' in use here has DownloadBehavior::Error. We should not
// require downloading anything during initial import.
let (partitioning, _lsn) = self
.repartition(
self.initdb_lsn,
self.get_compaction_target_size(),
EnumSet::empty(),
ctx,
)
.repartition(self.initdb_lsn, self.get_compaction_target_size(), ctx)
.await?;
if self.cancel.is_cancelled() {
@@ -2603,8 +2560,6 @@ impl Timeline {
)
};
pausable_failpoint!("flush-layer-cancel-after-writing-layer-out-pausable");
if self.cancel.is_cancelled() {
return Err(FlushLayerError::Cancelled);
}
@@ -2786,16 +2741,12 @@ impl Timeline {
&self,
lsn: Lsn,
partition_size: u64,
flags: EnumSet<CompactFlags>,
ctx: &RequestContext,
) -> anyhow::Result<(KeyPartitioning, Lsn)> {
{
let partitioning_guard = self.partitioning.lock().unwrap();
let distance = lsn.0 - partitioning_guard.1 .0;
if partitioning_guard.1 != Lsn(0)
&& distance <= self.repartition_threshold
&& !flags.contains(CompactFlags::ForceRepartition)
{
if partitioning_guard.1 != Lsn(0) && distance <= self.repartition_threshold {
debug!(
distance,
threshold = self.repartition_threshold,
@@ -3543,22 +3494,21 @@ impl Timeline {
}
// FIXME: the writer already fsyncs all data, only rename needs to be fsynced here
let layer_paths: Vec<Utf8PathBuf> = new_layers
let mut layer_paths: Vec<Utf8PathBuf> = new_layers
.iter()
.map(|l| l.local_path().to_owned())
.collect();
// Fsync all the layer files and directory using multiple threads to
// minimize latency.
par_fsync::par_fsync_async(&layer_paths)
.await
.context("fsync all new layers")?;
//
// FIXME: spawn_blocking above for this
par_fsync::par_fsync(&layer_paths).context("fsync all new layers")?;
let timeline_dir = self.conf.timeline_path(&self.tenant_id, &self.timeline_id);
par_fsync::par_fsync_async(&[timeline_dir])
.await
par_fsync::par_fsync(&[self.conf.timeline_path(&self.tenant_id, &self.timeline_id)])
.context("fsync of timeline dir")?;
layer_paths.pop().unwrap();
}
stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now();
@@ -3731,7 +3681,6 @@ impl Timeline {
retain_lsns: Vec<Lsn>,
cutoff_horizon: Lsn,
pitr: Duration,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<()> {
// First, calculate pitr_cutoff_timestamp and then convert it to LSN.
@@ -3745,10 +3694,7 @@ impl Timeline {
if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) {
let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp);
match self
.find_lsn_for_timestamp(pitr_timestamp, cancel, ctx)
.await?
{
match self.find_lsn_for_timestamp(pitr_timestamp, ctx).await? {
LsnForTimestamp::Present(lsn) => lsn,
LsnForTimestamp::Future(lsn) => {
// The timestamp is in the future. That sounds impossible,
@@ -4367,11 +4313,6 @@ impl Timeline {
resident_layers,
}
}
pub(crate) fn get_shard_index(&self) -> ShardIndex {
// TODO: carry this on the struct
ShardIndex::unsharded()
}
}
type TraversalPathItem = (

View File

@@ -110,6 +110,35 @@ async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTi
Ok(())
}
// We delete local files first, so if pageserver restarts after local files deletion then remote deletion is not continued.
// This can be solved with inversion of these steps. But even if these steps are inverted then, when index_part.json
// gets deleted there is no way to distinguish between "this timeline is good, we just didnt upload it to remote"
// and "this timeline is deleted we should continue with removal of local state". So to avoid the ambiguity we use a mark file.
// After index part is deleted presence of this mark file indentifies that it was a deletion intention.
// So we can just remove the mark file.
async fn create_delete_mark(
conf: &PageServerConf,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<(), DeleteTimelineError> {
fail::fail_point!("timeline-delete-before-delete-mark", |_| {
Err(anyhow::anyhow!(
"failpoint: timeline-delete-before-delete-mark"
))?
});
let marker_path = conf.timeline_delete_mark_file_path(tenant_id, timeline_id);
// Note: we're ok to replace existing file.
let _ = std::fs::OpenOptions::new()
.write(true)
.create(true)
.open(&marker_path)
.with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?;
Ok(())
}
/// Grab the layer_removal_cs lock, and actually perform the deletion.
///
/// This lock prevents prevents GC or compaction from running at the same time.
@@ -282,8 +311,6 @@ async fn cleanup_remaining_timeline_fs_traces(
.context("fsync_pre_mark_remove")?;
// Remove delete mark
// TODO: once we are confident that no more exist in the field, remove this
// line. It cleans up a legacy marker file that might in rare cases be present.
tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id))
.await
.or_else(fs_ext::ignore_not_found)
@@ -364,6 +391,8 @@ impl DeleteTimelineFlow {
set_deleted_in_remote_index(&timeline).await?;
create_delete_mark(tenant.conf, timeline.tenant_id, timeline.timeline_id).await?;
fail::fail_point!("timeline-delete-before-schedule", |_| {
Err(anyhow::anyhow!(
"failpoint: timeline-delete-before-schedule"
@@ -435,6 +464,10 @@ impl DeleteTimelineFlow {
guard.mark_in_progress()?;
// Note that delete mark can be missing on resume
// because we create delete mark after we set deleted_at in the index part.
create_delete_mark(tenant.conf, tenant.tenant_id, timeline_id).await?;
Self::schedule_background(guard, tenant.conf, tenant, timeline);
Ok(())

View File

@@ -351,7 +351,7 @@ impl Timeline {
match state.last_layer_access_imitation {
Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
_ => {
self.imitate_synthetic_size_calculation_worker(&tenant, cancel, ctx)
self.imitate_synthetic_size_calculation_worker(&tenant, ctx, cancel)
.await;
state.last_layer_access_imitation = Some(tokio::time::Instant::now());
}
@@ -417,8 +417,8 @@ impl Timeline {
async fn imitate_synthetic_size_calculation_worker(
&self,
tenant: &Arc<Tenant>,
cancel: &CancellationToken,
ctx: &RequestContext,
cancel: &CancellationToken,
) {
if self.conf.metric_collection_endpoint.is_none() {
// We don't start the consumption metrics task if this is not set in the config.
@@ -457,7 +457,6 @@ impl Timeline {
None,
&mut throwaway_cache,
LogicalSizeCalculationCause::EvictionTaskImitation,
cancel,
ctx,
)
.instrument(info_span!("gather_inputs"));

View File

@@ -13,7 +13,6 @@ use crate::{
};
use anyhow::Context;
use camino::Utf8Path;
use pageserver_api::shard::ShardIndex;
use std::{collections::HashMap, str::FromStr};
use utils::lsn::Lsn;
@@ -108,7 +107,6 @@ pub(super) fn reconcile(
index_part: Option<&IndexPart>,
disk_consistent_lsn: Lsn,
generation: Generation,
shard: ShardIndex,
) -> Vec<(LayerFileName, Result<Decision, DismissedLayer>)> {
use Decision::*;
@@ -120,13 +118,10 @@ pub(super) fn reconcile(
.map(|(name, file_size)| {
(
name,
// The generation and shard here will be corrected to match IndexPart in the merge below, unless
// The generation here will be corrected to match IndexPart in the merge below, unless
// it is not in IndexPart, in which case using our current generation makes sense
// because it will be uploaded in this generation.
(
Some(LayerFileMetadata::new(file_size, generation, shard)),
None,
),
(Some(LayerFileMetadata::new(file_size, generation)), None),
)
})
.collect::<Collected>();

View File

@@ -45,20 +45,12 @@ impl<'t> UninitializedTimeline<'t> {
let timeline_id = self.timeline_id;
let tenant_id = self.owning_tenant.tenant_id;
if self.raw_timeline.is_none() {
return Err(anyhow::anyhow!(
"No timeline for initialization found for {tenant_id}/{timeline_id}"
));
}
let (new_timeline, uninit_mark) = self.raw_timeline.take().with_context(|| {
format!("No timeline for initalization found for {tenant_id}/{timeline_id}")
})?;
// Check that the caller initialized disk_consistent_lsn
let new_disk_consistent_lsn = self
.raw_timeline
.as_ref()
.expect("checked above")
.0
.get_disk_consistent_lsn();
let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn();
anyhow::ensure!(
new_disk_consistent_lsn.is_valid(),
"new timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn"
@@ -70,13 +62,6 @@ impl<'t> UninitializedTimeline<'t> {
"Found freshly initialized timeline {tenant_id}/{timeline_id} in the tenant map"
),
Entry::Vacant(v) => {
// after taking here should be no fallible operations, because the drop guard will not
// cleanup after and would block for example the tenant deletion
let (new_timeline, uninit_mark) =
self.raw_timeline.take().expect("already checked");
// this is the mutual exclusion between different retries to create the timeline;
// this should be an assertion.
uninit_mark.remove_uninit_mark().with_context(|| {
format!(
"Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}"
@@ -85,10 +70,10 @@ impl<'t> UninitializedTimeline<'t> {
v.insert(Arc::clone(&new_timeline));
new_timeline.maybe_spawn_flush_loop();
Ok(new_timeline)
}
}
Ok(new_timeline)
}
/// Prepares timeline data by loading it from the basebackup archive.

View File

@@ -1,5 +1,6 @@
use super::storage_layer::LayerFileName;
use super::storage_layer::ResidentLayer;
use super::Generation;
use crate::tenant::metadata::TimelineMetadata;
use crate::tenant::remote_timeline_client::index::IndexPart;
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
@@ -14,9 +15,6 @@ use utils::lsn::AtomicLsn;
use std::sync::atomic::AtomicU32;
use utils::lsn::Lsn;
#[cfg(feature = "testing")]
use utils::generation::Generation;
// clippy warns that Uninitialized is much smaller than Initialized, which wastes
// memory for Uninitialized variants. Doesn't matter in practice, there are not
// that many upload queues in a running pageserver, and most of them are initialized
@@ -234,7 +232,7 @@ pub(crate) struct UploadTask {
/// for timeline deletion, which skips this queue and goes directly to DeletionQueue.
#[derive(Debug)]
pub(crate) struct Delete {
pub(crate) layers: Vec<(LayerFileName, LayerFileMetadata)>,
pub(crate) layers: Vec<(LayerFileName, Generation)>,
}
#[derive(Debug)]

View File

@@ -91,7 +91,6 @@ struct ProcessOutput {
pub struct PostgresRedoManager {
tenant_id: TenantId,
conf: &'static PageServerConf,
last_redo_at: std::sync::Mutex<Option<Instant>>,
redo_process: RwLock<Option<Arc<WalRedoProcess>>>,
}
@@ -188,26 +187,10 @@ impl PostgresRedoManager {
PostgresRedoManager {
tenant_id,
conf,
last_redo_at: std::sync::Mutex::default(),
redo_process: RwLock::new(None),
}
}
/// This type doesn't have its own background task to check for idleness: we
/// rely on our owner calling this function periodically in its own housekeeping
/// loops.
pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) {
if let Ok(g) = self.last_redo_at.try_lock() {
if let Some(last_redo_at) = *g {
if last_redo_at.elapsed() >= idle_timeout {
drop(g);
let mut guard = self.redo_process.write().unwrap();
*guard = None;
}
}
}
}
///
/// Process one request for WAL redo using wal-redo postgres
///
@@ -222,8 +205,6 @@ impl PostgresRedoManager {
wal_redo_timeout: Duration,
pg_version: u32,
) -> anyhow::Result<Bytes> {
*(self.last_redo_at.lock().unwrap()) = Some(Instant::now());
let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
const MAX_RETRY_ATTEMPTS: u32 = 1;
let mut n_attempts = 0u32;
@@ -367,13 +348,12 @@ impl PostgresRedoManager {
self.apply_record_neon(key, &mut page, *record_lsn, record)?;
}
// Success!
let duration = start_time.elapsed();
// FIXME: using the same metric here creates a bimodal distribution by default, and because
// there could be multiple batch sizes this would be N+1 modal.
let end_time = Instant::now();
let duration = end_time.duration_since(start_time);
WAL_REDO_TIME.observe(duration.as_secs_f64());
debug!(
"neon applied {} WAL records in {} us to reconstruct page image at LSN {}",
"neon applied {} WAL records in {} ms to reconstruct page image at LSN {}",
records.len(),
duration.as_micros(),
lsn

View File

@@ -20,7 +20,7 @@ SHLIB_LINK_INTERNAL = $(libpq)
SHLIB_LINK = -lcurl
EXTENSION = neon
DATA = neon--1.0.sql neon--1.0--1.1.sql
DATA = neon--1.0.sql
PGFILEDESC = "neon - cloud storage for PostgreSQL"
EXTRA_CLEAN = \

View File

@@ -1,20 +0,0 @@
neon extension consists of several parts:
### shared preload library `neon.so`
- implements storage manager API and network communications with remote page server.
- walproposer: implements broadcast protocol between postgres and WAL safekeepers.
- control plane connector: Captures updates to roles/databases using ProcessUtility_hook and sends them to the control ProcessUtility_hook.
- remote extension server: Request compute_ctl to download extension files.
- file_cache: Local file cache is used to temporary store relations pages in local file system for better performance.
- relsize_cache: Relation size cache for better neon performance.
### SQL functions in `neon--*.sql`
Utility functions to expose neon specific information to user and metrics collection.
This extension is created in all databases in the cluster by default.

View File

@@ -475,12 +475,6 @@ NeonXactCallback(XactEvent event, void *arg)
Assert(CurrentDdlTable == &RootTable);
}
static bool
RoleIsNeonSuperuser(const char *role_name)
{
return strcmp(role_name, "neon_superuser") == 0;
}
static void
HandleCreateDb(CreatedbStmt *stmt)
{
@@ -507,16 +501,9 @@ HandleCreateDb(CreatedbStmt *stmt)
entry->type = Op_Set;
if (downer && downer->arg)
{
const char *owner_name = defGetString(downer);
if (RoleIsNeonSuperuser(owner_name))
elog(ERROR, "can't create a database with owner neon_superuser");
entry->owner = get_role_oid(owner_name, false);
}
entry->owner = get_role_oid(defGetString(downer), false);
else
{
entry->owner = GetUserId();
}
}
static void
@@ -535,10 +522,8 @@ HandleAlterOwner(AlterOwnerStmt *stmt)
if (!found)
memset(entry->old_name, 0, sizeof(entry->old_name));
const char *new_owner = get_rolespec_name(stmt->newowner);
if (RoleIsNeonSuperuser(new_owner))
elog(ERROR, "can't alter owner to neon_superuser");
entry->owner = get_role_oid(new_owner, false);
entry->owner = get_role_oid(get_rolespec_name(stmt->newowner), false);
entry->type = Op_Set;
}
@@ -632,9 +617,6 @@ HandleAlterRole(AlterRoleStmt *stmt)
InitRoleTableIfNeeded();
DefElem *dpass = NULL;
ListCell *option;
const char *role_name = stmt->role->rolename;
if (RoleIsNeonSuperuser(role_name))
elog(ERROR, "can't ALTER neon_superuser");
foreach(option, stmt->options)
{
@@ -649,7 +631,7 @@ HandleAlterRole(AlterRoleStmt *stmt)
bool found = false;
RoleEntry *entry = hash_search(
CurrentDdlTable->role_table,
role_name,
stmt->role->rolename,
HASH_ENTER,
&found);

View File

@@ -32,13 +32,11 @@
#include "storage/latch.h"
#include "storage/ipc.h"
#include "storage/lwlock.h"
#include "utils/builtins.h"
#include "utils/dynahash.h"
#include "utils/guc.h"
#include "storage/fd.h"
#include "storage/pg_shmem.h"
#include "storage/buf_internals.h"
#include "pgstat.h"
/*
* Local file cache is used to temporary store relations pages in local file system.
@@ -67,7 +65,6 @@
typedef struct FileCacheEntry
{
BufferTag key;
uint32 hash;
uint32 offset;
uint32 access_count;
uint32 bitmap[BLOCKS_PER_CHUNK/32];
@@ -79,10 +76,6 @@ typedef struct FileCacheControl
uint64 generation; /* generation is needed to handle correct hash reenabling */
uint32 size; /* size of cache file in chunks */
uint32 used; /* number of used chunks */
uint32 limit; /* shared copy of lfc_size_limit */
uint64 hits;
uint64 misses;
uint64 writes;
dlist_head lru; /* double linked list for LRU replacement algorithm */
} FileCacheControl;
@@ -98,12 +91,10 @@ static shmem_startup_hook_type prev_shmem_startup_hook;
static shmem_request_hook_type prev_shmem_request_hook;
#endif
#define LFC_ENABLED() (lfc_ctl->limit != 0)
void PGDLLEXPORT FileCacheMonitorMain(Datum main_arg);
void FileCacheMonitorMain(Datum main_arg);
/*
* Local file cache is optional and Neon can work without it.
* Local file cache is mandatory and Neon can work without it.
* In case of any any errors with this cache, we should disable it but to not throw error.
* Also we should allow re-enable it if source of failure (lack of disk space, permissions,...) is fixed.
* All cache content should be invalidated to avoid reading of stale or corrupted data
@@ -111,77 +102,49 @@ void PGDLLEXPORT FileCacheMonitorMain(Datum main_arg);
static void
lfc_disable(char const* op)
{
int fd;
HASH_SEQ_STATUS status;
FileCacheEntry* entry;
elog(WARNING, "Failed to %s local file cache at %s: %m, disabling local file cache", op, lfc_path);
/* Invalidate hash */
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
if (LFC_ENABLED())
{
HASH_SEQ_STATUS status;
FileCacheEntry* entry;
hash_seq_init(&status, lfc_hash);
while ((entry = hash_seq_search(&status)) != NULL)
{
hash_search_with_hash_value(lfc_hash, &entry->key, entry->hash, HASH_REMOVE, NULL);
}
lfc_ctl->generation += 1;
lfc_ctl->size = 0;
lfc_ctl->used = 0;
lfc_ctl->limit = 0;
dlist_init(&lfc_ctl->lru);
if (lfc_desc > 0)
{
/* If the reason of error is ENOSPC, then truncation of file may help to reclaim some space */
int rc = ftruncate(lfc_desc, 0);
if (rc < 0)
elog(WARNING, "Failed to truncate local file cache %s: %m", lfc_path);
}
}
/* We need to use unlink to to avoid races in LFC write, because it is not protectedby */
unlink(lfc_path);
fd = BasicOpenFile(lfc_path, O_RDWR|O_CREAT|O_TRUNC);
if (fd < 0)
elog(WARNING, "Failed to recreate local file cache %s: %m", lfc_path);
else
close(fd);
LWLockRelease(lfc_lock);
if (lfc_desc > 0)
close(lfc_desc);
lfc_desc = -1;
}
lfc_size_limit = 0;
/*
* This check is done without obtaining lfc_lock, so it is unreliable
*/
static bool
lfc_maybe_disabled(void)
{
return !lfc_ctl || !LFC_ENABLED();
/* Invalidate hash */
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
hash_seq_init(&status, lfc_hash);
while ((entry = hash_seq_search(&status)) != NULL)
{
hash_search(lfc_hash, &entry->key, HASH_REMOVE, NULL);
memset(entry->bitmap, 0, sizeof entry->bitmap);
}
hash_seq_term(&status);
lfc_ctl->generation += 1;
lfc_ctl->size = 0;
lfc_ctl->used = 0;
dlist_init(&lfc_ctl->lru);
LWLockRelease(lfc_lock);
}
static bool
lfc_ensure_opened(void)
{
bool enabled = !lfc_maybe_disabled();
/* Open cache file if not done yet */
if (lfc_desc <= 0 && enabled)
if (lfc_desc <= 0)
{
lfc_desc = BasicOpenFile(lfc_path, O_RDWR);
lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
if (lfc_desc < 0) {
lfc_disable("open");
return false;
}
}
return enabled;
return true;
}
static void
@@ -200,7 +163,6 @@ lfc_shmem_startup(void)
lfc_ctl = (FileCacheControl*)ShmemInitStruct("lfc", sizeof(FileCacheControl), &found);
if (!found)
{
int fd;
uint32 lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size);
lfc_lock = (LWLockId)GetNamedLWLockTranche("lfc_lock");
info.keysize = sizeof(BufferTag);
@@ -213,23 +175,10 @@ lfc_shmem_startup(void)
lfc_ctl->generation = 0;
lfc_ctl->size = 0;
lfc_ctl->used = 0;
lfc_ctl->hits = 0;
lfc_ctl->misses = 0;
lfc_ctl->writes = 0;
dlist_init(&lfc_ctl->lru);
/* Recreate file cache on restart */
fd = BasicOpenFile(lfc_path, O_RDWR|O_CREAT|O_TRUNC);
if (fd < 0)
{
elog(WARNING, "Failed to create local file cache %s: %m", lfc_path);
lfc_ctl->limit = 0;
}
else
{
close(fd);
lfc_ctl->limit = SIZE_MB_TO_CHUNKS(lfc_size_limit);
}
/* Remove file cache on restart */
(void)unlink(lfc_path);
}
LWLockRelease(AddinShmemInitLock);
}
@@ -246,17 +195,6 @@ lfc_shmem_request(void)
RequestNamedLWLockTranche("lfc_lock", 1);
}
static bool
is_normal_backend(void)
{
/*
* Stats collector detach shared memory, so we should not try to access shared memory here.
* Parallel workers first assign default value (0), so not perform truncation in parallel workers.
* The Postmaster can handle SIGHUP and it has access to shared memory (UsedShmemSegAddr != NULL), but has no PGPROC.
*/
return lfc_ctl && MyProc && UsedShmemSegAddr && !IsParallelWorker();
}
static bool
lfc_check_limit_hook(int *newval, void **extra, GucSource source)
{
@@ -272,15 +210,25 @@ static void
lfc_change_limit_hook(int newval, void *extra)
{
uint32 new_size = SIZE_MB_TO_CHUNKS(newval);
if (!is_normal_backend())
return;
if (!lfc_ensure_opened())
/*
* Stats collector detach shared memory, so we should not try to access shared memory here.
* Parallel workers first assign default value (0), so not perform truncation in parallel workers.
* The Postmaster can handle SIGHUP and it has access to shared memory (UsedShmemSegAddr != NULL), but has no PGPROC.
*/
if (!lfc_ctl || !MyProc || !UsedShmemSegAddr || IsParallelWorker())
return;
/* Open cache file if not done yet */
if (lfc_desc <= 0)
{
lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
if (lfc_desc < 0) {
elog(WARNING, "Failed to open file cache %s: %m, disabling file cache", lfc_path);
lfc_size_limit = 0; /* disable file cache */
return;
}
}
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
while (new_size < lfc_ctl->used && !dlist_is_empty(&lfc_ctl->lru))
{
/* Shrink cache by throwing away least recently accessed chunks and returning their space to file system */
@@ -290,12 +238,10 @@ lfc_change_limit_hook(int newval, void *extra)
if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, (off_t)victim->offset*BLOCKS_PER_CHUNK*BLCKSZ, BLOCKS_PER_CHUNK*BLCKSZ) < 0)
elog(LOG, "Failed to punch hole in file: %m");
#endif
hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL);
lfc_ctl->used -= 1;
}
lfc_ctl->limit = new_size;
elog(DEBUG1, "set local file cache limit to %d", new_size);
LWLockRelease(lfc_lock);
}
@@ -309,7 +255,6 @@ lfc_init(void)
if (!process_shared_preload_libraries_in_progress)
elog(ERROR, "Neon module should be loaded via shared_preload_libraries");
DefineCustomIntVariable("neon.max_file_cache_size",
"Maximal size of Neon local file cache",
NULL,
@@ -370,10 +315,10 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
BufferTag tag;
FileCacheEntry* entry;
int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
bool found = false;
bool found;
uint32 hash;
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
return false;
CopyNRelFileInfoToBufTag(tag, rinfo);
@@ -382,11 +327,8 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
hash = get_hash_value(lfc_hash, &tag);
LWLockAcquire(lfc_lock, LW_SHARED);
if (LFC_ENABLED())
{
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
found = entry != NULL && (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) != 0;
}
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
found = entry != NULL && (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) != 0;
LWLockRelease(lfc_lock);
return found;
}
@@ -403,7 +345,7 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
uint32 hash;
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
return;
CopyNRelFileInfoToBufTag(tag, rinfo);
@@ -413,13 +355,6 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
hash = get_hash_value(lfc_hash, &tag);
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
if (!LFC_ENABLED())
{
LWLockRelease(lfc_lock);
return;
}
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, &found);
if (!found)
@@ -470,7 +405,7 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
/*
* Try to read page from local cache.
* Returns true if page is found in local cache.
* In case of error local file cache is disabled (lfc->limit is set to zero).
* In case of error lfc_size_limit is set to zero to disable any further opera-tins with cache.
*/
bool
lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
@@ -485,7 +420,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
uint64 generation;
uint32 entry_offset;
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
return false;
if (!lfc_ensure_opened())
@@ -497,18 +432,10 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
hash = get_hash_value(lfc_hash, &tag);
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
if (!LFC_ENABLED())
{
LWLockRelease(lfc_lock);
return false;
}
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0)
{
/* Page is not cached */
lfc_ctl->misses += 1;
LWLockRelease(lfc_lock);
return false;
}
@@ -529,11 +456,8 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
/* Place entry to the head of LRU list */
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
if (lfc_ctl->generation == generation)
{
Assert(LFC_ENABLED());
lfc_ctl->hits += 1;
Assert(entry->access_count > 0);
if (--entry->access_count == 0)
dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);
@@ -564,10 +488,8 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
bool found;
int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
uint32 hash;
uint64 generation;
uint32 entry_offset;
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
return;
if (!lfc_ensure_opened())
@@ -575,17 +497,12 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
tag.forkNum = forkNum;
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
CopyNRelFileInfoToBufTag(tag, rinfo);
hash = get_hash_value(lfc_hash, &tag);
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
if (!LFC_ENABLED())
{
LWLockRelease(lfc_lock);
return;
}
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found);
if (found)
@@ -604,13 +521,13 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
* there are should be very large number of concurrent IO operations and them are limited by max_connections,
* we prefer not to complicate code and use second approach.
*/
if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru))
if (lfc_ctl->used >= SIZE_MB_TO_CHUNKS(lfc_size_limit) && !dlist_is_empty(&lfc_ctl->lru))
{
/* Cache overflow: evict least recently used chunk */
FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
Assert(victim->access_count == 0);
entry->offset = victim->offset; /* grab victim's chunk */
hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL);
elog(DEBUG2, "Swap file cache page");
}
else
@@ -619,140 +536,27 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
entry->offset = lfc_ctl->size++; /* allocate new chunk at end of file */
}
entry->access_count = 1;
entry->hash = hash;
memset(entry->bitmap, 0, sizeof entry->bitmap);
}
generation = lfc_ctl->generation;
entry_offset = entry->offset;
lfc_ctl->writes += 1;
LWLockRelease(lfc_lock);
rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t)entry_offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
if (rc != BLCKSZ)
{
LWLockRelease(lfc_lock);
lfc_disable("write");
}
else
{
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
if (lfc_ctl->generation == generation)
{
Assert(LFC_ENABLED());
/* Place entry to the head of LRU list */
Assert(entry->access_count > 0);
if (--entry->access_count == 0)
dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);
entry->bitmap[chunk_offs >> 5] |= (1 << (chunk_offs & 31));
}
/* Place entry to the head of LRU list */
Assert(entry->access_count > 0);
if (--entry->access_count == 0)
dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);
entry->bitmap[chunk_offs >> 5] |= (1 << (chunk_offs & 31));
LWLockRelease(lfc_lock);
}
}
typedef struct
{
TupleDesc tupdesc;
} NeonGetStatsCtx;
#define NUM_NEON_GET_STATS_COLS 2
#define NUM_NEON_GET_STATS_ROWS 3
PG_FUNCTION_INFO_V1(neon_get_lfc_stats);
Datum
neon_get_lfc_stats(PG_FUNCTION_ARGS)
{
FuncCallContext *funcctx;
NeonGetStatsCtx* fctx;
MemoryContext oldcontext;
TupleDesc tupledesc;
Datum result;
HeapTuple tuple;
char const* key;
uint64 value;
Datum values[NUM_NEON_GET_STATS_COLS];
bool nulls[NUM_NEON_GET_STATS_COLS];
if (SRF_IS_FIRSTCALL())
{
funcctx = SRF_FIRSTCALL_INIT();
/* Switch context when allocating stuff to be used in later calls */
oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
/* Create a user function context for cross-call persistence */
fctx = (NeonGetStatsCtx*) palloc(sizeof(NeonGetStatsCtx));
/* Construct a tuple descriptor for the result rows. */
tupledesc = CreateTemplateTupleDesc(NUM_NEON_GET_STATS_COLS);
TupleDescInitEntry(tupledesc, (AttrNumber) 1, "lfc_key",
TEXTOID, -1, 0);
TupleDescInitEntry(tupledesc, (AttrNumber) 2, "lfc_value",
INT8OID, -1, 0);
fctx->tupdesc = BlessTupleDesc(tupledesc);
funcctx->max_calls = NUM_NEON_GET_STATS_ROWS;
funcctx->user_fctx = fctx;
/* Return to original context when allocating transient memory */
MemoryContextSwitchTo(oldcontext);
}
funcctx = SRF_PERCALL_SETUP();
/* Get the saved state */
fctx = (NeonGetStatsCtx*) funcctx->user_fctx;
switch (funcctx->call_cntr)
{
case 0:
key = "file_cache_misses";
if (lfc_ctl)
value = lfc_ctl->misses;
break;
case 1:
key = "file_cache_hits";
if (lfc_ctl)
value = lfc_ctl->hits;
break;
case 2:
key = "file_cache_used";
if (lfc_ctl)
value = lfc_ctl->used;
break;
case 3:
key = "file_cache_writes";
if (lfc_ctl)
value = lfc_ctl->writes;
break;
default:
SRF_RETURN_DONE(funcctx);
}
values[0] = PointerGetDatum(cstring_to_text(key));
nulls[0] = false;
if (lfc_ctl)
{
nulls[1] = false;
values[1] = Int64GetDatum(value);
}
else
nulls[1] = true;
tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
result = HeapTupleGetDatum(tuple);
SRF_RETURN_NEXT(funcctx, result);
}
/*
* Function returning data from the local file cache
* relation node/tablespace/database/blocknum and access_counter
*/
PG_FUNCTION_INFO_V1(local_cache_pages);
/*
* Record structure holding the to be exposed cache data.
*/
@@ -776,6 +580,11 @@ typedef struct
LocalCachePagesRec *record;
} LocalCachePagesContext;
/*
* Function returning data from the local file cache
* relation node/tablespace/database/blocknum and access_counter
*/
PG_FUNCTION_INFO_V1(local_cache_pages);
#define NUM_LOCALCACHE_PAGES_ELEM 7
@@ -842,20 +651,15 @@ local_cache_pages(PG_FUNCTION_ARGS)
fctx->tupdesc = BlessTupleDesc(tupledesc);
if (lfc_ctl)
{
LWLockAcquire(lfc_lock, LW_SHARED);
LWLockAcquire(lfc_lock, LW_SHARED);
if (LFC_ENABLED())
{
hash_seq_init(&status, lfc_hash);
while ((entry = hash_seq_search(&status)) != NULL)
{
for (int i = 0; i < BLOCKS_PER_CHUNK/32; i++)
n_pages += pg_popcount32(entry->bitmap[i]);
}
}
hash_seq_init(&status, lfc_hash);
while ((entry = hash_seq_search(&status)) != NULL)
{
for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
n_pages += (entry->bitmap[i >> 5] & (1 << (i & 31))) != 0;
}
hash_seq_term(&status);
fctx->record = (LocalCachePagesRec *)
MemoryContextAllocHuge(CurrentMemoryContext,
sizeof(LocalCachePagesRec) * n_pages);
@@ -867,35 +671,36 @@ local_cache_pages(PG_FUNCTION_ARGS)
/* Return to original context when allocating transient memory */
MemoryContextSwitchTo(oldcontext);
if (n_pages != 0)
/*
* Scan through all the buffers, saving the relevant fields in the
* fctx->record structure.
*
* We don't hold the partition locks, so we don't get a consistent
* snapshot across all buffers, but we do grab the buffer header
* locks, so the information of each buffer is self-consistent.
*/
n_pages = 0;
hash_seq_init(&status, lfc_hash);
while ((entry = hash_seq_search(&status)) != NULL)
{
/*
* Scan through all the cache entries, saving the relevant fields in the
* fctx->record structure.
*/
uint32 n = 0;
hash_seq_init(&status, lfc_hash);
while ((entry = hash_seq_search(&status)) != NULL)
for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
{
for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
if (entry->bitmap[i >> 5] & (1 << (i & 31)))
{
if (entry->bitmap[i >> 5] & (1 << (i & 31)))
{
fctx->record[n].pageoffs = entry->offset*BLOCKS_PER_CHUNK + i;
fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key));
fctx->record[n].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key));
fctx->record[n].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key));
fctx->record[n].forknum = entry->key.forkNum;
fctx->record[n].blocknum = entry->key.blockNum + i;
fctx->record[n].accesscount = entry->access_count;
n += 1;
}
fctx->record[n_pages].pageoffs = entry->offset*BLOCKS_PER_CHUNK + i;
fctx->record[n_pages].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key));
fctx->record[n_pages].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key));
fctx->record[n_pages].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key));
fctx->record[n_pages].forknum = entry->key.forkNum;
fctx->record[n_pages].blocknum = entry->key.blockNum + i;
fctx->record[n_pages].accesscount = entry->access_count;
n_pages += 1;
}
}
Assert(n_pages == n);
}
if (lfc_ctl)
LWLockRelease(lfc_lock);
hash_seq_term(&status);
Assert(n_pages == funcctx->max_calls);
LWLockRelease(lfc_lock);
}
funcctx = SRF_PERCALL_SETUP();

View File

@@ -21,7 +21,6 @@
#include "storage/buf_internals.h"
#include "storage/lwlock.h"
#include "storage/ipc.h"
#include "storage/pg_shmem.h"
#include "c.h"
#include "postmaster/interrupt.h"
@@ -88,12 +87,6 @@ bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) =
static bool pageserver_flush(void);
static void pageserver_disconnect(void);
static bool
PagestoreShmemIsValid()
{
return pagestore_shared && UsedShmemSegAddr;
}
static bool
CheckPageserverConnstring(char **newval, void **extra, GucSource source)
{
@@ -103,7 +96,7 @@ CheckPageserverConnstring(char **newval, void **extra, GucSource source)
static void
AssignPageserverConnstring(const char *newval, void *extra)
{
if(!PagestoreShmemIsValid())
if(!pagestore_shared)
return;
LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE);
strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE);
@@ -114,7 +107,7 @@ AssignPageserverConnstring(const char *newval, void *extra)
static bool
CheckConnstringUpdated()
{
if(!PagestoreShmemIsValid())
if(!pagestore_shared)
return false;
return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter);
}
@@ -122,7 +115,7 @@ CheckConnstringUpdated()
static void
ReloadConnstring()
{
if(!PagestoreShmemIsValid())
if(!pagestore_shared)
return;
LWLockAcquire(pagestore_shared->lock, LW_SHARED);
strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));

View File

@@ -1,10 +0,0 @@
\echo Use "ALTER EXTENSION neon UPDATE TO '1.1'" to load this file. \quit
CREATE FUNCTION neon_get_lfc_stats()
RETURNS SETOF RECORD
AS 'MODULE_PATHNAME', 'neon_get_lfc_stats'
LANGUAGE C PARALLEL SAFE;
-- Create a view for convenient access.
CREATE VIEW neon_lfc_stats AS
SELECT P.* FROM neon_get_lfc_stats() AS P (lfc_key text, lfc_value bigint);

View File

@@ -1,5 +1,4 @@
# neon extension
comment = 'cloud storage for PostgreSQL'
default_version = '1.1'
default_version = '1.0'
module_pathname = '$libdir/neon'
relocatable = true

View File

@@ -76,4 +76,3 @@ tokio-util.workspace = true
rcgen.workspace = true
rstest.workspace = true
tokio-postgres-rustls.workspace = true
postgres-protocol.workspace = true

View File

@@ -6,7 +6,6 @@ pub use link::LinkAuthError;
use tokio_postgres::config::AuthKeys;
use crate::proxy::{handle_try_wake, retry_after, LatencyTimer};
use crate::stream::Stream;
use crate::{
auth::{self, ClientCredentials},
config::AuthenticationConfig,
@@ -132,7 +131,7 @@ async fn auth_quirks_creds(
api: &impl console::Api,
extra: &ConsoleReqExtra<'_>,
creds: &mut ClientCredentials<'_>,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
allow_cleartext: bool,
config: &'static AuthenticationConfig,
latency_timer: &mut LatencyTimer,
@@ -166,7 +165,7 @@ async fn auth_quirks(
api: &impl console::Api,
extra: &ConsoleReqExtra<'_>,
creds: &mut ClientCredentials<'_>,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
allow_cleartext: bool,
config: &'static AuthenticationConfig,
latency_timer: &mut LatencyTimer,
@@ -242,7 +241,7 @@ impl BackendType<'_, ClientCredentials<'_>> {
pub async fn authenticate(
&mut self,
extra: &ConsoleReqExtra<'_>,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
allow_cleartext: bool,
config: &'static AuthenticationConfig,
latency_timer: &mut LatencyTimer,

View File

@@ -6,7 +6,7 @@ use crate::{
console::{self, AuthInfo, ConsoleReqExtra},
proxy::LatencyTimer,
sasl, scram,
stream::{PqStream, Stream},
stream::PqStream,
};
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{info, warn};
@@ -15,7 +15,7 @@ pub(super) async fn authenticate(
api: &impl console::Api,
extra: &ConsoleReqExtra<'_>,
creds: &ClientCredentials<'_>,
client: &mut PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
config: &'static AuthenticationConfig,
latency_timer: &mut LatencyTimer,
) -> auth::Result<AuthSuccess<ComputeCredentials>> {

View File

@@ -2,7 +2,7 @@ use super::{AuthSuccess, ComputeCredentials};
use crate::{
auth::{self, AuthFlow, ClientCredentials},
proxy::LatencyTimer,
stream::{self, Stream},
stream,
};
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{info, warn};
@@ -12,7 +12,7 @@ use tracing::{info, warn};
/// These properties are benefical for serverless JS workers, so we
/// use this mechanism for websocket connections.
pub async fn cleartext_hack(
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
latency_timer: &mut LatencyTimer,
) -> auth::Result<AuthSuccess<ComputeCredentials>> {
warn!("cleartext auth flow override is enabled, proceeding");
@@ -37,7 +37,7 @@ pub async fn cleartext_hack(
/// Very similar to [`cleartext_hack`], but there's a specific password format.
pub async fn password_hack(
creds: &mut ClientCredentials<'_>,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
latency_timer: &mut LatencyTimer,
) -> auth::Result<AuthSuccess<ComputeCredentials>> {
warn!("project not specified, resorting to the password hack auth flow");

View File

@@ -1,21 +1,16 @@
//! Main authentication flow.
use super::{AuthErrorImpl, PasswordHackPayload};
use crate::{
config::TlsServerEndPoint,
sasl, scram,
stream::{PqStream, Stream},
};
use crate::{sasl, scram, stream::PqStream};
use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be};
use std::io;
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::info;
/// Every authentication selector is supposed to implement this trait.
pub trait AuthMethod {
/// Any authentication selector should provide initial backend message
/// containing auth method name and parameters, e.g. md5 salt.
fn first_message(&self, channel_binding: bool) -> BeMessage<'_>;
fn first_message(&self) -> BeMessage<'_>;
}
/// Initial state of [`AuthFlow`].
@@ -26,14 +21,8 @@ pub struct Scram<'a>(pub &'a scram::ServerSecret);
impl AuthMethod for Scram<'_> {
#[inline(always)]
fn first_message(&self, channel_binding: bool) -> BeMessage<'_> {
if channel_binding {
Be::AuthenticationSasl(BeAuthenticationSaslMessage::Methods(scram::METHODS))
} else {
Be::AuthenticationSasl(BeAuthenticationSaslMessage::Methods(
scram::METHODS_WITHOUT_PLUS,
))
}
fn first_message(&self) -> BeMessage<'_> {
Be::AuthenticationSasl(BeAuthenticationSaslMessage::Methods(scram::METHODS))
}
}
@@ -43,7 +32,7 @@ pub struct PasswordHack;
impl AuthMethod for PasswordHack {
#[inline(always)]
fn first_message(&self, _channel_binding: bool) -> BeMessage<'_> {
fn first_message(&self) -> BeMessage<'_> {
Be::AuthenticationCleartextPassword
}
}
@@ -54,44 +43,37 @@ pub struct CleartextPassword;
impl AuthMethod for CleartextPassword {
#[inline(always)]
fn first_message(&self, _channel_binding: bool) -> BeMessage<'_> {
fn first_message(&self) -> BeMessage<'_> {
Be::AuthenticationCleartextPassword
}
}
/// This wrapper for [`PqStream`] performs client authentication.
#[must_use]
pub struct AuthFlow<'a, S, State> {
pub struct AuthFlow<'a, Stream, State> {
/// The underlying stream which implements libpq's protocol.
stream: &'a mut PqStream<Stream<S>>,
stream: &'a mut PqStream<Stream>,
/// State might contain ancillary data (see [`Self::begin`]).
state: State,
tls_server_end_point: TlsServerEndPoint,
}
/// Initial state of the stream wrapper.
impl<'a, S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'a, S, Begin> {
impl<'a, S: AsyncWrite + Unpin> AuthFlow<'a, S, Begin> {
/// Create a new wrapper for client authentication.
pub fn new(stream: &'a mut PqStream<Stream<S>>) -> Self {
let tls_server_end_point = stream.get_ref().tls_server_end_point();
pub fn new(stream: &'a mut PqStream<S>) -> Self {
Self {
stream,
state: Begin,
tls_server_end_point,
}
}
/// Move to the next step by sending auth method's name & params to client.
pub async fn begin<M: AuthMethod>(self, method: M) -> io::Result<AuthFlow<'a, S, M>> {
self.stream
.write_message(&method.first_message(self.tls_server_end_point.supported()))
.await?;
self.stream.write_message(&method.first_message()).await?;
Ok(AuthFlow {
stream: self.stream,
state: method,
tls_server_end_point: self.tls_server_end_point,
})
}
}
@@ -141,15 +123,9 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
return Err(super::AuthError::bad_auth_method(sasl.method));
}
info!("client chooses {}", sasl.method);
let secret = self.state.0;
let outcome = sasl::SaslStream::new(self.stream, sasl.message)
.authenticate(scram::Exchange::new(
secret,
rand::random,
self.tls_server_end_point,
))
.authenticate(scram::Exchange::new(secret, rand::random, None))
.await?;
Ok(outcome)

View File

@@ -6,8 +6,6 @@
use std::{net::SocketAddr, sync::Arc};
use futures::future::Either;
use itertools::Itertools;
use proxy::config::TlsServerEndPoint;
use tokio::net::TcpListener;
use anyhow::{anyhow, bail, ensure, Context};
@@ -67,7 +65,7 @@ async fn main() -> anyhow::Result<()> {
let destination: String = args.get_one::<String>("dest").unwrap().parse()?;
// Configure TLS
let (tls_config, tls_server_end_point): (Arc<rustls::ServerConfig>, TlsServerEndPoint) = match (
let tls_config: Arc<rustls::ServerConfig> = match (
args.get_one::<String>("tls-key"),
args.get_one::<String>("tls-cert"),
) {
@@ -91,22 +89,16 @@ async fn main() -> anyhow::Result<()> {
))?
.into_iter()
.map(rustls::Certificate)
.collect_vec()
.collect()
};
// needed for channel bindings
let first_cert = cert_chain.first().context("missing certificate")?;
let tls_server_end_point = TlsServerEndPoint::new(first_cert)?;
let tls_config = rustls::ServerConfig::builder()
rustls::ServerConfig::builder()
.with_safe_default_cipher_suites()
.with_safe_default_kx_groups()
.with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])?
.with_no_client_auth()
.with_single_cert(cert_chain, key)?
.into();
(tls_config, tls_server_end_point)
.into()
}
_ => bail!("tls-key and tls-cert must be specified"),
};
@@ -121,7 +113,6 @@ async fn main() -> anyhow::Result<()> {
let main = tokio::spawn(task_main(
Arc::new(destination),
tls_config,
tls_server_end_point,
proxy_listener,
cancellation_token.clone(),
));
@@ -143,7 +134,6 @@ async fn main() -> anyhow::Result<()> {
async fn task_main(
dest_suffix: Arc<String>,
tls_config: Arc<rustls::ServerConfig>,
tls_server_end_point: TlsServerEndPoint,
listener: tokio::net::TcpListener,
cancellation_token: CancellationToken,
) -> anyhow::Result<()> {
@@ -169,7 +159,7 @@ async fn task_main(
.context("failed to set socket option")?;
info!(%peer_addr, "serving");
handle_client(dest_suffix, tls_config, tls_server_end_point, socket).await
handle_client(dest_suffix, tls_config, socket).await
}
.unwrap_or_else(|e| {
// Acknowledge that the task has finished with an error.
@@ -178,18 +168,9 @@ async fn task_main(
.instrument(tracing::info_span!("handle_client", ?session_id))
);
}
// Don't modify this unless you read https://docs.rs/tokio/latest/tokio/macro.select.html carefully.
// If this future completes and the pattern doesn't match, this branch is disabled for this call to `select!`.
// This only counts for this loop and it will be enabled again on next `select!`.
//
// Prior code had this as `Some(Err(e))` which _looks_ equivalent to the current setup, but it's not.
// When `connections.join_next()` returned `Some(Ok(()))` (which we expect), it would disable the join_next and it would
// not get called again, even if there are more connections to remove.
Some(res) = connections.join_next() => {
if let Err(e) = res {
if !e.is_panic() && !e.is_cancelled() {
warn!("unexpected error from joined connection task: {e:?}");
}
Some(Err(e)) = connections.join_next(), if !connections.is_empty() => {
if !e.is_panic() && !e.is_cancelled() {
warn!("unexpected error from joined connection task: {e:?}");
}
}
_ = cancellation_token.cancelled() => {
@@ -217,7 +198,6 @@ const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmod
async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
raw_stream: S,
tls_config: Arc<rustls::ServerConfig>,
tls_server_end_point: TlsServerEndPoint,
) -> anyhow::Result<Stream<S>> {
let mut stream = PqStream::new(Stream::from_raw(raw_stream));
@@ -242,11 +222,7 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
if !read_buf.is_empty() {
bail!("data is sent before server replied with EncryptionResponse");
}
Ok(Stream::Tls {
tls: Box::new(raw.upgrade(tls_config).await?),
tls_server_end_point,
})
Ok(raw.upgrade(tls_config).await?)
}
unexpected => {
info!(
@@ -261,10 +237,9 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
async fn handle_client(
dest_suffix: Arc<String>,
tls_config: Arc<rustls::ServerConfig>,
tls_server_end_point: TlsServerEndPoint,
stream: impl AsyncRead + AsyncWrite + Unpin,
) -> anyhow::Result<()> {
let tls_stream = ssl_handshake(stream, tls_config, tls_server_end_point).await?;
let tls_stream = ssl_handshake(stream, tls_config).await?;
// Cut off first part of the SNI domain
// We receive required destination details in the format of

View File

@@ -1,15 +1,12 @@
use crate::auth;
use anyhow::{bail, ensure, Context, Ok};
use rustls::{sign, Certificate, PrivateKey};
use sha2::{Digest, Sha256};
use rustls::sign;
use std::{
collections::{HashMap, HashSet},
str::FromStr,
sync::Arc,
time::Duration,
};
use tracing::{error, info};
use x509_parser::oid_registry;
pub struct ProxyConfig {
pub tls_config: Option<TlsConfig>,
@@ -30,7 +27,6 @@ pub struct MetricCollectionConfig {
pub struct TlsConfig {
pub config: Arc<rustls::ServerConfig>,
pub common_names: Option<HashSet<String>>,
pub cert_resolver: Arc<CertResolver>,
}
pub struct HttpConfig {
@@ -56,7 +52,7 @@ pub fn configure_tls(
let mut cert_resolver = CertResolver::new();
// add default certificate
cert_resolver.add_cert_path(key_path, cert_path, true)?;
cert_resolver.add_cert(key_path, cert_path, true)?;
// add extra certificates
if let Some(certs_dir) = certs_dir {
@@ -68,7 +64,7 @@ pub fn configure_tls(
let key_path = path.join("tls.key");
let cert_path = path.join("tls.crt");
if key_path.exists() && cert_path.exists() {
cert_resolver.add_cert_path(
cert_resolver.add_cert(
&key_path.to_string_lossy(),
&cert_path.to_string_lossy(),
false,
@@ -80,97 +76,35 @@ pub fn configure_tls(
let common_names = cert_resolver.get_common_names();
let cert_resolver = Arc::new(cert_resolver);
let config = rustls::ServerConfig::builder()
.with_safe_default_cipher_suites()
.with_safe_default_kx_groups()
// allow TLS 1.2 to be compatible with older client libraries
.with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])?
.with_no_client_auth()
.with_cert_resolver(cert_resolver.clone())
.with_cert_resolver(Arc::new(cert_resolver))
.into();
Ok(TlsConfig {
config,
common_names: Some(common_names),
cert_resolver,
})
}
/// Channel binding parameter
///
/// <https://www.rfc-editor.org/rfc/rfc5929#section-4>
/// Description: The hash of the TLS server's certificate as it
/// appears, octet for octet, in the server's Certificate message. Note
/// that the Certificate message contains a certificate_list, in which
/// the first element is the server's certificate.
///
/// The hash function is to be selected as follows:
///
/// * if the certificate's signatureAlgorithm uses a single hash
/// function, and that hash function is either MD5 or SHA-1, then use SHA-256;
///
/// * if the certificate's signatureAlgorithm uses a single hash
/// function and that hash function neither MD5 nor SHA-1, then use
/// the hash function associated with the certificate's
/// signatureAlgorithm;
///
/// * if the certificate's signatureAlgorithm uses no hash functions or
/// uses multiple hash functions, then this channel binding type's
/// channel bindings are undefined at this time (updates to is channel
/// binding type may occur to address this issue if it ever arises).
#[derive(Debug, Clone, Copy)]
pub enum TlsServerEndPoint {
Sha256([u8; 32]),
Undefined,
}
impl TlsServerEndPoint {
pub fn new(cert: &Certificate) -> anyhow::Result<Self> {
let sha256_oids = [
// I'm explicitly not adding MD5 or SHA1 here... They're bad.
oid_registry::OID_SIG_ECDSA_WITH_SHA256,
oid_registry::OID_PKCS1_SHA256WITHRSA,
];
let pem = x509_parser::parse_x509_certificate(&cert.0)
.context("Failed to parse PEM object from cerficiate")?
.1;
info!(subject = %pem.subject, "parsing TLS certificate");
let reg = oid_registry::OidRegistry::default().with_all_crypto();
let oid = pem.signature_algorithm.oid();
let alg = reg.get(oid);
if sha256_oids.contains(oid) {
let tls_server_end_point: [u8; 32] =
Sha256::new().chain_update(&cert.0).finalize().into();
info!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), tls_server_end_point = %base64::encode(tls_server_end_point), "determined channel binding");
Ok(Self::Sha256(tls_server_end_point))
} else {
error!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), "unknown channel binding");
Ok(Self::Undefined)
}
}
pub fn supported(&self) -> bool {
!matches!(self, TlsServerEndPoint::Undefined)
}
}
#[derive(Default)]
pub struct CertResolver {
certs: HashMap<String, (Arc<rustls::sign::CertifiedKey>, TlsServerEndPoint)>,
default: Option<(Arc<rustls::sign::CertifiedKey>, TlsServerEndPoint)>,
struct CertResolver {
certs: HashMap<String, Arc<rustls::sign::CertifiedKey>>,
default: Option<Arc<rustls::sign::CertifiedKey>>,
}
impl CertResolver {
pub fn new() -> Self {
Self::default()
fn new() -> Self {
Self {
certs: HashMap::new(),
default: None,
}
}
fn add_cert_path(
fn add_cert(
&mut self,
key_path: &str,
cert_path: &str,
@@ -186,65 +120,57 @@ impl CertResolver {
keys.pop().map(rustls::PrivateKey).unwrap()
};
let key = sign::any_supported_type(&priv_key).context("invalid private key")?;
let cert_chain_bytes = std::fs::read(cert_path)
.context(format!("Failed to read TLS cert file at '{cert_path}.'"))?;
let cert_chain = {
rustls_pemfile::certs(&mut &cert_chain_bytes[..])
.with_context(|| {
format!(
.context(format!(
"Failed to read TLS certificate chain from bytes from file at '{cert_path}'."
)
})?
))?
.into_iter()
.map(rustls::Certificate)
.collect()
};
self.add_cert(priv_key, cert_chain, is_default)
}
let common_name = {
let pem = x509_parser::pem::parse_x509_pem(&cert_chain_bytes)
.context(format!(
"Failed to parse PEM object from bytes from file at '{cert_path}'."
))?
.1;
let common_name = pem.parse_x509()?.subject().to_string();
pub fn add_cert(
&mut self,
priv_key: PrivateKey,
cert_chain: Vec<Certificate>,
is_default: bool,
) -> anyhow::Result<()> {
let key = sign::any_supported_type(&priv_key).context("invalid private key")?;
let first_cert = &cert_chain[0];
let tls_server_end_point = TlsServerEndPoint::new(first_cert)?;
let pem = x509_parser::parse_x509_certificate(&first_cert.0)
.context("Failed to parse PEM object from cerficiate")?
.1;
let common_name = pem.subject().to_string();
// We only use non-wildcard certificates in link proxy so it seems okay to treat them the same as
// wildcard ones as we don't use SNI there. That treatment only affects certificate selection, so
// verify-full will still check wildcard match. Old coding here just ignored non-wildcard common names
// and passed None instead, which blows up number of cases downstream code should handle. Proper coding
// here should better avoid Option for common_names, and do wildcard-based certificate selection instead
// of cutting off '*.' parts.
let common_name = if common_name.starts_with("CN=*.") {
common_name.strip_prefix("CN=*.").map(|s| s.to_string())
} else {
common_name.strip_prefix("CN=").map(|s| s.to_string())
// We only use non-wildcard certificates in link proxy so it seems okay to treat them the same as
// wildcard ones as we don't use SNI there. That treatment only affects certificate selection, so
// verify-full will still check wildcard match. Old coding here just ignored non-wildcard common names
// and passed None instead, which blows up number of cases downstream code should handle. Proper coding
// here should better avoid Option for common_names, and do wildcard-based certificate selection instead
// of cutting off '*.' parts.
if common_name.starts_with("CN=*.") {
common_name.strip_prefix("CN=*.").map(|s| s.to_string())
} else {
common_name.strip_prefix("CN=").map(|s| s.to_string())
}
}
.context("Failed to parse common name from certificate")?;
.context(format!(
"Failed to parse common name from certificate at '{cert_path}'."
))?;
let cert = Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key));
if is_default {
self.default = Some((cert.clone(), tls_server_end_point));
self.default = Some(cert.clone());
}
self.certs.insert(common_name, (cert, tls_server_end_point));
self.certs.insert(common_name, cert);
Ok(())
}
pub fn get_common_names(&self) -> HashSet<String> {
fn get_common_names(&self) -> HashSet<String> {
self.certs.keys().map(|s| s.to_string()).collect()
}
}
@@ -252,24 +178,15 @@ impl CertResolver {
impl rustls::server::ResolvesServerCert for CertResolver {
fn resolve(
&self,
client_hello: rustls::server::ClientHello,
_client_hello: rustls::server::ClientHello,
) -> Option<Arc<rustls::sign::CertifiedKey>> {
self.resolve(client_hello.server_name()).map(|x| x.0)
}
}
impl CertResolver {
pub fn resolve(
&self,
server_name: Option<&str>,
) -> Option<(Arc<rustls::sign::CertifiedKey>, TlsServerEndPoint)> {
// loop here and cut off more and more subdomains until we find
// a match to get a proper wildcard support. OTOH, we now do not
// use nested domains, so keep this simple for now.
//
// With the current coding foo.com will match *.foo.com and that
// repeats behavior of the old code.
if let Some(mut sni_name) = server_name {
if let Some(mut sni_name) = _client_hello.server_name() {
loop {
if let Some(cert) = self.certs.get(sni_name) {
return Some(cert.clone());

View File

@@ -294,18 +294,9 @@ pub async fn task_main(
}),
);
}
// Don't modify this unless you read https://docs.rs/tokio/latest/tokio/macro.select.html carefully.
// If this future completes and the pattern doesn't match, this branch is disabled for this call to `select!`.
// This only counts for this loop and it will be enabled again on next `select!`.
//
// Prior code had this as `Some(Err(e))` which _looks_ equivalent to the current setup, but it's not.
// When `connections.join_next()` returned `Some(Ok(()))` (which we expect), it would disable the join_next and it would
// not get called again, even if there are more connections to remove.
Some(res) = connections.join_next() => {
if let Err(e) = res {
if !e.is_panic() && !e.is_cancelled() {
warn!("unexpected error from joined connection task: {e:?}");
}
Some(Err(e)) = connections.join_next(), if !connections.is_empty() => {
if !e.is_panic() && !e.is_cancelled() {
warn!("unexpected error from joined connection task: {e:?}");
}
}
_ = cancellation_token.cancelled() => {
@@ -470,17 +461,7 @@ async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
if !read_buf.is_empty() {
bail!("data is sent before server replied with EncryptionResponse");
}
let tls_stream = raw.upgrade(tls.to_server_config()).await?;
let (_, tls_server_end_point) = tls
.cert_resolver
.resolve(tls_stream.get_ref().1.server_name())
.context("missing certificate")?;
stream = PqStream::new(Stream::Tls {
tls: Box::new(tls_stream),
tls_server_end_point,
});
stream = PqStream::new(raw.upgrade(tls.to_server_config()).await?);
}
}
_ => bail!(ERR_PROTO_VIOLATION),
@@ -885,7 +866,7 @@ pub async fn proxy_pass(
/// Thin connection context.
struct Client<'a, S> {
/// The underlying libpq protocol stream.
stream: PqStream<Stream<S>>,
stream: PqStream<S>,
/// Client credentials that we care about.
creds: auth::BackendType<'a, auth::ClientCredentials<'a>>,
/// KV-dictionary with PostgreSQL connection params.
@@ -899,7 +880,7 @@ struct Client<'a, S> {
impl<'a, S> Client<'a, S> {
/// Construct a new connection context.
fn new(
stream: PqStream<Stream<S>>,
stream: PqStream<S>,
creds: auth::BackendType<'a, auth::ClientCredentials<'a>>,
params: &'a StartupMessageParams,
session_id: uuid::Uuid,

View File

@@ -1,23 +1,19 @@
//! A group of high-level tests for connection establishing logic and auth.
mod mitm;
//!
use super::*;
use crate::auth::backend::TestBackend;
use crate::auth::ClientCredentials;
use crate::config::CertResolver;
use crate::console::{CachedNodeInfo, NodeInfo};
use crate::{auth, http, sasl, scram};
use async_trait::async_trait;
use rstest::rstest;
use tokio_postgres::config::SslMode;
use tokio_postgres::tls::{MakeTlsConnect, NoTls};
use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream};
use tokio_postgres_rustls::MakeRustlsConnect;
/// Generate a set of TLS certificates: CA + server.
fn generate_certs(
hostname: &str,
common_name: &str,
) -> anyhow::Result<(rustls::Certificate, rustls::Certificate, rustls::PrivateKey)> {
let ca = rcgen::Certificate::from_params({
let mut params = rcgen::CertificateParams::default();
@@ -25,15 +21,7 @@ fn generate_certs(
params
})?;
let cert = rcgen::Certificate::from_params({
let mut params = rcgen::CertificateParams::new(vec![hostname.into()]);
params.distinguished_name = rcgen::DistinguishedName::new();
params
.distinguished_name
.push(rcgen::DnType::CommonName, common_name);
params
})?;
let cert = rcgen::generate_simple_self_signed(vec![hostname.into()])?;
Ok((
rustls::Certificate(ca.serialize_der()?),
rustls::Certificate(cert.serialize_der_with_signer(&ca)?),
@@ -49,14 +37,7 @@ struct ClientConfig<'a> {
impl ClientConfig<'_> {
fn make_tls_connect<S: AsyncRead + AsyncWrite + Unpin + Send + 'static>(
self,
) -> anyhow::Result<
impl tokio_postgres::tls::TlsConnect<
S,
Error = impl std::fmt::Debug,
Future = impl Send,
Stream = RustlsStream<S>,
>,
> {
) -> anyhow::Result<impl tokio_postgres::tls::TlsConnect<S>> {
let mut mk = MakeRustlsConnect::new(self.config);
let tls = MakeTlsConnect::<S>::make_tls_connect(&mut mk, self.hostname)?;
Ok(tls)
@@ -68,24 +49,20 @@ fn generate_tls_config<'a>(
hostname: &'a str,
common_name: &'a str,
) -> anyhow::Result<(ClientConfig<'a>, TlsConfig)> {
let (ca, cert, key) = generate_certs(hostname, common_name)?;
let (ca, cert, key) = generate_certs(hostname)?;
let tls_config = {
let config = rustls::ServerConfig::builder()
.with_safe_defaults()
.with_no_client_auth()
.with_single_cert(vec![cert.clone()], key.clone())?
.with_single_cert(vec![cert], key)?
.into();
let mut cert_resolver = CertResolver::new();
cert_resolver.add_cert(key, vec![cert], true)?;
let common_names = Some(cert_resolver.get_common_names());
let common_names = Some([common_name.to_owned()].iter().cloned().collect());
TlsConfig {
config,
common_names,
cert_resolver: Arc::new(cert_resolver),
}
};
@@ -276,7 +253,6 @@ async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> {
));
let (_client, _conn) = tokio_postgres::Config::new()
.channel_binding(tokio_postgres::config::ChannelBinding::Require)
.user("user")
.dbname("db")
.password(password)
@@ -287,30 +263,6 @@ async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> {
proxy.await?
}
#[tokio::test]
async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> {
let (client, server) = tokio::io::duplex(1024);
let (client_config, server_config) =
generate_tls_config("generic-project-name.localhost", "localhost")?;
let proxy = tokio::spawn(dummy_proxy(
client,
Some(server_config),
Scram::new("password")?,
));
let (_client, _conn) = tokio_postgres::Config::new()
.channel_binding(tokio_postgres::config::ChannelBinding::Disable)
.user("user")
.dbname("db")
.password("password")
.ssl_mode(SslMode::Require)
.connect_raw(server, client_config.make_tls_connect()?)
.await?;
proxy.await?
}
#[tokio::test]
async fn scram_auth_mock() -> anyhow::Result<()> {
let (client, server) = tokio::io::duplex(1024);

View File

@@ -1,257 +0,0 @@
//! Man-in-the-middle tests
//!
//! Channel binding should prevent a proxy server
//! - that has access to create valid certificates -
//! from controlling the TLS connection.
use std::fmt::Debug;
use super::*;
use bytes::{Bytes, BytesMut};
use futures::{SinkExt, StreamExt};
use postgres_protocol::message::frontend;
use tokio::io::{AsyncReadExt, DuplexStream};
use tokio_postgres::config::SslMode;
use tokio_postgres::tls::TlsConnect;
use tokio_util::codec::{Decoder, Encoder};
enum Intercept {
None,
Methods,
SASLResponse,
}
async fn proxy_mitm(
intercept: Intercept,
) -> (DuplexStream, DuplexStream, ClientConfig<'static>, TlsConfig) {
let (end_server1, client1) = tokio::io::duplex(1024);
let (server2, end_client2) = tokio::io::duplex(1024);
let (client_config1, server_config1) =
generate_tls_config("generic-project-name.localhost", "localhost").unwrap();
let (client_config2, server_config2) =
generate_tls_config("generic-project-name.localhost", "localhost").unwrap();
tokio::spawn(async move {
// begin handshake with end_server
let end_server = connect_tls(server2, client_config2.make_tls_connect().unwrap()).await;
// process handshake with end_client
let (end_client, startup) =
handshake(client1, Some(&server_config1), &CancelMap::default())
.await
.unwrap()
.unwrap();
let mut end_server = tokio_util::codec::Framed::new(end_server, PgFrame);
let (end_client, buf) = end_client.framed.into_inner();
assert!(buf.is_empty());
let mut end_client = tokio_util::codec::Framed::new(end_client, PgFrame);
// give the end_server the startup parameters
let mut buf = BytesMut::new();
frontend::startup_message(startup.iter(), &mut buf).unwrap();
end_server.send(buf.freeze()).await.unwrap();
// proxy messages between end_client and end_server
loop {
tokio::select! {
message = end_server.next() => {
match message {
Some(Ok(message)) => {
// intercept SASL and return only SCRAM-SHA-256 ;)
if matches!(intercept, Intercept::Methods) && message.starts_with(b"R") && message[5..].starts_with(&[0,0,0,10]) {
end_client.send(Bytes::from_static(b"R\0\0\0\x17\0\0\0\x0aSCRAM-SHA-256\0\0")).await.unwrap();
continue;
}
end_client.send(message).await.unwrap()
}
_ => break,
}
}
message = end_client.next() => {
match message {
Some(Ok(message)) => {
// intercept SASL response and return SCRAM-SHA-256 with no channel binding ;)
if matches!(intercept, Intercept::SASLResponse) && message.starts_with(b"p") && message[5..].starts_with(b"SCRAM-SHA-256-PLUS\0") {
let sasl_message = &message[1+4+19+4..];
let mut new_message = b"n,,".to_vec();
new_message.extend_from_slice(sasl_message.strip_prefix(b"p=tls-server-end-point,,").unwrap());
let mut buf = BytesMut::new();
frontend::sasl_initial_response("SCRAM-SHA-256", &new_message, &mut buf).unwrap();
end_server.send(buf.freeze()).await.unwrap();
continue;
}
end_server.send(message).await.unwrap()
}
_ => break,
}
}
else => { break }
}
}
});
(end_server1, end_client2, client_config1, server_config2)
}
/// taken from tokio-postgres
pub async fn connect_tls<S, T>(mut stream: S, tls: T) -> T::Stream
where
S: AsyncRead + AsyncWrite + Unpin,
T: TlsConnect<S>,
T::Error: Debug,
{
let mut buf = BytesMut::new();
frontend::ssl_request(&mut buf);
stream.write_all(&buf).await.unwrap();
let mut buf = [0];
stream.read_exact(&mut buf).await.unwrap();
if buf[0] != b'S' {
panic!("ssl not supported by server");
}
tls.connect(stream).await.unwrap()
}
struct PgFrame;
impl Decoder for PgFrame {
type Item = Bytes;
type Error = io::Error;
fn decode(&mut self, src: &mut BytesMut) -> Result<Option<Self::Item>, Self::Error> {
if src.len() < 5 {
src.reserve(5 - src.len());
return Ok(None);
}
let len = u32::from_be_bytes(src[1..5].try_into().unwrap()) as usize + 1;
if src.len() < len {
src.reserve(len - src.len());
return Ok(None);
}
Ok(Some(src.split_to(len).freeze()))
}
}
impl Encoder<Bytes> for PgFrame {
type Error = io::Error;
fn encode(&mut self, item: Bytes, dst: &mut BytesMut) -> Result<(), Self::Error> {
dst.extend_from_slice(&item);
Ok(())
}
}
/// If the client doesn't support channel bindings, it can be exploited.
#[tokio::test]
async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> {
let (server, client, client_config, server_config) = proxy_mitm(Intercept::None).await;
let proxy = tokio::spawn(dummy_proxy(
client,
Some(server_config),
Scram::new("password")?,
));
let _client_err = tokio_postgres::Config::new()
.channel_binding(tokio_postgres::config::ChannelBinding::Disable)
.user("user")
.dbname("db")
.password("password")
.ssl_mode(SslMode::Require)
.connect_raw(server, client_config.make_tls_connect()?)
.await?;
proxy.await?
}
/// If the client chooses SCRAM-PLUS, it will fail
#[tokio::test]
async fn scram_auth_prefer_channel_binding() -> anyhow::Result<()> {
connect_failure(
Intercept::None,
tokio_postgres::config::ChannelBinding::Prefer,
)
.await
}
/// If the MITM pretends like SCRAM-PLUS isn't available, but the client supports it, it will fail
#[tokio::test]
async fn scram_auth_prefer_channel_binding_intercept() -> anyhow::Result<()> {
connect_failure(
Intercept::Methods,
tokio_postgres::config::ChannelBinding::Prefer,
)
.await
}
/// If the MITM pretends like the client doesn't support channel bindings, it will fail
#[tokio::test]
async fn scram_auth_prefer_channel_binding_intercept_response() -> anyhow::Result<()> {
connect_failure(
Intercept::SASLResponse,
tokio_postgres::config::ChannelBinding::Prefer,
)
.await
}
/// If the client chooses SCRAM-PLUS, it will fail
#[tokio::test]
async fn scram_auth_require_channel_binding() -> anyhow::Result<()> {
connect_failure(
Intercept::None,
tokio_postgres::config::ChannelBinding::Require,
)
.await
}
/// If the client requires SCRAM-PLUS, and it is spoofed to remove SCRAM-PLUS, it will fail
#[tokio::test]
async fn scram_auth_require_channel_binding_intercept() -> anyhow::Result<()> {
connect_failure(
Intercept::Methods,
tokio_postgres::config::ChannelBinding::Require,
)
.await
}
/// If the client requires SCRAM-PLUS, and it is spoofed to remove SCRAM-PLUS, it will fail
#[tokio::test]
async fn scram_auth_require_channel_binding_intercept_response() -> anyhow::Result<()> {
connect_failure(
Intercept::SASLResponse,
tokio_postgres::config::ChannelBinding::Require,
)
.await
}
async fn connect_failure(
intercept: Intercept,
channel_binding: tokio_postgres::config::ChannelBinding,
) -> anyhow::Result<()> {
let (server, client, client_config, server_config) = proxy_mitm(intercept).await;
let proxy = tokio::spawn(dummy_proxy(
client,
Some(server_config),
Scram::new("password")?,
));
let _client_err = tokio_postgres::Config::new()
.channel_binding(channel_binding)
.user("user")
.dbname("db")
.password("password")
.ssl_mode(SslMode::Require)
.connect_raw(server, client_config.make_tls_connect()?)
.await
.err()
.context("client shouldn't be able to connect")?;
let _server_err = proxy
.await?
.err()
.context("server shouldn't accept client")?;
Ok(())
}

View File

@@ -36,9 +36,9 @@ impl<'a> ChannelBinding<&'a str> {
impl<T: std::fmt::Display> ChannelBinding<T> {
/// Encode channel binding data as base64 for subsequent checks.
pub fn encode<'a, E>(
pub fn encode<E>(
&self,
get_cbind_data: impl FnOnce(&T) -> Result<&'a [u8], E>,
get_cbind_data: impl FnOnce(&T) -> Result<String, E>,
) -> Result<std::borrow::Cow<'static, str>, E> {
use ChannelBinding::*;
Ok(match self {
@@ -51,11 +51,12 @@ impl<T: std::fmt::Display> ChannelBinding<T> {
"eSws".into()
}
Required(mode) => {
use std::io::Write;
let mut cbind_input = vec![];
write!(&mut cbind_input, "p={mode},,",).unwrap();
cbind_input.extend_from_slice(get_cbind_data(mode)?);
base64::encode(&cbind_input).into()
let msg = format!(
"p={mode},,{data}",
mode = mode,
data = get_cbind_data(mode)?
);
base64::encode(msg).into()
}
})
}
@@ -76,7 +77,7 @@ mod tests {
];
for (cb, input) in cases {
assert_eq!(cb.encode(|_| anyhow::Ok(b"bar"))?, input);
assert_eq!(cb.encode(|_| anyhow::Ok("bar".to_owned()))?, input);
}
Ok(())

View File

@@ -22,12 +22,9 @@ pub use secret::ServerSecret;
use hmac::{Hmac, Mac};
use sha2::{Digest, Sha256};
const SCRAM_SHA_256: &str = "SCRAM-SHA-256";
const SCRAM_SHA_256_PLUS: &str = "SCRAM-SHA-256-PLUS";
// TODO: add SCRAM-SHA-256-PLUS
/// A list of supported SCRAM methods.
pub const METHODS: &[&str] = &[SCRAM_SHA_256_PLUS, SCRAM_SHA_256];
pub const METHODS_WITHOUT_PLUS: &[&str] = &[SCRAM_SHA_256];
pub const METHODS: &[&str] = &["SCRAM-SHA-256"];
/// Decode base64 into array without any heap allocations
fn base64_decode_array<const N: usize>(input: impl AsRef<[u8]>) -> Option<[u8; N]> {
@@ -83,11 +80,7 @@ mod tests {
const NONCE: [u8; 18] = [
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
];
let mut exchange = Exchange::new(
&secret,
|| NONCE,
crate::config::TlsServerEndPoint::Undefined,
);
let mut exchange = Exchange::new(&secret, || NONCE, None);
let client_first = "n,,n=user,r=rOprNGfwEbeRWgbNEkqO";
let client_final = "c=biws,r=rOprNGfwEbeRWgbNEkqOAQIDBAUGBwgJCgsMDQ4PEBES,p=rw1r5Kph5ThxmaUBC2GAQ6MfXbPnNkFiTIvdb/Rear0=";

View File

@@ -5,11 +5,9 @@ use super::messages::{
};
use super::secret::ServerSecret;
use super::signature::SignatureBuilder;
use crate::config;
use crate::sasl::{self, ChannelBinding, Error as SaslError};
/// The only channel binding mode we currently support.
#[derive(Debug)]
struct TlsServerEndPoint;
impl std::fmt::Display for TlsServerEndPoint {
@@ -45,20 +43,20 @@ pub struct Exchange<'a> {
state: ExchangeState,
secret: &'a ServerSecret,
nonce: fn() -> [u8; SCRAM_RAW_NONCE_LEN],
tls_server_end_point: config::TlsServerEndPoint,
cert_digest: Option<&'a [u8]>,
}
impl<'a> Exchange<'a> {
pub fn new(
secret: &'a ServerSecret,
nonce: fn() -> [u8; SCRAM_RAW_NONCE_LEN],
tls_server_end_point: config::TlsServerEndPoint,
cert_digest: Option<&'a [u8]>,
) -> Self {
Self {
state: ExchangeState::Initial,
secret,
nonce,
tls_server_end_point,
cert_digest,
}
}
}
@@ -73,14 +71,6 @@ impl sasl::Mechanism for Exchange<'_> {
let client_first_message = ClientFirstMessage::parse(input)
.ok_or(SaslError::BadClientMessage("invalid client-first-message"))?;
// If the flag is set to "y" and the server supports channel
// binding, the server MUST fail authentication
if client_first_message.cbind_flag == ChannelBinding::NotSupportedServer
&& self.tls_server_end_point.supported()
{
return Err(SaslError::ChannelBindingFailed("SCRAM-PLUS not used"));
}
let server_first_message = client_first_message.build_server_first_message(
&(self.nonce)(),
&self.secret.salt_base64,
@@ -104,11 +94,10 @@ impl sasl::Mechanism for Exchange<'_> {
let client_final_message = ClientFinalMessage::parse(input)
.ok_or(SaslError::BadClientMessage("invalid client-final-message"))?;
let channel_binding = cbind_flag.encode(|_| match &self.tls_server_end_point {
config::TlsServerEndPoint::Sha256(x) => Ok(x),
config::TlsServerEndPoint::Undefined => {
Err(SaslError::ChannelBindingFailed("no cert digest provided"))
}
let channel_binding = cbind_flag.encode(|_| {
self.cert_digest
.map(base64::encode)
.ok_or(SaslError::ChannelBindingFailed("no cert digest provided"))
})?;
// This might've been caused by a MITM attack

View File

@@ -1,8 +1,7 @@
use crate::config::TlsServerEndPoint;
use crate::error::UserFacingError;
use anyhow::bail;
use bytes::BytesMut;
use pin_project_lite::pin_project;
use pq_proto::framed::{ConnectionError, Framed};
use pq_proto::{BeMessage, FeMessage, FeStartupPacket, ProtocolError};
use rustls::ServerConfig;
@@ -18,7 +17,7 @@ use tokio_rustls::server::TlsStream;
/// or [`AsyncWrite`] to prevent subtle errors (e.g. trying
/// to pass random malformed bytes through the connection).
pub struct PqStream<S> {
pub(crate) framed: Framed<S>,
framed: Framed<S>,
}
impl<S> PqStream<S> {
@@ -119,21 +118,19 @@ impl<S: AsyncWrite + Unpin> PqStream<S> {
}
}
/// Wrapper for upgrading raw streams into secure streams.
pub enum Stream<S> {
/// We always begin with a raw stream,
/// which may then be upgraded into a secure stream.
Raw { raw: S },
Tls {
pin_project! {
/// Wrapper for upgrading raw streams into secure streams.
/// NOTE: it should be possible to decompose this object as necessary.
#[project = StreamProj]
pub enum Stream<S> {
/// We always begin with a raw stream,
/// which may then be upgraded into a secure stream.
Raw { #[pin] raw: S },
/// We box [`TlsStream`] since it can be quite large.
tls: Box<TlsStream<S>>,
/// Channel binding parameter
tls_server_end_point: TlsServerEndPoint,
},
Tls { #[pin] tls: Box<TlsStream<S>> },
}
}
impl<S: Unpin> Unpin for Stream<S> {}
impl<S> Stream<S> {
/// Construct a new instance from a raw stream.
pub fn from_raw(raw: S) -> Self {
@@ -144,17 +141,7 @@ impl<S> Stream<S> {
pub fn sni_hostname(&self) -> Option<&str> {
match self {
Stream::Raw { .. } => None,
Stream::Tls { tls, .. } => tls.get_ref().1.server_name(),
}
}
pub fn tls_server_end_point(&self) -> TlsServerEndPoint {
match self {
Stream::Raw { .. } => TlsServerEndPoint::Undefined,
Stream::Tls {
tls_server_end_point,
..
} => *tls_server_end_point,
Stream::Tls { tls } => tls.get_ref().1.server_name(),
}
}
}
@@ -171,9 +158,12 @@ pub enum StreamUpgradeError {
impl<S: AsyncRead + AsyncWrite + Unpin> Stream<S> {
/// If possible, upgrade raw stream into a secure TLS-based stream.
pub async fn upgrade(self, cfg: Arc<ServerConfig>) -> Result<TlsStream<S>, StreamUpgradeError> {
pub async fn upgrade(self, cfg: Arc<ServerConfig>) -> Result<Self, StreamUpgradeError> {
match self {
Stream::Raw { raw } => Ok(tokio_rustls::TlsAcceptor::from(cfg).accept(raw).await?),
Stream::Raw { raw } => {
let tls = Box::new(tokio_rustls::TlsAcceptor::from(cfg).accept(raw).await?);
Ok(Stream::Tls { tls })
}
Stream::Tls { .. } => Err(StreamUpgradeError::AlreadyTls),
}
}
@@ -181,46 +171,50 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Stream<S> {
impl<S: AsyncRead + AsyncWrite + Unpin> AsyncRead for Stream<S> {
fn poll_read(
mut self: Pin<&mut Self>,
self: Pin<&mut Self>,
context: &mut task::Context<'_>,
buf: &mut ReadBuf<'_>,
) -> task::Poll<io::Result<()>> {
match &mut *self {
Self::Raw { raw } => Pin::new(raw).poll_read(context, buf),
Self::Tls { tls, .. } => Pin::new(tls).poll_read(context, buf),
use StreamProj::*;
match self.project() {
Raw { raw } => raw.poll_read(context, buf),
Tls { tls } => tls.poll_read(context, buf),
}
}
}
impl<S: AsyncRead + AsyncWrite + Unpin> AsyncWrite for Stream<S> {
fn poll_write(
mut self: Pin<&mut Self>,
self: Pin<&mut Self>,
context: &mut task::Context<'_>,
buf: &[u8],
) -> task::Poll<io::Result<usize>> {
match &mut *self {
Self::Raw { raw } => Pin::new(raw).poll_write(context, buf),
Self::Tls { tls, .. } => Pin::new(tls).poll_write(context, buf),
use StreamProj::*;
match self.project() {
Raw { raw } => raw.poll_write(context, buf),
Tls { tls } => tls.poll_write(context, buf),
}
}
fn poll_flush(
mut self: Pin<&mut Self>,
self: Pin<&mut Self>,
context: &mut task::Context<'_>,
) -> task::Poll<io::Result<()>> {
match &mut *self {
Self::Raw { raw } => Pin::new(raw).poll_flush(context),
Self::Tls { tls, .. } => Pin::new(tls).poll_flush(context),
use StreamProj::*;
match self.project() {
Raw { raw } => raw.poll_flush(context),
Tls { tls } => tls.poll_flush(context),
}
}
fn poll_shutdown(
mut self: Pin<&mut Self>,
self: Pin<&mut Self>,
context: &mut task::Context<'_>,
) -> task::Poll<io::Result<()>> {
match &mut *self {
Self::Raw { raw } => Pin::new(raw).poll_shutdown(context),
Self::Tls { tls, .. } => Pin::new(tls).poll_shutdown(context),
use StreamProj::*;
match self.project() {
Raw { raw } => raw.poll_shutdown(context),
Tls { tls } => tls.poll_shutdown(context),
}
}
}

View File

@@ -1,5 +1,5 @@
[toolchain]
channel = "1.74.0"
channel = "1.73.0"
profile = "default"
# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
# https://rust-lang.github.io/rustup/concepts/profiles.html

View File

@@ -202,7 +202,6 @@ async fn main() -> anyhow::Result<()> {
logging::init(
LogFormat::from_config(&args.log_format)?,
logging::TracingErrorLayerEnablement::Disabled,
logging::Output::Stdout,
)?;
logging::replace_panic_hook_with_tracing_panic_hook().forget();
info!("version: {GIT_VERSION}");

View File

@@ -434,7 +434,6 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
logging::init(
LogFormat::from_config(&args.log_format)?,
logging::TracingErrorLayerEnablement::Disabled,
logging::Output::Stdout,
)?;
logging::replace_panic_hook_with_tracing_panic_hook().forget();
// initialize sentry if SENTRY_DSN is provided

View File

@@ -18,7 +18,6 @@ from datetime import datetime
from functools import cached_property
from itertools import chain, product
from pathlib import Path
from queue import SimpleQueue
from types import TracebackType
from typing import Any, Dict, Iterator, List, Optional, Tuple, Type, cast
from urllib.parse import urlparse
@@ -37,20 +36,12 @@ from _pytest.fixtures import FixtureRequest
from psycopg2.extensions import connection as PgConnection
from psycopg2.extensions import cursor as PgCursor
from psycopg2.extensions import make_dsn, parse_dsn
from pytest_httpserver import HTTPServer
from typing_extensions import Literal
from urllib3.util.retry import Retry
from werkzeug.wrappers.request import Request
from werkzeug.wrappers.response import Response
from fixtures.broker import NeonBroker
from fixtures.log_helper import log
from fixtures.pageserver.allowed_errors import (
DEFAULT_PAGESERVER_ALLOWED_ERRORS,
scan_pageserver_log_for_errors,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.types import IndexPartDump
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
from fixtures.pg_version import PgVersion
from fixtures.port_distributor import PortDistributor
@@ -438,6 +429,8 @@ class NeonEnvBuilder:
# Pageserver remote storage
self.pageserver_remote_storage = pageserver_remote_storage
# Extensions remote storage
self.ext_remote_storage: Optional[S3Storage] = None
# Safekeepers remote storage
self.sk_remote_storage: Optional[RemoteStorage] = None
@@ -536,6 +529,24 @@ class NeonEnvBuilder:
)
self.pageserver_remote_storage = ret
def enable_extensions_remote_storage(self, kind: RemoteStorageKind):
assert self.ext_remote_storage is None, "already configured extensions remote storage"
# there is an assumption that REAL_S3 for extensions is never
# cleaned up these are also special in that they have a hardcoded
# bucket and region, which is most likely the same as our normal
ext = self._configure_and_create_remote_storage(
kind,
RemoteStorageUser.EXTENSIONS,
bucket_name="neon-dev-extensions-eu-central-1",
bucket_region="eu-central-1",
)
assert isinstance(
ext, S3Storage
), "unsure why, but only MOCK_S3 and REAL_S3 are currently supported for extensions"
ext.cleanup = False
self.ext_remote_storage = ext
def enable_safekeeper_remote_storage(self, kind: RemoteStorageKind):
assert self.sk_remote_storage is None, "sk_remote_storage already configured"
@@ -592,7 +603,8 @@ class NeonEnvBuilder:
directory_to_clean.rmdir()
def cleanup_remote_storage(self):
for x in [self.pageserver_remote_storage, self.sk_remote_storage]:
# extensions are currently not cleaned up, disabled when creating
for x in [self.pageserver_remote_storage, self.ext_remote_storage, self.sk_remote_storage]:
if isinstance(x, S3Storage):
x.do_cleanup()
@@ -690,12 +702,12 @@ class NeonEnv:
self.port_distributor = config.port_distributor
self.s3_mock_server = config.mock_s3_server
self.neon_cli = NeonCli(env=self)
self.pagectl = Pagectl(env=self)
self.endpoints = EndpointFactory(self)
self.safekeepers: List[Safekeeper] = []
self.pageservers: List[NeonPageserver] = []
self.broker = config.broker
self.pageserver_remote_storage = config.pageserver_remote_storage
self.ext_remote_storage = config.ext_remote_storage
self.safekeepers_remote_storage = config.sk_remote_storage
self.pg_version = config.pg_version
# Binary path for pageserver, safekeeper, etc
@@ -1009,68 +1021,6 @@ def neon_env_builder(
yield builder
@pytest.fixture(scope="function")
def neon_env_and_metrics_server(
httpserver: HTTPServer,
neon_env_builder: NeonEnvBuilder,
httpserver_listen_address,
) -> Tuple[NeonEnv, HTTPServer, SimpleQueue[Any]]:
"""
Fixture to create a Neon environment and metrics server.
"""
(host, port) = httpserver_listen_address
metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events"
# this should be Union[str, Tuple[List[Any], bool]], but it will make unpacking much more verbose
uploads: SimpleQueue[Any] = SimpleQueue()
def metrics_handler(request: Request) -> Response:
if request.json is None:
return Response(status=400)
events = request.json["events"]
is_last = request.headers["pageserver-metrics-last-upload-in-batch"]
assert is_last in ["true", "false"]
uploads.put((events, is_last == "true"))
return Response(status=200)
# Require collecting metrics frequently, since we change
# the timeline and want something to be logged about it.
#
# Disable time-based pitr, we will use the manual GC calls
# to trigger remote storage operations in a controlled way
neon_env_builder.pageserver_config_override = f"""
metric_collection_interval="1s"
metric_collection_endpoint="{metric_collection_endpoint}"
cached_metric_collection_interval="0s"
synthetic_size_calculation_interval="3s"
"""
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
log.info(f"test_metric_collection endpoint is {metric_collection_endpoint}")
# mock http server that returns OK for the metrics
httpserver.expect_request("/billing/api/v1/usage_events", method="POST").respond_with_handler(
metrics_handler
)
# spin up neon, after http server is ready
env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
# httpserver is shut down before pageserver during passing run
env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*")
# we have a fast rate of calculation, these can happen at shutdown
env.pageserver.allowed_errors.append(
".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*"
)
env.pageserver.allowed_errors.append(
".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes"
)
return (env, httpserver, uploads)
@dataclass
class PageserverPort:
pg: int
@@ -1272,7 +1222,6 @@ class NeonCli(AbstractNeonCli):
self,
new_branch_name: str,
tenant_id: Optional[TenantId] = None,
timeline_id: Optional[TimelineId] = None,
) -> TimelineId:
cmd = [
"timeline",
@@ -1285,9 +1234,6 @@ class NeonCli(AbstractNeonCli):
self.env.pg_version,
]
if timeline_id is not None:
cmd.extend(["--timeline-id", str(timeline_id)])
res = self.raw_cli(cmd)
res.check_returncode()
@@ -1513,7 +1459,12 @@ class NeonCli(AbstractNeonCli):
if pageserver_id is not None:
args.extend(["--pageserver-id", str(pageserver_id)])
res = self.raw_cli(args)
storage = self.env.ext_remote_storage
s3_env_vars = None
if isinstance(storage, S3Storage):
s3_env_vars = storage.access_env_vars()
res = self.raw_cli(args, extra_env_vars=s3_env_vars)
res.check_returncode()
return res
@@ -1607,20 +1558,6 @@ class ComputeCtl(AbstractNeonCli):
COMMAND = "compute_ctl"
class Pagectl(AbstractNeonCli):
"""
A typed wrapper around the `pagectl` utility CLI tool.
"""
COMMAND = "pagectl"
def dump_index_part(self, path: Path) -> IndexPartDump:
res = self.raw_cli(["index-part", "dump", str(path)])
res.check_returncode()
parsed = json.loads(res.stdout)
return IndexPartDump.from_json(parsed)
class NeonAttachmentService:
def __init__(self, env: NeonEnv):
self.env = env
@@ -1638,7 +1575,7 @@ class NeonAttachmentService:
self.running = False
return self
def attach_hook_issue(self, tenant_id: TenantId, pageserver_id: int) -> int:
def attach_hook(self, tenant_id: TenantId, pageserver_id: int) -> int:
response = requests.post(
f"{self.env.control_plane_api}/attach-hook",
json={"tenant_id": str(tenant_id), "node_id": pageserver_id},
@@ -1648,13 +1585,6 @@ class NeonAttachmentService:
assert isinstance(gen, int)
return gen
def attach_hook_drop(self, tenant_id: TenantId):
response = requests.post(
f"{self.env.control_plane_api}/attach-hook",
json={"tenant_id": str(tenant_id), "node_id": None},
)
response.raise_for_status()
def __enter__(self) -> "NeonAttachmentService":
return self
@@ -1692,7 +1622,57 @@ class NeonPageserver(PgProtocol):
# env.pageserver.allowed_errors.append(".*could not open garage door.*")
#
# The entries in the list are regular experessions.
self.allowed_errors: List[str] = list(DEFAULT_PAGESERVER_ALLOWED_ERRORS)
self.allowed_errors = [
# All tests print these, when starting up or shutting down
".*wal receiver task finished with an error: walreceiver connection handling failure.*",
".*Shutdown task error: walreceiver connection handling failure.*",
".*wal_connection_manager.*tcp connect error: Connection refused.*",
".*query handler for .* failed: Socket IO error: Connection reset by peer.*",
".*serving compute connection task.*exited with error: Postgres connection error.*",
".*serving compute connection task.*exited with error: Connection reset by peer.*",
".*serving compute connection task.*exited with error: Postgres query error.*",
".*Connection aborted: error communicating with the server: Transport endpoint is not connected.*",
# FIXME: replication patch for tokio_postgres regards any but CopyDone/CopyData message in CopyBoth stream as unexpected
".*Connection aborted: unexpected message from server*",
".*kill_and_wait_impl.*: wait successful.*",
".*query handler for 'pagestream.*failed: Broken pipe.*", # pageserver notices compute shut down
".*query handler for 'pagestream.*failed: Connection reset by peer.*", # pageserver notices compute shut down
# safekeeper connection can fail with this, in the window between timeline creation
# and streaming start
".*Failed to process query for timeline .*: state uninitialized, no data to read.*",
# Tests related to authentication and authorization print these
".*Error processing HTTP request: Forbidden",
# intentional failpoints
".*failpoint ",
# FIXME: These need investigation
".*manual_gc.*is_shutdown_requested\\(\\) called in an unexpected task or thread.*",
".*tenant_list: timeline is not found in remote index while it is present in the tenants registry.*",
".*Removing intermediate uninit mark file.*",
# Tenant::delete_timeline() can cause any of the four following errors.
# FIXME: we shouldn't be considering it an error: https://github.com/neondatabase/neon/issues/2946
".*could not flush frozen layer.*queue is in state Stopped", # when schedule layer upload fails because queued got closed before compaction got killed
".*wait for layer upload ops to complete.*", # .*Caused by:.*wait_completion aborted because upload queue was stopped
".*gc_loop.*Gc failed, retrying in.*timeline is Stopping", # When gc checks timeline state after acquiring layer_removal_cs
".*gc_loop.*Gc failed, retrying in.*: Cannot run GC iteration on inactive tenant", # Tenant::gc precondition
".*compaction_loop.*Compaction failed.*, retrying in.*timeline or pageserver is shutting down", # When compaction checks timeline state after acquiring layer_removal_cs
".*query handler for 'pagestream.*failed: Timeline .* was not found", # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock()
".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress
".*task iteration took longer than the configured period.*",
# this is until #3501
".*Compaction failed.*, retrying in [^:]+: Cannot run compaction iteration on inactive tenant",
# these can happen anytime we do compactions from background task and shutdown pageserver
r".*ERROR.*ancestor timeline \S+ is being stopped",
# this is expected given our collaborative shutdown approach for the UploadQueue
".*Compaction failed.*, retrying in .*: queue is in state Stopped.*",
# Pageserver timeline deletion should be polled until it gets 404, so ignore it globally
".*Error processing HTTP request: NotFound: Timeline .* was not found",
".*took more than expected to complete.*",
# these can happen during shutdown, but it should not be a reason to fail a test
".*completed, took longer than expected.*",
# AWS S3 may emit 500 errors for keys in a DeleteObjects response: we retry these
# and it is not a failure of our code when it happens.
".*DeleteObjects.*We encountered an internal error. Please try again.*",
]
def timeline_dir(self, tenant_id: TenantId, timeline_id: Optional[TimelineId] = None) -> Path:
"""Get a timeline directory's path based on the repo directory of the test environment"""
@@ -1802,9 +1782,27 @@ class NeonPageserver(PgProtocol):
def assert_no_errors(self):
logfile = open(os.path.join(self.workdir, "pageserver.log"), "r")
errors = scan_pageserver_log_for_errors(logfile, self.allowed_errors)
error_or_warn = re.compile(r"\s(ERROR|WARN)")
errors = []
while True:
line = logfile.readline()
if not line:
break
for _lineno, error in errors:
if error_or_warn.search(line):
# Is this a torn log line? This happens when force-killing a process and restarting
# Example: "2023-10-25T09:38:31.752314Z WARN deletion executo2023-10-25T09:38:31.875947Z INFO version: git-env:0f9452f76e8ccdfc88291bccb3f53e3016f40192"
if re.match("\\d{4}-\\d{2}-\\d{2}T.+\\d{4}-\\d{2}-\\d{2}T.+INFO version.+", line):
continue
# It's an ERROR or WARN. Is it in the allow-list?
for a in self.allowed_errors:
if re.match(a, line):
break
else:
errors.append(line)
for error in errors:
log.info(f"not allowed error: {error.strip()}")
assert not errors
@@ -1854,20 +1852,13 @@ class NeonPageserver(PgProtocol):
to call into the pageserver HTTP client.
"""
if self.env.attachment_service is not None:
generation = self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
generation = self.env.attachment_service.attach_hook(tenant_id, self.id)
else:
generation = None
client = self.http_client()
return client.tenant_attach(tenant_id, config, config_null, generation=generation)
def tenant_detach(self, tenant_id: TenantId):
if self.env.attachment_service is not None:
self.env.attachment_service.attach_hook_drop(tenant_id)
client = self.http_client()
return client.tenant_detach(tenant_id)
def append_pageserver_param_overrides(
params_to_update: List[str],
@@ -2635,17 +2626,6 @@ class Endpoint(PgProtocol):
with open(config_path, "w") as file:
json.dump(dict(data_dict, **kwargs), file, indent=4)
# Mock the extension part of spec passed from control plane for local testing
# endpooint.rs adds content of this file as a part of the spec.json
def create_remote_extension_spec(self, spec: dict[str, Any]):
"""Create a remote extension spec file for the endpoint."""
remote_extensions_spec_path = os.path.join(
self.endpoint_path(), "remote_extensions_spec.json"
)
with open(remote_extensions_spec_path, "w") as file:
json.dump(spec, file, indent=4)
def stop(self) -> "Endpoint":
"""
Stop the Postgres instance if it's running.

View File

@@ -1,116 +0,0 @@
#! /usr/bin/env python3
import argparse
import re
import sys
from typing import Iterable, List, Tuple
def scan_pageserver_log_for_errors(
input: Iterable[str], allowed_errors: List[str]
) -> List[Tuple[int, str]]:
error_or_warn = re.compile(r"\s(ERROR|WARN)")
errors = []
for lineno, line in enumerate(input, start=1):
if len(line) == 0:
continue
if error_or_warn.search(line):
# Is this a torn log line? This happens when force-killing a process and restarting
# Example: "2023-10-25T09:38:31.752314Z WARN deletion executo2023-10-25T09:38:31.875947Z INFO version: git-env:0f9452f76e8ccdfc88291bccb3f53e3016f40192"
if re.match("\\d{4}-\\d{2}-\\d{2}T.+\\d{4}-\\d{2}-\\d{2}T.+INFO version.+", line):
continue
# It's an ERROR or WARN. Is it in the allow-list?
for a in allowed_errors:
if re.match(a, line):
break
else:
errors.append((lineno, line))
return errors
DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
# All tests print these, when starting up or shutting down
".*wal receiver task finished with an error: walreceiver connection handling failure.*",
".*Shutdown task error: walreceiver connection handling failure.*",
".*wal_connection_manager.*tcp connect error: Connection refused.*",
".*query handler for .* failed: Socket IO error: Connection reset by peer.*",
".*serving compute connection task.*exited with error: Postgres connection error.*",
".*serving compute connection task.*exited with error: Connection reset by peer.*",
".*serving compute connection task.*exited with error: Postgres query error.*",
".*Connection aborted: error communicating with the server: Transport endpoint is not connected.*",
# FIXME: replication patch for tokio_postgres regards any but CopyDone/CopyData message in CopyBoth stream as unexpected
".*Connection aborted: unexpected message from server*",
".*kill_and_wait_impl.*: wait successful.*",
".*query handler for 'pagestream.*failed: Broken pipe.*", # pageserver notices compute shut down
".*query handler for 'pagestream.*failed: Connection reset by peer.*", # pageserver notices compute shut down
# safekeeper connection can fail with this, in the window between timeline creation
# and streaming start
".*Failed to process query for timeline .*: state uninitialized, no data to read.*",
# Tests related to authentication and authorization print these
".*Error processing HTTP request: Forbidden",
# intentional failpoints
".*failpoint ",
# FIXME: These need investigation
".*manual_gc.*is_shutdown_requested\\(\\) called in an unexpected task or thread.*",
".*tenant_list: timeline is not found in remote index while it is present in the tenants registry.*",
".*Removing intermediate uninit mark file.*",
# Tenant::delete_timeline() can cause any of the four following errors.
# FIXME: we shouldn't be considering it an error: https://github.com/neondatabase/neon/issues/2946
".*could not flush frozen layer.*queue is in state Stopped", # when schedule layer upload fails because queued got closed before compaction got killed
".*wait for layer upload ops to complete.*", # .*Caused by:.*wait_completion aborted because upload queue was stopped
".*gc_loop.*Gc failed, retrying in.*timeline is Stopping", # When gc checks timeline state after acquiring layer_removal_cs
".*gc_loop.*Gc failed, retrying in.*: Cannot run GC iteration on inactive tenant", # Tenant::gc precondition
".*compaction_loop.*Compaction failed.*, retrying in.*timeline or pageserver is shutting down", # When compaction checks timeline state after acquiring layer_removal_cs
".*query handler for 'pagestream.*failed: Timeline .* was not found", # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock()
".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress
".*task iteration took longer than the configured period.*",
# these can happen anytime we do compactions from background task and shutdown pageserver
r".*ERROR.*ancestor timeline \S+ is being stopped",
# this is expected given our collaborative shutdown approach for the UploadQueue
".*Compaction failed.*, retrying in .*: Other\\(queue is in state Stopped.*",
".*Compaction failed.*, retrying in .*: ShuttingDown",
# Pageserver timeline deletion should be polled until it gets 404, so ignore it globally
".*Error processing HTTP request: NotFound: Timeline .* was not found",
".*took more than expected to complete.*",
# these can happen during shutdown, but it should not be a reason to fail a test
".*completed, took longer than expected.*",
# AWS S3 may emit 500 errors for keys in a DeleteObjects response: we retry these
# and it is not a failure of our code when it happens.
".*DeleteObjects.*We encountered an internal error. Please try again.*",
)
def _check_allowed_errors(input):
allowed_errors: List[str] = list(DEFAULT_PAGESERVER_ALLOWED_ERRORS)
# add any test specifics here; cli parsing is not provided for the
# difficulty of copypasting regexes as arguments without any quoting
# errors.
errors = scan_pageserver_log_for_errors(input, allowed_errors)
for lineno, error in errors:
print(f"-:{lineno}: {error.strip()}", file=sys.stderr)
print(f"\n{len(errors)} not allowed errors", file=sys.stderr)
return errors
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="check input against pageserver global allowed_errors"
)
parser.add_argument(
"-i",
"--input",
type=argparse.FileType("r"),
default=sys.stdin,
help="Pageserver logs file. Reads from stdin if no file is provided.",
)
args = parser.parse_args()
errors = _check_allowed_errors(args.input)
sys.exit(len(errors) > 0)

View File

@@ -432,18 +432,12 @@ class PageserverHttpClient(requests.Session):
assert isinstance(res_json, dict)
return res_json
def timeline_compact(
self, tenant_id: TenantId, timeline_id: TimelineId, force_repartition=False
):
def timeline_compact(self, tenant_id: TenantId, timeline_id: TimelineId):
self.is_testing_enabled_or_skip()
query = {}
if force_repartition:
query["force_repartition"] = "true"
log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}")
res = self.put(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/compact",
params=query,
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/compact"
)
log.info(f"Got compact request response code: {res.status_code}")
self.verbose_error(res)
@@ -472,18 +466,12 @@ class PageserverHttpClient(requests.Session):
res_json = res.json()
return res_json
def timeline_checkpoint(
self, tenant_id: TenantId, timeline_id: TimelineId, force_repartition=False
):
def timeline_checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId):
self.is_testing_enabled_or_skip()
query = {}
if force_repartition:
query["force_repartition"] = "true"
log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}")
res = self.put(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint",
params=query,
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint"
)
log.info(f"Got checkpoint request response code: {res.status_code}")
self.verbose_error(res)

View File

@@ -1,146 +0,0 @@
from dataclasses import dataclass
from typing import Any, Dict, Tuple, Union
from fixtures.types import KEY_MAX, KEY_MIN, Key, Lsn
@dataclass
class IndexLayerMetadata:
@classmethod
def from_json(cls, d: Dict[str, Any]):
return {}
@dataclass(frozen=True)
class ImageLayerFileName:
lsn: Lsn
key_start: Key
key_end: Key
def to_str(self):
ret = (
f"{self.key_start.as_int():036X}-{self.key_end.as_int():036X}__{self.lsn.as_int():016X}"
)
assert self == parse_layer_file_name(ret)
return ret
@dataclass(frozen=True)
class DeltaLayerFileName:
lsn_start: Lsn
lsn_end: Lsn
key_start: Key
key_end: Key
def is_l0(self):
return self.key_start == KEY_MIN and self.key_end == KEY_MAX
def to_str(self):
ret = f"{self.key_start.as_int():036X}-{self.key_end.as_int():036X}__{self.lsn_start.as_int():016X}-{self.lsn_end.as_int():016X}"
assert self == parse_layer_file_name(ret)
return ret
LayerFileName = Union[ImageLayerFileName, DeltaLayerFileName]
class InvalidFileName(Exception):
pass
def parse_image_layer(f_name: str) -> Tuple[int, int, int]:
"""Parse an image layer file name. Return key start, key end, and snapshot lsn"""
parts = f_name.split("__")
if len(parts) != 2:
raise InvalidFileName(f"expecting two parts separated by '__', got: {parts}")
key_parts = parts[0].split("-")
if len(key_parts) != 2:
raise InvalidFileName(
f"expecting two key parts separated by '--' in parts[0], got: {key_parts}"
)
try:
return int(key_parts[0], 16), int(key_parts[1], 16), int(parts[1], 16)
except ValueError as e:
raise InvalidFileName(f"conversion error: {f_name}") from e
def parse_delta_layer(f_name: str) -> Tuple[int, int, int, int]:
"""Parse a delta layer file name. Return key start, key end, lsn start, and lsn end"""
parts = f_name.split("__")
if len(parts) != 2:
raise InvalidFileName(f"expecting two parts separated by '__', got: {parts}")
key_parts = parts[0].split("-")
if len(key_parts) != 2:
raise InvalidFileName(
f"expecting two key parts separated by '--' in parts[0], got: {key_parts}"
)
lsn_parts = parts[1].split("-")
if len(lsn_parts) != 2:
raise InvalidFileName(
f"expecting two lsn parts separated by '--' in parts[1], got: {lsn_parts}"
)
try:
return (
int(key_parts[0], 16),
int(key_parts[1], 16),
int(lsn_parts[0], 16),
int(lsn_parts[1], 16),
)
except ValueError as e:
raise InvalidFileName(f"conversion error: {f_name}") from e
def parse_layer_file_name(file_name: str) -> LayerFileName:
try:
key_start, key_end, lsn = parse_image_layer(file_name)
return ImageLayerFileName(lsn=Lsn(lsn), key_start=Key(key_start), key_end=Key(key_end))
except InvalidFileName:
pass
try:
key_start, key_end, lsn_start, lsn_end = parse_delta_layer(file_name)
return DeltaLayerFileName(
lsn_start=Lsn(lsn_start),
lsn_end=Lsn(lsn_end),
key_start=Key(key_start),
key_end=Key(key_end),
)
except InvalidFileName:
pass
raise ValueError()
def is_future_layer(layer_file_name: LayerFileName, disk_consistent_lsn: Lsn):
"""
Determines if this layer file is considered to be in future meaning we will discard these
layers during timeline initialization from the given disk_consistent_lsn.
"""
if (
isinstance(layer_file_name, ImageLayerFileName)
and layer_file_name.lsn > disk_consistent_lsn
):
return True
elif (
isinstance(layer_file_name, DeltaLayerFileName)
and layer_file_name.lsn_end > disk_consistent_lsn + 1
):
return True
else:
return False
@dataclass
class IndexPartDump:
layer_metadata: Dict[LayerFileName, IndexLayerMetadata]
disk_consistent_lsn: Lsn
@classmethod
def from_json(cls, d: Dict[str, Any]) -> "IndexPartDump":
return IndexPartDump(
layer_metadata={
parse_layer_file_name(n): IndexLayerMetadata.from_json(v)
for n, v in d["layer_metadata"].items()
},
disk_consistent_lsn=Lsn(d["disk_consistent_lsn"]),
)

View File

@@ -12,7 +12,6 @@ import boto3
from mypy_boto3_s3 import S3Client
from fixtures.log_helper import log
from fixtures.pageserver.types import LayerFileName
from fixtures.types import TenantId, TimelineId
TIMELINE_INDEX_PART_FILE_NAME = "index_part.json"
@@ -88,11 +87,6 @@ class LocalFsStorage:
def timeline_path(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path:
return self.tenant_path(tenant_id) / "timelines" / str(timeline_id)
def layer_path(
self, tenant_id: TenantId, timeline_id: TimelineId, layer_file_name: LayerFileName
):
return self.timeline_path(tenant_id, timeline_id) / layer_file_name.to_str()
def index_path(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path:
return self.timeline_path(tenant_id, timeline_id) / TIMELINE_INDEX_PART_FILE_NAME

View File

@@ -1,5 +1,4 @@
import random
from dataclasses import dataclass
from functools import total_ordering
from typing import Any, Type, TypeVar, Union
@@ -37,11 +36,6 @@ class Lsn:
return NotImplemented
return self.lsn_int < other.lsn_int
def __gt__(self, other: Any) -> bool:
if not isinstance(other, Lsn):
raise NotImplementedError
return self.lsn_int > other.lsn_int
def __eq__(self, other: Any) -> bool:
if not isinstance(other, Lsn):
return NotImplemented
@@ -53,32 +47,9 @@ class Lsn:
return NotImplemented
return self.lsn_int - other.lsn_int
def __add__(self, other: Union[int, "Lsn"]) -> "Lsn":
if isinstance(other, int):
return Lsn(self.lsn_int + other)
elif isinstance(other, Lsn):
return Lsn(self.lsn_int + other.lsn_int)
else:
raise NotImplementedError
def __hash__(self) -> int:
return hash(self.lsn_int)
def as_int(self) -> int:
return self.lsn_int
@dataclass(frozen=True)
class Key:
key_int: int
def as_int(self) -> int:
return self.key_int
KEY_MAX = Key(0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF)
KEY_MIN = Key(0)
@total_ordering
class Id:

View File

@@ -6,16 +6,7 @@ import subprocess
import threading
import time
from pathlib import Path
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
List,
Optional,
Tuple,
TypeVar,
)
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, TypeVar
from urllib.parse import urlencode
import allure
@@ -23,10 +14,6 @@ import zstandard
from psycopg2.extensions import cursor
from fixtures.log_helper import log
from fixtures.pageserver.types import (
parse_delta_layer,
parse_image_layer,
)
if TYPE_CHECKING:
from fixtures.neon_fixtures import PgBin
@@ -206,6 +193,26 @@ def get_timeline_dir_size(path: Path) -> int:
return sz
def parse_image_layer(f_name: str) -> Tuple[int, int, int]:
"""Parse an image layer file name. Return key start, key end, and snapshot lsn"""
parts = f_name.split("__")
key_parts = parts[0].split("-")
return int(key_parts[0], 16), int(key_parts[1], 16), int(parts[1], 16)
def parse_delta_layer(f_name: str) -> Tuple[int, int, int, int]:
"""Parse a delta layer file name. Return key start, key end, lsn start, and lsn end"""
parts = f_name.split("__")
key_parts = parts[0].split("-")
lsn_parts = parts[1].split("-")
return (
int(key_parts[0], 16),
int(key_parts[1], 16),
int(lsn_parts[0], 16),
int(lsn_parts[1], 16),
)
def get_scale_for_db(size_mb: int) -> int:
"""Returns pgbench scale factor for given target db size in MB.

View File

@@ -24,6 +24,8 @@ def check_backpressure(endpoint: Endpoint, stop_event: threading.Event, polling_
log.info("checks started")
with pg_cur(endpoint) as cur:
cur.execute("CREATE EXTENSION neon") # TODO move it to neon_fixtures?
cur.execute("select pg_size_bytes(current_setting('max_replication_write_lag'))")
res = cur.fetchone()
max_replication_write_lag_bytes = res[0]
@@ -100,13 +102,9 @@ def test_backpressure_received_lsn_lag(neon_env_builder: NeonEnvBuilder):
# Create a branch for us
env.neon_cli.create_branch("test_backpressure")
endpoint = env.endpoints.create(
endpoint = env.endpoints.create_start(
"test_backpressure", config_lines=["max_replication_write_lag=30MB"]
)
# don't skip pg_catalog updates - it runs CREATE EXTENSION neon
# which is needed for backpressure_lsns() to work
endpoint.respec(skip_pg_catalog_updates=False)
endpoint.start()
log.info("postgres is running on 'test_backpressure' branch")
# setup check thread

View File

@@ -46,10 +46,7 @@ from fixtures.utils import query_scalar
# Because the delta layer D covering lsn1 is corrupted, creating a branch
# starting from lsn1 should return an error as follows:
# could not find data for key ... at LSN ..., for request at LSN ...
def test_branch_and_gc(neon_simple_env: NeonEnv, build_type: str):
if build_type == "debug":
pytest.skip("times out in debug builds")
def test_branch_and_gc(neon_simple_env: NeonEnv):
env = neon_simple_env
pageserver_http_client = env.pageserver.http_client()

View File

@@ -114,7 +114,6 @@ def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder)
[
".*Failed to process timeline dir contents.*Timeline has no ancestor and no layer files.*",
".*Timeline got dropped without initializing, cleaning its files.*",
".*Failed to load index_part from remote storage, failed creation?.*",
]
)
@@ -144,58 +143,6 @@ def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder)
), "pageserver should clean its temp timeline files on timeline creation failure"
def test_timeline_init_break_before_checkpoint_recreate(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
env.pageserver.allowed_errors.extend(
[
".*Failed to process timeline dir contents.*Timeline has no ancestor and no layer files.*",
".*Timeline got dropped without initializing, cleaning its files.*",
".*Failed to load index_part from remote storage, failed creation?.*",
]
)
tenant_id = env.initial_tenant
timelines_dir = env.pageserver.timeline_dir(tenant_id)
old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
initial_timeline_dirs = [d for d in timelines_dir.iterdir()]
# Some fixed timeline ID (like control plane does)
timeline_id = TimelineId("1080243c1f76fe3c5147266663c9860b")
# Introduce failpoint during timeline init (some intermediate files are on disk), before it's checkpointed.
pageserver_http.configure_failpoints(("before-checkpoint-new-timeline", "return"))
with pytest.raises(Exception, match="before-checkpoint-new-timeline"):
_ = env.neon_cli.create_timeline(
"test_timeline_init_break_before_checkpoint", tenant_id, timeline_id
)
# Restart the page server
env.pageserver.restart(immediate=True)
# Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally.
new_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
assert (
new_tenant_timelines == old_tenant_timelines
), f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}"
timeline_dirs = [d for d in timelines_dir.iterdir()]
assert (
timeline_dirs == initial_timeline_dirs
), "pageserver should clean its temp timeline files on timeline creation failure"
# Disable the failpoint again
pageserver_http.configure_failpoints(("before-checkpoint-new-timeline", "off"))
# creating the branch should have worked now
new_timeline_id = env.neon_cli.create_timeline(
"test_timeline_init_break_before_checkpoint", tenant_id, timeline_id
)
assert timeline_id == new_timeline_id
def test_timeline_create_break_after_uninit_mark(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()

View File

@@ -411,6 +411,7 @@ def check_neon_works(
config.initial_tenant = snapshot_config["default_tenant_id"]
config.pg_distrib_dir = pg_distrib_dir
config.remote_storage = None
config.ext_remote_storage = None
config.sk_remote_storage = None
# Use the "target" binaries to launch the storage nodes

View File

@@ -245,19 +245,6 @@ def test_ddl_forwarding(ddl: DdlForwardingContext):
raise AssertionError("Could not count databases")
assert result[0] == 0, "Database 'failure' still exists after drop"
# We don't have compute_ctl, so here, so create neon_superuser here manually
cur.execute("CREATE ROLE neon_superuser NOLOGIN CREATEDB CREATEROLE")
with pytest.raises(psycopg2.InternalError):
cur.execute("ALTER ROLE neon_superuser LOGIN")
with pytest.raises(psycopg2.InternalError):
cur.execute("CREATE DATABASE trololobus WITH OWNER neon_superuser")
cur.execute("CREATE DATABASE trololobus")
with pytest.raises(psycopg2.InternalError):
cur.execute("ALTER DATABASE trololobus OWNER TO neon_superuser")
conn.close()

View File

@@ -1,137 +1,316 @@
import os
import shutil
import threading
from contextlib import closing
from pathlib import Path
from typing import Any, Dict
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
)
from fixtures.pg_version import PgVersion
from pytest_httpserver import HTTPServer
from werkzeug.wrappers.request import Request
from werkzeug.wrappers.response import Response
from fixtures.pg_version import PgVersion, skip_on_postgres
from fixtures.remote_storage import (
RemoteStorageKind,
S3Storage,
available_s3_storages,
)
# use neon_env_builder_local fixture to override the default neon_env_builder fixture
# and use a test-specific pg_install instead of shared one
@pytest.fixture(scope="function")
def neon_env_builder_local(
neon_env_builder: NeonEnvBuilder,
test_output_dir: Path,
pg_distrib_dir: Path,
pg_version: PgVersion,
) -> NeonEnvBuilder:
test_local_pginstall = test_output_dir / "pg_install"
log.info(f"copy {pg_distrib_dir} to {test_local_pginstall}")
shutil.copytree(
pg_distrib_dir / pg_version.v_prefixed, test_local_pginstall / pg_version.v_prefixed
)
# Cleaning up downloaded files is important for local tests
# or else one test could reuse the files from another test or another test run
def cleanup(pg_version):
PGDIR = Path(f"pg_install/v{pg_version}")
neon_env_builder.pg_distrib_dir = test_local_pginstall
log.info(f"local neon_env_builder.pg_distrib_dir: {neon_env_builder.pg_distrib_dir}")
LIB_DIR = PGDIR / Path("lib/postgresql")
cleanup_lib_globs = ["anon*", "postgis*", "pg_buffercache*"]
cleanup_lib_glob_paths = [LIB_DIR.glob(x) for x in cleanup_lib_globs]
return neon_env_builder
SHARE_DIR = PGDIR / Path("share/postgresql/extension")
cleanup_ext_globs = [
"anon*",
"address_standardizer*",
"postgis*",
"pageinspect*",
"pg_buffercache*",
"pgrouting*",
]
cleanup_ext_glob_paths = [SHARE_DIR.glob(x) for x in cleanup_ext_globs]
all_glob_paths = cleanup_lib_glob_paths + cleanup_ext_glob_paths
all_cleanup_files = []
for file_glob in all_glob_paths:
for file in file_glob:
all_cleanup_files.append(file)
for file in all_cleanup_files:
try:
os.remove(file)
log.info(f"removed file {file}")
except Exception as err:
log.info(
f"skipping remove of file {file} because it doesn't exist.\
this may be expected or unexpected depending on the test {err}"
)
cleanup_folders = [SHARE_DIR / Path("anon"), PGDIR / Path("download_extensions")]
for folder in cleanup_folders:
try:
shutil.rmtree(folder)
log.info(f"removed folder {folder}")
except Exception as err:
log.info(
f"skipping remove of folder {folder} because it doesn't exist.\
this may be expected or unexpected depending on the test {err}"
)
def upload_files(env):
log.info("Uploading test files to mock bucket")
os.chdir("test_runner/regress/data/extension_test")
for path in os.walk("."):
prefix, _, files = path
for file in files:
# the [2:] is to remove the leading "./"
full_path = os.path.join(prefix, file)[2:]
with open(full_path, "rb") as f:
log.info(f"UPLOAD {full_path} to ext/{full_path}")
assert isinstance(env.pageserver_remote_storage, S3Storage)
env.pageserver_remote_storage.client.upload_fileobj(
f,
env.ext_remote_storage.bucket_name,
f"ext/{full_path}",
)
os.chdir("../../../..")
# Test downloading remote extension.
@skip_on_postgres(PgVersion.V16, reason="TODO: PG16 extension building")
@pytest.mark.parametrize("remote_storage_kind", available_s3_storages())
@pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949")
def test_remote_extensions(
httpserver: HTTPServer,
neon_env_builder_local: NeonEnvBuilder,
httpserver_listen_address,
pg_version,
neon_env_builder: NeonEnvBuilder,
remote_storage_kind: RemoteStorageKind,
pg_version: PgVersion,
):
if pg_version == PgVersion.V16:
pytest.skip("TODO: PG16 extension building")
neon_env_builder.enable_extensions_remote_storage(remote_storage_kind)
env = neon_env_builder.init_start()
tenant_id, _ = env.neon_cli.create_tenant()
env.neon_cli.create_timeline("test_remote_extensions", tenant_id=tenant_id)
# setup mock http server
# that expects request for anon.tar.zst
# and returns the requested file
(host, port) = httpserver_listen_address
extensions_endpoint = f"http://{host}:{port}/pg-ext-s3-gateway"
assert env.ext_remote_storage is not None # satisfy mypy
build_tag = os.environ.get("BUILD_TAG", "latest")
archive_path = f"{build_tag}/v{pg_version}/extensions/anon.tar.zst"
# For MOCK_S3 we upload test files.
# For REAL_S3 we use the files already in the bucket
if remote_storage_kind == RemoteStorageKind.MOCK_S3:
upload_files(env)
def endpoint_handler_build_tag(request: Request) -> Response:
log.info(f"request: {request}")
file_name = "anon.tar.zst"
file_path = f"test_runner/regress/data/extension_test/5670669815/v{pg_version}/extensions/anon.tar.zst"
file_size = os.path.getsize(file_path)
fh = open(file_path, "rb")
return Response(
fh,
mimetype="application/octet-stream",
headers=[
("Content-Length", str(file_size)),
("Content-Disposition", 'attachment; filename="%s"' % file_name),
],
direct_passthrough=True,
)
httpserver.expect_request(
f"/pg-ext-s3-gateway/{archive_path}", method="GET"
).respond_with_handler(endpoint_handler_build_tag)
# Start a compute node with remote_extension spec
# and check that it can download the extensions and use them to CREATE EXTENSION.
env = neon_env_builder_local.init_start()
env.neon_cli.create_branch("test_remote_extensions")
endpoint = env.endpoints.create(
# Start a compute node and check that it can download the extensions
# and use them to CREATE EXTENSION and LOAD
endpoint = env.endpoints.create_start(
"test_remote_extensions",
config_lines=["log_min_messages=debug3"],
tenant_id=tenant_id,
remote_ext_config=env.ext_remote_storage.to_string(),
# config_lines=["log_min_messages=debug3"],
)
# mock remote_extensions spec
spec: Dict[str, Any] = {
"library_index": {
"anon": "anon",
},
"extension_data": {
"anon": {
"archive_path": "",
"control_data": {
"anon.control": "# PostgreSQL Anonymizer (anon) extension\ncomment = 'Data anonymization tools'\ndefault_version = '1.1.0'\ndirectory='extension/anon'\nrelocatable = false\nrequires = 'pgcrypto'\nsuperuser = false\nmodule_pathname = '$libdir/anon'\ntrusted = true\n"
},
},
},
}
spec["extension_data"]["anon"]["archive_path"] = archive_path
endpoint.create_remote_extension_spec(spec)
endpoint.start(
remote_ext_config=extensions_endpoint,
)
# this is expected to fail if there's no pgcrypto extension, that's ok
# we just want to check that the extension was downloaded
try:
with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
# Check that appropriate files were downloaded
cur.execute("CREATE EXTENSION anon")
res = [x[0] for x in cur.fetchall()]
log.info(res)
except Exception as err:
assert "pgcrypto" in str(err), f"unexpected error creating anon extension {err}"
# Check that appropriate control files were downloaded
cur.execute("SELECT * FROM pg_available_extensions")
all_extensions = [x[0] for x in cur.fetchall()]
log.info(all_extensions)
assert "anon" in all_extensions
httpserver.check()
# postgis is on real s3 but not mock s3.
# it's kind of a big file, would rather not upload to github
if remote_storage_kind == RemoteStorageKind.REAL_S3:
assert "postgis" in all_extensions
# this may fail locally if dependency is missing
# we don't really care about the error,
# we just want to make sure it downloaded
try:
cur.execute("CREATE EXTENSION postgis")
except Exception as err:
log.info(f"(expected) error creating postgis extension: {err}")
# we do not check the error, so this is basically a NO-OP
# however checking the log you can make sure that it worked
# and also get valuable information about how long loading the extension took
# this is expected to fail on my computer because I don't have the pgcrypto extension
try:
cur.execute("CREATE EXTENSION anon")
except Exception as err:
log.info("error creating anon extension")
assert "pgcrypto" in str(err), "unexpected error creating anon extension"
finally:
cleanup(pg_version)
# TODO
# 1. Test downloading remote library.
#
# 2. Test a complex extension, which has multiple extensions in one archive
# Test downloading remote library.
@skip_on_postgres(PgVersion.V16, reason="TODO: PG16 extension building")
@pytest.mark.parametrize("remote_storage_kind", available_s3_storages())
@pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949")
def test_remote_library(
neon_env_builder: NeonEnvBuilder,
remote_storage_kind: RemoteStorageKind,
pg_version: PgVersion,
):
neon_env_builder.enable_extensions_remote_storage(remote_storage_kind)
env = neon_env_builder.init_start()
tenant_id, _ = env.neon_cli.create_tenant()
env.neon_cli.create_timeline("test_remote_library", tenant_id=tenant_id)
assert env.ext_remote_storage is not None # satisfy mypy
# For MOCK_S3 we upload test files.
# For REAL_S3 we use the files already in the bucket
if remote_storage_kind == RemoteStorageKind.MOCK_S3:
upload_files(env)
# and use them to run LOAD library
endpoint = env.endpoints.create_start(
"test_remote_library",
tenant_id=tenant_id,
remote_ext_config=env.ext_remote_storage.to_string(),
# config_lines=["log_min_messages=debug3"],
)
try:
with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
# try to load library
try:
cur.execute("LOAD 'anon'")
except Exception as err:
log.info(f"error loading anon library: {err}")
raise AssertionError("unexpected error loading anon library") from err
# test library which name is different from extension name
# this may fail locally if dependency is missing
# however, it does successfully download the postgis archive
if remote_storage_kind == RemoteStorageKind.REAL_S3:
try:
cur.execute("LOAD 'postgis_topology-3'")
except Exception as err:
log.info("error loading postgis_topology-3")
assert "No such file or directory" in str(
err
), "unexpected error loading postgis_topology-3"
finally:
cleanup(pg_version)
# Here we test a complex extension
# which has multiple extensions in one archive
# using postgis as an example
#
# 3.Test that extension is downloaded after endpoint restart,
# @pytest.mark.skipif(
# RemoteStorageKind.REAL_S3 not in available_s3_storages(),
# reason="skipping test because real s3 not enabled",
# )
@skip_on_postgres(PgVersion.V16, reason="TODO: PG16 extension building")
@pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949")
def test_multiple_extensions_one_archive(
neon_env_builder: NeonEnvBuilder,
pg_version: PgVersion,
):
neon_env_builder.enable_extensions_remote_storage(RemoteStorageKind.REAL_S3)
env = neon_env_builder.init_start()
tenant_id, _ = env.neon_cli.create_tenant()
env.neon_cli.create_timeline("test_multiple_extensions_one_archive", tenant_id=tenant_id)
assert env.ext_remote_storage is not None # satisfy mypy
endpoint = env.endpoints.create_start(
"test_multiple_extensions_one_archive",
tenant_id=tenant_id,
remote_ext_config=env.ext_remote_storage.to_string(),
)
with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
cur.execute("CREATE EXTENSION address_standardizer;")
cur.execute("CREATE EXTENSION address_standardizer_data_us;")
# execute query to ensure that it works
cur.execute(
"SELECT house_num, name, suftype, city, country, state, unit \
FROM standardize_address('us_lex', 'us_gaz', 'us_rules', \
'One Rust Place, Boston, MA 02109');"
)
res = cur.fetchall()
log.info(res)
assert len(res) > 0
cleanup(pg_version)
# Test that extension is downloaded after endpoint restart,
# when the library is used in the query.
#
# Run the test with mutliple simultaneous connections to an endpoint.
# to ensure that the extension is downloaded only once.
#
# 4. Test that private extensions are only downloaded when they are present in the spec.
#
@pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949")
def test_extension_download_after_restart(
neon_env_builder: NeonEnvBuilder,
pg_version: PgVersion,
):
# TODO: PG15 + PG16 extension building
if "v14" not in pg_version: # test set only has extension built for v14
return None
neon_env_builder.enable_extensions_remote_storage(RemoteStorageKind.MOCK_S3)
env = neon_env_builder.init_start()
tenant_id, _ = env.neon_cli.create_tenant()
env.neon_cli.create_timeline("test_extension_download_after_restart", tenant_id=tenant_id)
assert env.ext_remote_storage is not None # satisfy mypy
# For MOCK_S3 we upload test files.
upload_files(env)
endpoint = env.endpoints.create_start(
"test_extension_download_after_restart",
tenant_id=tenant_id,
remote_ext_config=env.ext_remote_storage.to_string(),
config_lines=["log_min_messages=debug3"],
)
with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
cur.execute("CREATE extension pg_buffercache;")
cur.execute("SELECT * from pg_buffercache;")
res = cur.fetchall()
assert len(res) > 0
log.info(res)
# shutdown compute node
endpoint.stop()
# remove extension files locally
cleanup(pg_version)
# spin up compute node again (there are no extension files available, because compute is stateless)
endpoint = env.endpoints.create_start(
"test_extension_download_after_restart",
tenant_id=tenant_id,
remote_ext_config=env.ext_remote_storage.to_string(),
config_lines=["log_min_messages=debug3"],
)
# connect to compute node and run the query
# that will trigger the download of the extension
def run_query(endpoint, thread_id: int):
log.info("thread_id {%d} starting", thread_id)
with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
cur.execute("SELECT * from pg_buffercache;")
res = cur.fetchall()
assert len(res) > 0
log.info("thread_id {%d}, res = %s", thread_id, res)
threads = [threading.Thread(target=run_query, args=(endpoint, i)) for i in range(2)]
for thread in threads:
thread.start()
for thread in threads:
thread.join()
cleanup(pg_version)

View File

@@ -1,6 +1,5 @@
import time
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
@@ -16,11 +15,7 @@ from fixtures.utils import query_scalar
# and then download them back.
def test_basic_eviction(
neon_env_builder: NeonEnvBuilder,
build_type: str,
):
if build_type == "debug":
pytest.skip("times out in debug builds")
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
env = neon_env_builder.init_start(

View File

@@ -1,222 +0,0 @@
import time
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.pageserver.types import (
DeltaLayerFileName,
ImageLayerFileName,
is_future_layer,
)
from fixtures.pageserver.utils import (
wait_for_last_record_lsn,
wait_for_upload_queue_empty,
wait_until_tenant_active,
)
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
from fixtures.types import Lsn
from fixtures.utils import query_scalar, wait_until
def test_issue_5878(neon_env_builder: NeonEnvBuilder):
"""
Regression test for issue https://github.com/neondatabase/neon/issues/5878 .
Create a situation where IndexPart contains an image layer from a future
(i.e., image layer > IndexPart::disk_consistent_lsn).
Detach.
Attach.
Wait for tenant to finish load_layer_map (by waiting for it to become active).
Wait for any remote timeline client ops to finish that the attach started.
Integrity-check the index part.
Before fixing the issue, load_layer_map would schedule removal of the future
image layer. A compaction run could later re-create the image layer with
the same file name, scheduling a PUT.
Due to lack of an upload queue barrier, the PUT and DELETE could be re-ordered.
The result was IndexPart referencing a non-existent object.
"""
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
env = neon_env_builder.init_start()
ps_http = env.pageserver.http_client()
l0_l1_threshold = 3
image_creation_threshold = 1
tenant_config = {
"gc_period": "0s", # disable GC (shouldn't matter for this test but still)
"compaction_period": "0s", # we want to control when compaction runs
"checkpoint_timeout": "24h", # something we won't reach
"checkpoint_distance": f"{50 * (1024**2)}", # something we won't reach, we checkpoint manually
"image_creation_threshold": f"{image_creation_threshold}",
"compaction_threshold": f"{l0_l1_threshold}",
"compaction_target_size": f"{128 * (1024**3)}", # make it so that we only have 1 partition => image coverage for delta layers => enables gc of delta layers
}
tenant_id, timeline_id = env.neon_cli.create_tenant(conf=tenant_config)
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
def get_index_part():
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
ip_path = env.pageserver_remote_storage.index_path(tenant_id, timeline_id)
return env.pagectl.dump_index_part(ip_path)
def get_future_layers():
ip = get_index_part()
future_layers = [
layer_file_name
for layer_file_name in ip.layer_metadata.keys()
if is_future_layer(layer_file_name, ip.disk_consistent_lsn)
]
return future_layers
assert len(get_future_layers()) == 0
current = get_index_part()
assert len(set(current.layer_metadata.keys())) == 1
layer_file_name = list(current.layer_metadata.keys())[0]
assert isinstance(layer_file_name, DeltaLayerFileName)
assert layer_file_name.is_l0(), f"{layer_file_name}"
log.info("force image layer creation in the future by writing some data into in-memory layer")
# Create a number of layers in the tenant
with endpoint.cursor() as cur:
cur.execute("CREATE TABLE foo (t text)")
iters = l0_l1_threshold * image_creation_threshold
for i in range(0, iters):
cur.execute(
f"""
INSERT INTO foo
SELECT '{i}' || g
FROM generate_series(1, 10000) g
"""
)
last_record_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, last_record_lsn)
# 0..iters-1: create a stack of delta layers
# iters: leave a non-empty in-memory layer which we'll use for image layer generation
if i < iters - 1:
ps_http.timeline_checkpoint(tenant_id, timeline_id, force_repartition=True)
assert (
len(
[
layer
for layer in ps_http.layer_map_info(
tenant_id, timeline_id
).historic_layers
if layer.kind == "Image"
]
)
== 0
)
endpoint.stop()
wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id)
ip = get_index_part()
assert len(ip.layer_metadata.keys())
assert (
ip.disk_consistent_lsn < last_record_lsn
), "sanity check for what above loop is supposed to do"
# create the image layer from the future
ps_http.timeline_compact(tenant_id, timeline_id, force_repartition=True)
assert (
len(
[
layer
for layer in ps_http.layer_map_info(tenant_id, timeline_id).historic_layers
if layer.kind == "Image"
]
)
== 1
)
wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id)
future_layers = get_future_layers()
assert len(future_layers) == 1
future_layer = future_layers[0]
assert isinstance(future_layer, ImageLayerFileName)
assert future_layer.lsn == last_record_lsn
log.info(
f"got layer from the future: lsn={future_layer.lsn} disk_consistent_lsn={ip.disk_consistent_lsn} last_record_lsn={last_record_lsn}"
)
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
future_layer_path = env.pageserver_remote_storage.layer_path(
tenant_id, timeline_id, future_layer
)
log.info(f"future layer path: {future_layer_path}")
pre_stat = future_layer_path.stat()
time.sleep(1.1) # so that we can use change in pre_stat.st_mtime to detect overwrites
# force removal of layers from the future
tenant_conf = ps_http.tenant_config(tenant_id)
ps_http.tenant_detach(tenant_id)
failpoint_name = "before-delete-layer-pausable"
ps_http.configure_failpoints((failpoint_name, "pause"))
ps_http.tenant_attach(tenant_id, tenant_conf.tenant_specific_overrides)
wait_until_tenant_active(ps_http, tenant_id)
# Ensure the IndexPart upload that unlinks the layer file finishes, i.e., doesn't clog the queue.
def future_layer_is_gone_from_index_part():
future_layers = set(get_future_layers())
assert future_layer not in future_layers
wait_until(10, 0.5, future_layer_is_gone_from_index_part)
# NB: the layer file is unlinked index part now, but, because we made the delete
# operation stuck, the layer file itself is still in the remote_storage
def delete_at_pause_point():
assert env.pageserver.log_contains(f".*{tenant_id}.*at failpoint.*{failpoint_name}")
wait_until(10, 0.5, delete_at_pause_point)
assert future_layer_path.exists()
# wait for re-ingestion of the WAL from safekeepers into the in-memory layer
# (this happens in parallel to the above)
wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, last_record_lsn)
# re-do image layer generation
# This will produce the same image layer and queue an upload.
# However, we still have the deletion for the layer queued, stuck on the failpoint.
# An incorrect implementation would let the PUT execute before the DELETE.
# The later code in this test asserts that this doesn't happen.
ps_http.timeline_compact(tenant_id, timeline_id, force_repartition=True)
# Let things sit for some time; a good implementation makes no progress because
# we can't execute the PUT before the DELETE. A bad implementation would do that.
max_race_opportunity_window = 4
start = time.monotonic()
while True:
post_stat = future_layer_path.stat()
assert (
pre_stat.st_mtime == post_stat.st_mtime
), "observed PUT overtake the stucked DELETE => bug isn't fixed yet"
if time.monotonic() - start > max_race_opportunity_window:
log.info(
"a correct implementation would never let the later PUT overtake the earlier DELETE"
)
break
time.sleep(1)
# Window has passed, unstuck the delete, let upload queue drain.
log.info("unstuck the DELETE")
ps_http.configure_failpoints(("before-delete-layer-pausable", "off"))
wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id)
# Examine the resulting S3 state.
log.info("integrity-check the remote storage")
ip = get_index_part()
for layer_file_name in ip.layer_metadata.keys():
layer_path = env.pageserver_remote_storage.layer_path(
tenant_id, timeline_id, layer_file_name
)
assert layer_path.exists(), f"{layer_file_name.to_str()}"
log.info("assert that the overwritten layer won")
final_stat = future_layer_path.stat()
assert final_stat.st_mtime != pre_stat.st_mtime

View File

@@ -1,74 +0,0 @@
import os
import random
import threading
import time
from typing import List
from fixtures.neon_fixtures import NeonEnv
from fixtures.utils import query_scalar
def test_local_file_cache_unlink(neon_simple_env: NeonEnv):
env = neon_simple_env
cache_dir = os.path.join(env.repo_dir, "file_cache")
os.mkdir(cache_dir)
env.neon_cli.create_branch("test_local_file_cache_unlink", "empty")
endpoint = env.endpoints.create_start(
"test_local_file_cache_unlink",
config_lines=[
"shared_buffers='1MB'",
f"neon.file_cache_path='{cache_dir}/file.cache'",
"neon.max_file_cache_size='64MB'",
"neon.file_cache_size_limit='10MB'",
],
)
cur = endpoint.connect().cursor()
n_rows = 100000
n_threads = 20
n_updates_per_thread = 10000
n_updates_per_connection = 1000
n_total_updates = n_threads * n_updates_per_thread
cur.execute("CREATE TABLE lfctest (id int4 PRIMARY KEY, n int) WITH (fillfactor=10)")
cur.execute(f"INSERT INTO lfctest SELECT g, 1 FROM generate_series(1, {n_rows}) g")
# Start threads that will perform random UPDATEs. Each UPDATE
# increments the counter on the row, so that we can check at the
# end that the sum of all the counters match the number of updates
# performed (plus the initial 1 on each row).
#
# Furthermore, each thread will reconnect between every 1000 updates.
def run_updates():
n_updates_performed = 0
conn = endpoint.connect()
cur = conn.cursor()
for _ in range(n_updates_per_thread):
id = random.randint(1, n_rows)
cur.execute(f"UPDATE lfctest SET n = n + 1 WHERE id = {id}")
n_updates_performed += 1
if n_updates_performed % n_updates_per_connection == 0:
cur.close()
conn.close()
conn = endpoint.connect()
cur = conn.cursor()
threads: List[threading.Thread] = []
for _i in range(n_threads):
thread = threading.Thread(target=run_updates, args=(), daemon=True)
thread.start()
threads.append(thread)
time.sleep(5)
new_cache_dir = os.path.join(env.repo_dir, "file_cache_new")
os.rename(cache_dir, new_cache_dir)
for thread in threads:
thread.join()
assert query_scalar(cur, "SELECT SUM(n) FROM lfctest") == n_total_updates + n_rows

View File

@@ -1,28 +0,0 @@
from contextlib import closing
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder
# Verify that the neon extension is installed and has the correct version.
def test_neon_extension(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
env.neon_cli.create_branch("test_create_extension_neon")
endpoint_main = env.endpoints.create("test_create_extension_neon")
# don't skip pg_catalog updates - it runs CREATE EXTENSION neon
endpoint_main.respec(skip_pg_catalog_updates=False)
endpoint_main.start()
log.info("postgres is running on 'test_create_extension_neon' branch")
with closing(endpoint_main.connect()) as conn:
with conn.cursor() as cur:
cur.execute("SELECT extversion from pg_extension where extname='neon'")
# If this fails, it means the extension is either not installed
# or was updated and the version is different.
#
# IMPORTANT:
# If the version has changed, the test should be updated.
# Ensure that the default version is also updated in the neon.control file
assert cur.fetchone() == ("1.1",)

View File

@@ -282,7 +282,7 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
# Now advance the generation in the control plane: subsequent validations
# from the running pageserver will fail. No more deletions should happen.
env.attachment_service.attach_hook_issue(env.initial_tenant, some_other_pageserver)
env.attachment_service.attach_hook(env.initial_tenant, some_other_pageserver)
generate_uploads_and_deletions(env, init=False)
assert_deletion_queue(ps_http, lambda n: n > 0)
@@ -397,7 +397,7 @@ def test_deletion_queue_recovery(
if keep_attachment == KeepAttachment.LOSE:
some_other_pageserver = 101010
assert env.attachment_service is not None
env.attachment_service.attach_hook_issue(env.initial_tenant, some_other_pageserver)
env.attachment_service.attach_hook(env.initial_tenant, some_other_pageserver)
env.pageserver.start()

View File

@@ -3,21 +3,75 @@ import time
from dataclasses import dataclass
from pathlib import Path
from queue import SimpleQueue
from typing import Any, Dict, Set, Tuple
from typing import Any, Dict, Set
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
wait_for_last_flush_lsn,
)
from fixtures.remote_storage import RemoteStorageKind
from fixtures.types import TenantId, TimelineId
from pytest_httpserver import HTTPServer
from werkzeug.wrappers.request import Request
from werkzeug.wrappers.response import Response
# TODO: collect all of the env setup *AFTER* removal of RemoteStorageKind.NOOP
def test_metric_collection(
neon_env_and_metrics_server: Tuple[NeonEnv, HTTPServer, SimpleQueue[Any]]
httpserver: HTTPServer,
neon_env_builder: NeonEnvBuilder,
httpserver_listen_address,
):
(env, httpserver, uploads) = neon_env_and_metrics_server
(host, port) = httpserver_listen_address
metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events"
# this should be Union[str, Tuple[List[Any], bool]], but it will make unpacking much more verbose
uploads: SimpleQueue[Any] = SimpleQueue()
def metrics_handler(request: Request) -> Response:
if request.json is None:
return Response(status=400)
events = request.json["events"]
is_last = request.headers["pageserver-metrics-last-upload-in-batch"]
assert is_last in ["true", "false"]
uploads.put((events, is_last == "true"))
return Response(status=200)
# Require collecting metrics frequently, since we change
# the timeline and want something to be logged about it.
#
# Disable time-based pitr, we will use the manual GC calls
# to trigger remote storage operations in a controlled way
neon_env_builder.pageserver_config_override = f"""
metric_collection_interval="1s"
metric_collection_endpoint="{metric_collection_endpoint}"
cached_metric_collection_interval="0s"
synthetic_size_calculation_interval="3s"
"""
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
log.info(f"test_metric_collection endpoint is {metric_collection_endpoint}")
# mock http server that returns OK for the metrics
httpserver.expect_request("/billing/api/v1/usage_events", method="POST").respond_with_handler(
metrics_handler
)
# spin up neon, after http server is ready
env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
# httpserver is shut down before pageserver during passing run
env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*")
# we have a fast rate of calculation, these can happen at shutdown
env.pageserver.allowed_errors.append(
".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*"
)
env.pageserver.allowed_errors.append(
".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes"
)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
@@ -114,11 +168,59 @@ def test_metric_collection(
def test_metric_collection_cleans_up_tempfile(
neon_env_and_metrics_server: Tuple[NeonEnv, HTTPServer, SimpleQueue[Any]]
httpserver: HTTPServer,
neon_env_builder: NeonEnvBuilder,
httpserver_listen_address,
):
(env, httpserver, uploads) = neon_env_and_metrics_server
(host, port) = httpserver_listen_address
metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events"
# this should be Union[str, Tuple[List[Any], bool]], but it will make unpacking much more verbose
uploads: SimpleQueue[Any] = SimpleQueue()
def metrics_handler(request: Request) -> Response:
if request.json is None:
return Response(status=400)
events = request.json["events"]
is_last = request.headers["pageserver-metrics-last-upload-in-batch"]
assert is_last in ["true", "false"]
uploads.put((events, is_last == "true"))
return Response(status=200)
# Require collecting metrics frequently, since we change
# the timeline and want something to be logged about it.
#
# Disable time-based pitr, we will use the manual GC calls
# to trigger remote storage operations in a controlled way
neon_env_builder.pageserver_config_override = f"""
metric_collection_interval="1s"
metric_collection_endpoint="{metric_collection_endpoint}"
cached_metric_collection_interval="0s"
synthetic_size_calculation_interval="3s"
"""
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
# mock http server that returns OK for the metrics
httpserver.expect_request("/billing/api/v1/usage_events", method="POST").respond_with_handler(
metrics_handler
)
# spin up neon, after http server is ready
env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
pageserver_http = env.pageserver.http_client()
# httpserver is shut down before pageserver during passing run
env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*")
# we have a fast rate of calculation, these can happen at shutdown
env.pageserver.allowed_errors.append(
".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*"
)
env.pageserver.allowed_errors.append(
".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes"
)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
@@ -188,8 +290,6 @@ def test_metric_collection_cleans_up_tempfile(
), "only initial tempfile should had been removed"
assert initially.other.issuperset(later.other), "no other files should had been removed"
httpserver.check()
@dataclass
class PrefixPartitionedFiles:

View File

@@ -144,10 +144,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool)
# Test that repeatedly kills and restarts the page server, while the
# safekeeper and compute node keep running.
@pytest.mark.timeout(540)
def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, build_type: str):
if build_type == "debug":
pytest.skip("times out in debug builds")
def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder):
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
neon_env_builder.enable_scrub_on_exit()

View File

@@ -336,15 +336,10 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold(
):
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
env = neon_env_builder.init_start(
initial_tenant_conf={
# disable compaction so that it will not download the layer for repartitioning
"compaction_period": "0s"
}
)
env = neon_env_builder.init_start()
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
(tenant_id, timeline_id) = env.initial_tenant, env.initial_timeline
(tenant_id, timeline_id) = env.neon_cli.create_tenant()
ps_http = env.pageserver.http_client()
def get_metric():

View File

@@ -1,7 +1,6 @@
import enum
import os
import shutil
from threading import Thread
import pytest
from fixtures.log_helper import log
@@ -28,7 +27,7 @@ from fixtures.remote_storage import (
available_s3_storages,
)
from fixtures.types import TenantId
from fixtures.utils import run_pg_bench_small, wait_until
from fixtures.utils import run_pg_bench_small
@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
@@ -400,78 +399,4 @@ def test_tenant_delete_is_resumed_on_attach(
)
def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonEnvBuilder):
"""Reproduction of 2023-11-23 stuck tenants investigation"""
# do not use default tenant/timeline creation because it would output the failpoint log message too early
env = neon_env_builder.init_configs()
env.start()
pageserver_http = env.pageserver.http_client()
# happens with the cancellation bailing flushing loop earlier, leaving disk_consistent_lsn at zero
env.pageserver.allowed_errors.append(
".*Timeline got dropped without initializing, cleaning its files"
)
# the response hit_pausable_failpoint_and_later_fail
env.pageserver.allowed_errors.append(
f".*Error processing HTTP request: InternalServerError\\(new timeline {env.initial_tenant}/{env.initial_timeline} has invalid disk_consistent_lsn"
)
pageserver_http.tenant_create(env.initial_tenant)
failpoint = "flush-layer-cancel-after-writing-layer-out-pausable"
pageserver_http.configure_failpoints((failpoint, "pause"))
def hit_pausable_failpoint_and_later_fail():
with pytest.raises(
PageserverApiException, match="new timeline \\S+ has invalid disk_consistent_lsn"
):
pageserver_http.timeline_create(
env.pg_version, env.initial_tenant, env.initial_timeline
)
def start_deletion():
pageserver_http.tenant_delete(env.initial_tenant)
def has_hit_failpoint():
assert env.pageserver.log_contains(f"at failpoint {failpoint}") is not None
def deletion_has_started_waiting_for_timelines():
assert env.pageserver.log_contains("Waiting for timelines...") is not None
def tenant_is_deleted():
try:
pageserver_http.tenant_status(env.initial_tenant)
except PageserverApiException as e:
assert e.status_code == 404
else:
raise RuntimeError("tenant was still accessible")
creation = Thread(target=hit_pausable_failpoint_and_later_fail)
creation.start()
deletion = None
try:
wait_until(10, 1, has_hit_failpoint)
# it should start ok, sync up with the stuck creation, then fail because disk_consistent_lsn was not updated
# then deletion should fail and set the tenant broken
deletion = Thread(target=start_deletion)
deletion.start()
wait_until(10, 1, deletion_has_started_waiting_for_timelines)
pageserver_http.configure_failpoints((failpoint, "off"))
creation.join()
deletion.join()
wait_until(10, 1, tenant_is_deleted)
finally:
creation.join()
if deletion is not None:
deletion.join()
# TODO test concurrent deletions with "hang" failpoint

Some files were not shown because too many files have changed in this diff Show More