Mirror of https://github.com/neondatabase/neon.git (synced 2026-02-10 14:10:37 +00:00)

Compare commits: jcsp/issue...fix-XLogWa (13 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 8dc1c48605 | |
| | e1a9669d05 | |
| | aaf60819fa | |
| | c84656a53e | |
| | af99c959ef | |
| | a8e6d259cb | |
| | c1390bfc3b | |
| | 6d951e69d6 | |
| | 4b8809b280 | |
| | 4c5afb7b10 | |
| | ec069dc45e | |
| | 790c05d675 | |
| | 923cf91aa4 | |
111 Cargo.lock (generated)
@@ -708,7 +708,7 @@ dependencies = [
|
||||
"sha1",
|
||||
"sync_wrapper",
|
||||
"tokio",
|
||||
"tokio-tungstenite 0.20.0",
|
||||
"tokio-tungstenite",
|
||||
"tower",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
@@ -979,6 +979,12 @@ version = "3.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1"
|
||||
|
||||
[[package]]
|
||||
name = "bytemuck"
|
||||
version = "1.16.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "78834c15cb5d5efe3452d58b1e8ba890dd62d21907f867f383358198e56ebca5"
|
||||
|
||||
[[package]]
|
||||
name = "byteorder"
|
||||
version = "1.4.3"
|
||||
@@ -1233,8 +1239,10 @@ dependencies = [
|
||||
"serde_json",
|
||||
"signal-hook",
|
||||
"tar",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
"tracing",
|
||||
@@ -1596,7 +1604,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6943ae99c34386c84a470c499d3414f66502a41340aa895406e0d2e4a207b91d"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"hashbrown 0.14.0",
|
||||
"hashbrown 0.14.5",
|
||||
"lock_api",
|
||||
"once_cell",
|
||||
"parking_lot_core 0.9.8",
|
||||
@@ -1997,6 +2005,27 @@ dependencies = [
|
||||
"percent-encoding",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "framed-websockets"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/neondatabase/framed-websockets#34eff3d6f8cfccbc5f35e4f65314ff7328621127"
|
||||
dependencies = [
|
||||
"base64 0.21.1",
|
||||
"bytemuck",
|
||||
"bytes",
|
||||
"futures-core",
|
||||
"futures-sink",
|
||||
"http-body-util",
|
||||
"hyper 1.2.0",
|
||||
"hyper-util",
|
||||
"pin-project",
|
||||
"rand 0.8.5",
|
||||
"sha1",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fs2"
|
||||
version = "0.4.3"
|
||||
@@ -2275,9 +2304,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.14.0"
|
||||
version = "0.14.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a"
|
||||
checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"allocator-api2",
|
||||
@@ -2285,11 +2314,11 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "hashlink"
|
||||
version = "0.8.4"
|
||||
version = "0.9.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7"
|
||||
checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af"
|
||||
dependencies = [
|
||||
"hashbrown 0.14.0",
|
||||
"hashbrown 0.14.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2598,21 +2627,6 @@ dependencies = [
|
||||
"tokio-native-tls",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hyper-tungstenite"
|
||||
version = "0.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7a343d17fe7885302ed7252767dc7bb83609a874b6ff581142241ec4b73957ad"
|
||||
dependencies = [
|
||||
"http-body-util",
|
||||
"hyper 1.2.0",
|
||||
"hyper-util",
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
"tokio-tungstenite 0.21.0",
|
||||
"tungstenite 0.21.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hyper-util"
|
||||
version = "0.1.3"
|
||||
@@ -2690,7 +2704,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e"
|
||||
dependencies = [
|
||||
"equivalent",
|
||||
"hashbrown 0.14.0",
|
||||
"hashbrown 0.14.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2952,7 +2966,7 @@ version = "0.12.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d3262e75e648fce39813cb56ac41f3c3e3f65217ebf3844d818d1f9398cfb0dc"
|
||||
dependencies = [
|
||||
"hashbrown 0.14.0",
|
||||
"hashbrown 0.14.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3005,7 +3019,7 @@ checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"crossbeam-utils",
|
||||
"hashbrown 0.14.0",
|
||||
"hashbrown 0.14.5",
|
||||
"itoa",
|
||||
"lasso",
|
||||
"measured-derive",
|
||||
@@ -3567,7 +3581,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79"
|
||||
dependencies = [
|
||||
"dlv-list",
|
||||
"hashbrown 0.14.0",
|
||||
"hashbrown 0.14.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3894,7 +3908,7 @@ dependencies = [
|
||||
"ahash",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"hashbrown 0.14.0",
|
||||
"hashbrown 0.14.5",
|
||||
"num",
|
||||
"num-bigint",
|
||||
"paste",
|
||||
@@ -4378,9 +4392,10 @@ dependencies = [
|
||||
"dashmap",
|
||||
"env_logger",
|
||||
"fallible-iterator",
|
||||
"framed-websockets",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hashbrown 0.13.2",
|
||||
"hashbrown 0.14.5",
|
||||
"hashlink",
|
||||
"hex",
|
||||
"hmac",
|
||||
@@ -4390,7 +4405,6 @@ dependencies = [
|
||||
"humantime",
|
||||
"hyper 0.14.26",
|
||||
"hyper 1.2.0",
|
||||
"hyper-tungstenite",
|
||||
"hyper-util",
|
||||
"indexmap 2.0.1",
|
||||
"ipnet",
|
||||
@@ -4435,7 +4449,6 @@ dependencies = [
|
||||
"smol_str",
|
||||
"socket2 0.5.5",
|
||||
"subtle",
|
||||
"sync_wrapper",
|
||||
"task-local-extensions",
|
||||
"thiserror",
|
||||
"tikv-jemalloc-ctl",
|
||||
@@ -4444,6 +4457,7 @@ dependencies = [
|
||||
"tokio-postgres",
|
||||
"tokio-postgres-rustls",
|
||||
"tokio-rustls 0.25.0",
|
||||
"tokio-tungstenite",
|
||||
"tokio-util",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
@@ -6380,19 +6394,7 @@ dependencies = [
|
||||
"futures-util",
|
||||
"log",
|
||||
"tokio",
|
||||
"tungstenite 0.20.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-tungstenite"
|
||||
version = "0.21.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c83b561d025642014097b66e6c1bb422783339e0909e4429cde4749d1990bc38"
|
||||
dependencies = [
|
||||
"futures-util",
|
||||
"log",
|
||||
"tokio",
|
||||
"tungstenite 0.21.0",
|
||||
"tungstenite",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6406,7 +6408,7 @@ dependencies = [
|
||||
"futures-io",
|
||||
"futures-sink",
|
||||
"futures-util",
|
||||
"hashbrown 0.14.0",
|
||||
"hashbrown 0.14.5",
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
"tracing",
|
||||
@@ -6688,25 +6690,6 @@ dependencies = [
|
||||
"utf-8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tungstenite"
|
||||
version = "0.21.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9ef1a641ea34f399a848dea702823bbecfb4c486f911735368f1f137cb8257e1"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"data-encoding",
|
||||
"http 1.1.0",
|
||||
"httparse",
|
||||
"log",
|
||||
"rand 0.8.5",
|
||||
"sha1",
|
||||
"thiserror",
|
||||
"url",
|
||||
"utf-8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "twox-hash"
|
||||
version = "1.6.3"
|
||||
@@ -7502,7 +7485,7 @@ dependencies = [
|
||||
"futures-sink",
|
||||
"futures-util",
|
||||
"getrandom 0.2.11",
|
||||
"hashbrown 0.14.0",
|
||||
"hashbrown 0.14.5",
|
||||
"hex",
|
||||
"hmac",
|
||||
"hyper 0.14.26",
|
||||
|
||||
@@ -81,13 +81,14 @@ enum-map = "2.4.2"
|
||||
enumset = "1.0.12"
|
||||
fail = "0.5.0"
|
||||
fallible-iterator = "0.2"
|
||||
framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" }
|
||||
fs2 = "0.4.3"
|
||||
futures = "0.3"
|
||||
futures-core = "0.3"
|
||||
futures-util = "0.3"
|
||||
git-version = "0.3"
|
||||
hashbrown = "0.13"
|
||||
hashlink = "0.8.4"
|
||||
hashbrown = "0.14"
|
||||
hashlink = "0.9.1"
|
||||
hdrhistogram = "7.5.2"
|
||||
hex = "0.4"
|
||||
hex-literal = "0.4"
|
||||
@@ -98,7 +99,7 @@ http-types = { version = "2", default-features = false }
|
||||
humantime = "2.1"
|
||||
humantime-serde = "1.1.1"
|
||||
hyper = "0.14"
|
||||
hyper-tungstenite = "0.13.0"
|
||||
tokio-tungstenite = "0.20.0"
|
||||
indexmap = "2"
|
||||
inotify = "0.10.2"
|
||||
ipnet = "2.9.0"
|
||||
|
||||
@@ -27,10 +27,12 @@ reqwest = { workspace = true, features = ["json"] }
|
||||
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
|
||||
tokio-postgres.workspace = true
|
||||
tokio-util.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-opentelemetry.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
tracing-utils.workspace = true
|
||||
thiserror.workspace = true
|
||||
url.workspace = true
|
||||
|
||||
compute_api.workspace = true
|
||||
|
||||
116
compute_tools/src/catalog.rs
Normal file
116
compute_tools/src/catalog.rs
Normal file
@@ -0,0 +1,116 @@
|
||||
use compute_api::{
|
||||
responses::CatalogObjects,
|
||||
spec::{Database, Role},
|
||||
};
|
||||
use futures::Stream;
|
||||
use postgres::{Client, NoTls};
|
||||
use std::{path::Path, process::Stdio, result::Result, sync::Arc};
|
||||
use tokio::{
|
||||
io::{AsyncBufReadExt, BufReader},
|
||||
process::Command,
|
||||
task,
|
||||
};
|
||||
use tokio_stream::{self as stream, StreamExt};
|
||||
use tokio_util::codec::{BytesCodec, FramedRead};
|
||||
use tracing::warn;
|
||||
|
||||
use crate::{
|
||||
compute::ComputeNode,
|
||||
pg_helpers::{get_existing_dbs, get_existing_roles},
|
||||
};
|
||||
|
||||
pub async fn get_dbs_and_roles(compute: &Arc<ComputeNode>) -> anyhow::Result<CatalogObjects> {
|
||||
let connstr = compute.connstr.clone();
|
||||
task::spawn_blocking(move || {
|
||||
let mut client = Client::connect(connstr.as_str(), NoTls)?;
|
||||
let roles: Vec<Role>;
|
||||
{
|
||||
let mut xact = client.transaction()?;
|
||||
roles = get_existing_roles(&mut xact)?;
|
||||
}
|
||||
let databases: Vec<Database> = get_existing_dbs(&mut client)?.values().cloned().collect();
|
||||
|
||||
Ok(CatalogObjects { roles, databases })
|
||||
})
|
||||
.await?
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum SchemaDumpError {
|
||||
#[error("Database does not exist.")]
|
||||
DatabaseDoesNotExist,
|
||||
#[error("Failed to execute pg_dump.")]
|
||||
IO(#[from] std::io::Error),
|
||||
}
|
||||
|
||||
// It uses the pg_dump utility to dump the schema of the specified database.
|
||||
// The output is streamed back to the caller and is intended to be streamed onwards via HTTP.
|
||||
//
|
||||
// Before returning the result, it checks that pg_dump produced any output at all.
|
||||
// If not, it tries to parse the stderr output to determine if the database does not exist
|
||||
// and a special error is returned in that case.
|
||||
//
|
||||
// To make sure that the process is killed when the caller drops the stream, we use tokio's kill_on_drop feature.
|
||||
pub async fn get_database_schema(
|
||||
compute: &Arc<ComputeNode>,
|
||||
dbname: &str,
|
||||
) -> Result<impl Stream<Item = Result<bytes::Bytes, std::io::Error>>, SchemaDumpError> {
|
||||
let pgbin = &compute.pgbin;
|
||||
let basepath = Path::new(pgbin).parent().unwrap();
|
||||
let pgdump = basepath.join("pg_dump");
|
||||
let mut connstr = compute.connstr.clone();
|
||||
connstr.set_path(dbname);
|
||||
let mut cmd = Command::new(pgdump)
|
||||
.arg("--schema-only")
|
||||
.arg(connstr.as_str())
|
||||
.stdout(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
.kill_on_drop(true)
|
||||
.spawn()?;
|
||||
|
||||
let stdout = cmd.stdout.take().ok_or_else(|| {
|
||||
std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stdout.")
|
||||
})?;
|
||||
|
||||
let stderr = cmd.stderr.take().ok_or_else(|| {
|
||||
std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stderr.")
|
||||
})?;
|
||||
|
||||
let mut stdout_reader = FramedRead::new(stdout, BytesCodec::new());
|
||||
let stderr_reader = BufReader::new(stderr);
|
||||
|
||||
let first_chunk = match stdout_reader.next().await {
|
||||
Some(Ok(bytes)) if !bytes.is_empty() => bytes,
|
||||
Some(Err(e)) => {
|
||||
return Err(SchemaDumpError::IO(e));
|
||||
}
|
||||
_ => {
|
||||
let mut lines = stderr_reader.lines();
|
||||
if let Some(line) = lines.next_line().await? {
|
||||
if line.contains(&format!("FATAL: database \"{}\" does not exist", dbname)) {
|
||||
return Err(SchemaDumpError::DatabaseDoesNotExist);
|
||||
}
|
||||
warn!("pg_dump stderr: {}", line)
|
||||
}
|
||||
tokio::spawn(async move {
|
||||
while let Ok(Some(line)) = lines.next_line().await {
|
||||
warn!("pg_dump stderr: {}", line)
|
||||
}
|
||||
});
|
||||
|
||||
return Err(SchemaDumpError::IO(std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
"failed to start pg_dump",
|
||||
)));
|
||||
}
|
||||
};
|
||||
let initial_stream = stream::once(Ok(first_chunk.freeze()));
|
||||
// Consume stderr and log warnings
|
||||
tokio::spawn(async move {
|
||||
let mut lines = stderr_reader.lines();
|
||||
while let Ok(Some(line)) = lines.next_line().await {
|
||||
warn!("pg_dump stderr: {}", line)
|
||||
}
|
||||
});
|
||||
Ok(initial_stream.chain(stdout_reader.map(|res| res.map(|b| b.freeze()))))
|
||||
}
|
||||
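A minimal sketch of how the stream returned by `get_database_schema` can be consumed, assuming only that it is a `Stream<Item = Result<Bytes, io::Error>>` as constructed above; the helper name `collect_schema` is hypothetical and not part of this change:

```rust
use bytes::Bytes;
use futures::{Stream, TryStreamExt};

// Collect a schema-dump byte stream into one UTF-8 string. Handy for tests or
// small schemas; the HTTP handler below streams the body instead, so large
// dumps are never fully buffered in memory.
async fn collect_schema<S>(stream: S) -> std::io::Result<String>
where
    S: Stream<Item = Result<Bytes, std::io::Error>>,
{
    let chunks: Vec<Bytes> = stream.try_collect().await?;
    let mut out = Vec::new();
    for chunk in chunks {
        out.extend_from_slice(&chunk);
    }
    String::from_utf8(out).map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))
}
```

The `/database_schema` handler later in this diff takes the streaming route instead, wrapping the same stream with `Body::wrap_stream`.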
@@ -5,17 +5,21 @@ use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
|
||||
use crate::catalog::SchemaDumpError;
|
||||
use crate::catalog::{get_database_schema, get_dbs_and_roles};
|
||||
use crate::compute::forward_termination_signal;
|
||||
use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
|
||||
use compute_api::requests::ConfigurationRequest;
|
||||
use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
|
||||
|
||||
use anyhow::Result;
|
||||
use hyper::header::CONTENT_TYPE;
|
||||
use hyper::service::{make_service_fn, service_fn};
|
||||
use hyper::{Body, Method, Request, Response, Server, StatusCode};
|
||||
use tokio::task;
|
||||
use tracing::{error, info, warn};
|
||||
use tracing_utils::http::OtelName;
|
||||
use utils::http::request::must_get_query_param;
|
||||
|
||||
fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
|
||||
ComputeStatusResponse {
|
||||
@@ -133,6 +137,34 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
|
||||
}
|
||||
}
|
||||
|
||||
(&Method::GET, "/dbs_and_roles") => {
|
||||
info!("serving /dbs_and_roles GET request",);
|
||||
match get_dbs_and_roles(compute).await {
|
||||
Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())),
|
||||
Err(_) => {
|
||||
render_json_error("can't get dbs and roles", StatusCode::INTERNAL_SERVER_ERROR)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
(&Method::GET, "/database_schema") => {
|
||||
let database = match must_get_query_param(&req, "database") {
|
||||
Err(e) => return e.into_response(),
|
||||
Ok(database) => database,
|
||||
};
|
||||
info!("serving /database_schema GET request with database: {database}",);
|
||||
match get_database_schema(compute, &database).await {
|
||||
Ok(res) => render_plain(Body::wrap_stream(res)),
|
||||
Err(SchemaDumpError::DatabaseDoesNotExist) => {
|
||||
render_json_error("database does not exist", StatusCode::NOT_FOUND)
|
||||
}
|
||||
Err(e) => {
|
||||
error!("can't get schema dump: {}", e);
|
||||
render_json_error("can't get schema dump", StatusCode::INTERNAL_SERVER_ERROR)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// download extension files from remote extension storage on demand
|
||||
(&Method::POST, route) if route.starts_with("/extension_server/") => {
|
||||
info!("serving {:?} POST request", route);
|
||||
@@ -303,10 +335,25 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
|
||||
};
|
||||
Response::builder()
|
||||
.status(status)
|
||||
.header(CONTENT_TYPE, "application/json")
|
||||
.body(Body::from(serde_json::to_string(&error).unwrap()))
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn render_json(body: Body) -> Response<Body> {
|
||||
Response::builder()
|
||||
.header(CONTENT_TYPE, "application/json")
|
||||
.body(body)
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn render_plain(body: Body) -> Response<Body> {
|
||||
Response::builder()
|
||||
.header(CONTENT_TYPE, "text/plain")
|
||||
.body(body)
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (String, StatusCode)> {
|
||||
{
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
|
||||
@@ -68,6 +68,51 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Info"
|
||||
|
||||
/dbs_and_roles:
|
||||
get:
|
||||
tags:
|
||||
- Info
|
||||
summary: Get databases and roles in the catalog.
|
||||
description: ""
|
||||
operationId: getDbsAndRoles
|
||||
responses:
|
||||
200:
|
||||
description: Compute schema objects
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/DbsAndRoles"
|
||||
|
||||
/database_schema:
|
||||
get:
|
||||
tags:
|
||||
- Info
|
||||
summary: Get schema dump
|
||||
parameters:
|
||||
- name: database
|
||||
in: query
|
||||
description: Database name to dump.
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
example: "postgres"
|
||||
description: Get schema dump in SQL format.
|
||||
operationId: getDatabaseSchema
|
||||
responses:
|
||||
200:
|
||||
description: Schema dump
|
||||
content:
|
||||
text/plain:
|
||||
schema:
|
||||
type: string
|
||||
description: Schema dump in SQL format.
|
||||
404:
|
||||
description: Non-existent database.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
|
||||
/check_writability:
|
||||
post:
|
||||
tags:
|
||||
@@ -229,6 +274,73 @@ components:
|
||||
num_cpus:
|
||||
type: integer
|
||||
|
||||
DbsAndRoles:
|
||||
type: object
|
||||
description: Databases and Roles
|
||||
required:
|
||||
- roles
|
||||
- databases
|
||||
properties:
|
||||
roles:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/Role"
|
||||
databases:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/Database"
|
||||
|
||||
Database:
|
||||
type: object
|
||||
description: Database
|
||||
required:
|
||||
- name
|
||||
- owner
|
||||
- restrict_conn
|
||||
- invalid
|
||||
properties:
|
||||
name:
|
||||
type: string
|
||||
owner:
|
||||
type: string
|
||||
options:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/GenericOption"
|
||||
restrict_conn:
|
||||
type: boolean
|
||||
invalid:
|
||||
type: boolean
|
||||
|
||||
Role:
|
||||
type: object
|
||||
description: Role
|
||||
required:
|
||||
- name
|
||||
properties:
|
||||
name:
|
||||
type: string
|
||||
encrypted_password:
|
||||
type: string
|
||||
options:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/GenericOption"
|
||||
|
||||
GenericOption:
|
||||
type: object
|
||||
description: Schema Generic option
|
||||
required:
|
||||
- name
|
||||
- vartype
|
||||
properties:
|
||||
name:
|
||||
type: string
|
||||
value:
|
||||
type: string
|
||||
vartype:
|
||||
type: string
|
||||
|
||||
ComputeState:
|
||||
type: object
|
||||
required:
|
||||
|
||||
@@ -8,6 +8,7 @@ pub mod configurator;
|
||||
pub mod http;
|
||||
#[macro_use]
|
||||
pub mod logger;
|
||||
pub mod catalog;
|
||||
pub mod compute;
|
||||
pub mod extension_server;
|
||||
pub mod monitor;
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
use chrono::{DateTime, Utc};
|
||||
use serde::{Deserialize, Serialize, Serializer};
|
||||
|
||||
use crate::spec::ComputeSpec;
|
||||
use crate::spec::{ComputeSpec, Database, Role};
|
||||
|
||||
#[derive(Serialize, Debug, Deserialize)]
|
||||
pub struct GenericAPIError {
|
||||
@@ -113,6 +113,12 @@ pub struct ComputeMetrics {
|
||||
pub total_ext_download_size: u64,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default, Serialize)]
|
||||
pub struct CatalogObjects {
|
||||
pub roles: Vec<Role>,
|
||||
pub databases: Vec<Database>,
|
||||
}
|
||||
|
||||
/// Response of the `/computes/{compute_id}/spec` control-plane API.
|
||||
/// This is not actually a compute API response, so consider moving
|
||||
/// to a different place.
|
||||
|
||||
@@ -10,6 +10,7 @@ use std::{
|
||||
io::{BufRead, Read},
|
||||
num::{NonZeroU64, NonZeroUsize},
|
||||
str::FromStr,
|
||||
sync::atomic::AtomicUsize,
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
|
||||
@@ -308,13 +309,88 @@ pub struct TenantConfig {
|
||||
pub switch_aux_file_policy: Option<AuxFilePolicy>,
|
||||
}
|
||||
|
||||
/// The policy for the aux file storage. It can be switched through `switch_aux_file_policy`
|
||||
/// tenant config. When the first aux file is written, the policy will be persisted in the
|
||||
/// `index_part.json` file and has a limited migration path.
|
||||
///
|
||||
/// Currently, we only allow the following migration path:
|
||||
///
|
||||
/// Unset -> V1
|
||||
/// -> V2
|
||||
/// -> CrossValidation -> V2
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub enum AuxFilePolicy {
|
||||
/// V1 aux file policy: store everything in AUX_FILE_KEY
|
||||
V1,
|
||||
/// V2 aux file policy: store in the AUX_FILE keyspace
|
||||
V2,
|
||||
/// Cross validation runs both formats on the write path and does validation
|
||||
/// on the read path.
|
||||
CrossValidation,
|
||||
}
|
||||
|
||||
impl AuxFilePolicy {
|
||||
pub fn is_valid_migration_path(from: Option<Self>, to: Self) -> bool {
|
||||
matches!(
|
||||
(from, to),
|
||||
(None, _) | (Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2)
|
||||
)
|
||||
}
|
||||
|
||||
/// If a tenant writes aux files without setting `switch_aux_file_policy`, this value will be used.
|
||||
pub fn default_tenant_config() -> Self {
|
||||
Self::V1
|
||||
}
|
||||
}
|
||||
|
||||
/// The in-memory aux file policy flag. Callers can store an `Option<AuxFilePolicy>` into this atomic flag; 0 == unspecified.
|
||||
pub struct AtomicAuxFilePolicy(AtomicUsize);
|
||||
|
||||
impl AtomicAuxFilePolicy {
|
||||
pub fn new(policy: Option<AuxFilePolicy>) -> Self {
|
||||
Self(AtomicUsize::new(
|
||||
policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
|
||||
))
|
||||
}
|
||||
|
||||
pub fn load(&self) -> Option<AuxFilePolicy> {
|
||||
match self.0.load(std::sync::atomic::Ordering::Acquire) {
|
||||
0 => None,
|
||||
other => Some(AuxFilePolicy::from_usize(other)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn store(&self, policy: Option<AuxFilePolicy>) {
|
||||
self.0.store(
|
||||
policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
|
||||
std::sync::atomic::Ordering::Release,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
impl AuxFilePolicy {
|
||||
pub fn to_usize(self) -> usize {
|
||||
match self {
|
||||
Self::V1 => 1,
|
||||
Self::CrossValidation => 2,
|
||||
Self::V2 => 3,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn try_from_usize(this: usize) -> Option<Self> {
|
||||
match this {
|
||||
1 => Some(Self::V1),
|
||||
2 => Some(Self::CrossValidation),
|
||||
3 => Some(Self::V2),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_usize(this: usize) -> Self {
|
||||
Self::try_from_usize(this).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for AuxFilePolicy {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
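A hedged sketch of how the types introduced in this hunk fit together (the function is illustrative only): the atomic flag starts out unset, and `is_valid_migration_path` only admits `unset -> *` and `cross-validation -> v2`:

```rust
use pageserver_api::models::{AtomicAuxFilePolicy, AuxFilePolicy};

fn aux_policy_flag_example() {
    // No aux file has been written yet, so the persisted policy is unspecified.
    let flag = AtomicAuxFilePolicy::new(None);
    assert_eq!(flag.load(), None);

    // Unset -> V2 is an allowed migration, so a writer may persist V2 ...
    assert!(AuxFilePolicy::is_valid_migration_path(flag.load(), AuxFilePolicy::V2));
    flag.store(Some(AuxFilePolicy::V2));

    // ... after which switching back to V1 is rejected.
    assert!(!AuxFilePolicy::is_valid_migration_path(flag.load(), AuxFilePolicy::V1));
}
```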
@@ -604,6 +680,9 @@ pub struct TimelineInfo {
|
||||
pub state: TimelineState,
|
||||
|
||||
pub walreceiver_status: String,
|
||||
|
||||
/// The last aux file policy being used on this timeline
|
||||
pub last_aux_file_policy: Option<AuxFilePolicy>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
@@ -762,6 +841,16 @@ pub struct DownloadRemoteLayersTaskSpawnRequest {
|
||||
pub max_concurrent_downloads: NonZeroUsize,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct IngestAuxFilesRequest {
|
||||
pub aux_files: HashMap<String, String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct ListAuxFilesRequest {
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct DownloadRemoteLayersTaskInfo {
|
||||
pub task_id: String,
|
||||
@@ -1505,4 +1594,59 @@ mod tests {
|
||||
assert_eq!(actual, expected, "example on {line}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_aux_file_migration_path() {
|
||||
assert!(AuxFilePolicy::is_valid_migration_path(
|
||||
None,
|
||||
AuxFilePolicy::V1
|
||||
));
|
||||
assert!(AuxFilePolicy::is_valid_migration_path(
|
||||
None,
|
||||
AuxFilePolicy::V2
|
||||
));
|
||||
assert!(AuxFilePolicy::is_valid_migration_path(
|
||||
None,
|
||||
AuxFilePolicy::CrossValidation
|
||||
));
|
||||
// Self-migration is not a valid migration path, and the caller should handle it by itself.
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V1),
|
||||
AuxFilePolicy::V1
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V2),
|
||||
AuxFilePolicy::V2
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::CrossValidation),
|
||||
AuxFilePolicy::CrossValidation
|
||||
));
|
||||
// Migrations not allowed
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::CrossValidation),
|
||||
AuxFilePolicy::V1
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V1),
|
||||
AuxFilePolicy::V2
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V2),
|
||||
AuxFilePolicy::V1
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V2),
|
||||
AuxFilePolicy::CrossValidation
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V1),
|
||||
AuxFilePolicy::CrossValidation
|
||||
));
|
||||
// Migrations allowed
|
||||
assert!(AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::CrossValidation),
|
||||
AuxFilePolicy::V2
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,8 +1,12 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::{models::*, shard::TenantShardId};
|
||||
use reqwest::{IntoUrl, Method, StatusCode};
|
||||
use utils::{
|
||||
http::error::HttpErrorBody,
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
pub mod util;
|
||||
@@ -561,4 +565,57 @@ impl Client {
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn ingest_aux_files(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
aux_files: HashMap<String, String>,
|
||||
) -> Result<bool> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{}/timeline/{}/ingest_aux_files",
|
||||
self.mgmt_api_endpoint, tenant_shard_id, timeline_id
|
||||
);
|
||||
let resp = self
|
||||
.request_noerror(Method::POST, &uri, IngestAuxFilesRequest { aux_files })
|
||||
.await?;
|
||||
match resp.status() {
|
||||
StatusCode::OK => Ok(true),
|
||||
status => Err(match resp.json::<HttpErrorBody>().await {
|
||||
Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
|
||||
Err(_) => {
|
||||
Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
|
||||
}
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn list_aux_files(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Result<HashMap<String, Bytes>> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{}/timeline/{}/list_aux_files",
|
||||
self.mgmt_api_endpoint, tenant_shard_id, timeline_id
|
||||
);
|
||||
let resp = self
|
||||
.request_noerror(Method::POST, &uri, ListAuxFilesRequest { lsn })
|
||||
.await?;
|
||||
match resp.status() {
|
||||
StatusCode::OK => {
|
||||
let resp: HashMap<String, Bytes> = resp.json().await.map_err(|e| {
|
||||
Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, format!("{e}"))
|
||||
})?;
|
||||
Ok(resp)
|
||||
}
|
||||
status => Err(match resp.json::<HttpErrorBody>().await {
|
||||
Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
|
||||
Err(_) => {
|
||||
Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
|
||||
}
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
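A hedged usage sketch of the two new client methods; the endpoint, the absent JWT, and the driver function name are placeholders, and the `pagebench` aux_files command later in this diff is the real caller:

```rust
use std::collections::HashMap;

use pageserver_api::shard::TenantShardId;
use utils::{id::TenantTimelineId, lsn::Lsn};

async fn roundtrip_aux_files(target: TenantTimelineId) -> anyhow::Result<()> {
    let client = pageserver_client::mgmt_api::Client::new(
        "http://localhost:9898".to_string(),
        None, // no JWT in this sketch
    );
    let tenant_shard_id = TenantShardId::unsharded(target.tenant_id);

    // Ingest a single aux file ...
    let files = HashMap::from([(
        "pg_logical/mappings/000.000".to_string(),
        "00000000".to_string(),
    )]);
    client
        .ingest_aux_files(tenant_shard_id, target.timeline_id, files)
        .await?;

    // ... and list everything visible just below the maximum LSN.
    let listed = client
        .list_aux_files(tenant_shard_id, target.timeline_id, Lsn(Lsn::MAX.0 - 1))
        .await?;
    println!("{} aux files found", listed.len());
    Ok(())
}
```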
@@ -1,4 +1,5 @@
|
||||
use clap::{Parser, Subcommand};
|
||||
use pageserver_compaction::helpers::PAGE_SZ;
|
||||
use pageserver_compaction::simulator::MockTimeline;
|
||||
use rand::Rng;
|
||||
use std::io::Write;
|
||||
@@ -51,7 +52,7 @@ async fn simulate(cmd: &SimulateCmd, results_path: &Path) -> anyhow::Result<()>
|
||||
let mut executor = MockTimeline::new();
|
||||
|
||||
// Convert the logical size in MB into a key range.
|
||||
let key_range = 0..((cmd.logical_size * 1024 * 1024) / 8192);
|
||||
let key_range = 0..((cmd.logical_size * 1024 * 1024) / PAGE_SZ);
|
||||
//let key_range = u64::MIN..u64::MAX;
|
||||
println!(
|
||||
"starting simulation with key range {:016X}-{:016X}",
|
||||
|
||||
@@ -25,7 +25,7 @@ use std::collections::{HashSet, VecDeque};
|
||||
use std::ops::Range;
|
||||
|
||||
use crate::helpers::{
|
||||
accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with,
|
||||
accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with, PAGE_SZ,
|
||||
};
|
||||
use crate::interface::*;
|
||||
use utils::lsn::Lsn;
|
||||
@@ -379,7 +379,7 @@ where
|
||||
.get_keyspace(&job.key_range, job.lsn_range.end, ctx)
|
||||
.await?,
|
||||
&self.shard_identity,
|
||||
) * 8192;
|
||||
) * PAGE_SZ;
|
||||
|
||||
let wal_size = job
|
||||
.input_layers
|
||||
@@ -441,7 +441,7 @@ where
|
||||
let mut window = KeyspaceWindow::new(
|
||||
E::Key::MIN..E::Key::MAX,
|
||||
keyspace,
|
||||
self.target_file_size / 8192,
|
||||
self.target_file_size / PAGE_SZ,
|
||||
);
|
||||
while let Some(key_range) = window.choose_next_image(&self.shard_identity) {
|
||||
new_jobs.push(CompactionJob::<E> {
|
||||
@@ -663,8 +663,8 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
// Sliding window through keyspace and values
|
||||
// This is used by over_with_images to decide on good split points
|
||||
/// Sliding window through keyspace and values for image layer
|
||||
/// This is used by [`LevelCompactionState::cover_with_images`] to decide on good split points
|
||||
struct KeyspaceWindow<K> {
|
||||
head: KeyspaceWindowHead<K>,
|
||||
|
||||
@@ -804,9 +804,9 @@ struct WindowElement<K> {
|
||||
accum_size: u64,
|
||||
}
|
||||
|
||||
// Sliding window through keyspace and values
|
||||
//
|
||||
// This is used to decide what layer to write next, from the beginning of the window.
|
||||
/// Sliding window through keyspace and values for delta layer tiling
|
||||
///
|
||||
/// This is used to decide which delta layer to write next.
|
||||
struct Window<K> {
|
||||
elems: VecDeque<WindowElement<K>>,
|
||||
|
||||
@@ -830,11 +830,13 @@ where
|
||||
fn feed(&mut self, key: K, size: u64) {
|
||||
let last_size;
|
||||
if let Some(last) = self.elems.back_mut() {
|
||||
assert!(last.last_key <= key);
|
||||
if key == last.last_key {
|
||||
last.accum_size += size;
|
||||
return;
|
||||
}
|
||||
// We require the keys to be strictly increasing for the window.
|
||||
// Keys should already have been deduplicated by `accum_key_values`
|
||||
assert!(
|
||||
last.last_key < key,
|
||||
"last_key(={}) >= key(={key})",
|
||||
last.last_key
|
||||
);
|
||||
last_size = last.accum_size;
|
||||
} else {
|
||||
last_size = 0;
|
||||
@@ -922,7 +924,7 @@ where
|
||||
// If we're willing to stretch it up to 1.25 target size, could we
|
||||
// gobble up the rest of the work? This avoids creating very small
|
||||
// "tail" layers at the end of the keyspace
|
||||
if !has_more && self.remain_size() < target_size * 5 / 3 {
|
||||
if !has_more && self.remain_size() < target_size * 5 / 4 {
|
||||
self.commit_upto(self.elems.len());
|
||||
} else {
|
||||
let delta_split_at = self.find_size_split(target_size);
|
||||
|
||||
@@ -16,6 +16,8 @@ use std::pin::Pin;
|
||||
use std::task::{ready, Poll};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
pub const PAGE_SZ: u64 = 8192;
|
||||
|
||||
pub fn keyspace_total_size<K>(
|
||||
keyspace: &CompactionKeySpace<K>,
|
||||
shard_identity: &ShardIdentity,
|
||||
|
||||
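The constant added here replaces the hard-coded 8192s used throughout the compaction code; a small hedged sketch of the two conversions involved (the function names are illustrative):

```rust
pub const PAGE_SZ: u64 = 8192;

// A target file size in bytes corresponds to this many 8 KiB pages ...
fn pages_for_target(target_file_size: u64) -> u64 {
    target_file_size / PAGE_SZ
}

// ... and an accumulated page count converts back to an on-disk size in bytes.
fn bytes_for_pages(accum_size: u64) -> u64 {
    accum_size * PAGE_SZ
}
```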
@@ -14,6 +14,7 @@ use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use std::sync::Mutex;
|
||||
|
||||
use crate::helpers::PAGE_SZ;
|
||||
use crate::helpers::{merge_delta_keys, overlaps_with};
|
||||
|
||||
use crate::interface;
|
||||
@@ -509,7 +510,7 @@ impl interface::CompactionJobExecutor for MockTimeline {
|
||||
let new_layer = Arc::new(MockImageLayer {
|
||||
key_range: key_range.clone(),
|
||||
lsn_range: lsn..lsn,
|
||||
file_size: accum_size * 8192,
|
||||
file_size: accum_size * PAGE_SZ,
|
||||
deleted: Mutex::new(false),
|
||||
});
|
||||
info!(
|
||||
|
||||
@@ -219,6 +219,7 @@ fn handle_metadata(
|
||||
let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
|
||||
println!("Current metadata:\n{meta:?}");
|
||||
let mut update_meta = false;
|
||||
// TODO: simplify this part
|
||||
if let Some(disk_consistent_lsn) = disk_consistent_lsn {
|
||||
meta = TimelineMetadata::new(
|
||||
*disk_consistent_lsn,
|
||||
|
||||
98
pageserver/pagebench/src/cmd/aux_files.rs
Normal file
98
pageserver/pagebench/src/cmd/aux_files.rs
Normal file
@@ -0,0 +1,98 @@
|
||||
use pageserver_api::models::{AuxFilePolicy, TenantConfig, TenantConfigRequest};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
/// Ingest aux files into the pageserver.
|
||||
#[derive(clap::Parser)]
|
||||
pub(crate) struct Args {
|
||||
#[clap(long, default_value = "http://localhost:9898")]
|
||||
mgmt_api_endpoint: String,
|
||||
#[clap(long, default_value = "postgres://postgres@localhost:64000")]
|
||||
page_service_connstring: String,
|
||||
#[clap(long)]
|
||||
pageserver_jwt: Option<String>,
|
||||
|
||||
targets: Option<Vec<TenantTimelineId>>,
|
||||
}
|
||||
|
||||
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
|
||||
let rt = tokio::runtime::Builder::new_multi_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let main_task = rt.spawn(main_impl(args));
|
||||
rt.block_on(main_task).unwrap()
|
||||
}
|
||||
|
||||
async fn main_impl(args: Args) -> anyhow::Result<()> {
|
||||
let args: &'static Args = Box::leak(Box::new(args));
|
||||
|
||||
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
|
||||
args.mgmt_api_endpoint.clone(),
|
||||
args.pageserver_jwt.as_deref(),
|
||||
));
|
||||
|
||||
// discover targets
|
||||
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
|
||||
&mgmt_api_client,
|
||||
crate::util::cli::targets::Spec {
|
||||
limit_to_first_n_targets: None,
|
||||
targets: {
|
||||
if let Some(targets) = &args.targets {
|
||||
if targets.len() != 1 {
|
||||
anyhow::bail!("must specify exactly one target");
|
||||
}
|
||||
Some(targets.clone())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
},
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
let timeline = timelines[0];
|
||||
let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id);
|
||||
let timeline_id = timeline.timeline_id;
|
||||
|
||||
println!("operating on timeline {}", timeline);
|
||||
|
||||
mgmt_api_client
|
||||
.tenant_config(&TenantConfigRequest {
|
||||
tenant_id: timeline.tenant_id,
|
||||
config: TenantConfig {
|
||||
switch_aux_file_policy: Some(AuxFilePolicy::V2),
|
||||
..Default::default()
|
||||
},
|
||||
})
|
||||
.await?;
|
||||
|
||||
for batch in 0..100 {
|
||||
let items = (0..100)
|
||||
.map(|id| {
|
||||
(
|
||||
format!("pg_logical/mappings/{:03}.{:03}", batch, id),
|
||||
format!("{:08}", id),
|
||||
)
|
||||
})
|
||||
.collect::<HashMap<_, _>>();
|
||||
let file_cnt = items.len();
|
||||
mgmt_api_client
|
||||
.ingest_aux_files(tenant_shard_id, timeline_id, items)
|
||||
.await?;
|
||||
println!("ingested {file_cnt} files");
|
||||
}
|
||||
|
||||
let files = mgmt_api_client
|
||||
.list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1))
|
||||
.await?;
|
||||
|
||||
println!("{} files found", files.len());
|
||||
|
||||
anyhow::Ok(())
|
||||
}
|
||||
@@ -14,6 +14,7 @@ mod util {
|
||||
|
||||
/// The pagebench CLI sub-commands, dispatched in [`main`] below.
|
||||
mod cmd {
|
||||
pub(super) mod aux_files;
|
||||
pub(super) mod basebackup;
|
||||
pub(super) mod getpage_latest_lsn;
|
||||
pub(super) mod ondemand_download_churn;
|
||||
@@ -27,6 +28,7 @@ enum Args {
|
||||
GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
|
||||
TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
|
||||
OndemandDownloadChurn(cmd::ondemand_download_churn::Args),
|
||||
AuxFiles(cmd::aux_files::Args),
|
||||
}
|
||||
|
||||
fn main() {
|
||||
@@ -46,6 +48,7 @@ fn main() {
|
||||
cmd::trigger_initial_size_calculation::main(args)
|
||||
}
|
||||
Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
|
||||
Args::AuxFiles(args) => cmd::aux_files::main(args),
|
||||
}
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
@@ -535,17 +535,11 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
}
|
||||
EvictionLayer::Secondary(layer) => {
|
||||
let file_size = layer.metadata.file_size();
|
||||
let tenant_manager = tenant_manager.clone();
|
||||
|
||||
js.spawn(async move {
|
||||
layer
|
||||
.secondary_tenant
|
||||
.evict_layer(
|
||||
tenant_manager.get_conf(),
|
||||
layer.timeline_id,
|
||||
layer.name,
|
||||
layer.metadata,
|
||||
)
|
||||
.evict_layer(layer.timeline_id, layer.name)
|
||||
.await;
|
||||
Ok(file_size)
|
||||
});
|
||||
|
||||
@@ -16,6 +16,8 @@ use hyper::header;
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use metrics::launch_timestamp::LaunchTimestamp;
|
||||
use pageserver_api::models::IngestAuxFilesRequest;
|
||||
use pageserver_api::models::ListAuxFilesRequest;
|
||||
use pageserver_api::models::LocationConfig;
|
||||
use pageserver_api::models::LocationConfigListResponse;
|
||||
use pageserver_api::models::ShardParameters;
|
||||
@@ -439,6 +441,8 @@ async fn build_timeline_info_common(
|
||||
state,
|
||||
|
||||
walreceiver_status,
|
||||
|
||||
last_aux_file_policy: timeline.last_aux_file_policy.load(),
|
||||
};
|
||||
Ok(info)
|
||||
}
|
||||
@@ -2329,6 +2333,71 @@ async fn get_utilization(
|
||||
.map_err(ApiError::InternalServerError)
|
||||
}
|
||||
|
||||
async fn list_aux_files(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let body: ListAuxFilesRequest = json_request(&mut request).await?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
|
||||
let process = || async move {
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let files = timeline.list_aux_files(body.lsn, &ctx).await?;
|
||||
Ok::<_, anyhow::Error>(files)
|
||||
};
|
||||
|
||||
match process().await {
|
||||
Ok(st) => json_response(StatusCode::OK, st),
|
||||
Err(err) => json_response(
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
ApiError::InternalServerError(err).to_string(),
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
async fn ingest_aux_files(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let body: IngestAuxFilesRequest = json_request(&mut request).await?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
|
||||
let process = || async move {
|
||||
let mut modification = timeline.begin_modification(Lsn(
|
||||
timeline.get_last_record_lsn().0 + 8
|
||||
) /* advance LSN by 8 */);
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
for (fname, content) in body.aux_files {
|
||||
modification
|
||||
.put_file(&fname, content.as_bytes(), &ctx)
|
||||
.await?;
|
||||
}
|
||||
modification.commit(&ctx).await?;
|
||||
Ok::<_, anyhow::Error>(())
|
||||
};
|
||||
|
||||
match process().await {
|
||||
Ok(st) => json_response(StatusCode::OK, st),
|
||||
Err(err) => Err(ApiError::InternalServerError(err)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Report on the largest tenants on this pageserver, for the storage controller to identify
|
||||
/// candidates for splitting
|
||||
async fn post_top_tenants(
|
||||
@@ -2706,6 +2775,14 @@ pub fn make_router(
|
||||
)
|
||||
.put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
|
||||
.get("/v1/utilization", |r| api_handler(r, get_utilization))
|
||||
.post(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files",
|
||||
|r| testing_api_handler("ingest_aux_files", r, ingest_aux_files),
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/list_aux_files",
|
||||
|r| testing_api_handler("list_aux_files", r, list_aux_files),
|
||||
)
|
||||
.post("/v1/top_tenants", |r| api_handler(r, post_top_tenants))
|
||||
.any(handler_404))
|
||||
}
|
||||
|
||||
@@ -35,7 +35,7 @@ use std::ops::ControlFlow;
|
||||
use std::ops::Range;
|
||||
use strum::IntoEnumIterator;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, trace, warn};
|
||||
use tracing::{debug, info, trace, warn};
|
||||
use utils::bin_ser::DeserializeError;
|
||||
use utils::vec_map::{VecMap, VecMapOrdering};
|
||||
use utils::{bin_ser::BeSer, lsn::Lsn};
|
||||
@@ -718,10 +718,11 @@ impl Timeline {
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
|
||||
match self.get_switch_aux_file_policy() {
|
||||
AuxFilePolicy::V1 => self.list_aux_files_v1(lsn, ctx).await,
|
||||
AuxFilePolicy::V2 => self.list_aux_files_v2(lsn, ctx).await,
|
||||
AuxFilePolicy::CrossValidation => {
|
||||
let current_policy = self.last_aux_file_policy.load();
|
||||
match current_policy {
|
||||
Some(AuxFilePolicy::V1) | None => self.list_aux_files_v1(lsn, ctx).await,
|
||||
Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await,
|
||||
Some(AuxFilePolicy::CrossValidation) => {
|
||||
let v1_result = self.list_aux_files_v1(lsn, ctx).await;
|
||||
let v2_result = self.list_aux_files_v2(lsn, ctx).await;
|
||||
match (v1_result, v2_result) {
|
||||
@@ -1469,7 +1470,27 @@ impl<'a> DatadirModification<'a> {
|
||||
content: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let policy = self.tline.get_switch_aux_file_policy();
|
||||
let switch_policy = self.tline.get_switch_aux_file_policy();
|
||||
|
||||
let policy = {
|
||||
let current_policy = self.tline.last_aux_file_policy.load();
|
||||
// Allowed switch path:
|
||||
// * no aux files -> v1/v2/cross-validation
|
||||
// * cross-validation->v2
|
||||
if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) {
|
||||
self.tline.last_aux_file_policy.store(Some(switch_policy));
|
||||
self.tline
|
||||
.remote_client
|
||||
.schedule_index_upload_for_aux_file_policy_update(Some(switch_policy))?;
|
||||
info!(current=?current_policy, next=?switch_policy, "switching aux file policy");
|
||||
switch_policy
|
||||
} else {
|
||||
// This branch handles invalid migration paths, as well as the case where switch_policy == current_policy.
|
||||
// In practice, because the migration rules always allow unspecified -> *, this unwrap_or will never be hit.
|
||||
current_policy.unwrap_or(AuxFilePolicy::default_tenant_config())
|
||||
}
|
||||
};
|
||||
|
||||
if let AuxFilePolicy::V2 | AuxFilePolicy::CrossValidation = policy {
|
||||
let key = aux_file::encode_aux_file_key(path);
|
||||
// retrieve the key from the engine
|
||||
|
||||
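The policy selection above can be read as a small pure decision function; a hedged sketch (the helper name `resolve_write_policy` is hypothetical) of the rule spelled out in the comments, where only `unset -> v1/v2/cross-validation` and `cross-validation -> v2` actually switch the persisted policy:

```rust
use pageserver_api::models::AuxFilePolicy;

// Given the policy recorded in index_part.json (if any) and the tenant's wanted
// switch_aux_file_policy, decide which policy this write should use.
fn resolve_write_policy(
    current_policy: Option<AuxFilePolicy>,
    switch_policy: AuxFilePolicy,
) -> AuxFilePolicy {
    if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) {
        // The switch takes effect; the caller persists it to index_part.json.
        switch_policy
    } else {
        // Invalid switch, or no change requested: keep whatever is already in use.
        current_policy.unwrap_or(AuxFilePolicy::default_tenant_config())
    }
}
```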
@@ -20,6 +20,7 @@ use futures::stream::FuturesUnordered;
|
||||
use futures::FutureExt;
|
||||
use futures::StreamExt;
|
||||
use pageserver_api::models;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use pageserver_api::models::TimelineState;
|
||||
use pageserver_api::models::TopTenantShardItem;
|
||||
use pageserver_api::models::WalRedoManagerStatus;
|
||||
@@ -529,6 +530,7 @@ impl Tenant {
|
||||
index_part: Option<IndexPart>,
|
||||
metadata: TimelineMetadata,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
last_aux_file_policy: Option<AuxFilePolicy>,
|
||||
_ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let tenant_id = self.tenant_shard_id;
|
||||
@@ -539,6 +541,10 @@ impl Tenant {
|
||||
ancestor.clone(),
|
||||
resources,
|
||||
CreateTimelineCause::Load,
|
||||
// This could be derived from ancestor branch + index part. Though the only caller of `timeline_init_and_sync` is `load_remote_timeline`,
|
||||
// there will potentially be other caller of this function in the future, and we don't know whether `index_part` or `ancestor` takes precedence.
|
||||
// Therefore, we pass this field explicitly for now, and remove it once we fully migrate to aux file v2.
|
||||
last_aux_file_policy,
|
||||
)?;
|
||||
let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
|
||||
anyhow::ensure!(
|
||||
@@ -553,6 +559,10 @@ impl Tenant {
|
||||
|
||||
if let Some(index_part) = index_part.as_ref() {
|
||||
timeline.remote_client.init_upload_queue(index_part)?;
|
||||
|
||||
timeline
|
||||
.last_aux_file_policy
|
||||
.store(index_part.last_aux_file_policy());
|
||||
} else {
|
||||
// No data on the remote storage, but we have local metadata file. We can end up
|
||||
// here with timeline_create being interrupted before finishing index part upload.
|
||||
@@ -1173,12 +1183,15 @@ impl Tenant {
|
||||
None
|
||||
};
|
||||
|
||||
let last_aux_file_policy = index_part.last_aux_file_policy();
|
||||
|
||||
self.timeline_init_and_sync(
|
||||
timeline_id,
|
||||
resources,
|
||||
Some(index_part),
|
||||
remote_metadata,
|
||||
ancestor,
|
||||
last_aux_file_policy,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
@@ -1358,6 +1371,7 @@ impl Tenant {
|
||||
create_guard,
|
||||
initdb_lsn,
|
||||
None,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
}
|
||||
@@ -2441,6 +2455,7 @@ impl Tenant {
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
resources: TimelineResources,
|
||||
cause: CreateTimelineCause,
|
||||
last_aux_file_policy: Option<AuxFilePolicy>,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
let state = match cause {
|
||||
CreateTimelineCause::Load => {
|
||||
@@ -2469,6 +2484,7 @@ impl Tenant {
|
||||
resources,
|
||||
pg_version,
|
||||
state,
|
||||
last_aux_file_policy,
|
||||
self.cancel.child_token(),
|
||||
);
|
||||
|
||||
@@ -3119,6 +3135,7 @@ impl Tenant {
|
||||
timeline_create_guard,
|
||||
start_lsn + 1,
|
||||
Some(Arc::clone(src_timeline)),
|
||||
src_timeline.last_aux_file_policy.load(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -3312,6 +3329,7 @@ impl Tenant {
|
||||
timeline_create_guard,
|
||||
pgdata_lsn,
|
||||
None,
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -3383,6 +3401,7 @@ impl Tenant {
|
||||
create_guard: TimelineCreateGuard<'a>,
|
||||
start_lsn: Lsn,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
last_aux_file_policy: Option<AuxFilePolicy>,
|
||||
) -> anyhow::Result<UninitializedTimeline> {
|
||||
let tenant_shard_id = self.tenant_shard_id;
|
||||
|
||||
@@ -3398,6 +3417,7 @@ impl Tenant {
|
||||
ancestor,
|
||||
resources,
|
||||
CreateTimelineCause::Load,
|
||||
last_aux_file_policy,
|
||||
)
|
||||
.context("Failed to create timeline data structure")?;
|
||||
|
||||
@@ -5621,4 +5641,280 @@ mod tests {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_branch_copies_dirty_aux_file_flag() {
|
||||
let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag").unwrap();
|
||||
|
||||
// the default aux file policy to switch is v1 if not set by the admins
|
||||
assert_eq!(
|
||||
harness.tenant_conf.switch_aux_file_policy,
|
||||
AuxFilePolicy::V1
|
||||
);
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let mut lsn = Lsn(0x08);
|
||||
|
||||
let tline: Arc<Timeline> = tenant
|
||||
.create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// no aux file is written at this point, so the persistent flag should be unset
|
||||
assert_eq!(tline.last_aux_file_policy.load(), None);
|
||||
|
||||
{
|
||||
lsn += 8;
|
||||
let mut modification = tline.begin_modification(lsn);
|
||||
modification
|
||||
.put_file("pg_logical/mappings/test1", b"first", &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
modification.commit(&ctx).await.unwrap();
|
||||
}
|
||||
|
||||
// there is no tenant manager to pass the configuration through, so let's mimic it
|
||||
tenant.set_new_location_config(
|
||||
AttachedTenantConf::try_from(LocationConf::attached_single(
|
||||
TenantConfOpt {
|
||||
switch_aux_file_policy: Some(AuxFilePolicy::V2),
|
||||
..Default::default()
|
||||
},
|
||||
tenant.generation,
|
||||
&pageserver_api::models::ShardParameters::default(),
|
||||
))
|
||||
.unwrap(),
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
tline.get_switch_aux_file_policy(),
|
||||
AuxFilePolicy::V2,
|
||||
"wanted state has been updated"
|
||||
);
|
||||
assert_eq!(
|
||||
tline.last_aux_file_policy.load(),
|
||||
Some(AuxFilePolicy::V1),
|
||||
"aux file is written with switch_aux_file_policy unset (which is v1), so we should keep v1"
|
||||
);
|
||||
|
||||
// we can read everything from the storage
|
||||
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
|
||||
assert_eq!(
|
||||
files.get("pg_logical/mappings/test1"),
|
||||
Some(&bytes::Bytes::from_static(b"first"))
|
||||
);
|
||||
|
||||
{
|
||||
lsn += 8;
|
||||
let mut modification = tline.begin_modification(lsn);
|
||||
modification
|
||||
.put_file("pg_logical/mappings/test2", b"second", &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
modification.commit(&ctx).await.unwrap();
|
||||
}
|
||||
|
||||
assert_eq!(
|
||||
tline.last_aux_file_policy.load(),
|
||||
Some(AuxFilePolicy::V1),
|
||||
"keep v1 storage format when new files are written"
|
||||
);
|
||||
|
||||
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
|
||||
assert_eq!(
|
||||
files.get("pg_logical/mappings/test2"),
|
||||
Some(&bytes::Bytes::from_static(b"second"))
|
||||
);
|
||||
|
||||
let child = tenant
|
||||
.branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// child copies the last flag even if that is not on remote storage yet
|
||||
assert_eq!(child.get_switch_aux_file_policy(), AuxFilePolicy::V2);
|
||||
assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V1));
|
||||
|
||||
let files = child.list_aux_files(lsn, &ctx).await.unwrap();
|
||||
assert_eq!(files.get("pg_logical/mappings/test1"), None);
|
||||
assert_eq!(files.get("pg_logical/mappings/test2"), None);
|
||||
|
||||
// even if we crash here without flushing the parent timeline with its new
|
||||
// last_aux_file_policy we are safe, because child was never meant to access ancestor's
|
||||
// files. The ancestor can even safely switch back to V1 as part of a migration.
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn aux_file_policy_switch() {
|
||||
let mut harness = TenantHarness::create("aux_file_policy_switch").unwrap();
|
||||
harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::CrossValidation; // set to cross-validation mode
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let mut lsn = Lsn(0x08);
|
||||
|
||||
let tline: Arc<Timeline> = tenant
|
||||
.create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
tline.last_aux_file_policy.load(),
|
||||
None,
|
||||
"no aux file is written so it should be unset"
|
||||
);
|
||||
|
||||
{
|
||||
lsn += 8;
|
||||
let mut modification = tline.begin_modification(lsn);
|
||||
modification
|
||||
.put_file("pg_logical/mappings/test1", b"first", &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
modification.commit(&ctx).await.unwrap();
|
||||
}
|
||||
|
||||
// there is no tenant manager to pass the configuration through, so let's mimic it
|
||||
tenant.set_new_location_config(
|
||||
AttachedTenantConf::try_from(LocationConf::attached_single(
|
||||
TenantConfOpt {
|
||||
switch_aux_file_policy: Some(AuxFilePolicy::V2),
|
||||
..Default::default()
|
||||
},
|
||||
tenant.generation,
|
||||
&pageserver_api::models::ShardParameters::default(),
|
||||
))
|
||||
.unwrap(),
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
tline.get_switch_aux_file_policy(),
|
||||
AuxFilePolicy::V2,
|
||||
"wanted state has been updated"
|
||||
);
|
||||
assert_eq!(
|
||||
tline.last_aux_file_policy.load(),
|
||||
Some(AuxFilePolicy::CrossValidation),
|
||||
"dirty index_part.json reflected state is yet to be updated"
|
||||
);
|
||||
|
||||
// we can still read the auxfile v1 before we ingest anything new
|
||||
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
|
||||
assert_eq!(
|
||||
files.get("pg_logical/mappings/test1"),
|
||||
Some(&bytes::Bytes::from_static(b"first"))
|
||||
);
|
||||
|
||||
{
|
||||
lsn += 8;
|
||||
let mut modification = tline.begin_modification(lsn);
|
||||
modification
|
||||
.put_file("pg_logical/mappings/test2", b"second", &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
modification.commit(&ctx).await.unwrap();
|
||||
}
|
||||
|
||||
assert_eq!(
|
||||
tline.last_aux_file_policy.load(),
|
||||
Some(AuxFilePolicy::V2),
|
||||
"ingesting a file should apply the wanted switch state when applicable"
|
||||
);
|
||||
|
||||
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
|
||||
assert_eq!(
|
||||
files.get("pg_logical/mappings/test1"),
|
||||
Some(&bytes::Bytes::from_static(b"first")),
|
||||
"cross validation writes to both v1 and v2 so this should be available in v2"
|
||||
);
|
||||
assert_eq!(
|
||||
files.get("pg_logical/mappings/test2"),
|
||||
Some(&bytes::Bytes::from_static(b"second"))
|
||||
);
|
||||
|
||||
// mimic again by trying to flip it from V2 to V1 (not switched to while ingesting a file)
|
||||
tenant.set_new_location_config(
|
||||
AttachedTenantConf::try_from(LocationConf::attached_single(
|
||||
TenantConfOpt {
|
||||
switch_aux_file_policy: Some(AuxFilePolicy::V1),
|
||||
..Default::default()
|
||||
},
|
||||
tenant.generation,
|
||||
&pageserver_api::models::ShardParameters::default(),
|
||||
))
|
||||
.unwrap(),
|
||||
);
|
||||
|
||||
{
|
||||
lsn += 8;
|
||||
let mut modification = tline.begin_modification(lsn);
|
||||
modification
|
||||
.put_file("pg_logical/mappings/test2", b"third", &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
modification.commit(&ctx).await.unwrap();
|
||||
}
|
||||
|
||||
assert_eq!(
|
||||
tline.get_switch_aux_file_policy(),
|
||||
AuxFilePolicy::V1,
|
||||
"wanted state has been updated again, even if invalid request"
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
tline.last_aux_file_policy.load(),
|
||||
Some(AuxFilePolicy::V2),
|
||||
"ingesting a file should apply the wanted switch state when applicable"
|
||||
);
|
||||
|
||||
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
|
||||
assert_eq!(
|
||||
files.get("pg_logical/mappings/test1"),
|
||||
Some(&bytes::Bytes::from_static(b"first"))
|
||||
);
|
||||
assert_eq!(
|
||||
files.get("pg_logical/mappings/test2"),
|
||||
Some(&bytes::Bytes::from_static(b"third"))
|
||||
);
|
||||
|
||||
// mimic again by trying to flip it from V1 to V2 (not switched to while ingesting a file)
|
||||
tenant.set_new_location_config(
|
||||
AttachedTenantConf::try_from(LocationConf::attached_single(
|
||||
TenantConfOpt {
|
||||
switch_aux_file_policy: Some(AuxFilePolicy::V2),
|
||||
..Default::default()
|
||||
},
|
||||
tenant.generation,
|
||||
&pageserver_api::models::ShardParameters::default(),
|
||||
))
|
||||
.unwrap(),
|
||||
);
|
||||
|
||||
{
|
||||
lsn += 8;
|
||||
let mut modification = tline.begin_modification(lsn);
|
||||
modification
|
||||
.put_file("pg_logical/mappings/test3", b"last", &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
modification.commit(&ctx).await.unwrap();
|
||||
}
|
||||
|
||||
assert_eq!(tline.get_switch_aux_file_policy(), AuxFilePolicy::V2);
|
||||
|
||||
assert_eq!(tline.last_aux_file_policy.load(), Some(AuxFilePolicy::V2));
|
||||
|
||||
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
|
||||
assert_eq!(
|
||||
files.get("pg_logical/mappings/test1"),
|
||||
Some(&bytes::Bytes::from_static(b"first"))
|
||||
);
|
||||
assert_eq!(
|
||||
files.get("pg_logical/mappings/test2"),
|
||||
Some(&bytes::Bytes::from_static(b"third"))
|
||||
);
|
||||
assert_eq!(
|
||||
files.get("pg_logical/mappings/test3"),
|
||||
Some(&bytes::Bytes::from_static(b"last"))
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -373,6 +373,8 @@ pub struct TenantConf {

    /// Switch to a new aux file policy. Switching this flag requires that the user has not written any aux file into
    /// the storage before, and the flag cannot be switched back; otherwise there will be data corruption.
    /// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux
    /// file is written.
    pub switch_aux_file_policy: AuxFilePolicy,
}

@@ -574,7 +576,7 @@ impl Default for TenantConf {
            lazy_slru_download: false,
            timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
            image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
            switch_aux_file_policy: AuxFilePolicy::V1,
            switch_aux_file_policy: AuxFilePolicy::default_tenant_config(),
        }
    }
}

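The acceptance rules that the test above exercises can be summed up in a small decision helper. The sketch below is illustrative only, not the pageserver's actual implementation: the enum mirrors `AuxFilePolicy`, but the helper name and the exact treatment of `CrossValidation` are assumptions inferred from the test's expectations and the doc comment above.

```
/// Illustrative sketch: may a requested aux-file policy take effect, given the
/// policy already persisted in index_part.json (None = no aux file written yet)?
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum AuxFilePolicy {
    V1,
    V2,
    CrossValidation,
}

fn switch_is_allowed(persisted: Option<AuxFilePolicy>, requested: AuxFilePolicy) -> bool {
    match (persisted, requested) {
        // Nothing written yet: any policy may be adopted on the first ingestion.
        (None, _) => true,
        // No-op switches are always fine.
        (Some(cur), req) if cur == req => true,
        // CrossValidation writes both formats, so moving on to V2 loses nothing
        // (this is the switch the test above performs).
        (Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2) => true,
        // Anything else (e.g. V2 -> V1) would lose or corrupt data and is not applied.
        _ => false,
    }
}

fn main() {
    assert!(switch_is_allowed(None, AuxFilePolicy::V2));
    assert!(switch_is_allowed(Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2));
    assert!(!switch_is_allowed(Some(AuxFilePolicy::V2), AuxFilePolicy::V1));
}
```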
@@ -189,6 +189,7 @@ use camino::Utf8Path;
use chrono::{NaiveDateTime, Utc};

pub(crate) use download::download_initdb_tar_zst;
use pageserver_api::models::AuxFilePolicy;
use pageserver_api::shard::{ShardIndex, TenantShardId};
use scopeguard::ScopeGuard;
use tokio_util::sync::CancellationToken;

@@ -518,6 +519,7 @@ impl RemoteTimelineClient {
        &self,
        layer_file_name: &LayerName,
        layer_metadata: &LayerFileMetadata,
        local_path: &Utf8Path,
        cancel: &CancellationToken,
        ctx: &RequestContext,
    ) -> anyhow::Result<u64> {

@@ -536,6 +538,7 @@ impl RemoteTimelineClient {
            self.timeline_id,
            layer_file_name,
            layer_metadata,
            local_path,
            cancel,
            ctx,
        )

@@ -609,6 +612,17 @@ impl RemoteTimelineClient {
        Ok(())
    }

    /// Launch an index-file upload operation in the background, with only the aux_file_policy flag updated.
    pub(crate) fn schedule_index_upload_for_aux_file_policy_update(
        self: &Arc<Self>,
        last_aux_file_policy: Option<AuxFilePolicy>,
    ) -> anyhow::Result<()> {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;
        upload_queue.last_aux_file_policy = last_aux_file_policy;
        self.schedule_index_upload(upload_queue);
        Ok(())
    }

    ///
    /// Launch an index-file upload operation in the background, if necessary.
    ///

@@ -1849,6 +1863,7 @@ impl RemoteTimelineClient {
            dangling_files: HashMap::default(),
            shutting_down: false,
            shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
            last_aux_file_policy: initialized.last_aux_file_policy,
        };

        let upload_queue = std::mem::replace(

@@ -21,7 +21,6 @@ use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
use crate::tenant::storage_layer::layer::local_layer_path;
use crate::tenant::storage_layer::LayerName;
use crate::tenant::Generation;
use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};

@@ -50,19 +49,13 @@ pub async fn download_layer_file<'a>(
    timeline_id: TimelineId,
    layer_file_name: &'a LayerName,
    layer_metadata: &'a LayerFileMetadata,
    local_path: &Utf8Path,
    cancel: &CancellationToken,
    ctx: &RequestContext,
) -> Result<u64, DownloadError> {
    debug_assert_current_span_has_tenant_and_timeline_id();

    let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id);
    let local_path = local_layer_path(
        conf,
        &tenant_shard_id,
        &timeline_id,
        layer_file_name,
        &layer_metadata.generation,
    );

    let remote_path = remote_layer_path(
        &tenant_shard_id.tenant_id,

@@ -82,7 +75,7 @@ pub async fn download_layer_file<'a>(
    // For more context about durable_rename check this email from postgres mailing list:
    // https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
    // If pageserver crashes the temp file will be deleted on startup and re-downloaded.
    let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
    let temp_file_path = path_with_suffix_extension(local_path, TEMP_DOWNLOAD_EXTENSION);

    let bytes_amount = download_retry(
        || async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await },

@@ -5,6 +5,7 @@
use std::collections::HashMap;

use chrono::NaiveDateTime;
use pageserver_api::models::AuxFilePolicy;
use serde::{Deserialize, Serialize};
use utils::id::TimelineId;

@@ -88,6 +89,16 @@ pub struct IndexPart {

    #[serde(default)]
    pub(crate) lineage: Lineage,

    /// Describes the kind of aux files stored in the timeline.
    ///
    /// The value is modified during file ingestion when the latest wanted value communicated via tenant config is applied if it is acceptable.
    /// A V1 setting after V2 files have been committed is not accepted.
    ///
    /// None means no aux files have been written to the storage before the point
    /// when this flag is introduced.
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub(crate) last_aux_file_policy: Option<AuxFilePolicy>,
}

impl IndexPart {

@@ -101,10 +112,11 @@ impl IndexPart {
    /// is always generated from the keys of `layer_metadata`)
    /// - 4: timeline_layers is fully removed.
    /// - 5: lineage was added
    const LATEST_VERSION: usize = 5;
    /// - 6: last_aux_file_policy is added.
    const LATEST_VERSION: usize = 6;

    // Versions we may see when reading from a bucket.
    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5];
    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6];

    pub const FILE_NAME: &'static str = "index_part.json";

@@ -113,6 +125,7 @@ impl IndexPart {
        disk_consistent_lsn: Lsn,
        metadata: TimelineMetadata,
        lineage: Lineage,
        last_aux_file_policy: Option<AuxFilePolicy>,
    ) -> Self {
        let layer_metadata = layers_and_metadata
            .iter()

@@ -126,6 +139,7 @@ impl IndexPart {
            metadata,
            deleted_at: None,
            lineage,
            last_aux_file_policy,
        }
    }

@@ -155,8 +169,13 @@ impl IndexPart {
            example_metadata.disk_consistent_lsn(),
            example_metadata,
            Default::default(),
            Some(AuxFilePolicy::V1),
        )
    }

    pub(crate) fn last_aux_file_policy(&self) -> Option<AuxFilePolicy> {
        self.last_aux_file_policy
    }
}

impl From<&UploadQueueInitialized> for IndexPart {
@@ -165,7 +184,13 @@ impl From<&UploadQueueInitialized> for IndexPart {
        let metadata = uq.latest_metadata.clone();
        let lineage = uq.latest_lineage.clone();

        Self::new(&uq.latest_files, disk_consistent_lsn, metadata, lineage)
        Self::new(
            &uq.latest_files,
            disk_consistent_lsn,
            metadata,
            lineage,
            uq.last_aux_file_policy,
        )
    }
}

|
||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
deleted_at: None,
|
||||
lineage: Lineage::default(),
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
||||
@@ -340,6 +366,7 @@ mod tests {
|
||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
deleted_at: None,
|
||||
lineage: Lineage::default(),
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
||||
@@ -383,6 +410,7 @@ mod tests {
|
||||
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
|
||||
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
|
||||
lineage: Lineage::default(),
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
||||
@@ -428,6 +456,7 @@ mod tests {
|
||||
.unwrap(),
|
||||
deleted_at: None,
|
||||
lineage: Lineage::default(),
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
|
||||
let empty_layers_parsed = IndexPart::from_s3_bytes(empty_layers_json.as_bytes()).unwrap();
|
||||
@@ -468,6 +497,7 @@ mod tests {
|
||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
|
||||
lineage: Lineage::default(),
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
||||
@@ -511,6 +541,57 @@ mod tests {
|
||||
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
|
||||
original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
|
||||
},
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn v6_indexpart_is_parsed() {
|
||||
let example = r#"{
|
||||
"version":6,
|
||||
"layer_metadata":{
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
|
||||
},
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
|
||||
"deleted_at": "2023-07-31T09:00:00.123",
|
||||
"lineage":{
|
||||
"original_ancestor":["e2bfd8c633d713d279e6fcd2bcc15b6d","0/15A7618","2024-05-07T18:52:36.322426563"],
|
||||
"reparenting_history":["e1bfd8c633d713d279e6fcd2bcc15b6d"]
|
||||
},
|
||||
"last_aux_file_policy": "V2"
|
||||
}"#;
|
||||
|
||||
let expected = IndexPart {
|
||||
version: 6,
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: 9007199254741001,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
|
||||
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
|
||||
lineage: Lineage {
|
||||
reparenting_history_truncated: false,
|
||||
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
|
||||
original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
|
||||
},
|
||||
last_aux_file_policy: Some(AuxFilePolicy::V2),
|
||||
};
|
||||
|
||||
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
||||
|
||||
@@ -6,11 +6,9 @@ mod scheduler;
|
||||
use std::{sync::Arc, time::SystemTime};
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
context::RequestContext,
|
||||
disk_usage_eviction_task::DiskUsageEvictionInfo,
|
||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||
virtual_file::MaybeFatalIo,
|
||||
};
|
||||
|
||||
use self::{
|
||||
@@ -21,9 +19,8 @@ use self::{
|
||||
use super::{
|
||||
config::{SecondaryLocationConfig, TenantConfOpt},
|
||||
mgr::TenantManager,
|
||||
remote_timeline_client::LayerFileMetadata,
|
||||
span::debug_assert_current_span_has_tenant_id,
|
||||
storage_layer::{layer::local_layer_path, LayerName},
|
||||
storage_layer::LayerName,
|
||||
};
|
||||
|
||||
use pageserver_api::{
|
||||
@@ -178,13 +175,7 @@ impl SecondaryTenant {
|
||||
|
||||
/// Cancellation safe, but on cancellation the eviction will go through
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline_id, name=%name))]
|
||||
pub(crate) async fn evict_layer(
|
||||
self: &Arc<Self>,
|
||||
conf: &PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
name: LayerName,
|
||||
metadata: LayerFileMetadata,
|
||||
) {
|
||||
pub(crate) async fn evict_layer(self: &Arc<Self>, timeline_id: TimelineId, name: LayerName) {
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
|
||||
let guard = match self.gate.enter() {
|
||||
@@ -197,41 +188,11 @@ impl SecondaryTenant {
|
||||
|
||||
let now = SystemTime::now();
|
||||
|
||||
let local_path = local_layer_path(
|
||||
conf,
|
||||
&self.tenant_shard_id,
|
||||
&timeline_id,
|
||||
&name,
|
||||
&metadata.generation,
|
||||
);
|
||||
|
||||
let this = self.clone();
|
||||
|
||||
// spawn it to be cancellation safe
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let _guard = guard;
|
||||
// We tolerate ENOENT, because between planning eviction and executing
|
||||
// it, the secondary downloader could have seen an updated heatmap that
|
||||
// resulted in a layer being deleted.
|
||||
// Other local I/O errors are process-fatal: these should never happen.
|
||||
let deleted = std::fs::remove_file(local_path);
|
||||
|
||||
let not_found = deleted
|
||||
.as_ref()
|
||||
.is_err_and(|x| x.kind() == std::io::ErrorKind::NotFound);
|
||||
|
||||
let deleted = if not_found {
|
||||
false
|
||||
} else {
|
||||
deleted
|
||||
.map(|()| true)
|
||||
.fatal_err("Deleting layer during eviction")
|
||||
};
|
||||
|
||||
if !deleted {
|
||||
// skip updating accounting and putting perhaps later timestamp
|
||||
return;
|
||||
}
|
||||
|
||||
// Update the timeline's state. This does not have to be synchronized with
|
||||
// the download process, because:
|
||||
@@ -250,8 +211,15 @@ impl SecondaryTenant {
|
||||
// of the cache.
|
||||
let mut detail = this.detail.lock().unwrap();
|
||||
if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
|
||||
timeline_detail.on_disk_layers.remove(&name);
|
||||
timeline_detail.evicted_at.insert(name, now);
|
||||
let removed = timeline_detail.on_disk_layers.remove(&name);
|
||||
|
||||
// We might race with removal of the same layer during downloads, if it was removed
|
||||
// from the heatmap. If we see that the OnDiskState is gone, then no need to
|
||||
// do a physical deletion or store in evicted_at.
|
||||
if let Some(removed) = removed {
|
||||
removed.remove_blocking();
|
||||
timeline_detail.evicted_at.insert(name, now);
|
||||
}
|
||||
}
|
||||
})
|
||||
.await
|
||||
|
||||
@@ -111,6 +111,7 @@ struct SecondaryDownloader {
pub(super) struct OnDiskState {
    metadata: LayerFileMetadata,
    access_time: SystemTime,
    local_path: Utf8PathBuf,
}

impl OnDiskState {
@@ -121,12 +122,26 @@ impl OnDiskState {
        _ame: LayerName,
        metadata: LayerFileMetadata,
        access_time: SystemTime,
        local_path: Utf8PathBuf,
    ) -> Self {
        Self {
            metadata,
            access_time,
            local_path,
        }
    }

    // This is infallible, because all errors are either acceptable (ENOENT), or totally
    // unexpected (fatal).
    pub(super) fn remove_blocking(&self) {
        // We tolerate ENOENT, because between planning eviction and executing
        // it, the secondary downloader could have seen an updated heatmap that
        // resulted in a layer being deleted.
        // Other local I/O errors are process-fatal: these should never happen.
        std::fs::remove_file(&self.local_path)
            .or_else(fs_ext::ignore_not_found)
            .fatal_err("Deleting secondary layer")
    }
}

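The `ignore_not_found` helper used above is not shown in this diff; judging from the call site, it converts a `NotFound` error into `Ok(())` and passes every other error through. A standalone sketch of that pattern using only the standard library (the helper name and signature here are assumptions drawn from the call site, not the actual `fs_ext` code) looks like:

```
use std::io;
use std::path::Path;

// Assumed shape of a helper like fs_ext::ignore_not_found: swallow ENOENT, keep other errors.
fn ignore_not_found(e: io::Error) -> io::Result<()> {
    if e.kind() == io::ErrorKind::NotFound {
        Ok(())
    } else {
        Err(e)
    }
}

// Remove a file, treating "already gone" as success. Mirrors the eviction path above,
// where a concurrent download or heatmap update may have deleted the layer first.
fn remove_if_exists(path: &Path) -> io::Result<()> {
    std::fs::remove_file(path).or_else(ignore_not_found)
}

fn main() -> io::Result<()> {
    // Removing a path that does not exist is not an error under this policy.
    remove_if_exists(Path::new("/tmp/some-layer-that-may-not-exist"))?;
    Ok(())
}
```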
#[derive(Debug, Clone, Default)]
|
||||
@@ -816,20 +831,12 @@ impl<'a> TenantDownloader<'a> {
|
||||
if cfg!(debug_assertions) {
|
||||
// Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think
|
||||
// are already present on disk are really there.
|
||||
let local_path = local_layer_path(
|
||||
self.conf,
|
||||
tenant_shard_id,
|
||||
&timeline.timeline_id,
|
||||
&layer.name,
|
||||
&layer.metadata.generation,
|
||||
);
|
||||
|
||||
match tokio::fs::metadata(&local_path).await {
|
||||
match tokio::fs::metadata(&on_disk.local_path).await {
|
||||
Ok(meta) => {
|
||||
tracing::debug!(
|
||||
"Layer {} present at {}, size {}",
|
||||
layer.name,
|
||||
local_path,
|
||||
on_disk.local_path,
|
||||
meta.len(),
|
||||
);
|
||||
}
|
||||
@@ -837,7 +844,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
tracing::warn!(
|
||||
"Layer {} not found at {} ({})",
|
||||
layer.name,
|
||||
local_path,
|
||||
on_disk.local_path,
|
||||
e
|
||||
);
|
||||
debug_assert!(false);
|
||||
@@ -926,6 +933,13 @@ impl<'a> TenantDownloader<'a> {
|
||||
v.get_mut().access_time = t.access_time;
|
||||
}
|
||||
Entry::Vacant(e) => {
|
||||
let local_path = local_layer_path(
|
||||
self.conf,
|
||||
tenant_shard_id,
|
||||
&timeline.timeline_id,
|
||||
&t.name,
|
||||
&t.metadata.generation,
|
||||
);
|
||||
e.insert(OnDiskState::new(
|
||||
self.conf,
|
||||
tenant_shard_id,
|
||||
@@ -933,6 +947,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
t.name,
|
||||
LayerFileMetadata::from(&t.metadata),
|
||||
t.access_time,
|
||||
local_path,
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -955,6 +970,14 @@ impl<'a> TenantDownloader<'a> {
|
||||
&self.secondary_state.cancel
|
||||
);
|
||||
|
||||
let local_path = local_layer_path(
|
||||
self.conf,
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
&layer.name,
|
||||
&layer.metadata.generation,
|
||||
);
|
||||
|
||||
// Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
|
||||
let downloaded_bytes = match download_layer_file(
|
||||
self.conf,
|
||||
@@ -963,6 +986,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
*timeline_id,
|
||||
&layer.name,
|
||||
&LayerFileMetadata::from(&layer.metadata),
|
||||
&local_path,
|
||||
&self.secondary_state.cancel,
|
||||
ctx,
|
||||
)
|
||||
@@ -1116,6 +1140,7 @@ async fn init_timeline_state(
|
||||
name,
|
||||
LayerFileMetadata::from(&remote_meta.metadata),
|
||||
remote_meta.access_time,
|
||||
file_path,
|
||||
),
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1108,6 +1108,7 @@ impl LayerInner {
|
||||
.download_layer_file(
|
||||
&self.desc.layer_name(),
|
||||
&self.metadata(),
|
||||
&self.path,
|
||||
&timeline.cancel,
|
||||
ctx,
|
||||
)
|
||||
|
||||
@@ -23,7 +23,7 @@ use pageserver_api::{
|
||||
},
|
||||
keyspace::{KeySpaceAccum, SparseKeyPartitioning},
|
||||
models::{
|
||||
AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo,
|
||||
AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo,
|
||||
DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo,
|
||||
TimelineState,
|
||||
},
|
||||
@@ -413,7 +413,11 @@ pub struct Timeline {
|
||||
/// Keep aux directory cache to avoid it's reconstruction on each update
|
||||
pub(crate) aux_files: tokio::sync::Mutex<AuxFilesState>,
|
||||
|
||||
/// Size estimator for aux file v2
|
||||
pub(crate) aux_file_size_estimator: AuxFileSizeEstimator,
|
||||
|
||||
/// Indicate whether aux file v2 storage is enabled.
|
||||
pub(crate) last_aux_file_policy: AtomicAuxFilePolicy,
|
||||
}
|
||||
|
||||
pub struct WalReceiverInfo {
|
||||
@@ -2133,6 +2137,7 @@ impl Timeline {
|
||||
resources: TimelineResources,
|
||||
pg_version: u32,
|
||||
state: TimelineState,
|
||||
aux_file_policy: Option<AuxFilePolicy>,
|
||||
cancel: CancellationToken,
|
||||
) -> Arc<Self> {
|
||||
let disk_consistent_lsn = metadata.disk_consistent_lsn();
|
||||
@@ -2257,6 +2262,8 @@ impl Timeline {
|
||||
}),
|
||||
|
||||
aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics),
|
||||
|
||||
last_aux_file_policy: AtomicAuxFilePolicy::new(aux_file_policy),
|
||||
};
|
||||
result.repartition_threshold =
|
||||
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
|
||||
|
||||
@@ -280,6 +280,8 @@ impl DeleteTimelineFlow {
|
||||
// Important. We dont pass ancestor above because it can be missing.
|
||||
// Thus we need to skip the validation here.
|
||||
CreateTimelineCause::Delete,
|
||||
// Aux file policy is not needed for deletion, assuming deletion does not read aux keyspace
|
||||
None,
|
||||
)
|
||||
.context("create_timeline_struct")?;
|
||||
|
||||
|
||||
@@ -56,7 +56,7 @@ impl Default for Options {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
rewrite_concurrency: std::num::NonZeroUsize::new(2).unwrap(),
|
||||
copy_concurrency: std::num::NonZeroUsize::new(10).unwrap(),
|
||||
copy_concurrency: std::num::NonZeroUsize::new(100).unwrap(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,6 +8,7 @@ use std::collections::{HashMap, VecDeque};
|
||||
use std::fmt::Debug;
|
||||
|
||||
use chrono::NaiveDateTime;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use std::sync::Arc;
|
||||
use tracing::info;
|
||||
use utils::lsn::AtomicLsn;
|
||||
@@ -60,6 +61,9 @@ pub(crate) struct UploadQueueInitialized {
|
||||
/// Part of the flattened "next" `index_part.json`.
|
||||
pub(crate) latest_lineage: Lineage,
|
||||
|
||||
/// The last aux file policy used on this timeline.
|
||||
pub(crate) last_aux_file_policy: Option<AuxFilePolicy>,
|
||||
|
||||
/// `disk_consistent_lsn` from the last metadata file that was successfully
|
||||
/// uploaded. `Lsn(0)` if nothing was uploaded yet.
|
||||
/// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
|
||||
@@ -189,6 +193,7 @@ impl UploadQueue {
|
||||
dangling_files: HashMap::new(),
|
||||
shutting_down: false,
|
||||
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
|
||||
last_aux_file_policy: Default::default(),
|
||||
};
|
||||
|
||||
*self = UploadQueue::Initialized(state);
|
||||
@@ -239,6 +244,7 @@ impl UploadQueue {
|
||||
dangling_files: HashMap::new(),
|
||||
shutting_down: false,
|
||||
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
|
||||
last_aux_file_policy: index_part.last_aux_file_policy(),
|
||||
};
|
||||
|
||||
*self = UploadQueue::Initialized(state);
|
||||
|
||||
@@ -26,6 +26,7 @@ clap.workspace = true
|
||||
consumption_metrics.workspace = true
|
||||
dashmap.workspace = true
|
||||
env_logger.workspace = true
|
||||
framed-websockets.workspace = true
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
hashbrown.workspace = true
|
||||
@@ -35,7 +36,6 @@ hmac.workspace = true
|
||||
hostname.workspace = true
|
||||
http.workspace = true
|
||||
humantime.workspace = true
|
||||
hyper-tungstenite.workspace = true
|
||||
hyper.workspace = true
|
||||
hyper1 = { package = "hyper", version = "1.2", features = ["server"] }
|
||||
hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] }
|
||||
@@ -76,7 +76,6 @@ smol_str.workspace = true
|
||||
smallvec.workspace = true
|
||||
socket2.workspace = true
|
||||
subtle.workspace = true
|
||||
sync_wrapper.workspace = true
|
||||
task-local-extensions.workspace = true
|
||||
thiserror.workspace = true
|
||||
tikv-jemallocator.workspace = true
|
||||
@@ -106,6 +105,7 @@ workspace_hack.workspace = true
|
||||
[dev-dependencies]
|
||||
camino-tempfile.workspace = true
|
||||
fallible-iterator.workspace = true
|
||||
tokio-tungstenite.workspace = true
|
||||
rcgen.workspace = true
|
||||
rstest.workspace = true
|
||||
tokio-postgres-rustls.workspace = true
|
||||
|
||||
@@ -102,7 +102,7 @@ pub async fn task_main(
|
||||
let connections = tokio_util::task::task_tracker::TaskTracker::new();
|
||||
connections.close(); // allows `connections.wait to complete`
|
||||
|
||||
let server = Builder::new(hyper_util::rt::TokioExecutor::new());
|
||||
let server = Builder::new(TokioExecutor::new());
|
||||
|
||||
while let Some(res) = run_until_cancelled(ws_listener.accept(), &cancellation_token).await {
|
||||
let (conn, peer_addr) = res.context("could not accept TCP stream")?;
|
||||
@@ -255,7 +255,6 @@ async fn connection_handler(
|
||||
.in_current_span()
|
||||
.map_ok_or_else(api_error_into_response, |r| r),
|
||||
);
|
||||
|
||||
async move {
|
||||
let res = handler.await;
|
||||
cancel_request.disarm();
|
||||
@@ -301,7 +300,7 @@ async fn request_handler(
|
||||
.map(|s| s.to_string());
|
||||
|
||||
// Check if the request is a websocket upgrade request.
|
||||
if hyper_tungstenite::is_upgrade_request(&request) {
|
||||
if framed_websockets::upgrade::is_upgrade_request(&request) {
|
||||
let ctx = RequestMonitoring::new(
|
||||
session_id,
|
||||
peer_addr,
|
||||
@@ -312,7 +311,7 @@ async fn request_handler(
|
||||
let span = ctx.span.clone();
|
||||
info!(parent: &span, "performing websocket upgrade");
|
||||
|
||||
let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
|
||||
let (response, websocket) = framed_websockets::upgrade::upgrade(&mut request)
|
||||
.map_err(|e| ApiError::BadRequest(e.into()))?;
|
||||
|
||||
ws_connections.spawn(
|
||||
@@ -334,7 +333,7 @@ async fn request_handler(
|
||||
);
|
||||
|
||||
// Return the response so the spawned future can continue.
|
||||
Ok(response)
|
||||
Ok(response.map(|_: http_body_util::Empty<Bytes>| Full::new(Bytes::new())))
|
||||
} else if request.uri().path() == "/sql" && *request.method() == Method::POST {
|
||||
let ctx = RequestMonitoring::new(
|
||||
session_id,
|
||||
|
||||
@@ -7,10 +7,11 @@ use crate::{
|
||||
proxy::{handle_client, ClientMode},
|
||||
rate_limiter::EndpointRateLimiter,
|
||||
};
|
||||
use bytes::{Buf, Bytes};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use framed_websockets::{Frame, OpCode, WebSocketServer};
|
||||
use futures::{Sink, Stream};
|
||||
use hyper::upgrade::Upgraded;
|
||||
use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream};
|
||||
use hyper1::upgrade::OnUpgrade;
|
||||
use hyper_util::rt::TokioIo;
|
||||
use pin_project_lite::pin_project;
|
||||
|
||||
use std::{
|
||||
@@ -21,25 +22,23 @@ use std::{
|
||||
use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf};
|
||||
use tracing::warn;
|
||||
|
||||
// TODO: use `std::sync::Exclusive` once it's stabilized.
|
||||
// Tracking issue: https://github.com/rust-lang/rust/issues/98407.
|
||||
use sync_wrapper::SyncWrapper;
|
||||
|
||||
pin_project! {
|
||||
/// This is a wrapper around a [`WebSocketStream`] that
|
||||
/// implements [`AsyncRead`] and [`AsyncWrite`].
|
||||
pub struct WebSocketRw<S = Upgraded> {
|
||||
pub struct WebSocketRw<S> {
|
||||
#[pin]
|
||||
stream: SyncWrapper<WebSocketStream<S>>,
|
||||
bytes: Bytes,
|
||||
stream: WebSocketServer<S>,
|
||||
recv: Bytes,
|
||||
send: BytesMut,
|
||||
}
|
||||
}
|
||||
|
||||
impl<S> WebSocketRw<S> {
|
||||
pub fn new(stream: WebSocketStream<S>) -> Self {
|
||||
pub fn new(stream: WebSocketServer<S>) -> Self {
|
||||
Self {
|
||||
stream: stream.into(),
|
||||
bytes: Bytes::new(),
|
||||
stream,
|
||||
recv: Bytes::new(),
|
||||
send: BytesMut::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -50,22 +49,24 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncWrite for WebSocketRw<S> {
|
||||
cx: &mut Context<'_>,
|
||||
buf: &[u8],
|
||||
) -> Poll<io::Result<usize>> {
|
||||
let mut stream = self.project().stream.get_pin_mut();
|
||||
let this = self.project();
|
||||
let mut stream = this.stream;
|
||||
this.send.put(buf);
|
||||
|
||||
ready!(stream.as_mut().poll_ready(cx).map_err(io_error))?;
|
||||
match stream.as_mut().start_send(Message::Binary(buf.into())) {
|
||||
match stream.as_mut().start_send(Frame::binary(this.send.split())) {
|
||||
Ok(()) => Poll::Ready(Ok(buf.len())),
|
||||
Err(e) => Poll::Ready(Err(io_error(e))),
|
||||
}
|
||||
}
|
||||
|
||||
fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
|
||||
let stream = self.project().stream.get_pin_mut();
|
||||
let stream = self.project().stream;
|
||||
stream.poll_flush(cx).map_err(io_error)
|
||||
}
|
||||
|
||||
fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
|
||||
let stream = self.project().stream.get_pin_mut();
|
||||
let stream = self.project().stream;
|
||||
stream.poll_close(cx).map_err(io_error)
|
||||
}
|
||||
}
|
||||
@@ -76,13 +77,10 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncRead for WebSocketRw<S> {
|
||||
cx: &mut Context<'_>,
|
||||
buf: &mut ReadBuf<'_>,
|
||||
) -> Poll<io::Result<()>> {
|
||||
if buf.remaining() > 0 {
|
||||
let bytes = ready!(self.as_mut().poll_fill_buf(cx))?;
|
||||
let len = std::cmp::min(bytes.len(), buf.remaining());
|
||||
buf.put_slice(&bytes[..len]);
|
||||
self.consume(len);
|
||||
}
|
||||
|
||||
let bytes = ready!(self.as_mut().poll_fill_buf(cx))?;
|
||||
let len = std::cmp::min(bytes.len(), buf.remaining());
|
||||
buf.put_slice(&bytes[..len]);
|
||||
self.consume(len);
|
||||
Poll::Ready(Ok(()))
|
||||
}
|
||||
}
|
||||
@@ -94,31 +92,27 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncBufRead for WebSocketRw<S> {
|
||||
|
||||
let mut this = self.project();
|
||||
loop {
|
||||
if !this.bytes.chunk().is_empty() {
|
||||
let chunk = (*this.bytes).chunk();
|
||||
if !this.recv.chunk().is_empty() {
|
||||
let chunk = (*this.recv).chunk();
|
||||
return Poll::Ready(Ok(chunk));
|
||||
}
|
||||
|
||||
let res = ready!(this.stream.as_mut().get_pin_mut().poll_next(cx));
|
||||
let res = ready!(this.stream.as_mut().poll_next(cx));
|
||||
match res.transpose().map_err(io_error)? {
|
||||
Some(message) => match message {
|
||||
Message::Ping(_) => {}
|
||||
Message::Pong(_) => {}
|
||||
Message::Text(text) => {
|
||||
Some(message) => match message.opcode {
|
||||
OpCode::Ping => {}
|
||||
OpCode::Pong => {}
|
||||
OpCode::Text => {
|
||||
// We expect to see only binary messages.
|
||||
let error = "unexpected text message in the websocket";
|
||||
warn!(length = text.len(), error);
|
||||
warn!(length = message.payload.len(), error);
|
||||
return Poll::Ready(Err(io_error(error)));
|
||||
}
|
||||
Message::Frame(_) => {
|
||||
// This case is impossible according to Frame's doc.
|
||||
panic!("unexpected raw frame in the websocket");
|
||||
OpCode::Binary | OpCode::Continuation => {
|
||||
debug_assert!(this.recv.is_empty());
|
||||
*this.recv = message.payload.freeze();
|
||||
}
|
||||
Message::Binary(chunk) => {
|
||||
assert!(this.bytes.is_empty());
|
||||
*this.bytes = Bytes::from(chunk);
|
||||
}
|
||||
Message::Close(_) => return EOF,
|
||||
OpCode::Close => return EOF,
|
||||
},
|
||||
None => return EOF,
|
||||
}
|
||||
@@ -126,19 +120,21 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncBufRead for WebSocketRw<S> {
|
||||
}
|
||||
|
||||
fn consume(self: Pin<&mut Self>, amount: usize) {
|
||||
self.project().bytes.advance(amount);
|
||||
self.project().recv.advance(amount);
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn serve_websocket(
|
||||
config: &'static ProxyConfig,
|
||||
mut ctx: RequestMonitoring,
|
||||
websocket: HyperWebsocket,
|
||||
websocket: OnUpgrade,
|
||||
cancellation_handler: Arc<CancellationHandlerMain>,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
hostname: Option<String>,
|
||||
) -> anyhow::Result<()> {
|
||||
let websocket = websocket.await?;
|
||||
let websocket = WebSocketServer::after_handshake(TokioIo::new(websocket));
|
||||
|
||||
let conn_gauge = Metrics::get()
|
||||
.proxy
|
||||
.client_connections
|
||||
@@ -177,15 +173,16 @@ pub async fn serve_websocket(
|
||||
mod tests {
|
||||
use std::pin::pin;
|
||||
|
||||
use framed_websockets::WebSocketServer;
|
||||
use futures::{SinkExt, StreamExt};
|
||||
use hyper_tungstenite::{
|
||||
tungstenite::{protocol::Role, Message},
|
||||
WebSocketStream,
|
||||
};
|
||||
use tokio::{
|
||||
io::{duplex, AsyncReadExt, AsyncWriteExt},
|
||||
task::JoinSet,
|
||||
};
|
||||
use tokio_tungstenite::{
|
||||
tungstenite::{protocol::Role, Message},
|
||||
WebSocketStream,
|
||||
};
|
||||
|
||||
use super::WebSocketRw;
|
||||
|
||||
@@ -210,9 +207,7 @@ mod tests {
|
||||
});
|
||||
|
||||
js.spawn(async move {
|
||||
let mut rw = pin!(WebSocketRw::new(
|
||||
WebSocketStream::from_raw_socket(stream2, Role::Server, None).await
|
||||
));
|
||||
let mut rw = pin!(WebSocketRw::new(WebSocketServer::after_handshake(stream2)));
|
||||
|
||||
let mut buf = vec![0; 1024];
|
||||
let n = rw.read(&mut buf).await.unwrap();
|
||||
|
||||
@@ -9,11 +9,13 @@ and `safekeeper`, and does housekeeping such as cleaning up objects for tenants

#### S3

Do `aws sso login --profile dev` to get the SSO access to the bucket to clean, get the SSO_ACCOUNT_ID for your profile (`cat ~/.aws/config` may help).
Do `aws sso login --profile dev` to get the SSO access to the bucket to clean.
Also, set the following environment variables:

- `SSO_ACCOUNT_ID`: Credentials id to use for accessing S3 buckets
- `AWS_PROFILE`: Profile name to use for accessing S3 buckets (e.g. `dev`)
- `REGION`: The region where the bucket is located
- `BUCKET`: Bucket name
- `BUCKET_PREFIX` (optional): Prefix inside the bucket

#### Console API

@@ -43,7 +45,7 @@ processing by the `purge-garbage` subcommand.

Example:

`env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- find-garbage --node-kind=pageserver --depth=tenant --output-path=eu-west-1-garbage.json`
`env AWS_PROFILE=dev REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- find-garbage --node-kind=pageserver --depth=tenant --output-path=eu-west-1-garbage.json`

#### `purge-garbage`

@@ -59,7 +61,7 @@ to pass them on the command line

Example:

`env SSO_ACCOUNT_ID=123456 cargo run --release -- purge-garbage --node-kind=pageserver --depth=tenant --input-path=eu-west-1-garbage.json`
`env AWS_PROFILE=dev cargo run --release -- purge-garbage --node-kind=pageserver --depth=tenant --input-path=eu-west-1-garbage.json`

Add the `--delete` argument before `purge-garbage` to enable deletion. This is intentionally
not provided inline in the example above to avoid accidents. Without the `--delete` flag

@@ -72,7 +74,7 @@ Errors are logged to stderr and summary to stdout.

For pageserver:
```
env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata --node-kind pageserver
env AWS_PROFILE=dev REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata --node-kind pageserver

Timelines: 31106
With errors: 3

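The scrubber reads its bucket settings from the environment variables listed above; the hunks that follow also drop `sso_account_id` from `BucketConfig` and read `SSO_ACCOUNT_ID` directly inside `init_s3_client`. A minimal, standard-library-only sketch of that style of environment-driven configuration (simplified error handling, not the scrubber's actual code) is:

```
use std::env;

// Illustrative only: roughly how BucketConfig::from_env reads its settings.
// Variable names follow the README above.
#[derive(Debug)]
struct BucketConfig {
    region: String,
    bucket: String,
    prefix_in_bucket: Option<String>,
}

fn from_env() -> Result<BucketConfig, String> {
    Ok(BucketConfig {
        region: env::var("REGION").map_err(|_| "REGION is not set".to_string())?,
        bucket: env::var("BUCKET").map_err(|_| "BUCKET is not set".to_string())?,
        prefix_in_bucket: env::var("BUCKET_PREFIX").ok(),
    })
}

fn main() {
    match from_env() {
        Ok(cfg) => println!("scanning {}/{} (prefix {:?})", cfg.region, cfg.bucket, cfg.prefix_in_bucket),
        Err(e) => eprintln!("config error: {e}"),
    }
}
```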
@@ -200,30 +200,15 @@ impl RootTarget {
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct BucketConfig {
|
||||
pub region: String,
|
||||
pub bucket: String,
|
||||
pub prefix_in_bucket: Option<String>,
|
||||
|
||||
/// Use SSO if this is set, else rely on AWS_* environment vars
|
||||
pub sso_account_id: Option<String>,
|
||||
}
|
||||
|
||||
impl Display for BucketConfig {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"{}/{}/{}",
|
||||
self.sso_account_id.as_deref().unwrap_or("<none>"),
|
||||
self.region,
|
||||
self.bucket
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl BucketConfig {
|
||||
pub fn from_env() -> anyhow::Result<Self> {
|
||||
let sso_account_id = env::var("SSO_ACCOUNT_ID").ok();
|
||||
let region = env::var("REGION").context("'REGION' param retrieval")?;
|
||||
let bucket = env::var("BUCKET").context("'BUCKET' param retrieval")?;
|
||||
let prefix_in_bucket = env::var("BUCKET_PREFIX").ok();
|
||||
@@ -232,7 +217,6 @@ impl BucketConfig {
|
||||
region,
|
||||
bucket,
|
||||
prefix_in_bucket,
|
||||
sso_account_id,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -276,7 +260,7 @@ pub fn init_logging(file_name: &str) -> WorkerGuard {
|
||||
guard
|
||||
}
|
||||
|
||||
pub fn init_s3_client(account_id: Option<String>, bucket_region: Region) -> Client {
|
||||
pub fn init_s3_client(bucket_region: Region) -> Client {
|
||||
let credentials_provider = {
|
||||
// uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
|
||||
let chain = CredentialsProviderChain::first_try(
|
||||
@@ -290,7 +274,7 @@ pub fn init_s3_client(account_id: Option<String>, bucket_region: Region) -> Clie
|
||||
);
|
||||
|
||||
// Use SSO if we were given an account ID
|
||||
match account_id {
|
||||
match std::env::var("SSO_ACCOUNT_ID").ok() {
|
||||
Some(sso_account) => chain.or_else(
|
||||
"sso",
|
||||
SsoCredentialsProvider::builder()
|
||||
@@ -334,7 +318,7 @@ fn init_remote(
|
||||
) -> anyhow::Result<(Arc<Client>, RootTarget)> {
|
||||
let bucket_region = Region::new(bucket_config.region);
|
||||
let delimiter = "/".to_string();
|
||||
let s3_client = Arc::new(init_s3_client(bucket_config.sso_account_id, bucket_region));
|
||||
let s3_client = Arc::new(init_s3_client(bucket_region));
|
||||
|
||||
let s3_root = match node_kind {
|
||||
NodeKind::Pageserver => RootTarget::Pageserver(S3Target {
|
||||
|
||||
@@ -173,7 +173,7 @@ impl Persistence {
|
||||
/// Wraps `with_conn` in order to collect latency and error metrics
|
||||
async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
|
||||
where
|
||||
F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
|
||||
F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
|
||||
R: Send + 'static,
|
||||
{
|
||||
let latency = &METRICS_REGISTRY
|
||||
@@ -199,13 +199,48 @@ impl Persistence {
    /// Call the provided function in a tokio blocking thread, with a Diesel database connection.
    async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
    where
        F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
        F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
        R: Send + 'static,
    {
        // A generous allowance for how many times we may retry serializable transactions
        // before giving up. This is not expected to be hit: it is a defensive measure in case we
        // somehow engineer a situation where duelling transactions might otherwise live-lock.
        const MAX_RETRIES: usize = 128;

        let mut conn = self.connection_pool.get()?;
        tokio::task::spawn_blocking(move || -> DatabaseResult<R> { func(&mut conn) })
            .await
            .expect("Task panic")
        tokio::task::spawn_blocking(move || -> DatabaseResult<R> {
            let mut retry_count = 0;
            loop {
                match conn.build_transaction().serializable().run(|c| func(c)) {
                    Ok(r) => break Ok(r),
                    Err(
                        err @ DatabaseError::Query(diesel::result::Error::DatabaseError(
                            diesel::result::DatabaseErrorKind::SerializationFailure,
                            _,
                        )),
                    ) => {
                        retry_count += 1;
                        if retry_count > MAX_RETRIES {
                            tracing::error!(
                                "Exceeded max retries on SerializationFailure errors: {err:?}"
                            );
                            break Err(err);
                        } else {
                            // Retry on serialization errors: these are expected, because even though our
                            // transactions don't fight for the same rows, they will occasionally collide
                            // on index pages (e.g. increment_generation for unrelated shards can collide)
                            tracing::debug!(
                                "Retrying transaction on serialization failure {err:?}"
                            );
                            continue;
                        }
                    }
                    Err(e) => break Err(e),
                }
            }
        })
        .await
        .expect("Task panic")
    }

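The shape of the loop above (retry a closure until it succeeds, but only for one specific, expected error kind, and with a hard upper bound) is a general pattern. A self-contained sketch without Diesel, using a hypothetical error enum in place of `DatabaseError`, is:

```
#[allow(dead_code)]
#[derive(Debug)]
enum DbError {
    // The only error treated as transient in this sketch.
    SerializationFailure,
    Other(String),
}

// Run `op` until it succeeds, retrying only on SerializationFailure and at most
// `max_retries` times, mirroring the bounded retry loop in with_conn above.
fn retry_serializable<T>(
    max_retries: usize,
    mut op: impl FnMut() -> Result<T, DbError>,
) -> Result<T, DbError> {
    let mut retry_count = 0;
    loop {
        match op() {
            Ok(v) => break Ok(v),
            Err(DbError::SerializationFailure) if retry_count < max_retries => {
                retry_count += 1;
                continue;
            }
            // Non-retryable errors, or a retryable one past the limit, are returned as-is.
            Err(e) => break Err(e),
        }
    }
}

fn main() {
    let mut attempts = 0;
    let result = retry_serializable(128, || {
        attempts += 1;
        if attempts < 3 {
            Err(DbError::SerializationFailure)
        } else {
            Ok(attempts)
        }
    });
    assert_eq!(result.unwrap(), 3);
}
```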
/// When a node is first registered, persist it before using it for anything
|
||||
@@ -358,14 +393,11 @@ impl Persistence {
|
||||
self.with_measured_conn(
|
||||
DatabaseOperation::InsertTenantShards,
|
||||
move |conn| -> DatabaseResult<()> {
|
||||
conn.transaction(|conn| -> QueryResult<()> {
|
||||
for tenant in &shards {
|
||||
diesel::insert_into(tenant_shards)
|
||||
.values(tenant)
|
||||
.execute(conn)?;
|
||||
}
|
||||
Ok(())
|
||||
})?;
|
||||
for tenant in &shards {
|
||||
diesel::insert_into(tenant_shards)
|
||||
.values(tenant)
|
||||
.execute(conn)?;
|
||||
}
|
||||
Ok(())
|
||||
},
|
||||
)
|
||||
@@ -533,8 +565,11 @@ impl Persistence {
|
||||
let update = ShardUpdate {
|
||||
generation: input_generation.map(|g| g.into().unwrap() as i32),
|
||||
placement_policy: input_placement_policy
|
||||
.as_ref()
|
||||
.map(|p| serde_json::to_string(&p).unwrap()),
|
||||
config: input_config.map(|c| serde_json::to_string(&c).unwrap()),
|
||||
config: input_config
|
||||
.as_ref()
|
||||
.map(|c| serde_json::to_string(&c).unwrap()),
|
||||
scheduling_policy: input_scheduling_policy
|
||||
.map(|p| serde_json::to_string(&p).unwrap()),
|
||||
};
|
||||
@@ -581,55 +616,51 @@ impl Persistence {
|
||||
) -> DatabaseResult<()> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> {
|
||||
conn.transaction(|conn| -> DatabaseResult<()> {
|
||||
// Mark parent shards as splitting
|
||||
// Mark parent shards as splitting
|
||||
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.filter(shard_count.eq(old_shard_count.literal() as i32))
|
||||
.set((splitting.eq(1),))
|
||||
.execute(conn)?;
|
||||
if u8::try_from(updated)
|
||||
.map_err(|_| DatabaseError::Logical(
|
||||
format!("Overflow existing shard count {} while splitting", updated))
|
||||
)? != old_shard_count.count() {
|
||||
// Perhaps a deletion or another split raced with this attempt to split, mutating
|
||||
// the parent shards that we intend to split. In this case the split request should fail.
|
||||
return Err(DatabaseError::Logical(
|
||||
format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {})", old_shard_count.count())
|
||||
));
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.filter(shard_count.eq(old_shard_count.literal() as i32))
|
||||
.set((splitting.eq(1),))
|
||||
.execute(conn)?;
|
||||
if u8::try_from(updated)
|
||||
.map_err(|_| DatabaseError::Logical(
|
||||
format!("Overflow existing shard count {} while splitting", updated))
|
||||
)? != old_shard_count.count() {
|
||||
// Perhaps a deletion or another split raced with this attempt to split, mutating
|
||||
// the parent shards that we intend to split. In this case the split request should fail.
|
||||
return Err(DatabaseError::Logical(
|
||||
format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {})", old_shard_count.count())
|
||||
));
|
||||
}
|
||||
|
||||
// FIXME: spurious clone to sidestep closure move rules
|
||||
let parent_to_children = parent_to_children.clone();
|
||||
|
||||
// Insert child shards
|
||||
for (parent_shard_id, children) in parent_to_children {
|
||||
let mut parent = crate::schema::tenant_shards::table
|
||||
.filter(tenant_id.eq(parent_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(parent_shard_id.shard_number.0 as i32))
|
||||
.filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32))
|
||||
.load::<TenantShardPersistence>(conn)?;
|
||||
let parent = if parent.len() != 1 {
|
||||
return Err(DatabaseError::Logical(format!(
|
||||
"Parent shard {parent_shard_id} not found"
|
||||
)));
|
||||
} else {
|
||||
parent.pop().unwrap()
|
||||
};
|
||||
for mut shard in children {
|
||||
// Carry the parent's generation into the child
|
||||
shard.generation = parent.generation;
|
||||
|
||||
debug_assert!(shard.splitting == SplitState::Splitting);
|
||||
diesel::insert_into(tenant_shards)
|
||||
.values(shard)
|
||||
.execute(conn)?;
|
||||
}
|
||||
|
||||
// FIXME: spurious clone to sidestep closure move rules
|
||||
let parent_to_children = parent_to_children.clone();
|
||||
|
||||
// Insert child shards
|
||||
for (parent_shard_id, children) in parent_to_children {
|
||||
let mut parent = crate::schema::tenant_shards::table
|
||||
.filter(tenant_id.eq(parent_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(parent_shard_id.shard_number.0 as i32))
|
||||
.filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32))
|
||||
.load::<TenantShardPersistence>(conn)?;
|
||||
let parent = if parent.len() != 1 {
|
||||
return Err(DatabaseError::Logical(format!(
|
||||
"Parent shard {parent_shard_id} not found"
|
||||
)));
|
||||
} else {
|
||||
parent.pop().unwrap()
|
||||
};
|
||||
for mut shard in children {
|
||||
// Carry the parent's generation into the child
|
||||
shard.generation = parent.generation;
|
||||
|
||||
debug_assert!(shard.splitting == SplitState::Splitting);
|
||||
diesel::insert_into(tenant_shards)
|
||||
.values(shard)
|
||||
.execute(conn)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
})?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
})
|
||||
@@ -647,22 +678,18 @@ impl Persistence {
|
||||
self.with_measured_conn(
|
||||
DatabaseOperation::CompleteShardSplit,
|
||||
move |conn| -> DatabaseResult<()> {
|
||||
conn.transaction(|conn| -> QueryResult<()> {
|
||||
// Drop parent shards
|
||||
diesel::delete(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.filter(shard_count.eq(old_shard_count.literal() as i32))
|
||||
.execute(conn)?;
|
||||
// Drop parent shards
|
||||
diesel::delete(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.filter(shard_count.eq(old_shard_count.literal() as i32))
|
||||
.execute(conn)?;
|
||||
|
||||
// Clear sharding flag
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.set((splitting.eq(0),))
|
||||
.execute(conn)?;
|
||||
debug_assert!(updated > 0);
|
||||
|
||||
Ok(())
|
||||
})?;
|
||||
// Clear sharding flag
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.set((splitting.eq(0),))
|
||||
.execute(conn)?;
|
||||
debug_assert!(updated > 0);
|
||||
|
||||
Ok(())
|
||||
},
|
||||
@@ -681,39 +708,34 @@ impl Persistence {
|
||||
self.with_measured_conn(
|
||||
DatabaseOperation::AbortShardSplit,
|
||||
move |conn| -> DatabaseResult<AbortShardSplitStatus> {
|
||||
let aborted =
|
||||
conn.transaction(|conn| -> DatabaseResult<AbortShardSplitStatus> {
|
||||
// Clear the splitting state on parent shards
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.filter(shard_count.ne(new_shard_count.literal() as i32))
|
||||
.set((splitting.eq(0),))
|
||||
.execute(conn)?;
|
||||
// Clear the splitting state on parent shards
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.filter(shard_count.ne(new_shard_count.literal() as i32))
|
||||
.set((splitting.eq(0),))
|
||||
.execute(conn)?;
|
||||
|
||||
// Parent shards are already gone: we cannot abort.
|
||||
if updated == 0 {
|
||||
return Ok(AbortShardSplitStatus::Complete);
|
||||
}
|
||||
// Parent shards are already gone: we cannot abort.
|
||||
if updated == 0 {
|
||||
return Ok(AbortShardSplitStatus::Complete);
|
||||
}
|
||||
|
||||
// Sanity check: if parent shards were present, their cardinality should
|
||||
// be less than the number of child shards.
|
||||
if updated >= new_shard_count.count() as usize {
|
||||
return Err(DatabaseError::Logical(format!(
|
||||
"Unexpected parent shard count {updated} while aborting split to \
|
||||
// Sanity check: if parent shards were present, their cardinality should
|
||||
// be less than the number of child shards.
|
||||
if updated >= new_shard_count.count() as usize {
|
||||
return Err(DatabaseError::Logical(format!(
|
||||
"Unexpected parent shard count {updated} while aborting split to \
|
||||
count {new_shard_count:?} on tenant {split_tenant_id}"
|
||||
)));
|
||||
}
|
||||
)));
|
||||
}
|
||||
|
||||
// Erase child shards
|
||||
diesel::delete(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.filter(shard_count.eq(new_shard_count.literal() as i32))
|
||||
.execute(conn)?;
|
||||
// Erase child shards
|
||||
diesel::delete(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.filter(shard_count.eq(new_shard_count.literal() as i32))
|
||||
.execute(conn)?;
|
||||
|
||||
Ok(AbortShardSplitStatus::Aborted)
|
||||
})?;
|
||||
|
||||
Ok(aborted)
|
||||
Ok(AbortShardSplitStatus::Aborted)
|
||||
},
|
||||
)
|
||||
.await
|
||||
|
||||
 0  test_runner/fixtures/endpoint/__init__.py  (new file)
23  test_runner/fixtures/endpoint/http.py      (new file)
@@ -0,0 +1,23 @@
import requests
from requests.adapters import HTTPAdapter


class EndpointHttpClient(requests.Session):
    def __init__(
        self,
        port: int,
    ):
        super().__init__()
        self.port = port

        self.mount("http://", HTTPAdapter())

    def dbs_and_roles(self):
        res = self.get(f"http://localhost:{self.port}/dbs_and_roles")
        res.raise_for_status()
        return res.json()

    def database_schema(self, database: str):
        res = self.get(f"http://localhost:{self.port}/database_schema?database={database}")
        res.raise_for_status()
        return res.text
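For orientation, a minimal usage sketch of this new client. The port value and the standalone construction here are illustrative; in the tests further below the client is obtained via Endpoint.http_client().

client = EndpointHttpClient(port=55432)  # illustrative port of a running compute's HTTP endpoint
catalog = client.dbs_and_roles()  # JSON object with "databases" and "roles" lists
ddl = client.database_schema(database="postgres")  # pg_dump-style schema text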
@@ -48,6 +48,7 @@ from urllib3.util.retry import Retry
from fixtures import overlayfs
from fixtures.broker import NeonBroker
from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
from fixtures.endpoint.http import EndpointHttpClient
from fixtures.log_helper import log
from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
from fixtures.pageserver.allowed_errors import (
@@ -454,7 +455,7 @@ class NeonEnvBuilder:
test_overlay_dir: Optional[Path] = None,
pageserver_remote_storage: Optional[RemoteStorage] = None,
# toml that will be decomposed into `--config-override` flags during `pageserver --init`
pageserver_config_override: Optional[str] = None,
pageserver_config_override: Optional[str | Callable[[Dict[str, Any]], None]] = None,
num_safekeepers: int = 1,
num_pageservers: int = 1,
# Use non-standard SK ids to check for various parsing bugs
@@ -1126,10 +1127,14 @@ class NeonEnv:
)

if config.pageserver_config_override is not None:
for o in config.pageserver_config_override.split(";"):
override = toml.loads(o)
for key, value in override.items():
ps_cfg[key] = value
if callable(config.pageserver_config_override):
config.pageserver_config_override(ps_cfg)
else:
assert isinstance(config.pageserver_config_override, str)
for o in config.pageserver_config_override.split(";"):
override = toml.loads(o)
for key, value in override.items():
ps_cfg[key] = value

# Create a corresponding NeonPageserver object
self.pageservers.append(
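A sketch of how a test can use the new callable form of pageserver_config_override, assuming the usual neon_env_builder fixture is in scope (the keys set here are illustrative; the test_tenant_config hunk further down uses the same pattern):

def tweak_pageserver_config(ps_cfg):
    # mutate the pageserver.toml dict in place before `pageserver --init`
    ps_cfg["page_cache_size"] = 444
    ps_cfg.setdefault("tenant_config", {})["pitr_interval"] = "0 sec"

neon_env_builder.pageserver_config_override = tweak_pageserver_config
env = neon_env_builder.init_start()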
@@ -2344,10 +2349,11 @@ class NeonStorageController(MetricsGetter, LogUtils):
log.info(f"reconcile_all waited for {n} shards")
return n

def reconcile_until_idle(self, timeout_secs=30, delay_max=5):
def reconcile_until_idle(self, timeout_secs=30):
start_at = time.time()
n = 1
delay_sec = 0.5
delay_max = 5
while n > 0:
n = self.reconcile_all()
if n == 0:
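With delay_max now fixed inside the method, callers only pass a timeout; a hedged sketch of a call site after this change (the timeout value is illustrative):

env.storage_controller.reconcile_until_idle(timeout_secs=60)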
@@ -3372,6 +3378,13 @@ class Endpoint(PgProtocol):
self.active_safekeepers: List[int] = list(map(lambda sk: sk.id, env.safekeepers))
# path to conf is <repo_dir>/endpoints/<endpoint_id>/pgdata/postgresql.conf

def http_client(
self, auth_token: Optional[str] = None, retries: Optional[Retry] = None
) -> EndpointHttpClient:
return EndpointHttpClient(
port=self.http_port,
)

def create(
self,
branch_name: str,
@@ -1,6 +1,5 @@
import concurrent.futures
import re
import threading
from pathlib import Path

import pytest
@@ -61,19 +60,21 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
]
)

env.storage_controller.allowed_errors.extend(
[
# The neon_local functionality for updating computes is flaky for unknown reasons
".*Local notification hook failed.*",
".*Marking shard.*for notification retry.*",
".*Failed to notify compute.*",
]
)

# Total tenants
tenant_count = 3
tenant_count = 4

# Transaction rate: we set this rather than running at full-speed because we
# might run on a slow node that doesn't cope well with many full-speed pgbenches running concurrently.
transaction_rate = 50

# Choose a pgbench scale that is just high enough to hit the split threshold around the time init
# finishes (we want splits going on during the main read/write bench)
pgbench_scale = 40

# Runtime selected to give storage controller time to do all the shard splits while it runs
pgbench_runtime = 180
transaction_rate = 100

class TenantState:
def __init__(self, timeline_id, endpoint):
@@ -84,9 +85,7 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
tenants = {}
for tenant_id in set(TenantId.generate() for _i in range(0, tenant_count)):
timeline_id = TimelineId.generate()
env.neon_cli.create_tenant(
tenant_id, timeline_id, conf=tenant_conf, placement_policy='{"Attached":1}'
)
env.neon_cli.create_tenant(tenant_id, timeline_id, conf=tenant_conf)
endpoint = env.endpoints.create("main", tenant_id=tenant_id)
tenants[tenant_id] = TenantState(timeline_id, endpoint)
endpoint.start()
@@ -95,7 +94,7 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
pg_bin.run_capture(
[
"pgbench",
f"-s{pgbench_scale}",
"-s50",
"-i",
f"postgres://cloud_admin@localhost:{endpoint.pg_port}/postgres",
]
@@ -140,10 +139,12 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
if low_watermark is None or low_watermark > tps:
low_watermark = tps

if tps < min_tps:
raise RuntimeError(
f"pgbench on tenant {endpoint.tenant_id} run at {out_path} has tps < {min_tps}"
)
# Temporarily disabled: have seen some 0 tps regions on Hetzner runners, but not
# at the same time as a shard split.
# if tps < min_tps:
# raise RuntimeError(
# f"pgbench on tenant {endpoint.tenant_id} run at {out_path} has tps < {min_tps}"
# )

log.info(f"Checked {matched_lines} progress lines, lowest TPS was {min_tps}")

@@ -154,8 +155,9 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
out_path = pg_bin.run_capture(
[
"pgbench",
"-s50",
"-T",
f"{pgbench_runtime}",
"180",
"-R",
f"{transaction_rate}",
"-P",
@@ -170,8 +172,9 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
out_path = pg_bin.run_capture(
[
"pgbench",
"-s50",
"-T",
"60",
"30",
"-R",
f"{transaction_rate}",
"-S",
@@ -183,20 +186,7 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):

check_pgbench_output(out_path)

background_reconcile_stop = threading.Event()

def background_reconcile_task():
# The controller will do all this autonomously, but with a 20 second wait between each
# time it considers doing a split/optimization. To enable a shorter test, actively
# poll the reconcile_all endpoint to make it all happen faster.
#
# Note that this is mainly to drain the post-split migrations faster, rather than to
# prompt the splits themselves.
while not background_reconcile_stop.is_set():
env.storage_controller.reconcile_until_idle(timeout_secs=pgbench_runtime, delay_max=0.5)
background_reconcile_stop.wait(5)

with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count + 1) as pgbench_threads:
with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads:
pgbench_futs = []
for tenant_state in tenants.values():
fut = pgbench_threads.submit(run_pgbench_init, tenant_state.endpoint)
@@ -206,8 +196,6 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
for fut in pgbench_futs:
fut.result()

reconcile_fut = pgbench_threads.submit(background_reconcile_task)

pgbench_futs = []
for tenant_state in tenants.values():
fut = pgbench_threads.submit(run_pgbench_main, tenant_state.endpoint)
@@ -217,10 +205,6 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
for fut in pgbench_futs:
fut.result()

log.info("Waiting for background reconcile thread")
background_reconcile_stop.set()
reconcile_fut.result()

def assert_all_split():
for tenant_id in tenants.keys():
shards = tenant_get_shards(env, tenant_id)
72  test_runner/regress/test_aux_files.py  Normal file
@@ -0,0 +1,72 @@
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
    NeonEnvBuilder,
    logical_replication_sync,
)


def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg):
    env = neon_env_builder.init_start()
    endpoint = env.endpoints.create_start("main")
    client = env.pageserver.http_client()

    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline

    tenant_config = client.tenant_config(tenant_id).effective_config
    tenant_config["switch_aux_file_policy"] = "V2"
    client.set_tenant_config(tenant_id, tenant_config)
    # aux file v2 is enabled on the write path, so for now, it should be unset (or null)
    assert (
        client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)["last_aux_file_policy"]
        is None
    )

    pg_conn = endpoint.connect()
    cur = pg_conn.cursor()

    cur.execute("create table t(pk integer primary key, payload integer)")
    cur.execute(
        "CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120));"
    )
    cur.execute("create publication pub1 for table t, replication_example")

    # now start subscriber, aux files will be created at this point. TODO: find better ways of testing aux files (i.e., neon_test_utils)
    # instead of going through the full logical replication process.
    vanilla_pg.start()
    vanilla_pg.safe_psql("create table t(pk integer primary key, payload integer)")
    vanilla_pg.safe_psql(
        "CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120), testcolumn1 int, testcolumn2 int, testcolumn3 int);"
    )
    connstr = endpoint.connstr().replace("'", "''")
    log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}")
    vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")

    # Wait logical replication channel to be established
    logical_replication_sync(vanilla_pg, endpoint)
    vanilla_pg.stop()
    endpoint.stop()

    with env.pageserver.http_client() as client:
        # aux file v2 flag should be enabled at this point
        assert client.timeline_detail(tenant_id, timeline_id)["last_aux_file_policy"] == "V2"
    with env.pageserver.http_client() as client:
        tenant_config = client.tenant_config(tenant_id).effective_config
        tenant_config["switch_aux_file_policy"] = "V1"
        client.set_tenant_config(tenant_id, tenant_config)
        # the flag should still be enabled
        assert (
            client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[
                "last_aux_file_policy"
            ]
            == "V2"
        )
    env.pageserver.restart()
    with env.pageserver.http_client() as client:
        # aux file v2 flag should be persisted
        assert (
            client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[
                "last_aux_file_policy"
            ]
            == "V2"
        )
@@ -11,8 +11,7 @@ from fixtures.utils import print_gc_result, query_scalar
#
def test_branch_behind(neon_env_builder: NeonEnvBuilder):
# Disable pitr, because here we want to test branch creation after GC
neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
env = neon_env_builder.init_start()
env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})

error_regexes = [
".*invalid branch start lsn.*",
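This hunk and the similar ones below all apply the same migration: per-test tenant settings move from the global pageserver_config_override string into the initial_tenant_conf argument of init_start. A minimal sketch of the new pattern, with illustrative keys taken from tests in this diff:

env = neon_env_builder.init_start(
    initial_tenant_conf={"pitr_interval": "0 sec", "gc_period": "0s"}
)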
34  test_runner/regress/test_compute_catalog.py  Normal file
@@ -0,0 +1,34 @@
import requests
from fixtures.neon_fixtures import NeonEnv


def test_compute_catalog(neon_simple_env: NeonEnv):
    env = neon_simple_env
    env.neon_cli.create_branch("test_config", "empty")

    endpoint = env.endpoints.create_start("test_config", config_lines=["log_min_messages=debug1"])
    client = endpoint.http_client()

    objects = client.dbs_and_roles()

    # Assert that 'cloud_admin' role exists in the 'roles' list
    assert any(
        role["name"] == "cloud_admin" for role in objects["roles"]
    ), "The 'cloud_admin' role is missing"

    # Assert that 'postgres' database exists in the 'databases' list
    assert any(
        db["name"] == "postgres" for db in objects["databases"]
    ), "The 'postgres' database is missing"

    ddl = client.database_schema(database="postgres")

    assert "-- PostgreSQL database dump" in ddl

    try:
        client.database_schema(database="nonexistentdb")
        raise AssertionError("Expected HTTPError was not raised")
    except requests.exceptions.HTTPError as e:
        assert (
            e.response.status_code == 404
        ), f"Expected 404 status code, but got {e.response.status_code}"
@@ -67,8 +67,7 @@ async def update_and_gc(env: NeonEnv, endpoint: Endpoint, timeline: TimelineId):
#
def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):
# Disable pitr, because here we want to test branch creation after GC
neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
env = neon_env_builder.init_start()
env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
timeline = env.neon_cli.create_branch("test_gc_aggressive", "main")
endpoint = env.endpoints.create_start("test_gc_aggressive")
@@ -94,13 +93,11 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):

#
def test_gc_index_upload(neon_env_builder: NeonEnvBuilder):
# Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls
neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
num_index_uploads = 0

neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)

env = neon_env_builder.init_start()
# Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls
env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
tenant_id = env.initial_tenant
timeline_id = env.neon_cli.create_branch("test_gc_index_upload", "main")
endpoint = env.endpoints.create_start("test_gc_index_upload")
@@ -16,8 +16,7 @@ from fixtures.utils import print_gc_result, query_scalar
#
def test_old_request_lsn(neon_env_builder: NeonEnvBuilder):
# Disable pitr, because here we want to test branch creation after GC
neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
env = neon_env_builder.init_start()
env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
env.neon_cli.create_branch("test_old_request_lsn", "main")
endpoint = env.endpoints.create_start("test_old_request_lsn")
@@ -42,6 +42,8 @@ def test_pageserver_init_node_id(neon_simple_env: NeonEnv, neon_binpath: Path):
"listen_http_addr",
"pg_auth_type",
"http_auth_type",
# TODO: only needed for NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM in https://github.com/neondatabase/neon/pull/7748
# "tenant_config",
]
required_config_overrides = [
f"--config-override={toml.dumps({k: ps_config[k]})}" for k in required_config_keys
@@ -25,6 +25,7 @@ from fixtures.neon_fixtures import (
S3Scrubber,
generate_uploads_and_deletions,
)
from fixtures.pageserver.common_types import parse_layer_file_name
from fixtures.pageserver.http import PageserverApiException
from fixtures.pageserver.utils import (
assert_tenant_state,
@@ -632,39 +633,86 @@ def test_upgrade_generationless_local_file_paths(
generation numbers: it should accept these layer files, and avoid doing
a delete/download cycle on them.
"""
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
neon_env_builder.num_pageservers = 2
env = neon_env_builder.init_configs()
env.start()

tenant_id = TenantId.generate()
timeline_id = TimelineId.generate()
env.neon_cli.create_tenant(
tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Attached":1}'
)

workload = Workload(env, tenant_id, timeline_id)
workload.init()
workload.write_rows(1000)

env.pageserver.stop()
attached_pageserver = env.get_tenant_pageserver(tenant_id)
secondary_pageserver = list([ps for ps in env.pageservers if ps.id != attached_pageserver.id])[
0
]

attached_pageserver.http_client().tenant_heatmap_upload(tenant_id)
secondary_pageserver.http_client().tenant_secondary_download(tenant_id)

# Rename the local paths to legacy format, to simulate what
# we would see when upgrading
timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
files_renamed = 0
for filename in os.listdir(timeline_dir):
path = os.path.join(timeline_dir, filename)
log.info(f"Found file {path}")
if path.endswith("-v1-00000001"):
new_path = path[:-12]
os.rename(path, new_path)
log.info(f"Renamed {path} -> {new_path}")
files_renamed += 1
# we would see when upgrading. Do this on both attached and secondary locations, as we will
# test the behavior of both.
for pageserver in env.pageservers:
pageserver.stop()
timeline_dir = pageserver.timeline_dir(tenant_id, timeline_id)
files_renamed = 0
for filename in os.listdir(timeline_dir):
path = os.path.join(timeline_dir, filename)
log.info(f"Found file {path}")
if path.endswith("-v1-00000001"):
new_path = path[:-12]
os.rename(path, new_path)
log.info(f"Renamed {path} -> {new_path}")
files_renamed += 1

assert files_renamed > 0
assert files_renamed > 0

env.pageserver.start()
pageserver.start()

workload.validate()

# Assert that there were no on-demand downloads
assert (
env.pageserver.http_client().get_metric_value(
attached_pageserver.http_client().get_metric_value(
"pageserver_remote_ondemand_downloaded_layers_total"
)
== 0
)

# Do a secondary download and ensure there were no layer downloads
secondary_pageserver.http_client().tenant_secondary_download(tenant_id)
assert (
secondary_pageserver.http_client().get_metric_value(
"pageserver_secondary_download_layer_total"
)
== 0
)

# Check that when we evict and promote one of the legacy-named layers, everything works as
# expected
local_layers = list(
(
parse_layer_file_name(path.name),
os.path.join(attached_pageserver.timeline_dir(tenant_id, timeline_id), path),
)
for path in attached_pageserver.list_layers(tenant_id, timeline_id)
)
(victim_layer_name, victim_path) = local_layers[0]
assert os.path.exists(victim_path)

attached_pageserver.http_client().evict_layer(
tenant_id, timeline_id, victim_layer_name.to_str()
)
assert not os.path.exists(victim_path)

attached_pageserver.http_client().download_layer(
tenant_id, timeline_id, victim_layer_name.to_str()
)
# We should download into the same local path we started with
assert os.path.exists(victim_path)
@@ -10,11 +10,9 @@ from fixtures.utils import print_gc_result, query_scalar
#
def test_pitr_gc(neon_env_builder: NeonEnvBuilder):
# Set pitr interval such that we need to keep the data
neon_env_builder.pageserver_config_override = (
"tenant_config={pitr_interval = '1 day', gc_horizon = 0}"
env = neon_env_builder.init_start(
initial_tenant_conf={"pitr_interval": "1 day", "gc_horizon": "0"}
)

env = neon_env_builder.init_start()
endpoint_main = env.endpoints.create_start("main")

main_pg_conn = endpoint_main.connect()
@@ -135,7 +135,14 @@ async def test_websockets_pipelined(static_proxy: NeonProxy):
query_message = "SELECT 1".encode("utf-8") + b"\0"
length2 = (4 + len(query_message)).to_bytes(4, byteorder="big")
await websocket.send(
[length0, startup_message, b"p", length1, auth_message, b"Q", length2, query_message]
length0
+ startup_message
+ b"p"
+ length1
+ auth_message
+ b"Q"
+ length2
+ query_message
)

startup_response = await websocket.recv()
@@ -10,9 +10,11 @@ from fixtures.neon_fixtures import NeonEnvBuilder
#
def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder):
# Override default checkpointer settings to run it more often
neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}"

env = neon_env_builder.init_start()
env = neon_env_builder.init_start(
initial_tenant_conf={
"checkpoint_distance": "1048576",
}
)
env.pageserver.is_testing_enabled_or_skip()

# We expect the pageserver to exit, which will cause storage storage controller
@@ -1,5 +1,6 @@
import json
from contextlib import closing
from typing import Any, Dict

import psycopg2.extras
from fixtures.common_types import Lsn
@@ -14,16 +15,22 @@ from fixtures.utils import wait_until

def test_tenant_config(neon_env_builder: NeonEnvBuilder):
"""Test per tenant configuration"""
# set some non-default global config
neon_env_builder.pageserver_config_override = """
page_cache_size=444;
wait_lsn_timeout='111 s';
[tenant_config]
checkpoint_distance = 10000
compaction_target_size = 1048576
evictions_low_residence_duration_metric_threshold = "2 days"
eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold = "23 hours" }
"""

def set_some_nondefault_global_config(ps_cfg: Dict[str, Any]):
ps_cfg["page_cache_size"] = 444
ps_cfg["wait_lsn_timeout"] = "111 s"

tenant_config = ps_cfg.setdefault("tenant_config", {})
tenant_config["checkpoint_distance"] = 10000
tenant_config["compaction_target_size"] = 1048576
tenant_config["evictions_low_residence_duration_metric_threshold"] = "2 days"
tenant_config["eviction_policy"] = {
"kind": "LayerAccessThreshold",
"period": "20s",
"threshold": "23 hours",
}

neon_env_builder.pageserver_config_override = set_some_nondefault_global_config

env = neon_env_builder.init_start()
# we configure eviction but no remote storage, there might be error lines
@@ -502,9 +502,14 @@ def test_get_tenant_size_with_multiple_branches(

gc_horizon = 128 * 1024

neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}"

env = neon_env_builder.init_start()
env = neon_env_builder.init_start(
initial_tenant_conf={
"compaction_period": "0s",
"gc_period": "0s",
"pitr_interval": "0sec",
"gc_horizon": gc_horizon,
}
)

# FIXME: we have a race condition between GC and delete timeline. GC might fail with this
# error. Similar to https://github.com/neondatabase/neon/issues/2671
@@ -415,11 +415,12 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder

# Disable background compaction as we don't want it to happen after `get_physical_size` request
# and before checking the expected size on disk, which makes the assertion failed
neon_env_builder.pageserver_config_override = (
"tenant_config={checkpoint_distance=100000, compaction_period='10m'}"
env = neon_env_builder.init_start(
initial_tenant_conf={
"checkpoint_distance": "100000",
"compaction_period": "10m",
}
)

env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()

new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_compaction")
@@ -462,9 +463,14 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):

# Disable background compaction and GC as we don't want it to happen after `get_physical_size` request
# and before checking the expected size on disk, which makes the assertion failed
neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance=100000, compaction_period='0s', gc_period='0s', pitr_interval='1s'}"

env = neon_env_builder.init_start()
env = neon_env_builder.init_start(
initial_tenant_conf={
"checkpoint_distance": "100000",
"compaction_period": "0s",
"gc_period": "0s",
"pitr_interval": "1s",
}
)
pageserver_http = env.pageserver.http_client()

new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_gc")
@@ -1,4 +1,5 @@
import time
from typing import Any, Dict

from fixtures.common_types import Lsn, TenantId
from fixtures.log_helper import log
@@ -42,10 +43,14 @@ def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder):
# Kills one of the safekeepers and ensures that only the active ones are printed in the state.
def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuilder):
# Trigger WAL wait timeout faster
neon_env_builder.pageserver_config_override = """
wait_lsn_timeout = "1s"
tenant_config={walreceiver_connect_timeout = "2s", lagging_wal_timeout = "2s"}
"""
def customize_pageserver_toml(ps_cfg: Dict[str, Any]):
ps_cfg["wait_lsn_timeout"] = "1s"
tenant_config = ps_cfg.setdefault("tenant_config", {})
tenant_config["walreceiver_connect_timeout"] = "2s"
tenant_config["lagging_wal_timeout"] = "2s"

neon_env_builder.pageserver_config_override = customize_pageserver_toml

# Have notable SK ids to ensure we check logs for their presence, not some other random numbers
neon_env_builder.safekeepers_id_start = 12345
neon_env_builder.num_safekeepers = 3
2  vendor/postgres-v14  vendored
Submodule vendor/postgres-v14 updated: d6f7e2c604...a6dc3f010d
2  vendor/postgres-v15  vendored
Submodule vendor/postgres-v15 updated: f0d6b0ef75...6fd679f515
2  vendor/postgres-v16  vendored
Submodule vendor/postgres-v16 updated: 8ef3c33aa0...1f63dd206a
6  vendor/revisions.json  vendored
@@ -1,5 +1,5 @@
{
"v16": ["16.2", "8ef3c33aa01631e17cb24a122776349fcc777b46"],
"v15": ["15.6", "f0d6b0ef7581bd78011832e23d8420a7d2c8a83a"],
"v14": ["14.11", "d6f7e2c604bfc7cbc4c46bcea0a8e800f4bc778a"]
"v16": ["16.2", "1f63dd206a8aaa4727baad334c548219c52878e1"],
"v15": ["15.6", "6fd679f5154d12f4892ddd450cc6be28a8ac31b0"],
"v14": ["14.11", "a6dc3f010da31472a7ae9ab0ddfbf6e49131d93c"]
}