Compare commits

..

1 Commits

Author SHA1 Message Date
github-actions[bot]
3f5a7bd5ae Compute release 2025-02-21 2025-02-21 07:00:48 +00:00
9 changed files with 65 additions and 99 deletions

10
Cargo.lock generated
View File

@@ -3263,7 +3263,8 @@ checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
[[package]]
name = "jemalloc_pprof"
version = "0.6.0"
source = "git+https://github.com/erikgrinaker/rust-jemalloc-pprof?branch=symbolize#0b146a1e2013bbc7fc8dc45f208f868c0b8ed193"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a883828bd6a4b957cd9f618886ff19e5f3ebd34e06ba0e855849e049fef32fb"
dependencies = [
"anyhow",
"libc",
@@ -3452,7 +3453,8 @@ dependencies = [
[[package]]
name = "mappings"
version = "0.6.0"
source = "git+https://github.com/erikgrinaker/rust-jemalloc-pprof?branch=symbolize#0b146a1e2013bbc7fc8dc45f208f868c0b8ed193"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ce9229c438fbf1c333926e2053c4c091feabbd40a1b590ec62710fea2384af9e"
dependencies = [
"anyhow",
"libc",
@@ -4727,10 +4729,10 @@ dependencies = [
[[package]]
name = "pprof_util"
version = "0.6.0"
source = "git+https://github.com/erikgrinaker/rust-jemalloc-pprof?branch=symbolize#0b146a1e2013bbc7fc8dc45f208f868c0b8ed193"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "65c568b3f8c1c37886ae07459b1946249e725c315306b03be5632f84c239f781"
dependencies = [
"anyhow",
"backtrace",
"flate2",
"num",
"paste",

View File

@@ -116,7 +116,7 @@ inferno = "0.12.0"
ipnet = "2.10.0"
itertools = "0.10"
itoa = "1.0.11"
jemalloc_pprof = { git = "https://github.com/erikgrinaker/rust-jemalloc-pprof", branch = "symbolize", version = "0.6", features = ["symbolize"] }
jemalloc_pprof = "0.6"
jsonwebtoken = "9"
lasso = "0.7"
libc = "0.2"

View File

@@ -395,22 +395,15 @@ RUN case "${PG_VERSION:?}" in \
cd plv8-src && \
if [[ "${PG_VERSION:?}" < "v17" ]]; then patch -p1 < /ext-src/plv8-3.1.10.patch; fi
# Step 1: Build the vendored V8 engine. It doesn't depend on PostgreSQL, so use
# 'build-deps' as the base. This enables caching and avoids unnecessary rebuilds.
# (The V8 engine takes a very long time to build)
FROM build-deps AS plv8-build
FROM pg-build AS plv8-build
ARG PG_VERSION
WORKDIR /ext-src/plv8-src
RUN apt update && \
apt install --no-install-recommends --no-install-suggests -y \
ninja-build python3-dev libncurses5 binutils clang \
&& apt clean && rm -rf /var/lib/apt/lists/*
COPY --from=plv8-src /ext-src/ /ext-src/
RUN make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) v8
# Step 2: Build the PostgreSQL-dependent parts
COPY --from=pg-build /usr/local/pgsql /usr/local/pgsql
ENV PATH="/usr/local/pgsql/bin:$PATH"
COPY --from=plv8-src /ext-src/ /ext-src/
WORKDIR /ext-src/plv8-src
RUN \
# generate and copy upgrade scripts
make generate_upgrades && \

View File

@@ -495,10 +495,19 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
}
Format::Pprof => {
let data = tokio::task::spawn_blocking(move || prof_ctl.dump_pprof())
.await
.map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
.map_err(ApiError::InternalServerError)?;
let data = tokio::task::spawn_blocking(move || {
let bytes = prof_ctl.dump_pprof()?;
// Symbolize the profile.
// TODO: consider moving this upstream to jemalloc_pprof and avoiding the
// serialization roundtrip.
let profile = pprof::decode(&bytes)?;
let profile = pprof::symbolize(profile)?;
let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS);
pprof::encode(&profile)
})
.await
.map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
.map_err(ApiError::InternalServerError)?;
Response::builder()
.status(200)
.header(CONTENT_TYPE, "application/octet-stream")
@@ -511,6 +520,8 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
let body = tokio::task::spawn_blocking(move || {
let bytes = prof_ctl.dump_pprof()?;
let profile = pprof::decode(&bytes)?;
let profile = pprof::symbolize(profile)?;
let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS);
let mut opts = inferno::flamegraph::Options::default();
opts.title = "Heap inuse".to_string();
opts.count_name = "bytes".to_string();

View File

@@ -672,10 +672,6 @@ fn start_pageserver(
}
});
// Allocate a bunch of memory.
let alloc = allocate(256 * 1024 * 1024);
println!("allocated {}b", alloc.len());
// Wait for cancellation signal and shut down the pageserver.
//
// This cancels the `shutdown_pageserver` cancellation tree. Right now that tree doesn't
@@ -699,17 +695,6 @@ fn start_pageserver(
})
}
#[inline(never)]
fn allocate(size: usize) -> Vec<u8> {
allocate_inline(size)
}
#[inline(always)]
fn allocate_inline(size: usize) -> Vec<u8> {
println!("allocating {size}b");
vec![9; size]
}
async fn create_remote_storage_client(
conf: &'static PageServerConf,
) -> anyhow::Result<GenericRemoteStorage> {

View File

@@ -598,10 +598,7 @@ async fn handle_tenant_timeline_passthrough(
let _timer = latency.start_timer(labels.clone());
let client = mgmt_api::Client::new(
node.base_url(),
service.get_config().pageserver_jwt_token.as_deref(),
);
let client = mgmt_api::Client::new(node.base_url(), service.get_config().jwt_token.as_deref());
let resp = client.get_raw(path).await.map_err(|e|
// We return 503 here because if we can't successfully send a request to the pageserver,
// either we aren't available or the pageserver is unavailable.
@@ -1357,7 +1354,10 @@ async fn handle_safekeeper_scheduling_policy(
.set_safekeeper_scheduling_policy(id, body.scheduling_policy)
.await?;
json_response(StatusCode::OK, ())
Ok(Response::builder()
.status(StatusCode::NO_CONTENT)
.body(Body::empty())
.unwrap())
}
/// Common wrapper for request handlers that call into Service and will operate on tenants: they must only

View File

@@ -53,10 +53,6 @@ struct Cli {
#[arg(long)]
jwt_token: Option<String>,
/// Token for authenticating this service with the safekeepers it controls
#[arg(long)]
safekeeper_jwt_token: Option<String>,
/// Token for authenticating this service with the control plane, when calling
/// the compute notification endpoint
#[arg(long)]
@@ -157,8 +153,7 @@ impl Default for StrictMode {
struct Secrets {
database_url: String,
public_key: Option<JwtAuth>,
pageserver_jwt_token: Option<String>,
safekeeper_jwt_token: Option<String>,
jwt_token: Option<String>,
control_plane_jwt_token: Option<String>,
peer_jwt_token: Option<String>,
}
@@ -166,7 +161,6 @@ struct Secrets {
impl Secrets {
const DATABASE_URL_ENV: &'static str = "DATABASE_URL";
const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN";
const SAFEKEEPER_JWT_TOKEN_ENV: &'static str = "SAFEKEEPER_JWT_TOKEN";
const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN";
const PEER_JWT_TOKEN_ENV: &'static str = "PEER_JWT_TOKEN";
const PUBLIC_KEY_ENV: &'static str = "PUBLIC_KEY";
@@ -190,14 +184,7 @@ impl Secrets {
let this = Self {
database_url,
public_key,
pageserver_jwt_token: Self::load_secret(
&args.jwt_token,
Self::PAGESERVER_JWT_TOKEN_ENV,
),
safekeeper_jwt_token: Self::load_secret(
&args.safekeeper_jwt_token,
Self::SAFEKEEPER_JWT_TOKEN_ENV,
),
jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV),
control_plane_jwt_token: Self::load_secret(
&args.control_plane_jwt_token,
Self::CONTROL_PLANE_JWT_TOKEN_ENV,
@@ -277,17 +264,11 @@ async fn async_main() -> anyhow::Result<()> {
let secrets = Secrets::load(&args).await?;
// TODO: once we've rolled out the safekeeper JWT token everywhere, put it into the validation code below
tracing::info!(
"safekeeper_jwt_token set: {:?}",
secrets.safekeeper_jwt_token.is_some()
);
// Validate required secrets and arguments are provided in strict mode
match strict_mode {
StrictMode::Strict
if (secrets.public_key.is_none()
|| secrets.pageserver_jwt_token.is_none()
|| secrets.jwt_token.is_none()
|| secrets.control_plane_jwt_token.is_none()) =>
{
// Production systems should always have secrets configured: if public_key was not set
@@ -312,8 +293,7 @@ async fn async_main() -> anyhow::Result<()> {
}
let config = Config {
pageserver_jwt_token: secrets.pageserver_jwt_token,
safekeeper_jwt_token: secrets.safekeeper_jwt_token,
jwt_token: secrets.jwt_token,
control_plane_jwt_token: secrets.control_plane_jwt_token,
peer_jwt_token: secrets.peer_jwt_token,
compute_hook_url: args.compute_hook_url,

View File

@@ -296,7 +296,7 @@ impl Reconciler {
.location_config(tenant_shard_id, config.clone(), flush_ms, lazy)
.await
},
&self.service_config.pageserver_jwt_token,
&self.service_config.jwt_token,
1,
3,
timeout,
@@ -417,7 +417,7 @@ impl Reconciler {
let client = PageserverClient::new(
node.get_id(),
node.base_url(),
self.service_config.pageserver_jwt_token.as_deref(),
self.service_config.jwt_token.as_deref(),
);
client
@@ -440,7 +440,7 @@ impl Reconciler {
let client = PageserverClient::new(
node.get_id(),
node.base_url(),
self.service_config.pageserver_jwt_token.as_deref(),
self.service_config.jwt_token.as_deref(),
);
let timelines = client.timeline_list(&tenant_shard_id).await?;
@@ -478,7 +478,7 @@ impl Reconciler {
)
.await
},
&self.service_config.pageserver_jwt_token,
&self.service_config.jwt_token,
1,
3,
request_download_timeout * 2,
@@ -771,7 +771,7 @@ impl Reconciler {
let observed_conf = match attached_node
.with_client_retries(
|client| async move { client.get_location_config(tenant_shard_id).await },
&self.service_config.pageserver_jwt_token,
&self.service_config.jwt_token,
1,
1,
Duration::from_secs(5),
@@ -1099,7 +1099,7 @@ impl Reconciler {
match origin
.with_client_retries(
|client| async move { client.get_location_config(tenant_shard_id).await },
&self.service_config.pageserver_jwt_token,
&self.service_config.jwt_token,
1,
3,
Duration::from_secs(5),

View File

@@ -348,12 +348,7 @@ pub struct Config {
// All pageservers managed by one instance of this service must have
// the same public key. This JWT token will be used to authenticate
// this service to the pageservers it manages.
pub pageserver_jwt_token: Option<String>,
// All safekeepers managed by one instance of this service must have
// the same public key. This JWT token will be used to authenticate
// this service to the safekeepers it manages.
pub safekeeper_jwt_token: Option<String>,
pub jwt_token: Option<String>,
// This JWT token will be used to authenticate this service to the control plane.
pub control_plane_jwt_token: Option<String>,
@@ -887,7 +882,7 @@ impl Service {
let response = node
.with_client_retries(
|client| async move { client.list_location_config().await },
&self.config.pageserver_jwt_token,
&self.config.jwt_token,
1,
5,
timeout,
@@ -988,7 +983,7 @@ impl Service {
let client = PageserverClient::new(
node.get_id(),
node.base_url(),
self.config.pageserver_jwt_token.as_deref(),
self.config.jwt_token.as_deref(),
);
match client
.location_config(
@@ -1558,14 +1553,14 @@ impl Service {
let reconcilers_cancel = cancel.child_token();
let heartbeater_ps = Heartbeater::new(
config.pageserver_jwt_token.clone(),
config.jwt_token.clone(),
config.max_offline_interval,
config.max_warming_up_interval,
cancel.clone(),
);
let heartbeater_sk = Heartbeater::new(
config.safekeeper_jwt_token.clone(),
config.jwt_token.clone(),
config.max_offline_interval,
config.max_warming_up_interval,
cancel.clone(),
@@ -1912,7 +1907,7 @@ impl Service {
let configs = match node
.with_client_retries(
|client| async move { client.list_location_config().await },
&self.config.pageserver_jwt_token,
&self.config.jwt_token,
1,
5,
SHORT_RECONCILE_TIMEOUT,
@@ -1970,7 +1965,7 @@ impl Service {
.location_config(tenant_shard_id, config, None, false)
.await
},
&self.config.pageserver_jwt_token,
&self.config.jwt_token,
1,
5,
SHORT_RECONCILE_TIMEOUT,
@@ -3105,7 +3100,7 @@ impl Service {
let client = PageserverClient::new(
node.get_id(),
node.base_url(),
self.config.pageserver_jwt_token.as_deref(),
self.config.jwt_token.as_deref(),
);
tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",);
@@ -3166,7 +3161,7 @@ impl Service {
let client = PageserverClient::new(
node.get_id(),
node.base_url(),
self.config.pageserver_jwt_token.as_deref(),
self.config.jwt_token.as_deref(),
);
futs.push(async move {
let result = client
@@ -3289,7 +3284,7 @@ impl Service {
.tenant_delete(TenantShardId::unsharded(tenant_id))
.await
},
&self.config.pageserver_jwt_token,
&self.config.jwt_token,
1,
3,
RECONCILE_TIMEOUT,
@@ -3508,7 +3503,7 @@ impl Service {
let timeline_info = create_one(
shard_zero_tid,
shard_zero_locations,
self.config.pageserver_jwt_token.clone(),
self.config.jwt_token.clone(),
create_req.clone(),
)
.await?;
@@ -3524,7 +3519,7 @@ impl Service {
// Create timeline on remaining shards with number >0
if !targets.0.is_empty() {
// If we had multiple shards, issue requests for the remainder now.
let jwt = &self.config.pageserver_jwt_token;
let jwt = &self.config.jwt_token;
self.tenant_for_shards(
targets
.0
@@ -3607,7 +3602,7 @@ impl Service {
tenant_shard_id,
timeline_id,
node,
self.config.pageserver_jwt_token.clone(),
self.config.jwt_token.clone(),
req.clone(),
))
})
@@ -3688,7 +3683,7 @@ impl Service {
tenant_shard_id,
timeline_id,
node,
self.config.pageserver_jwt_token.clone(),
self.config.jwt_token.clone(),
))
})
.await?;
@@ -3762,7 +3757,7 @@ impl Service {
tenant_shard_id,
timeline_id,
node,
self.config.pageserver_jwt_token.clone(),
self.config.jwt_token.clone(),
dir,
))
})
@@ -3877,7 +3872,7 @@ impl Service {
futs.push(async move {
node.with_client_retries(
|client| op(tenant_shard_id, client),
&self.config.pageserver_jwt_token,
&self.config.jwt_token,
warn_threshold,
max_retries,
timeout,
@@ -4126,7 +4121,7 @@ impl Service {
tenant_shard_id,
timeline_id,
node,
self.config.pageserver_jwt_token.clone(),
self.config.jwt_token.clone(),
))
})
.await?;
@@ -4148,7 +4143,7 @@ impl Service {
shard_zero_tid,
timeline_id,
shard_zero_locations.latest.node,
self.config.pageserver_jwt_token.clone(),
self.config.jwt_token.clone(),
)
.await?;
Ok(shard_zero_status)
@@ -4547,7 +4542,7 @@ impl Service {
client.location_config(child_id, config, None, false).await
},
&self.config.pageserver_jwt_token,
&self.config.jwt_token,
1,
10,
Duration::from_secs(5),
@@ -5147,7 +5142,7 @@ impl Service {
let client = PageserverClient::new(
node.get_id(),
node.base_url(),
self.config.pageserver_jwt_token.as_deref(),
self.config.jwt_token.as_deref(),
);
let response = client
.tenant_shard_split(
@@ -5473,7 +5468,7 @@ impl Service {
let client = PageserverClient::new(
node.get_id(),
node.base_url(),
self.config.pageserver_jwt_token.as_deref(),
self.config.jwt_token.as_deref(),
);
let scan_result = client
@@ -7099,7 +7094,7 @@ impl Service {
match attached_node
.with_client_retries(
|client| async move { client.tenant_heatmap_upload(tenant_shard_id).await },
&self.config.pageserver_jwt_token,
&self.config.jwt_token,
3,
10,
SHORT_RECONCILE_TIMEOUT,
@@ -7135,7 +7130,7 @@ impl Service {
)
.await
},
&self.config.pageserver_jwt_token,
&self.config.jwt_token,
3,
10,
SHORT_RECONCILE_TIMEOUT,
@@ -7190,7 +7185,7 @@ impl Service {
let request = request_ref.clone();
client.top_tenant_shards(request.clone()).await
},
&self.config.pageserver_jwt_token,
&self.config.jwt_token,
3,
3,
Duration::from_secs(5),
@@ -7363,7 +7358,7 @@ impl Service {
match node
.with_client_retries(
|client| async move { client.tenant_secondary_status(tenant_shard_id).await },
&self.config.pageserver_jwt_token,
&self.config.jwt_token,
1,
3,
Duration::from_millis(250),