mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-23 05:12:56 +00:00
Compare commits
4 Commits
release-53
...
jemalloc-p
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7d2709f4a1 | ||
|
|
6e2c04bc48 | ||
|
|
76ae735a24 | ||
|
|
7be445f627 |
@@ -252,7 +252,7 @@ debug = true
|
||||
|
||||
# disable debug symbols for all packages except this one to decrease binaries size
|
||||
[profile.release.package."*"]
|
||||
debug = false
|
||||
debug = true
|
||||
|
||||
[profile.release-line-debug]
|
||||
inherits = "release"
|
||||
|
||||
@@ -44,6 +44,7 @@ COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_i
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
|
||||
COPY --chown=nonroot . .
|
||||
|
||||
ENV _RJEM_MALLOC_CONF="prof:true"
|
||||
# Show build caching stats to check if it was used in the end.
|
||||
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
|
||||
RUN set -e \
|
||||
|
||||
@@ -5,7 +5,6 @@ use crate::{
|
||||
models::ShardParameters,
|
||||
};
|
||||
use hex::FromHex;
|
||||
use postgres_ffi::relfile_utils::INIT_FORKNUM;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::TenantId;
|
||||
|
||||
@@ -538,24 +537,6 @@ impl ShardIdentity {
|
||||
}
|
||||
}
|
||||
|
||||
/// Special case for issue `<https://github.com/neondatabase/neon/issues/7451>`
|
||||
///
|
||||
/// When we fail to read a forknum block, this function tells us whether we may ignore the error
|
||||
/// as a symptom of that issue.
|
||||
pub fn is_key_buggy_forknum(&self, key: &Key) -> bool {
|
||||
if !is_rel_block_key(key) || key.field5 != INIT_FORKNUM {
|
||||
return false;
|
||||
}
|
||||
|
||||
let mut hash = murmurhash32(key.field4);
|
||||
hash = hash_combine(hash, murmurhash32(key.field6 / self.stripe_size.0));
|
||||
let mapped_shard = ShardNumber((hash % self.count.0 as u32) as u8);
|
||||
|
||||
// The key may be affected by issue #7454: it is an initfork and it would not
|
||||
// have mapped to shard 0 until we fixed that issue.
|
||||
mapped_shard != ShardNumber(0)
|
||||
}
|
||||
|
||||
/// Return true if the key should be discarded if found in this shard's
|
||||
/// data store, e.g. during compaction after a split.
|
||||
///
|
||||
@@ -668,13 +649,7 @@ fn key_is_shard0(key: &Key) -> bool {
|
||||
// relation pages are distributed to shards other than shard zero. Everything else gets
|
||||
// stored on shard 0. This guarantees that shard 0 can independently serve basebackup
|
||||
// requests, and any request other than those for particular blocks in relations.
|
||||
//
|
||||
// The only exception to this rule is "initfork" data -- this relates to postgres's UNLOGGED table
|
||||
// type. These are special relations, usually with only 0 or 1 blocks, and we store them on shard 0
|
||||
// because they must be included in basebackups.
|
||||
let is_initfork = key.field5 == INIT_FORKNUM;
|
||||
|
||||
!is_rel_block_key(key) || is_initfork
|
||||
!is_rel_block_key(key)
|
||||
}
|
||||
|
||||
/// Provide the same result as the function in postgres `hashfn.h` with the same name
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
use anyhow::{anyhow, bail, ensure, Context};
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
use fail::fail_point;
|
||||
use pageserver_api::key::{key_to_slru_block, rel_block_to_key, Key};
|
||||
use pageserver_api::key::{key_to_slru_block, Key};
|
||||
use postgres_ffi::pg_constants;
|
||||
use std::fmt::Write as FmtWrite;
|
||||
use std::time::SystemTime;
|
||||
@@ -297,20 +297,7 @@ where
|
||||
if rel.forknum == INIT_FORKNUM {
|
||||
// I doubt we need _init fork itself, but having it at least
|
||||
// serves as a marker relation is unlogged.
|
||||
if let Err(_e) = self.add_rel(rel, rel).await {
|
||||
if self
|
||||
.timeline
|
||||
.get_shard_identity()
|
||||
.is_key_buggy_forknum(&rel_block_to_key(rel, 0x0))
|
||||
{
|
||||
// Workaround https://github.com/neondatabase/neon/issues/7451 -- if we have an unlogged relation
|
||||
// whose INIT_FORKNUM is not correctly on shard zero, then omit it in the basebackup. This allows
|
||||
// postgres to start up. The relation won't work, but it will be possible to DROP TABLE on it and
|
||||
// recreate.
|
||||
tracing::warn!("Omitting relation {rel} for issue #7451: drop and recreate this unlogged relation");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
self.add_rel(rel, rel).await?;
|
||||
self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?;
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -21,6 +21,7 @@ base64.workspace = true
|
||||
bstr.workspace = true
|
||||
bytes = { workspace = true, features = ["serde"] }
|
||||
camino.workspace = true
|
||||
camino-tempfile.workspace = true
|
||||
chrono.workspace = true
|
||||
clap.workspace = true
|
||||
consumption_metrics.workspace = true
|
||||
@@ -78,7 +79,7 @@ subtle.workspace = true
|
||||
sync_wrapper.workspace = true
|
||||
task-local-extensions.workspace = true
|
||||
thiserror.workspace = true
|
||||
tikv-jemallocator.workspace = true
|
||||
tikv-jemallocator = { workspace = true, features = ["profiling"] }
|
||||
tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] }
|
||||
tokio-postgres.workspace = true
|
||||
tokio-rustls.workspace = true
|
||||
@@ -102,7 +103,6 @@ redis.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
camino-tempfile.workspace = true
|
||||
fallible-iterator.workspace = true
|
||||
rcgen.workspace = true
|
||||
rstest.workspace = true
|
||||
|
||||
@@ -1,13 +1,17 @@
|
||||
use anyhow::{anyhow, bail};
|
||||
use camino::Utf8PathBuf;
|
||||
use camino_tempfile::Utf8TempDir;
|
||||
use hyper::{header::CONTENT_TYPE, Body, Request, Response, StatusCode};
|
||||
use measured::{text::BufferedTextEncoder, MetricGroup};
|
||||
use metrics::NeonMetrics;
|
||||
use once_cell::sync::Lazy;
|
||||
use std::{
|
||||
convert::Infallible,
|
||||
ffi::CString,
|
||||
net::TcpListener,
|
||||
sync::{Arc, Mutex},
|
||||
};
|
||||
use tracing::{info, info_span};
|
||||
use tracing::{info, info_span, warn};
|
||||
use utils::http::{
|
||||
endpoint::{self, request_span},
|
||||
error::ApiError,
|
||||
@@ -21,18 +25,49 @@ async fn status_handler(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
json_response(StatusCode::OK, "")
|
||||
}
|
||||
|
||||
async fn prof_dump(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
static PROF_MIB: Lazy<jemalloc::dump_mib> =
|
||||
Lazy::new(|| jemalloc::dump::mib().expect("could not create prof.dump MIB"));
|
||||
static PROF_DIR: Lazy<Utf8TempDir> =
|
||||
Lazy::new(|| camino_tempfile::tempdir().expect("could not create tempdir"));
|
||||
static PROF_FILE: Lazy<Utf8PathBuf> = Lazy::new(|| PROF_DIR.path().join("prof.dump"));
|
||||
static PROF_FILE0: Lazy<CString> = Lazy::new(|| CString::new(PROF_FILE.as_str()).unwrap());
|
||||
static DUMP_LOCK: Mutex<()> = Mutex::new(());
|
||||
|
||||
tokio::task::spawn_blocking(|| {
|
||||
let _guard = DUMP_LOCK.lock();
|
||||
PROF_MIB
|
||||
.write(&PROF_FILE0)
|
||||
.expect("could not trigger prof.dump");
|
||||
let prof_dump = std::fs::read_to_string(&*PROF_FILE).expect("could not open prof.dump");
|
||||
|
||||
Response::new(Body::from(prof_dump))
|
||||
})
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))
|
||||
}
|
||||
|
||||
fn make_router(metrics: AppMetrics) -> RouterBuilder<hyper::Body, ApiError> {
|
||||
let state = Arc::new(Mutex::new(PrometheusHandler {
|
||||
encoder: BufferedTextEncoder::new(),
|
||||
metrics,
|
||||
}));
|
||||
|
||||
endpoint::make_router()
|
||||
let mut router = endpoint::make_router()
|
||||
.get("/metrics", move |r| {
|
||||
let state = state.clone();
|
||||
request_span(r, move |b| prometheus_metrics_handler(b, state))
|
||||
})
|
||||
.get("/v1/status", status_handler)
|
||||
.get("/v1/status", status_handler);
|
||||
|
||||
let prof_enabled = jemalloc::prof::read().unwrap_or_default();
|
||||
if prof_enabled {
|
||||
warn!("activating jemalloc profiling");
|
||||
jemalloc::active::write(true).unwrap();
|
||||
router = router.get("/v1/jemalloc/prof.dump", prof_dump);
|
||||
}
|
||||
|
||||
router
|
||||
}
|
||||
|
||||
pub async fn task_main(
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::marker::PhantomData;
|
||||
use std::{ffi::CStr, marker::PhantomData};
|
||||
|
||||
use measured::{
|
||||
label::NoLabels,
|
||||
@@ -9,7 +9,9 @@ use measured::{
|
||||
text::TextEncoder,
|
||||
LabelGroup, MetricGroup,
|
||||
};
|
||||
use tikv_jemalloc_ctl::{config, epoch, epoch_mib, stats, version};
|
||||
use tikv_jemalloc_ctl::{
|
||||
config, epoch, epoch_mib, raw, stats, version, Access, AsName, MibStr, Name,
|
||||
};
|
||||
|
||||
pub struct MetricRecorder {
|
||||
epoch: epoch_mib,
|
||||
@@ -114,3 +116,59 @@ jemalloc_gauge!(mapped, mapped_mib);
|
||||
jemalloc_gauge!(metadata, metadata_mib);
|
||||
jemalloc_gauge!(resident, resident_mib);
|
||||
jemalloc_gauge!(retained, retained_mib);
|
||||
|
||||
#[allow(non_camel_case_types)]
|
||||
pub struct dump;
|
||||
|
||||
impl dump {
|
||||
pub fn mib() -> tikv_jemalloc_ctl::Result<dump_mib> {
|
||||
Ok(dump_mib(b"prof.dump\0".as_slice().name().mib_str()?))
|
||||
}
|
||||
}
|
||||
|
||||
#[repr(transparent)]
|
||||
#[derive(Copy, Clone)]
|
||||
#[allow(non_camel_case_types)]
|
||||
pub struct dump_mib(pub MibStr<[usize; 2]>);
|
||||
|
||||
impl dump_mib {
|
||||
pub fn write(self, value: &'static CStr) -> tikv_jemalloc_ctl::Result<()> {
|
||||
// No support for Access<CStr> yet.
|
||||
// self.0.write(value)
|
||||
let mib = [self.0[0], self.0[1]];
|
||||
raw::write_str_mib(&mib, value.to_bytes_with_nul())
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(non_camel_case_types)]
|
||||
pub struct active;
|
||||
|
||||
impl active {
|
||||
pub fn name() -> &'static Name {
|
||||
b"prof.active\0".as_slice().name()
|
||||
}
|
||||
}
|
||||
|
||||
impl active {
|
||||
pub fn read() -> tikv_jemalloc_ctl::Result<bool> {
|
||||
Self::name().read()
|
||||
}
|
||||
pub fn write(value: bool) -> tikv_jemalloc_ctl::Result<()> {
|
||||
Self::name().write(value)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(non_camel_case_types)]
|
||||
pub struct prof;
|
||||
|
||||
impl prof {
|
||||
pub fn name() -> &'static Name {
|
||||
b"opt.prof\0".as_slice().name()
|
||||
}
|
||||
}
|
||||
|
||||
impl prof {
|
||||
pub fn read() -> tikv_jemalloc_ctl::Result<bool> {
|
||||
Self::name().read()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1201,45 +1201,3 @@ def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder):
|
||||
max_lsn = max(Lsn(info["last_record_lsn"]) for info in infos)
|
||||
diff = max_lsn - min_lsn
|
||||
assert diff < 2 * 1024 * 1024, f"LSN diff={diff}, expected diff < 2MB due to backpressure"
|
||||
|
||||
|
||||
def test_sharding_unlogged_relation(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Check that an unlogged relation is handled properly on a sharded tenant
|
||||
|
||||
Reproducer for https://github.com/neondatabase/neon/issues/7451
|
||||
"""
|
||||
|
||||
neon_env_builder.num_pageservers = 2
|
||||
env = neon_env_builder.init_configs()
|
||||
neon_env_builder.start()
|
||||
|
||||
tenant_id = TenantId.generate()
|
||||
timeline_id = TimelineId.generate()
|
||||
env.neon_cli.create_tenant(tenant_id, timeline_id, shard_count=8)
|
||||
|
||||
# We will create many tables to ensure it's overwhelmingly likely that at least one
|
||||
# of them doesn't land on shard 0
|
||||
table_names = [f"my_unlogged_{i}" for i in range(0, 16)]
|
||||
|
||||
with env.endpoints.create_start("main", tenant_id=tenant_id) as ep:
|
||||
for table_name in table_names:
|
||||
ep.safe_psql(f"CREATE UNLOGGED TABLE {table_name} (id integer, value varchar(64));")
|
||||
ep.safe_psql(f"INSERT INTO {table_name} VALUES (1, 'foo')")
|
||||
result = ep.safe_psql(f"SELECT * from {table_name};")
|
||||
assert result == [(1, "foo")]
|
||||
ep.safe_psql(f"CREATE INDEX ON {table_name} USING btree (value);")
|
||||
|
||||
wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id)
|
||||
|
||||
with env.endpoints.create_start("main", tenant_id=tenant_id) as ep:
|
||||
for table_name in table_names:
|
||||
# Check that table works: we can select and insert
|
||||
result = ep.safe_psql(f"SELECT * from {table_name};")
|
||||
assert result == []
|
||||
ep.safe_psql(f"INSERT INTO {table_name} VALUES (2, 'bar');")
|
||||
result = ep.safe_psql(f"SELECT * from {table_name};")
|
||||
assert result == [(2, "bar")]
|
||||
|
||||
# Ensure that post-endpoint-restart modifications are ingested happily by pageserver
|
||||
wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id)
|
||||
|
||||
Reference in New Issue
Block a user