Compare commits

...

13 Commits

Author SHA1 Message Date
Christian Schwarz
afcb924bab Revert "Revert "revert recent VirtualFile asyncification changes (#5291)""
This reverts commit c06279a40f.
2024-01-09 18:36:01 +00:00
Christian Schwarz
fd14773bfc persist benchmarked config 2024-01-09 18:36:01 +00:00
Christian Schwarz
d8c90ea659 page cache: improve eviction-related metrics
These changes help with identifying thrashing.

The existing `pageserver_page_cache_find_victim_iters_total` is already
useful, but, it doesn't tell us how many individual find_victim() calls
are happening, only how many clock-LRU steps happened in the entire system,
without info about whether we needed to actually evict other data vs
just scan for a long time, e.g., because the cache is large.

The changes in this PR allow us to
1. count each possible outcome separately, esp evictions
2. compute mean iterations/outcome

I don't think anyone except me was paying close attention to
`pageserver_page_cache_find_victim_iters_total` before, so,
I think the slight behavior change of also counting iterations
for the 'iters exceeded' case is fine.

refs https://github.com/neondatabase/cloud/issues/8351
refs https://github.com/neondatabase/neon/issues/5479
2024-01-09 18:35:28 +00:00
Christian Schwarz
79a4ee938b Revert "revert recent VirtualFile asyncification changes (#5291)"
This reverts commit ab1f37e908.

fixes #5479
2024-01-09 18:35:27 +00:00
Christian Schwarz
ebcdd758eb test results
34e69cfc93 18:04:59 - 18:31:15 => 05:01 + 21:15 = 26:16 duration

That's ca 5min slower than what we saw without tokio-epoll-uring
(scratches head)
2024-01-09 18:35:27 +00:00
Christian Schwarz
c61d27c18d improve instructions 2024-01-09 18:35:27 +00:00
Christian Schwarz
3bb58f78f0 usage instructions for generator script 2024-01-09 18:35:27 +00:00
Christian Schwarz
c12af4ea58 many_tenants script now works 2024-01-09 18:35:27 +00:00
Christian Schwarz
2c4d2e9d7e update many tenants script to use the new method for duplicating tenants (copy-paste from benchmarking WIP PR) 2024-01-09 18:35:27 +00:00
Christian Schwarz
2043221fca Squashed commit of the following:
commit de90ba56d4
Author: Christian Schwarz <christian@neon.tech>
Date:   Mon Nov 27 14:47:26 2023 +0000

    expose generation number in API

commit ae2c7589f9
Author: Christian Schwarz <christian@neon.tech>
Date:   Mon Nov 27 14:53:13 2023 +0000

    pagectl: add subcommand to rewrite layer file history
2024-01-09 18:35:26 +00:00
Christian Schwarz
bf40845db4 measured BACKGROUND_RUNTIME performance using wrk
Launch wrk from command line 3-4 seconds after the load starts.
=> blocking of executor threads is clearly visible, my branch
  performs _much_ better.

baseline: commit 15b8618d25 (HEAD -> problame/loadtest-baseline, origin/problame/loadtest-baseline, main)
neon-main (compaction semaphore disabled!)

admin@ip-172-31-13-23:[~/neon]: wrk --latency http://localhost:2342
Running 10s test @ http://localhost:2342
  2 threads and 10 connections
  Thread Stats   Avg      Stdev     Max   +/- Stdev
    Latency    71.42ms   15.97ms 125.18ms   70.82%
    Req/Sec    41.44     28.85   101.00     57.35%
  Latency Distribution
     50%   72.53ms
     75%   82.07ms
     90%   91.44ms
     99%  116.56ms
  291 requests in 10.01s, 22.73KB read
  Socket errors: connect 0, read 0, write 0, timeout 10
Requests/sec:     29.07
Transfer/sec:      2.27KB

this branch (compaction semaphore also disabled!):

admin@ip-172-31-13-23:[~/neon]: wrk --latency http://localhost:2342
Running 10s test @ http://localhost:2342
  2 threads and 10 connections
  Thread Stats   Avg      Stdev     Max   +/- Stdev
    Latency    45.74ms   64.13ms 293.44ms   83.27%
    Req/Sec   442.81    258.18     1.32k    69.79%
  Latency Distribution
     50%    2.92ms
     75%   75.52ms
     90%  148.03ms
     99%  248.50ms
  8641 requests in 10.01s, 675.08KB read
Requests/sec:    862.81
Transfer/sec:     67.41KB
2024-01-09 18:31:20 +00:00
Christian Schwarz
44f885f444 HACK: BACKGROUND_RUNTIME webserver to measure response time using wrk 2024-01-09 18:31:20 +00:00
Christian Schwarz
eb679d4b27 REPRO the problem: , uses 430GB of space; 4 seconds load time; constant 20kIOPS after ~20s 2024-01-09 18:31:20 +00:00
14 changed files with 309 additions and 17 deletions

View File

@@ -0,0 +1,39 @@
remote_storage ={local_path='/mnt/many_tenants/test_pageserver_startup_many_tenants/repo/local_fs_remote_storage/pageserver'}
id =1
pg_distrib_dir ='/home/admin/neon/pg_install'
http_auth_type ='Trust'
pg_auth_type ='Trust'
listen_http_addr ='localhost:15004'
listen_pg_addr ='localhost:15003'
broker_endpoint ='http://127.0.0.1:15001/'
control_plane_api ='http://127.0.0.1:15002/'
# Initial configuration file created by 'pageserver --init'
#listen_pg_addr = '127.0.0.1:64000'
#listen_http_addr = '127.0.0.1:9898'
#wait_lsn_timeout = '60 s'
#wal_redo_timeout = '60 s'
#max_file_descriptors = 100
# initial superuser role name to use when creating a new tenant
#initial_superuser_name = 'cloud_admin'
#broker_endpoint = 'http://127.0.0.1:50051'
#log_format = 'plain'
#concurrent_tenant_size_logical_size_queries = '1'
metric_collection_endpoint = "https://127.0.0.1:6666"
#metric_collection_interval = '10 min'
#cached_metric_collection_interval = '0s'
#synthetic_size_calculation_interval = '10 min'
#disk_usage_based_eviction = { max_usage_pct = .., min_avail_bytes = .., period = "10s"}
#background_task_maximum_delay = '10s'
[tenant_config]

View File

@@ -368,6 +368,8 @@ pub struct TenantInfo {
/// If a layer is present in both local FS and S3, it counts only once.
pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
pub attachment_status: TenantAttachmentStatus,
#[serde(skip_serializing_if = "Option::is_none")]
pub generation: Option<u32>,
}
#[derive(Serialize, Deserialize, Clone)]
@@ -909,6 +911,7 @@ mod tests {
state: TenantState::Active,
current_physical_size: Some(42),
attachment_status: TenantAttachmentStatus::Attached,
generation: None,
};
let expected_active = json!({
"id": original_active.id.to_string(),
@@ -929,6 +932,7 @@ mod tests {
},
current_physical_size: Some(42),
attachment_status: TenantAttachmentStatus::Attached,
generation: None,
};
let expected_broken = json!({
"id": original_broken.id.to_string(),

View File

@@ -652,6 +652,31 @@ fn start_pageserver(
);
}
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::BackgroundRuntimeTurnaroundMeasure,
None,
None,
"background runtime turnaround measure",
true,
async move {
let server = hyper::Server::try_bind(&"0.0.0.0:2342".parse().unwrap()).expect("bind");
let server = server
.serve(hyper::service::make_service_fn(|_| async move {
Ok::<_, std::convert::Infallible>(hyper::service::service_fn(
move |_: hyper::Request<hyper::Body>| async move {
Ok::<_, std::convert::Infallible>(hyper::Response::new(
hyper::Body::from(format!("alive")),
))
},
))
}))
.with_graceful_shutdown(task_mgr::shutdown_watcher());
server.await?;
Ok(())
},
);
let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
// All started up! Now just sit and wait for shutdown signal.

View File

@@ -267,7 +267,7 @@ async fn calculate_synthetic_size_worker(
}
};
for (tenant_shard_id, tenant_state) in tenants {
for (tenant_shard_id, tenant_state, _gen) in tenants {
if tenant_state != TenantState::Active {
continue;
}

View File

@@ -196,7 +196,7 @@ pub(super) async fn collect_all_metrics(
}
};
let tenants = futures::stream::iter(tenants).filter_map(|(id, state)| async move {
let tenants = futures::stream::iter(tenants).filter_map(|(id, state, _)| async move {
if state != TenantState::Active || !id.is_zero() {
None
} else {

View File

@@ -633,7 +633,7 @@ async fn collect_eviction_candidates(
let mut candidates = Vec::new();
for (tenant_id, _state) in &tenants {
for (tenant_id, _state, _gen) in &tenants {
if cancel.is_cancelled() {
return Ok(EvictionCandidates::Cancelled);
}

View File

@@ -830,11 +830,12 @@ async fn tenant_list_handler(
ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into())
})?
.iter()
.map(|(id, state)| TenantInfo {
.map(|(id, state, gen)| TenantInfo {
id: *id,
state: state.clone(),
current_physical_size: None,
attachment_status: state.attachment_status(),
generation: (*gen).into(),
})
.collect::<Vec<TenantInfo>>();
@@ -864,6 +865,7 @@ async fn tenant_status(
state: state.clone(),
current_physical_size: Some(current_physical_size),
attachment_status: state.attachment_status(),
generation: tenant.generation().into(),
},
timelines: tenant.list_timeline_ids(),
})

View File

@@ -337,6 +337,63 @@ pub(crate) mod page_cache_eviction_metrics {
}
}
/// Per-outcome metrics for the page cache `find_victim()` loop.
///
/// Two counter families are exported, both keyed by an `outcome` label:
/// * `pageserver_page_cache_find_victim_iters_total`: loop iterations spent,
/// * `pageserver_page_cache_find_victim_calls`: number of observed calls.
///
/// Dividing the former by the latter yields the mean iterations per outcome.
pub(crate) mod page_cache_eviction_metrics {
    use std::num::NonZeroUsize;

    use metrics::{register_int_counter_vec, IntCounter, IntCounterVec};
    use once_cell::sync::Lazy;

    /// How a single victim search ended, together with the number of loop
    /// iterations it took to get there.
    #[derive(Clone, Copy)]
    pub(crate) enum Outcome {
        /// Reported under the `found_empty` label.
        FoundSlotUnused { iters: NonZeroUsize },
        /// Reported under the `found_evicted` label.
        FoundSlotEvicted { iters: NonZeroUsize },
        /// Reported under the `err_iters_exceeded` label.
        ItersExceeded { iters: NonZeroUsize },
    }

    static ITERS_TOTAL_VEC: Lazy<IntCounterVec> = Lazy::new(|| {
        register_int_counter_vec!(
            "pageserver_page_cache_find_victim_iters_total",
            "Counter for the number of iterations in the find_victim loop",
            &["outcome"],
        )
        .expect("failed to define a metric")
    });

    static CALLS_VEC: Lazy<IntCounterVec> = Lazy::new(|| {
        register_int_counter_vec!(
            "pageserver_page_cache_find_victim_calls",
            // The `\` continuation drops the newline and the next line's
            // leading whitespace, so the separating space must precede it.
            "Incremented at the end of each find_victim() call. \
             Filter by outcome to get e.g., eviction rate.",
            &["outcome"]
        )
        .expect("failed to define a metric")
    });

    /// Records one finished victim search: adds its iteration count to the
    /// iterations counter and increments the calls counter, both under the
    /// label matching `outcome`.
    ///
    /// For [`Outcome::ItersExceeded`] the page cache error counter is bumped
    /// as well, via `page_cache_errors_inc`.
    pub(crate) fn observe(outcome: Outcome) {
        // Each expansion owns its own pair of lazily initialised statics, so the
        // labelled child counters are looked up once per label rather than on
        // every call.
        macro_rules! dry {
            ($label:literal, $iters:expr) => {{
                const LABEL: &str = $label;
                static ITERS_TOTAL: Lazy<IntCounter> =
                    Lazy::new(|| ITERS_TOTAL_VEC.with_label_values(&[LABEL]));
                static CALLS: Lazy<IntCounter> =
                    Lazy::new(|| CALLS_VEC.with_label_values(&[LABEL]));
                // usize -> u64 is lossless wherever usize is at most 64 bits wide.
                ITERS_TOTAL.inc_by(($iters.get()) as u64);
                CALLS.inc();
            }};
        }
        match outcome {
            Outcome::FoundSlotUnused { iters } => dry!("found_empty", iters),
            Outcome::FoundSlotEvicted { iters } => {
                dry!("found_evicted", iters)
            }
            Outcome::ItersExceeded { iters } => {
                dry!("err_iters_exceeded", iters);
                super::page_cache_errors_inc(super::PageCacheErrorKind::EvictIterLimit);
            }
        }
    }
}
pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_page_cache_acquire_pinned_slot_seconds",

View File

@@ -300,6 +300,8 @@ pub enum TaskKind {
DebugTool,
BackgroundRuntimeTurnaroundMeasure,
#[cfg(test)]
UnitTest,
}

View File

@@ -1923,6 +1923,10 @@ impl Tenant {
self.current_state() == TenantState::Active
}
/// Returns the value of this tenant's `generation` field.
pub fn generation(&self) -> Generation {
self.generation
}
/// Changes tenant status to active, unless shutdown was already requested.
///
/// `background_jobs_can_start` is an optional barrier set to a value during pageserver startup

View File

@@ -1632,8 +1632,8 @@ pub(crate) enum TenantMapListError {
///
/// Get list of tenants, for the mgmt API
///
pub(crate) async fn list_tenants() -> Result<Vec<(TenantShardId, TenantState)>, TenantMapListError>
{
pub(crate) async fn list_tenants(
) -> Result<Vec<(TenantShardId, TenantState, Generation)>, TenantMapListError> {
let tenants = TENANTS.read().unwrap();
let m = match &*tenants {
TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
@@ -1641,7 +1641,9 @@ pub(crate) async fn list_tenants() -> Result<Vec<(TenantShardId, TenantState)>,
};
Ok(m.iter()
.filter_map(|(id, tenant)| match tenant {
TenantSlot::Attached(tenant) => Some((*id, tenant.current_state())),
TenantSlot::Attached(tenant) => {
Some((*id, tenant.current_state(), tenant.generation()))
}
TenantSlot::Secondary(_) => None,
TenantSlot::InProgress(_) => None,
})

View File

@@ -708,11 +708,11 @@ impl DeltaLayerInner {
expected_summary.index_start_blk = actual_summary.index_start_blk;
expected_summary.index_root_blk = actual_summary.index_root_blk;
if actual_summary != expected_summary {
bail!(
"in-file summary does not match expected summary. actual = {:?} expected = {:?}",
actual_summary,
expected_summary
);
// bail!(
// "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
// actual_summary,
// expected_summary
// );
}
}

View File

@@ -390,11 +390,11 @@ impl ImageLayerInner {
expected_summary.index_root_blk = actual_summary.index_root_blk;
if actual_summary != expected_summary {
bail!(
"in-file summary does not match expected summary. actual = {:?} expected = {:?}",
actual_summary,
expected_summary
);
// bail!(
// "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
// actual_summary,
// expected_summary
// );
}
}

View File

@@ -0,0 +1,157 @@
import queue
import shutil
import subprocess
import threading
from pathlib import Path
from typing import List, Optional
from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
PgBin,
last_flush_lsn_upload,
)
from fixtures.pageserver.utils import wait_until_tenant_active
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
from fixtures.types import TenantId
def duplicate_tenant(
    env: NeonEnv, remote_storage: LocalFsStorage, template_tenant: TenantId, new_tenant: TenantId
):
    """
    Clone the remote-storage contents of `template_tenant` under `new_tenant`.

    Every file of every timeline directory below the template's `timelines`
    directory is copied with `shutil.copy2`. Each copied file whose name
    contains `__` is then patched in place by running
    `pagectl layer rewrite-summary <file> --new-tenant-id <new_tenant>`.

    Raises `FileExistsError` if the destination tenant or timeline directories
    already exist, and `subprocess.CalledProcessError` if `pagectl` fails.
    """
    src_timelines_dir: Path = remote_storage.tenant_path(template_tenant) / "timelines"
    assert src_timelines_dir.is_dir(), f"{src_timelines_dir} is not a directory"

    assert isinstance(remote_storage, LocalFsStorage)
    dst_timelines_dir: Path = remote_storage.tenant_path(new_tenant) / "timelines"
    # Create the tenant directory, then its `timelines` child; both must be new.
    for fresh_dir in (dst_timelines_dir.parent, dst_timelines_dir):
        fresh_dir.mkdir(parents=False, exist_ok=False)

    for timeline_entry in src_timelines_dir.iterdir():
        src_tl_dir = src_timelines_dir / timeline_entry.name
        assert src_tl_dir.is_dir(), f"{src_tl_dir} is not a directory"

        dst_tl_dir = dst_timelines_dir / timeline_entry.name
        dst_tl_dir.mkdir(parents=False, exist_ok=False)

        for src_file in timeline_entry.iterdir():
            shutil.copy2(src_file, dst_tl_dir)
            if "__" not in src_file.name:
                # index_part etc need no patching
                continue

            # TODO: abstract the pagectl binary like the other binaries
            pagectl_bin = str(env.neon_binpath / "pagectl")
            copied_file = str(dst_tl_dir / src_file.name)
            rewrite_cmd: List[str] = [
                pagectl_bin,
                "layer",
                "rewrite-summary",
                copied_file,
                "--new-tenant-id",
                str(new_tenant),
            ]
            subprocess.run(rewrite_cmd, check=True)
def test_pageserver_startup_many_tenants(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
"""
Usage
TEST_OUTPUT=/mnt/many_tenants NEON_BIN=$PWD/target/release DEFAULT_PG_VERSION=15 ./scripts/pytest --preserve-database-files --timeout=0 ./test_runner/performance/test_pageserver_startup_many_tenants.py
Then
export NEON_REPO_DIR=/mnt/many_tenants/test_pageserver_startup_many_tenants/repo
# edit $NEON_REPO_DIR/pageserver_1/pageserver.toml to use metric collection,
# with intervals from prod:
#
# metric_collection_endpoint = "https://127.0.0.1:6666"
# metric_collection_interval: 10min
# cached_metric_collection_interval: 0s
# run a fake metric collection endpoint in some other terminal using
# python3 -m http.server 6666 > /dev/null 2>&1
# then start pageserver
ulimit -SH -n 100000
./target/release/neon_local start
"""
# NOTE(review): indentation was lost in this view of the file; the nesting of
# the statements below has to be confirmed against the real source.
# Start an environment backed by local-filesystem remote storage, with
# generations enabled.
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
neon_env_builder.enable_generations = True
env = neon_env_builder.init_start()
remote_storage = env.pageserver_remote_storage
assert isinstance(remote_storage, LocalFsStorage)
# cleanup initial tenant
env.pageserver.tenant_detach(env.initial_tenant)
# create our template tenant
tenant_config_mgmt_api = {
"gc_period": "0s",
"checkpoint_timeout": "3650 day",
"compaction_period": "20 s",
"compaction_threshold": 10,
"compaction_target_size": 134217728,
"checkpoint_distance": 268435456,
"image_creation_threshold": 3,
}
# Same settings with every value stringified, as passed to the CLI below.
tenant_config_cli = {k: str(v) for k, v in tenant_config_mgmt_api.items()}
ps_http = env.pageserver.http_client()
template_tenant, template_timeline = env.neon_cli.create_tenant(conf=tenant_config_cli)
# Write a little data into the template tenant, upload it, then detach the
# tenant so that its remote-storage contents can be duplicated.
ep = env.endpoints.create_start("main", tenant_id=template_tenant)
ep.safe_psql("create table foo(b text)")
for _i in range(0, 8):
ep.safe_psql("insert into foo(b) values ('some text')")
last_flush_lsn_upload(env, ep, template_tenant, template_timeline)
ep.stop_and_destroy()
env.pageserver.tenant_detach(template_tenant)
# duplicate the tenant in remote storage
# Each worker pulls tenant ids off the queue and duplicates the template into
# them until it receives a `None` sentinel.
def worker(queue: queue.Queue[Optional[TenantId]]):
while True:
tenant_id = queue.get()
if tenant_id is None:
return
assert isinstance(remote_storage, LocalFsStorage)
duplicate_tenant(env, remote_storage, template_tenant, tenant_id)
new_tenants: List[TenantId] = [TenantId.generate() for _ in range(0, 20_000)]
duplications: queue.Queue[Optional[TenantId]] = queue.Queue()
for t in new_tenants:
duplications.put(t)
workers = []
# NOTE(review): presumably one `None` sentinel is queued per started worker
# (i.e. the `put(None)` belongs inside this loop) — confirm.
for _ in range(0, 8):
w = threading.Thread(target=worker, args=[duplications])
workers.append(w)
w.start()
duplications.put(None)
for w in workers:
w.join()
# for evaluation, use the same background loop periods as in prod
# (the explicit compaction/gc periods are removed from the copied config)
benchmark_tenant_config = {k: v for k, v in tenant_config_mgmt_api.items()}
del benchmark_tenant_config["compaction_period"]
del benchmark_tenant_config["gc_period"]
benchmark_tenant_config["eviction_policy"] = {
"kind": "LayerAccessThreshold",
"period": "10m",
# don't do evictions
"threshold": "1000d",
}
# No tenant may be attached before the duplicates are attached.
assert ps_http.tenant_list() == []
for tenant in new_tenants:
env.pageserver.tenant_attach(tenant, config=benchmark_tenant_config)
for tenant in new_tenants:
wait_until_tenant_active(ps_http, tenant)
# ensure all layers are resident for predictable performance
# TODO: ensure all kinds of eviction are disabled (per-tenant, disk-usage-based)
for tenant in new_tenants:
ps_http.download_all_layers(tenant, template_timeline)