Compare commits

...

32 Commits

Author SHA1 Message Date
Bojan Serafimov
9ee5f17c41 Fix test 2022-06-29 17:40:48 -04:00
Bojan Serafimov
170c882c84 Check that tasks finish 2022-06-29 16:27:22 -04:00
Bojan Serafimov
a5bacd42f5 Merge branch 'main' into tenant-tasks-test 2022-06-29 14:52:33 -04:00
Bojan Serafimov
bfeaa4957f WIP 2022-06-29 14:24:22 -04:00
Bojan Serafimov
633b1762f6 Fix metric name 2022-06-29 11:39:15 -04:00
Bojan Serafimov
2b4c3cb932 Handle errors on shutdown 2022-06-28 09:29:59 -04:00
Bojan Serafimov
bf76f43ea4 Wait for tasks to complete 2022-06-27 17:54:31 -04:00
Bojan Serafimov
98062865f4 Handle errors inside loop 2022-06-24 17:08:44 -04:00
Bojan Serafimov
cdc81996b4 Add metrics 2022-06-24 15:28:53 -04:00
Bojan Serafimov
1169e9ea4c Rename threads to tasks 2022-06-24 11:08:55 -04:00
Bojan Serafimov
3a23869780 Log errors 2022-06-24 11:05:07 -04:00
Bojan Serafimov
796ee4d8af Instrument the task, not the await 2022-06-24 10:29:42 -04:00
Bojan Serafimov
b31ce411d2 Remove unnecessary map_err 2022-06-24 09:50:14 -04:00
Bojan Serafimov
24a5bd10a0 Add cancellation 2022-06-23 23:25:48 -04:00
Bojan Serafimov
2c029d9803 Remove redundant error context 2022-06-23 22:32:15 -04:00
Bojan Serafimov
763b00ccee Merge branch 'tenant-tasks' of github.com:neondatabase/neon into tenant-tasks 2022-06-23 14:15:14 -04:00
Bojan Serafimov
c44c8a0ea0 Add comment 2022-06-23 14:14:39 -04:00
Bojan Serafimov
692496d733 Cancel tasks 2022-06-23 13:20:53 -04:00
Bojan Serafimov
0f4552a544 Update TODO 2022-06-23 12:59:20 -04:00
Bojan Serafimov
d7d4cc8c77 Error instead of panic 2022-06-23 12:56:21 -04:00
Bojan Serafimov
9aab1d0f2b Expand blocking scope 2022-06-22 16:04:42 -04:00
Bojan Serafimov
83dc93ab0f Merge branch 'main' into tenant-tasks 2022-06-22 14:49:45 -04:00
Bojan Serafimov
9a9a58d52c Fmt 2022-06-15 09:49:07 -04:00
Bojan Serafimov
865e8740a7 Add docs 2022-06-15 09:35:41 -04:00
Bojan Serafimov
1a5d1a15d0 Use tokio sleep instead 2022-06-15 09:33:56 -04:00
Bojan Serafimov
02a9883f0f Add TenantTaskManager 2022-06-15 09:24:46 -04:00
Bojan Serafimov
ee36ca54d5 Run compaction as task 2022-06-15 09:16:57 -04:00
Bojan Serafimov
36cc6d2928 Fmt 2022-06-14 15:07:36 -04:00
Bojan Serafimov
e1a4c06918 Fix init 2022-06-14 15:05:30 -04:00
Bojan Serafimov
ec4528505e Simplify 2022-06-14 12:38:09 -04:00
Bojan Serafimov
c79e72e835 Add runtime 2022-06-14 11:08:17 -04:00
Bojan Serafimov
a1f85715ac WIP 2022-06-14 09:39:55 -04:00
8 changed files with 378 additions and 126 deletions

View File

@@ -263,6 +263,8 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
// start profiler (if enabled)
let profiler_guard = profiling::init_profiler(conf);
pageserver::tenant_tasks::init_tenant_task_pool()?;
// initialize authentication for incoming connections
let auth = match &conf.auth_type {
AuthType::Trust | AuthType::MD5 => None,

View File

@@ -158,6 +158,13 @@ pub struct LayeredRepository {
// Global pageserver config parameters
pub conf: &'static PageServerConf,
// Allows us to gracefully cancel operations that edit the directory
// that backs this layered repository. Usage:
//
// Use `let _guard = file_lock.try_read()` while writing any files.
// Use `let _guard = file_lock.write().unwrap()` to wait for all writes to finish.
pub file_lock: RwLock<()>,
// Overridden tenant-specific config parameters.
// We keep TenantConfOpt sturct here to preserve the information
// about parameters that are not set.
@@ -685,6 +692,7 @@ impl LayeredRepository {
) -> LayeredRepository {
LayeredRepository {
tenant_id,
file_lock: RwLock::new(()),
conf,
tenant_conf: Arc::new(RwLock::new(tenant_conf)),
timelines: Mutex::new(HashMap::new()),

View File

@@ -13,7 +13,7 @@ pub mod repository;
pub mod storage_sync;
pub mod tenant_config;
pub mod tenant_mgr;
pub mod tenant_threads;
pub mod tenant_tasks;
pub mod thread_mgr;
pub mod timelines;
pub mod virtual_file;

View File

@@ -230,8 +230,6 @@ pub fn shutdown_all_tenants() {
drop(m);
thread_mgr::shutdown_threads(Some(ThreadKind::WalReceiverManager), None, None);
thread_mgr::shutdown_threads(Some(ThreadKind::GarbageCollector), None, None);
thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), None, None);
// Ok, no background threads running anymore. Flush any remaining data in
// memory to disk.
@@ -330,44 +328,12 @@ pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow:
}
(TenantState::Idle, TenantState::Active) => {
info!("activating tenant {tenant_id}");
let compactor_spawn_result = thread_mgr::spawn(
ThreadKind::Compactor,
Some(tenant_id),
None,
"Compactor thread",
false,
move || crate::tenant_threads::compact_loop(tenant_id),
);
if compactor_spawn_result.is_err() {
let mut m = tenants_state::write_tenants();
m.get_mut(&tenant_id)
.with_context(|| format!("Tenant not found for id {tenant_id}"))?
.state = old_state;
drop(m);
}
compactor_spawn_result?;
let gc_spawn_result = thread_mgr::spawn(
ThreadKind::GarbageCollector,
Some(tenant_id),
None,
"GC thread",
false,
move || crate::tenant_threads::gc_loop(tenant_id),
)
.map(|_thread_id| ()) // update the `Result::Ok` type to match the outer function's return signature
.with_context(|| format!("Failed to launch GC thread for tenant {tenant_id}"));
if let Err(e) = &gc_spawn_result {
let mut m = tenants_state::write_tenants();
m.get_mut(&tenant_id)
.with_context(|| format!("Tenant not found for id {tenant_id}"))?
.state = old_state;
drop(m);
error!("Failed to start GC thread for tenant {tenant_id}, stopping its checkpointer thread: {e:?}");
thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), Some(tenant_id), None);
return gc_spawn_result;
}
// Spawn gc and compaction loops. The loops will shut themselves
// down when they notice that the tenant is inactive.
// TODO maybe use tokio::sync::watch instead?
crate::tenant_tasks::start_compaction_loop(tenant_id)?;
crate::tenant_tasks::start_gc_loop(tenant_id)?;
}
(TenantState::Idle, TenantState::Stopping) => {
info!("stopping idle tenant {tenant_id}");
@@ -379,8 +345,11 @@ pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow:
Some(tenant_id),
None,
);
thread_mgr::shutdown_threads(Some(ThreadKind::GarbageCollector), Some(tenant_id), None);
thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), Some(tenant_id), None);
// Wait until all gc/compaction tasks finish
// TODO send cancellation signal too, or make the state a watch
let repo = get_repository_for_tenant(tenant_id)?;
let _guard = repo.file_lock.write().unwrap();
}
}

View File

@@ -0,0 +1,289 @@
//! This module contains functions to serve per-tenant background processes,
//! such as compaction and GC
use std::collections::HashMap;
use std::ops::ControlFlow;
use std::time::Duration;
use crate::repository::Repository;
use crate::tenant_mgr::TenantState;
use crate::thread_mgr::ThreadKind;
use crate::{tenant_mgr, thread_mgr};
use anyhow::{self, Context};
use futures::stream::FuturesUnordered;
use futures::StreamExt;
use metrics::{register_int_counter_vec, IntCounterVec};
use once_cell::sync::{Lazy, OnceCell};
use tokio::sync::mpsc;
use tokio::sync::watch;
use tracing::*;
use utils::zid::ZTenantId;
static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_task_events",
"Number of task start/stop/fail events.",
&["event"],
)
.expect("Failed to register tenant_task_events metric")
});
///
/// Compaction task's main loop
///
async fn compaction_loop(tenantid: ZTenantId, mut cancel: watch::Receiver<()>) {
loop {
trace!("waking up");
// Run blocking part of the task
let period: Result<Result<_, anyhow::Error>, _> = tokio::task::spawn_blocking(move || {
// Break if tenant is not active
if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) {
return Ok(ControlFlow::Break(()));
}
// Break if we're not allowed to write to disk
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
// TODO do this inside repo.compaction_iteration instead.
let _guard = match repo.file_lock.try_read() {
Ok(g) => g,
Err(_) => return Ok(ControlFlow::Break(())),
};
// Run compaction
let compaction_period = repo.get_compaction_period();
repo.compaction_iteration()?;
Ok(ControlFlow::Continue(compaction_period))
})
.await;
// Decide whether to sleep or break
let sleep_duration = match period {
Ok(Ok(ControlFlow::Continue(period))) => period,
Ok(Ok(ControlFlow::Break(()))) => break,
Ok(Err(e)) => {
error!("Compaction failed, retrying: {}", e);
Duration::from_secs(2)
}
Err(e) => {
error!("Compaction join error, retrying: {}", e);
Duration::from_secs(2)
}
};
// Sleep
tokio::select! {
_ = cancel.changed() => {
trace!("received cancellation request");
break;
},
_ = tokio::time::sleep(sleep_duration) => {},
}
}
trace!(
"compaction loop stopped. State is {:?}",
tenant_mgr::get_tenant_state(tenantid)
);
}
static START_GC_LOOP: OnceCell<mpsc::Sender<ZTenantId>> = OnceCell::new();
static START_COMPACTION_LOOP: OnceCell<mpsc::Sender<ZTenantId>> = OnceCell::new();
/// Spawn a task that will periodically schedule garbage collection until
/// the tenant becomes inactive. This should be called on tenant
/// activation.
pub fn start_gc_loop(tenantid: ZTenantId) -> anyhow::Result<()> {
START_GC_LOOP
.get()
.context("Failed to get START_GC_LOOP")?
.blocking_send(tenantid)
.context("Failed to send to START_GC_LOOP channel")?;
Ok(())
}
/// Spawn a task that will periodically schedule compaction until
/// the tenant becomes inactive. This should be called on tenant
/// activation.
pub fn start_compaction_loop(tenantid: ZTenantId) -> anyhow::Result<()> {
START_COMPACTION_LOOP
.get()
.context("failed to get START_COMPACTION_LOOP")?
.blocking_send(tenantid)
.context("failed to send to START_COMPACTION_LOOP")?;
Ok(())
}
/// Spawn the TenantTaskManager
/// This needs to be called before start_gc_loop or start_compaction_loop
pub fn init_tenant_task_pool() -> anyhow::Result<()> {
let runtime = tokio::runtime::Builder::new_multi_thread()
.thread_name("tenant-task-worker")
.worker_threads(40) // Way more than necessary
.max_blocking_threads(100) // Way more than necessary
.enable_all()
.build()?;
let (gc_send, mut gc_recv) = mpsc::channel::<ZTenantId>(100);
START_GC_LOOP
.set(gc_send)
.expect("Failed to set START_GC_LOOP");
let (compaction_send, mut compaction_recv) = mpsc::channel::<ZTenantId>(100);
START_COMPACTION_LOOP
.set(compaction_send)
.expect("Failed to set START_COMPACTION_LOOP");
// TODO this is getting repetitive
let mut gc_loops = HashMap::<ZTenantId, watch::Sender<()>>::new();
let mut compaction_loops = HashMap::<ZTenantId, watch::Sender<()>>::new();
thread_mgr::spawn(
ThreadKind::TenantTaskManager,
None,
None,
"Tenant task manager main thread",
true,
move || {
runtime.block_on(async move {
let mut futures = FuturesUnordered::new();
loop {
tokio::select! {
_ = thread_mgr::shutdown_watcher() => {
// Send cancellation to all tasks
for (_, cancel) in gc_loops.drain() {
cancel.send(()).ok();
}
for (_, cancel) in compaction_loops.drain() {
cancel.send(()).ok();
}
// Exit after all tasks finish
while let Some(result) = futures.next().await {
match result {
Ok(()) => {
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
},
Err(e) => {
TENANT_TASK_EVENTS.with_label_values(&["panic"]).inc();
error!("loop join error {}", e)
},
}
}
break;
},
tenantid = gc_recv.recv() => {
let tenantid = tenantid.expect("Gc task channel closed unexpectedly");
// Spawn new task, request cancellation of the old one if exists
let (cancel_send, cancel_recv) = watch::channel(());
let handle = tokio::spawn(gc_loop(tenantid, cancel_recv)
.instrument(trace_span!("gc loop", tenant = %tenantid)));
if let Some(old_cancel_send) = gc_loops.insert(tenantid, cancel_send) {
old_cancel_send.send(()).ok();
}
// Update metrics, remember handle
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
futures.push(handle);
},
tenantid = compaction_recv.recv() => {
let tenantid = tenantid.expect("Compaction task channel closed unexpectedly");
// Spawn new task, request cancellation of the old one if exists
let (cancel_send, cancel_recv) = watch::channel(());
// TODO this instrument doesn't work
let handle = tokio::spawn(compaction_loop(tenantid, cancel_recv)
.instrument(trace_span!("compaction loop", tenant = %tenantid)));
if let Some(old_cancel_send) = compaction_loops.insert(tenantid, cancel_send) {
old_cancel_send.send(()).ok();
}
// Update metrics, remember handle
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
futures.push(handle);
},
result = futures.next() => {
// Log and count any unhandled panics
match result {
Some(Ok(())) => {
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
},
Some(Err(e)) => {
TENANT_TASK_EVENTS.with_label_values(&["panic"]).inc();
error!("loop join error {}", e)
},
None => {},
};
},
}
}
});
Ok(())
},
)?;
Ok(())
}
///
/// GC task's main loop
///
async fn gc_loop(tenantid: ZTenantId, mut cancel: watch::Receiver<()>) {
loop {
trace!("waking up");
// Run blocking part of the task
let period: Result<Result<_, anyhow::Error>, _> = tokio::task::spawn_blocking(move || {
// Break if tenant is not active
if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) {
return Ok(ControlFlow::Break(()));
}
// Break if we're not allowed to write to disk
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
// TODO do this inside repo.gc_iteration instead.
let _guard = match repo.file_lock.try_read() {
Ok(g) => g,
Err(_) => return Ok(ControlFlow::Break(())),
};
// Run gc
let gc_period = repo.get_gc_period();
let gc_horizon = repo.get_gc_horizon();
if gc_horizon > 0 {
repo.gc_iteration(None, gc_horizon, repo.get_pitr_interval(), false)?;
}
Ok(ControlFlow::Continue(gc_period))
})
.await;
// Decide whether to sleep or break
let sleep_duration = match period {
Ok(Ok(ControlFlow::Continue(period))) => period,
Ok(Ok(ControlFlow::Break(()))) => break,
Ok(Err(e)) => {
error!("Gc failed, retrying: {}", e);
Duration::from_secs(2)
}
Err(e) => {
error!("Gc join error, retrying: {}", e);
Duration::from_secs(2)
}
};
// Sleep
tokio::select! {
_ = cancel.changed() => {
trace!("received cancellation request");
break;
},
_ = tokio::time::sleep(sleep_duration) => {},
}
}
trace!(
"GC loop stopped. State is {:?}",
tenant_mgr::get_tenant_state(tenantid)
);
}

View File

@@ -1,79 +0,0 @@
//! This module contains functions to serve per-tenant background processes,
//! such as compaction and GC
use crate::repository::Repository;
use crate::tenant_mgr;
use crate::tenant_mgr::TenantState;
use anyhow::Result;
use std::time::Duration;
use tracing::*;
use utils::zid::ZTenantId;
///
/// Compaction thread's main loop
///
pub fn compact_loop(tenantid: ZTenantId) -> Result<()> {
if let Err(err) = compact_loop_ext(tenantid) {
error!("compact loop terminated with error: {:?}", err);
Err(err)
} else {
Ok(())
}
}
fn compact_loop_ext(tenantid: ZTenantId) -> Result<()> {
loop {
if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) {
break;
}
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
let compaction_period = repo.get_compaction_period();
std::thread::sleep(compaction_period);
trace!("compaction thread for tenant {} waking up", tenantid);
// Compact timelines
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
repo.compaction_iteration()?;
}
trace!(
"compaction thread stopped for tenant {} state is {:?}",
tenantid,
tenant_mgr::get_tenant_state(tenantid)
);
Ok(())
}
///
/// GC thread's main loop
///
pub fn gc_loop(tenantid: ZTenantId) -> Result<()> {
loop {
if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) {
break;
}
trace!("gc thread for tenant {} waking up", tenantid);
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
let gc_horizon = repo.get_gc_horizon();
// Garbage collect old files that are not needed for PITR anymore
if gc_horizon > 0 {
repo.gc_iteration(None, gc_horizon, repo.get_pitr_interval(), false)?;
}
// TODO Write it in more adequate way using
// condvar.wait_timeout() or something
let mut sleep_time = repo.get_gc_period().as_secs();
while sleep_time > 0 && tenant_mgr::get_tenant_state(tenantid) == Some(TenantState::Active)
{
sleep_time -= 1;
std::thread::sleep(Duration::from_secs(1));
}
}
trace!(
"GC thread stopped for tenant {} state is {:?}",
tenantid,
tenant_mgr::get_tenant_state(tenantid)
);
Ok(())
}

View File

@@ -94,11 +94,8 @@ pub enum ThreadKind {
// Main walreceiver manager thread that ensures that every timeline spawns a connection to safekeeper, to fetch WAL.
WalReceiverManager,
// Thread that handles compaction of all timelines for a tenant.
Compactor,
// Thread that handles GC of a tenant
GarbageCollector,
// Thread that schedules new compaction and gc jobs
TenantTaskManager,
// Thread that flushes frozen in-memory layers to disk
LayerFlushThread,

View File

@@ -0,0 +1,66 @@
from fixtures.neon_fixtures import NeonEnvBuilder
from uuid import UUID
def get_only_element(l):
assert len(l) == 1
return l[0]
def test_tenant_tasks(neon_env_builder: NeonEnvBuilder):
name = "test_tenant_tasks"
env = neon_env_builder.init_start()
client = env.pageserver.http_client()
def get_state(tenant):
all_states = client.tenant_list()
matching = [t for t in all_states if t["id"] == tenant.hex]
return get_only_element(matching)["state"]
def get_metric_value(name):
metrics = client.get_metrics()
relevant = [line for line in metrics.splitlines() if line.startswith(name)]
if len(relevant) == 0:
return 0
line = get_only_element(relevant)
value = line.lstrip(name).strip()
return int(value)
def detach_all_timelines(tenant):
timelines = [UUID(t["timeline_id"]) for t in client.timeline_list(tenant)]
for t in timelines:
client.timeline_detach(tenant, t)
# Create tenant, start compute
tenant, _ = env.neon_cli.create_tenant()
timeline = env.neon_cli.create_timeline(name, tenant_id=tenant)
pg = env.postgres.create_start(name, tenant_id=tenant)
assert(get_state(tenant) == "Active")
# Stop compute, detach timelines
# TODO tenant should go idle even if we don't explicitly detach
pg.stop()
detach_all_timelines(tenant)
import time; time.sleep(1)
assert(get_state(tenant) == "Idle")
# Detach all tenants
for tenant_info in client.tenant_list():
tenant_id = UUID(tenant_info["id"])
detach_all_timelines(tenant_id)
# TODO poll wait until idle instead
import time; time.sleep(1)
assert get_state(tenant_id) == "Idle"
# Read metrics
import time; time.sleep(1)
tasks_started = get_metric_value('pageserver_tenant_task_events{event="start"}')
tasks_ended = get_metric_value('pageserver_tenant_task_events{event="stop"}')
tasks_panicked = get_metric_value('pageserver_tenant_task_events{event="panic"}')
# TODO this fails because the "active -> idle" transition only waits for gc to
# finish without cancelling it, and gc waits for 100 seconds.
assert tasks_started == tasks_ended
assert tasks_panicked == 0