pageserver: remove legacy tenant config code, clean up redundant generation none/broken usages (#7947)

## Problem

In https://github.com/neondatabase/neon/pull/5299, the new config-v1
tenant config file was added to hold the LocationConf type. We left the
old config file in place for forward compat, and because running without
generations (therefore without LocationConf) as still useful before the
storage controller was ready for prime-time.

Closes: https://github.com/neondatabase/neon/issues/5388

## Summary of changes

- Remove code for reading and writing the legacy config file
- Remove Generation::Broken: it was unused.
- Treat missing config file on disk as an error loading a tenant, rather
than defaulting it. We can now remove LocationConf::default, and thereby
guarantee that we never construct a tenant with a None generation.
- Update some comments + add some assertions to clarify that
Generation::None is only used in layer metadata, not in the state of a
running tenant.
- Update docker compose test to create tenants with a generation
This commit is contained in:
John Spray
2024-06-26 20:53:59 +01:00
committed by GitHub
parent 76fc3d4aa1
commit c39d5b03e8
13 changed files with 67 additions and 237 deletions

10
docker-compose/README.md Normal file
View File

@@ -0,0 +1,10 @@
# Example docker compose configuration
The configuration in this directory is used for testing Neon docker images: it is
not intended for deploying a usable system. To run a development environment where
you can experiment with a minature Neon system, use `cargo neon` rather than container images.
This configuration does not start the storage controller, because the controller
needs a way to reconfigure running computes, and no such thing exists in this setup.

View File

@@ -23,11 +23,10 @@ echo "Page server is ready."
echo "Create a tenant and timeline"
generate_id tenant_id
PARAMS=(
-sb
-X POST
-X PUT
-H "Content-Type: application/json"
-d "{\"new_tenant_id\": \"${tenant_id}\"}"
http://pageserver:9898/v1/tenant/
-d "{\"mode\": \"AttachedSingle\", \"generation\": 1, \"tenant_conf\": {}}"
"http://pageserver:9898/v1/tenant/${tenant_id}/location_config"
)
result=$(curl "${PARAMS[@]}")
echo $result | jq .

View File

@@ -625,8 +625,7 @@ pub struct TenantInfo {
/// If a layer is present in both local FS and S3, it counts only once.
pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
pub attachment_status: TenantAttachmentStatus,
#[serde(skip_serializing_if = "Option::is_none")]
pub generation: Option<u32>,
pub generation: u32,
}
#[derive(Serialize, Deserialize, Clone)]
@@ -1453,7 +1452,7 @@ mod tests {
state: TenantState::Active,
current_physical_size: Some(42),
attachment_status: TenantAttachmentStatus::Attached,
generation: None,
generation: 1,
};
let expected_active = json!({
"id": original_active.id.to_string(),
@@ -1463,7 +1462,8 @@ mod tests {
"current_physical_size": 42,
"attachment_status": {
"slug":"attached",
}
},
"generation" : 1
});
let original_broken = TenantInfo {
@@ -1474,7 +1474,7 @@ mod tests {
},
current_physical_size: Some(42),
attachment_status: TenantAttachmentStatus::Attached,
generation: None,
generation: 1,
};
let expected_broken = json!({
"id": original_broken.id.to_string(),
@@ -1488,7 +1488,8 @@ mod tests {
"current_physical_size": 42,
"attachment_status": {
"slug":"attached",
}
},
"generation" : 1
});
assert_eq!(

View File

@@ -9,20 +9,11 @@ use serde::{Deserialize, Serialize};
/// numbers are used.
#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
pub enum Generation {
// Generations with this magic value will not add a suffix to S3 keys, and will not
// be included in persisted index_part.json. This value is only to be used
// during migration from pre-generation metadata to generation-aware metadata,
// and should eventually go away.
//
// A special Generation is used rather than always wrapping Generation in an Option,
// so that code handling generations doesn't have to be aware of the legacy
// case everywhere it touches a generation.
// The None Generation is used in the metadata of layers written before generations were
// introduced. A running Tenant always has a valid generation, but the layer metadata may
// include None generations.
None,
// Generations with this magic value may never be used to construct S3 keys:
// we will panic if someone tries to. This is for Tenants in the "Broken" state,
// so that we can satisfy their constructor with a Generation without risking
// a code bug using it in an S3 write (broken tenants should never write)
Broken,
Valid(u32),
}
@@ -42,11 +33,6 @@ impl Generation {
Self::None
}
// Create a new generation that will panic if you try to use get_suffix
pub fn broken() -> Self {
Self::Broken
}
pub const fn new(v: u32) -> Self {
Self::Valid(v)
}
@@ -60,9 +46,6 @@ impl Generation {
match self {
Self::Valid(v) => GenerationFileSuffix(Some(*v)),
Self::None => GenerationFileSuffix(None),
Self::Broken => {
panic!("Tried to use a broken generation");
}
}
}
@@ -86,7 +69,6 @@ impl Generation {
}
}
Self::None => Self::None,
Self::Broken => panic!("Attempted to use a broken generation"),
}
}
@@ -95,7 +77,6 @@ impl Generation {
match self {
Self::Valid(n) => Self::Valid(*n + 1),
Self::None => Self::Valid(1),
Self::Broken => panic!("Attempted to use a broken generation"),
}
}
@@ -128,7 +109,7 @@ impl Serialize for Generation {
if let Self::Valid(v) = self {
v.serialize(serializer)
} else {
// We should never be asked to serialize a None or Broken. Structures
// We should never be asked to serialize a None. Structures
// that include an optional generation should convert None to an
// Option<Generation>::None
Err(serde::ser::Error::custom(
@@ -159,9 +140,6 @@ impl Debug for Generation {
Self::None => {
write!(f, "<none>")
}
Self::Broken => {
write!(f, "<broken>")
}
}
}
}

View File

@@ -36,10 +36,7 @@ use crate::tenant::{config::TenantConfOpt, timeline::GetImpl};
use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine};
use crate::{tenant::config::TenantConf, virtual_file};
use crate::{
TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME,
TIMELINE_DELETE_MARK_SUFFIX,
};
use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX};
use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
@@ -810,15 +807,11 @@ impl PageServerConf {
}
/// Points to a place in pageserver's local directory,
/// where certain tenant's tenantconf file should be located.
///
/// Legacy: superseded by tenant_location_config_path. Eventually
/// remove this function.
pub fn tenant_config_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
self.tenant_path(tenant_shard_id).join(TENANT_CONFIG_NAME)
}
pub fn tenant_location_config_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
/// where certain tenant's LocationConf be stored.
pub(crate) fn tenant_location_config_path(
&self,
tenant_shard_id: &TenantShardId,
) -> Utf8PathBuf {
self.tenant_path(tenant_shard_id)
.join(TENANT_LOCATION_CONFIG_NAME)
}

View File

@@ -382,17 +382,6 @@ pub enum DeletionQueueError {
}
impl DeletionQueueClient {
pub(crate) fn broken() -> Self {
// Channels whose receivers are immediately dropped.
let (tx, _rx) = tokio::sync::mpsc::unbounded_channel();
let (executor_tx, _executor_rx) = tokio::sync::mpsc::channel(1);
Self {
tx,
executor_tx,
lsn_table: Arc::default(),
}
}
/// This is cancel-safe. If you drop the future before it completes, the message
/// is not pushed, although in the context of the deletion queue it doesn't matter: once
/// we decide to do a deletion the decision is always final.

View File

@@ -887,7 +887,9 @@ async fn tenant_list_handler(
state: state.clone(),
current_physical_size: None,
attachment_status: state.attachment_status(),
generation: (*gen).into(),
generation: (*gen)
.into()
.expect("Tenants are always attached with a generation"),
})
.collect::<Vec<TenantInfo>>();
@@ -935,7 +937,10 @@ async fn tenant_status(
state: state.clone(),
current_physical_size: Some(current_physical_size),
attachment_status: state.attachment_status(),
generation: tenant.generation().into(),
generation: tenant
.generation()
.into()
.expect("Tenants are always attached with a generation"),
},
walredo: tenant.wal_redo_manager_status(),
timelines: tenant.list_timeline_ids(),

View File

@@ -113,11 +113,7 @@ pub async fn shutdown_pageserver(
}
/// Per-tenant configuration file.
/// Full path: `tenants/<tenant_id>/config`.
pub(crate) const TENANT_CONFIG_NAME: &str = "config";
/// Per-tenant configuration file.
/// Full path: `tenants/<tenant_id>/config`.
/// Full path: `tenants/<tenant_id>/config-v1`.
pub(crate) const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";
/// Per-tenant copy of their remote heatmap, downloaded into the local

View File

@@ -648,7 +648,7 @@ impl Tenant {
init_order: Option<InitializationOrder>,
mode: SpawnMode,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Tenant>> {
) -> Arc<Tenant> {
let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
conf,
tenant_shard_id,
@@ -856,7 +856,7 @@ impl Tenant {
}
.instrument(tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation)),
);
Ok(tenant)
tenant
}
#[instrument(skip_all)]
@@ -1147,30 +1147,6 @@ impl Tenant {
.await
}
/// Create a placeholder Tenant object for a broken tenant
pub fn create_broken_tenant(
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
remote_storage: GenericRemoteStorage,
reason: String,
) -> Arc<Tenant> {
Arc::new(Tenant::new(
TenantState::Broken {
reason,
backtrace: String::new(),
},
conf,
AttachedTenantConf::try_from(LocationConf::default()).unwrap(),
// Shard identity isn't meaningful for a broken tenant: it's just a placeholder
// to occupy the slot for this TenantShardId.
ShardIdentity::broken(tenant_shard_id.shard_number, tenant_shard_id.shard_count),
None,
tenant_shard_id,
remote_storage,
DeletionQueueClient::broken(),
))
}
async fn load_timeline_metadata(
self: &Arc<Tenant>,
timeline_ids: HashSet<TimelineId>,
@@ -2494,6 +2470,10 @@ impl Tenant {
remote_storage: GenericRemoteStorage,
deletion_queue_client: DeletionQueueClient,
) -> Tenant {
debug_assert!(
!attached_conf.location.generation.is_none() || conf.control_plane_api.is_none()
);
let (state, mut rx) = watch::channel(state);
tokio::spawn(async move {
@@ -2584,45 +2564,22 @@ impl Tenant {
conf: &'static PageServerConf,
tenant_shard_id: &TenantShardId,
) -> anyhow::Result<LocationConf> {
let legacy_config_path = conf.tenant_config_path(tenant_shard_id);
let config_path = conf.tenant_location_config_path(tenant_shard_id);
if config_path.exists() {
// New-style config takes precedence
let deserialized = Self::read_config(&config_path)?;
Ok(toml_edit::de::from_document::<LocationConf>(deserialized)?)
} else if legacy_config_path.exists() {
// Upgrade path: found an old-style configuration only
let deserialized = Self::read_config(&legacy_config_path)?;
let mut tenant_conf = TenantConfOpt::default();
for (key, item) in deserialized.iter() {
match key {
"tenant_config" => {
tenant_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("Failed to parse config from file '{legacy_config_path}' as pageserver config"))?;
}
_ => bail!(
"config file {legacy_config_path} has unrecognized pageserver option '{key}'"
),
}
}
// Legacy configs are implicitly in attached state, and do not support sharding
Ok(LocationConf::attached_single(
tenant_conf,
Generation::none(),
&models::ShardParameters::default(),
))
} else {
// FIXME If the config file is not found, assume that we're attaching
// a detached tenant and config is passed via attach command.
// https://github.com/neondatabase/neon/issues/1555
// OR: we're loading after incomplete deletion that managed to remove config.
info!(
"tenant config not found in {} or {}",
config_path, legacy_config_path
);
Ok(LocationConf::default())
// The config should almost always exist for a tenant directory:
// - When attaching a tenant, the config is the first thing we write
// - When detaching a tenant, we atomically move the directory to a tmp location
// before deleting contents.
//
// The very rare edge case that can result in a missing config is if we crash during attach
// between creating directory and writing config. Callers should handle that as if the
// directory didn't exist.
anyhow::bail!("tenant config not found in {}", config_path);
}
}
@@ -2644,47 +2601,17 @@ impl Tenant {
tenant_shard_id: &TenantShardId,
location_conf: &LocationConf,
) -> anyhow::Result<()> {
let legacy_config_path = conf.tenant_config_path(tenant_shard_id);
let config_path = conf.tenant_location_config_path(tenant_shard_id);
Self::persist_tenant_config_at(
tenant_shard_id,
&config_path,
&legacy_config_path,
location_conf,
)
.await
Self::persist_tenant_config_at(tenant_shard_id, &config_path, location_conf).await
}
#[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
pub(super) async fn persist_tenant_config_at(
tenant_shard_id: &TenantShardId,
config_path: &Utf8Path,
legacy_config_path: &Utf8Path,
location_conf: &LocationConf,
) -> anyhow::Result<()> {
if let LocationMode::Attached(attach_conf) = &location_conf.mode {
// The modern-style LocationConf config file requires a generation to be set. In case someone
// is running a pageserver without the infrastructure to set generations, write out the legacy-style
// config file that only contains TenantConf.
//
// This will eventually be removed in https://github.com/neondatabase/neon/issues/5388
if attach_conf.generation.is_none() {
tracing::info!(
"Running without generations, writing legacy-style tenant config file"
);
Self::persist_tenant_config_legacy(
tenant_shard_id,
legacy_config_path,
&location_conf.tenant_conf,
)
.await?;
return Ok(());
}
}
debug!("persisting tenantconf to {config_path}");
let mut conf_content = r#"# This file contains a specific per-tenant's config.
@@ -2711,37 +2638,6 @@ impl Tenant {
Ok(())
}
#[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
async fn persist_tenant_config_legacy(
tenant_shard_id: &TenantShardId,
target_config_path: &Utf8Path,
tenant_conf: &TenantConfOpt,
) -> anyhow::Result<()> {
debug!("persisting tenantconf to {target_config_path}");
let mut conf_content = r#"# This file contains a specific per-tenant's config.
# It is read in case of pageserver restart.
[tenant_config]
"#
.to_string();
// Convert the config to a toml file.
conf_content += &toml_edit::ser::to_string(&tenant_conf)?;
let temp_path = path_with_suffix_extension(target_config_path, TEMP_FILE_SUFFIX);
let tenant_shard_id = *tenant_shard_id;
let target_config_path = target_config_path.to_owned();
let conf_content = conf_content.into_bytes();
VirtualFile::crashsafe_overwrite(target_config_path.clone(), temp_path, conf_content)
.await
.with_context(|| {
format!("write tenant {tenant_shard_id} config to {target_config_path}")
})?;
Ok(())
}
//
// How garbage collection works:
//

View File

@@ -281,22 +281,6 @@ impl LocationConf {
}
}
impl Default for LocationConf {
// TODO: this should be removed once tenant loading can guarantee that we are never
// loading from a directory without a configuration.
// => tech debt since https://github.com/neondatabase/neon/issues/1555
fn default() -> Self {
Self {
mode: LocationMode::Attached(AttachedLocationConfig {
generation: Generation::none(),
attach_mode: AttachmentMode::Single,
}),
tenant_conf: TenantConfOpt::default(),
shard: ShardIdentity::unsharded(),
}
}
}
/// A tenant's calcuated configuration, which is the result of merging a
/// tenant's TenantConfOpt with the global TenantConf from PageServerConf.
///

View File

@@ -495,17 +495,8 @@ pub async fn init_tenant_mgr(
let mut location_conf = match location_conf {
Ok(l) => l,
Err(e) => {
warn!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Marking tenant broken, failed to {e:#}");
tenants.insert(
tenant_shard_id,
TenantSlot::Attached(Tenant::create_broken_tenant(
conf,
tenant_shard_id,
resources.remote_storage.clone(),
format!("{}", e),
)),
);
// This should only happen in the case of a serialization bug or critical local I/O error: we cannot load this tenant
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to load tenant config, failed to {e:#}");
continue;
}
};
@@ -687,8 +678,7 @@ fn tenant_spawn(
"Cannot load tenant from empty directory {tenant_path:?}"
);
let remote_storage = resources.remote_storage.clone();
let tenant = match Tenant::spawn(
let tenant = Tenant::spawn(
conf,
tenant_shard_id,
resources,
@@ -697,13 +687,7 @@ fn tenant_spawn(
init_order,
mode,
ctx,
) {
Ok(tenant) => tenant,
Err(e) => {
error!("Failed to spawn tenant {tenant_shard_id}, reason: {e:#}");
Tenant::create_broken_tenant(conf, tenant_shard_id, remote_storage, format!("{e:#}"))
}
};
);
Ok(tenant)
}

View File

@@ -367,10 +367,9 @@ async fn upload_tenant_heatmap(
debug_assert_current_span_has_tenant_id();
let generation = tenant.get_generation();
debug_assert!(!generation.is_none());
if generation.is_none() {
// We do not expect this: generations were implemented before heatmap uploads. However,
// handle it so that we don't have to make the generation in the heatmap an Option<>
// (Generation::none is not serializable)
// We do not expect this: None generations should only appear in historic layer metadata, not in running Tenants
tracing::warn!("Skipping heatmap upload for tenant with generation==None");
return Ok(UploadHeatmapOutcome::Skipped);
}

View File

@@ -93,16 +93,12 @@ pub(crate) struct Layer(Arc<LayerInner>);
impl std::fmt::Display for Layer {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
if matches!(self.0.generation, Generation::Broken) {
write!(f, "{}-broken", self.layer_desc().short_id())
} else {
write!(
f,
"{}{}",
self.layer_desc().short_id(),
self.0.generation.get_suffix()
)
}
write!(
f,
"{}{}",
self.layer_desc().short_id(),
self.0.generation.get_suffix()
)
}
}