mirror of https://github.com/neondatabase/neon.git
synced 2026-01-29 08:10:38 +00:00

Compare commits
33 Commits
proxy-role...problame/f
| Author | SHA1 | Date |
|---|---|---|
| | cabf452fa7 | |
| | 7c9f4c270e | |
| | 2404106586 | |
| | b45c1b5965 | |
| | 82e97e0c59 | |
| | 00d9bf5b61 | |
| | 71f495c7f7 | |
| | 0a7e050144 | |
| | 55bfa91bd7 | |
| | d90b2b99df | |
| | 27587e155d | |
| | 55aede2762 | |
| | 9f186b4d3e | |
| | 585687d563 | |
| | 65a98e425d | |
| | b2e7249979 | |
| | 844303255a | |
| | 6d8df2579b | |
| | 3c3b53f8ad | |
| | 30064eb197 | |
| | 869acfe29b | |
| | 11a91eaf7b | |
| | 394ef013d0 | |
| | a718287902 | |
| | 2eac1adcb9 | |
| | 3f90b2d337 | |
| | a40ed86d87 | |
| | 1bf8bb88c5 | |
| | f1901833a6 | |
| | b41ee81308 | |
| | 205b6111e6 | |
| | 93572a3e99 | |
| | 15c0df4de7 | |
@@ -700,13 +700,14 @@ impl ComputeNode {
         // In this case we need to connect with old `zenith_admin` name
         // and create new user. We cannot simply rename connected user,
         // but we can create a new one and grant it all privileges.
-        let mut client = match Client::connect(self.connstr.as_str(), NoTls) {
+        let connstr = self.connstr.clone();
+        let mut client = match Client::connect(connstr.as_str(), NoTls) {
             Err(e) => {
                 info!(
                     "cannot connect to postgres: {}, retrying with `zenith_admin` username",
                     e
                 );
-                let mut zenith_admin_connstr = self.connstr.clone();
+                let mut zenith_admin_connstr = connstr.clone();

                 zenith_admin_connstr
                     .set_username("zenith_admin")

@@ -719,8 +720,8 @@ impl ComputeNode {
                 client.simple_query("GRANT zenith_admin TO cloud_admin")?;
                 drop(client);

-                // reconnect with connsting with expected name
-                Client::connect(self.connstr.as_str(), NoTls)?
+                // reconnect with connstring with expected name
+                Client::connect(connstr.as_str(), NoTls)?
             }
             Ok(client) => client,
         };

@@ -734,8 +735,8 @@ impl ComputeNode {
         cleanup_instance(&mut client)?;
         handle_roles(spec, &mut client)?;
         handle_databases(spec, &mut client)?;
-        handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
-        handle_grants(spec, &mut client, self.connstr.as_str())?;
+        handle_role_deletions(spec, connstr.as_str(), &mut client)?;
+        handle_grants(spec, &mut client, connstr.as_str())?;
         handle_extensions(spec, &mut client)?;
         handle_extension_neon(&mut client)?;
         create_availability_check_data(&mut client)?;

@@ -743,6 +744,12 @@ impl ComputeNode {
         // 'Close' connection
         drop(client);

+        if self.has_feature(ComputeFeature::Migrations) {
+            thread::spawn(move || {
+                let mut client = Client::connect(connstr.as_str(), NoTls)?;
+                handle_migrations(&mut client)
+            });
+        }
         Ok(())
     }

@@ -807,6 +814,10 @@ impl ComputeNode {
             handle_grants(&spec, &mut client, self.connstr.as_str())?;
             handle_extensions(&spec, &mut client)?;
             handle_extension_neon(&mut client)?;
+            // We can skip handle_migrations here because a new migration can only appear
+            // if we have a new version of the compute_ctl binary, which can only happen
+            // if compute got restarted, in which case we'll end up inside of apply_config
+            // instead of reconfigure.
         }

        // 'Close' connection
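The `connstr` local introduced above is what lets the new migrations branch hand an owned connection string to a spawned thread at the end of the function; `self.connstr` borrowed through `&self` could not be moved into the closure. A minimal standalone sketch of that ownership pattern (hypothetical `Node` type, not the actual compute_ctl code):

```rust
use std::thread;

// Sketch only: an owned clone can be moved into a spawned thread, while a
// field reached through `&self` cannot outlive the borrow.
struct Node {
    connstr: String,
}

impl Node {
    fn apply_config(&self) {
        // Clone once up front; every later use borrows this local...
        let connstr = self.connstr.clone();
        println!("configuring via {connstr}");

        // ...so the final consumer can take ownership without tying the
        // thread's lifetime to `&self`.
        let handle = thread::spawn(move || {
            println!("running migrations via {connstr}");
        });
        handle.join().unwrap();
    }
}

fn main() {
    let node = Node {
        connstr: "postgres://cloud_admin@localhost:5432/postgres".into(),
    };
    node.apply_config();
}
```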
@@ -727,3 +727,79 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {

     Ok(())
 }
+
+#[instrument(skip_all)]
+pub fn handle_migrations(client: &mut Client) -> Result<()> {
+    info!("handle migrations");
+
+    // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+    // !BE SURE TO ONLY ADD MIGRATIONS TO THE END OF THIS ARRAY. IF YOU DO NOT, VERY VERY BAD THINGS MAY HAPPEN!
+    // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+    let migrations = [
+        "ALTER ROLE neon_superuser BYPASSRLS",
+        r#"
+        DO $$
+        DECLARE
+            role_name text;
+        BEGIN
+            FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member')
+            LOOP
+                RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name);
+                EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
+            END LOOP;
+
+            FOR role_name IN SELECT rolname FROM pg_roles
+            WHERE
+                NOT pg_has_role(rolname, 'neon_superuser', 'member') AND NOT starts_with(rolname, 'pg_')
+            LOOP
+                RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name);
+                EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';
+            END LOOP;
+        END $$;
+        "#,
+    ];
+
+    let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
+    client.simple_query(query)?;
+
+    query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
+    client.simple_query(query)?;
+
+    query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
+    client.simple_query(query)?;
+
+    query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
+    client.simple_query(query)?;
+
+    query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
+    client.simple_query(query)?;
+
+    query = "SELECT id FROM neon_migration.migration_id";
+    let row = client.query_one(query, &[])?;
+    let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
+    let starting_migration_id = current_migration;
+
+    query = "BEGIN";
+    client.simple_query(query)?;
+
+    while current_migration < migrations.len() {
+        info!("Running migration:\n{}\n", migrations[current_migration]);
+        client.simple_query(migrations[current_migration])?;
+        current_migration += 1;
+    }
+    let setval = format!(
+        "UPDATE neon_migration.migration_id SET id={}",
+        migrations.len()
+    );
+    client.simple_query(&setval)?;
+
+    query = "COMMIT";
+    client.simple_query(query)?;
+
+    info!(
+        "Ran {} migrations",
+        (migrations.len() - starting_migration_id)
+    );
+    Ok(())
+}
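The migration runner above keeps its cursor in `neon_migration.migration_id`: the stored id equals the number of migrations already applied, so a re-run executes only the tail of the array. A standalone sketch of that bookkeeping (plain strings stand in for the `simple_query` calls; this is an illustration, not the real code) shows why entries must only ever be appended:

```rust
// The stored id counts applied migrations; only the tail past it runs.
fn run_pending(migrations: &[&str], stored_id: &mut usize) -> Vec<String> {
    let mut applied = Vec::new();
    let mut current = *stored_id;
    while current < migrations.len() {
        applied.push(migrations[current].to_string()); // stands in for simple_query()
        current += 1;
    }
    *stored_id = migrations.len(); // stands in for UPDATE neon_migration.migration_id
    applied
}

fn main() {
    let mut stored_id = 0;
    let v1 = ["m0"];
    assert_eq!(run_pending(&v1, &mut stored_id), vec!["m0"]);

    // A new release appends "m1"; only the tail runs on the next start.
    let v2 = ["m0", "m1"];
    assert_eq!(run_pending(&v2, &mut stored_id), vec!["m1"]);

    // Inserting in the middle instead would shift the indices, replaying or
    // skipping migrations -- the failure mode the warning comment is about.
}
```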
@@ -184,7 +184,7 @@ impl Persistence {
     pub(crate) async fn increment_generation(
         &self,
         tenant_shard_id: TenantShardId,
-        node_id: Option<NodeId>,
+        node_id: NodeId,
     ) -> anyhow::Result<Generation> {
         let (write, gen) = {
             let mut locked = self.state.lock().unwrap();

@@ -192,14 +192,9 @@ impl Persistence {
                 anyhow::bail!("Tried to increment generation of unknown shard");
             };

-            // If we're called with a None pageserver, we need only update the generation
-            // record to disassociate it with this pageserver, not actually increment the number, as
-            // the increment is guaranteed to happen the next time this tenant is attached.
-            if node_id.is_some() {
-                shard.generation += 1;
-            }
+            shard.generation += 1;
+            shard.generation_pageserver = Some(node_id);

-            shard.generation_pageserver = node_id;
             let gen = Generation::new(shard.generation);
             (locked.save(), gen)
         };

@@ -208,6 +203,19 @@ impl Persistence {
         Ok(gen)
     }

+    pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
+        let write = {
+            let mut locked = self.state.lock().unwrap();
+            let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else {
+                anyhow::bail!("Tried to increment generation of unknown shard");
+            };
+            shard.generation_pageserver = None;
+            locked.save()
+        };
+        write.commit().await?;
+        Ok(())
+    }
+
     pub(crate) async fn re_attach(
         &self,
         node_id: NodeId,
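With this change, `increment_generation` always increments and always binds the shard to a pageserver, while the former `None` case becomes an explicit `detach` that leaves the counter alone. A simplified sketch of the resulting call-site contract (hypothetical, flattened types; the real code is async and persisted):

```rust
// Sketch of the API split: the Option handling lives only at the boundary.
#[derive(Debug, PartialEq)]
struct Generation(u32);

struct Persistence {
    generation: u32,
    generation_pageserver: Option<u64>,
}

impl Persistence {
    fn increment_generation(&mut self, node_id: u64) -> Generation {
        self.generation += 1;
        self.generation_pageserver = Some(node_id);
        Generation(self.generation)
    }

    fn detach(&mut self) {
        // No increment: that is guaranteed to happen on the next attach.
        self.generation_pageserver = None;
    }
}

fn attach_hook(p: &mut Persistence, node_id: Option<u64>) -> Option<Generation> {
    match node_id {
        Some(n) => Some(p.increment_generation(n)),
        None => {
            p.detach();
            None
        }
    }
}

fn main() {
    let mut p = Persistence { generation: 0, generation_pageserver: None };
    assert_eq!(attach_hook(&mut p, Some(1)), Some(Generation(1)));
    assert_eq!(attach_hook(&mut p, None), None);
    assert_eq!(p.generation, 1); // detach left the counter alone
    assert_eq!(p.generation_pageserver, None);
}
```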
@@ -296,7 +296,7 @@ impl Reconciler {
         // Increment generation before attaching to new pageserver
         self.generation = self
             .persistence
-            .increment_generation(self.tenant_shard_id, Some(dest_ps_id))
+            .increment_generation(self.tenant_shard_id, dest_ps_id)
             .await?;

         let dest_conf = build_location_config(

@@ -395,7 +395,7 @@ impl Reconciler {
         // as locations with unknown (None) observed state.
         self.generation = self
             .persistence
-            .increment_generation(self.tenant_shard_id, Some(node_id))
+            .increment_generation(self.tenant_shard_id, node_id)
             .await?;
         wanted_conf.generation = self.generation.into();
         tracing::info!("Observed configuration requires update.");
@@ -362,13 +362,14 @@ impl Service {
             );
         }

-        let new_generation = if attach_req.node_id.is_some() {
+        let new_generation = if let Some(req_node_id) = attach_req.node_id {
             Some(
                 self.persistence
-                    .increment_generation(attach_req.tenant_shard_id, attach_req.node_id)
+                    .increment_generation(attach_req.tenant_shard_id, req_node_id)
                     .await?,
             )
         } else {
+            self.persistence.detach(attach_req.tenant_shard_id).await?;
             None
         };

@@ -407,6 +408,7 @@ impl Service {
             "attach_hook: tenant {} set generation {:?}, pageserver {}",
             attach_req.tenant_shard_id,
             tenant_state.generation,
+            // TODO: this is an odd number of 0xf's
             attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
         );

@@ -57,7 +57,7 @@ use crate::local_env::LocalEnv;
 use crate::postgresql_conf::PostgresConf;

 use compute_api::responses::{ComputeState, ComputeStatus};
-use compute_api::spec::{Cluster, ComputeMode, ComputeSpec};
+use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec};

 // contents of a endpoint.json file
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]

@@ -70,6 +70,7 @@ pub struct EndpointConf {
     http_port: u16,
     pg_version: u32,
     skip_pg_catalog_updates: bool,
+    features: Vec<ComputeFeature>,
 }

 //

@@ -140,6 +141,7 @@ impl ComputeControlPlane {
             // with this we basically test a case of waking up an idle compute, where
             // we also skip catalog updates in the cloud.
             skip_pg_catalog_updates: true,
+            features: vec![],
         });

         ep.create_endpoint_dir()?;

@@ -154,6 +156,7 @@ impl ComputeControlPlane {
                 pg_port,
                 pg_version,
                 skip_pg_catalog_updates: true,
+                features: vec![],
             })?,
         )?;
         std::fs::write(

@@ -215,6 +218,9 @@ pub struct Endpoint {

     // Optimizations
     skip_pg_catalog_updates: bool,
+
+    // Feature flags
+    features: Vec<ComputeFeature>,
 }

 impl Endpoint {

@@ -244,6 +250,7 @@ impl Endpoint {
             tenant_id: conf.tenant_id,
             pg_version: conf.pg_version,
             skip_pg_catalog_updates: conf.skip_pg_catalog_updates,
+            features: conf.features,
         })
     }

@@ -519,7 +526,7 @@ impl Endpoint {
             skip_pg_catalog_updates: self.skip_pg_catalog_updates,
             format_version: 1.0,
             operation_uuid: None,
-            features: vec![],
+            features: self.features.clone(),
             cluster: Cluster {
                 cluster_id: None, // project ID: not used
                 name: None,       // project name: not used
@@ -90,6 +90,9 @@ pub enum ComputeFeature {
     /// track short-lived connections as user activity.
     ActivityMonitorExperimental,

+    /// Enable running migrations
+    Migrations,
+
     /// This is a special feature flag that is used to represent unknown feature flags.
     /// Basically all unknown to enum flags are represented as this one. See unit test
     /// `parse_unknown_features()` for more details.
@@ -1,9 +1,11 @@
 use anyhow::{bail, Result};
 use byteorder::{ByteOrder, BE};
+use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
+use postgres_ffi::{Oid, TransactionId};
 use serde::{Deserialize, Serialize};
-use std::fmt;
+use std::{fmt, ops::Range};

-use crate::reltag::{BlockNumber, RelTag};
+use crate::reltag::{BlockNumber, RelTag, SlruKind};

 /// Key used in the Repository kv-store.
 ///

@@ -143,12 +145,390 @@ impl Key {
     }
 }

+// Layout of the Key address space
+//
+// The Key struct, used to address the underlying key-value store, consists of
+// 18 bytes, split into six fields. See 'Key' in repository.rs. We need to map
+// all the data and metadata keys into those 18 bytes.
+//
+// Principles for the mapping:
+//
+// - Things that are often accessed or modified together, should be close to
+//   each other in the key space. For example, if a relation is extended by one
+//   block, we create a new key-value pair for the block data, and update the
+//   relation size entry. Because of that, the RelSize key comes after all the
+//   RelBlocks of a relation: the RelSize and the last RelBlock are always next
+//   to each other.
+//
+// The key space is divided into four major sections, identified by the first
+// byte, and they form a hierarchy:
+//
+// 00 Relation data and metadata
+//
+//   DbDir    () -> (dbnode, spcnode)
+//   Filenodemap
+//   RelDir   -> relnode forknum
+//     RelBlocks
+//     RelSize
+//
+// 01 SLRUs
+//
+//   SlruDir  kind
+//   SlruSegBlocks segno
+//   SlruSegSize
+//
+// 02 pg_twophase
+//
+// 03 misc
+//   Controlfile
+//   checkpoint
+//   pg_version
+//
+// 04 aux files
+//
+// Below is a full list of the keyspace allocation:
+//
+// DbDir:
+// 00 00000000 00000000 00000000 00 00000000
+//
+// Filenodemap:
+// 00 SPCNODE  DBNODE   00000000 00 00000000
+//
+// RelDir:
+// 00 SPCNODE  DBNODE   00000000 00 00000001 (Postgres never uses relfilenode 0)
+//
+// RelBlock:
+// 00 SPCNODE  DBNODE   RELNODE  FORK BLKNUM
+//
+// RelSize:
+// 00 SPCNODE  DBNODE   RELNODE  FORK FFFFFFFF
+//
+// SlruDir:
+// 01 kind     00000000 00000000 00 00000000
+//
+// SlruSegBlock:
+// 01 kind     00000001 SEGNO    00 BLKNUM
+//
+// SlruSegSize:
+// 01 kind     00000001 SEGNO    00 FFFFFFFF
+//
+// TwoPhaseDir:
+// 02 00000000 00000000 00000000 00 00000000
+//
+// TwoPhaseFile:
+// 02 00000000 00000000 00000000 00 XID
+//
+// ControlFile:
+// 03 00000000 00000000 00000000 00 00000000
+//
+// Checkpoint:
+// 03 00000000 00000000 00000000 00 00000001
+//
+// AuxFiles:
+// 03 00000000 00000000 00000000 00 00000002
+//

+//-- Section 01: relation data and metadata
+
+pub const DBDIR_KEY: Key = Key {
+    field1: 0x00,
+    field2: 0,
+    field3: 0,
+    field4: 0,
+    field5: 0,
+    field6: 0,
+};
+
+#[inline(always)]
+pub fn dbdir_key_range(spcnode: Oid, dbnode: Oid) -> Range<Key> {
+    Key {
+        field1: 0x00,
+        field2: spcnode,
+        field3: dbnode,
+        field4: 0,
+        field5: 0,
+        field6: 0,
+    }..Key {
+        field1: 0x00,
+        field2: spcnode,
+        field3: dbnode,
+        field4: 0xffffffff,
+        field5: 0xff,
+        field6: 0xffffffff,
+    }
+}
+
+#[inline(always)]
+pub fn relmap_file_key(spcnode: Oid, dbnode: Oid) -> Key {
+    Key {
+        field1: 0x00,
+        field2: spcnode,
+        field3: dbnode,
+        field4: 0,
+        field5: 0,
+        field6: 0,
+    }
+}
+
+#[inline(always)]
+pub fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
+    Key {
+        field1: 0x00,
+        field2: spcnode,
+        field3: dbnode,
+        field4: 0,
+        field5: 0,
+        field6: 1,
+    }
+}
+
+#[inline(always)]
+pub fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
+    Key {
+        field1: 0x00,
+        field2: rel.spcnode,
+        field3: rel.dbnode,
+        field4: rel.relnode,
+        field5: rel.forknum,
+        field6: blknum,
+    }
+}
+
+#[inline(always)]
+pub fn rel_size_to_key(rel: RelTag) -> Key {
+    Key {
+        field1: 0x00,
+        field2: rel.spcnode,
+        field3: rel.dbnode,
+        field4: rel.relnode,
+        field5: rel.forknum,
+        field6: 0xffffffff,
+    }
+}
+
+#[inline(always)]
+pub fn rel_key_range(rel: RelTag) -> Range<Key> {
+    Key {
+        field1: 0x00,
+        field2: rel.spcnode,
+        field3: rel.dbnode,
+        field4: rel.relnode,
+        field5: rel.forknum,
+        field6: 0,
+    }..Key {
+        field1: 0x00,
+        field2: rel.spcnode,
+        field3: rel.dbnode,
+        field4: rel.relnode,
+        field5: rel.forknum + 1,
+        field6: 0,
+    }
+}
+
+//-- Section 02: SLRUs
+
+#[inline(always)]
+pub fn slru_dir_to_key(kind: SlruKind) -> Key {
+    Key {
+        field1: 0x01,
+        field2: match kind {
+            SlruKind::Clog => 0x00,
+            SlruKind::MultiXactMembers => 0x01,
+            SlruKind::MultiXactOffsets => 0x02,
+        },
+        field3: 0,
+        field4: 0,
+        field5: 0,
+        field6: 0,
+    }
+}
+
+#[inline(always)]
+pub fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key {
+    Key {
+        field1: 0x01,
+        field2: match kind {
+            SlruKind::Clog => 0x00,
+            SlruKind::MultiXactMembers => 0x01,
+            SlruKind::MultiXactOffsets => 0x02,
+        },
+        field3: 1,
+        field4: segno,
+        field5: 0,
+        field6: blknum,
+    }
+}
+
+#[inline(always)]
+pub fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key {
+    Key {
+        field1: 0x01,
+        field2: match kind {
+            SlruKind::Clog => 0x00,
+            SlruKind::MultiXactMembers => 0x01,
+            SlruKind::MultiXactOffsets => 0x02,
+        },
+        field3: 1,
+        field4: segno,
+        field5: 0,
+        field6: 0xffffffff,
+    }
+}
+
+#[inline(always)]
+pub fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range<Key> {
+    let field2 = match kind {
+        SlruKind::Clog => 0x00,
+        SlruKind::MultiXactMembers => 0x01,
+        SlruKind::MultiXactOffsets => 0x02,
+    };
+
+    Key {
+        field1: 0x01,
+        field2,
+        field3: 1,
+        field4: segno,
+        field5: 0,
+        field6: 0,
+    }..Key {
+        field1: 0x01,
+        field2,
+        field3: 1,
+        field4: segno,
+        field5: 1,
+        field6: 0,
+    }
+}
+
+//-- Section 03: pg_twophase
+
+pub const TWOPHASEDIR_KEY: Key = Key {
+    field1: 0x02,
+    field2: 0,
+    field3: 0,
+    field4: 0,
+    field5: 0,
+    field6: 0,
+};
+
+#[inline(always)]
+pub fn twophase_file_key(xid: TransactionId) -> Key {
+    Key {
+        field1: 0x02,
+        field2: 0,
+        field3: 0,
+        field4: 0,
+        field5: 0,
+        field6: xid,
+    }
+}
+
+#[inline(always)]
+pub fn twophase_key_range(xid: TransactionId) -> Range<Key> {
+    let (next_xid, overflowed) = xid.overflowing_add(1);
+
+    Key {
+        field1: 0x02,
+        field2: 0,
+        field3: 0,
+        field4: 0,
+        field5: 0,
+        field6: xid,
+    }..Key {
+        field1: 0x02,
+        field2: 0,
+        field3: 0,
+        field4: 0,
+        field5: u8::from(overflowed),
+        field6: next_xid,
+    }
+}
+
+//-- Section 03: Control file
+pub const CONTROLFILE_KEY: Key = Key {
+    field1: 0x03,
+    field2: 0,
+    field3: 0,
+    field4: 0,
+    field5: 0,
+    field6: 0,
+};
+
+pub const CHECKPOINT_KEY: Key = Key {
+    field1: 0x03,
+    field2: 0,
+    field3: 0,
+    field4: 0,
+    field5: 0,
+    field6: 1,
+};
+
+pub const AUX_FILES_KEY: Key = Key {
+    field1: 0x03,
+    field2: 0,
+    field3: 0,
+    field4: 0,
+    field5: 0,
+    field6: 2,
+};
+
+// Reverse mappings for a few Keys.
+// These are needed by WAL redo manager.
+
+// AUX_FILES currently stores only data for logical replication (slots etc), and
+// we don't preserve these on a branch because safekeepers can't follow timeline
+// switch (and generally it likely should be optional), so ignore these.
+#[inline(always)]
+pub fn is_inherited_key(key: Key) -> bool {
+    key != AUX_FILES_KEY
+}
+
+#[inline(always)]
+pub fn is_rel_fsm_block_key(key: Key) -> bool {
+    key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
+}
+
+#[inline(always)]
+pub fn is_rel_vm_block_key(key: Key) -> bool {
+    key.field1 == 0x00
+        && key.field4 != 0
+        && key.field5 == VISIBILITYMAP_FORKNUM
+        && key.field6 != 0xffffffff
+}
+
+#[inline(always)]
+pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
+    Ok(match key.field1 {
+        0x01 => {
+            let kind = match key.field2 {
+                0x00 => SlruKind::Clog,
+                0x01 => SlruKind::MultiXactMembers,
+                0x02 => SlruKind::MultiXactOffsets,
+                _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2),
+            };
+            let segno = key.field4;
+            let blknum = key.field6;
+
+            (kind, segno, blknum)
+        }
+        _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
+    })
+}
+
+#[inline(always)]
+pub fn is_slru_block_key(key: Key) -> bool {
+    key.field1 == 0x01                // SLRU-related
+        && key.field3 == 0x00000001   // but not SlruDir
+        && key.field6 != 0xffffffff   // and not SlruSegSize
+}
+
 #[inline(always)]
 pub fn is_rel_block_key(key: &Key) -> bool {
     key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff
 }

 /// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`.
 #[inline(always)]
 pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
     Ok(match key.field1 {
         0x00 => (
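A worked example of the RelBlock mapping described above, as a standalone sketch with the same six fields (1 + 4 + 4 + 4 + 1 + 4 = 18 bytes). The size entry for a relation differs from its blocks only in `field6`, so it sorts immediately after the last block — the locality property the comment calls out:

```rust
// Simplified stand-in for the Key struct; field widths follow the layout table.
struct Key {
    field1: u8,  // section: 0x00 = relation data and metadata
    field2: u32, // spcnode
    field3: u32, // dbnode
    field4: u32, // relnode
    field5: u8,  // forknum
    field6: u32, // blknum, or 0xffffffff for the RelSize entry
}

fn rel_block_to_key(spcnode: u32, dbnode: u32, relnode: u32, forknum: u8, blknum: u32) -> Key {
    Key { field1: 0x00, field2: spcnode, field3: dbnode, field4: relnode, field5: forknum, field6: blknum }
}

fn main() {
    // Hypothetical OIDs: block 7 of the main fork of relfilenode 1259.
    let block = rel_block_to_key(1663, 13010, 1259, 0, 7);
    // The relation-size key shares every field except field6 = 0xffffffff,
    // so it lands right after the relation's highest block number.
    let size = rel_block_to_key(1663, 13010, 1259, 0, 0xffffffff);
    assert!(block.field6 < size.field6);
    println!(
        "section 0x{:02x}, rel {}, block {}",
        block.field1, block.field4, block.field6
    );
}
```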
@@ -61,7 +61,7 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::import_datadir::import_wal_from_tar;
 use crate::metrics;
 use crate::metrics::LIVE_CONNECTIONS_COUNT;
-use crate::pgdatadir_mapping::{rel_block_to_key, Version};
+use crate::pgdatadir_mapping::Version;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;

@@ -75,6 +75,7 @@ use crate::tenant::PageReconstructError;
 use crate::tenant::Timeline;
 use crate::trace::Tracer;

+use pageserver_api::key::rel_block_to_key;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;

@@ -386,12 +387,18 @@ impl PageServerHandler {

     /// Future that completes when we need to shut down the connection.
     ///
-    /// Reasons for need to shut down are:
-    /// - any of the timelines we hold GateGuards for in `shard_timelines` is cancelled
-    /// - task_mgr requests shutdown of the connection
+    /// We currently need to shut down when any of the following happens:
+    /// 1. any of the timelines we hold GateGuards for in `shard_timelines` is cancelled
+    /// 2. task_mgr requests shutdown of the connection
     ///
+    /// The need to check for `task_mgr` cancellation arises mainly from `handle_pagerequests`
+    /// where, at first, `shard_timelines` is empty, see <https://github.com/neondatabase/neon/pull/6388>
+    /// NB on (1): the connection's lifecycle is not actually tied to any of the
+    /// `shard_timelines`s' lifecycles. But it's _necessary_ in the current
+    /// implementation to be responsive to timeline cancellation because
+    /// the connection holds their `GateGuards` open (stored in `shard_timelines`).
+    /// We currently do the easy thing and terminate the connection if any of the
+    /// shard_timelines gets cancelled. But really, we could spend more effort
+    /// and simply remove the cancelled timeline from the `shard_timelines`, thereby
+    /// dropping the guard.
     ///
     /// NB: keep in sync with [`Self::is_connection_cancelled`]
     async fn await_connection_cancelled(&self) {

@@ -404,16 +411,17 @@ impl PageServerHandler {
         // immutable &self). So it's fine to evaluate shard_timelines after the sleep, we don't risk
         // missing any inserts to the map.

-        let mut futs = self
-            .shard_timelines
-            .values()
-            .map(|ht| ht.timeline.cancel.cancelled())
-            .collect::<FuturesUnordered<_>>();
-
-        tokio::select! {
-            _ = task_mgr::shutdown_watcher() => { }
-            _ = futs.next() => {}
-        }
+        let mut cancellation_sources = Vec::with_capacity(1 + self.shard_timelines.len());
+        use futures::future::Either;
+        cancellation_sources.push(Either::Left(task_mgr::shutdown_watcher()));
+        cancellation_sources.extend(
+            self.shard_timelines
+                .values()
+                .map(|ht| Either::Right(ht.timeline.cancel.cancelled())),
+        );
+        FuturesUnordered::from_iter(cancellation_sources)
+            .next()
+            .await;
     }

     /// Checking variant of [`Self::await_connection_cancelled`].
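The rewrite above replaces a `tokio::select!` over two differently-shaped sources with a single `FuturesUnordered`: `Either` gives the shutdown watcher and the per-timeline cancellation futures one common type, and the first future to complete wakes the task. A standalone sketch of the same pattern (assuming the `futures` and `tokio` crates; `sleep_ms` stands in for the real cancellation futures):

```rust
use futures::future::Either;
use futures::stream::{FuturesUnordered, StreamExt};

async fn sleep_ms(ms: u64) {
    tokio::time::sleep(std::time::Duration::from_millis(ms)).await;
}

#[tokio::main]
async fn main() {
    // One "shutdown watcher" plus N "timeline cancelled" futures. Either
    // erases the type difference so they can share one collection.
    let mut sources = Vec::with_capacity(1 + 3);
    sources.push(Either::Left(sleep_ms(50)));
    sources.extend((0..3u64).map(|i| Either::Right(sleep_ms(100 + i))));

    // Completes as soon as *any* source fires -- here, the shutdown branch.
    FuturesUnordered::from_iter(sources).next().await;
    println!("connection shutdown requested");
}
```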
@@ -13,7 +13,12 @@ use crate::repository::*;
 use crate::walrecord::NeonWalRecord;
 use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes};
-use pageserver_api::key::is_rel_block_key;
+use pageserver_api::key::{
+    dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
+    rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
+    slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
+    AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
+};
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::BLCKSZ;

@@ -1535,366 +1540,6 @@ struct SlruSegmentDirectory {

 static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);

-// Layout of the Key address space
-//
-// The Key struct, used to address the underlying key-value store, consists of
-// 18 bytes, split into six fields. See 'Key' in repository.rs. We need to map
-// all the data and metadata keys into those 18 bytes.
-//
-// Principles for the mapping:
-//
-// - Things that are often accessed or modified together, should be close to
-//   each other in the key space. For example, if a relation is extended by one
-//   block, we create a new key-value pair for the block data, and update the
-//   relation size entry. Because of that, the RelSize key comes after all the
-//   RelBlocks of a relation: the RelSize and the last RelBlock are always next
-//   to each other.
-//
-// The key space is divided into four major sections, identified by the first
-// byte, and they form a hierarchy:
-//
-// 00 Relation data and metadata
-//
-//   DbDir    () -> (dbnode, spcnode)
-//   Filenodemap
-//   RelDir   -> relnode forknum
-//     RelBlocks
-//     RelSize
-//
-// 01 SLRUs
-//
-//   SlruDir  kind
-//   SlruSegBlocks segno
-//   SlruSegSize
-//
-// 02 pg_twophase
-//
-// 03 misc
-//   Controlfile
-//   checkpoint
-//   pg_version
-//
-// 04 aux files
-//
-// Below is a full list of the keyspace allocation:
-//
-// DbDir:
-// 00 00000000 00000000 00000000 00 00000000
-//
-// Filenodemap:
-// 00 SPCNODE  DBNODE   00000000 00 00000000
-//
-// RelDir:
-// 00 SPCNODE  DBNODE   00000000 00 00000001 (Postgres never uses relfilenode 0)
-//
-// RelBlock:
-// 00 SPCNODE  DBNODE   RELNODE  FORK BLKNUM
-//
-// RelSize:
-// 00 SPCNODE  DBNODE   RELNODE  FORK FFFFFFFF
-//
-// SlruDir:
-// 01 kind     00000000 00000000 00 00000000
-//
-// SlruSegBlock:
-// 01 kind     00000001 SEGNO    00 BLKNUM
-//
-// SlruSegSize:
-// 01 kind     00000001 SEGNO    00 FFFFFFFF
-//
-// TwoPhaseDir:
-// 02 00000000 00000000 00000000 00 00000000
-//
-// TwoPhaseFile:
-// 02 00000000 00000000 00000000 00 XID
-//
-// ControlFile:
-// 03 00000000 00000000 00000000 00 00000000
-//
-// Checkpoint:
-// 03 00000000 00000000 00000000 00 00000001
-//
-// AuxFiles:
-// 03 00000000 00000000 00000000 00 00000002
-//
-
-//-- Section 01: relation data and metadata
-
-const DBDIR_KEY: Key = Key {
-    field1: 0x00,
-    field2: 0,
-    field3: 0,
-    field4: 0,
-    field5: 0,
-    field6: 0,
-};
-
-fn dbdir_key_range(spcnode: Oid, dbnode: Oid) -> Range<Key> {
-    Key {
-        field1: 0x00,
-        field2: spcnode,
-        field3: dbnode,
-        field4: 0,
-        field5: 0,
-        field6: 0,
-    }..Key {
-        field1: 0x00,
-        field2: spcnode,
-        field3: dbnode,
-        field4: 0xffffffff,
-        field5: 0xff,
-        field6: 0xffffffff,
-    }
-}
-
-fn relmap_file_key(spcnode: Oid, dbnode: Oid) -> Key {
-    Key {
-        field1: 0x00,
-        field2: spcnode,
-        field3: dbnode,
-        field4: 0,
-        field5: 0,
-        field6: 0,
-    }
-}
-
-fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
-    Key {
-        field1: 0x00,
-        field2: spcnode,
-        field3: dbnode,
-        field4: 0,
-        field5: 0,
-        field6: 1,
-    }
-}
-
-pub(crate) fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
-    Key {
-        field1: 0x00,
-        field2: rel.spcnode,
-        field3: rel.dbnode,
-        field4: rel.relnode,
-        field5: rel.forknum,
-        field6: blknum,
-    }
-}
-
-fn rel_size_to_key(rel: RelTag) -> Key {
-    Key {
-        field1: 0x00,
-        field2: rel.spcnode,
-        field3: rel.dbnode,
-        field4: rel.relnode,
-        field5: rel.forknum,
-        field6: 0xffffffff,
-    }
-}
-
-fn rel_key_range(rel: RelTag) -> Range<Key> {
-    Key {
-        field1: 0x00,
-        field2: rel.spcnode,
-        field3: rel.dbnode,
-        field4: rel.relnode,
-        field5: rel.forknum,
-        field6: 0,
-    }..Key {
-        field1: 0x00,
-        field2: rel.spcnode,
-        field3: rel.dbnode,
-        field4: rel.relnode,
-        field5: rel.forknum + 1,
-        field6: 0,
-    }
-}
-
-//-- Section 02: SLRUs
-
-fn slru_dir_to_key(kind: SlruKind) -> Key {
-    Key {
-        field1: 0x01,
-        field2: match kind {
-            SlruKind::Clog => 0x00,
-            SlruKind::MultiXactMembers => 0x01,
-            SlruKind::MultiXactOffsets => 0x02,
-        },
-        field3: 0,
-        field4: 0,
-        field5: 0,
-        field6: 0,
-    }
-}
-
-fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key {
-    Key {
-        field1: 0x01,
-        field2: match kind {
-            SlruKind::Clog => 0x00,
-            SlruKind::MultiXactMembers => 0x01,
-            SlruKind::MultiXactOffsets => 0x02,
-        },
-        field3: 1,
-        field4: segno,
-        field5: 0,
-        field6: blknum,
-    }
-}
-
-fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key {
-    Key {
-        field1: 0x01,
-        field2: match kind {
-            SlruKind::Clog => 0x00,
-            SlruKind::MultiXactMembers => 0x01,
-            SlruKind::MultiXactOffsets => 0x02,
-        },
-        field3: 1,
-        field4: segno,
-        field5: 0,
-        field6: 0xffffffff,
-    }
-}
-
-fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range<Key> {
-    let field2 = match kind {
-        SlruKind::Clog => 0x00,
-        SlruKind::MultiXactMembers => 0x01,
-        SlruKind::MultiXactOffsets => 0x02,
-    };
-
-    Key {
-        field1: 0x01,
-        field2,
-        field3: 1,
-        field4: segno,
-        field5: 0,
-        field6: 0,
-    }..Key {
-        field1: 0x01,
-        field2,
-        field3: 1,
-        field4: segno,
-        field5: 1,
-        field6: 0,
-    }
-}
-
-//-- Section 03: pg_twophase
-
-const TWOPHASEDIR_KEY: Key = Key {
-    field1: 0x02,
-    field2: 0,
-    field3: 0,
-    field4: 0,
-    field5: 0,
-    field6: 0,
-};
-
-fn twophase_file_key(xid: TransactionId) -> Key {
-    Key {
-        field1: 0x02,
-        field2: 0,
-        field3: 0,
-        field4: 0,
-        field5: 0,
-        field6: xid,
-    }
-}
-
-fn twophase_key_range(xid: TransactionId) -> Range<Key> {
-    let (next_xid, overflowed) = xid.overflowing_add(1);
-
-    Key {
-        field1: 0x02,
-        field2: 0,
-        field3: 0,
-        field4: 0,
-        field5: 0,
-        field6: xid,
-    }..Key {
-        field1: 0x02,
-        field2: 0,
-        field3: 0,
-        field4: 0,
-        field5: u8::from(overflowed),
-        field6: next_xid,
-    }
-}
-
-//-- Section 03: Control file
-const CONTROLFILE_KEY: Key = Key {
-    field1: 0x03,
-    field2: 0,
-    field3: 0,
-    field4: 0,
-    field5: 0,
-    field6: 0,
-};
-
-const CHECKPOINT_KEY: Key = Key {
-    field1: 0x03,
-    field2: 0,
-    field3: 0,
-    field4: 0,
-    field5: 0,
-    field6: 1,
-};
-
-const AUX_FILES_KEY: Key = Key {
-    field1: 0x03,
-    field2: 0,
-    field3: 0,
-    field4: 0,
-    field5: 0,
-    field6: 2,
-};
-
-// Reverse mappings for a few Keys.
-// These are needed by WAL redo manager.
-
-// AUX_FILES currently stores only data for logical replication (slots etc), and
-// we don't preserve these on a branch because safekeepers can't follow timeline
-// switch (and generally it likely should be optional), so ignore these.
-pub fn is_inherited_key(key: Key) -> bool {
-    key != AUX_FILES_KEY
-}
-
-pub fn is_rel_fsm_block_key(key: Key) -> bool {
-    key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
-}
-
-pub fn is_rel_vm_block_key(key: Key) -> bool {
-    key.field1 == 0x00
-        && key.field4 != 0
-        && key.field5 == VISIBILITYMAP_FORKNUM
-        && key.field6 != 0xffffffff
-}
-
-pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
-    Ok(match key.field1 {
-        0x01 => {
-            let kind = match key.field2 {
-                0x00 => SlruKind::Clog,
-                0x01 => SlruKind::MultiXactMembers,
-                0x02 => SlruKind::MultiXactOffsets,
-                _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2),
-            };
-            let segno = key.field4;
-            let blknum = key.field6;
-
-            (kind, segno, blknum)
-        }
-        _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
-    })
-}
-
-fn is_slru_block_key(key: Key) -> bool {
-    key.field1 == 0x01                // SLRU-related
-        && key.field3 == 0x00000001   // but not SlruDir
-        && key.field6 != 0xffffffff   // and not SlruSegSize
-}
-
 #[allow(clippy::bool_assert_comparison)]
 #[cfg(test)]
 mod tests {
@@ -716,6 +716,10 @@ impl Tenant {
                         // stayed in Activating for such a long time that shutdown found it in
                         // that state.
                         tracing::info!(state=%tenant_clone.current_state(), "Tenant shut down before activation");
+                        // Make the tenant broken so that set_stopping will not hang waiting for it to leave
+                        // the Attaching state. This is an over-reaction (nothing really broke, the tenant is
+                        // just shutting down), but ensures progress.
+                        make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"));
                         return Ok(());
                     },
                 )

@@ -1877,7 +1881,7 @@ impl Tenant {
         &self,
         cancel: &CancellationToken,
         ctx: &RequestContext,
-    ) -> anyhow::Result<(), timeline::CompactionError> {
+    ) -> Result<(), timeline::CompactionError> {
         // Don't start doing work during shutdown, or when broken, we do not need those in the logs
         if !self.is_active() {
             return Ok(());
@@ -619,8 +619,8 @@ impl LayerMap {
     }

     /// Return all L0 delta layers
-    pub fn get_level0_deltas(&self) -> Result<Vec<Arc<PersistentLayerDesc>>> {
-        Ok(self.l0_delta_layers.to_vec())
+    pub fn get_level0_deltas(&self) -> Vec<Arc<PersistentLayerDesc>> {
+        self.l0_delta_layers.to_vec()
     }

     /// debugging function to print out the contents of the layer map
@@ -237,7 +237,7 @@ use utils::id::{TenantId, TimelineId};
 use self::index::IndexPart;

 use super::storage_layer::{Layer, LayerFileName, ResidentLayer};
-use super::upload_queue::SetDeletedFlagProgress;
+use super::upload_queue::{self, SetDeletedFlagProgress};
 use super::Generation;

 pub(crate) use download::{is_temp_download_file, list_remote_timelines};

@@ -621,7 +621,9 @@ impl RemoteTimelineClient {
     ///
     /// Like schedule_index_upload_for_metadata_update(), this merely adds
     /// the upload to the upload queue and returns quickly.
-    pub fn schedule_index_upload_for_file_changes(self: &Arc<Self>) -> anyhow::Result<()> {
+    pub(crate) fn schedule_index_upload_for_file_changes(
+        self: &Arc<Self>,
+    ) -> Result<(), upload_queue::NotInitialized> {
         let mut guard = self.upload_queue.lock().unwrap();
         let upload_queue = guard.initialized_mut()?;

@@ -666,7 +668,7 @@ impl RemoteTimelineClient {
     pub(crate) fn schedule_layer_file_upload(
         self: &Arc<Self>,
         layer: ResidentLayer,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), upload_queue::NotInitialized> {
         let mut guard = self.upload_queue.lock().unwrap();
         let upload_queue = guard.initialized_mut()?;

@@ -875,7 +877,7 @@ impl RemoteTimelineClient {
         self: &Arc<Self>,
         compacted_from: &[Layer],
         compacted_to: &[ResidentLayer],
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), upload_queue::NotInitialized> {
         let mut guard = self.upload_queue.lock().unwrap();
         let upload_queue = guard.initialized_mut()?;

@@ -290,7 +290,7 @@ impl Layer {
     }

     /// Downloads if necessary and creates a guard, which will keep this layer from being evicted.
-    pub(crate) async fn download_and_keep_resident(&self) -> anyhow::Result<ResidentLayer> {
+    pub(crate) async fn download_and_keep_resident(&self) -> Result<ResidentLayer, DownloadError> {
        let downloaded = self.0.get_or_maybe_download(true, None).await?;

        Ok(ResidentLayer {

@@ -1174,7 +1174,7 @@ pub(crate) enum EvictionError {

 /// Error internal to the [`LayerInner::get_or_maybe_download`]
 #[derive(Debug, thiserror::Error)]
-enum DownloadError {
+pub(crate) enum DownloadError {
     #[error("timeline has already shutdown")]
     TimelineShutdown,
     #[error("no remote storage configured")]

@@ -1197,6 +1197,15 @@ enum DownloadError {
     PostStatFailed(#[source] std::io::Error),
 }

+impl DownloadError {
+    pub fn is_cancelled(&self) -> bool {
+        match self {
+            Self::TimelineShutdown | Self::DownloadCancelled => true,
+            _ => false,
+        }
+    }
+}
+
 #[derive(Debug, PartialEq)]
 pub(crate) enum NeedsDownload {
     NotFound,
@@ -9,6 +9,7 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::metrics::TENANT_TASK_EVENTS;
 use crate::task_mgr;
 use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
+use crate::tenant::timeline::CompactionError;
 use crate::tenant::{Tenant, TenantState};
 use tokio_util::sync::CancellationToken;
 use tracing::*;

@@ -181,8 +182,11 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                     );
                     error_run_count += 1;
                     let wait_duration = Duration::from_secs_f64(wait_duration);
-                    error!(
-                        "Compaction failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
+                    log_compaction_error(
+                        &e,
+                        error_run_count,
+                        &wait_duration,
+                        cancel.is_cancelled(),
                     );
                     wait_duration
                 } else {

@@ -210,6 +214,38 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
     TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
 }

+fn log_compaction_error(
+    e: &CompactionError,
+    error_run_count: u32,
+    sleep_duration: &std::time::Duration,
+    task_cancelled: bool,
+) {
+    use crate::tenant::upload_queue::NotInitialized;
+    use crate::tenant::PageReconstructError;
+    use CompactionError::*;
+
+    enum LooksLike {
+        Info,
+        Error,
+    }
+
+    let decision = match e {
+        ShuttingDown => None,
+        _ if task_cancelled => Some(LooksLike::Info),
+        Other(e) => Some(LooksLike::Error),
+    };
+
+    match decision {
+        Some(LooksLike::Info) => info!(
+            "Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:#}",
+        ),
+        Some(LooksLike::Error) => error!(
+            "Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:?}",
+        ),
+        None => {}
+    }
+}
+
 ///
 /// GC task's main loop
 ///

@@ -73,8 +73,8 @@ use crate::metrics::{
     TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
 };
 use crate::pgdatadir_mapping::CalculateLogicalSizeError;
-use crate::pgdatadir_mapping::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key};
 use crate::tenant::config::TenantConfOpt;
+use pageserver_api::key::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key};
 use pageserver_api::reltag::RelTag;
 use pageserver_api::shard::ShardIndex;

@@ -103,11 +103,14 @@ use self::layer_manager::LayerManager;
 use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};

-use super::config::TenantConf;
-use super::remote_timeline_client::index::{IndexLayerMetadata, IndexPart};
 use super::remote_timeline_client::RemoteTimelineClient;
 use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
+use super::{config::TenantConf, upload_queue::NotInitialized};
 use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
+use super::{
+    remote_timeline_client::index::{IndexLayerMetadata, IndexPart},
+    storage_layer::layer,
+};

 #[derive(Debug, PartialEq, Eq, Clone, Copy)]
 pub(super) enum FlushLoopState {

@@ -391,8 +394,7 @@ pub(crate) enum PageReconstructError {
     #[error("Ancestor LSN wait error: {0}")]
     AncestorLsnTimeout(#[from] WaitLsnError),

-    /// The operation was cancelled
-    #[error("Cancelled")]
+    #[error("timeline shutting down")]
     Cancelled,

     /// The ancestor of this is being stopped

@@ -404,6 +406,19 @@ pub(crate) enum PageReconstructError {
     WalRedo(anyhow::Error),
 }

+impl PageReconstructError {
+    /// Returns true if this error indicates a tenant/timeline shutdown alike situation
+    pub(crate) fn is_stopping(&self) -> bool {
+        use PageReconstructError::*;
+        match self {
+            Other(_) => false,
+            AncestorLsnTimeout(_) => false,
+            Cancelled | AncestorStopping(_) => true,
+            WalRedo(_) => false,
+        }
+    }
+}
+
 #[derive(thiserror::Error, Debug)]
 enum FlushLayerError {
     /// Timeline cancellation token was cancelled

@@ -832,8 +847,7 @@ impl Timeline {
         // "enough".
         let layers = self
             .create_image_layers(&partitioning, lsn, false, &image_ctx)
-            .await
-            .map_err(anyhow::Error::from)?;
+            .await?;
         if let Some(remote_client) = &self.remote_client {
             for layer in layers {
                 remote_client.schedule_layer_file_upload(layer)?;

@@ -3200,7 +3214,46 @@ pub(crate) enum CompactionError {
     ShuttingDown,
     /// Compaction cannot be done right now; page reconstruction and so on.
     #[error(transparent)]
-    Other(#[from] anyhow::Error),
+    Other(anyhow::Error),
 }

+impl CompactionError {
+    fn other<E>(err: E) -> Self
+    where
+        E: std::error::Error + Send + Sync + 'static,
+    {
+        CompactionError::Other(anyhow::Error::new(err))
+    }
+}
+
+impl From<PageReconstructError> for CompactionError {
+    fn from(value: PageReconstructError) -> Self {
+        if value.is_stopping() {
+            CompactionError::ShuttingDown
+        } else {
+            CompactionError::other(value)
+        }
+    }
+}
+
+impl From<NotInitialized> for CompactionError {
+    fn from(value: NotInitialized) -> Self {
+        if value.is_stopping() {
+            CompactionError::ShuttingDown
+        } else {
+            CompactionError::other(value)
+        }
+    }
+}
+
+impl From<layer::DownloadError> for CompactionError {
+    fn from(value: layer::DownloadError) -> Self {
+        if value.is_cancelled() {
+            CompactionError::ShuttingDown
+        } else {
+            CompactionError::other(value)
+        }
+    }
+}
+
 #[serde_as]

@@ -3333,7 +3386,7 @@ impl Timeline {
         stats.read_lock_held_spawn_blocking_startup_micros =
             stats.read_lock_acquisition_micros.till_now(); // set by caller
         let layers = guard.layer_map();
-        let level0_deltas = layers.get_level0_deltas()?;
+        let level0_deltas = layers.get_level0_deltas();
         let mut level0_deltas = level0_deltas
             .into_iter()
             .map(|x| guard.get_from_desc(&x))

@@ -3380,7 +3433,8 @@ impl Timeline {
                 delta
                     .download_and_keep_resident()
                     .await
-                    .context("download layer for failpoint")?,
+                    .context("download layer for failpoint")
+                    .map_err(CompactionError::Other)?,
             );
         }
         tracing::info!("compact-level0-phase1-return-same"); // so that we can check if we hit the failpoint

@@ -3464,7 +3518,7 @@ impl Timeline {
         let mut all_keys = Vec::new();

         for l in deltas_to_compact.iter() {
-            all_keys.extend(l.load_keys(ctx).await?);
+            all_keys.extend(l.load_keys(ctx).await.map_err(CompactionError::Other)?);
         }

         // FIXME: should spawn_blocking the rest of this function

@@ -3484,7 +3538,10 @@ impl Timeline {
                 // has not so much sense, because largest holes will corresponds field1/field2 changes.
                 // But we are mostly interested to eliminate holes which cause generation of excessive image layers.
                 // That is why it is better to measure size of hole as number of covering image layers.
-                let coverage_size = layers.image_coverage(&key_range, last_record_lsn)?.len();
+                let coverage_size = layers
+                    .image_coverage(&key_range, last_record_lsn)
+                    .map_err(CompactionError::Other)?
+                    .len();
                 if coverage_size >= min_hole_coverage_size {
                     heap.push(Hole {
                         key_range,

@@ -3583,7 +3640,7 @@ impl Timeline {
             key, lsn, ref val, ..
         } in all_values_iter
         {
-            let value = val.load(ctx).await?;
+            let value = val.load(ctx).await.map_err(CompactionError::Other)?;
             let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
             // We need to check key boundaries once we reach next key or end of layer with the same key
             if !same_key || lsn == dup_end_lsn {

@@ -3640,7 +3697,8 @@ impl Timeline {
                         .take()
                         .unwrap()
                         .finish(prev_key.unwrap().next(), self)
-                        .await?,
+                        .await
+                        .map_err(CompactionError::Other)?,
                 );
                 writer = None;

@@ -3670,7 +3728,8 @@ impl Timeline {
                             lsn_range.clone()
                         },
                     )
-                    .await?,
+                    .await
+                    .map_err(CompactionError::Other)?,
                 );
             }

@@ -3681,7 +3740,12 @@ impl Timeline {
             });

             if !self.shard_identity.is_key_disposable(&key) {
-                writer.as_mut().unwrap().put_value(key, lsn, value).await?;
+                writer
+                    .as_mut()
+                    .unwrap()
+                    .put_value(key, lsn, value)
+                    .await
+                    .map_err(CompactionError::Other)?;
             } else {
                 debug!(
                     "Dropping key {} during compaction (it belongs on shard {:?})",

@@ -3697,7 +3761,12 @@ impl Timeline {
             prev_key = Some(key);
         }
         if let Some(writer) = writer {
-            new_layers.push(writer.finish(prev_key.unwrap().next(), self).await?);
+            new_layers.push(
+                writer
+                    .finish(prev_key.unwrap().next(), self)
+                    .await
+                    .map_err(CompactionError::Other)?,
+            );
         }

         // Sync layers

@@ -3726,7 +3795,8 @@ impl Timeline {
             // minimize latency.
             par_fsync::par_fsync_async(&layer_paths)
                 .await
-                .context("fsync all new layers")?;
+                .context("fsync all new layers")
+                .map_err(CompactionError::Other)?;

             let timeline_dir = self
                 .conf

@@ -3734,7 +3804,8 @@ impl Timeline {

             par_fsync::par_fsync_async(&[timeline_dir])
                 .await
-                .context("fsync of timeline dir")?;
+                .context("fsync of timeline dir")
+                .map_err(CompactionError::Other)?;
         }

         stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now();
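The dedicated `From` impls mean a plain `?` now classifies each failure at conversion time: shutdown-like errors collapse into `CompactionError::ShuttingDown` (which `log_compaction_error` above logs quietly or not at all), while everything else stays loud. A self-contained sketch of the pattern (simplified stand-in types, not the real ones):

```rust
#[derive(Debug)]
enum CompactionError {
    ShuttingDown,
    Other(String),
}

#[derive(Debug)]
struct NotInitialized {
    stopping: bool,
}

impl From<NotInitialized> for CompactionError {
    fn from(v: NotInitialized) -> Self {
        // The source error decides whether it means "shutting down" or a
        // real failure; callers just write `?`.
        if v.stopping {
            CompactionError::ShuttingDown
        } else {
            CompactionError::Other(format!("{v:?}"))
        }
    }
}

fn schedule_upload(shutting_down: bool) -> Result<(), NotInitialized> {
    if shutting_down {
        Err(NotInitialized { stopping: true })
    } else {
        Ok(())
    }
}

fn compact(shutting_down: bool) -> Result<(), CompactionError> {
    schedule_upload(shutting_down)?; // `?` runs the From impl above
    Ok(())
}

fn main() {
    assert!(matches!(compact(true), Err(CompactionError::ShuttingDown)));
    assert!(compact(false).is_ok());
}
```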
@@ -126,6 +126,27 @@ pub(super) struct UploadQueueStopped {
     pub(super) deleted_at: SetDeletedFlagProgress,
 }

+#[derive(thiserror::Error, Debug)]
+pub(crate) enum NotInitialized {
+    #[error("queue is in state Uninitialized")]
+    Uninitialized,
+    #[error("queue is in state Stopping")]
+    Stopped,
+    #[error("queue is shutting down")]
+    ShuttingDown,
+}
+
+impl NotInitialized {
+    pub(crate) fn is_stopping(&self) -> bool {
+        use NotInitialized::*;
+        match self {
+            Uninitialized => false,
+            Stopped => true,
+            ShuttingDown => true,
+        }
+    }
+}
+
 impl UploadQueue {
     pub(crate) fn initialize_empty_remote(
         &mut self,

@@ -213,18 +234,20 @@ impl UploadQueue {
         Ok(self.initialized_mut().expect("we just set it"))
     }

-    pub(crate) fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> {
+    pub(crate) fn initialized_mut(
+        &mut self,
+    ) -> Result<&mut UploadQueueInitialized, NotInitialized> {
+        use UploadQueue::*;
         match self {
-            UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
-                anyhow::bail!("queue is in state {}", self.as_str())
-            }
-            UploadQueue::Initialized(x) => {
-                if !x.shutting_down {
-                    Ok(x)
+            Uninitialized => Err(NotInitialized::Uninitialized.into()),
+            Initialized(x) => {
+                if x.shutting_down {
+                    Err(NotInitialized::ShuttingDown.into())
                 } else {
-                    anyhow::bail!("queue is shutting down")
+                    Ok(x)
                 }
             }
+            Stopped(_) => Err(NotInitialized::Stopped.into()),
         }
     }

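Returning a typed `NotInitialized` instead of an `anyhow` string is what makes the classification above possible: callers match on the state instead of parsing a message. A minimal sketch, trimmed to the shape used here:

```rust
// Sketch only: with a string error, "is this just shutdown?" needed string
// inspection; with an enum it is a plain match.
#[derive(Debug)]
enum NotInitialized {
    Uninitialized,
    Stopped,
    ShuttingDown,
}

impl NotInitialized {
    fn is_stopping(&self) -> bool {
        matches!(self, NotInitialized::Stopped | NotInitialized::ShuttingDown)
    }
}

fn main() {
    let states = [
        NotInitialized::Uninitialized,
        NotInitialized::Stopped,
        NotInitialized::ShuttingDown,
    ];
    for e in states {
        println!("{e:?}: stopping={}", e.is_stopping());
    }
}
```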
@@ -33,11 +33,12 @@ use utils::failpoint_support;

 use crate::context::RequestContext;
 use crate::metrics::WAL_INGEST;
-use crate::pgdatadir_mapping::*;
+use crate::pgdatadir_mapping::{DatadirModification, Version};
 use crate::tenant::PageReconstructError;
 use crate::tenant::Timeline;
 use crate::walrecord::*;
 use crate::ZERO_PAGE;
+use pageserver_api::key::rel_block_to_key;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
@@ -47,11 +47,10 @@ use crate::metrics::{
     WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM,
     WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
 };
-use crate::pgdatadir_mapping::key_to_slru_block;
 use crate::repository::Key;
 use crate::walrecord::NeonWalRecord;

-use pageserver_api::key::key_to_rel_block;
+use pageserver_api::key::{key_to_rel_block, key_to_slru_block};
 use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
@@ -637,7 +637,7 @@ HandleAlterRole(AlterRoleStmt *stmt)
     ListCell   *option;
     const char *role_name = stmt->role->rolename;

-    if (RoleIsNeonSuperuser(role_name))
+    if (RoleIsNeonSuperuser(role_name) && !superuser())
         elog(ERROR, "can't ALTER neon_superuser");

     foreach(option, stmt->options)
@@ -64,10 +64,26 @@ static int max_reconnect_attempts = 60;
|
||||
|
||||
#define MAX_PAGESERVER_CONNSTRING_SIZE 256
|
||||
|
||||
/*
|
||||
* The "neon.pageserver_connstring" GUC is marked with the PGC_SIGHUP option,
|
||||
* allowing it to be changed using pg_reload_conf(). The control plane can
|
||||
* update the connection string if the pageserver crashes, is relocated, or
|
||||
* new shards are added. A copy of the current value of the GUC is kept in
|
||||
* shared memory, updated by the postmaster, because regular backends don't
|
||||
* reload the config during query execution, but we might need to re-establish
|
||||
* the pageserver connection with the new connection string even in the middle
|
||||
* of a query.
|
||||
*
|
||||
* The shared memory copy is protected by a lockless algorithm using two
|
||||
* atomic counters. The counters allow a backend to quickly check if the value
|
||||
* has changed since last access, and to detect and retry copying the value if
|
||||
* the postmaster changes the value concurrently. (Postmaster doesn't have a
|
||||
* PGPROC entry and therefore cannot use LWLocks.)
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
LWLockId lock;
|
||||
pg_atomic_uint64 update_counter;
|
||||
pg_atomic_uint64 begin_update_counter;
|
||||
pg_atomic_uint64 end_update_counter;
|
||||
char pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
|
||||
} PagestoreShmemState;
|
||||
|
||||
@@ -84,7 +100,7 @@ static bool pageserver_flush(void);
|
||||
static void pageserver_disconnect(void);
|
||||
|
||||
static bool
|
||||
PagestoreShmemIsValid()
|
||||
PagestoreShmemIsValid(void)
|
||||
{
|
||||
return pagestore_shared && UsedShmemSegAddr;
|
||||
}
|
||||
@@ -98,31 +114,58 @@ CheckPageserverConnstring(char **newval, void **extra, GucSource source)
static void
AssignPageserverConnstring(const char *newval, void *extra)
{
    if (!PagestoreShmemIsValid())
    /*
     * Only the postmaster updates the copy in shared memory.
     */
    if (!PagestoreShmemIsValid() || IsUnderPostmaster)
        return;
    LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE);

    pg_atomic_add_fetch_u64(&pagestore_shared->begin_update_counter, 1);
    pg_write_barrier();
    strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE);
    pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1);
    LWLockRelease(pagestore_shared->lock);
    pg_write_barrier();
    pg_atomic_add_fetch_u64(&pagestore_shared->end_update_counter, 1);
}

static bool
CheckConnstringUpdated()
CheckConnstringUpdated(void)
{
    if (!PagestoreShmemIsValid())
        return false;
    return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter);
    return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->begin_update_counter);
}

static void
ReloadConnstring()
ReloadConnstring(void)
{
    uint64      begin_update_counter;
    uint64      end_update_counter;

    if (!PagestoreShmemIsValid())
        return;
    LWLockAcquire(pagestore_shared->lock, LW_SHARED);
    strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
    pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter);
    LWLockRelease(pagestore_shared->lock);

    /*
     * Copy the current setting from shared to local memory. The postmaster
     * can update the value concurrently, in which case we would copy a
     * garbled mix of the old and new values. We will detect that because the
     * counters won't match, and retry. But it's important that we don't do
     * anything within the retry loop that would depend on the string having
     * valid contents.
     */
    do
    {
        begin_update_counter = pg_atomic_read_u64(&pagestore_shared->begin_update_counter);
        end_update_counter = pg_atomic_read_u64(&pagestore_shared->end_update_counter);
        pg_read_barrier();

        strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
        pg_read_barrier();
    }
    while (begin_update_counter != end_update_counter
           || begin_update_counter != pg_atomic_read_u64(&pagestore_shared->begin_update_counter)
           || end_update_counter != pg_atomic_read_u64(&pagestore_shared->end_update_counter));

    pagestore_local_counter = end_update_counter;
}

static bool

@@ -137,7 +180,7 @@ pageserver_connect(int elevel)
    static TimestampTz last_connect_time = 0;
    static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC;
    TimestampTz now;
    uint64_t    us_since_last_connect;

    Assert(!connected);

@@ -147,7 +190,7 @@ pageserver_connect(int elevel)
    }

    now = GetCurrentTimestamp();
    us_since_last_connect = now - last_connect_time;
    if (us_since_last_connect < delay_us)
    {
        pg_usleep(delay_us - us_since_last_connect);

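The hunk above rate-limits reconnect attempts: it remembers the time of the last attempt and sleeps away whatever remains of delay_us before trying again. A rough Python sketch of the same shape follows; the doubling behaviour and the cap are assumptions for the demo, since the excerpt only shows the sleep logic, and the constants are invented analogues of MIN_RECONNECT_INTERVAL_USEC.

import time

MIN_RECONNECT_INTERVAL = 0.001  # assumed analogue of MIN_RECONNECT_INTERVAL_USEC
MAX_RECONNECT_INTERVAL = 1.0    # assumed cap; not shown in the excerpt

def connect_with_backoff(try_connect):
    delay = MIN_RECONNECT_INTERVAL
    last_attempt = 0.0
    while True:
        since_last = time.monotonic() - last_attempt
        if since_last < delay:
            # Same idea as pg_usleep(delay_us - us_since_last_connect)
            time.sleep(delay - since_last)
        last_attempt = time.monotonic()
        conn = try_connect()
        if conn is not None:
            return conn
        delay = min(delay * 2, MAX_RECONNECT_INTERVAL)
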
@@ -505,8 +548,8 @@ PagestoreShmemInit(void)
                                       &found);
    if (!found)
    {
        pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock);
        pg_atomic_init_u64(&pagestore_shared->update_counter, 0);
        pg_atomic_init_u64(&pagestore_shared->begin_update_counter, 0);
        pg_atomic_init_u64(&pagestore_shared->end_update_counter, 0);
        AssignPageserverConnstring(page_server_connstring, NULL);
    }
    LWLockRelease(AddinShmemInitLock);

@@ -531,7 +574,6 @@ pagestore_shmem_request(void)
#endif

    RequestAddinShmemSpace(PagestoreShmemSize());
    RequestNamedLWLockTranche("neon_libpagestore", 1);
}

static void

@@ -110,7 +110,7 @@ pub static REMOVED_WAL_SEGMENTS: Lazy<IntCounter> = Lazy::new(|| {
pub static BACKED_UP_SEGMENTS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "safekeeper_backed_up_segments_total",
        "Number of WAL segments backed up to the broker"
        "Number of WAL segments backed up to the S3"
    )
    .expect("Failed to register safekeeper_backed_up_segments_total counter")
});

@@ -337,6 +337,7 @@ pub struct TimelineCollector {
    flushed_wal_seconds: GaugeVec,
    collect_timeline_metrics: Gauge,
    timelines_count: IntGauge,
    active_timelines_count: IntGauge,
}

impl Default for TimelineCollector {

@@ -520,6 +521,13 @@ impl TimelineCollector {
        .unwrap();
        descs.extend(timelines_count.desc().into_iter().cloned());

        let active_timelines_count = IntGauge::new(
            "safekeeper_active_timelines",
            "Total number of active timelines",
        )
        .unwrap();
        descs.extend(active_timelines_count.desc().into_iter().cloned());

        TimelineCollector {
            descs,
            commit_lsn,

@@ -540,6 +548,7 @@ impl TimelineCollector {
            flushed_wal_seconds,
            collect_timeline_metrics,
            timelines_count,
            active_timelines_count,
        }
    }
}

@@ -572,6 +581,7 @@ impl Collector for TimelineCollector {

        let timelines = GlobalTimelines::get_all();
        let timelines_count = timelines.len();
        let mut active_timelines_count = 0;

        // Prometheus Collector is sync, and data is stored under async lock. To
        // bridge the gap with a crutch, collect data in spawned thread with

@@ -590,6 +600,10 @@ impl Collector for TimelineCollector {
            let timeline_id = tli.ttid.timeline_id.to_string();
            let labels = &[tenant_id.as_str(), timeline_id.as_str()];

            if tli.timeline_is_active {
                active_timelines_count += 1;
            }

            self.commit_lsn
                .with_label_values(labels)
                .set(tli.mem_state.commit_lsn.into());

@@ -681,6 +695,8 @@ impl Collector for TimelineCollector {

        // report total number of timelines
        self.timelines_count.set(timelines_count as i64);
        self.active_timelines_count
            .set(active_timelines_count as i64);
        mfs.extend(self.timelines_count.collect());

        mfs

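The "bridge the gap with a crutch" comment above is about calling into state held under an async lock from Prometheus's synchronous collect(). A hedged Python analogue of that pattern, with every name invented for the demo: the sync caller spawns a thread that runs its own event loop just long enough to snapshot the data.

import asyncio
import threading

class TimelineRegistry:
    """Stand-in for state kept under an async lock."""

    def __init__(self):
        self._lock = asyncio.Lock()
        self._timelines = {"tl-1": {"active": True}, "tl-2": {"active": False}}

    async def snapshot(self):
        async with self._lock:
            return dict(self._timelines)

def collect_sync(registry: TimelineRegistry):
    result = {}

    def worker():
        # A fresh event loop in the helper thread, because collect() is sync.
        result["snap"] = asyncio.run(registry.snapshot())

    t = threading.Thread(target=worker)
    t.start()
    t.join()
    snap = result["snap"]
    active = sum(1 for tl in snap.values() if tl["active"])
    return len(snap), active

print(collect_sync(TimelineRegistry()))  # -> (2, 1)
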
@@ -10,11 +10,15 @@ use crate::{GlobalTimelines, SafeKeeperConf};
pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
    let wal_removal_interval = Duration::from_millis(5000);
    loop {
        let now = tokio::time::Instant::now();
        let mut active_timelines = 0;

        let tlis = GlobalTimelines::get_all();
        for tli in &tlis {
            if !tli.is_active().await {
                continue;
            }
            active_timelines += 1;
            let ttid = tli.ttid;
            async {
                if let Err(e) = tli.maybe_persist_control_file().await {

@@ -27,6 +31,17 @@ pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
            .instrument(info_span!("WAL removal", ttid = %ttid))
            .await;
        }

        let elapsed = now.elapsed();
        let total_timelines = tlis.len();

        if elapsed > wal_removal_interval {
            info!(
                "WAL removal is too long, processed {} active timelines ({} total) in {:?}",
                active_timelines, total_timelines, elapsed
            );
        }

        sleep(wal_removal_interval).await;
    }
}

@@ -2914,6 +2914,7 @@ class Endpoint(PgProtocol):

        # Write it back updated
        with open(config_path, "w") as file:
            log.info(json.dumps(dict(data_dict, **kwargs)))
            json.dump(dict(data_dict, **kwargs), file, indent=4)

    # Mock the extension part of spec passed from control plane for local testing

@@ -248,8 +248,15 @@ def test_ddl_forwarding(ddl: DdlForwardingContext):
    # We don't have compute_ctl here, so create neon_superuser manually
    cur.execute("CREATE ROLE neon_superuser NOLOGIN CREATEDB CREATEROLE")

    with pytest.raises(psycopg2.InternalError):
        cur.execute("ALTER ROLE neon_superuser LOGIN")
    # Contrary to popular belief, being superman does not make you superuser
    cur.execute("CREATE ROLE superman LOGIN NOSUPERUSER PASSWORD 'jungle_man'")

    with ddl.pg.cursor(user="superman", password="jungle_man") as superman_cur:
        # We allow real SUPERUSERs to ALTER neon_superuser
        with pytest.raises(psycopg2.InternalError):
            superman_cur.execute("ALTER ROLE neon_superuser LOGIN")

    cur.execute("ALTER ROLE neon_superuser LOGIN")

    with pytest.raises(psycopg2.InternalError):
        cur.execute("CREATE DATABASE trololobus WITH OWNER neon_superuser")

test_runner/regress/test_migrations.py (new file, 37 lines)
@@ -0,0 +1,37 @@
import time

from fixtures.neon_fixtures import NeonEnv


def test_migrations(neon_simple_env: NeonEnv):
    env = neon_simple_env
    env.neon_cli.create_branch("test_migrations", "empty")

    endpoint = env.endpoints.create("test_migrations")
    log_path = endpoint.endpoint_path() / "compute.log"

    endpoint.respec(skip_pg_catalog_updates=False, features=["migrations"])
    endpoint.start()

    time.sleep(1)  # Sleep to let migrations run

    with endpoint.cursor() as cur:
        cur.execute("SELECT id FROM neon_migration.migration_id")
        migration_id = cur.fetchall()
        assert migration_id[0][0] == 2

    with open(log_path, "r") as log_file:
        logs = log_file.read()
        assert "INFO handle_migrations: Ran 2 migrations" in logs

    endpoint.stop()
    endpoint.start()
    time.sleep(1)  # Sleep to let migrations run
    with endpoint.cursor() as cur:
        cur.execute("SELECT id FROM neon_migration.migration_id")
        migration_id = cur.fetchall()
        assert migration_id[0][0] == 2

    with open(log_path, "r") as log_file:
        logs = log_file.read()
        assert "INFO handle_migrations: Ran 0 migrations" in logs

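The test above implies how handle_migrations stays idempotent: neon_migration.migration_id records how far the migration list has been applied, so a restart runs zero new migrations while keeping the stored id at the total. A speculative Python sketch of that bookkeeping; only the table and schema names come from the test, everything else here is assumed for illustration.

MIGRATIONS = [
    "CREATE TABLE IF NOT EXISTS demo_one (id int)",  # placeholder statements
    "CREATE TABLE IF NOT EXISTS demo_two (id int)",
]

def handle_migrations(cur):
    cur.execute("CREATE SCHEMA IF NOT EXISTS neon_migration")
    cur.execute("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (id bigint)")
    cur.execute("SELECT max(id) FROM neon_migration.migration_id")
    applied = cur.fetchone()[0] or 0

    ran = 0
    for i, stmt in enumerate(MIGRATIONS, start=1):
        if i <= applied:
            continue  # already applied before a restart
        cur.execute(stmt)
        ran += 1

    cur.execute("DELETE FROM neon_migration.migration_id")
    cur.execute("INSERT INTO neon_migration.migration_id VALUES (%s)", (len(MIGRATIONS),))
    print(f"Ran {ran} migrations")  # mirrors the log line the test greps for
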
test_runner/regress/test_pageserver_reconnect.py (new file, 42 lines)
@@ -0,0 +1,42 @@
import threading
import time
from contextlib import closing

from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, PgBin


# Test updating neon.pageserver_connstring setting on the fly.
#
# This merely changes some whitespace in the connection string, so
# this doesn't prove that the new string actually takes effect. But at
# least the code gets exercised.
def test_pageserver_reconnect(neon_simple_env: NeonEnv, pg_bin: PgBin):
    env = neon_simple_env
    env.neon_cli.create_branch("test_pageserver_restarts")
    endpoint = env.endpoints.create_start("test_pageserver_restarts")
    n_reconnects = 1000
    timeout = 0.01
    scale = 10

    def run_pgbench(connstr: str):
        log.info(f"Start a pgbench workload on pg {connstr}")
        pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
        pg_bin.run_capture(["pgbench", f"-T{int(n_reconnects*timeout)}", connstr])

    thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True)
    thread.start()

    with closing(endpoint.connect()) as con:
        with con.cursor() as c:
            c.execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'")
            connstring = c.fetchall()[0][0]
            for i in range(n_reconnects):
                time.sleep(timeout)
                c.execute(
                    "alter system set neon.pageserver_connstring=%s",
                    (connstring + (" " * (i % 2)),),
                )
                c.execute("select pg_reload_conf()")

    thread.join()