mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-05 06:20:37 +00:00
storcon: boilerplate to upsert safekeeper records on deploy (#8879)
We currently do not record safekeepers in the storage controller database. We want to migrate timelines across safekeepers eventually, so start recording the safekeepers on deploy. Cc: #8698
This commit is contained in:
@@ -0,0 +1,2 @@
|
||||
-- This file should undo anything in `up.sql`
|
||||
DROP TABLE safekeepers;
|
||||
@@ -0,0 +1,15 @@
|
||||
-- started out as a copy of cplane schema, removed the unnecessary columns.
|
||||
CREATE TABLE safekeepers (
|
||||
-- the surrogate identifier defined by control plane database sequence
|
||||
id BIGINT PRIMARY KEY,
|
||||
region_id TEXT NOT NULL,
|
||||
version BIGINT NOT NULL,
|
||||
-- the natural id on whatever cloud platform, not needed in storage controller
|
||||
-- instance_id TEXT UNIQUE NOT NULL,
|
||||
host TEXT NOT NULL,
|
||||
port INTEGER NOT NULL,
|
||||
active BOOLEAN NOT NULL DEFAULT false,
|
||||
-- projects_count INTEGER NOT NULL DEFAULT 0,
|
||||
http_port INTEGER NOT NULL,
|
||||
availability_zone_id TEXT NOT NULL
|
||||
);
|
||||
@@ -2,6 +2,7 @@ use crate::metrics::{
|
||||
HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup,
|
||||
METRICS_REGISTRY,
|
||||
};
|
||||
use crate::persistence::SafekeeperPersistence;
|
||||
use crate::reconciler::ReconcileError;
|
||||
use crate::service::{LeadershipStatus, Service, STARTUP_RECONCILE_TIMEOUT};
|
||||
use anyhow::Context;
|
||||
@@ -767,6 +768,55 @@ impl From<ReconcileError> for ApiError {
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the safekeeper record by instance id, or 404.
|
||||
///
|
||||
/// Not used by anything except manual testing.
|
||||
async fn handle_get_safekeeper(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let id = parse_request_param::<i64>(&req, "id")?;
|
||||
|
||||
let state = get_state(&req);
|
||||
|
||||
let res = state.service.get_safekeeper(id).await;
|
||||
|
||||
match res {
|
||||
Ok(b) => json_response(StatusCode::OK, b),
|
||||
Err(crate::persistence::DatabaseError::Query(diesel::result::Error::NotFound)) => {
|
||||
Err(ApiError::NotFound("unknown instance_id".into()))
|
||||
}
|
||||
Err(other) => Err(other.into()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Used as part of deployment scripts.
|
||||
///
|
||||
/// Assumes information is only relayed to storage controller after first selecting an unique id on
|
||||
/// control plane database, which means we have an id field in the request and payload.
|
||||
async fn handle_upsert_safekeeper(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let body = json_request::<SafekeeperPersistence>(&mut req).await?;
|
||||
let id = parse_request_param::<i64>(&req, "id")?;
|
||||
|
||||
if id != body.id {
|
||||
// it should be repeated
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"id mismatch: url={id:?}, body={:?}",
|
||||
body.id
|
||||
)));
|
||||
}
|
||||
|
||||
let state = get_state(&req);
|
||||
|
||||
state.service.upsert_safekeeper(body).await?;
|
||||
|
||||
Ok(Response::builder()
|
||||
.status(StatusCode::NO_CONTENT)
|
||||
.body(Body::empty())
|
||||
.unwrap())
|
||||
}
|
||||
|
||||
/// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
|
||||
/// be allowed to run if Service has finished its initial reconciliation.
|
||||
async fn tenant_service_handler<R, H>(
|
||||
@@ -1127,6 +1177,13 @@ pub fn make_router(
|
||||
.put("/control/v1/step_down", |r| {
|
||||
named_request_span(r, handle_step_down, RequestName("control_v1_step_down"))
|
||||
})
|
||||
.get("/control/v1/safekeeper/:id", |r| {
|
||||
named_request_span(r, handle_get_safekeeper, RequestName("v1_safekeeper"))
|
||||
})
|
||||
.post("/control/v1/safekeeper/:id", |r| {
|
||||
// id is in the body
|
||||
named_request_span(r, handle_upsert_safekeeper, RequestName("v1_safekeeper"))
|
||||
})
|
||||
// Tenant operations
|
||||
// The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
|
||||
// this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
|
||||
|
||||
@@ -938,6 +938,48 @@ impl Persistence {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn safekeeper_get(
|
||||
&self,
|
||||
id: i64,
|
||||
) -> Result<SafekeeperPersistence, DatabaseError> {
|
||||
use crate::schema::safekeepers::dsl::{id as id_column, safekeepers};
|
||||
self.with_conn(move |conn| -> DatabaseResult<SafekeeperPersistence> {
|
||||
Ok(safekeepers
|
||||
.filter(id_column.eq(&id))
|
||||
.select(SafekeeperPersistence::as_select())
|
||||
.get_result(conn)?)
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) async fn safekeeper_upsert(
|
||||
&self,
|
||||
record: SafekeeperPersistence,
|
||||
) -> Result<(), DatabaseError> {
|
||||
use crate::schema::safekeepers::dsl::*;
|
||||
|
||||
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||
let bind = record.as_insert_or_update();
|
||||
|
||||
let inserted_updated = diesel::insert_into(safekeepers)
|
||||
.values(&bind)
|
||||
.on_conflict(id)
|
||||
.do_update()
|
||||
.set(&bind)
|
||||
.execute(conn)?;
|
||||
|
||||
if inserted_updated != 1 {
|
||||
return Err(DatabaseError::Logical(format!(
|
||||
"unexpected number of rows ({})",
|
||||
inserted_updated
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
/// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
|
||||
@@ -1073,3 +1115,47 @@ pub(crate) struct ControllerPersistence {
|
||||
pub(crate) address: String,
|
||||
pub(crate) started_at: chrono::DateTime<chrono::Utc>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Queryable, Selectable, Eq, PartialEq, Debug, Clone)]
|
||||
#[diesel(table_name = crate::schema::safekeepers)]
|
||||
pub(crate) struct SafekeeperPersistence {
|
||||
pub(crate) id: i64,
|
||||
pub(crate) region_id: String,
|
||||
/// 1 is special, it means just created (not currently posted to storcon).
|
||||
/// Zero or negative is not really expected.
|
||||
/// Otherwise the number from `release-$(number_of_commits_on_branch)` tag.
|
||||
pub(crate) version: i64,
|
||||
pub(crate) host: String,
|
||||
pub(crate) port: i32,
|
||||
pub(crate) active: bool,
|
||||
pub(crate) http_port: i32,
|
||||
pub(crate) availability_zone_id: String,
|
||||
}
|
||||
|
||||
impl SafekeeperPersistence {
|
||||
fn as_insert_or_update(&self) -> InsertUpdateSafekeeper<'_> {
|
||||
InsertUpdateSafekeeper {
|
||||
id: self.id,
|
||||
region_id: &self.region_id,
|
||||
version: self.version,
|
||||
host: &self.host,
|
||||
port: self.port,
|
||||
active: self.active,
|
||||
http_port: self.http_port,
|
||||
availability_zone_id: &self.availability_zone_id,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Insertable, AsChangeset)]
|
||||
#[diesel(table_name = crate::schema::safekeepers)]
|
||||
struct InsertUpdateSafekeeper<'a> {
|
||||
id: i64,
|
||||
region_id: &'a str,
|
||||
version: i64,
|
||||
host: &'a str,
|
||||
port: i32,
|
||||
active: bool,
|
||||
http_port: i32,
|
||||
availability_zone_id: &'a str,
|
||||
}
|
||||
|
||||
@@ -45,3 +45,17 @@ diesel::table! {
|
||||
}
|
||||
|
||||
diesel::allow_tables_to_appear_in_same_query!(controllers, metadata_health, nodes, tenant_shards,);
|
||||
|
||||
diesel::table! {
|
||||
safekeepers {
|
||||
id -> Int8,
|
||||
region_id -> Text,
|
||||
version -> Int8,
|
||||
instance_id -> Text,
|
||||
host -> Text,
|
||||
port -> Int4,
|
||||
active -> Bool,
|
||||
http_port -> Int4,
|
||||
availability_zone_id -> Text,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6476,4 +6476,18 @@ impl Service {
|
||||
|
||||
global_observed
|
||||
}
|
||||
|
||||
pub(crate) async fn get_safekeeper(
|
||||
&self,
|
||||
id: i64,
|
||||
) -> Result<crate::persistence::SafekeeperPersistence, DatabaseError> {
|
||||
self.persistence.safekeeper_get(id).await
|
||||
}
|
||||
|
||||
pub(crate) async fn upsert_safekeeper(
|
||||
&self,
|
||||
record: crate::persistence::SafekeeperPersistence,
|
||||
) -> Result<(), DatabaseError> {
|
||||
self.persistence.safekeeper_upsert(record).await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2845,6 +2845,29 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
|
||||
raise AssertionError("unreachable")
|
||||
|
||||
def on_safekeeper_deploy(self, id: int, body: dict[str, Any]):
|
||||
self.request(
|
||||
"POST",
|
||||
f"{self.api}/control/v1/safekeeper/{id}",
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
json=body,
|
||||
)
|
||||
|
||||
def get_safekeeper(self, id: int) -> Optional[dict[str, Any]]:
|
||||
try:
|
||||
response = self.request(
|
||||
"GET",
|
||||
f"{self.api}/control/v1/safekeeper/{id}",
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
)
|
||||
json = response.json()
|
||||
assert isinstance(json, dict)
|
||||
return json
|
||||
except StorageControllerApiException as e:
|
||||
if e.status_code == 404:
|
||||
return None
|
||||
raise e
|
||||
|
||||
def __enter__(self) -> "NeonStorageController":
|
||||
return self
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@ from fixtures.pageserver.utils import (
|
||||
remote_storage_delete_key,
|
||||
timeline_delete_wait_completed,
|
||||
)
|
||||
from fixtures.pg_version import PgVersion
|
||||
from fixtures.pg_version import PgVersion, run_only_on_default_postgres
|
||||
from fixtures.port_distributor import PortDistributor
|
||||
from fixtures.remote_storage import RemoteStorageKind, s3_storage
|
||||
from fixtures.storage_controller_proxy import StorageControllerProxy
|
||||
@@ -2330,3 +2330,69 @@ def test_storage_controller_timeline_crud_race(neon_env_builder: NeonEnvBuilder)
|
||||
connect=0, # Disable retries: we want to see the 503
|
||||
)
|
||||
).timeline_create(PgVersion.NOT_SET, tenant_id, create_timeline_id)
|
||||
|
||||
|
||||
@run_only_on_default_postgres("this is like a 'unit test' against storcon db")
|
||||
def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_configs()
|
||||
env.start()
|
||||
|
||||
fake_id = 5
|
||||
|
||||
target = env.storage_controller
|
||||
|
||||
assert target.get_safekeeper(fake_id) is None
|
||||
|
||||
body = {
|
||||
"active": True,
|
||||
"id": fake_id,
|
||||
"created_at": "2023-10-25T09:11:25Z",
|
||||
"updated_at": "2024-08-28T11:32:43Z",
|
||||
"region_id": "aws-us-east-2",
|
||||
"host": "safekeeper-333.us-east-2.aws.neon.build",
|
||||
"port": 6401,
|
||||
"http_port": 7676,
|
||||
"version": 5957,
|
||||
"availability_zone_id": "us-east-2b",
|
||||
}
|
||||
|
||||
target.on_safekeeper_deploy(fake_id, body)
|
||||
|
||||
inserted = target.get_safekeeper(fake_id)
|
||||
assert inserted is not None
|
||||
assert eq_safekeeper_records(body, inserted)
|
||||
|
||||
# error out if pk is changed (unexpected)
|
||||
with pytest.raises(StorageControllerApiException) as exc:
|
||||
different_pk = dict(body)
|
||||
different_pk["id"] = 4
|
||||
assert different_pk["id"] != body["id"]
|
||||
target.on_safekeeper_deploy(fake_id, different_pk)
|
||||
assert exc.value.status_code == 400
|
||||
|
||||
inserted_again = target.get_safekeeper(fake_id)
|
||||
assert inserted_again is not None
|
||||
assert eq_safekeeper_records(inserted, inserted_again)
|
||||
|
||||
# the most common case, version goes up:
|
||||
assert isinstance(body["version"], int)
|
||||
body["version"] += 1
|
||||
target.on_safekeeper_deploy(fake_id, body)
|
||||
inserted_now = target.get_safekeeper(fake_id)
|
||||
assert inserted_now is not None
|
||||
|
||||
assert eq_safekeeper_records(body, inserted_now)
|
||||
|
||||
|
||||
def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool:
|
||||
compared = [dict(a), dict(b)]
|
||||
|
||||
masked_keys = ["created_at", "updated_at"]
|
||||
|
||||
for d in compared:
|
||||
# keep deleting these in case we are comparing the body as it will be uploaded by real scripts
|
||||
for key in masked_keys:
|
||||
if key in d:
|
||||
del d[key]
|
||||
|
||||
return compared[0] == compared[1]
|
||||
|
||||
Reference in New Issue
Block a user