mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-04 12:02:55 +00:00
[BRC-2905] Feed back PS-detected data corruption signals to SK and PG… (#12748)
… walproposer (#895) Data corruptions are typically detected on the pageserver side when it replays WAL records. However, since PS doesn't synchronously replay WAL records as they are being ingested through safekeepers, we need some extra plumbing to feed information about pageserver-detected corruptions during compaction (and/or WAL redo in general) back to SK and PG for proper action. We don't yet know what actions PG/SK should take upon receiving the signal, but we should have the detection and feedback in place. Add an extra `corruption_detected` field to the `PageserverFeedback` message that is sent from PS -> SK -> PG. It's a boolean value that is set to true when PS detects a "critical error" that signals data corruption, and it's sent in all `PageserverFeedback` messages. Upon receiving this signal, the safekeeper raises a `safekeeper_ps_corruption_detected` gauge metric (value set to 1). The safekeeper then forwards this signal to PG where a `ps_corruption_detected` gauge metric (value also set to 1) is raised in the `neon_perf_counters` view. Added an integration test in `test_compaction.py::test_ps_corruption_detection_feedback` that confirms that the safekeeper and PG can receive the data corruption signal in the `PageserverFeedback` message in a simulated data corruption. ## Problem ## Summary of changes --------- Co-authored-by: William Huang <william.huang@databricks.com>
This commit is contained in:
@@ -45,6 +45,7 @@ DatabricksMetricsShmemInit(void)
|
||||
pg_atomic_init_u32(&databricks_metrics_shared->index_corruption_count, 0);
|
||||
pg_atomic_init_u32(&databricks_metrics_shared->data_corruption_count, 0);
|
||||
pg_atomic_init_u32(&databricks_metrics_shared->internal_error_count, 0);
|
||||
pg_atomic_init_u32(&databricks_metrics_shared->ps_corruption_detected, 0);
|
||||
}
|
||||
}
|
||||
/* END_HADRON */
|
||||
@@ -440,6 +441,7 @@ neon_get_perf_counters(PG_FUNCTION_ARGS)
|
||||
{"sql_index_corruption_count", false, 0, (double) pg_atomic_read_u32(&databricks_metrics_shared->index_corruption_count)},
|
||||
{"sql_data_corruption_count", false, 0, (double) pg_atomic_read_u32(&databricks_metrics_shared->data_corruption_count)},
|
||||
{"sql_internal_error_count", false, 0, (double) pg_atomic_read_u32(&databricks_metrics_shared->internal_error_count)},
|
||||
{"ps_corruption_detected", false, 0, (double) pg_atomic_read_u32(&databricks_metrics_shared->ps_corruption_detected)},
|
||||
{NULL, false, 0, 0},
|
||||
};
|
||||
for (int i = 0; databricks_metrics[i].name != NULL; i++)
|
||||
|
||||
@@ -183,6 +183,7 @@ typedef struct
|
||||
pg_atomic_uint32 index_corruption_count;
|
||||
pg_atomic_uint32 data_corruption_count;
|
||||
pg_atomic_uint32 internal_error_count;
|
||||
pg_atomic_uint32 ps_corruption_detected;
|
||||
} databricks_metrics;
|
||||
|
||||
extern databricks_metrics *databricks_metrics_shared;
|
||||
|
||||
@@ -1887,6 +1887,12 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
|
||||
ps_feedback->shard_number = pq_getmsgint(reply_message, sizeof(uint32));
|
||||
psfeedback_log("%u", key, ps_feedback->shard_number);
|
||||
}
|
||||
else if (strcmp(key, "corruption_detected") == 0)
|
||||
{
|
||||
Assert(value_len == 1);
|
||||
ps_feedback->corruption_detected = pq_getmsgbyte(reply_message) != 0;
|
||||
psfeedback_log("%s", key, ps_feedback->corruption_detected ? "true" : "false");
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
|
||||
@@ -374,6 +374,8 @@ typedef struct PageserverFeedback
|
||||
XLogRecPtr remote_consistent_lsn;
|
||||
TimestampTz replytime;
|
||||
uint32 shard_number;
|
||||
/* true if the pageserver has detected data corruption in the timeline */
|
||||
bool corruption_detected;
|
||||
} PageserverFeedback;
|
||||
|
||||
/* BEGIN_HADRON */
|
||||
|
||||
@@ -49,6 +49,7 @@
|
||||
|
||||
#include "libpqwalproposer.h"
|
||||
#include "neon.h"
|
||||
#include "neon_perf_counters.h"
|
||||
#include "neon_walreader.h"
|
||||
#include "walproposer.h"
|
||||
|
||||
@@ -741,6 +742,11 @@ record_pageserver_feedback(PageserverFeedback *ps_feedback, shardno_t num_shards
|
||||
Assert(ps_feedback->shard_number < MAX_SHARDS);
|
||||
Assert(ps_feedback->shard_number < num_shards);
|
||||
|
||||
// Begin Hadron: Record any corruption signal from the pageserver first.
|
||||
if (ps_feedback->corruption_detected) {
|
||||
pg_atomic_write_u32(&databricks_metrics_shared->ps_corruption_detected, 1);
|
||||
}
|
||||
|
||||
SpinLockAcquire(&walprop_shared->mutex);
|
||||
|
||||
// Hadron: Update the num_shards from the source-of-truth (shard map) lazily when we receive
|
||||
|
||||
Reference in New Issue
Block a user