mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-10 15:02:56 +00:00
## Problem This is a follow-up to TODO, as part of the effort to rewire the compute reconfiguration/notification mechanism to make it more robust. Please refer to that commit or ticket BRC-1778 for the full context of the problem. ## Summary of changes The previous change added a mechanism in `compute_ctl` that makes it possible to refresh the configuration of PG on-demand by having `compute_ctl` go out to download a new config from the control plane/HCC. This change wires this mechanism up with PG so that PG will signal `compute_ctl` to refresh its configuration when it suspects that it could be talking to incorrect pageservers due to a stale configuration. PG will become suspicious that it is talking to the wrong pageservers in the following situations: 1. It cannot connect to a pageserver (e.g., getting a network-level connection refused error) 2. It can connect to a pageserver, but the pageserver does not return any data for the GetPage request 3. It can connect to a pageserver, but the pageserver returns a malformed response 4. It can connect to a pageserver, but there is an error receiving the GetPage request response for any other reason This change also includes a minor tweak to `compute_ctl`'s config refresh behavior. Upon receiving a request to refresh PG configuration, `compute_ctl` will reach out to download a config, but it will not attempt to apply the configuration if the config is the same as the old config it is replacing. This optimization is added because the act of reconfiguring itself requires working pageserver connections. In many failure situations it is likely that PG detects an issue with a pageserver before the control plane can detect the issue, migrate tenants, and update the compute config. In this case even the latest compute config won't point PG to working pageservers, causing the configuration attempt to hang and negatively impact PG's time-to-recovery. 
With this change, `compute_ctl` only attempts reconfiguration if the refreshed config points PG to different pageservers. ## How is this tested? The new code paths are exercised in all existing tests because this mechanism is on by default. Explicitly tested in `test_runner/regress/test_change_pageserver.py`. Co-authored-by: William Huang <william.huang@databricks.com>
113 lines
3.3 KiB
C
113 lines
3.3 KiB
C
/*-------------------------------------------------------------------------
|
|
*
|
|
* extension_server.c
|
|
* Request compute_ctl to download extension files.
|
|
*
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
#include "postgres.h"
|
|
|
|
#include <curl/curl.h>
|
|
|
|
#include "utils/guc.h"
|
|
|
|
#include "extension_server.h"
|
|
#include "neon_utils.h"
|
|
|
|
/*
 * Port on localhost that requests to compute_ctl are sent to. Backed by the
 * neon.extension_server_port GUC registered in pg_init_extension_server();
 * defaults to 0. Not static, so it is visible to other translation units.
 */
int hadron_extension_server_port = 0;
|
|
/*
 * Value handed to CURLOPT_TIMEOUT (seconds) for an extension download
 * request. Only applied when greater than 0. Backed by the
 * neon.extension_server_request_timeout GUC.
 */
static int extension_server_request_timeout = 60;
|
|
/*
 * Value handed to CURLOPT_CONNECTTIMEOUT (seconds) for an extension download
 * request. Only applied when greater than 0. Backed by the
 * neon.extension_server_connect_timeout GUC.
 */
static int extension_server_connect_timeout = 60;
|
|
|
|
/*
 * Value download_extension_file_hook held before pg_init_extension_server()
 * installed neon_download_extension_file_http. It is saved there but not read
 * anywhere else in this file.
 */
static download_extension_file_hook_type prev_download_extension_file_hook = NULL;
|
|
|
|
/*
|
|
* to download all SQL (and data) files for an extension:
|
|
* curl -X POST http://localhost:8080/extension_server/postgis
|
|
* it covers two possible extension files layouts:
|
|
* 1. extension_name--version--platform.sql
|
|
* 2. extension_name/extension_name--version.sql
|
|
* extension_name/extra_files.csv
|
|
* to download specific library file:
|
|
* curl -X POST http://localhost:8080/extension_server/postgis-3.so?is_library=true
|
|
*/
|
|
static bool
|
|
neon_download_extension_file_http(const char *filename, bool is_library)
|
|
{
|
|
CURLcode res;
|
|
bool ret = false;
|
|
CURL *handle = NULL;
|
|
char *compute_ctl_url;
|
|
|
|
handle = alloc_curl_handle();
|
|
|
|
curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "POST");
|
|
if (extension_server_request_timeout > 0)
|
|
curl_easy_setopt(handle, CURLOPT_TIMEOUT, (long)extension_server_request_timeout /* seconds */ );
|
|
if (extension_server_connect_timeout > 0)
|
|
curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, (long)extension_server_connect_timeout /* seconds */ );
|
|
|
|
compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
|
|
hadron_extension_server_port, filename, is_library ? "?is_library=true" : "");
|
|
|
|
elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url);
|
|
|
|
curl_easy_setopt(handle, CURLOPT_URL, compute_ctl_url);
|
|
|
|
/* Perform the request, res will get the return code */
|
|
res = curl_easy_perform(handle);
|
|
curl_easy_cleanup(handle);
|
|
|
|
/* Check for errors */
|
|
if (res == CURLE_OK)
|
|
{
|
|
ret = true;
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
* Don't error here because postgres will try to find the file and will
|
|
* fail with some proper error message if it's not found.
|
|
*/
|
|
elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res));
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
void
|
|
pg_init_extension_server()
|
|
{
|
|
/* Port to connect to compute_ctl on localhost */
|
|
/* to request extension files. */
|
|
DefineCustomIntVariable("neon.extension_server_port",
|
|
"connection string to the compute_ctl",
|
|
NULL,
|
|
&hadron_extension_server_port,
|
|
0, 0, INT_MAX,
|
|
PGC_POSTMASTER,
|
|
0, /* no flags required */
|
|
NULL, NULL, NULL);
|
|
|
|
DefineCustomIntVariable("neon.extension_server_request_timeout",
|
|
"timeout for fetching extensions in seconds",
|
|
NULL,
|
|
&extension_server_request_timeout,
|
|
60, 0, INT_MAX,
|
|
PGC_SUSET,
|
|
GUC_UNIT_S,
|
|
NULL, NULL, NULL);
|
|
|
|
DefineCustomIntVariable("neon.extension_server_connect_timeout",
|
|
"timeout for connecting to the extension server in seconds",
|
|
NULL,
|
|
&extension_server_connect_timeout,
|
|
60, 0, INT_MAX,
|
|
PGC_SUSET,
|
|
GUC_UNIT_S,
|
|
NULL, NULL, NULL);
|
|
|
|
/* set download_extension_file_hook */
|
|
prev_download_extension_file_hook = download_extension_file_hook;
|
|
download_extension_file_hook = neon_download_extension_file_http;
|
|
}
|