Replace etcd with storage_broker.

This is the replacement itself, the binary landed earlier. See
docs/storage_broker.md.

ref
https://github.com/neondatabase/neon/pull/2466
https://github.com/neondatabase/neon/issues/2394
This commit is contained in:
Arseny Sher
2022-09-16 13:44:28 +03:00
committed by Arseny Sher
parent 249d77c720
commit 32662ff1c4
56 changed files with 1064 additions and 2222 deletions

View File

@@ -3,7 +3,7 @@ storage:
bucket_name: neon-storage-ireland bucket_name: neon-storage-ireland
bucket_region: eu-west-1 bucket_region: eu-west-1
console_mgmt_base_url: http://neon-stress-console.local console_mgmt_base_url: http://neon-stress-console.local
etcd_endpoints: neon-stress-etcd.local:2379 broker_endpoint: http://storage-broker.neon-stress.local:50051
safekeeper_enable_s3_offload: 'false' safekeeper_enable_s3_offload: 'false'
pageserver_config_stub: pageserver_config_stub:
pg_distrib_dir: /usr/local pg_distrib_dir: /usr/local

View File

@@ -3,7 +3,7 @@ storage:
bucket_name: neon-prod-storage-ap-southeast-1 bucket_name: neon-prod-storage-ap-southeast-1
bucket_region: ap-southeast-1 bucket_region: ap-southeast-1
console_mgmt_base_url: http://console-release.local console_mgmt_base_url: http://console-release.local
etcd_endpoints: etcd-0.ap-southeast-1.aws.neon.tech:2379 broker_endpoint: https://storage-broker.epsilon.ap-southeast-1.internal.aws.neon.tech:443
pageserver_config_stub: pageserver_config_stub:
pg_distrib_dir: /usr/local pg_distrib_dir: /usr/local
remote_storage: remote_storage:

View File

@@ -3,7 +3,7 @@ storage:
bucket_name: neon-prod-storage-eu-central-1 bucket_name: neon-prod-storage-eu-central-1
bucket_region: eu-central-1 bucket_region: eu-central-1
console_mgmt_base_url: http://console-release.local console_mgmt_base_url: http://console-release.local
etcd_endpoints: etcd-0.eu-central-1.aws.neon.tech:2379 broker_endpoint: https://storage-broker.gamma.eu-central-1.internal.aws.neon.tech:443
pageserver_config_stub: pageserver_config_stub:
pg_distrib_dir: /usr/local pg_distrib_dir: /usr/local
remote_storage: remote_storage:

View File

@@ -3,7 +3,7 @@ storage:
bucket_name: neon-prod-storage-us-east-2 bucket_name: neon-prod-storage-us-east-2
bucket_region: us-east-2 bucket_region: us-east-2
console_mgmt_base_url: http://console-release.local console_mgmt_base_url: http://console-release.local
etcd_endpoints: etcd-0.us-east-2.aws.neon.tech:2379 broker_endpoint: https://storage-broker.delta.us-east-2.internal.aws.neon.tech:443
pageserver_config_stub: pageserver_config_stub:
pg_distrib_dir: /usr/local pg_distrib_dir: /usr/local
remote_storage: remote_storage:

View File

@@ -4,7 +4,7 @@ storage:
console_mgmt_base_url: http://console-release.local console_mgmt_base_url: http://console-release.local
bucket_name: zenith-storage-oregon bucket_name: zenith-storage-oregon
bucket_region: us-west-2 bucket_region: us-west-2
etcd_endpoints: zenith-1-etcd.local:2379 broker_endpoint: http://storage-broker.prod.local:50051
pageserver_config_stub: pageserver_config_stub:
pg_distrib_dir: /usr/local pg_distrib_dir: /usr/local
remote_storage: remote_storage:

View File

@@ -3,7 +3,7 @@ storage:
bucket_name: neon-dev-storage-eu-west-1 bucket_name: neon-dev-storage-eu-west-1
bucket_region: eu-west-1 bucket_region: eu-west-1
console_mgmt_base_url: http://console-staging.local console_mgmt_base_url: http://console-staging.local
etcd_endpoints: etcd-0.eu-west-1.aws.neon.build:2379 broker_endpoint: https://storage-broker.zeta.eu-west-1.internal.aws.neon.build:443
pageserver_config_stub: pageserver_config_stub:
pg_distrib_dir: /usr/local pg_distrib_dir: /usr/local
remote_storage: remote_storage:

View File

@@ -3,7 +3,7 @@ storage:
bucket_name: zenith-staging-storage-us-east-1 bucket_name: zenith-staging-storage-us-east-1
bucket_region: us-east-1 bucket_region: us-east-1
console_mgmt_base_url: http://console-staging.local console_mgmt_base_url: http://console-staging.local
etcd_endpoints: etcd-0.us-east-2.aws.neon.build:2379 broker_endpoint: http://storage-broker.staging.local:50051
pageserver_config_stub: pageserver_config_stub:
pg_distrib_dir: /usr/local pg_distrib_dir: /usr/local
remote_storage: remote_storage:

View File

@@ -3,7 +3,7 @@ storage:
bucket_name: neon-staging-storage-us-east-2 bucket_name: neon-staging-storage-us-east-2
bucket_region: us-east-2 bucket_region: us-east-2
console_mgmt_base_url: http://console-staging.local console_mgmt_base_url: http://console-staging.local
etcd_endpoints: etcd-0.us-east-2.aws.neon.build:2379 broker_endpoint: https://storage-broker.beta.us-east-2.internal.aws.neon.build:443
pageserver_config_stub: pageserver_config_stub:
pg_distrib_dir: /usr/local pg_distrib_dir: /usr/local
remote_storage: remote_storage:

View File

@@ -6,7 +6,7 @@ After=network.target auditd.service
Type=simple Type=simple
User=pageserver User=pageserver
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_PAGESERVER }} Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_PAGESERVER }}
ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoints=['{{ etcd_endpoints }}']" -D /storage/pageserver/data ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoint='{{ broker_endpoint }}'" -D /storage/pageserver/data
ExecReload=/bin/kill -HUP $MAINPID ExecReload=/bin/kill -HUP $MAINPID
KillMode=mixed KillMode=mixed
KillSignal=SIGINT KillSignal=SIGINT

View File

@@ -6,7 +6,7 @@ After=network.target auditd.service
Type=simple Type=simple
User=safekeeper User=safekeeper
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_SAFEKEEPER }} Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_SAFEKEEPER }}
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ safekeeper_s3_prefix }}"}' ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoint={{ broker_endpoint }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ safekeeper_s3_prefix }}"}'
ExecReload=/bin/kill -HUP $MAINPID ExecReload=/bin/kill -HUP $MAINPID
KillMode=mixed KillMode=mixed
KillSignal=SIGINT KillSignal=SIGINT

View File

@@ -888,7 +888,8 @@ jobs:
helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
deploy-storage-broker-staging: deploy-storage-broker:
name: deploy storage broker on old staging and old prod
runs-on: [ self-hosted, dev, x64 ] runs-on: [ self-hosted, dev, x64 ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.

901
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -79,7 +79,7 @@ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/
RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \ RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \
&& /usr/local/bin/pageserver -D /data/.neon/ --init \ && /usr/local/bin/pageserver -D /data/.neon/ --init \
-c "id=1234" \ -c "id=1234" \
-c "broker_endpoints=['http://etcd:2379']" \ -c "broker_endpoint='http://storage_broker:50051'" \
-c "pg_distrib_dir='/usr/local/'" \ -c "pg_distrib_dir='/usr/local/'" \
-c "listen_pg_addr='0.0.0.0:6400'" \ -c "listen_pg_addr='0.0.0.0:6400'" \
-c "listen_http_addr='0.0.0.0:9898'" -c "listen_http_addr='0.0.0.0:9898'"

View File

@@ -26,12 +26,12 @@ See developer documentation in [/docs/SUMMARY.md](/docs/SUMMARY.md) for more inf
* On Ubuntu or Debian, this set of packages should be sufficient to build the code: * On Ubuntu or Debian, this set of packages should be sufficient to build the code:
```bash ```bash
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \ apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
libssl-dev clang pkg-config libpq-dev etcd cmake postgresql-client protobuf-compiler libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler
``` ```
* On Fedora, these packages are needed: * On Fedora, these packages are needed:
```bash ```bash
dnf install flex bison readline-devel zlib-devel openssl-devel \ dnf install flex bison readline-devel zlib-devel openssl-devel \
libseccomp-devel perl clang cmake etcd postgresql postgresql-contrib protobuf-compiler libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler
``` ```
2. [Install Rust](https://www.rust-lang.org/tools/install) 2. [Install Rust](https://www.rust-lang.org/tools/install)
@@ -44,7 +44,7 @@ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
1. Install XCode and dependencies 1. Install XCode and dependencies
``` ```
xcode-select --install xcode-select --install
brew install protobuf etcd openssl flex bison brew install protobuf openssl flex bison
``` ```
2. [Install Rust](https://www.rust-lang.org/tools/install) 2. [Install Rust](https://www.rust-lang.org/tools/install)
@@ -123,12 +123,12 @@ Stopped pageserver 1 process with pid 2545906
# start pageserver and safekeeper # start pageserver and safekeeper
> ./target/debug/neon_local start > ./target/debug/neon_local start
Starting etcd broker using "/usr/bin/etcd" Starting neon broker at 127.0.0.1:50051
etcd started, pid: 2545996 storage_broker started, pid: 2918372
Starting pageserver at '127.0.0.1:64000' in '.neon'. Starting pageserver at '127.0.0.1:64000' in '.neon'.
pageserver started, pid: 2546005 pageserver started, pid: 2918386
Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'. Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'.
safekeeper 1 started, pid: 2546041 safekeeper 1 started, pid: 2918437
# start postgres compute node # start postgres compute node
> ./target/debug/neon_local pg start main > ./target/debug/neon_local pg start main

View File

@@ -25,5 +25,7 @@ url = "2.2.2"
pageserver_api = { path = "../libs/pageserver_api" } pageserver_api = { path = "../libs/pageserver_api" }
postgres_connection = { path = "../libs/postgres_connection" } postgres_connection = { path = "../libs/postgres_connection" }
safekeeper_api = { path = "../libs/safekeeper_api" } safekeeper_api = { path = "../libs/safekeeper_api" }
# Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy.
storage_broker = { version = "0.1", path = "../storage_broker" }
utils = { path = "../libs/utils" } utils = { path = "../libs/utils" }
workspace_hack = { version = "0.1", path = "../workspace_hack" } workspace_hack = { version = "0.1", path = "../workspace_hack" }

View File

@@ -10,5 +10,5 @@ id = 1
pg_port = 5454 pg_port = 5454
http_port = 7676 http_port = 7676
[etcd_broker] [broker]
broker_endpoints = ['http://127.0.0.1:2379'] listen_addr = '127.0.0.1:50051'

View File

@@ -8,10 +8,10 @@
use anyhow::{anyhow, bail, Context, Result}; use anyhow::{anyhow, bail, Context, Result};
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command}; use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
use control_plane::compute::ComputeControlPlane; use control_plane::compute::ComputeControlPlane;
use control_plane::local_env::{EtcdBroker, LocalEnv}; use control_plane::local_env::LocalEnv;
use control_plane::pageserver::PageServerNode; use control_plane::pageserver::PageServerNode;
use control_plane::safekeeper::SafekeeperNode; use control_plane::safekeeper::SafekeeperNode;
use control_plane::{etcd, local_env}; use control_plane::{broker, local_env};
use pageserver_api::models::TimelineInfo; use pageserver_api::models::TimelineInfo;
use pageserver_api::{ use pageserver_api::{
DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR, DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR,
@@ -22,9 +22,10 @@ use safekeeper_api::{
DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
}; };
use std::collections::{BTreeSet, HashMap}; use std::collections::{BTreeSet, HashMap};
use std::path::{Path, PathBuf}; use std::path::PathBuf;
use std::process::exit; use std::process::exit;
use std::str::FromStr; use std::str::FromStr;
use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
use utils::{ use utils::{
auth::{Claims, Scope}, auth::{Claims, Scope},
id::{NodeId, TenantId, TenantTimelineId, TimelineId}, id::{NodeId, TenantId, TenantTimelineId, TimelineId},
@@ -41,13 +42,12 @@ project_git_version!(GIT_VERSION);
const DEFAULT_PG_VERSION: &str = "14"; const DEFAULT_PG_VERSION: &str = "14";
fn default_conf(etcd_binary_path: &Path) -> String { fn default_conf() -> String {
format!( format!(
r#" r#"
# Default built-in configuration, defined in main.rs # Default built-in configuration, defined in main.rs
[etcd_broker] [broker]
broker_endpoints = ['http://localhost:2379'] listen_addr = '{DEFAULT_BROKER_ADDR}'
etcd_binary_path = '{etcd_binary_path}'
[pageserver] [pageserver]
id = {DEFAULT_PAGESERVER_ID} id = {DEFAULT_PAGESERVER_ID}
@@ -60,7 +60,6 @@ id = {DEFAULT_SAFEKEEPER_ID}
pg_port = {DEFAULT_SAFEKEEPER_PG_PORT} pg_port = {DEFAULT_SAFEKEEPER_PG_PORT}
http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT} http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT}
"#, "#,
etcd_binary_path = etcd_binary_path.display(),
pageserver_auth_type = AuthType::Trust, pageserver_auth_type = AuthType::Trust,
) )
} }
@@ -298,7 +297,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
})? })?
} else { } else {
// Built-in default config // Built-in default config
default_conf(&EtcdBroker::locate_etcd()?) default_conf()
}; };
let pg_version = init_match let pg_version = init_match
@@ -807,14 +806,14 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
} }
fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> { fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> {
etcd::start_etcd_process(env)?; broker::start_broker_process(env)?;
let pageserver = PageServerNode::from_env(env); let pageserver = PageServerNode::from_env(env);
// Postgres nodes are not started automatically // Postgres nodes are not started automatically
if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) { if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) {
eprintln!("pageserver start failed: {e}"); eprintln!("pageserver start failed: {e}");
try_stop_etcd_process(env); try_stop_storage_broker_process(env);
exit(1); exit(1);
} }
@@ -822,7 +821,7 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow
let safekeeper = SafekeeperNode::from_env(env, node); let safekeeper = SafekeeperNode::from_env(env, node);
if let Err(e) = safekeeper.start() { if let Err(e) = safekeeper.start() {
eprintln!("safekeeper '{}' start failed: {e}", safekeeper.id); eprintln!("safekeeper '{}' start failed: {e}", safekeeper.id);
try_stop_etcd_process(env); try_stop_storage_broker_process(env);
exit(1); exit(1);
} }
} }
@@ -854,14 +853,14 @@ fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<
} }
} }
try_stop_etcd_process(env); try_stop_storage_broker_process(env);
Ok(()) Ok(())
} }
fn try_stop_etcd_process(env: &local_env::LocalEnv) { fn try_stop_storage_broker_process(env: &local_env::LocalEnv) {
if let Err(e) = etcd::stop_etcd_process(env) { if let Err(e) = broker::stop_broker_process(env) {
eprintln!("etcd stop failed: {e}"); eprintln!("neon broker stop failed: {e}");
} }
} }

View File

@@ -0,0 +1,48 @@
use anyhow::Context;
use std::path::PathBuf;
use crate::{background_process, local_env};
pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
let broker = &env.broker;
let listen_addr = &broker.listen_addr;
print!("Starting neon broker at {}", listen_addr);
let args = [format!("--listen-addr={listen_addr}")];
let client = reqwest::blocking::Client::new();
background_process::start_process(
"storage_broker",
&env.base_data_dir,
&env.storage_broker_bin(),
&args,
[],
background_process::InitialPidFile::Create(&storage_broker_pid_file_path(env)),
|| {
let url = broker.client_url();
let status_url = url.join("status").with_context(|| {
format!("Failed to append /status path to broker endpoint {url}",)
})?;
let request = client
.get(status_url)
.build()
.with_context(|| format!("Failed to construct request to broker endpoint {url}"))?;
match client.execute(request) {
Ok(resp) => Ok(resp.status().is_success()),
Err(_) => Ok(false),
}
},
)
.context("Failed to spawn storage_broker subprocess")?;
Ok(())
}
pub fn stop_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
background_process::stop_process(true, "storage_broker", &storage_broker_pid_file_path(env))
}
fn storage_broker_pid_file_path(env: &local_env::LocalEnv) -> PathBuf {
env.base_data_dir.join("storage_broker.pid")
}

View File

@@ -1,78 +0,0 @@
use std::{fs, path::PathBuf};
use anyhow::Context;
use crate::{background_process, local_env};
pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
let etcd_broker = &env.etcd_broker;
print!(
"Starting etcd broker using {:?}",
etcd_broker.etcd_binary_path
);
let etcd_data_dir = env.base_data_dir.join("etcd");
fs::create_dir_all(&etcd_data_dir)
.with_context(|| format!("Failed to create etcd data dir {etcd_data_dir:?}"))?;
let client_urls = etcd_broker.comma_separated_endpoints();
let args = [
format!("--data-dir={}", etcd_data_dir.display()),
format!("--listen-client-urls={client_urls}"),
format!("--advertise-client-urls={client_urls}"),
// Set --quota-backend-bytes to keep the etcd virtual memory
// size smaller. Our test etcd clusters are very small.
// See https://github.com/etcd-io/etcd/issues/7910
"--quota-backend-bytes=100000000".to_string(),
// etcd doesn't compact (vacuum) with default settings,
// enable it to prevent space exhaustion.
"--auto-compaction-mode=revision".to_string(),
"--auto-compaction-retention=1".to_string(),
];
let pid_file_path = etcd_pid_file_path(env);
let client = reqwest::blocking::Client::new();
background_process::start_process(
"etcd",
&etcd_data_dir,
&etcd_broker.etcd_binary_path,
&args,
[],
background_process::InitialPidFile::Create(&pid_file_path),
|| {
for broker_endpoint in &etcd_broker.broker_endpoints {
let request = broker_endpoint
.join("health")
.with_context(|| {
format!(
"Failed to append /health path to broker endopint {}",
broker_endpoint
)
})
.and_then(|url| {
client.get(&url.to_string()).build().with_context(|| {
format!("Failed to construct request to etcd endpoint {url}")
})
})?;
if client.execute(request).is_ok() {
return Ok(true);
}
}
Ok(false)
},
)
.context("Failed to spawn etcd subprocess")?;
Ok(())
}
pub fn stop_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
background_process::stop_process(true, "etcd", &etcd_pid_file_path(env))
}
fn etcd_pid_file_path(env: &local_env::LocalEnv) -> PathBuf {
env.base_data_dir.join("etcd.pid")
}

View File

@@ -8,8 +8,8 @@
// //
mod background_process; mod background_process;
pub mod broker;
pub mod compute; pub mod compute;
pub mod etcd;
pub mod local_env; pub mod local_env;
pub mod pageserver; pub mod pageserver;
pub mod postgresql_conf; pub mod postgresql_conf;

View File

@@ -4,12 +4,16 @@
//! script which will use local paths. //! script which will use local paths.
use anyhow::{bail, ensure, Context}; use anyhow::{bail, ensure, Context};
use reqwest::Url; use reqwest::Url;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr}; use serde_with::{serde_as, DisplayFromStr};
use std::collections::HashMap; use std::collections::HashMap;
use std::env; use std::env;
use std::fs; use std::fs;
use std::net::IpAddr;
use std::net::Ipv4Addr;
use std::net::SocketAddr;
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::process::{Command, Stdio}; use std::process::{Command, Stdio};
use utils::{ use utils::{
@@ -62,7 +66,7 @@ pub struct LocalEnv {
#[serde(default)] #[serde(default)]
pub private_key_path: PathBuf, pub private_key_path: PathBuf,
pub etcd_broker: EtcdBroker, pub broker: NeonBroker,
pub pageserver: PageServerConf, pub pageserver: PageServerConf,
@@ -78,67 +82,26 @@ pub struct LocalEnv {
branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>, branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
} }
/// Etcd broker config for cluster internal communication. /// Broker config for cluster internal communication.
#[serde_as]
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
pub struct EtcdBroker { #[serde(default)]
/// A prefix to all to any key when pushing/polling etcd from a node. pub struct NeonBroker {
#[serde(default)] /// Broker listen address for storage nodes coordination, e.g. '127.0.0.1:50051'.
pub broker_etcd_prefix: Option<String>, pub listen_addr: SocketAddr,
/// Broker (etcd) endpoints for storage nodes coordination, e.g. 'http://127.0.0.1:2379'.
#[serde(default)]
#[serde_as(as = "Vec<DisplayFromStr>")]
pub broker_endpoints: Vec<Url>,
/// Etcd binary path to use.
#[serde(default)]
pub etcd_binary_path: PathBuf,
} }
impl EtcdBroker { // Dummy Default impl to satisfy Deserialize derive.
pub fn locate_etcd() -> anyhow::Result<PathBuf> { impl Default for NeonBroker {
let which_output = Command::new("which") fn default() -> Self {
.arg("etcd") NeonBroker {
.output() listen_addr: SocketAddr::new(IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)), 0),
.context("Failed to run 'which etcd' command")?; }
let stdout = String::from_utf8_lossy(&which_output.stdout);
ensure!(
which_output.status.success(),
"'which etcd' invocation failed. Status: {}, stdout: {stdout}, stderr: {}",
which_output.status,
String::from_utf8_lossy(&which_output.stderr)
);
let etcd_path = PathBuf::from(stdout.trim());
ensure!(
etcd_path.is_file(),
"'which etcd' invocation was successful, but the path it returned is not a file or does not exist: {}",
etcd_path.display()
);
Ok(etcd_path)
} }
}
pub fn comma_separated_endpoints(&self) -> String { impl NeonBroker {
self.broker_endpoints pub fn client_url(&self) -> Url {
.iter() Url::parse(&format!("http://{}", self.listen_addr)).expect("failed to construct url")
.map(|url| {
// URL by default adds a '/' path at the end, which is not what etcd CLI wants.
let url_string = url.as_str();
if url_string.ends_with('/') {
&url_string[0..url_string.len() - 1]
} else {
url_string
}
})
.fold(String::new(), |mut comma_separated_urls, url| {
if !comma_separated_urls.is_empty() {
comma_separated_urls.push(',');
}
comma_separated_urls.push_str(url);
comma_separated_urls
})
} }
} }
@@ -234,6 +197,10 @@ impl LocalEnv {
self.neon_distrib_dir.join("safekeeper") self.neon_distrib_dir.join("safekeeper")
} }
pub fn storage_broker_bin(&self) -> PathBuf {
self.neon_distrib_dir.join("storage_broker")
}
pub fn pg_data_dirs_path(&self) -> PathBuf { pub fn pg_data_dirs_path(&self) -> PathBuf {
self.base_data_dir.join("pgdatadirs").join("tenants") self.base_data_dir.join("pgdatadirs").join("tenants")
} }
@@ -511,8 +478,8 @@ mod tests {
"failed to parse simple config {simple_conf_toml}, reason: {simple_conf_parse_result:?}" "failed to parse simple config {simple_conf_toml}, reason: {simple_conf_parse_result:?}"
); );
let string_to_replace = "broker_endpoints = ['http://127.0.0.1:2379']"; let string_to_replace = "listen_addr = '127.0.0.1:50051'";
let spoiled_url_str = "broker_endpoints = ['!@$XOXO%^&']"; let spoiled_url_str = "listen_addr = '!@$XOXO%^&'";
let spoiled_url_toml = simple_conf_toml.replace(string_to_replace, spoiled_url_str); let spoiled_url_toml = simple_conf_toml.replace(string_to_replace, spoiled_url_str);
assert!( assert!(
spoiled_url_toml.contains(spoiled_url_str), spoiled_url_toml.contains(spoiled_url_str),

View File

@@ -96,13 +96,8 @@ impl PageServerNode {
} }
} }
pub fn initialize( // pageserver conf overrides defined by neon_local configuration.
&self, fn neon_local_overrides(&self) -> Vec<String> {
create_tenant: Option<TenantId>,
initial_timeline_id: Option<TimelineId>,
config_overrides: &[&str],
pg_version: u32,
) -> anyhow::Result<TimelineId> {
let id = format!("id={}", self.env.pageserver.id); let id = format!("id={}", self.env.pageserver.id);
// FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
let pg_distrib_dir_param = format!( let pg_distrib_dir_param = format!(
@@ -117,41 +112,32 @@ impl PageServerNode {
); );
let listen_pg_addr_param = let listen_pg_addr_param =
format!("listen_pg_addr='{}'", self.env.pageserver.listen_pg_addr); format!("listen_pg_addr='{}'", self.env.pageserver.listen_pg_addr);
let broker_endpoints_param = format!( let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
"broker_endpoints=[{}]",
self.env
.etcd_broker
.broker_endpoints
.iter()
.map(|url| format!("'{url}'"))
.collect::<Vec<_>>()
.join(",")
);
let broker_etcd_prefix_param = self
.env
.etcd_broker
.broker_etcd_prefix
.as_ref()
.map(|prefix| format!("broker_etcd_prefix='{prefix}'"));
let mut init_config_overrides = config_overrides.to_vec(); let mut overrides = vec![
init_config_overrides.push(&id); id,
init_config_overrides.push(&pg_distrib_dir_param); pg_distrib_dir_param,
init_config_overrides.push(&authg_type_param); authg_type_param,
init_config_overrides.push(&listen_http_addr_param); listen_http_addr_param,
init_config_overrides.push(&listen_pg_addr_param); listen_pg_addr_param,
init_config_overrides.push(&broker_endpoints_param); broker_endpoint_param,
];
if let Some(broker_etcd_prefix_param) = broker_etcd_prefix_param.as_deref() {
init_config_overrides.push(broker_etcd_prefix_param);
}
if self.env.pageserver.auth_type != AuthType::Trust { if self.env.pageserver.auth_type != AuthType::Trust {
init_config_overrides.push("auth_validation_public_key_path='auth_public_key.pem'"); overrides.push("auth_validation_public_key_path='auth_public_key.pem'".to_owned());
} }
overrides
}
pub fn initialize(
&self,
create_tenant: Option<TenantId>,
initial_timeline_id: Option<TimelineId>,
config_overrides: &[&str],
pg_version: u32,
) -> anyhow::Result<TimelineId> {
let mut pageserver_process = self let mut pageserver_process = self
.start_node(&init_config_overrides, &self.env.base_data_dir, true) .start_node(config_overrides, &self.env.base_data_dir, true)
.with_context(|| { .with_context(|| {
format!( format!(
"Failed to start a process for pageserver {}", "Failed to start a process for pageserver {}",
@@ -224,6 +210,9 @@ impl PageServerNode {
datadir: &Path, datadir: &Path,
update_config: bool, update_config: bool,
) -> anyhow::Result<Child> { ) -> anyhow::Result<Child> {
let mut overrides = self.neon_local_overrides();
overrides.extend(config_overrides.iter().map(|&c| c.to_owned()));
print!( print!(
"Starting pageserver at '{}' in '{}'", "Starting pageserver at '{}' in '{}'",
self.pg_connection_config.raw_address(), self.pg_connection_config.raw_address(),
@@ -242,7 +231,7 @@ impl PageServerNode {
args.push("--update-config"); args.push("--update-config");
} }
for config_override in config_overrides { for config_override in &overrides {
args.extend(["-c", config_override]); args.extend(["-c", config_override]);
} }

View File

@@ -131,13 +131,8 @@ impl SafekeeperNode {
args.push("--no-sync"); args.push("--no-sync");
} }
let comma_separated_endpoints = self.env.etcd_broker.comma_separated_endpoints(); let broker_endpoint = format!("{}", self.env.broker.client_url());
if !comma_separated_endpoints.is_empty() { args.extend(["--broker-endpoint", &broker_endpoint]);
args.extend(["--broker-endpoints", &comma_separated_endpoints]);
}
if let Some(prefix) = self.env.etcd_broker.broker_etcd_prefix.as_deref() {
args.extend(["--broker-etcd-prefix", prefix]);
}
let mut backup_threads = String::new(); let mut backup_threads = String::new();
if let Some(threads) = self.conf.backup_threads { if let Some(threads) = self.conf.backup_threads {

View File

@@ -1,29 +1,6 @@
version: '3' version: '3'
services: services:
etcd:
restart: always
image: quay.io/coreos/etcd:v3.5.4
ports:
- 2379:2379
- 2380:2380
environment:
# This signifficantly speeds up etcd and we anyway don't data persistency there.
ETCD_UNSAFE_NO_FSYNC: "1"
command:
- "etcd"
- "--auto-compaction-mode=revision"
- "--auto-compaction-retention=1"
- "--name=etcd-cluster"
- "--initial-cluster-state=new"
- "--initial-cluster-token=etcd-cluster-1"
- "--initial-cluster=etcd-cluster=http://etcd:2380"
- "--initial-advertise-peer-urls=http://etcd:2380"
- "--advertise-client-urls=http://etcd:2379"
- "--listen-client-urls=http://0.0.0.0:2379"
- "--listen-peer-urls=http://0.0.0.0:2380"
- "--quota-backend-bytes=134217728" # 128 MB
minio: minio:
restart: always restart: always
image: quay.io/minio/minio:RELEASE.2022-10-20T00-55-09Z image: quay.io/minio/minio:RELEASE.2022-10-20T00-55-09Z
@@ -56,7 +33,7 @@ services:
restart: always restart: always
image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest} image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
environment: environment:
- BROKER_ENDPOINT='http://etcd:2379' - BROKER_ENDPOINT='http://storage_broker:50051'
- AWS_ACCESS_KEY_ID=minio - AWS_ACCESS_KEY_ID=minio
- AWS_SECRET_ACCESS_KEY=password - AWS_SECRET_ACCESS_KEY=password
#- RUST_BACKTRACE=1 #- RUST_BACKTRACE=1
@@ -68,7 +45,7 @@ services:
- "-c" - "-c"
command: command:
- "/usr/local/bin/pageserver -D /data/.neon/ - "/usr/local/bin/pageserver -D /data/.neon/
-c \"broker_endpoints=[$$BROKER_ENDPOINT]\" -c \"broker_endpoint=$$BROKER_ENDPOINT\"
-c \"listen_pg_addr='0.0.0.0:6400'\" -c \"listen_pg_addr='0.0.0.0:6400'\"
-c \"listen_http_addr='0.0.0.0:9898'\" -c \"listen_http_addr='0.0.0.0:9898'\"
-c \"remote_storage={endpoint='http://minio:9000', -c \"remote_storage={endpoint='http://minio:9000',
@@ -76,7 +53,7 @@ services:
bucket_region='eu-north-1', bucket_region='eu-north-1',
prefix_in_bucket='/pageserver/'}\"" prefix_in_bucket='/pageserver/'}\""
depends_on: depends_on:
- etcd - storage_broker
- minio_create_buckets - minio_create_buckets
safekeeper1: safekeeper1:
@@ -85,7 +62,7 @@ services:
environment: environment:
- SAFEKEEPER_ADVERTISE_URL=safekeeper1:5454 - SAFEKEEPER_ADVERTISE_URL=safekeeper1:5454
- SAFEKEEPER_ID=1 - SAFEKEEPER_ID=1
- BROKER_ENDPOINT=http://etcd:2379 - BROKER_ENDPOINT=http://storage_broker:50051
- AWS_ACCESS_KEY_ID=minio - AWS_ACCESS_KEY_ID=minio
- AWS_SECRET_ACCESS_KEY=password - AWS_SECRET_ACCESS_KEY=password
#- RUST_BACKTRACE=1 #- RUST_BACKTRACE=1
@@ -99,14 +76,14 @@ services:
- "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL - "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL
--listen-http='0.0.0.0:7676' --listen-http='0.0.0.0:7676'
--id=$$SAFEKEEPER_ID --id=$$SAFEKEEPER_ID
--broker-endpoints=$$BROKER_ENDPOINT --broker-endpoint=$$BROKER_ENDPOINT
-D /data -D /data
--remote-storage=\"{endpoint='http://minio:9000', --remote-storage=\"{endpoint='http://minio:9000',
bucket_name='neon', bucket_name='neon',
bucket_region='eu-north-1', bucket_region='eu-north-1',
prefix_in_bucket='/safekeeper/'}\"" prefix_in_bucket='/safekeeper/'}\""
depends_on: depends_on:
- etcd - storage_broker
- minio_create_buckets - minio_create_buckets
safekeeper2: safekeeper2:
@@ -115,7 +92,7 @@ services:
environment: environment:
- SAFEKEEPER_ADVERTISE_URL=safekeeper2:5454 - SAFEKEEPER_ADVERTISE_URL=safekeeper2:5454
- SAFEKEEPER_ID=2 - SAFEKEEPER_ID=2
- BROKER_ENDPOINT=http://etcd:2379 - BROKER_ENDPOINT=http://storage_broker:50051
- AWS_ACCESS_KEY_ID=minio - AWS_ACCESS_KEY_ID=minio
- AWS_SECRET_ACCESS_KEY=password - AWS_SECRET_ACCESS_KEY=password
#- RUST_BACKTRACE=1 #- RUST_BACKTRACE=1
@@ -129,14 +106,14 @@ services:
- "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL - "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL
--listen-http='0.0.0.0:7676' --listen-http='0.0.0.0:7676'
--id=$$SAFEKEEPER_ID --id=$$SAFEKEEPER_ID
--broker-endpoints=$$BROKER_ENDPOINT --broker-endpoint=$$BROKER_ENDPOINT
-D /data -D /data
--remote-storage=\"{endpoint='http://minio:9000', --remote-storage=\"{endpoint='http://minio:9000',
bucket_name='neon', bucket_name='neon',
bucket_region='eu-north-1', bucket_region='eu-north-1',
prefix_in_bucket='/safekeeper/'}\"" prefix_in_bucket='/safekeeper/'}\""
depends_on: depends_on:
- etcd - storage_broker
- minio_create_buckets - minio_create_buckets
safekeeper3: safekeeper3:
@@ -145,7 +122,7 @@ services:
environment: environment:
- SAFEKEEPER_ADVERTISE_URL=safekeeper3:5454 - SAFEKEEPER_ADVERTISE_URL=safekeeper3:5454
- SAFEKEEPER_ID=3 - SAFEKEEPER_ID=3
- BROKER_ENDPOINT=http://etcd:2379 - BROKER_ENDPOINT=http://storage_broker:50051
- AWS_ACCESS_KEY_ID=minio - AWS_ACCESS_KEY_ID=minio
- AWS_SECRET_ACCESS_KEY=password - AWS_SECRET_ACCESS_KEY=password
#- RUST_BACKTRACE=1 #- RUST_BACKTRACE=1
@@ -159,16 +136,25 @@ services:
- "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL - "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL
--listen-http='0.0.0.0:7676' --listen-http='0.0.0.0:7676'
--id=$$SAFEKEEPER_ID --id=$$SAFEKEEPER_ID
--broker-endpoints=$$BROKER_ENDPOINT --broker-endpoint=$$BROKER_ENDPOINT
-D /data -D /data
--remote-storage=\"{endpoint='http://minio:9000', --remote-storage=\"{endpoint='http://minio:9000',
bucket_name='neon', bucket_name='neon',
bucket_region='eu-north-1', bucket_region='eu-north-1',
prefix_in_bucket='/safekeeper/'}\"" prefix_in_bucket='/safekeeper/'}\""
depends_on: depends_on:
- etcd - storage_broker
- minio_create_buckets - minio_create_buckets
storage_broker:
restart: always
image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
ports:
- 50051:50051
command:
- "storage_broker"
- "--listen-addr=0.0.0.0:50051"
compute: compute:
restart: always restart: always
build: build:

View File

@@ -2,7 +2,7 @@
### Overview ### Overview
We use JWT tokens in communication between almost all components (compute, pageserver, safekeeper, CLI) regardless of the protocol used (HTTP/PostgreSQL). We use JWT tokens in communication between almost all components (compute, pageserver, safekeeper, CLI) regardless of the protocol used (HTTP/PostgreSQL).
Etcd currently has no authentication. storage_broker currently has no authentication.
Authentication is optional and is disabled by default for easier debugging. Authentication is optional and is disabled by default for easier debugging.
It is used in some tests, though. It is used in some tests, though.
Note that we do not cover authentication with `pg.neon.tech` here. Note that we do not cover authentication with `pg.neon.tech` here.
@@ -84,7 +84,7 @@ the scope is the tenant and the token is usually passed through the
Pageserver keeps track of multiple tenants, each having multiple timelines. Pageserver keeps track of multiple tenants, each having multiple timelines.
For each timeline, it connects to the corresponding Safekeeper. For each timeline, it connects to the corresponding Safekeeper.
Information about "corresponding Safekeeper" is published by Safekeepers Information about "corresponding Safekeeper" is published by Safekeepers
in the Etcd, but they do not publish access tokens, otherwise what is in the storage_broker, but they do not publish access tokens, otherwise what is
the point of authentication. the point of authentication.
Pageserver keeps a connection to some set of Safekeepers, which Pageserver keeps a connection to some set of Safekeepers, which

View File

@@ -23,9 +23,9 @@ We build all images after a successful `release` tests run and push automaticall
You can see a [docker compose](https://docs.docker.com/compose/) example to create a neon cluster in [/docker-compose/docker-compose.yml](/docker-compose/docker-compose.yml). It creates the following conatainers. You can see a [docker compose](https://docs.docker.com/compose/) example to create a neon cluster in [/docker-compose/docker-compose.yml](/docker-compose/docker-compose.yml). It creates the following conatainers.
- etcd x 1
- pageserver x 1 - pageserver x 1
- safekeeper x 3 - safekeeper x 3
- storage_broker x 1
- compute x 1 - compute x 1
- MinIO x 1 # This is Amazon S3 compatible object storage - MinIO x 1 # This is Amazon S3 compatible object storage
@@ -41,7 +41,7 @@ $ cd docker-compose/docker-compose.yml
$ docker-compose down # remove the conainers if exists $ docker-compose down # remove the conainers if exists
$ PG_VERSION=15 TAG=2221 docker-compose up --build -d # You can specify the postgres and image version $ PG_VERSION=15 TAG=2221 docker-compose up --build -d # You can specify the postgres and image version
Creating network "dockercompose_default" with the default driver Creating network "dockercompose_default" with the default driver
Creating dockercompose_etcd3_1 ... Creating docker-compose_storage_broker_1 ... done
(...omit...) (...omit...)
``` ```

View File

@@ -10,7 +10,6 @@ the values in the config file, if any are specified for the same key and get int
```toml ```toml
# Initial configuration file created by 'pageserver --init' # Initial configuration file created by 'pageserver --init'
listen_pg_addr = '127.0.0.1:64000' listen_pg_addr = '127.0.0.1:64000'
listen_http_addr = '127.0.0.1:9898' listen_http_addr = '127.0.0.1:9898'
@@ -25,13 +24,12 @@ max_file_descriptors = '100'
# initial superuser role name to use when creating a new tenant # initial superuser role name to use when creating a new tenant
initial_superuser_name = 'cloud_admin' initial_superuser_name = 'cloud_admin'
broker_etcd_prefix = 'neon' broker_endpoint = 'http://127.0.0.1:50051'
broker_endpoints = ['some://etcd']
# [remote_storage] # [remote_storage]
``` ```
The config above shows default values for all basic pageserver settings, besides `broker_endpoints`: that one has to be set by the user, The config above shows default values for all basic pageserver settings, besides `broker_endpoint`: that one has to be set by the user,
see the corresponding section below. see the corresponding section below.
Pageserver uses default values for all files that are missing in the config, so it's not a hard error to leave the config blank. Pageserver uses default values for all files that are missing in the config, so it's not a hard error to leave the config blank.
Yet, it validates the config values it can (e.g. postgres install dir) and errors if the validation fails, refusing to start. Yet, it validates the config values it can (e.g. postgres install dir) and errors if the validation fails, refusing to start.
@@ -50,16 +48,10 @@ Example: `${PAGESERVER_BIN} -c "checkpoint_timeout = '10 m'" -c "remote_storage=
Note that TOML distinguishes between strings and integers, the former require single or double quotes around them. Note that TOML distinguishes between strings and integers, the former require single or double quotes around them.
#### broker_endpoints #### broker_endpoint
A list of endpoints (etcd currently) to connect and pull the information from. A storage broker endpoint to connect and pull the information from. Default is
Mandatory, does not have a default, since requires etcd to be started as a separate process, `'http://127.0.0.1:50051'`.
and its connection url should be specified separately.
#### broker_etcd_prefix
A prefix to add for every etcd key used, to separate one group of related instances from another, in the same cluster.
Default is `neon`.
#### checkpoint_distance #### checkpoint_distance

View File

@@ -1,18 +0,0 @@
[package]
name = "etcd_broker"
version = "0.1.0"
edition = "2021"
[dependencies]
etcd-client = "0.9.0"
regex = "1.4.5"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
serde_with = "2.0"
once_cell = "1.13.0"
utils = { path = "../utils" }
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
tokio = "1"
tracing = "0.1"
thiserror = "1"

View File

@@ -1,209 +0,0 @@
//! A set of primitives to access a shared data/updates, propagated via etcd broker (not persistent).
//! Intended to connect services to each other, not to store their data.
/// All broker keys, that are used when dealing with etcd.
pub mod subscription_key;
/// All broker values, possible to use when dealing with etcd.
pub mod subscription_value;
use std::str::FromStr;
use serde::de::DeserializeOwned;
use subscription_key::SubscriptionKey;
use tokio::{sync::mpsc, task::JoinHandle};
use tracing::*;
use crate::subscription_key::SubscriptionFullKey;
pub use etcd_client::*;
/// Default value to use for prefixing to all etcd keys with.
/// This way allows isolating safekeeper/pageserver groups in the same etcd cluster.
pub const DEFAULT_NEON_BROKER_ETCD_PREFIX: &str = "neon";
/// A way to control the data retrieval from a certain subscription.
pub struct BrokerSubscription<V> {
/// An unbounded channel to fetch the relevant etcd updates from.
pub value_updates: mpsc::UnboundedReceiver<BrokerUpdate<V>>,
key: SubscriptionKey,
/// A subscription task handle, to allow waiting on it for the task to complete.
/// Both the updates channel and the handle require `&mut`, so it's better to keep
/// both `pub` to allow using both in the same structures without borrow checker complaining.
pub watcher_handle: JoinHandle<Result<(), BrokerError>>,
watcher: Watcher,
}
impl<V> BrokerSubscription<V> {
/// Cancels the subscription, stopping the data poller and waiting for it to shut down.
pub async fn cancel(mut self) -> Result<(), BrokerError> {
self.watcher.cancel().await.map_err(|e| {
BrokerError::EtcdClient(
e,
format!("Failed to cancel broker subscription, kind: {:?}", self.key),
)
})?;
match (&mut self.watcher_handle).await {
Ok(res) => res,
Err(e) => {
if e.is_cancelled() {
// don't error on the tasks that are cancelled already
Ok(())
} else {
Err(BrokerError::InternalError(format!(
"Panicked during broker subscription task, kind: {:?}, error: {e}",
self.key
)))
}
}
}
}
}
impl<V> Drop for BrokerSubscription<V> {
fn drop(&mut self) {
// we poll data from etcd into the channel in the same struct, so if the whole struct gets dropped,
// no more data is used by the receiver and it's safe to cancel and drop the whole etcd subscription task.
self.watcher_handle.abort();
}
}
/// An update from the etcd broker.
pub struct BrokerUpdate<V> {
/// Etcd generation version, the bigger the more actual the data is.
pub etcd_version: i64,
/// Etcd key for the corresponding value, parsed from the broker KV.
pub key: SubscriptionFullKey,
/// Current etcd value, parsed from the broker KV.
pub value: V,
}
#[derive(Debug, thiserror::Error)]
pub enum BrokerError {
#[error("Etcd client error: {0}. Context: {1}")]
EtcdClient(etcd_client::Error, String),
#[error("Error during parsing etcd key: {0}")]
KeyNotParsed(String),
#[error("Internal error: {0}")]
InternalError(String),
}
/// Creates a background task to poll etcd for timeline updates from safekeepers.
/// Stops and returns `Err` on any error during etcd communication.
/// Watches the key changes until either the watcher is cancelled via etcd or the subscription cancellation handle,
/// exiting normally in such cases.
/// Etcd values are parsed as json fukes into a type, specified in the generic patameter.
pub async fn subscribe_for_json_values<V>(
client: &mut Client,
key: SubscriptionKey,
) -> Result<BrokerSubscription<V>, BrokerError>
where
V: DeserializeOwned + Send + 'static,
{
subscribe_for_values(client, key, |_, value_str| {
match serde_json::from_str::<V>(value_str) {
Ok(value) => Some(value),
Err(e) => {
error!("Failed to parse value str '{value_str}': {e}");
None
}
}
})
.await
}
/// Same as [`subscribe_for_json_values`], but allows to specify a custom parser of a etcd value string.
pub async fn subscribe_for_values<P, V>(
client: &mut Client,
key: SubscriptionKey,
value_parser: P,
) -> Result<BrokerSubscription<V>, BrokerError>
where
V: Send + 'static,
P: Fn(SubscriptionFullKey, &str) -> Option<V> + Send + 'static,
{
info!("Subscribing to broker value updates, key: {key:?}");
let subscription_key = key.clone();
let (watcher, mut stream) = client
.watch(key.watch_key(), Some(WatchOptions::new().with_prefix()))
.await
.map_err(|e| {
BrokerError::EtcdClient(
e,
format!("Failed to init the watch for subscription {key:?}"),
)
})?;
let (value_updates_sender, value_updates_receiver) = mpsc::unbounded_channel();
let watcher_handle = tokio::spawn(async move {
while let Some(resp) = stream.message().await.map_err(|e| BrokerError::InternalError(format!(
"Failed to get messages from the subscription stream, kind: {:?}, error: {e}", key.kind
)))? {
if resp.canceled() {
info!("Watch for timeline updates subscription was canceled, exiting");
break;
}
let events = resp.events();
debug!("Processing {} events", events.len());
for event in events {
if EventType::Put == event.event_type() {
if let Some(new_etcd_kv) = event.kv() {
match parse_etcd_kv(new_etcd_kv, &value_parser, &key.cluster_prefix) {
Ok(Some((key, value))) => if let Err(e) = value_updates_sender.send(BrokerUpdate {
etcd_version: new_etcd_kv.version(),
key,
value,
}) {
info!("Broker value updates for key {key:?} sender got dropped, exiting: {e}");
break;
},
Ok(None) => debug!("Ignoring key {key:?} : no value was returned by the parser"),
Err(BrokerError::KeyNotParsed(e)) => debug!("Unexpected key {key:?} for timeline update: {e}"),
Err(e) => error!("Failed to represent etcd KV {new_etcd_kv:?}: {e}"),
};
}
}
}
}
Ok(())
}.instrument(info_span!("etcd_broker")));
Ok(BrokerSubscription {
key: subscription_key,
value_updates: value_updates_receiver,
watcher_handle,
watcher,
})
}
fn parse_etcd_kv<P, V>(
kv: &KeyValue,
value_parser: &P,
cluster_prefix: &str,
) -> Result<Option<(SubscriptionFullKey, V)>, BrokerError>
where
P: Fn(SubscriptionFullKey, &str) -> Option<V>,
{
let key_str = kv.key_str().map_err(|e| {
BrokerError::EtcdClient(e, "Failed to extract key str out of etcd KV".to_string())
})?;
let value_str = kv.value_str().map_err(|e| {
BrokerError::EtcdClient(e, "Failed to extract value str out of etcd KV".to_string())
})?;
if !key_str.starts_with(cluster_prefix) {
return Err(BrokerError::KeyNotParsed(format!(
"KV has unexpected key '{key_str}' that does not start with cluster prefix {cluster_prefix}"
)));
}
let key = SubscriptionFullKey::from_str(&key_str[cluster_prefix.len()..]).map_err(|e| {
BrokerError::KeyNotParsed(format!("Failed to parse KV key '{key_str}': {e}"))
})?;
Ok(value_parser(key, value_str).map(|value| (key, value)))
}

View File

@@ -1,310 +0,0 @@
//! Etcd broker keys, used in the project and shared between instances.
//! The keys are split into two categories:
//!
//! * [`SubscriptionFullKey`] full key format: `<cluster_prefix>/<tenant>/<timeline>/<node_kind>/<operation>/<node_id>`
//! Always returned from etcd in this form, always start with the user key provided.
//!
//! * [`SubscriptionKey`] user input key format: always partial, since it's unknown which `node_id`'s are available.
//! Full key always starts with the user input one, due to etcd subscription properties.
use std::{fmt::Display, str::FromStr};
use once_cell::sync::Lazy;
use regex::{Captures, Regex};
use utils::id::{NodeId, TenantId, TenantTimelineId};
/// The subscription kind to the timeline updates from safekeeper.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct SubscriptionKey {
/// Generic cluster prefix, allowing to use the same etcd instance by multiple logic groups.
pub cluster_prefix: String,
/// The subscription kind.
pub kind: SubscriptionKind,
}
/// All currently possible key kinds of a etcd broker subscription.
/// Etcd works so, that every key that starts with the subbscription key given is considered matching and
/// returned as part of the subscrption.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SubscriptionKind {
/// Get every update in etcd.
All,
/// Get etcd updates for any timeiline of a certain tenant, affected by any operation from any node kind.
TenantTimelines(TenantId),
/// Get etcd updates for a certain timeline of a tenant, affected by any operation from any node kind.
Timeline(TenantTimelineId),
/// Get etcd timeline updates, specific to a certain node kind.
Node(TenantTimelineId, NodeKind),
/// Get etcd timeline updates for a certain operation on specific nodes.
Operation(TenantTimelineId, NodeKind, OperationKind),
}
/// All kinds of nodes, able to write into etcd.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum NodeKind {
Safekeeper,
Pageserver,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum OperationKind {
Safekeeper(SkOperationKind),
}
/// Current operations, running inside the safekeeper node.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SkOperationKind {
TimelineInfo,
WalBackup,
}
static SUBSCRIPTION_FULL_KEY_REGEX: Lazy<Regex> = Lazy::new(|| {
Regex::new("/([[:xdigit:]]+)/([[:xdigit:]]+)/([^/]+)/([^/]+)/([[:digit:]]+)$")
.expect("wrong subscription full etcd key regex")
});
/// Full key, received from etcd during any of the component's work.
/// No other etcd keys are considered during system's work.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct SubscriptionFullKey {
pub id: TenantTimelineId,
pub node_kind: NodeKind,
pub operation: OperationKind,
pub node_id: NodeId,
}
impl SubscriptionKey {
/// Subscribes for all etcd updates.
pub fn all(cluster_prefix: String) -> Self {
SubscriptionKey {
cluster_prefix,
kind: SubscriptionKind::All,
}
}
/// Subscribes to a given timeline info updates from safekeepers.
pub fn sk_timeline_info(cluster_prefix: String, timeline: TenantTimelineId) -> Self {
Self {
cluster_prefix,
kind: SubscriptionKind::Operation(
timeline,
NodeKind::Safekeeper,
OperationKind::Safekeeper(SkOperationKind::TimelineInfo),
),
}
}
/// Subscribes to all timeine updates during specific operations, running on the corresponding nodes.
pub fn operation(
cluster_prefix: String,
timeline: TenantTimelineId,
node_kind: NodeKind,
operation: OperationKind,
) -> Self {
Self {
cluster_prefix,
kind: SubscriptionKind::Operation(timeline, node_kind, operation),
}
}
/// Etcd key to use for watching a certain timeline updates from safekeepers.
pub fn watch_key(&self) -> String {
let cluster_prefix = &self.cluster_prefix;
match self.kind {
SubscriptionKind::All => cluster_prefix.to_string(),
SubscriptionKind::TenantTimelines(tenant_id) => {
format!("{cluster_prefix}/{tenant_id}")
}
SubscriptionKind::Timeline(id) => {
format!("{cluster_prefix}/{id}")
}
SubscriptionKind::Node(id, node_kind) => {
format!("{cluster_prefix}/{id}/{node_kind}")
}
SubscriptionKind::Operation(id, node_kind, operation_kind) => {
format!("{cluster_prefix}/{id}/{node_kind}/{operation_kind}")
}
}
}
}
impl Display for OperationKind {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
OperationKind::Safekeeper(o) => o.fmt(f),
}
}
}
impl FromStr for OperationKind {
type Err = String;
fn from_str(operation_kind_str: &str) -> Result<Self, Self::Err> {
match operation_kind_str {
"timeline_info" => Ok(OperationKind::Safekeeper(SkOperationKind::TimelineInfo)),
"wal_backup" => Ok(OperationKind::Safekeeper(SkOperationKind::WalBackup)),
_ => Err(format!("Unknown operation kind: {operation_kind_str}")),
}
}
}
impl Display for SubscriptionFullKey {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let Self {
id,
node_kind,
operation,
node_id,
} = self;
write!(f, "{id}/{node_kind}/{operation}/{node_id}")
}
}
impl FromStr for SubscriptionFullKey {
type Err = String;
fn from_str(subscription_kind_str: &str) -> Result<Self, Self::Err> {
let key_captures = match SUBSCRIPTION_FULL_KEY_REGEX.captures(subscription_kind_str) {
Some(captures) => captures,
None => {
return Err(format!(
"Subscription kind str does not match a subscription full key regex {}",
SUBSCRIPTION_FULL_KEY_REGEX.as_str()
));
}
};
Ok(Self {
id: TenantTimelineId::new(
parse_capture(&key_captures, 1)?,
parse_capture(&key_captures, 2)?,
),
node_kind: parse_capture(&key_captures, 3)?,
operation: parse_capture(&key_captures, 4)?,
node_id: NodeId(parse_capture(&key_captures, 5)?),
})
}
}
fn parse_capture<T>(caps: &Captures, index: usize) -> Result<T, String>
where
T: FromStr,
<T as FromStr>::Err: Display,
{
let capture_match = caps
.get(index)
.ok_or_else(|| format!("Failed to get capture match at index {index}"))?
.as_str();
capture_match.parse().map_err(|e| {
format!(
"Failed to parse {} from {capture_match}: {e}",
std::any::type_name::<T>()
)
})
}
impl Display for NodeKind {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::Safekeeper => write!(f, "safekeeper"),
Self::Pageserver => write!(f, "pageserver"),
}
}
}
impl FromStr for NodeKind {
type Err = String;
fn from_str(node_kind_str: &str) -> Result<Self, Self::Err> {
match node_kind_str {
"safekeeper" => Ok(Self::Safekeeper),
"pageserver" => Ok(Self::Pageserver),
_ => Err(format!("Invalid node kind: {node_kind_str}")),
}
}
}
impl Display for SkOperationKind {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::TimelineInfo => write!(f, "timeline_info"),
Self::WalBackup => write!(f, "wal_backup"),
}
}
}
impl FromStr for SkOperationKind {
type Err = String;
fn from_str(operation_str: &str) -> Result<Self, Self::Err> {
match operation_str {
"timeline_info" => Ok(Self::TimelineInfo),
"wal_backup" => Ok(Self::WalBackup),
_ => Err(format!("Invalid operation: {operation_str}")),
}
}
}
#[cfg(test)]
mod tests {
use utils::id::TimelineId;
use super::*;
#[test]
fn full_cluster_key_parsing() {
let prefix = "neon";
let node_kind = NodeKind::Safekeeper;
let operation_kind = OperationKind::Safekeeper(SkOperationKind::WalBackup);
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
let id = TenantTimelineId::new(tenant_id, timeline_id);
let node_id = NodeId(1);
let timeline_subscription_keys = [
SubscriptionKey {
cluster_prefix: prefix.to_string(),
kind: SubscriptionKind::All,
},
SubscriptionKey {
cluster_prefix: prefix.to_string(),
kind: SubscriptionKind::TenantTimelines(tenant_id),
},
SubscriptionKey {
cluster_prefix: prefix.to_string(),
kind: SubscriptionKind::Timeline(id),
},
SubscriptionKey {
cluster_prefix: prefix.to_string(),
kind: SubscriptionKind::Node(id, node_kind),
},
SubscriptionKey {
cluster_prefix: prefix.to_string(),
kind: SubscriptionKind::Operation(id, node_kind, operation_kind),
},
];
let full_key_string = format!(
"{}/{node_id}",
timeline_subscription_keys.last().unwrap().watch_key()
);
for key in timeline_subscription_keys {
assert!(full_key_string.starts_with(&key.watch_key()), "Full key '{full_key_string}' should start with any of the keys, keys, but {key:?} did not match");
}
let full_key = SubscriptionFullKey::from_str(&full_key_string).unwrap_or_else(|e| {
panic!("Failed to parse {full_key_string} as a subscription full key: {e}")
});
assert_eq!(
full_key,
SubscriptionFullKey {
id,
node_kind,
operation: operation_kind,
node_id
}
)
}
}

View File

@@ -1,38 +0,0 @@
//! Module for the values to put into etcd.
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use utils::lsn::Lsn;
/// Data about safekeeper's timeline. Fields made optional for easy migrations.
#[serde_as]
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct SkTimelineInfo {
/// Term of the last entry.
pub last_log_term: Option<u64>,
/// LSN of the last record.
#[serde_as(as = "Option<DisplayFromStr>")]
#[serde(default)]
pub flush_lsn: Option<Lsn>,
/// Up to which LSN safekeeper regards its WAL as committed.
#[serde_as(as = "Option<DisplayFromStr>")]
#[serde(default)]
pub commit_lsn: Option<Lsn>,
/// LSN up to which safekeeper has backed WAL.
#[serde_as(as = "Option<DisplayFromStr>")]
#[serde(default)]
pub backup_lsn: Option<Lsn>,
/// LSN of last checkpoint uploaded by pageserver.
#[serde_as(as = "Option<DisplayFromStr>")]
#[serde(default)]
pub remote_consistent_lsn: Option<Lsn>,
#[serde_as(as = "Option<DisplayFromStr>")]
#[serde(default)]
pub peer_horizon_lsn: Option<Lsn>,
#[serde_as(as = "Option<DisplayFromStr>")]
#[serde(default)]
pub local_start_lsn: Option<Lsn>,
/// A connection string to use for WAL receiving.
#[serde(default)]
pub safekeeper_connstr: Option<String>,
}

View File

@@ -22,3 +22,40 @@ pub struct TimelineCreateRequest {
// If not passed, it is assigned to the beginning of commit_lsn segment. // If not passed, it is assigned to the beginning of commit_lsn segment.
pub local_start_lsn: Option<Lsn>, pub local_start_lsn: Option<Lsn>,
} }
fn lsn_invalid() -> Lsn {
Lsn::INVALID
}
/// Data about safekeeper's timeline, mirrors broker.proto.
#[serde_as]
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct SkTimelineInfo {
/// Term of the last entry.
pub last_log_term: Option<u64>,
/// LSN of the last record.
#[serde_as(as = "DisplayFromStr")]
#[serde(default = "lsn_invalid")]
pub flush_lsn: Lsn,
/// Up to which LSN safekeeper regards its WAL as committed.
#[serde_as(as = "DisplayFromStr")]
#[serde(default = "lsn_invalid")]
pub commit_lsn: Lsn,
/// LSN up to which safekeeper has backed WAL.
#[serde_as(as = "DisplayFromStr")]
#[serde(default = "lsn_invalid")]
pub backup_lsn: Lsn,
/// LSN of last checkpoint uploaded by pageserver.
#[serde_as(as = "DisplayFromStr")]
#[serde(default = "lsn_invalid")]
pub remote_consistent_lsn: Lsn,
#[serde_as(as = "DisplayFromStr")]
#[serde(default = "lsn_invalid")]
pub peer_horizon_lsn: Lsn,
#[serde_as(as = "DisplayFromStr")]
#[serde(default = "lsn_invalid")]
pub local_start_lsn: Lsn,
/// A connection string to use for WAL receiving.
#[serde(default)]
pub safekeeper_connstr: Option<String>,
}

View File

@@ -59,13 +59,13 @@ tracing = "0.1.36"
url = "2" url = "2"
walkdir = "2.3.2" walkdir = "2.3.2"
etcd_broker = { path = "../libs/etcd_broker" }
metrics = { path = "../libs/metrics" } metrics = { path = "../libs/metrics" }
pageserver_api = { path = "../libs/pageserver_api" } pageserver_api = { path = "../libs/pageserver_api" }
postgres_connection = { path = "../libs/postgres_connection" } postgres_connection = { path = "../libs/postgres_connection" }
postgres_ffi = { path = "../libs/postgres_ffi" } postgres_ffi = { path = "../libs/postgres_ffi" }
pq_proto = { path = "../libs/pq_proto" } pq_proto = { path = "../libs/pq_proto" }
remote_storage = { path = "../libs/remote_storage" } remote_storage = { path = "../libs/remote_storage" }
storage_broker = { version = "0.1", path = "../storage_broker" }
tenant_size_model = { path = "../libs/tenant_size_model" } tenant_size_model = { path = "../libs/tenant_size_model" }
utils = { path = "../libs/utils" } utils = { path = "../libs/utils" }
workspace_hack = { version = "0.1", path = "../workspace_hack" } workspace_hack = { version = "0.1", path = "../workspace_hack" }

View File

@@ -247,7 +247,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
// start profiler (if enabled) // start profiler (if enabled)
let profiler_guard = profiling::init_profiler(conf); let profiler_guard = profiling::init_profiler(conf);
WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_etcd_client(conf))?; WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_broker_client(conf))?;
// initialize authentication for incoming connections // initialize authentication for incoming connections
let auth = match &conf.auth_type { let auth = match &conf.auth_type {

View File

@@ -7,6 +7,7 @@
use anyhow::{anyhow, bail, ensure, Context, Result}; use anyhow::{anyhow, bail, ensure, Context, Result};
use remote_storage::{RemotePath, RemoteStorageConfig}; use remote_storage::{RemotePath, RemoteStorageConfig};
use std::env; use std::env;
use storage_broker::Uri;
use utils::crashsafe::path_with_suffix_extension; use utils::crashsafe::path_with_suffix_extension;
use utils::id::ConnectionId; use utils::id::ConnectionId;
@@ -18,7 +19,7 @@ use std::sync::Arc;
use std::time::Duration; use std::time::Duration;
use toml_edit; use toml_edit;
use toml_edit::{Document, Item}; use toml_edit::{Document, Item};
use url::Url;
use utils::{ use utils::{
id::{NodeId, TenantId, TimelineId}, id::{NodeId, TenantId, TimelineId},
logging::LogFormat, logging::LogFormat,
@@ -39,6 +40,7 @@ pub mod defaults {
DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
DEFAULT_PG_LISTEN_PORT, DEFAULT_PG_LISTEN_PORT,
}; };
pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;
pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s"; pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s";
pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";
@@ -59,7 +61,6 @@ pub mod defaults {
pub const DEFAULT_CONFIG_FILE: &str = formatcp!( pub const DEFAULT_CONFIG_FILE: &str = formatcp!(
r###" r###"
# Initial configuration file created by 'pageserver --init' # Initial configuration file created by 'pageserver --init'
#listen_pg_addr = '{DEFAULT_PG_LISTEN_ADDR}' #listen_pg_addr = '{DEFAULT_PG_LISTEN_ADDR}'
#listen_http_addr = '{DEFAULT_HTTP_LISTEN_ADDR}' #listen_http_addr = '{DEFAULT_HTTP_LISTEN_ADDR}'
@@ -71,6 +72,8 @@ pub mod defaults {
# initial superuser role name to use when creating a new tenant # initial superuser role name to use when creating a new tenant
#initial_superuser_name = '{DEFAULT_SUPERUSER}' #initial_superuser_name = '{DEFAULT_SUPERUSER}'
#broker_endpoint = '{BROKER_DEFAULT_ENDPOINT}'
#log_format = '{DEFAULT_LOG_FORMAT}' #log_format = '{DEFAULT_LOG_FORMAT}'
#concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}' #concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}'
@@ -132,12 +135,8 @@ pub struct PageServerConf {
pub profiling: ProfilingConfig, pub profiling: ProfilingConfig,
pub default_tenant_conf: TenantConf, pub default_tenant_conf: TenantConf,
/// A prefix to add in etcd brokers before every key. /// Storage broker endpoints to connect to.
/// Can be used for isolating different pageserver groups within the same etcd cluster. pub broker_endpoint: Uri,
pub broker_etcd_prefix: String,
/// Etcd broker endpoints to connect to.
pub broker_endpoints: Vec<Url>,
pub log_format: LogFormat, pub log_format: LogFormat,
@@ -148,8 +147,7 @@ pub struct PageServerConf {
/// We do not want to store this in a PageServerConf because the latter may be logged /// We do not want to store this in a PageServerConf because the latter may be logged
/// and/or serialized at a whim, while the token is secret. Currently this token is the /// and/or serialized at a whim, while the token is secret. Currently this token is the
/// same for accessing all tenants/timelines, but may become per-tenant/per-timeline in /// same for accessing all tenants/timelines, but may become per-tenant/per-timeline in
/// the future, more tokens and auth may arrive for etcd and/or its rewrite (see /// the future, more tokens and auth may arrive for storage broker, completely changing the logic.
/// https://github.com/neondatabase/neon/issues/2394), completely changing the logic.
/// Hence, we resort to a global variable for now instead of passing the token from the /// Hence, we resort to a global variable for now instead of passing the token from the
/// startup code to the connection code through a dozen layers. /// startup code to the connection code through a dozen layers.
pub static SAFEKEEPER_AUTH_TOKEN: OnceCell<Arc<String>> = OnceCell::new(); pub static SAFEKEEPER_AUTH_TOKEN: OnceCell<Arc<String>> = OnceCell::new();
@@ -216,8 +214,7 @@ struct PageServerConfigBuilder {
id: BuilderValue<NodeId>, id: BuilderValue<NodeId>,
profiling: BuilderValue<ProfilingConfig>, profiling: BuilderValue<ProfilingConfig>,
broker_etcd_prefix: BuilderValue<String>, broker_endpoint: BuilderValue<Uri>,
broker_endpoints: BuilderValue<Vec<Url>>,
log_format: BuilderValue<LogFormat>, log_format: BuilderValue<LogFormat>,
@@ -247,8 +244,9 @@ impl Default for PageServerConfigBuilder {
remote_storage_config: Set(None), remote_storage_config: Set(None),
id: NotSet, id: NotSet,
profiling: Set(ProfilingConfig::Disabled), profiling: Set(ProfilingConfig::Disabled),
broker_etcd_prefix: Set(etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string()), broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT
broker_endpoints: Set(Vec::new()), .parse()
.expect("failed to parse default broker endpoint")),
log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()), log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()), concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()),
@@ -308,12 +306,8 @@ impl PageServerConfigBuilder {
self.remote_storage_config = BuilderValue::Set(remote_storage_config) self.remote_storage_config = BuilderValue::Set(remote_storage_config)
} }
pub fn broker_endpoints(&mut self, broker_endpoints: Vec<Url>) { pub fn broker_endpoint(&mut self, broker_endpoint: Uri) {
self.broker_endpoints = BuilderValue::Set(broker_endpoints) self.broker_endpoint = BuilderValue::Set(broker_endpoint)
}
pub fn broker_etcd_prefix(&mut self, broker_etcd_prefix: String) {
self.broker_etcd_prefix = BuilderValue::Set(broker_etcd_prefix)
} }
pub fn id(&mut self, node_id: NodeId) { pub fn id(&mut self, node_id: NodeId) {
@@ -368,12 +362,9 @@ impl PageServerConfigBuilder {
profiling: self.profiling.ok_or(anyhow!("missing profiling"))?, profiling: self.profiling.ok_or(anyhow!("missing profiling"))?,
// TenantConf is handled separately // TenantConf is handled separately
default_tenant_conf: TenantConf::default(), default_tenant_conf: TenantConf::default(),
broker_endpoints: self broker_endpoint: self
.broker_endpoints .broker_endpoint
.ok_or(anyhow!("No broker endpoints provided"))?, .ok_or(anyhow!("No broker endpoints provided"))?,
broker_etcd_prefix: self
.broker_etcd_prefix
.ok_or(anyhow!("missing broker_etcd_prefix"))?,
log_format: self.log_format.ok_or(anyhow!("missing log_format"))?, log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
concurrent_tenant_size_logical_size_queries: self concurrent_tenant_size_logical_size_queries: self
.concurrent_tenant_size_logical_size_queries .concurrent_tenant_size_logical_size_queries
@@ -540,17 +531,7 @@ impl PageServerConf {
} }
"id" => builder.id(NodeId(parse_toml_u64(key, item)?)), "id" => builder.id(NodeId(parse_toml_u64(key, item)?)),
"profiling" => builder.profiling(parse_toml_from_str(key, item)?), "profiling" => builder.profiling(parse_toml_from_str(key, item)?),
"broker_etcd_prefix" => builder.broker_etcd_prefix(parse_toml_string(key, item)?), "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
"broker_endpoints" => builder.broker_endpoints(
parse_toml_array(key, item)?
.into_iter()
.map(|endpoint_str| {
endpoint_str.parse::<Url>().with_context(|| {
format!("Array item {endpoint_str} for key {key} is not a valid url endpoint")
})
})
.collect::<anyhow::Result<_>>()?,
),
"log_format" => builder.log_format( "log_format" => builder.log_format(
LogFormat::from_config(&parse_toml_string(key, item)?)? LogFormat::from_config(&parse_toml_string(key, item)?)?
), ),
@@ -677,8 +658,7 @@ impl PageServerConf {
remote_storage_config: None, remote_storage_config: None,
profiling: ProfilingConfig::Disabled, profiling: ProfilingConfig::Disabled,
default_tenant_conf: TenantConf::dummy_conf(), default_tenant_conf: TenantConf::dummy_conf(),
broker_endpoints: Vec::new(), broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
} }
@@ -730,22 +710,6 @@ where
}) })
} }
fn parse_toml_array(name: &str, item: &Item) -> anyhow::Result<Vec<String>> {
let array = item
.as_array()
.with_context(|| format!("configure option {name} is not an array"))?;
array
.iter()
.map(|value| {
value
.as_str()
.map(str::to_string)
.with_context(|| format!("Array item {value:?} for key {name} is not a string"))
})
.collect()
}
/// Configurable semaphore permits setting. /// Configurable semaphore permits setting.
/// ///
/// Does not allow semaphore permits to be zero, because at runtime initially zero permits and empty /// Does not allow semaphore permits to be zero, because at runtime initially zero permits and empty
@@ -835,10 +799,10 @@ log_format = 'json'
fn parse_defaults() -> anyhow::Result<()> { fn parse_defaults() -> anyhow::Result<()> {
let tempdir = tempdir()?; let tempdir = tempdir()?;
let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
let broker_endpoint = "http://127.0.0.1:7777"; let broker_endpoint = storage_broker::DEFAULT_ENDPOINT;
// we have to create dummy values to overcome the validation errors // we have to create dummy values to overcome the validation errors
let config_string = format!( let config_string = format!(
"pg_distrib_dir='{}'\nid=10\nbroker_endpoints = ['{broker_endpoint}']", "pg_distrib_dir='{}'\nid=10\nbroker_endpoint = '{broker_endpoint}'",
pg_distrib_dir.display() pg_distrib_dir.display()
); );
let toml = config_string.parse()?; let toml = config_string.parse()?;
@@ -864,10 +828,7 @@ log_format = 'json'
remote_storage_config: None, remote_storage_config: None,
profiling: ProfilingConfig::Disabled, profiling: ProfilingConfig::Disabled,
default_tenant_conf: TenantConf::default(), default_tenant_conf: TenantConf::default(),
broker_endpoints: vec![broker_endpoint broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
.parse()
.expect("Failed to parse a valid broker endpoint URL")],
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
}, },
@@ -881,10 +842,10 @@ log_format = 'json'
fn parse_basic_config() -> anyhow::Result<()> { fn parse_basic_config() -> anyhow::Result<()> {
let tempdir = tempdir()?; let tempdir = tempdir()?;
let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
let broker_endpoint = "http://127.0.0.1:7777"; let broker_endpoint = storage_broker::DEFAULT_ENDPOINT;
let config_string = format!( let config_string = format!(
"{ALL_BASE_VALUES_TOML}pg_distrib_dir='{}'\nbroker_endpoints = ['{broker_endpoint}']", "{ALL_BASE_VALUES_TOML}pg_distrib_dir='{}'\nbroker_endpoint = '{broker_endpoint}'",
pg_distrib_dir.display() pg_distrib_dir.display()
); );
let toml = config_string.parse()?; let toml = config_string.parse()?;
@@ -910,10 +871,7 @@ log_format = 'json'
remote_storage_config: None, remote_storage_config: None,
profiling: ProfilingConfig::Disabled, profiling: ProfilingConfig::Disabled,
default_tenant_conf: TenantConf::default(), default_tenant_conf: TenantConf::default(),
broker_endpoints: vec![broker_endpoint broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
.parse()
.expect("Failed to parse a valid broker endpoint URL")],
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
log_format: LogFormat::Json, log_format: LogFormat::Json,
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
}, },
@@ -947,7 +905,7 @@ local_path = '{}'"#,
let config_string = format!( let config_string = format!(
r#"{ALL_BASE_VALUES_TOML} r#"{ALL_BASE_VALUES_TOML}
pg_distrib_dir='{}' pg_distrib_dir='{}'
broker_endpoints = ['{broker_endpoint}'] broker_endpoint = '{broker_endpoint}'
{remote_storage_config_str}"#, {remote_storage_config_str}"#,
pg_distrib_dir.display(), pg_distrib_dir.display(),
@@ -1014,7 +972,7 @@ concurrency_limit = {s3_concurrency_limit}"#
let config_string = format!( let config_string = format!(
r#"{ALL_BASE_VALUES_TOML} r#"{ALL_BASE_VALUES_TOML}
pg_distrib_dir='{}' pg_distrib_dir='{}'
broker_endpoints = ['{broker_endpoint}'] broker_endpoint = '{broker_endpoint}'
{remote_storage_config_str}"#, {remote_storage_config_str}"#,
pg_distrib_dir.display(), pg_distrib_dir.display(),
@@ -1059,7 +1017,7 @@ broker_endpoints = ['{broker_endpoint}']
let config_string = format!( let config_string = format!(
r#"{ALL_BASE_VALUES_TOML} r#"{ALL_BASE_VALUES_TOML}
pg_distrib_dir='{}' pg_distrib_dir='{}'
broker_endpoints = ['{broker_endpoint}'] broker_endpoint = '{broker_endpoint}'
[tenant_config] [tenant_config]
trace_read_requests = {trace_read_requests}"#, trace_read_requests = {trace_read_requests}"#,

View File

@@ -71,7 +71,7 @@ use crate::shutdown_pageserver;
// //
// WAL receiver runtime: // WAL receiver runtime:
// - used to handle WAL receiver connections. // - used to handle WAL receiver connections.
// - and to receiver updates from etcd // - and to receiver updates from storage_broker
// //
// Background runtime // Background runtime
// - layer flushing // - layer flushing
@@ -178,7 +178,7 @@ pub enum TaskKind {
PageRequestHandler, PageRequestHandler,
// Manages the WAL receiver connection for one timeline. It subscribes to // Manages the WAL receiver connection for one timeline. It subscribes to
// events from etcd, decides which safekeeper to connect to. It spawns a // events from storage_broker, decides which safekeeper to connect to. It spawns a
// separate WalReceiverConnection task to handle each connection. // separate WalReceiverConnection task to handle each connection.
WalReceiverManager, WalReceiverManager,

View File

@@ -54,7 +54,7 @@ use utils::{
use crate::repository::GcResult; use crate::repository::GcResult;
use crate::repository::{Key, Value}; use crate::repository::{Key, Value};
use crate::task_mgr::TaskKind; use crate::task_mgr::TaskKind;
use crate::walreceiver::{is_etcd_client_initialized, spawn_connection_manager_task}; use crate::walreceiver::{is_broker_client_initialized, spawn_connection_manager_task};
use crate::walredo::WalRedoManager; use crate::walredo::WalRedoManager;
use crate::CheckpointConfig; use crate::CheckpointConfig;
use crate::METADATA_FILE_NAME; use crate::METADATA_FILE_NAME;
@@ -856,12 +856,12 @@ impl Timeline {
} }
pub(super) fn launch_wal_receiver(self: &Arc<Self>) { pub(super) fn launch_wal_receiver(self: &Arc<Self>) {
if !is_etcd_client_initialized() { if !is_broker_client_initialized() {
if cfg!(test) { if cfg!(test) {
info!("not launching WAL receiver because etcd client hasn't been initialized"); info!("not launching WAL receiver because broker client hasn't been initialized");
return; return;
} else { } else {
panic!("etcd client not initialized"); panic!("broker client not initialized");
} }
} }
@@ -882,7 +882,6 @@ impl Timeline {
drop(tenant_conf_guard); drop(tenant_conf_guard);
let self_clone = Arc::clone(self); let self_clone = Arc::clone(self);
spawn_connection_manager_task( spawn_connection_manager_task(
self.conf.broker_etcd_prefix.clone(),
self_clone, self_clone,
walreceiver_connect_timeout, walreceiver_connect_timeout,
lagging_wal_timeout, lagging_wal_timeout,

View File

@@ -6,7 +6,7 @@
//! hence WAL receiver needs to react on such events. //! hence WAL receiver needs to react on such events.
//! //!
//! * get a broker subscription, stream data from it to determine that a timeline needs WAL streaming. //! * get a broker subscription, stream data from it to determine that a timeline needs WAL streaming.
//! For that, it watches specific keys in etcd broker and pulls the relevant data periodically. //! For that, it watches specific keys in storage_broker and pulls the relevant data periodically.
//! The data is produced by safekeepers, that push it periodically and pull it to synchronize between each other. //! The data is produced by safekeepers, that push it periodically and pull it to synchronize between each other.
//! Without this data, no WAL streaming is possible currently. //! Without this data, no WAL streaming is possible currently.
//! //!
@@ -26,57 +26,49 @@ mod walreceiver_connection;
use crate::config::PageServerConf; use crate::config::PageServerConf;
use crate::task_mgr::WALRECEIVER_RUNTIME; use crate::task_mgr::WALRECEIVER_RUNTIME;
use anyhow::{ensure, Context}; use anyhow::Context;
use etcd_broker::Client;
use itertools::Itertools;
use once_cell::sync::OnceCell; use once_cell::sync::OnceCell;
use std::future::Future; use std::future::Future;
use storage_broker::BrokerClientChannel;
use tokio::sync::watch; use tokio::sync::watch;
use tracing::*; use tracing::*;
use url::Url;
pub use connection_manager::spawn_connection_manager_task; pub use connection_manager::spawn_connection_manager_task;
static ETCD_CLIENT: OnceCell<Client> = OnceCell::new(); static BROKER_CLIENT: OnceCell<BrokerClientChannel> = OnceCell::new();
/// ///
/// Initialize the etcd client. This must be called once at page server startup. /// Initialize the broker client. This must be called once at page server startup.
/// ///
pub async fn init_etcd_client(conf: &'static PageServerConf) -> anyhow::Result<()> { pub async fn init_broker_client(conf: &'static PageServerConf) -> anyhow::Result<()> {
let etcd_endpoints = conf.broker_endpoints.clone(); let broker_endpoint = conf.broker_endpoint.clone();
ensure!(
!etcd_endpoints.is_empty(),
"Cannot start wal receiver: etcd endpoints are empty"
);
let etcd_client = Client::connect(etcd_endpoints.clone(), None) // Note: we do not attempt connecting here (but validate endpoints sanity).
.await let broker_client = storage_broker::connect(broker_endpoint.clone()).context(format!(
.context("Failed to connect to etcd")?; "Failed to create broker client to {}",
&conf.broker_endpoint
))?;
// FIXME: Should we still allow the pageserver to start, if etcd if BROKER_CLIENT.set(broker_client).is_err() {
// doesn't work? It could still serve GetPage requests, with the panic!("broker already initialized");
// data it has locally and from what it can download from remote
// storage
if ETCD_CLIENT.set(etcd_client).is_err() {
panic!("etcd already initialized");
} }
info!( info!(
"Initialized etcd client with endpoints: {}", "Initialized broker client with endpoints: {}",
etcd_endpoints.iter().map(Url::to_string).join(", ") broker_endpoint
); );
Ok(()) Ok(())
} }
/// ///
/// Get a handle to the etcd client /// Get a handle to the broker client
/// ///
pub fn get_etcd_client() -> &'static etcd_broker::Client { pub fn get_broker_client() -> &'static BrokerClientChannel {
ETCD_CLIENT.get().expect("etcd client not initialized") BROKER_CLIENT.get().expect("broker client not initialized")
} }
pub fn is_etcd_client_initialized() -> bool { pub fn is_broker_client_initialized() -> bool {
ETCD_CLIENT.get().is_some() BROKER_CLIENT.get().is_some()
} }
/// A handle of an asynchronous task. /// A handle of an asynchronous task.

View File

@@ -1,21 +1,15 @@
//! WAL receiver logic that ensures the pageserver gets connectected to safekeeper, //! WAL receiver logic that ensures the pageserver gets connectected to safekeeper,
//! that contains the latest WAL to stream and this connection does not go stale. //! that contains the latest WAL to stream and this connection does not go stale.
//! //!
//! To achieve that, a etcd broker is used: safekepers propagate their timelines' state in it, //! To achieve that, a storage broker is used: safekepers propagate their timelines' state in it,
//! the manager subscribes for changes and accumulates those to query the one with the biggest Lsn for connection. //! the manager subscribes for changes and accumulates those to query the one with the biggest Lsn for connection.
//! Current connection state is tracked too, to ensure it's not getting stale. //! Current connection state is tracked too, to ensure it's not getting stale.
//! //!
//! After every connection or etcd update fetched, the state gets updated correspondingly and rechecked for the new conneciton leader, //! After every connection or storage broker update fetched, the state gets updated correspondingly and rechecked for the new conneciton leader,
//! then a [re]connection happens, if necessary. //! then a [re]connection happens, if necessary.
//! Only WAL streaming task expects to be finished, other loops (etcd, connection management) never exit unless cancelled explicitly via the dedicated channel. //! Only WAL streaming task expects to be finished, other loops (storage broker, connection management) never exit unless cancelled explicitly via the dedicated channel.
use std::{ use std::{collections::HashMap, num::NonZeroU64, ops::ControlFlow, sync::Arc, time::Duration};
collections::{hash_map, HashMap},
num::NonZeroU64,
ops::ControlFlow,
sync::Arc,
time::Duration,
};
use crate::task_mgr::TaskKind; use crate::task_mgr::TaskKind;
use crate::task_mgr::WALRECEIVER_RUNTIME; use crate::task_mgr::WALRECEIVER_RUNTIME;
@@ -23,16 +17,18 @@ use crate::tenant::Timeline;
use crate::{task_mgr, walreceiver::TaskStateUpdate}; use crate::{task_mgr, walreceiver::TaskStateUpdate};
use anyhow::Context; use anyhow::Context;
use chrono::{NaiveDateTime, Utc}; use chrono::{NaiveDateTime, Utc};
use etcd_broker::{
subscription_key::SubscriptionKey, subscription_value::SkTimelineInfo, BrokerSubscription,
BrokerUpdate, Client,
};
use pageserver_api::models::TimelineState; use pageserver_api::models::TimelineState;
use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey;
use storage_broker::proto::SafekeeperTimelineInfo;
use storage_broker::proto::SubscribeSafekeeperInfoRequest;
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
use storage_broker::BrokerClientChannel;
use storage_broker::Streaming;
use tokio::{select, sync::watch}; use tokio::{select, sync::watch};
use tracing::*; use tracing::*;
use crate::{ use crate::{
exponential_backoff, walreceiver::get_etcd_client, DEFAULT_BASE_BACKOFF_SECONDS, exponential_backoff, walreceiver::get_broker_client, DEFAULT_BASE_BACKOFF_SECONDS,
DEFAULT_MAX_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
}; };
use postgres_connection::{parse_host_port, PgConnectionConfig}; use postgres_connection::{parse_host_port, PgConnectionConfig};
@@ -45,14 +41,13 @@ use super::{walreceiver_connection::WalConnectionStatus, TaskEvent, TaskHandle};
/// Spawns the loop to take care of the timeline's WAL streaming connection. /// Spawns the loop to take care of the timeline's WAL streaming connection.
pub fn spawn_connection_manager_task( pub fn spawn_connection_manager_task(
broker_loop_prefix: String,
timeline: Arc<Timeline>, timeline: Arc<Timeline>,
wal_connect_timeout: Duration, wal_connect_timeout: Duration,
lagging_wal_timeout: Duration, lagging_wal_timeout: Duration,
max_lsn_wal_lag: NonZeroU64, max_lsn_wal_lag: NonZeroU64,
auth_token: Option<Arc<String>>, auth_token: Option<Arc<String>>,
) { ) {
let mut etcd_client = get_etcd_client().clone(); let mut broker_client = get_broker_client().clone();
let tenant_id = timeline.tenant_id; let tenant_id = timeline.tenant_id;
let timeline_id = timeline.timeline_id; let timeline_id = timeline.timeline_id;
@@ -65,7 +60,7 @@ pub fn spawn_connection_manager_task(
&format!("walreceiver for timeline {tenant_id}/{timeline_id}"), &format!("walreceiver for timeline {tenant_id}/{timeline_id}"),
false, false,
async move { async move {
info!("WAL receiver broker started, connecting to etcd"); info!("WAL receiver manager started, connecting to broker");
let mut walreceiver_state = WalreceiverState::new( let mut walreceiver_state = WalreceiverState::new(
timeline, timeline,
wal_connect_timeout, wal_connect_timeout,
@@ -81,8 +76,7 @@ pub fn spawn_connection_manager_task(
return Ok(()); return Ok(());
}, },
loop_step_result = connection_manager_loop_step( loop_step_result = connection_manager_loop_step(
&broker_loop_prefix, &mut broker_client,
&mut etcd_client,
&mut walreceiver_state, &mut walreceiver_state,
) => match loop_step_result { ) => match loop_step_result {
ControlFlow::Continue(()) => continue, ControlFlow::Continue(()) => continue,
@@ -103,10 +97,9 @@ pub fn spawn_connection_manager_task(
/// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker. /// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker.
/// Based on the updates, desides whether to start, keep or stop a WAL receiver task. /// Based on the updates, desides whether to start, keep or stop a WAL receiver task.
/// If etcd subscription is cancelled, exits. /// If storage broker subscription is cancelled, exits.
async fn connection_manager_loop_step( async fn connection_manager_loop_step(
broker_prefix: &str, broker_client: &mut BrokerClientChannel,
etcd_client: &mut Client,
walreceiver_state: &mut WalreceiverState, walreceiver_state: &mut WalreceiverState,
) -> ControlFlow<(), ()> { ) -> ControlFlow<(), ()> {
let mut timeline_state_updates = walreceiver_state.timeline.subscribe_for_state_updates(); let mut timeline_state_updates = walreceiver_state.timeline.subscribe_for_state_updates();
@@ -124,13 +117,11 @@ async fn connection_manager_loop_step(
timeline_id: walreceiver_state.timeline.timeline_id, timeline_id: walreceiver_state.timeline.timeline_id,
}; };
// XXX: We never explicitly cancel etcd task, instead establishing one and never letting it go, // Subscribe to the broker updates. Stream shares underlying TCP connection
// running the entire loop step as much as possible to an end. // with other streams on this client (other connection managers). When
// The task removal happens implicitly on drop, both aborting the etcd subscription task and dropping the receiver channel end, // object goes out of scope, stream finishes in drop() automatically.
// forcing the etcd subscription to exit either way. let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id).await;
let mut broker_subscription = info!("Subscribed for broker timeline updates");
subscribe_for_timeline_updates(etcd_client, broker_prefix, id).await;
info!("Subscribed for etcd timeline changes, waiting for new etcd data");
loop { loop {
let time_until_next_retry = walreceiver_state.time_until_next_retry(); let time_until_next_retry = walreceiver_state.time_until_next_retry();
@@ -145,12 +136,6 @@ async fn connection_manager_loop_step(
// - this might change the current desired connection // - this might change the current desired connection
// - timeline state changes to something that does not allow walreceiver to run concurrently // - timeline state changes to something that does not allow walreceiver to run concurrently
select! { select! {
broker_connection_result = &mut broker_subscription.watcher_handle => {
info!("Broker connection was closed from the other side, ending current broker loop step");
cleanup_broker_connection(broker_connection_result, walreceiver_state);
return ControlFlow::Continue(());
},
Some(wal_connection_update) = async { Some(wal_connection_update) = async {
match walreceiver_state.wal_connection.as_mut() { match walreceiver_state.wal_connection.as_mut() {
Some(wal_connection) => Some(wal_connection.connection_task.next_task_event().await), Some(wal_connection) => Some(wal_connection.connection_task.next_task_event().await),
@@ -185,22 +170,16 @@ async fn connection_manager_loop_step(
} }
}, },
// Got a new update from etcd // Got a new update from the broker
broker_update = broker_subscription.value_updates.recv() => { broker_update = broker_subscription.message() => {
match broker_update { match broker_update {
Some(broker_update) => walreceiver_state.register_timeline_update(broker_update), Ok(Some(broker_update)) => walreceiver_state.register_timeline_update(broker_update),
None => { Err(e) => {
info!("Broker sender end was dropped, ending current broker loop step"); error!("broker subscription failed: {e}");
// Ensure to cancel and wait for the broker subscription task end, to log its result. return ControlFlow::Continue(());
// Broker sender end is in the broker subscription task and its drop means abnormal task completion. }
// First, ensure that the task is stopped (abort can be done without errors on already stopped tasks and repeated multiple times). Ok(None) => {
broker_subscription.watcher_handle.abort(); error!("broker subscription stream ended"); // can't happen
// Then, wait for the task to finish and print its result. If the task was finished before abort (which we assume in this abnormal case),
// a proper error message will be printed, otherwise an abortion message is printed which is ok, since we're signalled to finish anyway.
cleanup_broker_connection(
(&mut broker_subscription.watcher_handle).await,
walreceiver_state,
);
return ControlFlow::Continue(()); return ControlFlow::Continue(());
} }
} }
@@ -234,17 +213,6 @@ async fn connection_manager_loop_step(
_ = async { tokio::time::sleep(time_until_next_retry.unwrap()).await }, if time_until_next_retry.is_some() => {} _ = async { tokio::time::sleep(time_until_next_retry.unwrap()).await }, if time_until_next_retry.is_some() => {}
} }
// Fetch more etcd timeline updates, but limit ourselves since they may arrive quickly.
let mut max_events_to_poll = 100_u32;
while max_events_to_poll > 0 {
if let Ok(broker_update) = broker_subscription.value_updates.try_recv() {
walreceiver_state.register_timeline_update(broker_update);
max_events_to_poll -= 1;
} else {
break;
}
}
if let Some(new_candidate) = walreceiver_state.next_connection_candidate() { if let Some(new_candidate) = walreceiver_state.next_connection_candidate() {
info!("Switching to new connection candidate: {new_candidate:?}"); info!("Switching to new connection candidate: {new_candidate:?}");
walreceiver_state walreceiver_state
@@ -285,33 +253,11 @@ async fn wait_for_active_timeline(
} }
} }
fn cleanup_broker_connection(
broker_connection_result: Result<Result<(), etcd_broker::BrokerError>, tokio::task::JoinError>,
walreceiver_state: &mut WalreceiverState,
) {
match broker_connection_result {
Ok(Ok(())) => info!("Broker conneciton task finished, ending current broker loop step"),
Ok(Err(broker_error)) => warn!("Broker conneciton ended with error: {broker_error}"),
Err(abort_error) => {
if abort_error.is_panic() {
error!("Broker connection panicked: {abort_error}")
} else {
debug!("Broker connection aborted: {abort_error}")
}
}
}
walreceiver_state.wal_stream_candidates.clear();
}
/// Endlessly try to subscribe for broker updates for a given timeline. /// Endlessly try to subscribe for broker updates for a given timeline.
/// If there are no safekeepers to maintain the lease, the timeline subscription will be unavailable in the broker and the operation will fail constantly.
/// This is ok, pageservers should anyway try subscribing (with some backoff) since it's the only way they can get the timeline WAL anyway.
async fn subscribe_for_timeline_updates( async fn subscribe_for_timeline_updates(
etcd_client: &mut Client, broker_client: &mut BrokerClientChannel,
broker_prefix: &str,
id: TenantTimelineId, id: TenantTimelineId,
) -> BrokerSubscription<SkTimelineInfo> { ) -> Streaming<SafekeeperTimelineInfo> {
let mut attempt = 0; let mut attempt = 0;
loop { loop {
exponential_backoff( exponential_backoff(
@@ -322,18 +268,21 @@ async fn subscribe_for_timeline_updates(
.await; .await;
attempt += 1; attempt += 1;
match etcd_broker::subscribe_for_json_values( // subscribe to the specific timeline
etcd_client, let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId {
SubscriptionKey::sk_timeline_info(broker_prefix.to_owned(), id), tenant_id: id.tenant_id.as_ref().to_owned(),
) timeline_id: id.timeline_id.as_ref().to_owned(),
.instrument(info_span!("etcd_subscription")) });
.await let request = SubscribeSafekeeperInfoRequest {
{ subscription_key: Some(key),
Ok(new_subscription) => { };
return new_subscription;
match broker_client.subscribe_safekeeper_info(request).await {
Ok(resp) => {
return resp.into_inner();
} }
Err(e) => { Err(e) => {
warn!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in etcd: {e:#}"); warn!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}");
continue; continue;
} }
} }
@@ -360,8 +309,8 @@ struct WalreceiverState {
wal_connection: Option<WalConnection>, wal_connection: Option<WalConnection>,
/// Info about retries and unsuccessful attempts to connect to safekeepers. /// Info about retries and unsuccessful attempts to connect to safekeepers.
wal_connection_retries: HashMap<NodeId, RetryInfo>, wal_connection_retries: HashMap<NodeId, RetryInfo>,
/// Data about all timelines, available for connection, fetched from etcd, grouped by their corresponding safekeeper node id. /// Data about all timelines, available for connection, fetched from storage broker, grouped by their corresponding safekeeper node id.
wal_stream_candidates: HashMap<NodeId, EtcdSkTimeline>, wal_stream_candidates: HashMap<NodeId, BrokerSkTimeline>,
auth_token: Option<Arc<String>>, auth_token: Option<Arc<String>>,
} }
@@ -395,13 +344,11 @@ struct RetryInfo {
retry_duration_seconds: f64, retry_duration_seconds: f64,
} }
/// Data about the timeline to connect to, received from etcd. /// Data about the timeline to connect to, received from the broker.
#[derive(Debug)] #[derive(Debug)]
struct EtcdSkTimeline { struct BrokerSkTimeline {
timeline: SkTimelineInfo, timeline: SafekeeperTimelineInfo,
/// Etcd generation, the bigger it is, the more up to date the timeline data is. /// Time at which the data was fetched from the broker last time, to track the stale data.
etcd_version: i64,
/// Time at which the data was fetched from etcd last time, to track the stale data.
latest_update: NaiveDateTime, latest_update: NaiveDateTime,
} }
@@ -538,31 +485,18 @@ impl WalreceiverState {
next_retry_at.and_then(|next_retry_at| (next_retry_at - now).to_std().ok()) next_retry_at.and_then(|next_retry_at| (next_retry_at - now).to_std().ok())
} }
/// Adds another etcd timeline into the state, if its more recent than the one already added there for the same key. /// Adds another broker timeline into the state, if its more recent than the one already added there for the same key.
fn register_timeline_update(&mut self, timeline_update: BrokerUpdate<SkTimelineInfo>) { fn register_timeline_update(&mut self, timeline_update: SafekeeperTimelineInfo) {
match self self.wal_stream_candidates.insert(
.wal_stream_candidates NodeId(timeline_update.safekeeper_id),
.entry(timeline_update.key.node_id) BrokerSkTimeline {
{ timeline: timeline_update,
hash_map::Entry::Occupied(mut o) => { latest_update: Utc::now().naive_utc(),
let existing_value = o.get_mut(); },
if existing_value.etcd_version < timeline_update.etcd_version { );
existing_value.etcd_version = timeline_update.etcd_version;
existing_value.timeline = timeline_update.value;
existing_value.latest_update = Utc::now().naive_utc();
}
}
hash_map::Entry::Vacant(v) => {
v.insert(EtcdSkTimeline {
timeline: timeline_update.value,
etcd_version: timeline_update.etcd_version,
latest_update: Utc::now().naive_utc(),
});
}
}
} }
/// Cleans up stale etcd records and checks the rest for the new connection candidate. /// Cleans up stale broker records and checks the rest for the new connection candidate.
/// Returns a new candidate, if the current state is absent or somewhat lagging, `None` otherwise. /// Returns a new candidate, if the current state is absent or somewhat lagging, `None` otherwise.
/// The current rules for approving new candidates: /// The current rules for approving new candidates:
/// * pick a candidate different from the connected safekeeper with biggest `commit_lsn` and lowest failed connection attemps /// * pick a candidate different from the connected safekeeper with biggest `commit_lsn` and lowest failed connection attemps
@@ -585,7 +519,7 @@ impl WalreceiverState {
Some(existing_wal_connection) => { Some(existing_wal_connection) => {
let connected_sk_node = existing_wal_connection.sk_id; let connected_sk_node = existing_wal_connection.sk_id;
let (new_sk_id, new_safekeeper_etcd_data, new_wal_source_connconf) = let (new_sk_id, new_safekeeper_broker_data, new_wal_source_connconf) =
self.select_connection_candidate(Some(connected_sk_node))?; self.select_connection_candidate(Some(connected_sk_node))?;
let now = Utc::now().naive_utc(); let now = Utc::now().naive_utc();
@@ -614,7 +548,7 @@ impl WalreceiverState {
} }
if let Some(current_commit_lsn) = existing_wal_connection.status.commit_lsn { if let Some(current_commit_lsn) = existing_wal_connection.status.commit_lsn {
let new_commit_lsn = new_safekeeper_etcd_data.commit_lsn.unwrap_or(Lsn(0)); let new_commit_lsn = Lsn(new_safekeeper_broker_data.commit_lsn);
// Check if the new candidate has much more WAL than the current one. // Check if the new candidate has much more WAL than the current one.
match new_commit_lsn.0.checked_sub(current_commit_lsn.0) { match new_commit_lsn.0.checked_sub(current_commit_lsn.0) {
Some(new_sk_lsn_advantage) => { Some(new_sk_lsn_advantage) => {
@@ -644,7 +578,7 @@ impl WalreceiverState {
.status .status
.commit_lsn .commit_lsn
.unwrap_or(current_lsn); .unwrap_or(current_lsn);
let candidate_commit_lsn = new_safekeeper_etcd_data.commit_lsn.unwrap_or(Lsn(0)); let candidate_commit_lsn = Lsn(new_safekeeper_broker_data.commit_lsn);
// Keep discovered_new_wal only if connected safekeeper has not caught up yet. // Keep discovered_new_wal only if connected safekeeper has not caught up yet.
let mut discovered_new_wal = existing_wal_connection let mut discovered_new_wal = existing_wal_connection
@@ -727,7 +661,7 @@ impl WalreceiverState {
None None
} }
/// Selects the best possible candidate, based on the data collected from etcd updates about the safekeepers. /// Selects the best possible candidate, based on the data collected from the broker updates about the safekeepers.
/// Optionally, omits the given node, to support gracefully switching from a healthy safekeeper to another. /// Optionally, omits the given node, to support gracefully switching from a healthy safekeeper to another.
/// ///
/// The candidate that is chosen: /// The candidate that is chosen:
@@ -736,7 +670,7 @@ impl WalreceiverState {
fn select_connection_candidate( fn select_connection_candidate(
&self, &self,
node_to_omit: Option<NodeId>, node_to_omit: Option<NodeId>,
) -> Option<(NodeId, &SkTimelineInfo, PgConnectionConfig)> { ) -> Option<(NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> {
self.applicable_connection_candidates() self.applicable_connection_candidates()
.filter(|&(sk_id, _, _)| Some(sk_id) != node_to_omit) .filter(|&(sk_id, _, _)| Some(sk_id) != node_to_omit)
.max_by_key(|(_, info, _)| info.commit_lsn) .max_by_key(|(_, info, _)| info.commit_lsn)
@@ -746,12 +680,12 @@ impl WalreceiverState {
/// Some safekeepers are filtered by the retry cooldown. /// Some safekeepers are filtered by the retry cooldown.
fn applicable_connection_candidates( fn applicable_connection_candidates(
&self, &self,
) -> impl Iterator<Item = (NodeId, &SkTimelineInfo, PgConnectionConfig)> { ) -> impl Iterator<Item = (NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> {
let now = Utc::now().naive_utc(); let now = Utc::now().naive_utc();
self.wal_stream_candidates self.wal_stream_candidates
.iter() .iter()
.filter(|(_, info)| info.timeline.commit_lsn.is_some()) .filter(|(_, info)| Lsn(info.timeline.commit_lsn) != Lsn::INVALID)
.filter(move |(sk_id, _)| { .filter(move |(sk_id, _)| {
let next_retry_at = self let next_retry_at = self
.wal_connection_retries .wal_connection_retries
@@ -761,12 +695,14 @@ impl WalreceiverState {
}); });
next_retry_at.is_none() || next_retry_at.unwrap() <= now next_retry_at.is_none() || next_retry_at.unwrap() <= now
}) }).filter_map(|(sk_id, broker_info)| {
.filter_map(|(sk_id, etcd_info)| { let info = &broker_info.timeline;
let info = &etcd_info.timeline; if info.safekeeper_connstr.is_empty() {
return None; // no connection string, ignore sk
}
match wal_stream_connection_config( match wal_stream_connection_config(
self.id, self.id,
info.safekeeper_connstr.as_deref()?, info.safekeeper_connstr.as_ref(),
match &self.auth_token { match &self.auth_token {
None => None, None => None,
Some(x) => Some(x), Some(x) => Some(x),
@@ -781,15 +717,15 @@ impl WalreceiverState {
}) })
} }
/// Remove candidates which haven't sent etcd updates for a while. /// Remove candidates which haven't sent broker updates for a while.
fn cleanup_old_candidates(&mut self) { fn cleanup_old_candidates(&mut self) {
let mut node_ids_to_remove = Vec::with_capacity(self.wal_stream_candidates.len()); let mut node_ids_to_remove = Vec::with_capacity(self.wal_stream_candidates.len());
self.wal_stream_candidates.retain(|node_id, etcd_info| { self.wal_stream_candidates.retain(|node_id, broker_info| {
if let Ok(time_since_latest_etcd_update) = if let Ok(time_since_latest_broker_update) =
(Utc::now().naive_utc() - etcd_info.latest_update).to_std() (Utc::now().naive_utc() - broker_info.latest_update).to_std()
{ {
let should_retain = time_since_latest_etcd_update < self.lagging_wal_timeout; let should_retain = time_since_latest_broker_update < self.lagging_wal_timeout;
if !should_retain { if !should_retain {
node_ids_to_remove.push(*node_id); node_ids_to_remove.push(*node_id);
} }
@@ -870,6 +806,28 @@ mod tests {
use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; use crate::tenant::harness::{TenantHarness, TIMELINE_ID};
use url::Host; use url::Host;
fn dummy_broker_sk_timeline(
commit_lsn: u64,
safekeeper_connstr: &str,
latest_update: NaiveDateTime,
) -> BrokerSkTimeline {
BrokerSkTimeline {
timeline: SafekeeperTimelineInfo {
safekeeper_id: 0,
tenant_timeline_id: None,
last_log_term: 0,
flush_lsn: 0,
commit_lsn,
backup_lsn: 0,
remote_consistent_lsn: 0,
peer_horizon_lsn: 0,
local_start_lsn: 0,
safekeeper_connstr: safekeeper_connstr.to_owned(),
},
latest_update,
}
}
#[tokio::test] #[tokio::test]
async fn no_connection_no_candidate() -> anyhow::Result<()> { async fn no_connection_no_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("no_connection_no_candidate")?; let harness = TenantHarness::create("no_connection_no_candidate")?;
@@ -881,74 +839,16 @@ mod tests {
state.wal_connection = None; state.wal_connection = None;
state.wal_stream_candidates = HashMap::from([ state.wal_stream_candidates = HashMap::from([
( (NodeId(0), dummy_broker_sk_timeline(1, "", now)),
NodeId(0), (NodeId(1), dummy_broker_sk_timeline(0, "no_commit_lsn", now)),
EtcdSkTimeline { (NodeId(2), dummy_broker_sk_timeline(0, "no_commit_lsn", now)),
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: Some(Lsn(1)),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: None,
},
etcd_version: 0,
latest_update: now,
},
),
(
NodeId(1),
EtcdSkTimeline {
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: None,
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some("no_commit_lsn".to_string()),
},
etcd_version: 0,
latest_update: now,
},
),
(
NodeId(2),
EtcdSkTimeline {
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: None,
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some("no_commit_lsn".to_string()),
},
etcd_version: 0,
latest_update: now,
},
),
( (
NodeId(3), NodeId(3),
EtcdSkTimeline { dummy_broker_sk_timeline(
timeline: SkTimelineInfo { 1 + state.max_lsn_wal_lag.get(),
last_log_term: None, "delay_over_threshold",
flush_lsn: None, delay_over_threshold,
commit_lsn: Some(Lsn(1 + state.max_lsn_wal_lag.get())), ),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: None,
},
etcd_version: 0,
latest_update: delay_over_threshold,
},
), ),
]); ]);
@@ -995,57 +895,23 @@ mod tests {
state.wal_stream_candidates = HashMap::from([ state.wal_stream_candidates = HashMap::from([
( (
connected_sk_id, connected_sk_id,
EtcdSkTimeline { dummy_broker_sk_timeline(
timeline: SkTimelineInfo { current_lsn + state.max_lsn_wal_lag.get() * 2,
last_log_term: None, DUMMY_SAFEKEEPER_HOST,
flush_lsn: None, now,
commit_lsn: Some(Lsn(current_lsn + state.max_lsn_wal_lag.get() * 2)), ),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()),
},
etcd_version: 0,
latest_update: now,
},
), ),
( (
NodeId(1), NodeId(1),
EtcdSkTimeline { dummy_broker_sk_timeline(current_lsn, "not_advanced_lsn", now),
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: Some(Lsn(current_lsn)),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some("not_advanced_lsn".to_string()),
},
etcd_version: 0,
latest_update: now,
},
), ),
( (
NodeId(2), NodeId(2),
EtcdSkTimeline { dummy_broker_sk_timeline(
timeline: SkTimelineInfo { current_lsn + state.max_lsn_wal_lag.get() / 2,
last_log_term: None, "not_enough_advanced_lsn",
flush_lsn: None, now,
commit_lsn: Some(Lsn(current_lsn + state.max_lsn_wal_lag.get() / 2)), ),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some("not_enough_advanced_lsn".to_string()),
},
etcd_version: 0,
latest_update: now,
},
), ),
]); ]);
@@ -1067,21 +933,7 @@ mod tests {
state.wal_connection = None; state.wal_connection = None;
state.wal_stream_candidates = HashMap::from([( state.wal_stream_candidates = HashMap::from([(
NodeId(0), NodeId(0),
EtcdSkTimeline { dummy_broker_sk_timeline(1 + state.max_lsn_wal_lag.get(), DUMMY_SAFEKEEPER_HOST, now),
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: Some(Lsn(1 + state.max_lsn_wal_lag.get())),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()),
},
etcd_version: 0,
latest_update: now,
},
)]); )]);
let only_candidate = state let only_candidate = state
@@ -1102,57 +954,15 @@ mod tests {
state.wal_stream_candidates = HashMap::from([ state.wal_stream_candidates = HashMap::from([
( (
NodeId(0), NodeId(0),
EtcdSkTimeline { dummy_broker_sk_timeline(selected_lsn - 100, "smaller_commit_lsn", now),
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: Some(Lsn(selected_lsn - 100)),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some("smaller_commit_lsn".to_string()),
},
etcd_version: 0,
latest_update: now,
},
), ),
( (
NodeId(1), NodeId(1),
EtcdSkTimeline { dummy_broker_sk_timeline(selected_lsn, DUMMY_SAFEKEEPER_HOST, now),
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: Some(Lsn(selected_lsn)),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()),
},
etcd_version: 0,
latest_update: now,
},
), ),
( (
NodeId(2), NodeId(2),
EtcdSkTimeline { dummy_broker_sk_timeline(selected_lsn + 100, "", now),
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: Some(Lsn(selected_lsn + 100)),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: None,
},
etcd_version: 0,
latest_update: now,
},
), ),
]); ]);
let biggest_wal_candidate = state.next_connection_candidate().expect( let biggest_wal_candidate = state.next_connection_candidate().expect(
@@ -1186,39 +996,11 @@ mod tests {
state.wal_stream_candidates = HashMap::from([ state.wal_stream_candidates = HashMap::from([
( (
NodeId(0), NodeId(0),
EtcdSkTimeline { dummy_broker_sk_timeline(bigger_lsn.0, DUMMY_SAFEKEEPER_HOST, now),
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: Some(bigger_lsn),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()),
},
etcd_version: 0,
latest_update: now,
},
), ),
( (
NodeId(1), NodeId(1),
EtcdSkTimeline { dummy_broker_sk_timeline(current_lsn.0, DUMMY_SAFEKEEPER_HOST, now),
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: Some(current_lsn),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()),
},
etcd_version: 0,
latest_update: now,
},
), ),
]); ]);
state.wal_connection_retries = HashMap::from([( state.wal_connection_retries = HashMap::from([(
@@ -1275,39 +1057,11 @@ mod tests {
state.wal_stream_candidates = HashMap::from([ state.wal_stream_candidates = HashMap::from([
( (
connected_sk_id, connected_sk_id,
EtcdSkTimeline { dummy_broker_sk_timeline(current_lsn.0, DUMMY_SAFEKEEPER_HOST, now),
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: Some(current_lsn),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()),
},
etcd_version: 0,
latest_update: now,
},
), ),
( (
NodeId(1), NodeId(1),
EtcdSkTimeline { dummy_broker_sk_timeline(new_lsn.0, "advanced_by_lsn_safekeeper", now),
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: Some(new_lsn),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some("advanced_by_lsn_safekeeper".to_string()),
},
etcd_version: 0,
latest_update: now,
},
), ),
]); ]);
@@ -1367,21 +1121,7 @@ mod tests {
}); });
state.wal_stream_candidates = HashMap::from([( state.wal_stream_candidates = HashMap::from([(
NodeId(0), NodeId(0),
EtcdSkTimeline { dummy_broker_sk_timeline(current_lsn.0, DUMMY_SAFEKEEPER_HOST, now),
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: Some(current_lsn),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()),
},
etcd_version: 0,
latest_update: now,
},
)]); )]);
let over_threshcurrent_candidate = state.next_connection_candidate().expect( let over_threshcurrent_candidate = state.next_connection_candidate().expect(
@@ -1441,21 +1181,7 @@ mod tests {
}); });
state.wal_stream_candidates = HashMap::from([( state.wal_stream_candidates = HashMap::from([(
NodeId(0), NodeId(0),
EtcdSkTimeline { dummy_broker_sk_timeline(new_lsn.0, DUMMY_SAFEKEEPER_HOST, now),
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: Some(new_lsn),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()),
},
etcd_version: 0,
latest_update: now,
},
)]); )]);
let over_threshcurrent_candidate = state.next_connection_candidate().expect( let over_threshcurrent_candidate = state.next_connection_candidate().expect(

View File

@@ -4,6 +4,7 @@ version = "0.1.0"
edition = "2021" edition = "2021"
[dependencies] [dependencies]
async-stream = "0.3"
anyhow = "1.0" anyhow = "1.0"
async-trait = "0.1" async-trait = "0.1"
byteorder = "1.4.3" byteorder = "1.4.3"
@@ -33,12 +34,12 @@ toml_edit = { version = "0.14", features = ["easy"] }
tracing = "0.1.27" tracing = "0.1.27"
url = "2.2.2" url = "2.2.2"
etcd_broker = { path = "../libs/etcd_broker" }
metrics = { path = "../libs/metrics" } metrics = { path = "../libs/metrics" }
postgres_ffi = { path = "../libs/postgres_ffi" } postgres_ffi = { path = "../libs/postgres_ffi" }
pq_proto = { path = "../libs/pq_proto" } pq_proto = { path = "../libs/pq_proto" }
remote_storage = { path = "../libs/remote_storage" } remote_storage = { path = "../libs/remote_storage" }
safekeeper_api = { path = "../libs/safekeeper_api" } safekeeper_api = { path = "../libs/safekeeper_api" }
storage_broker = { version = "0.1", path = "../storage_broker" }
utils = { path = "../libs/utils" } utils = { path = "../libs/utils" }
workspace_hack = { version = "0.1", path = "../workspace_hack" } workspace_hack = { version = "0.1", path = "../workspace_hack" }

View File

@@ -13,7 +13,6 @@ use std::thread;
use tokio::sync::mpsc; use tokio::sync::mpsc;
use toml_edit::Document; use toml_edit::Document;
use tracing::*; use tracing::*;
use url::{ParseError, Url};
use utils::pid_file; use utils::pid_file;
use metrics::set_build_info_metric; use metrics::set_build_info_metric;
@@ -29,6 +28,7 @@ use safekeeper::wal_backup;
use safekeeper::wal_service; use safekeeper::wal_service;
use safekeeper::GlobalTimelines; use safekeeper::GlobalTimelines;
use safekeeper::SafeKeeperConf; use safekeeper::SafeKeeperConf;
use storage_broker::DEFAULT_ENDPOINT;
use utils::auth::JwtAuth; use utils::auth::JwtAuth;
use utils::{ use utils::{
http::endpoint, http::endpoint,
@@ -82,12 +82,8 @@ fn main() -> anyhow::Result<()> {
)); ));
} }
if let Some(addr) = arg_matches.get_one::<String>("broker-endpoints") { if let Some(addr) = arg_matches.get_one::<String>("broker-endpoint") {
let collected_ep: Result<Vec<Url>, ParseError> = addr.split(',').map(Url::parse).collect(); conf.broker_endpoint = addr.parse().context("failed to parse broker endpoint")?;
conf.broker_endpoints = collected_ep.context("Failed to parse broker endpoint urls")?;
}
if let Some(prefix) = arg_matches.get_one::<String>("broker-etcd-prefix") {
conf.broker_etcd_prefix = prefix.to_string();
} }
if let Some(heartbeat_timeout_str) = arg_matches.get_one::<String>("heartbeat-timeout") { if let Some(heartbeat_timeout_str) = arg_matches.get_one::<String>("heartbeat-timeout") {
@@ -224,19 +220,15 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<NodeId>, init: bo
threads.push(safekeeper_thread); threads.push(safekeeper_thread);
if !conf.broker_endpoints.is_empty() { let conf_ = conf.clone();
let conf_ = conf.clone(); threads.push(
threads.push( thread::Builder::new()
thread::Builder::new() .name("broker thread".into())
.name("broker thread".into()) .spawn(|| {
.spawn(|| { // TODO: add auth?
// TODO: add auth? broker::thread_main(conf_);
broker::thread_main(conf_); })?,
})?, );
);
} else {
warn!("No broker endpoints providing, starting without node sync")
}
let conf_ = conf.clone(); let conf_ = conf.clone();
threads.push( threads.push(
@@ -369,14 +361,9 @@ fn cli() -> Command {
.arg( .arg(
Arg::new("id").long("id").help("safekeeper node id: integer") Arg::new("id").long("id").help("safekeeper node id: integer")
).arg( ).arg(
Arg::new("broker-endpoints") Arg::new("broker-endpoint")
.long("broker-endpoints") .long("broker-endpoint")
.help("a comma separated broker (etcd) endpoints for storage nodes coordination, e.g. 'http://127.0.0.1:2379'"), .help(formatcp!("Broker endpoint for storage nodes coordination in the form http[s]://host:port, default '{DEFAULT_ENDPOINT}'. In case of https schema TLS is connection is established; plaintext otherwise.")),
)
.arg(
Arg::new("broker-etcd-prefix")
.long("broker-etcd-prefix")
.help("a prefix to always use when polling/pusing data in etcd from this safekeeper"),
) )
.arg( .arg(
Arg::new("heartbeat-timeout") Arg::new("heartbeat-timeout")

View File

@@ -1,15 +1,18 @@
//! Communication with etcd, providing safekeeper peers and pageserver coordination. //! Communication with the broker, providing safekeeper peers and pageserver coordination.
use anyhow::anyhow;
use anyhow::bail;
use anyhow::Context; use anyhow::Context;
use anyhow::Error; use anyhow::Error;
use anyhow::Result; use anyhow::Result;
use etcd_broker::subscription_value::SkTimelineInfo;
use etcd_broker::LeaseKeepAliveStream;
use etcd_broker::LeaseKeeper;
use std::collections::hash_map::Entry; use storage_broker::parse_proto_ttid;
use std::collections::HashMap; use storage_broker::proto::broker_service_client::BrokerServiceClient;
use std::collections::HashSet; use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey;
use storage_broker::proto::SubscribeSafekeeperInfoRequest;
use storage_broker::Request;
use std::time::Duration; use std::time::Duration;
use tokio::task::JoinHandle; use tokio::task::JoinHandle;
use tokio::{runtime, time::sleep}; use tokio::{runtime, time::sleep};
@@ -17,15 +20,9 @@ use tracing::*;
use crate::GlobalTimelines; use crate::GlobalTimelines;
use crate::SafeKeeperConf; use crate::SafeKeeperConf;
use etcd_broker::{
subscription_key::{OperationKind, SkOperationKind, SubscriptionKey},
Client, PutOptions,
};
use utils::id::{NodeId, TenantTimelineId};
const RETRY_INTERVAL_MSEC: u64 = 1000; const RETRY_INTERVAL_MSEC: u64 = 1000;
const PUSH_INTERVAL_MSEC: u64 = 1000; const PUSH_INTERVAL_MSEC: u64 = 1000;
const LEASE_TTL_SEC: i64 = 10;
pub fn thread_main(conf: SafeKeeperConf) { pub fn thread_main(conf: SafeKeeperConf) {
let runtime = runtime::Builder::new_current_thread() let runtime = runtime::Builder::new_current_thread()
@@ -34,158 +31,70 @@ pub fn thread_main(conf: SafeKeeperConf) {
.unwrap(); .unwrap();
let _enter = info_span!("broker").entered(); let _enter = info_span!("broker").entered();
info!("started, broker endpoints {:?}", conf.broker_endpoints); info!("started, broker endpoint {:?}", conf.broker_endpoint);
runtime.block_on(async { runtime.block_on(async {
main_loop(conf).await; main_loop(conf).await;
}); });
} }
/// Key to per timeline per safekeeper data.
fn timeline_safekeeper_path(
broker_etcd_prefix: String,
ttid: TenantTimelineId,
sk_id: NodeId,
) -> String {
format!(
"{}/{sk_id}",
SubscriptionKey::sk_timeline_info(broker_etcd_prefix, ttid).watch_key()
)
}
async fn push_sk_info(
ttid: TenantTimelineId,
mut client: Client,
key: String,
sk_info: SkTimelineInfo,
mut lease: Lease,
) -> anyhow::Result<(TenantTimelineId, Lease)> {
let put_opts = PutOptions::new().with_lease(lease.id);
client
.put(
key.clone(),
serde_json::to_string(&sk_info)?,
Some(put_opts),
)
.await
.with_context(|| format!("failed to push safekeeper info to {}", key))?;
// revive the lease
lease
.keeper
.keep_alive()
.await
.context("failed to send LeaseKeepAliveRequest")?;
lease
.ka_stream
.message()
.await
.context("failed to receive LeaseKeepAliveResponse")?;
Ok((ttid, lease))
}
struct Lease {
id: i64,
keeper: LeaseKeeper,
ka_stream: LeaseKeepAliveStream,
}
/// Push once in a while data about all active timelines to the broker. /// Push once in a while data about all active timelines to the broker.
async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
let mut client = Client::connect(&conf.broker_endpoints, None).await?; let mut client = BrokerServiceClient::connect(conf.broker_endpoint.clone()).await?;
let mut leases: HashMap<TenantTimelineId, Lease> = HashMap::new();
let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC); let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC);
loop {
// Note: we lock runtime here and in timeline methods as GlobalTimelines
// is under plain mutex. That's ok, all this code is not performance
// sensitive and there is no risk of deadlock as we don't await while
// lock is held.
let mut active_tlis = GlobalTimelines::get_all();
active_tlis.retain(|tli| tli.is_active());
let active_tlis_set: HashSet<TenantTimelineId> = let outbound = async_stream::stream! {
active_tlis.iter().map(|tli| tli.ttid).collect(); loop {
// Note: we lock runtime here and in timeline methods as GlobalTimelines
// // Get and maintain (if not yet) per timeline lease to automatically delete obsolete data. // is under plain mutex. That's ok, all this code is not performance
for tli in &active_tlis { // sensitive and there is no risk of deadlock as we don't await while
if let Entry::Vacant(v) = leases.entry(tli.ttid) { // lock is held.
let lease = client.lease_grant(LEASE_TTL_SEC, None).await?; let mut active_tlis = GlobalTimelines::get_all();
let (keeper, ka_stream) = client.lease_keep_alive(lease.id()).await?; active_tlis.retain(|tli| tli.is_active());
v.insert(Lease { for tli in &active_tlis {
id: lease.id(),
keeper,
ka_stream,
});
}
}
leases.retain(|ttid, _| active_tlis_set.contains(ttid));
// Push data concurrently to not suffer from latency, with many timelines it can be slow.
let handles = active_tlis
.iter()
.map(|tli| {
let sk_info = tli.get_safekeeper_info(&conf); let sk_info = tli.get_safekeeper_info(&conf);
let key = yield sk_info;
timeline_safekeeper_path(conf.broker_etcd_prefix.clone(), tli.ttid, conf.my_id); }
let lease = leases.remove(&tli.ttid).unwrap(); sleep(push_interval).await;
tokio::spawn(push_sk_info(tli.ttid, client.clone(), key, sk_info, lease))
})
.collect::<Vec<_>>();
for h in handles {
let (ttid, lease) = h.await??;
// It is ugly to pull leases from hash and then put it back, but
// otherwise we have to resort to long living per tli tasks (which
// would generate a lot of errors when etcd is down) as task wants to
// have 'static objects, we can't borrow to it.
leases.insert(ttid, lease);
} }
};
sleep(push_interval).await; client
} .publish_safekeeper_info(Request::new(outbound))
.await?;
Ok(())
} }
/// Subscribe and fetch all the interesting data from the broker. /// Subscribe and fetch all the interesting data from the broker.
async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { async fn pull_loop(conf: SafeKeeperConf) -> Result<()> {
let mut client = Client::connect(&conf.broker_endpoints, None).await?; let mut client = storage_broker::connect(conf.broker_endpoint)?;
let mut subscription = etcd_broker::subscribe_for_values( // TODO: subscribe only to local timelines instead of all
&mut client, let request = SubscribeSafekeeperInfoRequest {
SubscriptionKey::all(conf.broker_etcd_prefix.clone()), subscription_key: Some(ProtoSubscriptionKey::All(())),
|full_key, value_str| { };
if full_key.operation == OperationKind::Safekeeper(SkOperationKind::TimelineInfo) {
match serde_json::from_str::<SkTimelineInfo>(value_str) { let mut stream = client
Ok(new_info) => return Some(new_info), .subscribe_safekeeper_info(request)
Err(e) => { .await
error!("Failed to parse timeline info from value str '{value_str}': {e}") .context("subscribe_safekeper_info request failed")?
} .into_inner();
}
} while let Some(msg) = stream.message().await? {
None let proto_ttid = msg
}, .tenant_timeline_id
) .as_ref()
.await .ok_or_else(|| anyhow!("missing tenant_timeline_id"))?;
.context("failed to subscribe for safekeeper info")?; let ttid = parse_proto_ttid(proto_ttid)?;
loop { if let Ok(tli) = GlobalTimelines::get(ttid) {
match subscription.value_updates.recv().await { // Note that we also receive *our own* info. That's
Some(new_info) => { // important, as it is used as an indication of live
// note: there are blocking operations below, but it's considered fine for now // connection to the broker.
if let Ok(tli) = GlobalTimelines::get(new_info.key.id) {
// Note that we also receive *our own* info. That's // note: there are blocking operations below, but it's considered fine for now
// important, as it is used as an indication of live tli.record_safekeeper_info(&msg).await?
// connection to the broker.
tli.record_safekeeper_info(&new_info.value, new_info.key.node_id)
.await?
}
}
None => {
// XXX it means we lost connection with etcd, error is consumed inside sub object
debug!("timeline updates sender closed, aborting the pull loop");
return Ok(());
}
} }
} }
bail!("end of stream");
} }
async fn main_loop(conf: SafeKeeperConf) { async fn main_loop(conf: SafeKeeperConf) {

View File

@@ -3,11 +3,14 @@ use hyper::{Body, Request, Response, StatusCode, Uri};
use anyhow::Context; use anyhow::Context;
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use postgres_ffi::WAL_SEGMENT_SIZE; use postgres_ffi::WAL_SEGMENT_SIZE;
use safekeeper_api::models::SkTimelineInfo;
use serde::Serialize; use serde::Serialize;
use serde::Serializer; use serde::Serializer;
use std::collections::{HashMap, HashSet}; use std::collections::{HashMap, HashSet};
use std::fmt::Display; use std::fmt::Display;
use std::sync::Arc; use std::sync::Arc;
use storage_broker::proto::SafekeeperTimelineInfo;
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
use tokio::task::JoinError; use tokio::task::JoinError;
use crate::safekeeper::ServerInfo; use crate::safekeeper::ServerInfo;
@@ -16,7 +19,6 @@ use crate::safekeeper::Term;
use crate::timelines_global_map::TimelineDeleteForceResult; use crate::timelines_global_map::TimelineDeleteForceResult;
use crate::GlobalTimelines; use crate::GlobalTimelines;
use crate::SafeKeeperConf; use crate::SafeKeeperConf;
use etcd_broker::subscription_value::SkTimelineInfo;
use utils::{ use utils::{
auth::JwtAuth, auth::JwtAuth,
http::{ http::{
@@ -241,7 +243,22 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
parse_request_param(&request, "timeline_id")?, parse_request_param(&request, "timeline_id")?,
); );
check_permission(&request, Some(ttid.tenant_id))?; check_permission(&request, Some(ttid.tenant_id))?;
let safekeeper_info: SkTimelineInfo = json_request(&mut request).await?; let sk_info: SkTimelineInfo = json_request(&mut request).await?;
let proto_sk_info = SafekeeperTimelineInfo {
safekeeper_id: 0,
tenant_timeline_id: Some(ProtoTenantTimelineId {
tenant_id: ttid.tenant_id.as_ref().to_owned(),
timeline_id: ttid.timeline_id.as_ref().to_owned(),
}),
last_log_term: sk_info.last_log_term.unwrap_or(0),
flush_lsn: sk_info.flush_lsn.0,
commit_lsn: sk_info.commit_lsn.0,
remote_consistent_lsn: sk_info.remote_consistent_lsn.0,
peer_horizon_lsn: sk_info.peer_horizon_lsn.0,
safekeeper_connstr: sk_info.safekeeper_connstr.unwrap_or_else(|| "".to_owned()),
backup_lsn: sk_info.backup_lsn.0,
local_start_lsn: sk_info.local_start_lsn.0,
};
let tli = GlobalTimelines::get(ttid) let tli = GlobalTimelines::get(ttid)
// `GlobalTimelines::get` returns an error when it can't find the timeline. // `GlobalTimelines::get` returns an error when it can't find the timeline.
@@ -252,7 +269,7 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
) )
}) })
.map_err(ApiError::NotFound)?; .map_err(ApiError::NotFound)?;
tli.record_safekeeper_info(&safekeeper_info, NodeId(1)) tli.record_safekeeper_info(&proto_sk_info)
.await .await
.map_err(ApiError::InternalServerError)?; .map_err(ApiError::InternalServerError)?;

View File

@@ -1,11 +1,11 @@
use defaults::{ use defaults::{
DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_WAL_BACKUP_RUNTIME_THREADS, DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_WAL_BACKUP_RUNTIME_THREADS,
}; };
use storage_broker::Uri;
// //
use remote_storage::RemoteStorageConfig; use remote_storage::RemoteStorageConfig;
use std::path::PathBuf; use std::path::PathBuf;
use std::time::Duration; use std::time::Duration;
use url::Url;
use utils::{ use utils::{
id::{NodeId, TenantId, TenantTimelineId}, id::{NodeId, TenantId, TenantTimelineId},
@@ -62,8 +62,7 @@ pub struct SafeKeeperConf {
pub backup_runtime_threads: usize, pub backup_runtime_threads: usize,
pub wal_backup_enabled: bool, pub wal_backup_enabled: bool,
pub my_id: NodeId, pub my_id: NodeId,
pub broker_endpoints: Vec<Url>, pub broker_endpoint: Uri,
pub broker_etcd_prefix: String,
pub auth_validation_public_key_path: Option<PathBuf>, pub auth_validation_public_key_path: Option<PathBuf>,
pub heartbeat_timeout: Duration, pub heartbeat_timeout: Duration,
pub max_offloader_lag_bytes: u64, pub max_offloader_lag_bytes: u64,
@@ -93,8 +92,9 @@ impl Default for SafeKeeperConf {
listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
remote_storage: None, remote_storage: None,
my_id: NodeId(0), my_id: NodeId(0),
broker_endpoints: Vec::new(), broker_endpoint: storage_broker::DEFAULT_ENDPOINT
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), .parse()
.expect("failed to parse default broker endpoint"),
backup_runtime_threads: DEFAULT_WAL_BACKUP_RUNTIME_THREADS, backup_runtime_threads: DEFAULT_WAL_BACKUP_RUNTIME_THREADS,
wal_backup_enabled: true, wal_backup_enabled: true,
auth_validation_public_key_path: None, auth_validation_public_key_path: None,

View File

@@ -4,13 +4,13 @@ use anyhow::{bail, Context, Result};
use byteorder::{LittleEndian, ReadBytesExt}; use byteorder::{LittleEndian, ReadBytesExt};
use bytes::{Buf, BufMut, Bytes, BytesMut}; use bytes::{Buf, BufMut, Bytes, BytesMut};
use etcd_broker::subscription_value::SkTimelineInfo;
use postgres_ffi::{TimeLineID, XLogSegNo, MAX_SEND_SIZE}; use postgres_ffi::{TimeLineID, XLogSegNo, MAX_SEND_SIZE};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::cmp::max; use std::cmp::max;
use std::cmp::min; use std::cmp::min;
use std::fmt; use std::fmt;
use std::io::Read; use std::io::Read;
use storage_broker::proto::SafekeeperTimelineInfo;
use tracing::*; use tracing::*;
@@ -896,39 +896,38 @@ where
} }
/// Update timeline state with peer safekeeper data. /// Update timeline state with peer safekeeper data.
pub fn record_safekeeper_info(&mut self, sk_info: &SkTimelineInfo) -> Result<()> { pub fn record_safekeeper_info(&mut self, sk_info: &SafekeeperTimelineInfo) -> Result<()> {
let mut sync_control_file = false; let mut sync_control_file = false;
if let (Some(commit_lsn), Some(last_log_term)) = (sk_info.commit_lsn, sk_info.last_log_term)
{ if (Lsn(sk_info.commit_lsn) != Lsn::INVALID) && (sk_info.last_log_term != INVALID_TERM) {
// Note: the check is too restrictive, generally we can update local // Note: the check is too restrictive, generally we can update local
// commit_lsn if our history matches (is part of) history of advanced // commit_lsn if our history matches (is part of) history of advanced
// commit_lsn provider. // commit_lsn provider.
if last_log_term == self.get_epoch() { if sk_info.last_log_term == self.get_epoch() {
self.global_commit_lsn = max(commit_lsn, self.global_commit_lsn); self.global_commit_lsn = max(Lsn(sk_info.commit_lsn), self.global_commit_lsn);
self.update_commit_lsn()?; self.update_commit_lsn()?;
} }
} }
if let Some(backup_lsn) = sk_info.backup_lsn {
let new_backup_lsn = max(backup_lsn, self.inmem.backup_lsn); let new_backup_lsn = max(Lsn(sk_info.backup_lsn), self.inmem.backup_lsn);
sync_control_file |= sync_control_file |=
self.state.backup_lsn + (self.state.server.wal_seg_size as u64) < new_backup_lsn; self.state.backup_lsn + (self.state.server.wal_seg_size as u64) < new_backup_lsn;
self.inmem.backup_lsn = new_backup_lsn; self.inmem.backup_lsn = new_backup_lsn;
}
if let Some(remote_consistent_lsn) = sk_info.remote_consistent_lsn { let new_remote_consistent_lsn = max(
let new_remote_consistent_lsn = Lsn(sk_info.remote_consistent_lsn),
max(remote_consistent_lsn, self.inmem.remote_consistent_lsn); self.inmem.remote_consistent_lsn,
sync_control_file |= self.state.remote_consistent_lsn );
+ (self.state.server.wal_seg_size as u64) sync_control_file |= self.state.remote_consistent_lsn
< new_remote_consistent_lsn; + (self.state.server.wal_seg_size as u64)
self.inmem.remote_consistent_lsn = new_remote_consistent_lsn; < new_remote_consistent_lsn;
} self.inmem.remote_consistent_lsn = new_remote_consistent_lsn;
if let Some(peer_horizon_lsn) = sk_info.peer_horizon_lsn {
let new_peer_horizon_lsn = max(peer_horizon_lsn, self.inmem.peer_horizon_lsn); let new_peer_horizon_lsn = max(Lsn(sk_info.peer_horizon_lsn), self.inmem.peer_horizon_lsn);
sync_control_file |= self.state.peer_horizon_lsn sync_control_file |= self.state.peer_horizon_lsn + (self.state.server.wal_seg_size as u64)
+ (self.state.server.wal_seg_size as u64) < new_peer_horizon_lsn;
< new_peer_horizon_lsn; self.inmem.peer_horizon_lsn = new_peer_horizon_lsn;
self.inmem.peer_horizon_lsn = new_peer_horizon_lsn;
}
if sync_control_file { if sync_control_file {
self.persist_control_file(self.state.clone())?; self.persist_control_file(self.state.clone())?;
} }

View File

@@ -2,7 +2,6 @@
//! to glue together SafeKeeper and all other background services. //! to glue together SafeKeeper and all other background services.
use anyhow::{bail, Result}; use anyhow::{bail, Result};
use etcd_broker::subscription_value::SkTimelineInfo;
use parking_lot::{Mutex, MutexGuard}; use parking_lot::{Mutex, MutexGuard};
use postgres_ffi::XLogSegNo; use postgres_ffi::XLogSegNo;
use pq_proto::ReplicationFeedback; use pq_proto::ReplicationFeedback;
@@ -18,6 +17,9 @@ use utils::{
lsn::Lsn, lsn::Lsn,
}; };
use storage_broker::proto::SafekeeperTimelineInfo;
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
use crate::safekeeper::{ use crate::safekeeper::{
AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState,
SafekeeperMemState, ServerInfo, Term, SafekeeperMemState, ServerInfo, Term,
@@ -47,13 +49,13 @@ pub struct PeerInfo {
} }
impl PeerInfo { impl PeerInfo {
fn from_sk_info(sk_id: NodeId, sk_info: &SkTimelineInfo, ts: Instant) -> PeerInfo { fn from_sk_info(sk_info: &SafekeeperTimelineInfo, ts: Instant) -> PeerInfo {
PeerInfo { PeerInfo {
sk_id, sk_id: NodeId(sk_info.safekeeper_id),
_last_log_term: sk_info.last_log_term.unwrap_or(0), _last_log_term: sk_info.last_log_term,
_flush_lsn: sk_info.flush_lsn.unwrap_or(Lsn::INVALID), _flush_lsn: Lsn(sk_info.flush_lsn),
commit_lsn: sk_info.commit_lsn.unwrap_or(Lsn::INVALID), commit_lsn: Lsn(sk_info.commit_lsn),
local_start_lsn: sk_info.local_start_lsn.unwrap_or(Lsn::INVALID), local_start_lsn: Lsn(sk_info.local_start_lsn),
ts, ts,
} }
} }
@@ -308,21 +310,31 @@ impl SharedState {
pos pos
} }
fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SkTimelineInfo { fn get_safekeeper_info(
SkTimelineInfo { &self,
last_log_term: Some(self.sk.get_epoch()), ttid: &TenantTimelineId,
flush_lsn: Some(self.sk.wal_store.flush_lsn()), conf: &SafeKeeperConf,
) -> SafekeeperTimelineInfo {
SafekeeperTimelineInfo {
safekeeper_id: conf.my_id.0,
tenant_timeline_id: Some(ProtoTenantTimelineId {
tenant_id: ttid.tenant_id.as_ref().to_owned(),
timeline_id: ttid.timeline_id.as_ref().to_owned(),
}),
last_log_term: self.sk.get_epoch(),
flush_lsn: self.sk.wal_store.flush_lsn().0,
// note: this value is not flushed to control file yet and can be lost // note: this value is not flushed to control file yet and can be lost
commit_lsn: Some(self.sk.inmem.commit_lsn), commit_lsn: self.sk.inmem.commit_lsn.0,
// TODO: rework feedbacks to avoid max here // TODO: rework feedbacks to avoid max here
remote_consistent_lsn: Some(max( remote_consistent_lsn: max(
self.get_replicas_state().remote_consistent_lsn, self.get_replicas_state().remote_consistent_lsn,
self.sk.inmem.remote_consistent_lsn, self.sk.inmem.remote_consistent_lsn,
)), )
peer_horizon_lsn: Some(self.sk.inmem.peer_horizon_lsn), .0,
safekeeper_connstr: Some(conf.listen_pg_addr.clone()), peer_horizon_lsn: self.sk.inmem.peer_horizon_lsn.0,
backup_lsn: Some(self.sk.inmem.backup_lsn), safekeeper_connstr: conf.listen_pg_addr.clone(),
local_start_lsn: Some(self.sk.state.local_start_lsn), backup_lsn: self.sk.inmem.backup_lsn.0,
local_start_lsn: self.sk.state.local_start_lsn.0,
} }
} }
} }
@@ -682,23 +694,19 @@ impl Timeline {
} }
/// Get safekeeper info for broadcasting to broker and other peers. /// Get safekeeper info for broadcasting to broker and other peers.
pub fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SkTimelineInfo { pub fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SafekeeperTimelineInfo {
let shared_state = self.write_shared_state(); let shared_state = self.write_shared_state();
shared_state.get_safekeeper_info(conf) shared_state.get_safekeeper_info(&self.ttid, conf)
} }
/// Update timeline state with peer safekeeper data. /// Update timeline state with peer safekeeper data.
pub async fn record_safekeeper_info( pub async fn record_safekeeper_info(&self, sk_info: &SafekeeperTimelineInfo) -> Result<()> {
&self,
sk_info: &SkTimelineInfo,
sk_id: NodeId,
) -> Result<()> {
let is_wal_backup_action_pending: bool; let is_wal_backup_action_pending: bool;
let commit_lsn: Lsn; let commit_lsn: Lsn;
{ {
let mut shared_state = self.write_shared_state(); let mut shared_state = self.write_shared_state();
shared_state.sk.record_safekeeper_info(sk_info)?; shared_state.sk.record_safekeeper_info(sk_info)?;
let peer_info = PeerInfo::from_sk_info(sk_id, sk_info, Instant::now()); let peer_info = PeerInfo::from_sk_info(sk_info, Instant::now());
shared_state.peers_info.upsert(&peer_info); shared_state.peers_info.upsert(&peer_info);
is_wal_backup_action_pending = shared_state.update_status(self.ttid); is_wal_backup_action_pending = shared_state.update_status(self.ttid);
commit_lsn = shared_state.sk.inmem.commit_lsn; commit_lsn = shared_state.sk.inmem.commit_lsn;

View File

@@ -7,9 +7,11 @@ edition = "2021"
bench = [] bench = []
[dependencies] [dependencies]
anyhow = "1.0"
async-stream = "0.3" async-stream = "0.3"
bytes = "1.0" bytes = "1.0"
clap = { version = "4.0", features = ["derive"] } clap = { version = "4.0", features = ["derive"] }
const_format = "0.2.21"
futures = "0.3" futures = "0.3"
futures-core = "0.3" futures-core = "0.3"
futures-util = "0.3" futures-util = "0.3"
@@ -19,7 +21,7 @@ hyper = {version = "0.14.14", features = ["full"]}
once_cell = "1.13.0" once_cell = "1.13.0"
parking_lot = "0.12" parking_lot = "0.12"
prost = "0.11" prost = "0.11"
tonic = "0.8" tonic = {version = "0.8", features = ["tls", "tls-roots"]}
tokio = { version = "1.0", features = ["macros", "rt-multi-thread"] } tokio = { version = "1.0", features = ["macros", "rt-multi-thread"] }
tokio-stream = "0.1" tokio-stream = "0.1"
tracing = "0.1.27" tracing = "0.1.27"

View File

@@ -6,8 +6,8 @@ use clap::Parser;
use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey; use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey;
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
use storage_broker::proto::{SafekeeperTimelineInfo, SubscribeSafekeeperInfoRequest}; use storage_broker::proto::{SafekeeperTimelineInfo, SubscribeSafekeeperInfoRequest};
use storage_broker::BrokerClientChannel;
use storage_broker::DEFAULT_LISTEN_ADDR; use storage_broker::{BrokerClientChannel, DEFAULT_ENDPOINT};
use tokio::time; use tokio::time;
use tonic::Request; use tonic::Request;
@@ -88,9 +88,7 @@ fn tli_from_u64(i: u64) -> Vec<u8> {
async fn subscribe(client: Option<BrokerClientChannel>, counter: Arc<AtomicU64>, i: u64) { async fn subscribe(client: Option<BrokerClientChannel>, counter: Arc<AtomicU64>, i: u64) {
let mut client = match client { let mut client = match client {
Some(c) => c, Some(c) => c,
None => BrokerClientChannel::connect_lazy(format!("http://{}", DEFAULT_LISTEN_ADDR)) None => storage_broker::connect(DEFAULT_ENDPOINT).unwrap(),
.await
.unwrap(),
}; };
let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId { let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId {
@@ -114,9 +112,7 @@ async fn subscribe(client: Option<BrokerClientChannel>, counter: Arc<AtomicU64>,
async fn publish(client: Option<BrokerClientChannel>, n_keys: u64) { async fn publish(client: Option<BrokerClientChannel>, n_keys: u64) {
let mut client = match client { let mut client = match client {
Some(c) => c, Some(c) => c,
None => BrokerClientChannel::connect_lazy(format!("http://{}", DEFAULT_LISTEN_ADDR)) None => storage_broker::connect(DEFAULT_ENDPOINT).unwrap(),
.await
.unwrap(),
}; };
let mut counter: u64 = 0; let mut counter: u64 = 0;
@@ -156,9 +152,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
} }
let h = tokio::spawn(progress_reporter(counters.clone())); let h = tokio::spawn(progress_reporter(counters.clone()));
let c = BrokerClientChannel::connect_lazy(format!("http://{}", DEFAULT_LISTEN_ADDR)) let c = storage_broker::connect(DEFAULT_ENDPOINT).unwrap();
.await
.unwrap();
for i in 0..args.num_subs { for i in 0..args.num_subs {
let c = Some(c.clone()); let c = Some(c.clone());

View File

@@ -2,6 +2,7 @@ use hyper::body::HttpBody;
use std::pin::Pin; use std::pin::Pin;
use std::task::{Context, Poll}; use std::task::{Context, Poll};
use tonic::codegen::StdError; use tonic::codegen::StdError;
use tonic::transport::{ClientTlsConfig, Endpoint};
use tonic::{transport::Channel, Code, Status}; use tonic::{transport::Channel, Code, Status};
use utils::id::{TenantId, TenantTimelineId, TimelineId}; use utils::id::{TenantId, TenantTimelineId, TimelineId};
@@ -20,12 +21,35 @@ pub mod metrics;
pub use tonic::Request; pub use tonic::Request;
pub use tonic::Streaming; pub use tonic::Streaming;
pub const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:50051"; pub use hyper::Uri;
// NeonBrokerClient charged with tonic provided Channel transport; helps to pub const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:50051";
pub const DEFAULT_ENDPOINT: &str = const_format::formatcp!("http://{DEFAULT_LISTEN_ADDR}");
// BrokerServiceClient charged with tonic provided Channel transport; helps to
// avoid depending on tonic directly in user crates. // avoid depending on tonic directly in user crates.
pub type BrokerClientChannel = BrokerServiceClient<Channel>; pub type BrokerClientChannel = BrokerServiceClient<Channel>;
// Create connection object configured to run TLS if schema starts with https://
// and plain text otherwise. Connection is lazy, only endpoint sanity is
// validated here.
pub fn connect<U>(endpoint: U) -> anyhow::Result<BrokerClientChannel>
where
U: std::convert::TryInto<Uri>,
U::Error: std::error::Error + Send + Sync + 'static,
{
let uri: Uri = endpoint.try_into()?;
let mut tonic_endpoint: Endpoint = uri.into();
// If schema starts with https, start encrypted connection; do plain text
// otherwise.
if let Some("https") = tonic_endpoint.uri().scheme_str() {
let tls = ClientTlsConfig::new();
tonic_endpoint = tonic_endpoint.tls_config(tls)?;
}
let channel = tonic_endpoint.connect_lazy();
Ok(BrokerClientChannel::new(channel))
}
impl BrokerClientChannel { impl BrokerClientChannel {
/// Create a new client to the given endpoint, but don't actually connect until the first request. /// Create a new client to the given endpoint, but don't actually connect until the first request.
pub async fn connect_lazy<D>(dst: D) -> Result<Self, tonic::transport::Error> pub async fn connect_lazy<D>(dst: D) -> Result<Self, tonic::transport::Error>

View File

@@ -13,8 +13,6 @@ Prerequisites:
below to run from other directories. below to run from other directories.
- The neon git repo, including the postgres submodule - The neon git repo, including the postgres submodule
(for some tests, e.g. `pg_regress`) (for some tests, e.g. `pg_regress`)
- Some tests (involving storage nodes coordination) require etcd installed. Follow
[`the guide`](https://etcd.io/docs/v3.5/install/) to obtain it.
### Test Organization ### Test Organization

View File

@@ -33,7 +33,7 @@ from _pytest.config import Config
from _pytest.fixtures import FixtureRequest from _pytest.fixtures import FixtureRequest
from fixtures.log_helper import log from fixtures.log_helper import log
from fixtures.types import Lsn, TenantId, TimelineId from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import Fn, allure_attach_from_dir, etcd_path, get_self_dir, subprocess_capture from fixtures.utils import Fn, allure_attach_from_dir, get_self_dir, subprocess_capture
# Type-related stuff # Type-related stuff
from psycopg2.extensions import connection as PgConnection from psycopg2.extensions import connection as PgConnection
@@ -281,19 +281,22 @@ def port_distributor(worker_base_port: int) -> PortDistributor:
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def default_broker( def default_broker(
request: FixtureRequest, port_distributor: PortDistributor, top_output_dir: Path request: FixtureRequest,
) -> Iterator[Etcd]: port_distributor: PortDistributor,
top_output_dir: Path,
neon_binpath: Path,
) -> Iterator[NeonBroker]:
# multiple pytest sessions could get launched in parallel, get them different ports/datadirs
client_port = port_distributor.get_port() client_port = port_distributor.get_port()
# multiple pytest sessions could get launched in parallel, get them different datadirs broker_logfile = (
etcd_datadir = get_test_output_dir(request, top_output_dir) / f"etcd_datadir_{client_port}" get_test_output_dir(request, top_output_dir) / f"storage_broker_{client_port}.log"
etcd_datadir.mkdir(exist_ok=True, parents=True)
broker = Etcd(
datadir=str(etcd_datadir), port=client_port, peer_port=port_distributor.get_port()
) )
broker_logfile.parents[0].mkdir(exist_ok=True, parents=True)
broker = NeonBroker(logfile=broker_logfile, port=client_port, neon_binpath=neon_binpath)
yield broker yield broker
broker.stop() broker.stop()
allure_attach_from_dir(etcd_datadir) allure_attach_from_dir(Path(broker_logfile))
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
@@ -570,7 +573,7 @@ class NeonEnvBuilder:
self, self,
repo_dir: Path, repo_dir: Path,
port_distributor: PortDistributor, port_distributor: PortDistributor,
broker: Etcd, broker: NeonBroker,
run_id: uuid.UUID, run_id: uuid.UUID,
mock_s3_server: MockS3Server, mock_s3_server: MockS3Server,
neon_binpath: Path, neon_binpath: Path,
@@ -846,9 +849,8 @@ class NeonEnv:
toml += textwrap.dedent( toml += textwrap.dedent(
f""" f"""
[etcd_broker] [broker]
broker_endpoints = ['{self.broker.client_url()}'] listen_addr = '{self.broker.listen_addr()}'
etcd_binary_path = '{self.broker.binary_path}'
""" """
) )
@@ -949,7 +951,7 @@ def _shared_simple_env(
request: FixtureRequest, request: FixtureRequest,
port_distributor: PortDistributor, port_distributor: PortDistributor,
mock_s3_server: MockS3Server, mock_s3_server: MockS3Server,
default_broker: Etcd, default_broker: NeonBroker,
run_id: uuid.UUID, run_id: uuid.UUID,
top_output_dir: Path, top_output_dir: Path,
neon_binpath: Path, neon_binpath: Path,
@@ -1010,7 +1012,7 @@ def neon_env_builder(
neon_binpath: Path, neon_binpath: Path,
pg_distrib_dir: Path, pg_distrib_dir: Path,
pg_version: str, pg_version: str,
default_broker: Etcd, default_broker: NeonBroker,
run_id: uuid.UUID, run_id: uuid.UUID,
) -> Iterator[NeonEnvBuilder]: ) -> Iterator[NeonEnvBuilder]:
""" """
@@ -1743,7 +1745,7 @@ class NeonPageserver(PgProtocol):
# All tests print these, when starting up or shutting down # All tests print these, when starting up or shutting down
".*wal receiver task finished with an error: walreceiver connection handling failure.*", ".*wal receiver task finished with an error: walreceiver connection handling failure.*",
".*Shutdown task error: walreceiver connection handling failure.*", ".*Shutdown task error: walreceiver connection handling failure.*",
".*Etcd client error: grpc request error: status: Unavailable.*", ".*wal_connection_manager.*tcp connect error: Connection refused.*",
".*query handler for .* failed: Connection reset by peer.*", ".*query handler for .* failed: Connection reset by peer.*",
".*serving compute connection task.*exited with error: Broken pipe.*", ".*serving compute connection task.*exited with error: Broken pipe.*",
".*Connection aborted: error communicating with the server: Broken pipe.*", ".*Connection aborted: error communicating with the server: Broken pipe.*",
@@ -1834,7 +1836,6 @@ class NeonPageserver(PgProtocol):
def assert_no_errors(self): def assert_no_errors(self):
logfile = open(os.path.join(self.env.repo_dir, "pageserver.log"), "r") logfile = open(os.path.join(self.env.repo_dir, "pageserver.log"), "r")
error_or_warn = re.compile("ERROR|WARN") error_or_warn = re.compile("ERROR|WARN")
errors = [] errors = []
while True: while True:
@@ -2653,51 +2654,36 @@ class SafekeeperHttpClient(requests.Session):
@dataclass @dataclass
class Etcd: class NeonBroker:
"""An object managing etcd instance""" """An object managing storage_broker instance"""
datadir: str logfile: Path
port: int port: int
peer_port: int neon_binpath: Path
binary_path: Path = field(init=False)
handle: Optional[subprocess.Popen[Any]] = None # handle of running daemon handle: Optional[subprocess.Popen[Any]] = None # handle of running daemon
def __post_init__(self): def listen_addr(self):
self.binary_path = etcd_path() return f"127.0.0.1:{self.port}"
def client_url(self): def client_url(self):
return f"http://127.0.0.1:{self.port}" return f"http://{self.listen_addr()}"
def check_status(self): def check_status(self):
with requests.Session() as s: return True # TODO
s.mount("http://", requests.adapters.HTTPAdapter(max_retries=1)) # do not retry
s.get(f"{self.client_url()}/health").raise_for_status()
def try_start(self): def try_start(self):
if self.handle is not None: if self.handle is not None:
log.debug(f"etcd is already running on port {self.port}") log.debug(f"storage_broker is already running on port {self.port}")
return return
Path(self.datadir).mkdir(exist_ok=True) listen_addr = self.listen_addr()
log.info(f'starting storage_broker to listen incoming connections at "{listen_addr}"')
if not self.binary_path.is_file(): with open(self.logfile, "wb") as logfile:
raise RuntimeError(f"etcd broker binary '{self.binary_path}' is not a file")
client_url = self.client_url()
log.info(f'Starting etcd to listen incoming connections at "{client_url}"')
with open(os.path.join(self.datadir, "etcd.log"), "wb") as log_file:
args = [ args = [
self.binary_path, self.neon_binpath / "storage_broker",
f"--data-dir={self.datadir}", f"--listen-addr={listen_addr}",
f"--listen-client-urls={client_url}",
f"--advertise-client-urls={client_url}",
f"--listen-peer-urls=http://127.0.0.1:{self.peer_port}",
# Set --quota-backend-bytes to keep the etcd virtual memory
# size smaller. Our test etcd clusters are very small.
# See https://github.com/etcd-io/etcd/issues/7910
"--quota-backend-bytes=100000000",
] ]
self.handle = subprocess.Popen(args, stdout=log_file, stderr=log_file) self.handle = subprocess.Popen(args, stdout=logfile, stderr=logfile)
# wait for start # wait for start
started_at = time.time() started_at = time.time()
@@ -2707,7 +2693,9 @@ class Etcd:
except Exception as e: except Exception as e:
elapsed = time.time() - started_at elapsed = time.time() - started_at
if elapsed > 5: if elapsed > 5:
raise RuntimeError(f"timed out waiting {elapsed:.0f}s for etcd start: {e}") raise RuntimeError(
f"timed out waiting {elapsed:.0f}s for storage_broker start: {e}"
)
time.sleep(0.5) time.sleep(0.5)
else: else:
break # success break # success

View File

@@ -1,7 +1,6 @@
import contextlib import contextlib
import os import os
import re import re
import shutil
import subprocess import subprocess
import tarfile import tarfile
import time import time
@@ -74,13 +73,6 @@ def print_gc_result(row: Dict[str, Any]):
) )
def etcd_path() -> Path:
path_output = shutil.which("etcd")
if path_output is None:
raise RuntimeError("etcd not found in PATH")
return Path(path_output)
def query_scalar(cur: cursor, query: str) -> Any: def query_scalar(cur: cursor, query: str) -> Any:
""" """
It is a convenience wrapper to avoid repetitions It is a convenience wrapper to avoid repetitions

View File

@@ -97,17 +97,19 @@ def test_backward_compatibility(
), "COMPATIBILITY_SNAPSHOT_DIR is not set. It should be set to `compatibility_snapshot_pg14` path generateted by test_create_snapshot (ideally generated by the previous version of Neon)" ), "COMPATIBILITY_SNAPSHOT_DIR is not set. It should be set to `compatibility_snapshot_pg14` path generateted by test_create_snapshot (ideally generated by the previous version of Neon)"
compatibility_snapshot_dir = Path(compatibility_snapshot_dir_env).resolve() compatibility_snapshot_dir = Path(compatibility_snapshot_dir_env).resolve()
# Copy the snapshot to current directory, and prepare for the test
prepare_snapshot(
from_dir=compatibility_snapshot_dir,
to_dir=test_output_dir / "compatibility_snapshot",
port_distributor=port_distributor,
)
breaking_changes_allowed = ( breaking_changes_allowed = (
os.environ.get("ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true" os.environ.get("ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true"
) )
try: try:
# Copy the snapshot to current directory, and prepare for the test
prepare_snapshot(
from_dir=compatibility_snapshot_dir,
to_dir=test_output_dir / "compatibility_snapshot",
neon_binpath=neon_binpath,
port_distributor=port_distributor,
)
check_neon_works( check_neon_works(
test_output_dir / "compatibility_snapshot" / "repo", test_output_dir / "compatibility_snapshot" / "repo",
neon_binpath, neon_binpath,
@@ -155,18 +157,21 @@ def test_forward_compatibility(
compatibility_snapshot_dir = ( compatibility_snapshot_dir = (
test_output_dir.parent / "test_create_snapshot" / "compatibility_snapshot_pg14" test_output_dir.parent / "test_create_snapshot" / "compatibility_snapshot_pg14"
) )
# Copy the snapshot to current directory, and prepare for the test
prepare_snapshot(
from_dir=compatibility_snapshot_dir,
to_dir=test_output_dir / "compatibility_snapshot",
port_distributor=port_distributor,
pg_distrib_dir=compatibility_postgres_distrib_dir,
)
breaking_changes_allowed = ( breaking_changes_allowed = (
os.environ.get("ALLOW_FORWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true" os.environ.get("ALLOW_FORWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true"
) )
try: try:
# Copy the snapshot to current directory, and prepare for the test
prepare_snapshot(
from_dir=compatibility_snapshot_dir,
to_dir=test_output_dir / "compatibility_snapshot",
port_distributor=port_distributor,
neon_binpath=compatibility_neon_bin,
pg_distrib_dir=compatibility_postgres_distrib_dir,
)
check_neon_works( check_neon_works(
test_output_dir / "compatibility_snapshot" / "repo", test_output_dir / "compatibility_snapshot" / "repo",
compatibility_neon_bin, compatibility_neon_bin,
@@ -194,6 +199,7 @@ def prepare_snapshot(
from_dir: Path, from_dir: Path,
to_dir: Path, to_dir: Path,
port_distributor: PortDistributor, port_distributor: PortDistributor,
neon_binpath: Path,
pg_distrib_dir: Optional[Path] = None, pg_distrib_dir: Optional[Path] = None,
): ):
assert from_dir.exists(), f"Snapshot '{from_dir}' doesn't exist" assert from_dir.exists(), f"Snapshot '{from_dir}' doesn't exist"
@@ -227,9 +233,14 @@ def prepare_snapshot(
pageserver_config["listen_pg_addr"] = port_distributor.replace_with_new_port( pageserver_config["listen_pg_addr"] = port_distributor.replace_with_new_port(
pageserver_config["listen_pg_addr"] pageserver_config["listen_pg_addr"]
) )
pageserver_config["broker_endpoints"] = [ # since storage_broker these are overriden by neon_local during pageserver
port_distributor.replace_with_new_port(ep) for ep in pageserver_config["broker_endpoints"] # start; remove both to prevent unknown options during etcd ->
] # storage_broker migration. TODO: remove once broker is released
pageserver_config.pop("broker_endpoint", None)
pageserver_config.pop("broker_endpoints", None)
etcd_broker_endpoints = [f"http://localhost:{port_distributor.get_port()}/"]
if get_neon_version(neon_binpath) == "49da498f651b9f3a53b56c7c0697636d880ddfe0":
pageserver_config["broker_endpoints"] = etcd_broker_endpoints # old etcd version
if pg_distrib_dir: if pg_distrib_dir:
pageserver_config["pg_distrib_dir"] = str(pg_distrib_dir) pageserver_config["pg_distrib_dir"] = str(pg_distrib_dir)
@@ -239,10 +250,22 @@ def prepare_snapshot(
snapshot_config_toml = repo_dir / "config" snapshot_config_toml = repo_dir / "config"
snapshot_config = toml.load(snapshot_config_toml) snapshot_config = toml.load(snapshot_config_toml)
snapshot_config["etcd_broker"]["broker_endpoints"] = [
port_distributor.replace_with_new_port(ep) # Provide up/downgrade etcd <-> storage_broker to make forward/backward
for ep in snapshot_config["etcd_broker"]["broker_endpoints"] # compatibility test happy. TODO: leave only the new part once broker is released.
] if get_neon_version(neon_binpath) == "49da498f651b9f3a53b56c7c0697636d880ddfe0":
# old etcd version
snapshot_config["etcd_broker"] = {
"etcd_binary_path": shutil.which("etcd"),
"broker_endpoints": etcd_broker_endpoints,
}
snapshot_config.pop("broker", None)
else:
# new storage_broker version
broker_listen_addr = f"127.0.0.1:{port_distributor.get_port()}"
snapshot_config["broker"] = {"listen_addr": broker_listen_addr}
snapshot_config.pop("etcd_broker", None)
snapshot_config["pageserver"]["listen_http_addr"] = port_distributor.replace_with_new_port( snapshot_config["pageserver"]["listen_http_addr"] = port_distributor.replace_with_new_port(
snapshot_config["pageserver"]["listen_http_addr"] snapshot_config["pageserver"]["listen_http_addr"]
) )
@@ -277,6 +300,12 @@ def prepare_snapshot(
), f"there're files referencing `test_create_snapshot/repo`, this path should be replaced with {repo_dir}:\n{rv.stdout}" ), f"there're files referencing `test_create_snapshot/repo`, this path should be replaced with {repo_dir}:\n{rv.stdout}"
# get git SHA of neon binary
def get_neon_version(neon_binpath: Path):
out = subprocess.check_output([neon_binpath / "neon_local", "--version"]).decode("utf-8")
return out.split("git:", 1)[1].rstrip()
def check_neon_works( def check_neon_works(
repo_dir: Path, repo_dir: Path,
neon_binpath: Path, neon_binpath: Path,

View File

@@ -7,7 +7,7 @@ from typing import Any, Dict, Optional, Tuple
import pytest import pytest
from fixtures.log_helper import log from fixtures.log_helper import log
from fixtures.neon_fixtures import ( from fixtures.neon_fixtures import (
Etcd, NeonBroker,
NeonEnv, NeonEnv,
NeonEnvBuilder, NeonEnvBuilder,
PageserverHttpClient, PageserverHttpClient,
@@ -32,7 +32,7 @@ def new_pageserver_service(
remote_storage_mock_path: Path, remote_storage_mock_path: Path,
pg_port: int, pg_port: int,
http_port: int, http_port: int,
broker: Optional[Etcd], broker: Optional[NeonBroker],
pg_distrib_dir: Path, pg_distrib_dir: Path,
): ):
""" """
@@ -53,7 +53,7 @@ def new_pageserver_service(
] ]
if broker is not None: if broker is not None:
cmd.append( cmd.append(
f"-c broker_endpoints=['{broker.client_url()}']", f"-c broker_endpoint='{broker.client_url()}'",
) )
pageserver_client = PageserverHttpClient( pageserver_client = PageserverHttpClient(
port=http_port, port=http_port,

View File

@@ -16,7 +16,7 @@ from typing import Any, List, Optional
import pytest import pytest
from fixtures.log_helper import log from fixtures.log_helper import log
from fixtures.neon_fixtures import ( from fixtures.neon_fixtures import (
Etcd, NeonBroker,
NeonEnv, NeonEnv,
NeonEnvBuilder, NeonEnvBuilder,
NeonPageserver, NeonPageserver,
@@ -520,7 +520,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
) )
# advance remote_consistent_lsn to trigger WAL trimming # advance remote_consistent_lsn to trigger WAL trimming
# this LSN should be less than commit_lsn, so timeline will be active=true in safekeepers, to push etcd updates # this LSN should be less than commit_lsn, so timeline will be active=true in safekeepers, to push broker updates
env.safekeepers[0].http_client().record_safekeeper_info( env.safekeepers[0].http_client().record_safekeeper_info(
tenant_id, timeline_id, {"remote_consistent_lsn": str(offloaded_seg_end)} tenant_id, timeline_id, {"remote_consistent_lsn": str(offloaded_seg_end)}
) )
@@ -812,10 +812,10 @@ class SafekeeperEnv:
): ):
self.repo_dir = repo_dir self.repo_dir = repo_dir
self.port_distributor = port_distributor self.port_distributor = port_distributor
self.broker = Etcd( self.broker = NeonBroker(
datadir=os.path.join(self.repo_dir, "etcd"), logfile=Path(self.repo_dir) / "storage_broker.log",
port=self.port_distributor.get_port(), port=self.port_distributor.get_port(),
peer_port=self.port_distributor.get_port(), neon_binpath=neon_binpath,
) )
self.pg_bin = pg_bin self.pg_bin = pg_bin
self.num_safekeepers = num_safekeepers self.num_safekeepers = num_safekeepers
@@ -863,7 +863,7 @@ class SafekeeperEnv:
str(safekeeper_dir), str(safekeeper_dir),
"--id", "--id",
str(i), str(i),
"--broker-endpoints", "--broker-endpoint",
self.broker.client_url(), self.broker.client_url(),
] ]
log.info(f'Running command "{" ".join(cmd)}"') log.info(f'Running command "{" ".join(cmd)}"')

View File

@@ -32,14 +32,14 @@ nom = { version = "7", features = ["alloc", "std"] }
num-bigint = { version = "0.4", features = ["std"] } num-bigint = { version = "0.4", features = ["std"] }
num-integer = { version = "0.1", features = ["i128", "std"] } num-integer = { version = "0.1", features = ["i128", "std"] }
num-traits = { version = "0.2", features = ["i128", "libm", "std"] } num-traits = { version = "0.2", features = ["i128", "libm", "std"] }
prost-93f6ce9d446188ac = { package = "prost", version = "0.10", features = ["prost-derive", "std"] } prost = { version = "0.11", features = ["prost-derive", "std"] }
prost-a6292c17cd707f01 = { package = "prost", version = "0.11", features = ["prost-derive", "std"] }
rand = { version = "0.8", features = ["alloc", "getrandom", "libc", "rand_chacha", "rand_hc", "small_rng", "std", "std_rng"] } rand = { version = "0.8", features = ["alloc", "getrandom", "libc", "rand_chacha", "rand_hc", "small_rng", "std", "std_rng"] }
regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] }
regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] }
reqwest = { version = "0.11", default-features = false, features = ["__rustls", "__tls", "blocking", "default-tls", "hyper-rustls", "hyper-tls", "json", "native-tls-crate", "rustls", "rustls-pemfile", "rustls-tls", "rustls-tls-webpki-roots", "serde_json", "tokio-native-tls", "tokio-rustls", "webpki-roots"] } reqwest = { version = "0.11", default-features = false, features = ["__rustls", "__tls", "blocking", "default-tls", "hyper-rustls", "hyper-tls", "json", "native-tls-crate", "rustls", "rustls-pemfile", "rustls-tls", "rustls-tls-webpki-roots", "serde_json", "tokio-native-tls", "tokio-rustls", "webpki-roots"] }
scopeguard = { version = "1", features = ["use_std"] } scopeguard = { version = "1", features = ["use_std"] }
serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] }
socket2 = { version = "0.4", default-features = false, features = ["all"] }
stable_deref_trait = { version = "1", features = ["alloc", "std"] } stable_deref_trait = { version = "1", features = ["alloc", "std"] }
tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] } tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] }
tokio-util = { version = "0.7", features = ["codec", "io", "io-util", "tracing"] } tokio-util = { version = "0.7", features = ["codec", "io", "io-util", "tracing"] }
@@ -59,8 +59,7 @@ libc = { version = "0.2", features = ["extra_traits", "std"] }
log = { version = "0.4", default-features = false, features = ["serde", "std"] } log = { version = "0.4", default-features = false, features = ["serde", "std"] }
memchr = { version = "2", features = ["std"] } memchr = { version = "2", features = ["std"] }
nom = { version = "7", features = ["alloc", "std"] } nom = { version = "7", features = ["alloc", "std"] }
prost-93f6ce9d446188ac = { package = "prost", version = "0.10", features = ["prost-derive", "std"] } prost = { version = "0.11", features = ["prost-derive", "std"] }
prost-a6292c17cd707f01 = { package = "prost", version = "0.11", features = ["prost-derive", "std"] }
regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] }
regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] }
serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] }