Compare commits

...

1 Commits

Author SHA1 Message Date
Arthur Petukhovsky
f1fcb4d0d7 Patch compute_ctl to run basebackup 2025-01-10 02:03:59 +01:00
8 changed files with 170 additions and 1221 deletions

View File

@@ -24,3 +24,4 @@
!storage_controller/
!vendor/postgres-*/
!workspace_hack/
!debug-oom/

4
Cargo.lock generated
View File

@@ -4494,9 +4494,9 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
[[package]]
name = "pq-sys"
version = "0.6.3"
version = "0.4.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6cc05d7ea95200187117196eee9edd0644424911821aeb28a18ce60ea0b8793"
checksum = "31c0052426df997c0cbd30789eb44ca097e3541717a7b8fa36b1c464ee7edebd"
dependencies = [
"vcpkg",
]

File diff suppressed because it is too large Load Diff

View File

@@ -68,6 +68,7 @@ use compute_tools::spec::*;
use compute_tools::swap::resize_swap;
use rlimit::{setrlimit, Resource};
use utils::failpoint_support;
use utils::id::{TenantId, TimelineId};
// this is an arbitrary build tag. Fine as a default / for testing purposes
// in-case of not-set environment var
@@ -87,9 +88,9 @@ fn main() -> Result<()> {
let cli_args = process_cli(&clap_args)?;
let cli_spec = try_spec_from_cli(&clap_args, &cli_args)?;
// let cli_spec = try_spec_from_cli(&clap_args, &cli_args)?;
let wait_spec_result = wait_spec(build_tag, cli_args, cli_spec)?;
let wait_spec_result = wait_spec(build_tag, cli_args)?;
start_postgres(&clap_args, wait_spec_result)?
@@ -313,14 +314,41 @@ fn wait_spec(
http_port,
..
}: ProcessCliResult,
CliSpecParams {
spec,
live_config_allowed,
}: CliSpecParams,
) -> Result<WaitSpecResult> {
let mut new_state = ComputeState::new();
let spec_set;
let live_config_allowed = true;
let spec = Some(ComputeSpec {
// format_version: todo!(),
// operation_uuid: todo!(),
// features: todo!(),
// swap_size_bytes: todo!(),
// disk_quota_bytes: todo!(),
// disable_lfc_resizing: todo!(),
// cluster: todo!(),
// delta_operations: todo!(),
// skip_pg_catalog_updates: todo!(),
// tenant_id: todo!(),
// timeline_id: todo!(),
// pageserver_connstring: todo!(),
// safekeeper_connstrings: todo!(),
// mode: todo!(),
// storage_auth_token: todo!(),
// remote_extensions: todo!(),
// pgbouncer_settings: todo!(),
// shard_stripe_size: todo!(),
// local_proxy_config: todo!(),
// reconfigure_concurrency: todo!(),
pageserver_connstring: Some("pageserver-1.example.com:5432".to_string()),
safekeeper_connstrings: vec!["safekeeper-1.example.com:5432".to_string()],
tenant_id: Some(TenantId::generate()),
timeline_id: Some(TimelineId::generate()),
..Default::default()
});
if let Some(spec) = spec {
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
info!("new pspec.spec: {:?}", pspec.spec);
@@ -355,9 +383,7 @@ fn wait_spec(
// available for binding. Prewarming helps Postgres start quicker later,
// because QEMU will already have its memory allocated from the host, and
// the necessary binaries will already be cached.
if !spec_set {
compute.prewarm_postgres()?;
}
compute.prewarm_postgres()?;
// Launch http service first, so that we can serve control-plane requests
// while configuration is still in progress.

View File

@@ -358,64 +358,22 @@ impl ComputeNode {
let spec = compute_state.pspec.as_ref().expect("spec must be set");
let start_time = Instant::now();
let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
let mut config = postgres::Config::from_str(shard0_connstr)?;
// Use the storage auth token from the config file, if given.
// Note: this overrides any password set in the connection string.
if let Some(storage_auth_token) = &spec.storage_auth_token {
info!("Got storage auth token from spec file");
config.password(storage_auth_token);
} else {
info!("Storage auth token not set");
}
// Connect to pageserver
let mut client = config.connect(NoTls)?;
let pageserver_connect_micros = start_time.elapsed().as_micros() as u64;
let basebackup_cmd = match lsn {
Lsn(0) => {
if spec.spec.mode != ComputeMode::Primary {
format!(
"basebackup {} {} --gzip --replica",
spec.tenant_id, spec.timeline_id
)
} else {
format!("basebackup {} {} --gzip", spec.tenant_id, spec.timeline_id)
}
}
_ => {
if spec.spec.mode != ComputeMode::Primary {
format!(
"basebackup {} {} {} --gzip --replica",
spec.tenant_id, spec.timeline_id, lsn
)
} else {
format!(
"basebackup {} {} {} --gzip",
spec.tenant_id, spec.timeline_id, lsn
)
}
}
};
let copyreader = client.copy_out(basebackup_cmd.as_str())?;
let mut measured_reader = MeasuredReader::new(copyreader);
// Open backup file directly
let backup_file = std::fs::File::open("/var/db/backups/backup.tar.gz")?;
let mut measured_reader = MeasuredReader::new(backup_file);
let mut bufreader = std::io::BufReader::new(&mut measured_reader);
// Read the archive directly from the `CopyOutReader`
// Read the archive directly from the file
//
// Set `ignore_zeros` so that unpack() reads all the Copy data and
// doesn't stop at the end-of-archive marker. Otherwise, if the server
// sends an Error after finishing the tarball, we will not notice it.
// doesn't stop at the end-of-archive marker.
let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut bufreader));
ar.set_ignore_zeros(true);
ar.unpack(&self.pgdata)?;
// Report metrics
let mut state = self.state.lock().unwrap();
state.metrics.pageserver_connect_micros = pageserver_connect_micros;
state.metrics.pageserver_connect_micros = 0;
state.metrics.basebackup_bytes = measured_reader.get_byte_count() as u64;
state.metrics.basebackup_ms = start_time.elapsed().as_millis() as u64;
Ok(())
@@ -628,32 +586,7 @@ impl ComputeNode {
self.http_port,
)?;
// Syncing safekeepers is only safe with primary nodes: if a primary
// is already connected it will be kicked out, so a secondary (standby)
// cannot sync safekeepers.
let lsn = match spec.mode {
ComputeMode::Primary => {
info!("checking if safekeepers are synced");
let lsn = if let Ok(Some(lsn)) = self.check_safekeepers_synced(compute_state) {
lsn
} else {
info!("starting safekeepers syncing");
self.sync_safekeepers(pspec.storage_auth_token.clone())
.with_context(|| "failed to sync safekeepers")?
};
info!("safekeepers synced at LSN {}", lsn);
lsn
}
ComputeMode::Static(lsn) => {
info!("Starting read-only node at static LSN {}", lsn);
lsn
}
ComputeMode::Replica => {
info!("Initializing standby from latest Pageserver LSN");
Lsn(0)
}
};
let lsn = Lsn(0);
info!(
"getting basebackup@{} from pageserver {}",
lsn, &pspec.pageserver_connstr

1
debug-oom/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
backup.tar.gz

21
debug-oom/README.md Normal file
View File

@@ -0,0 +1,21 @@
To build a compute image:
```
docker build --build-arg GIT_VERSION=custombuild --build-arg PG_VERSION=v16 -t neon-local-v16 -f ../compute/compute-node.Dockerfile .. && \
../../autoscaling/bin/vm-builder \
-spec=../compute/vm-image-spec-bullseye.yaml \
-src=neon-local-v16:latest \
-dst=vm-neon-local-v16:latest \
-target-arch=linux/amd64 \
-size 2G && \
../../autoscaling/bin/kind load docker-image vm-neon-local-v16:latest --name neonvm-arthur
```
To start a compute node:
```
kubectl apply -f ./spec.yml
```
To destroy the compute node:
```
kubectl delete -f ./spec.yml
```

99
debug-oom/spec.yml Normal file
View File

@@ -0,0 +1,99 @@
apiVersion: vm.neon.tech/v1
kind: VirtualMachine
metadata:
annotations:
autoscaling.neon.tech/bounds: '{"min":{"cpu":"250m","mem":"1Gi"},"max":{"cpu":"2","mem":"8Gi"}}'
autoscaling.neon.tech/config: '{"enableLFCMetrics":true}'
creationTimestamp: "2025-01-04T18:37:29Z"
finalizers:
- vm.neon.tech/finalizer
generation: 1
labels:
autoscaling.neon.tech/enabled: "true"
neon/component: compute-node
neon/compute-id: compute-purple-art-unreal
neon/endpoint-id: ep-unreal
name: compute-purple-art-unreal
namespace: default
spec:
cpuScalingMode: QmpScaling
disks:
- emptyDisk:
discard: true
size: 36096Mi
mountPath: /neonvm/cache
name: cache
readOnly: false
- emptyDisk:
discard: true
enableQuotas: true
size: 150Gi
mountPath: /var/db/postgres/compute
name: pgdata
readOnly: false
enableAcceleration: true
enableNetworkMonitoring: false
enableSSH: true
guest:
args:
- -c
- /usr/local/bin/compute_ctl -D /var/db/postgres/compute/pgdata -b /usr/local/bin/postgres
-C postgresql://cloud_admin@127.0.0.1/postgres?options=-c%20default_transaction_read_only%3Dfalse
--compute-id compute-purple-art-unreal --control-plane-uri http://dontexist.local:9096
--resize-swap-on-bind --set-disk-quota-for-fs /var/db/postgres/compute 2>&1
command:
- /bin/sh
cpus:
max: 10
min: 250m
use: 500m
env:
- name: RUST_LOG
value: info
- name: OTEL_SDK_DISABLED
value: "true"
- name: AUTOSCALING
value: "true"
memorySlotSize: 1Gi
memorySlots:
max: 40
min: 1
use: 2
ports:
- name: postgres
port: 5432
protocol: TCP
- name: control
port: 3080
protocol: TCP
- name: pooler
port: 6432
protocol: TCP
- name: host-metrics
port: 9100
protocol: TCP
- name: metrics
port: 9187
protocol: TCP
- name: sql-exporter
port: 9399
protocol: TCP
- name: sql-exporter-2
port: 9499
protocol: TCP
- name: vm-monitor
port: 10301
protocol: TCP
- name: local-proxy
port: 10432
protocol: TCP
rootDisk:
image: vm-neon-local-v16
imagePullPolicy: IfNotPresent
size: 20Gi
settings:
swap: 40Gi
sysctl:
- vm.overcommit_memory=2
restartPolicy: Always
schedulerName: autoscale-scheduler