Compare commits

..

84 Commits

Author SHA1 Message Date
Heikki Linnakangas
db1ea8b430 WIP: get rid of layer_removal_cs 2022-11-28 11:47:46 +02:00
Heikki Linnakangas
2dad3a6445 Make schedule_layer_file_deletion atomic.
Make sure that we don't mutate the upload queue, until we're sure that
we will complete all the changse. Otherwise, we could bail out after
removing some files from upload_queue.latest_files, but not all.

This is purely theoretical: there were only two ? calls in the
function, and neither of them should actually return an error:

1. `RelativePath::from_local_path` only returns error if the argument
  path is not in the base directory. I.e. in this case, if the argument
  path was outside the timeline directory. Shouldn't happen.

2. latest_metadata.to_bytes() only returns an error if the serde
   serialization fails. It really shouldn't fail.

I considered turning those into panics instead, but as long as the
function as whole can return an Err, the callers need to be prepared
for that anyway, so there's little difference. Nevertheless, I
refactored the code a little to make it atomic even one of those
can't-happen errors happen after all. And I used a closure to make it
harder to introduce new dangerous ?-operators in the future.
2022-11-25 22:58:05 +02:00
Christian Schwarz
d36fd4f141 merge_local_remote_metadata: bail out in case where local has metadata file but remote is newer than local 2022-11-25 15:45:21 -05:00
Heikki Linnakangas
a8a6603e66 rustfmt 2022-11-25 22:20:23 +02:00
Christian Schwarz
87c82f5151 storage_sync/delete.rs: convert FIXME into follow-up https://github.com/neondatabase/neon/issues/2934 2022-11-25 14:57:05 -05:00
Christian Schwarz
c9a2b1fd10 startup: transition tenant into Broken state if config load fails (instead of skipping over it and not adding it to the tenants map) 2022-11-25 14:55:51 -05:00
Heikki Linnakangas
fdfa86b5b0 Put back check that a timeline must have ancestor or some layer files. 2022-11-25 21:49:12 +02:00
Heikki Linnakangas
3eb85957df Little cleanup around measure_remote_op calls
- Previously, the functions in download.rs did the measurement themselves,
  whereas for functions in delete.rs and upload.rs, it was the caller's
  responsibility. Move the measure_remote_op calls from download.rs to
  the callers, for consistency.
- Remove pointless async blocks in upload.rs and delete.rs. They would've
  been useful for inserting the measure_remote_op calls, but since the
  caller's are responsible for measure_remote_op now, they're not neeed.
- tiny cosmetic cleanup around imports
2022-11-25 21:05:06 +02:00
Christian Schwarz
6b4a28bf7f remove TODO on missing tests that will be covered later by https://github.com/neondatabase/neon/pull/2928 2022-11-25 13:27:07 -05:00
Christian Schwarz
22d6c1dda6 Merge remote-tracking branch 'origin/dkr/on-demand-split/per-tenant-remote-sync' into problame/for-dkr/on-demand-split/per-tenant-remote-sync 2022-11-25 13:22:31 -05:00
Christian Schwarz
abfac6ef2a Merge remote-tracking branch 'origin/main' into HEAD
Conflicts:
	libs/pageserver_api/src/models.rs
	pageserver/src/lib.rs
	pageserver/src/tenant_mgr.rs

There was a merge conflict following attach_tenant() where
I didn't understand why Git called out a conflict.
I went through the changes in `origin/main` since the last
merge done by Heikki, couldn't find anything that would
conflict there.

Original git diff right after after `git merge` follows:

   diff --cc libs/pageserver_api/src/models.rs
   index 750585b58,aefd79336..000000000
   --- a/libs/pageserver_api/src/models.rs
   +++ b/libs/pageserver_api/src/models.rs
   @@@ -15,17 -15,13 +15,27 @@@ use bytes::{BufMut, Bytes, BytesMut}
     /// A state of a tenant in pageserver's memory.
     #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
     pub enum TenantState {
   ++<<<<<<< HEAD
    +    // This tenant is being loaded from local disk
    +    Loading,
    +    // This tenant is being downloaded from cloud storage.
    +    Attaching,
    +    /// Tenant is fully operational
    +    Active,
    +    /// A tenant is recognized by pageserver, but it is being detached or the system is being
    +    /// shut down.
    +    Paused,
    +    /// A tenant is recognized by the pageserver, but can no longer used for any operations,
    +    /// because it failed to get activated.
   ++=======
   +     /// Tenant is fully operational, its background jobs might be running or not.
   +     Active { background_jobs_running: bool },
   +     /// A tenant is recognized by pageserver, but it is being detached or the
   +     /// system is being shut down.
   +     Paused,
   +     /// A tenant is recognized by the pageserver, but can no longer be used for
   +     /// any operations, because it failed to be activated.
   ++>>>>>>> origin/main
         Broken,
     }

   diff --cc pageserver/src/lib.rs
   index 2d5b66f57,e3112223e..000000000
   --- a/pageserver/src/lib.rs
   +++ b/pageserver/src/lib.rs
   @@@ -22,7 -23,11 +23,13 @@@ pub mod walreceiver
     pub mod walrecord;
     pub mod walredo;

   ++<<<<<<< HEAD
   ++=======
   + use std::collections::HashMap;
   + use std::path::Path;
   +
   ++>>>>>>> origin/main
     use tracing::info;
    -use utils:🆔:{TenantId, TimelineId};

     use crate::task_mgr::TaskKind;

   @@@ -103,14 -108,51 +110,64 @@@ fn exponential_backoff_duration_seconds
         }
     }

   ++<<<<<<< HEAD
    +/// A suffix to be used during file sync from the remote storage,
    +/// to ensure that we do not leave corrupted files that pretend to be layers.
    +const TEMP_FILE_SUFFIX: &str = "___temp";
   ++=======
   + /// A newtype to store arbitrary data grouped by tenant and timeline ids.
   + /// One could use [`utils:🆔:TenantTimelineId`] for grouping, but that would
   + /// not include the cases where a certain tenant has zero timelines.
   + /// This is sometimes important: a tenant could be registered during initial load from FS,
   + /// even if he has no timelines on disk.
   + #[derive(Debug)]
   + pub struct TenantTimelineValues<T>(HashMap<TenantId, HashMap<TimelineId, T>>);
   +
   + impl<T> TenantTimelineValues<T> {
   +     fn new() -> Self {
   +         Self(HashMap::new())
   +     }
   + }
   +
   + /// The name of the metadata file pageserver creates per timeline.
   + /// Full path: `tenants/<tenant_id>/timelines/<timeline_id>/metadata`.
   + pub const METADATA_FILE_NAME: &str = "metadata";
   +
   + /// Per-tenant configuration file.
   + /// Full path: `tenants/<tenant_id>/config`.
   + pub const TENANT_CONFIG_NAME: &str = "config";
   +
   + /// A suffix used for various temporary files. Any temporary files found in the
   + /// data directory at pageserver startup can be automatically removed.
   + pub const TEMP_FILE_SUFFIX: &str = "___temp";
   +
   + /// A marker file to mark that a timeline directory was not fully initialized.
   + /// If a timeline directory with this marker is encountered at pageserver startup,
   + /// the timeline directory and the marker file are both removed.
   + /// Full path: `tenants/<tenant_id>/timelines/<timeline_id>___uninit`.
   + pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
   +
   + pub fn is_temporary(path: &Path) -> bool {
   +     match path.file_name() {
   +         Some(name) => name.to_string_lossy().ends_with(TEMP_FILE_SUFFIX),
   +         None => false,
   +     }
   + }
   +
   + pub fn is_uninit_mark(path: &Path) -> bool {
   +     match path.file_name() {
   +         Some(name) => name
   +             .to_string_lossy()
   +             .ends_with(TIMELINE_UNINIT_MARK_SUFFIX),
   +         None => false,
   ++    }
   ++}
   ++>>>>>>> origin/main
    +
    +pub fn is_temporary(path: &std::path::Path) -> bool {
    +    match path.file_name() {
    +        Some(name) => name.to_string_lossy().ends_with(TEMP_FILE_SUFFIX),
    +        None => false,
         }
     }

   diff --cc pageserver/src/tenant_mgr.rs
   index 73593bc48,061d7fa19..000000000
   --- a/pageserver/src/tenant_mgr.rs
   +++ b/pageserver/src/tenant_mgr.rs
   @@@ -13,11 -13,18 +13,22 @@@ use tracing::*
     use remote_storage::GenericRemoteStorage;

     use crate::config::PageServerConf;
   ++<<<<<<< HEAD
   ++=======
   + use crate::http::models::TenantInfo;
   + use crate::storage_sync::index::{LayerFileMetadata, RemoteIndex, RemoteTimelineIndex};
   + use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData, TimelineLocalFiles};
   ++>>>>>>> origin/main
     use crate::task_mgr::{self, TaskKind};
    -use crate::tenant::{
    -    ephemeral_file::is_ephemeral_file, metadata::TimelineMetadata, Tenant, TenantState,
    -};
    +use crate::tenant::{Tenant, TenantState};
     use crate::tenant_config::TenantConfOpt;
   ++<<<<<<< HEAD
   ++=======
   + use crate::walredo::PostgresRedoManager;
   + use crate::{is_temporary, is_uninit_mark, METADATA_FILE_NAME, TEMP_FILE_SUFFIX};
   ++>>>>>>> origin/main

    -use utils::crashsafe::{self, path_with_suffix_extension};
    +use utils::fs_ext::PathExt;
     use utils:🆔:{TenantId, TimelineId};

     mod tenants_state {
   @@@ -341,87 -521,334 +352,247 @@@ pub fn list_tenants() -> Vec<(TenantId
             .collect()
     }

    -#[derive(Debug)]
    -pub enum TenantAttachData {
    -    Ready(HashMap<TimelineId, TimelineLocalFiles>),
    -    Broken(anyhow::Error),
    -}
    -/// Attempts to collect information about all tenant and timelines, existing on the local FS.
    -/// If finds any, deletes all temporary files and directories, created before. Also removes empty directories,
    -/// that may appear due to such removals.
    -/// Does not fail on particular timeline or tenant collection errors, rather logging them and ignoring the entities.
    -fn local_tenant_timeline_files(
    -    config: &'static PageServerConf,
    -) -> anyhow::Result<HashMap<TenantId, TenantAttachData>> {
    -    let _entered = info_span!("local_tenant_timeline_files").entered();
    -
    -    let mut local_tenant_timeline_files = HashMap::new();
    -    let tenants_dir = config.tenants_path();
    -    for tenants_dir_entry in fs::read_dir(&tenants_dir)
    -        .with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))?
    -    {
    -        match &tenants_dir_entry {
    -            Ok(tenants_dir_entry) => {
    -                let tenant_dir_path = tenants_dir_entry.path();
    -                if is_temporary(&tenant_dir_path) {
    -                    info!(
    -                        "Found temporary tenant directory, removing: {}",
    -                        tenant_dir_path.display()
    -                    );
    -                    if let Err(e) = fs::remove_dir_all(&tenant_dir_path) {
    -                        error!(
    -                            "Failed to remove temporary directory '{}': {:?}",
    -                            tenant_dir_path.display(),
    -                            e
    -                        );
    -                    }
    -                } else {
    -                    match collect_timelines_for_tenant(config, &tenant_dir_path) {
    -                        Ok((tenant_id, TenantAttachData::Broken(e))) => {
    -                            local_tenant_timeline_files.entry(tenant_id).or_insert(TenantAttachData::Broken(e));
    -                        },
    -                        Ok((tenant_id, TenantAttachData::Ready(collected_files))) => {
    -                            if collected_files.is_empty() {
    -                                match remove_if_empty(&tenant_dir_path) {
    -                                    Ok(true) => info!("Removed empty tenant directory {}", tenant_dir_path.display()),
    -                                    Ok(false) => {
    -                                        // insert empty timeline entry: it has some non-temporary files inside that we cannot remove
    -                                        // so make obvious for HTTP API callers, that something exists there and try to load the tenant
    -                                        let _ = local_tenant_timeline_files.entry(tenant_id).or_insert_with(|| TenantAttachData::Ready(HashMap::new()));
    -                                    },
    -                                    Err(e) => error!("Failed to remove empty tenant directory: {e:?}"),
    -                                }
    -                            } else {
    -                                match local_tenant_timeline_files.entry(tenant_id) {
    -                                    hash_map::Entry::Vacant(entry) => {
    -                                        entry.insert(TenantAttachData::Ready(collected_files));
    -                                    }
    -                                    hash_map::Entry::Occupied(entry) =>{
    -                                        if let TenantAttachData::Ready(old_timelines) = entry.into_mut() {
    -                                            old_timelines.extend(collected_files);
    -                                        }
    -                                    },
    -                                }
    -                            }
    -                        },
    -                        Err(e) => error!(
    -                            "Failed to collect tenant files from dir '{}' for entry {:?}, reason: {:#}",
    -                            tenants_dir.display(),
    -                            tenants_dir_entry,
    -                            e
    -                        ),
    -                    }
    +/// Execute Attach mgmt API command.
    +///
    +/// Downloading all the tenant data is performed in the background, this merely
    +/// spawns the background task and returns quickly.
    +pub async fn attach_tenant(
    +    conf: &'static PageServerConf,
    +    tenant_id: TenantId,
    +    remote_storage: &GenericRemoteStorage,
    +) -> anyhow::Result<()> {
    +    match tenants_state::write_tenants().entry(tenant_id) {
    +        hash_map::Entry::Occupied(e) => {
    +            // Cannot attach a tenant that already exists. The error message depends on
    +            // the state it's in.
    +            match e.get().current_state() {
    +                TenantState::Attaching => {
    +                    anyhow::bail!("tenant {tenant_id} attach is already in progress")
                     }
   ++<<<<<<< HEAD
    +                current_state => {
    +                    anyhow::bail!("tenant already exists, current state: {current_state:?}")
   ++=======
   +             }
   +             Err(e) => error!(
   +                 "Failed to list tenants dir entry {:?} in directory {}, reason: {:?}",
   +                 tenants_dir_entry,
   +                 tenants_dir.display(),
   +                 e
   +             ),
   +         }
   +     }
   +
   +     info!(
   +         "Collected files for {} tenants",
   +         local_tenant_timeline_files.len(),
   +     );
   +     Ok(local_tenant_timeline_files)
   + }
   +
   + fn remove_if_empty(tenant_dir_path: &Path) -> anyhow::Result<bool> {
   +     let directory_is_empty = tenant_dir_path
   +         .read_dir()
   +         .with_context(|| {
   +             format!(
   +                 "Failed to read directory '{}' contents",
   +                 tenant_dir_path.display()
   +             )
   +         })?
   +         .next()
   +         .is_none();
   +
   +     if directory_is_empty {
   +         fs::remove_dir_all(&tenant_dir_path).with_context(|| {
   +             format!(
   +                 "Failed to remove empty directory '{}'",
   +                 tenant_dir_path.display(),
   +             )
   +         })?;
   +
   +         Ok(true)
   +     } else {
   +         Ok(false)
   +     }
   + }
   +
   + fn collect_timelines_for_tenant(
   +     config: &'static PageServerConf,
   +     tenant_path: &Path,
   + ) -> anyhow::Result<(TenantId, TenantAttachData)> {
   +     let tenant_id = tenant_path
   +         .file_name()
   +         .and_then(OsStr::to_str)
   +         .unwrap_or_default()
   +         .parse::<TenantId>()
   +         .context("Could not parse tenant id out of the tenant dir name")?;
   +     let timelines_dir = config.timelines_path(&tenant_id);
   +
   +     if !timelines_dir.as_path().is_dir() {
   +         return Ok((
   +             tenant_id,
   +             TenantAttachData::Broken(anyhow::anyhow!(
   +                 "Tenant {} has no timelines directory at {}",
   +                 tenant_id,
   +                 timelines_dir.display()
   +             )),
   +         ));
   +     }
   +
   +     let mut tenant_timelines = HashMap::new();
   +     for timelines_dir_entry in fs::read_dir(&timelines_dir)
   +         .with_context(|| format!("Failed to list timelines dir entry for tenant {tenant_id}"))?
   +     {
   +         match timelines_dir_entry {
   +             Ok(timelines_dir_entry) => {
   +                 let timeline_dir = timelines_dir_entry.path();
   +                 if is_temporary(&timeline_dir) {
   +                     info!(
   +                         "Found temporary timeline directory, removing: {}",
   +                         timeline_dir.display()
   +                     );
   +                     if let Err(e) = fs::remove_dir_all(&timeline_dir) {
   +                         error!(
   +                             "Failed to remove temporary directory '{}': {:?}",
   +                             timeline_dir.display(),
   +                             e
   +                         );
   +                     }
   +                 } else if is_uninit_mark(&timeline_dir) {
   +                     let timeline_uninit_mark_file = &timeline_dir;
   +                     info!(
   +                         "Found an uninit mark file {}, removing the timeline and its uninit mark",
   +                         timeline_uninit_mark_file.display()
   +                     );
   +                     let timeline_id = timeline_uninit_mark_file
   +                         .file_stem()
   +                         .and_then(OsStr::to_str)
   +                         .unwrap_or_default()
   +                         .parse::<TimelineId>()
   +                         .with_context(|| {
   +                             format!(
   +                                 "Could not parse timeline id out of the timeline uninit mark name {}",
   +                                 timeline_uninit_mark_file.display()
   +                             )
   +                         })?;
   +                     let timeline_dir = config.timeline_path(&timeline_id, &tenant_id);
   +                     if let Err(e) =
   +                         remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file)
   +                     {
   +                         error!("Failed to clean up uninit marked timeline: {e:?}");
   +                     }
   +                 } else {
   +                     let timeline_id = timeline_dir
   +                         .file_name()
   +                         .and_then(OsStr::to_str)
   +                         .unwrap_or_default()
   +                         .parse::<TimelineId>()
   +                         .with_context(|| {
   +                             format!(
   +                                 "Could not parse timeline id out of the timeline dir name {}",
   +                                 timeline_dir.display()
   +                             )
   +                         })?;
   +                     let timeline_uninit_mark_file =
   +                         config.timeline_uninit_mark_file_path(tenant_id, timeline_id);
   +                     if timeline_uninit_mark_file.exists() {
   +                         info!("Found an uninit mark file for timeline {tenant_id}/{timeline_id}, removing the timeline and its uninit mark");
   +                         if let Err(e) = remove_timeline_and_uninit_mark(
   +                             &timeline_dir,
   +                             &timeline_uninit_mark_file,
   +                         ) {
   +                             error!("Failed to clean up uninit marked timeline: {e:?}");
   +                         }
   +                     } else {
   +                         match collect_timeline_files(&timeline_dir) {
   +                             Ok((metadata, timeline_files)) => {
   +                                 tenant_timelines.insert(
   +                                     timeline_id,
   +                                     TimelineLocalFiles::collected(metadata, timeline_files),
   +                                 );
   +                             }
   +                             Err(e) => {
   +                                 error!(
   +                                     "Failed to process timeline dir contents at '{}', reason: {:?}",
   +                                     timeline_dir.display(),
   +                                     e
   +                                 );
   +                                 match remove_if_empty(&timeline_dir) {
   +                                     Ok(true) => info!(
   +                                         "Removed empty timeline directory {}",
   +                                         timeline_dir.display()
   +                                     ),
   +                                     Ok(false) => (),
   +                                     Err(e) => {
   +                                         error!("Failed to remove empty timeline directory: {e:?}")
   +                                     }
   +                                 }
   +                             }
   +                         }
   +                     }
   ++>>>>>>> origin/main
                     }
                 }
    -            Err(e) => {
    -                error!("Failed to list timelines for entry tenant {tenant_id}, reason: {e:?}")
    -            }
    +        }
    +        hash_map::Entry::Vacant(v) => {
    +            let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage)?;
    +            v.insert(tenant);
    +            Ok(())
             }
         }
    -
    -    if tenant_timelines.is_empty() {
    -        // this is normal, we've removed all broken, empty and temporary timeline dirs
    -        // but should allow the tenant to stay functional and allow creating new timelines
    -        // on a restart, we require tenants to have the timelines dir, so leave it on disk
    -        debug!("Tenant {tenant_id} has no timelines loaded");
    -    }
    -
    -    Ok((tenant_id, TenantAttachData::Ready(tenant_timelines)))
     }

    -fn remove_timeline_and_uninit_mark(timeline_dir: &Path, uninit_mark: &Path) -> anyhow::Result<()> {
    -    fs::remove_dir_all(&timeline_dir)
    -        .or_else(|e| {
    -            if e.kind() == std::io::ErrorKind::NotFound {
    -                // we can leave the uninit mark without a timeline dir,
    -                // just remove the mark then
    -                Ok(())
    -            } else {
    -                Err(e)
    -            }
    -        })
    -        .with_context(|| {
    -            format!(
    -                "Failed to remove unit marked timeline directory {}",
    -                timeline_dir.display()
    -            )
    -        })?;
    -    fs::remove_file(&uninit_mark).with_context(|| {
    -        format!(
    -            "Failed to remove timeline uninit mark file {}",
    -            uninit_mark.display()
    -        )
    -    })?;
    +#[cfg(feature = "testing")]
    +use {
    +    crate::repository::GcResult, pageserver_api::models::TimelineGcRequest,
    +    utils::http::error::ApiError,
    +};

    -    Ok(())
    -}
    +#[cfg(feature = "testing")]
    +pub fn immediate_gc(
    +    tenant_id: TenantId,
    +    timeline_id: TimelineId,
    +    gc_req: TimelineGcRequest,
    +) -> Result<tokio::sync::oneshot::Receiver<Result<GcResult, anyhow::Error>>, ApiError> {
    +    let guard = tenants_state::read_tenants();

    -// discover timeline files and extract timeline metadata
    -//  NOTE: ephemeral files are excluded from the list
    -fn collect_timeline_files(
    -    timeline_dir: &Path,
    -) -> anyhow::Result<(TimelineMetadata, HashMap<PathBuf, LayerFileMetadata>)> {
    -    let mut timeline_files = HashMap::new();
    -    let mut timeline_metadata_path = None;
    -
    -    let timeline_dir_entries =
    -        fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?;
    -    for entry in timeline_dir_entries {
    -        let entry_path = entry.context("Failed to list timeline dir entry")?.path();
    -        let metadata = entry_path.metadata()?;
    -
    -        if metadata.is_file() {
    -            if entry_path.file_name().and_then(OsStr::to_str) == Some(METADATA_FILE_NAME) {
    -                timeline_metadata_path = Some(entry_path);
    -            } else if is_ephemeral_file(&entry_path.file_name().unwrap().to_string_lossy()) {
    -                debug!("skipping ephemeral file {}", entry_path.display());
    -                continue;
    -            } else if is_temporary(&entry_path) {
    -                info!("removing temp timeline file at {}", entry_path.display());
    -                fs::remove_file(&entry_path).with_context(|| {
    -                    format!(
    -                        "failed to remove temp download file at {}",
    -                        entry_path.display()
    -                    )
    -                })?;
    -            } else {
    -                let layer_metadata = LayerFileMetadata::new(metadata.len());
    -                timeline_files.insert(entry_path, layer_metadata);
    +    let tenant = guard
    +        .get(&tenant_id)
    +        .map(Arc::clone)
    +        .with_context(|| format!("Tenant {tenant_id} not found"))
    +        .map_err(ApiError::NotFound)?;
    +
    +    let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
    +    // Use tenant's pitr setting
    +    let pitr = tenant.get_pitr_interval();
    +
    +    // Run in task_mgr to avoid race with detach operation
    +    let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
    +    task_mgr::spawn(
    +        &tokio::runtime::Handle::current(),
    +        TaskKind::GarbageCollector,
    +        Some(tenant_id),
    +        Some(timeline_id),
    +        &format!("timeline_gc_handler garbage collection run for tenant {tenant_id} timeline {timeline_id}"),
    +        false,
    +        async move {
    +            fail::fail_point!("immediate_gc_task_pre");
    +            let result = tenant
    +                .gc_iteration(Some(timeline_id), gc_horizon, pitr, true)
    +                .instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))
    +                .await;
    +                // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
    +                // better once the types support it.
    +            match task_done.send(result) {
    +                Ok(_) => (),
    +                Err(result) => error!("failed to send gc result: {result:?}"),
                 }
    +            Ok(())
             }
    -    }
    -
    -    // FIXME (rodionov) if attach call succeeded, and then pageserver is restarted before download is completed
    -    //   then attach is lost. There would be no retries for that,
    -    //   initial collect will fail because there is no metadata.
    -    //   We either need to start download if we see empty dir after restart or attach caller should
    -    //   be aware of that and retry attach if awaits_download for timeline switched from true to false
    -    //   but timelinne didn't appear locally.
    -    //   Check what happens with remote index in that case.
    -    let timeline_metadata_path = match timeline_metadata_path {
    -        Some(path) => path,
    -        None => anyhow::bail!("No metadata file found in the timeline directory"),
    -    };
    -    let metadata = TimelineMetadata::from_bytes(
    -        &fs::read(&timeline_metadata_path).context("Failed to read timeline metadata file")?,
    -    )
    -    .context("Failed to parse timeline metadata file bytes")?;
    -
    -    anyhow::ensure!(
    -        metadata.ancestor_timeline().is_some() || !timeline_files.is_empty(),
    -        "Timeline has no ancestor and no layer files"
         );

    -    Ok((metadata, timeline_files))
    +    // drop the guard until after we've spawned the task so that timeline shutdown will wait for the task
    +    drop(guard);
    +
    +    Ok(wait_task_done)
     }
   diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
   index da50d99db..360ff1c63 160000
   --- a/vendor/postgres-v14
   +++ b/vendor/postgres-v14
   @@ -1 +1 @@
   -Subproject commit da50d99db54848f7a3e910f920aaad7dc6915d36
   +Subproject commit 360ff1c637a57d351a7a5a391d8e8afd8fde8c3a
   diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
   index 780c3f8e3..d31b3f7c6 160000
   --- a/vendor/postgres-v15
   +++ b/vendor/postgres-v15
   @@ -1 +1 @@
   -Subproject commit 780c3f8e3524c2e32a2e28884c7b647fcebf71d7
   +Subproject commit d31b3f7c6d108e52c8bb11e812ce4e266501ea3d
2022-11-25 12:19:53 -05:00
Heikki Linnakangas
e51f2be3d0 Fix physical size tracking when files are downloaded.
Includes a test.
2022-11-25 19:07:53 +02:00
Christian Schwarz
4e4bc8fbde Tenant::load remove obsolete FIXME, add .with_context(): see https://github.com/neondatabase/neon/pull/2785#discussion_r1030581703 2022-11-25 11:43:50 -05:00
Christian Schwarz
dd2a77c2ef Abort uploads if the tenant/timeline is requested to shut down
- Introduce another UploadQueue::Stopped enum variant to indicate the state
  where the UploadQueue is shut down.
- In perform_upload_task, wait concurrently for tenant/timeline
  shutdown. If we are requested to shut down, the first in-progress tasks
  that notices the shutdown request transitions the queue from
  UploadQueue::Initialized to UploadQueue::Stopped state.
  This involves dropping all the queued ops that are not yet
  in progress, which conveniently unblocks wait_completion() calls
  that are waiting for their barrier to be executed.
  They will receive an Err(), and do something sensible.

Right now, wait_completion() is only used by tests, but I
suspect that we should be using it in wherever we
delete layer files, e.g., GC and compaction, as explained
in the storage_sync.rs block comment section "Consistency".

This change also fixes
test_timeline_deletion_with_files_stuck_in_upload_queue
which I added in the previous commit.
Before, timeline delete would wait until all in-progress
tasks and queued tasks were done.
If, like in the test, a task was stuck due to upload error,
timeline deletion would wait forever. Now it gets an error.

Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2022-11-25 16:32:31 +02:00
Christian Schwarz
ef95637c65 add test case to cover race between timeline delete and stuck upload 2022-11-25 16:32:31 +02:00
Christian Schwarz
206b5d2ada remove obsolete TODO (see github PR discussion https://github.com/neondatabase/neon/pull/2785#discussion_r1030547179 ) 2022-11-24 13:20:38 -05:00
Christian Schwarz
77fea61fcc address FIXME in tenant_attach regarding tenant config 2022-11-24 13:20:38 -05:00
Christian Schwarz
e3f4c0e4ac remove FIXME addressed by 'On tenant load, start WAL receivers only after all timelines have been loaded.' 2022-11-24 13:20:38 -05:00
Christian Schwarz
2302ecda04 don't run background loops in unit tests 2022-11-24 13:20:38 -05:00
Heikki Linnakangas
5257bbe2b9 Remove information-free ".context" messages
We capture stack traces of all errors, so these don't really add any
value. As a thought experiment, if we had to add a line like this,
with the function name in it, every time we use the ?-operator, we're
doing something wrong.

test_tenants.py::test_tenant_creation_fails creates a failpoint and
checks that the error returned by the pageserver contains the
failpoint name, and that was failing because it wasn't on the first
line of the error. We should probably improve our error-scraping logic
in the tests to not rely so heavily on string matching, but that's a
different topic.

FWIW, these are also pretty unlikely to fail in practice.
2022-11-24 19:20:17 +01:00
Heikki Linnakangas
6b61ed5fab Fix clippy warning and some typos 2022-11-24 19:20:17 +01:00
Christian Schwarz
f28bf70596 tenant creation: re-use load_local_tenant() 2022-11-24 19:20:17 +01:00
Heikki Linnakangas
dc9c33139b On tenant load, start WAL receivers only after all timelines have been loaded.
And similarly on attach. This way, if the tenant load/attach fails
halfway through, we don't have any leftover WAL receivers still
running on the broken tenant.
2022-11-24 19:20:17 +01:00
Heikki Linnakangas
0260ee23b9 Start background loops on create_tenant correctly.
This was caught by the test_gc_cutoff test.
2022-11-24 19:20:17 +01:00
Christian Schwarz
c2f5d011c7 fix python linter complaints 2022-11-24 19:20:17 +01:00
Christian Schwarz
1022b4b98f use utils::failpoint_sleep_millis_async!("attach-before-activate")
The detach_while_attaching test still passes, presumably because
the two request execute on different OS threads.
2022-11-24 19:20:17 +01:00
Heikki Linnakangas
791eebefe2 Silence clippy warning 2022-11-24 19:20:17 +01:00
Heikki Linnakangas
50b686c3e4 rustfmt 2022-11-24 19:20:17 +01:00
Heikki Linnakangas
39b10696e9 Fix unit tests.
`activate` is now more strict and errors out if the tenant is already
Active.
2022-11-24 19:20:17 +01:00
Heikki Linnakangas
264b0ada9f Handle concurrent detach and attach more gracefully.
If tenant detach is requested while the tenant is still in Attaching
state, we set the state to Paused, but when the attach completed, it
changed it to Active again, and worse, it started the background jobs.
To fix, rewrite the set_state() function so that when you activate a
tenant that is already in Paused state, it stays in Paused state and
we don't start the background loops.
2022-11-24 19:20:17 +01:00
Heikki Linnakangas
78338f7b94 Remove background_jobs_enabled, move code from tenant_mgr.rs to tenant.rs 2022-11-24 19:20:17 +01:00
Heikki Linnakangas
0d533ce840 Test detach while attach is still in progress 2022-11-24 19:20:17 +01:00
Christian Schwarz
978f1879b9 fix typo in storage_sync.rs module comment
Co-authored-by: Joonas Koivunen <joonas@neon.tech>
2022-11-24 19:17:41 +01:00
Christian Schwarz
c863d679f8 storage_sync: update module doc comment to reflect our changes 2022-11-23 11:46:03 -05:00
Heikki Linnakangas
28e9eb6539 Add one more compaction to test, to ensure GC finds old layers to remove 2022-11-22 20:00:41 +02:00
Christian Schwarz
d07a4d02bb test_remote_storage_upload_queue_retries: actually generate Delete ops 2022-11-22 12:13:38 -05:00
Christian Schwarz
c61731a31f metric pageserver_remote_upload_queue_unfinished_tasks: add labels for file & op kind, and check for them in the test
Fails right now because turns out we don't actually generate layer
removal tasks with the current test code. That will be the next commit.
2022-11-22 12:13:38 -05:00
Christian Schwarz
5a55dce282 ensure layers_removed > 0 2022-11-22 12:13:36 -05:00
Heikki Linnakangas
e6de1b0e8c Increase the timeout in test.
Downloading all files can take more than 5 seconds, if there are even
small network glitches or similar.
2022-11-22 17:22:33 +02:00
Heikki Linnakangas
58be279be1 Fix python flake8 errors 2022-11-22 16:40:51 +02:00
Heikki Linnakangas
d1b92e976a When a new layer file is created in compaction, also upload it.
I can't believe this was missing..
2022-11-22 16:35:31 +02:00
Heikki Linnakangas
7d46c7c118 Fix python formatting 2022-11-22 15:16:31 +02:00
Heikki Linnakangas
d6c1a9aa18 Fix formatting 2022-11-22 15:02:09 +02:00
Heikki Linnakangas
49f2eac934 Add missing import
Was causing the test to fail. It's annoying that you don't get an error
from that earlier..
2022-11-22 15:01:03 +02:00
Heikki Linnakangas
5c8387aff1 Fix setting failpoints in test_remote_storage_backup_and_restore
The checkpoints in the test were numbered 1 and 2, but the code only
tried to set the failpoints for checkpoint 0. So they were never set.
2022-11-22 14:52:11 +02:00
Heikki Linnakangas
3c4680b718 Turn upload_queue_items_metric into a regular function 2022-11-22 14:45:22 +02:00
Christian Schwarz
6b7cbec9b3 WIP: add test for storage_sync upload retries 2022-11-22 05:49:46 -05:00
Christian Schwarz
6de9a33c05 metric REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS 2022-11-22 05:49:46 -05:00
Christian Schwarz
77f883c95c prometheus metrics for storage_sync2
Extracted from https://github.com/neondatabase/neon/pull/2595

Use pub(crate) to highlight unused metrics.
2022-11-22 05:49:25 -05:00
Heikki Linnakangas
e6557f4f91 Silence test failures, where we operate on a tenant before it's loaded
Saw a failure like this, from 'test_tenants_attached_after_download' and
'test_tenant_redownloads_truncated_file_on_startup':

> test_runner/fixtures/neon_fixtures.py:1064: in verbose_error
>     res.raise_for_status()
> /github/home/.cache/pypoetry/virtualenvs/neon-_pxWMzVK-py3.9/lib/python3.9/site-packages/requests/models.py:1021: in raise_for_status
>     raise HTTPError(http_error_msg, response=self)
> E   requests.exceptions.HTTPError: 404 Client Error: Not Found for url: http://localhost:18150/v1/tenant/2334c9c113a82b5dd1651a0a23c53448/timeline
>
> The above exception was the direct cause of the following exception:
> test_runner/regress/test_tenants_with_remote_storage.py:185: in test_tenants_attached_after_download
>     restored_timelines = client.timeline_list(tenant_id)
> test_runner/fixtures/neon_fixtures.py:1148: in timeline_list
>     self.verbose_error(res)
> test_runner/fixtures/neon_fixtures.py:1070: in verbose_error
>     raise PageserverApiException(msg) from e
> E   fixtures.neon_fixtures.PageserverApiException: NotFound: Tenant 2334c9c113a82b5dd1651a0a23c53448 is not active. Current state: Loading

These tests starts the pageserver, wait until
assert_no_in_progress_downloads_for_tenant says that
has_downloads_in_progress is false, and then call timeline_list on the
tenant. But has_downloads_in_progress was only returned as true when
the tenant was being attached, not when it was being loaded at
pageserver startup. Change tenant_status API endpoint
(/v1/tenant/:tenant_id) so that it returns
has_downloads_in_progress=true also for tenants that are still in
Loading state.
2022-11-21 20:50:42 +01:00
Heikki Linnakangas
e8db20eb26 On connection from compute, wait for tenant to become Active.
If a connection from compute arrives while a tenant is still in
Loading state, wait for it to become Active instead of throwing an
error to the client. This should fix the errors from test_gc_cutoff
test that repeatedly restarts the pageserver and immediately tries to
connect to it.
2022-11-21 20:50:42 +01:00
Heikki Linnakangas
7552e2d25f Enable passing FAILPOINTS at startup.
- Pass through FAILPOINTS environment variable to the pageserver in
  "neon_local pageserver start" command

- On startup, list any failpoints that were set with FAILPOINTS to the log

- Add optional "extra_env_vars" argument to the NeonPageserver.start()
  function in the python fixture, so that you can pass FAILPOINTS

None of the tests use this functionality yet; that comes in a separate
commit.

closes https://github.com/neondatabase/neon/pull/2865
2022-11-21 20:50:42 +01:00
Heikki Linnakangas
dfb4160403 Don't remove timelines directory while pageserver is running.
The test removes all the timelines from local disk, to test how
pageserver startup works when it's missing. But if the pageserver
hasn't finished loading the timeline yet, you can get an error:

> 2022-11-20T01:30:41.053207Z  INFO load{tenant_id=0f6ba053925a997b99b5eb45f9c548ac}:load_local_timeline{timeline_id=308ada17f4c3d790b631805d2dd51807}: no index file was found on the remote
> 2022-11-20T01:30:41.054045Z ERROR load{tenant_id=0f6ba053925a997b99b5eb45f9c548ac}:load_local_timeline{timeline_id=308ada17f4c3d790b631805d2dd51807}: Failed to initialize timeline 0f6ba053925a997b99b5eb45f9c548ac/308ada17f4c3d790b631805d2dd51807: Failed to load layermap for timeline 0f6ba053925a997b99b5eb45f9c548ac/308ada17f4c3d790b631805d2dd51807
>
> Caused by:
>     No such file or directory (os error 2)

I saw this in CI, here:
  https://neon-github-public-dev.s3.amazonaws.com/reports/pr-2785/debug/3505805425/index.html#suites/ec4311502db344eee91f1354e9dc839b/725c7d0ecec1ec4d/

And was able to reproduce it with this:

> --- a/pageserver/src/tenant.rs
> +++ b/pageserver/src/tenant.rs
> @@ -946,6 +946,8 @@ impl Tenant {
>              None => None,
>          };
>
> +        tokio::time::sleep(std::time::Duration::from_secs(2)).await;
> +
>          self.setup_timeline(
>              timeline_id,
>              remote_client,

Even on 'main', it's pretty sketchy to remote the directory while the
pageserver is still running, but it didn't lead to an error because
the pagesever finished loading the local layer maps before starting
up. Now that that's spawned into background, the directory might get
removed before the loading finishes.
2022-11-20 21:52:22 +02:00
Heikki Linnakangas
f9bdc030e8 Fix python formatting failure 2022-11-20 02:37:15 +02:00
Heikki Linnakangas
e4c9b83a39 Merge remote-tracking branch 'origin/main' into HEAD 2022-11-20 02:31:21 +02:00
Heikki Linnakangas
a78c16328e Silence test_remote_storage failure, caused by error message change 2022-11-20 00:13:43 +02:00
Heikki Linnakangas
eed99b7251 Silence clippy warning 2022-11-19 18:26:18 +02:00
Heikki Linnakangas
a2fbd93e91 Fix python isort codestyle check 2022-11-18 18:29:27 +02:00
Christian Schwarz
66f8f686a0 run manual gc in a task_mgr task to prevent race with detach
This fixes flaky test_tenant_detach_smoke.
2022-11-18 12:15:14 +02:00
Christian Schwarz
919f2b261a make test_tenant_detach_smoke fail reproducibly
Add failpoint that triggers the race condition.
2022-11-18 12:15:14 +02:00
Christian Schwarz
9d273c840a timeline: explicit tracking of flush loop state: NotStarted, Running, Exited
This allows us to error out in the case where we request flush but the
flush loop is not running.
Before, we would only track whether it was started, but not when it
exited.
Better to use an enum with 3 states than a 2-state bool because then
the error message can answer the question whether we ever started
the flush loop or not.
2022-11-18 12:15:14 +02:00
Dmitry Rodionov
6600e1f896 initialize upload queue before starting download operations 2022-11-17 23:53:31 +02:00
Dmitry Rodionov
348369414b fix incorrect metadata update
Previously in some cases local metadata was confused with remote one and
there was a check, that we write locally only if remote metadata has
greater disk_consistent_lsn. So because they were equal we didnt write
anything. For attach scenario this ended up in not writing metadata at
all.

Rearrange code so we decide on proper metadata value earlier on and
initialize timeline with correct one without need to update it late in
the initialization process in .reconsile_with_remote
2022-11-17 23:53:31 +02:00
Christian Schwarz
3890acaf7f stop using Option for UploadQueueInitialized::{latest_metadata,last_uploaded_consistent_lsn} 2022-11-17 21:34:16 +02:00
Christian Schwarz
f537a7a873 add explainer comments regarding UploadQueueInitialized::{latest_files,latest_metadata,last_uploaded_consistent_lsn} 2022-11-17 21:34:16 +02:00
Christian Schwarz
71bc45a21b storage_sync: track upload queue initialization state using enum & fix last_uploaded_consistent_lsn initialization for empty remote storage
As pointed out in b8488e70a9 (r1024319620)
the following is wrong for the case where the remote storage is empty:

    metadata = whatever the local-ONLY metadata is
    ...
    upload_queue.latest_metadata = Some(metadata.clone());
    upload_queue.last_uploaded_consistent_lsn = Some(metadata.disk_consistent_lsn());

The reason why it's wrong is that we return last_uploaded_consistent_lsn
to safekeepers. So, we'd be returning an Lsn that is not yet uploaded to
S3.
2022-11-17 21:34:16 +02:00
Christian Schwarz
decef74503 don't start background jobs if tenant has not timelines
Before this change, test_pageserver_with_empty_tenants was failing at:

  assert loaded_tenant["state"] == {
        "Active": {"background_jobs_running": False}
    }, "Tenant {tenant_with_empty_timelines_dir} with empty timelines dir should be active and ready for timeline creation"

because background_jobs_running was True instead of False.

Personally I think we should simply always start the background loops
and not bother, but let's punt this until after we've merged this PR.
2022-11-17 11:22:02 +02:00
Christian Schwarz
8aed805933 refactor: remove misleading log messages and redundant asserts from test_pageserver_with_empty_tenants
At least since commit 'Return broken tenants due to non existing timelines dir (#2552) (#2575)'
some of these messages are wrong.
2022-11-17 11:21:49 +02:00
Dmitry Rodionov
b8488e70a9 run clippy/fmt 2022-11-16 17:25:04 +02:00
Christian Schwarz
16fdd104ac bring back HTTP API has_in_progress_downloads and awaits_download field, derived from TenantState 2022-11-16 14:57:26 +02:00
Christian Schwarz
bb6dbd2f43 crash-safe and resumable tenant attach
This change introduces a marker file

  $repo/tenants/$tenant_id/attaching

that is present while a tenant is in Attaching state.

When pageserver restarts, we use it to resume the tenant attach operation.
Before this change, a crash during tenant attach would result in one of
the following:
1. crash upon restart due to missing metadata file (IIRC)
2. "successful" loading of the tenant with a subset of timelines
2022-11-16 14:57:26 +02:00
Dmitry Rodionov
c4c4558736 adjust allowed errors for test_broken_timeline 2022-11-16 14:57:26 +02:00
Heikki Linnakangas
bfdc09cf4a Improve comments and checks in test_broken_timeline.py 2022-11-16 14:57:21 +02:00
Dmitry Rodionov
1839ce0545 properly merge remote metadata with local one 2022-11-16 14:42:15 +02:00
Dmitry Rodionov
8e04f0455e add a bunch of .context calls 2022-11-16 14:42:15 +02:00
Dmitry Rodionov
6839773538 handle temporary files during layer map loading 2022-11-16 14:42:15 +02:00
Dmitry Rodionov
2a96c4cfcd start walreceiver after reconcile_with_remote 2022-11-16 14:42:15 +02:00
Christian Schwarz
027cf22663 fix layer download during reconcile_with_remote
reconcile_with_remote, in this PR, is supposed to download all the layer
files synchronously.
I don't know why, but, download_missing was

1. not doing the download at all for DeltaLayer
2. not using the right RelativePath for image layer

This patch fixes both.
2022-11-16 14:42:15 +02:00
Christian Schwarz
f4daa877b5 load_remote_timeline already has an #[instrument] 2022-11-16 14:42:15 +02:00
Christian Schwarz
ed28ced3bc schedule_index_upload: remove unused Option() around metadata param 2022-11-16 14:42:15 +02:00
Christian Schwarz
d7c120574b dedup download_missing() calls 2022-11-16 14:42:15 +02:00
Dmitry Rodionov
c9188ffa67 fix wrong path handling in reconcile_with_remote, refine spans 2022-11-16 14:42:15 +02:00
Dmitry Rodionov
c631fa1f50 fix test_gc_cutoff test
Also improve it so it fails earlier if something is not working
because otherwise it was failing because of the timeout.
And if timeout was big enough test can even pass
2022-11-16 14:42:15 +02:00
Dmitry Rodionov
795c3ca131 Port per-tenant upload queue and startup changes from #2595
This is a part of https://github.com/neondatabase/neon/pull/2595.
It takes out switch to per tenant upload queue and changes to pageserver
startup sequence because these two are highly interleaved with each
other. I'm still not happy with the size of the diff, but splitting
it even more will probably consume even more time. Ideally we should
do it, but this patch isis already a step forward and should be
easier to get this patch in yet still quite difficult.
Mainly because of the size and fixes for existing concerns which
will extend the diff even further

Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2022-11-16 14:42:15 +02:00
370 changed files with 21536 additions and 64256 deletions

View File

@@ -4,7 +4,7 @@
hakari-package = "workspace_hack"
# Format for `workspace-hack = ...` lines in other Cargo.tomls. Requires cargo-hakari 0.9.8 or above.
dep-format-version = "3"
dep-format-version = "2"
# Setting workspace.resolver = "2" in the root Cargo.toml is HIGHLY recommended.
# Hakari works much better with the new feature resolver.

View File

@@ -15,7 +15,6 @@
!proxy/
!safekeeper/
!storage_broker/
!trace/
!vendor/postgres-v14/
!vendor/postgres-v15/
!workspace_hack/

View File

@@ -14,7 +14,7 @@
- [ ] Check [#dev-production-stream](https://neondb.slack.com/archives/C03F5SM1N02) Slack channel
- [ ] Check [stuck projects page](https://console.neon.tech/admin/projects?sort=last_active&order=desc&stuck=true)
- [ ] Check [recent operation failures](https://console.neon.tech/admin/operations?action=create_timeline%2Cstart_compute%2Cstop_compute%2Csuspend_compute%2Capply_config%2Cdelete_timeline%2Cdelete_tenant%2Ccreate_branch%2Ccheck_availability&sort=updated_at&order=desc&had_retries=some)
- [ ] Check [cloud SLO dashboard](https://neonprod.grafana.net/d/_oWcBMJ7k/cloud-slos?orgId=1)
- [ ] Check [compute startup metrics dashboard](https://neonprod.grafana.net/d/5OkYJEmVz/compute-startup-time)
- [ ] Check [cloud SLO dashboard](https://observer.zenith.tech/d/_oWcBMJ7k/cloud-slos?orgId=1)
- [ ] Check [compute startup metrics dashboard](https://observer.zenith.tech/d/5OkYJEmVz/compute-startup-time)
<!-- List everything that should be done **after** release, any admin UI configuration / Grafana dashboard / alert changes / setting changes / etc -->

View File

@@ -32,8 +32,8 @@ runs:
exit 2
fi
- name: Calculate variables
id: calculate-vars
- name: Calculate key
id: calculate-key
shell: bash -euxo pipefail {0}
run: |
# TODO: for manually triggered workflows (via workflow_dispatch) we need to have a separate key
@@ -41,22 +41,14 @@ runs:
pr_number=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
if [ "${pr_number}" != "null" ]; then
key=pr-${pr_number}
elif [ "${GITHUB_REF_NAME}" = "main" ]; then
elif [ "${GITHUB_REF}" = "refs/heads/main" ]; then
# Shortcut for a special branch
key=main
elif [ "${GITHUB_REF_NAME}" = "release" ]; then
# Shortcut for a special branch
key=release
else
key=branch-$(printf "${GITHUB_REF_NAME}" | tr -c "[:alnum:]._-" "-")
key=branch-$(echo ${GITHUB_REF#refs/heads/} | tr -c "[:alnum:]._-" "-")
fi
echo "KEY=${key}" >> $GITHUB_OUTPUT
# Sanitize test selection to remove `/` and any other special characters
# Use printf instead of echo to avoid having `\n` at the end of the string
test_selection=$(printf "${{ inputs.test_selection }}" | tr -c "[:alnum:]._-" "-" )
echo "TEST_SELECTION=${test_selection}" >> $GITHUB_OUTPUT
- uses: actions/setup-java@v3
if: ${{ inputs.action == 'generate' }}
with:
@@ -82,11 +74,10 @@ runs:
- name: Upload Allure results
if: ${{ inputs.action == 'store' }}
env:
REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
REPORT_PREFIX: reports/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }}
RAW_PREFIX: reports-raw/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }}
TEST_OUTPUT: /tmp/test_output
BUCKET: neon-github-public-dev
TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
shell: bash -euxo pipefail {0}
run: |
# Add metadata
@@ -107,7 +98,7 @@ runs:
BUILD_TYPE=${{ inputs.build_type }}
EOF
ARCHIVE="${GITHUB_RUN_ID}-${TEST_SELECTION}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst"
ARCHIVE="${GITHUB_RUN_ID}-${{ inputs.test_selection }}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst"
ZSTD_NBTHREADS=0
tar -C ${TEST_OUTPUT}/allure/results -cf ${ARCHIVE} --zstd .
@@ -118,9 +109,8 @@ runs:
if: ${{ inputs.action == 'generate' }}
shell: bash -euxo pipefail {0}
env:
LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt
LOCK_FILE: reports/${{ steps.calculate-key.outputs.KEY }}/lock.txt
BUCKET: neon-github-public-dev
TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
run: |
LOCK_TIMEOUT=300 # seconds
@@ -133,12 +123,12 @@ runs:
fi
sleep 1
done
echo "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${TEST_SELECTION}" > lock.txt
echo "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${{ inputs.test_selection }}" > lock.txt
aws s3 mv --only-show-errors lock.txt "s3://${BUCKET}/${LOCK_FILE}"
# A double-check that exactly WE have acquired the lock
aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt
if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${TEST_SELECTION}" ]; then
if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${{ inputs.test_selection }}" ]; then
break
fi
done
@@ -147,8 +137,8 @@ runs:
if: ${{ inputs.action == 'generate' }}
id: generate-report
env:
REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
REPORT_PREFIX: reports/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }}
RAW_PREFIX: reports-raw/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }}
TEST_OUTPUT: /tmp/test_output
BUCKET: neon-github-public-dev
shell: bash -euxo pipefail {0}
@@ -202,13 +192,12 @@ runs:
if: ${{ inputs.action == 'generate' && always() }}
shell: bash -euxo pipefail {0}
env:
LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt
LOCK_FILE: reports/${{ steps.calculate-key.outputs.KEY }}/lock.txt
BUCKET: neon-github-public-dev
TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
run: |
aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt || exit 0
if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${TEST_SELECTION}" ]; then
if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${{ inputs.test_selection }}" ]; then
aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
fi

View File

@@ -1,138 +0,0 @@
name: 'Create Branch'
description: 'Create Branch using API'
inputs:
api_key:
desctiption: 'Neon API key'
required: true
project_id:
desctiption: 'ID of the Project to create Branch in'
required: true
api_host:
desctiption: 'Neon API host'
default: console.stage.neon.tech
outputs:
dsn:
description: 'Created Branch DSN (for main database)'
value: ${{ steps.change-password.outputs.dsn }}
branch_id:
description: 'Created Branch ID'
value: ${{ steps.create-branch.outputs.branch_id }}
runs:
using: "composite"
steps:
- name: Create New Branch
id: create-branch
shell: bash -euxo pipefail {0}
run: |
for i in $(seq 1 10); do
branch=$(curl \
"https://${API_HOST}/api/v2/projects/${PROJECT_ID}/branches" \
--header "Accept: application/json" \
--header "Content-Type: application/json" \
--header "Authorization: Bearer ${API_KEY}" \
--data "{
\"branch\": {
\"name\": \"Created by actions/neon-branch-create; GITHUB_RUN_ID=${GITHUB_RUN_ID} at $(date +%s)\"
},
\"endpoints\": [
{
\"type\": \"read_write\"
}
]
}")
if [ -z "${branch}" ]; then
sleep 1
continue
fi
branch_id=$(echo $branch | jq --raw-output '.branch.id')
if [ "${branch_id}" == "null" ]; then
sleep 1
continue
fi
break
done
if [ -z "${branch_id}" ] || [ "${branch_id}" == "null" ]; then
echo 2>&1 "Failed to create branch after 10 attempts, the latest response was: ${branch}"
exit 1
fi
branch_id=$(echo $branch | jq --raw-output '.branch.id')
echo "branch_id=${branch_id}" >> $GITHUB_OUTPUT
host=$(echo $branch | jq --raw-output '.endpoints[0].host')
echo "host=${host}" >> $GITHUB_OUTPUT
env:
API_HOST: ${{ inputs.api_host }}
API_KEY: ${{ inputs.api_key }}
PROJECT_ID: ${{ inputs.project_id }}
- name: Get Role name
id: role-name
shell: bash -euxo pipefail {0}
run: |
roles=$(curl \
"https://${API_HOST}/api/v2/projects/${PROJECT_ID}/branches/${BRANCH_ID}/roles" \
--fail \
--header "Accept: application/json" \
--header "Content-Type: application/json" \
--header "Authorization: Bearer ${API_KEY}"
)
role_name=$(echo $roles | jq --raw-output '.roles[] | select(.protected == false) | .name')
echo "role_name=${role_name}" >> $GITHUB_OUTPUT
env:
API_HOST: ${{ inputs.api_host }}
API_KEY: ${{ inputs.api_key }}
PROJECT_ID: ${{ inputs.project_id }}
BRANCH_ID: ${{ steps.create-branch.outputs.branch_id }}
- name: Change Password
id: change-password
# A shell without `set -x` to not to expose password/dsn in logs
shell: bash -euo pipefail {0}
run: |
for i in $(seq 1 10); do
reset_password=$(curl \
"https://${API_HOST}/api/v2/projects/${PROJECT_ID}/branches/${BRANCH_ID}/roles/${ROLE_NAME}/reset_password" \
--request POST \
--header "Accept: application/json" \
--header "Content-Type: application/json" \
--header "Authorization: Bearer ${API_KEY}"
)
if [ -z "${reset_password}" ]; then
sleep 1
continue
fi
password=$(echo $reset_password | jq --raw-output '.role.password')
if [ "${password}" == "null" ]; then
sleep 1
continue
fi
echo "::add-mask::${password}"
break
done
if [ -z "${password}" ] || [ "${password}" == "null" ]; then
echo 2>&1 "Failed to reset password after 10 attempts, the latest response was: ${reset_password}"
exit 1
fi
dsn="postgres://${ROLE_NAME}:${password}@${HOST}/neondb"
echo "::add-mask::${dsn}"
echo "dsn=${dsn}" >> $GITHUB_OUTPUT
env:
API_HOST: ${{ inputs.api_host }}
API_KEY: ${{ inputs.api_key }}
PROJECT_ID: ${{ inputs.project_id }}
BRANCH_ID: ${{ steps.create-branch.outputs.branch_id }}
ROLE_NAME: ${{ steps.role-name.outputs.role_name }}
HOST: ${{ steps.create-branch.outputs.host }}

View File

@@ -1,58 +0,0 @@
name: 'Delete Branch'
description: 'Delete Branch using API'
inputs:
api_key:
desctiption: 'Neon API key'
required: true
project_id:
desctiption: 'ID of the Project which should be deleted'
required: true
branch_id:
desctiption: 'ID of the branch to delete'
required: true
api_host:
desctiption: 'Neon API host'
default: console.stage.neon.tech
runs:
using: "composite"
steps:
- name: Delete Branch
# Do not try to delete a branch if .github/actions/neon-project-create
# or .github/actions/neon-branch-create failed before
if: ${{ inputs.project_id != '' && inputs.branch_id != '' }}
shell: bash -euxo pipefail {0}
run: |
for i in $(seq 1 10); do
deleted_branch=$(curl \
"https://${API_HOST}/api/v2/projects/${PROJECT_ID}/branches/${BRANCH_ID}" \
--request DELETE \
--header "Accept: application/json" \
--header "Content-Type: application/json" \
--header "Authorization: Bearer ${API_KEY}"
)
if [ -z "${deleted_branch}" ]; then
sleep 1
continue
fi
branch_id=$(echo $deleted_branch | jq --raw-output '.branch.id')
if [ "${branch_id}" == "null" ]; then
sleep 1
continue
fi
break
done
if [ -z "${branch_id}" ] || [ "${branch_id}" == "null" ]; then
echo 2>&1 "Failed to delete branch after 10 attempts, the latest response was: ${deleted_branch}"
exit 1
fi
env:
API_HOST: ${{ inputs.api_host }}
API_KEY: ${{ inputs.api_key }}
PROJECT_ID: ${{ inputs.project_id }}
BRANCH_ID: ${{ inputs.branch_id }}

View File

@@ -5,16 +5,12 @@ inputs:
api_key:
desctiption: 'Neon API key'
required: true
environment:
desctiption: 'dev (aka captest) or stage'
required: true
region_id:
desctiption: 'Region ID, if not set the project will be created in the default region'
default: aws-us-east-2
postgres_version:
desctiption: 'Postgres version; default is 15'
default: 15
api_host:
desctiption: 'Neon API host'
default: console.stage.neon.tech
required: false
outputs:
dsn:
description: 'Created Project DSN (for main database)'
@@ -26,13 +22,38 @@ outputs:
runs:
using: "composite"
steps:
- name: Parse Input
id: parse-input
shell: bash -euxo pipefail {0}
run: |
case "${ENVIRONMENT}" in
dev)
API_HOST=console.dev.neon.tech
REGION_ID=${REGION_ID:-eu-west-1}
;;
staging)
API_HOST=console.stage.neon.tech
REGION_ID=${REGION_ID:-us-east-1}
;;
*)
echo 2>&1 "Unknown environment=${ENVIRONMENT}. Allowed 'dev' or 'staging' only"
exit 1
;;
esac
echo "api_host=${API_HOST}" >> $GITHUB_OUTPUT
echo "region_id=${REGION_ID}" >> $GITHUB_OUTPUT
env:
ENVIRONMENT: ${{ inputs.environment }}
REGION_ID: ${{ inputs.region_id }}
- name: Create Neon Project
id: create-neon-project
# A shell without `set -x` to not to expose password/dsn in logs
shell: bash -euo pipefail {0}
run: |
project=$(curl \
"https://${API_HOST}/api/v2/projects" \
"https://${API_HOST}/api/v1/projects" \
--fail \
--header "Accept: application/json" \
--header "Content-Type: application/json" \
@@ -40,7 +61,7 @@ runs:
--data "{
\"project\": {
\"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\",
\"pg_version\": ${POSTGRES_VERSION},
\"platform_id\": \"aws\",
\"region_id\": \"${REGION_ID}\",
\"settings\": { }
}
@@ -49,16 +70,13 @@ runs:
# Mask password
echo "::add-mask::$(echo $project | jq --raw-output '.roles[] | select(.name != "web_access") | .password')"
dsn=$(echo $project | jq --raw-output '.connection_uris[0].connection_uri')
dsn=$(echo $project | jq --raw-output '.roles[] | select(.name != "web_access") | .dsn')/main
echo "::add-mask::${dsn}"
echo "dsn=${dsn}" >> $GITHUB_OUTPUT
project_id=$(echo $project | jq --raw-output '.project.id')
project_id=$(echo $project | jq --raw-output '.id')
echo "project_id=${project_id}" >> $GITHUB_OUTPUT
echo "Project ${project_id} has been created"
env:
API_HOST: ${{ inputs.api_host }}
API_KEY: ${{ inputs.api_key }}
REGION_ID: ${{ inputs.region_id }}
POSTGRES_VERSION: ${{ inputs.postgres_version }}
API_HOST: ${{ steps.parse-input.outputs.api_host }}
REGION_ID: ${{ steps.parse-input.outputs.region_id }}

View File

@@ -5,31 +5,50 @@ inputs:
api_key:
desctiption: 'Neon API key'
required: true
environment:
desctiption: 'dev (aka captest) or stage'
required: true
project_id:
desctiption: 'ID of the Project to delete'
required: true
api_host:
desctiption: 'Neon API host'
default: console.stage.neon.tech
runs:
using: "composite"
steps:
- name: Delete Neon Project
# Do not try to delete a project if .github/actions/neon-project-create failed before
if: ${{ inputs.project_id != '' }}
- name: Parse Input
id: parse-input
shell: bash -euxo pipefail {0}
run: |
curl \
"https://${API_HOST}/api/v2/projects/${PROJECT_ID}" \
--fail \
--request DELETE \
--header "Accept: application/json" \
--header "Content-Type: application/json" \
--header "Authorization: Bearer ${API_KEY}"
case "${ENVIRONMENT}" in
dev)
API_HOST=console.dev.neon.tech
;;
staging)
API_HOST=console.stage.neon.tech
;;
*)
echo 2>&1 "Unknown environment=${ENVIRONMENT}. Allowed 'dev' or 'staging' only"
exit 1
;;
esac
echo "Project ${PROJECT_ID} has been deleted"
echo "api_host=${API_HOST}" >> $GITHUB_OUTPUT
env:
ENVIRONMENT: ${{ inputs.environment }}
- name: Delete Neon Project
shell: bash -euxo pipefail {0}
run: |
# Allow PROJECT_ID to be empty/null for cases when .github/actions/neon-project-create failed
if [ -n "${PROJECT_ID}" ]; then
curl -X "POST" \
"https://${API_HOST}/api/v1/projects/${PROJECT_ID}/delete" \
--fail \
--header "Accept: application/json" \
--header "Content-Type: application/json" \
--header "Authorization: Bearer ${API_KEY}"
fi
env:
API_HOST: ${{ inputs.api_host }}
API_KEY: ${{ inputs.api_key }}
PROJECT_ID: ${{ inputs.project_id }}
API_HOST: ${{ steps.parse-input.outputs.api_host }}

View File

@@ -123,8 +123,8 @@ runs:
exit 1
fi
if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then
# -n16 uses sixteen processes to run tests via pytest-xdist
EXTRA_PARAMS="-n16 $EXTRA_PARAMS"
# -n4 uses four processes to run tests via pytest-xdist
EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
# --dist=loadgroup points tests marked with @pytest.mark.xdist_group
# to the same worker to make @pytest.mark.order work with xdist

View File

@@ -3,6 +3,7 @@
localhost_warning = False
host_key_checking = False
timeout = 30
collections_paths = ./collections
[ssh_connection]
ssh_args = -F ./ansible.ssh.cfg

View File

@@ -117,8 +117,7 @@
shell:
cmd: |
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/pageservers/$INSTANCE_ID | jq '.version = {{ current_version }}' > /tmp/new_version
curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/pageservers
curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/pageservers/$INSTANCE_ID
tags:
- pageserver
@@ -187,7 +186,6 @@
shell:
cmd: |
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/safekeepers/$INSTANCE_ID | jq '.version = {{ current_version }}' > /tmp/new_version
curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/safekeepers
curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/safekeepers/$INSTANCE_ID
tags:
- safekeeper

31
.github/ansible/neon-stress.hosts.yaml vendored Normal file
View File

@@ -0,0 +1,31 @@
storage:
vars:
bucket_name: neon-storage-ireland
bucket_region: eu-west-1
console_mgmt_base_url: http://neon-stress-console.local
etcd_endpoints: neon-stress-etcd.local:2379
safekeeper_enable_s3_offload: 'false'
pageserver_config_stub:
pg_distrib_dir: /usr/local
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "{{ inventory_hostname }}"
safekeeper_s3_prefix: neon-stress/wal
hostname_suffix: ".local"
remote_user: admin
children:
pageservers:
hosts:
neon-stress-ps-1:
console_region_id: aws-eu-west-1
neon-stress-ps-2:
console_region_id: aws-eu-west-1
safekeepers:
hosts:
neon-stress-sk-1:
console_region_id: aws-eu-west-1
neon-stress-sk-2:
console_region_id: aws-eu-west-1
neon-stress-sk-3:
console_region_id: aws-eu-west-1

View File

@@ -3,11 +3,9 @@ storage:
bucket_name: neon-prod-storage-ap-southeast-1
bucket_region: ap-southeast-1
console_mgmt_base_url: http://console-release.local
broker_endpoint: http://storage-broker-lb.epsilon.ap-southeast-1.internal.aws.neon.tech:50051
etcd_endpoints: etcd-0.ap-southeast-1.aws.neon.tech:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
metric_collection_interval: 10min
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
@@ -18,7 +16,6 @@ storage:
ansible_aws_ssm_region: ap-southeast-1
ansible_aws_ssm_bucket_name: neon-prod-storage-ap-southeast-1
console_region_id: aws-ap-southeast-1
sentry_environment: production
children:
pageservers:

View File

@@ -3,11 +3,9 @@ storage:
bucket_name: neon-prod-storage-eu-central-1
bucket_region: eu-central-1
console_mgmt_base_url: http://console-release.local
broker_endpoint: http://storage-broker-lb.gamma.eu-central-1.internal.aws.neon.tech:50051
etcd_endpoints: etcd-0.eu-central-1.aws.neon.tech:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
metric_collection_interval: 10min
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
@@ -18,7 +16,6 @@ storage:
ansible_aws_ssm_region: eu-central-1
ansible_aws_ssm_bucket_name: neon-prod-storage-eu-central-1
console_region_id: aws-eu-central-1
sentry_environment: production
children:
pageservers:

View File

@@ -3,11 +3,9 @@ storage:
bucket_name: neon-prod-storage-us-east-2
bucket_region: us-east-2
console_mgmt_base_url: http://console-release.local
broker_endpoint: http://storage-broker-lb.delta.us-east-2.internal.aws.neon.tech:50051
etcd_endpoints: etcd-0.us-east-2.aws.neon.tech:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
metric_collection_interval: 10min
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
@@ -18,7 +16,6 @@ storage:
ansible_aws_ssm_region: us-east-2
ansible_aws_ssm_bucket_name: neon-prod-storage-us-east-2
console_region_id: aws-us-east-2
sentry_environment: production
children:
pageservers:
@@ -36,4 +33,4 @@ storage:
ansible_host: i-06d113fb73bfddeb0
safekeeper-2.us-east-2.aws.neon.tech:
ansible_host: i-09f66c8e04afff2e8

View File

@@ -1,41 +0,0 @@
storage:
vars:
bucket_name: neon-prod-storage-us-west-2
bucket_region: us-west-2
console_mgmt_base_url: http://console-release.local
broker_endpoint: http://storage-broker-lb.eta.us-west-2.internal.aws.neon.tech:50051
pageserver_config_stub:
pg_distrib_dir: /usr/local
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
metric_collection_interval: 10min
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "pageserver/v1"
safekeeper_s3_prefix: safekeeper/v1/wal
hostname_suffix: ""
remote_user: ssm-user
ansible_aws_ssm_region: us-west-2
ansible_aws_ssm_bucket_name: neon-prod-storage-us-west-2
console_region_id: aws-us-west-2-new
sentry_environment: production
children:
pageservers:
hosts:
pageserver-0.us-west-2.aws.neon.tech:
ansible_host: i-0d9f6dfae0e1c780d
pageserver-1.us-west-2.aws.neon.tech:
ansible_host: i-0c834be1dddba8b3f
pageserver-2.us-west-2.aws.neon.tech:
ansible_host: i-051642d372c0a4f32
safekeepers:
hosts:
safekeeper-0.us-west-2.aws.neon.tech:
ansible_host: i-00719d8a74986fda6
safekeeper-1.us-west-2.aws.neon.tech:
ansible_host: i-074682f9d3c712e7c
safekeeper-2.us-west-2.aws.neon.tech:
ansible_host: i-042b7efb1729d7966

View File

@@ -4,11 +4,9 @@ storage:
console_mgmt_base_url: http://console-release.local
bucket_name: zenith-storage-oregon
bucket_region: us-west-2
broker_endpoint: http://storage-broker.prod.local:50051
etcd_endpoints: zenith-1-etcd.local:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
metric_collection_interval: 10min
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
@@ -16,7 +14,6 @@ storage:
safekeeper_s3_prefix: prod-1/wal
hostname_suffix: ".local"
remote_user: admin
sentry_environment: production
children:
pageservers:
@@ -36,5 +33,5 @@ storage:
console_region_id: aws-us-west-2
zenith-1-sk-2:
console_region_id: aws-us-west-2
zenith-1-sk-4:
zenith-1-sk-3:
console_region_id: aws-us-west-2

View File

@@ -1,8 +1,7 @@
#!/bin/sh
# fetch params from meta-data service
# get instance id from meta-data service
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
AZ_ID=$(curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone)
# store fqdn hostname in var
HOST=$(hostname -f)
@@ -17,8 +16,7 @@ cat <<EOF | tee /tmp/payload
"instance_id": "${INSTANCE_ID}",
"http_host": "${HOST}",
"http_port": 9898,
"active": false,
"availability_zone_id": "${AZ_ID}"
"active": false
}
EOF

View File

@@ -3,11 +3,9 @@ storage:
bucket_name: neon-dev-storage-eu-west-1
bucket_region: eu-west-1
console_mgmt_base_url: http://console-staging.local
broker_endpoint: http://storage-broker-lb.zeta.eu-west-1.internal.aws.neon.build:50051
etcd_endpoints: etcd-0.eu-west-1.aws.neon.build:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events
metric_collection_interval: 10min
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
@@ -18,7 +16,6 @@ storage:
ansible_aws_ssm_region: eu-west-1
ansible_aws_ssm_bucket_name: neon-dev-storage-eu-west-1
console_region_id: aws-eu-west-1
sentry_environment: staging
children:
pageservers:

34
.github/ansible/staging.hosts.yaml vendored Normal file
View File

@@ -0,0 +1,34 @@
storage:
vars:
bucket_name: zenith-staging-storage-us-east-1
bucket_region: us-east-1
console_mgmt_base_url: http://console-staging.local
etcd_endpoints: etcd-0.us-east-2.aws.neon.build:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "{{ inventory_hostname }}"
safekeeper_s3_prefix: us-stage/wal
hostname_suffix: ".local"
remote_user: admin
children:
pageservers:
hosts:
zenith-us-stage-ps-2:
console_region_id: aws-us-east-1
zenith-us-stage-ps-3:
console_region_id: aws-us-east-1
zenith-us-stage-ps-4:
console_region_id: aws-us-east-1
safekeepers:
hosts:
zenith-us-stage-sk-4:
console_region_id: aws-us-east-1
zenith-us-stage-sk-5:
console_region_id: aws-us-east-1
zenith-us-stage-sk-6:
console_region_id: aws-us-east-1

View File

@@ -3,11 +3,9 @@ storage:
bucket_name: neon-staging-storage-us-east-2
bucket_region: us-east-2
console_mgmt_base_url: http://console-staging.local
broker_endpoint: http://storage-broker-lb.beta.us-east-2.internal.aws.neon.build:50051
etcd_endpoints: etcd-0.us-east-2.aws.neon.build:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events
metric_collection_interval: 10min
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
@@ -18,7 +16,6 @@ storage:
ansible_aws_ssm_region: us-east-2
ansible_aws_ssm_bucket_name: neon-staging-storage-us-east-2
console_region_id: aws-us-east-2
sentry_environment: staging
children:
pageservers:
@@ -27,10 +24,6 @@ storage:
ansible_host: i-0c3e70929edb5d691
pageserver-1.us-east-2.aws.neon.build:
ansible_host: i-0565a8b4008aa3f40
pageserver-2.us-east-2.aws.neon.build:
ansible_host: i-01e31cdf7e970586a
pageserver-3.us-east-2.aws.neon.build:
ansible_host: i-0602a0291365ef7cc
safekeepers:
hosts:

View File

@@ -5,8 +5,8 @@ After=network.target auditd.service
[Service]
Type=simple
User=pageserver
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_PAGESERVER }} SENTRY_ENVIRONMENT={{ sentry_environment }}
ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoint='{{ broker_endpoint }}'" -D /storage/pageserver/data
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/v14/lib
ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoints=['{{ etcd_endpoints }}']" -D /storage/pageserver/data
ExecReload=/bin/kill -HUP $MAINPID
KillMode=mixed
KillSignal=SIGINT

View File

@@ -5,8 +5,8 @@ After=network.target auditd.service
[Service]
Type=simple
User=safekeeper
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_SAFEKEEPER }} SENTRY_ENVIRONMENT={{ sentry_environment }}
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoint={{ broker_endpoint }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ safekeeper_s3_prefix }}"}'
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ safekeeper_s3_prefix }}"}'
ExecReload=/bin/kill -HUP $MAINPID
KillMode=mixed
KillSignal=SIGINT

View File

@@ -8,10 +8,6 @@ settings:
authBackend: "console"
authEndpoint: "http://console-staging.local/management/api/v2"
domain: "*.eu-west-1.aws.neon.build"
sentryEnvironment: "staging"
wssPort: 8443
metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
metricCollectionInterval: "1min"
# -- Additional labels for neon-proxy pods
podLabels:
@@ -26,7 +22,6 @@ exposedService:
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: eu-west-1.aws.neon.build
httpsPort: 443
#metrics:
# enabled: true
@@ -34,28 +29,3 @@ exposedService:
# enabled: true
# selector:
# release: kube-prometheus-stack
extraManifests:
- apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: "{{ include \"neon-proxy.fullname\" . }}"
labels:
helm.sh/chart: neon-proxy-{{ .Chart.Version }}
app.kubernetes.io/name: neon-proxy
app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
app.kubernetes.io/managed-by: Helm
namespace: "{{ .Release.Namespace }}"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "neon-proxy"
endpoints:
- port: http
path: /metrics
interval: 10s
scrapeTimeout: 10s
namespaceSelector:
matchNames:
- "{{ .Release.Namespace }}"

View File

@@ -1,52 +0,0 @@
# Helm chart values for neon-storage-broker
podLabels:
neon_env: staging
neon_service: storage-broker
# Use L4 LB
service:
# service.annotations -- Annotations to add to the service
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet
# assign service to this name at external-dns
external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.zeta.eu-west-1.internal.aws.neon.build
# service.type -- Service type
type: LoadBalancer
# service.port -- broker listen port
port: 50051
ingress:
enabled: false
metrics:
enabled: false
extraManifests:
- apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: "{{ include \"neon-storage-broker.fullname\" . }}"
labels:
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
app.kubernetes.io/name: neon-storage-broker
app.kubernetes.io/instance: neon-storage-broker
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
app.kubernetes.io/managed-by: Helm
namespace: "{{ .Release.Namespace }}"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "neon-storage-broker"
endpoints:
- port: broker
path: /metrics
interval: 10s
scrapeTimeout: 10s
namespaceSelector:
matchNames:
- "{{ .Release.Namespace }}"
settings:
sentryEnvironment: "staging"

View File

@@ -8,10 +8,6 @@ settings:
authBackend: "link"
authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/"
uri: "https://console.stage.neon.tech/psql_session/"
domain: "pg.neon.build"
sentryEnvironment: "staging"
metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
metricCollectionInterval: "1min"
# -- Additional labels for neon-proxy-link pods
podLabels:
@@ -41,28 +37,3 @@ exposedService:
# enabled: true
# selector:
# release: kube-prometheus-stack
extraManifests:
- apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: "{{ include \"neon-proxy.fullname\" . }}"
labels:
helm.sh/chart: neon-proxy-{{ .Chart.Version }}
app.kubernetes.io/name: neon-proxy
app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
app.kubernetes.io/managed-by: Helm
namespace: "{{ .Release.Namespace }}"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "neon-proxy"
endpoints:
- port: http
path: /metrics
interval: 10s
scrapeTimeout: 10s
namespaceSelector:
matchNames:
- "{{ .Release.Namespace }}"

View File

@@ -1,61 +0,0 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.
image:
repository: neondatabase/neon
settings:
authBackend: "console"
authEndpoint: "http://console-staging.local/management/api/v2"
domain: "*.cloud.stage.neon.tech"
sentryEnvironment: "staging"
wssPort: 8443
metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
metricCollectionInterval: "1min"
# -- Additional labels for neon-proxy pods
podLabels:
zenith_service: proxy-scram-legacy
zenith_env: dev
zenith_region: us-east-2
zenith_region_slug: us-east-2
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: neon-proxy-scram-legacy.beta.us-east-2.aws.neon.build
httpsPort: 443
#metrics:
# enabled: true
# serviceMonitor:
# enabled: true
# selector:
# release: kube-prometheus-stack
extraManifests:
- apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: "{{ include \"neon-proxy.fullname\" . }}"
labels:
helm.sh/chart: neon-proxy-{{ .Chart.Version }}
app.kubernetes.io/name: neon-proxy
app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
app.kubernetes.io/managed-by: Helm
namespace: "{{ .Release.Namespace }}"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "neon-proxy"
endpoints:
- port: http
path: /metrics
interval: 10s
scrapeTimeout: 10s
namespaceSelector:
matchNames:
- "{{ .Release.Namespace }}"

View File

@@ -8,10 +8,6 @@ settings:
authBackend: "console"
authEndpoint: "http://console-staging.local/management/api/v2"
domain: "*.us-east-2.aws.neon.build"
sentryEnvironment: "staging"
wssPort: 8443
metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
metricCollectionInterval: "1min"
# -- Additional labels for neon-proxy pods
podLabels:
@@ -26,7 +22,6 @@ exposedService:
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.build
httpsPort: 443
#metrics:
# enabled: true
@@ -34,28 +29,3 @@ exposedService:
# enabled: true
# selector:
# release: kube-prometheus-stack
extraManifests:
- apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: "{{ include \"neon-proxy.fullname\" . }}"
labels:
helm.sh/chart: neon-proxy-{{ .Chart.Version }}
app.kubernetes.io/name: neon-proxy
app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
app.kubernetes.io/managed-by: Helm
namespace: "{{ .Release.Namespace }}"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "neon-proxy"
endpoints:
- port: http
path: /metrics
interval: 10s
scrapeTimeout: 10s
namespaceSelector:
matchNames:
- "{{ .Release.Namespace }}"

View File

@@ -1,52 +0,0 @@
# Helm chart values for neon-storage-broker
podLabels:
neon_env: staging
neon_service: storage-broker
# Use L4 LB
service:
# service.annotations -- Annotations to add to the service
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet
# assign service to this name at external-dns
external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.beta.us-east-2.internal.aws.neon.build
# service.type -- Service type
type: LoadBalancer
# service.port -- broker listen port
port: 50051
ingress:
enabled: false
metrics:
enabled: false
extraManifests:
- apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: "{{ include \"neon-storage-broker.fullname\" . }}"
labels:
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
app.kubernetes.io/name: neon-storage-broker
app.kubernetes.io/instance: neon-storage-broker
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
app.kubernetes.io/managed-by: Helm
namespace: "{{ .Release.Namespace }}"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "neon-storage-broker"
endpoints:
- port: broker
path: /metrics
interval: 10s
scrapeTimeout: 10s
namespaceSelector:
matchNames:
- "{{ .Release.Namespace }}"
settings:
sentryEnvironment: "staging"

View File

@@ -0,0 +1,26 @@
fullnameOverride: "neon-stress-proxy-scram"
settings:
authBackend: "console"
authEndpoint: "http://neon-stress-console.local/management/api/v2"
domain: "*.stress.neon.tech"
podLabels:
zenith_service: proxy-scram
zenith_env: staging
zenith_region: eu-west-1
zenith_region_slug: ireland
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: '*.stress.neon.tech'
metrics:
enabled: true
serviceMonitor:
enabled: true
selector:
release: kube-prometheus-stack

View File

@@ -0,0 +1,35 @@
fullnameOverride: "neon-stress-proxy"
settings:
authBackend: "link"
authEndpoint: "https://console.dev.neon.tech/authenticate_proxy_request/"
uri: "https://console.dev.neon.tech/psql_session/"
# -- Additional labels for zenith-proxy pods
podLabels:
zenith_service: proxy
zenith_env: staging
zenith_region: eu-west-1
zenith_region_slug: ireland
service:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internal
external-dns.alpha.kubernetes.io/hostname: neon-stress-proxy.local
type: LoadBalancer
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: connect.dev.neon.tech
metrics:
enabled: true
serviceMonitor:
enabled: true
selector:
release: kube-prometheus-stack

View File

@@ -8,10 +8,6 @@ settings:
authBackend: "console"
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.ap-southeast-1.aws.neon.tech"
sentryEnvironment: "production"
wssPort: 8443
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
metricCollectionInterval: "10min"
# -- Additional labels for neon-proxy pods
podLabels:
@@ -26,7 +22,6 @@ exposedService:
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: ap-southeast-1.aws.neon.tech
httpsPort: 443
#metrics:
# enabled: true
@@ -34,28 +29,3 @@ exposedService:
# enabled: true
# selector:
# release: kube-prometheus-stack
extraManifests:
- apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: "{{ include \"neon-proxy.fullname\" . }}"
labels:
helm.sh/chart: neon-proxy-{{ .Chart.Version }}
app.kubernetes.io/name: neon-proxy
app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
app.kubernetes.io/managed-by: Helm
namespace: "{{ .Release.Namespace }}"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "neon-proxy"
endpoints:
- port: http
path: /metrics
interval: 10s
scrapeTimeout: 10s
namespaceSelector:
matchNames:
- "{{ .Release.Namespace }}"

View File

@@ -1,52 +0,0 @@
# Helm chart values for neon-storage-broker
podLabels:
neon_env: production
neon_service: storage-broker
# Use L4 LB
service:
# service.annotations -- Annotations to add to the service
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet
# assign service to this name at external-dns
external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.epsilon.ap-southeast-1.internal.aws.neon.tech
# service.type -- Service type
type: LoadBalancer
# service.port -- broker listen port
port: 50051
ingress:
enabled: false
metrics:
enabled: false
extraManifests:
- apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: "{{ include \"neon-storage-broker.fullname\" . }}"
labels:
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
app.kubernetes.io/name: neon-storage-broker
app.kubernetes.io/instance: neon-storage-broker
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
app.kubernetes.io/managed-by: Helm
namespace: "{{ .Release.Namespace }}"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "neon-storage-broker"
endpoints:
- port: broker
path: /metrics
interval: 10s
scrapeTimeout: 10s
namespaceSelector:
matchNames:
- "{{ .Release.Namespace }}"
settings:
sentryEnvironment: "production"

View File

@@ -8,10 +8,6 @@ settings:
authBackend: "console"
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.eu-central-1.aws.neon.tech"
sentryEnvironment: "production"
wssPort: 8443
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
metricCollectionInterval: "10min"
# -- Additional labels for neon-proxy pods
podLabels:
@@ -26,7 +22,6 @@ exposedService:
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: eu-central-1.aws.neon.tech
httpsPort: 443
#metrics:
# enabled: true
@@ -34,28 +29,3 @@ exposedService:
# enabled: true
# selector:
# release: kube-prometheus-stack
extraManifests:
- apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: "{{ include \"neon-proxy.fullname\" . }}"
labels:
helm.sh/chart: neon-proxy-{{ .Chart.Version }}
app.kubernetes.io/name: neon-proxy
app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
app.kubernetes.io/managed-by: Helm
namespace: "{{ .Release.Namespace }}"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "neon-proxy"
endpoints:
- port: http
path: /metrics
interval: 10s
scrapeTimeout: 10s
namespaceSelector:
matchNames:
- "{{ .Release.Namespace }}"

View File

@@ -1,52 +0,0 @@
# Helm chart values for neon-storage-broker
podLabels:
neon_env: production
neon_service: storage-broker
# Use L4 LB
service:
# service.annotations -- Annotations to add to the service
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet
# assign service to this name at external-dns
external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.gamma.eu-central-1.internal.aws.neon.tech
# service.type -- Service type
type: LoadBalancer
# service.port -- broker listen port
port: 50051
ingress:
enabled: false
metrics:
enabled: false
extraManifests:
- apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: "{{ include \"neon-storage-broker.fullname\" . }}"
labels:
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
app.kubernetes.io/name: neon-storage-broker
app.kubernetes.io/instance: neon-storage-broker
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
app.kubernetes.io/managed-by: Helm
namespace: "{{ .Release.Namespace }}"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "neon-storage-broker"
endpoints:
- port: broker
path: /metrics
interval: 10s
scrapeTimeout: 10s
namespaceSelector:
matchNames:
- "{{ .Release.Namespace }}"
settings:
sentryEnvironment: "production"

View File

@@ -1,59 +0,0 @@
# Helm chart values for neon-proxy-link.
# This is a YAML-formatted file.
image:
repository: neondatabase/neon
settings:
authBackend: "link"
authEndpoint: "https://console.neon.tech/authenticate_proxy_request/"
uri: "https://console.neon.tech/psql_session/"
domain: "pg.neon.tech"
sentryEnvironment: "production"
# -- Additional labels for zenith-proxy pods
podLabels:
zenith_service: proxy
zenith_env: production
zenith_region: us-east-2
zenith_region_slug: us-east-2
service:
type: LoadBalancer
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internal
external-dns.alpha.kubernetes.io/hostname: neon-proxy-link-mgmt.delta.us-east-2.aws.neon.tech
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: neon-proxy-link.delta.us-east-2.aws.neon.tech
extraManifests:
- apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: "{{ include \"neon-proxy.fullname\" . }}"
labels:
helm.sh/chart: neon-proxy-{{ .Chart.Version }}
app.kubernetes.io/name: neon-proxy
app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
app.kubernetes.io/managed-by: Helm
namespace: "{{ .Release.Namespace }}"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "neon-proxy"
endpoints:
- port: http
path: /metrics
interval: 10s
scrapeTimeout: 10s
namespaceSelector:
matchNames:
- "{{ .Release.Namespace }}"

View File

@@ -8,10 +8,6 @@ settings:
authBackend: "console"
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.us-east-2.aws.neon.tech"
sentryEnvironment: "production"
wssPort: 8443
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
metricCollectionInterval: "10min"
# -- Additional labels for neon-proxy pods
podLabels:
@@ -26,7 +22,6 @@ exposedService:
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.tech
httpsPort: 443
#metrics:
# enabled: true
@@ -34,28 +29,3 @@ exposedService:
# enabled: true
# selector:
# release: kube-prometheus-stack
extraManifests:
- apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: "{{ include \"neon-proxy.fullname\" . }}"
labels:
helm.sh/chart: neon-proxy-{{ .Chart.Version }}
app.kubernetes.io/name: neon-proxy
app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
app.kubernetes.io/managed-by: Helm
namespace: "{{ .Release.Namespace }}"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "neon-proxy"
endpoints:
- port: http
path: /metrics
interval: 10s
scrapeTimeout: 10s
namespaceSelector:
matchNames:
- "{{ .Release.Namespace }}"

View File

@@ -1,52 +0,0 @@
# Helm chart values for neon-storage-broker
podLabels:
neon_env: production
neon_service: storage-broker
# Use L4 LB
service:
# service.annotations -- Annotations to add to the service
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet
# assign service to this name at external-dns
external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.delta.us-east-2.internal.aws.neon.tech
# service.type -- Service type
type: LoadBalancer
# service.port -- broker listen port
port: 50051
ingress:
enabled: false
metrics:
enabled: false
extraManifests:
- apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: "{{ include \"neon-storage-broker.fullname\" . }}"
labels:
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
app.kubernetes.io/name: neon-storage-broker
app.kubernetes.io/instance: neon-storage-broker
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
app.kubernetes.io/managed-by: Helm
namespace: "{{ .Release.Namespace }}"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "neon-storage-broker"
endpoints:
- port: broker
path: /metrics
interval: 10s
scrapeTimeout: 10s
namespaceSelector:
matchNames:
- "{{ .Release.Namespace }}"
settings:
sentryEnvironment: "production"

View File

@@ -1,61 +0,0 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.
image:
repository: neondatabase/neon
settings:
authBackend: "console"
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.cloud.neon.tech"
sentryEnvironment: "production"
wssPort: 8443
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
metricCollectionInterval: "10min"
# -- Additional labels for neon-proxy pods
podLabels:
zenith_service: proxy-scram
zenith_env: prod
zenith_region: us-west-2
zenith_region_slug: us-west-2
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: neon-proxy-scram-legacy.eta.us-west-2.aws.neon.tech
httpsPort: 443
#metrics:
# enabled: true
# serviceMonitor:
# enabled: true
# selector:
# release: kube-prometheus-stack
extraManifests:
- apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: "{{ include \"neon-proxy.fullname\" . }}"
labels:
helm.sh/chart: neon-proxy-{{ .Chart.Version }}
app.kubernetes.io/name: neon-proxy
app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
app.kubernetes.io/managed-by: Helm
namespace: "{{ .Release.Namespace }}"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "neon-proxy"
endpoints:
- port: http
path: /metrics
interval: 10s
scrapeTimeout: 10s
namespaceSelector:
matchNames:
- "{{ .Release.Namespace }}"

View File

@@ -1,61 +0,0 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.
image:
repository: neondatabase/neon
settings:
authBackend: "console"
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.us-west-2.aws.neon.tech"
sentryEnvironment: "production"
wssPort: 8443
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
metricCollectionInterval: "10min"
# -- Additional labels for neon-proxy pods
podLabels:
zenith_service: proxy-scram
zenith_env: prod
zenith_region: us-west-2
zenith_region_slug: us-west-2
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: us-west-2.aws.neon.tech
httpsPort: 443
#metrics:
# enabled: true
# serviceMonitor:
# enabled: true
# selector:
# release: kube-prometheus-stack
extraManifests:
- apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: "{{ include \"neon-proxy.fullname\" . }}"
labels:
helm.sh/chart: neon-proxy-{{ .Chart.Version }}
app.kubernetes.io/name: neon-proxy
app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
app.kubernetes.io/managed-by: Helm
namespace: "{{ .Release.Namespace }}"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "neon-proxy"
endpoints:
- port: http
path: /metrics
interval: 10s
scrapeTimeout: 10s
namespaceSelector:
matchNames:
- "{{ .Release.Namespace }}"

View File

@@ -1,52 +0,0 @@
# Helm chart values for neon-storage-broker
podLabels:
neon_env: production
neon_service: storage-broker
# Use L4 LB
service:
# service.annotations -- Annotations to add to the service
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet
# assign service to this name at external-dns
external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.eta.us-west-2.internal.aws.neon.tech
# service.type -- Service type
type: LoadBalancer
# service.port -- broker listen port
port: 50051
ingress:
enabled: false
metrics:
enabled: false
extraManifests:
- apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: "{{ include \"neon-storage-broker.fullname\" . }}"
labels:
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
app.kubernetes.io/name: neon-storage-broker
app.kubernetes.io/instance: neon-storage-broker
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
app.kubernetes.io/managed-by: Helm
namespace: "{{ .Release.Namespace }}"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "neon-storage-broker"
endpoints:
- port: broker
path: /metrics
interval: 10s
scrapeTimeout: 10s
namespaceSelector:
matchNames:
- "{{ .Release.Namespace }}"
settings:
sentryEnvironment: "production"

View File

@@ -1,56 +0,0 @@
# Helm chart values for neon-storage-broker
podLabels:
neon_env: production
neon_service: storage-broker
# Use L4 LB
service:
# service.annotations -- Annotations to add to the service
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet
# assign service to this name at external-dns
external-dns.alpha.kubernetes.io/hostname: storage-broker.prod.local
# service.type -- Service type
type: LoadBalancer
# service.port -- broker listen port
port: 50051
ingress:
enabled: false
metrics:
enabled: true
serviceMonitor:
enabled: true
selector:
release: kube-prometheus-stack
extraManifests:
- apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: "{{ include \"neon-storage-broker.fullname\" . }}"
labels:
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
app.kubernetes.io/name: neon-storage-broker
app.kubernetes.io/instance: neon-storage-broker
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
app.kubernetes.io/managed-by: Helm
namespace: "{{ .Release.Namespace }}"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "neon-storage-broker"
endpoints:
- port: broker
path: /metrics
interval: 10s
scrapeTimeout: 10s
namespaceSelector:
matchNames:
- "{{ .Release.Namespace }}"
settings:
sentryEnvironment: "production"

View File

@@ -0,0 +1,24 @@
settings:
authBackend: "console"
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.cloud.neon.tech"
podLabels:
zenith_service: proxy-scram
zenith_env: production
zenith_region: us-west-2
zenith_region_slug: oregon
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: '*.cloud.neon.tech'
metrics:
enabled: true
serviceMonitor:
enabled: true
selector:
release: kube-prometheus-stack

View File

@@ -0,0 +1,33 @@
settings:
authBackend: "link"
authEndpoint: "https://console.neon.tech/authenticate_proxy_request/"
uri: "https://console.neon.tech/psql_session/"
# -- Additional labels for zenith-proxy pods
podLabels:
zenith_service: proxy
zenith_env: production
zenith_region: us-west-2
zenith_region_slug: oregon
service:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internal
external-dns.alpha.kubernetes.io/hostname: proxy-release.local
type: LoadBalancer
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: connect.neon.tech,pg.neon.tech
metrics:
enabled: true
serviceMonitor:
enabled: true
selector:
release: kube-prometheus-stack

View File

@@ -0,0 +1,31 @@
# Helm chart values for zenith-proxy.
# This is a YAML-formatted file.
image:
repository: neondatabase/neon
settings:
authBackend: "console"
authEndpoint: "http://console-staging.local/management/api/v2"
domain: "*.cloud.stage.neon.tech"
# -- Additional labels for zenith-proxy pods
podLabels:
zenith_service: proxy-scram
zenith_env: staging
zenith_region: us-east-1
zenith_region_slug: virginia
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: cloud.stage.neon.tech
metrics:
enabled: true
serviceMonitor:
enabled: true
selector:
release: kube-prometheus-stack

31
.github/helm-values/staging.proxy.yaml vendored Normal file
View File

@@ -0,0 +1,31 @@
# Helm chart values for zenith-proxy.
# This is a YAML-formatted file.
image:
repository: neondatabase/neon
settings:
authBackend: "link"
authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/"
uri: "https://console.stage.neon.tech/psql_session/"
# -- Additional labels for zenith-proxy pods
podLabels:
zenith_service: proxy
zenith_env: staging
zenith_region: us-east-1
zenith_region_slug: virginia
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: connect.stage.neon.tech
metrics:
enabled: true
serviceMonitor:
enabled: true
selector:
release: kube-prometheus-stack

View File

@@ -1,10 +0,0 @@
## Describe your changes
## Issue ticket number and link
## Checklist before requesting a review
- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.

View File

@@ -15,10 +15,12 @@ on:
workflow_dispatch: # adds ability to run this manually
inputs:
environment:
description: 'Environment to run remote tests on (dev or staging)'
required: false
region_id:
description: 'Use a particular region. If not set the default region will be used'
required: false
default: 'aws-us-east-2'
save_perf_report:
type: boolean
description: 'Publish perf report or not. If not set, the report is published only for the main branch'
@@ -35,69 +37,97 @@ concurrency:
jobs:
bench:
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "300"
TEST_PG_BENCH_SCALES_MATRIX: "10,100"
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 14
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
PLATFORM: "neon-staging"
# this workflow runs on self hosteed runner
# it's environment is quite different from usual guthub runner
# probably the most important difference is that it doesn't start from clean workspace each time
# e g if you install system packages they are not cleaned up since you install them directly in host machine
# not a container or something
# See documentation for more info: https://docs.github.com/en/actions/hosting-your-own-runners/about-self-hosted-runners
runs-on: [self-hosted, zenith-benchmarker]
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
env:
POSTGRES_DISTRIB_DIR: /usr/pgsql
DEFAULT_PG_VERSION: 14
steps:
- uses: actions/checkout@v3
- name: Checkout zenith repo
uses: actions/checkout@v3
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-release-artifact
path: /tmp/neon/
prefix: latest
# actions/setup-python@v2 is not working correctly on self-hosted runners
# see https://github.com/actions/setup-python/issues/162
# and probably https://github.com/actions/setup-python/issues/162#issuecomment-865387976 in particular
# so the simplest solution to me is to use already installed system python and spin virtualenvs for job runs.
# there is Python 3.7.10 already installed on the machine so use it to install poetry and then use poetry's virtuealenvs
- name: Install poetry & deps
run: |
python3 -m pip install --upgrade poetry wheel
# since pip/poetry caches are reused there shouldn't be any troubles with install every time
./scripts/pysync
- name: Show versions
run: |
echo Python
python3 --version
poetry run python3 --version
echo Poetry
poetry --version
echo Pgbench
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
- name: Create Neon Project
id: create-neon-project
uses: ./.github/actions/neon-project-create
with:
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
environment: ${{ github.event.inputs.environment || 'staging' }}
api_key: ${{ ( github.event.inputs.environment || 'staging' ) == 'staging' && secrets.NEON_STAGING_API_KEY || secrets.NEON_CAPTEST_API_KEY }}
- name: Run benchmark
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
# Set --sparse-ordering option of pytest-order plugin
# to ensure tests are running in order of appears in the file.
# It's important for test_perf_pgbench.py::test_pgbench_remote_* tests
extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py
# pgbench is installed system wide from official repo
# https://download.postgresql.org/pub/repos/yum/13/redhat/rhel-7-x86_64/
# via
# sudo tee /etc/yum.repos.d/pgdg.repo<<EOF
# [pgdg13]
# name=PostgreSQL 13 for RHEL/CentOS 7 - x86_64
# baseurl=https://download.postgresql.org/pub/repos/yum/13/redhat/rhel-7-x86_64/
# enabled=1
# gpgcheck=0
# EOF
# sudo yum makecache
# sudo yum install postgresql13-contrib
# actual binaries are located in /usr/pgsql-13/bin/
env:
# The pgbench test runs two tests of given duration against each scale.
# So the total runtime with these parameters is 2 * 2 * 300 = 1200, or 20 minutes.
# Plus time needed to initialize the test databases.
TEST_PG_BENCH_DURATIONS_MATRIX: "300"
TEST_PG_BENCH_SCALES_MATRIX: "10,100"
PLATFORM: "neon-staging"
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
REMOTE_ENV: "1" # indicate to test harness that we do not have zenith binaries locally
run: |
# just to be sure that no data was cached on self hosted runner
# since it might generate duplicates when calling ingest_perf_test_result.py
rm -rf perf-report-staging
mkdir -p perf-report-staging
# Set --sparse-ordering option of pytest-order plugin to ensure tests are running in order of appears in the file,
# it's important for test_perf_pgbench.py::test_pgbench_remote_* tests
./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --sparse-ordering --out-dir perf-report-staging --timeout 5400
- name: Submit result
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
run: |
REPORT_FROM=$(realpath perf-report-staging) REPORT_TO=staging scripts/generate_and_push_perf_report.sh
- name: Delete Neon Project
if: ${{ always() }}
uses: ./.github/actions/neon-project-delete
with:
environment: staging
project_id: ${{ steps.create-neon-project.outputs.project_id }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Create Allure report
if: success() || failure()
uses: ./.github/actions/allure-report
with:
action: generate
build_type: ${{ env.BUILD_TYPE }}
- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
@@ -116,16 +146,15 @@ jobs:
# neon-captest-prefetch: Same, with prefetching enabled (new project)
# rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
# rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
platform: [ neon-captest-reuse, neon-captest-prefetch, rds-postgres ]
platform: [ neon-captest-new, neon-captest-reuse, neon-captest-prefetch, rds-postgres ]
db_size: [ 10gb ]
runner: [ us-east-2 ]
include:
- platform: neon-captest-new
db_size: 50gb
- platform: neon-captest-prefetch
db_size: 50gb
runner: us-east-2
- platform: rds-aurora
db_size: 50gb
runner: us-east-2
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "60m"
@@ -137,9 +166,9 @@ jobs:
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
PLATFORM: ${{ matrix.platform }}
runs-on: [ self-hosted, "${{ matrix.runner }}", x64 ]
runs-on: [ self-hosted, dev, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:pinned
options: --init
timeout-minutes: 360 # 6h
@@ -164,9 +193,8 @@ jobs:
id: create-neon-project
uses: ./.github/actions/neon-project-create
with:
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
environment: ${{ github.event.inputs.environment || 'dev' }}
api_key: ${{ ( github.event.inputs.environment || 'dev' ) == 'staging' && secrets.NEON_STAGING_API_KEY || secrets.NEON_CAPTEST_API_KEY }}
- name: Set up Connection String
id: set-up-connstr
@@ -179,7 +207,7 @@ jobs:
CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
;;
rds-aurora)
CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CONNSTR }}
CONNSTR=${{ secrets.BENCHMARK_RDS_CONNSTR }}
;;
rds-postgres)
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }}
@@ -197,11 +225,8 @@ jobs:
- name: Set database options
if: matrix.platform == 'neon-captest-prefetch'
run: |
DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()")
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE main SET enable_seqscan_prefetch=on"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE main SET seqscan_prefetch_buffers=10"
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
@@ -244,13 +269,6 @@ jobs:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
- name: Delete Neon Project
if: ${{ steps.create-neon-project.outputs.project_id && always() }}
uses: ./.github/actions/neon-project-delete
with:
project_id: ${{ steps.create-neon-project.outputs.project_id }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Create Allure report
if: success() || failure()
uses: ./.github/actions/allure-report
@@ -258,6 +276,14 @@ jobs:
action: generate
build_type: ${{ env.BUILD_TYPE }}
- name: Delete Neon Project
if: ${{ steps.create-neon-project.outputs.project_id && always() }}
uses: ./.github/actions/neon-project-delete
with:
environment: dev
project_id: ${{ steps.create-neon-project.outputs.project_id }}
api_key: ${{ secrets.NEON_CAPTEST_API_KEY }}
- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
@@ -266,331 +292,3 @@ jobs:
slack-message: "Periodic perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
clickbench-compare:
# ClichBench DB for rds-aurora and rds-Postgres deployed to the same clusters
# we use for performance testing in pgbench-compare.
# Run this job only when pgbench-compare is finished to avoid the intersection.
# We might change it after https://github.com/neondatabase/neon/issues/2900.
#
# *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows
# *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB
if: success() || failure()
needs: [ pgbench-compare ]
strategy:
fail-fast: false
matrix:
# neon-captest-prefetch: We have pre-created projects with prefetch enabled
# rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
# rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
platform: [ neon-captest-prefetch, rds-postgres, rds-aurora ]
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 14
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
PLATFORM: ${{ matrix.platform }}
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
timeout-minutes: 360 # 6h
steps:
- uses: actions/checkout@v3
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-release-artifact
path: /tmp/neon/
prefix: latest
- name: Add Postgres binaries to PATH
run: |
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
- name: Set up Connection String
id: set-up-connstr
run: |
case "${PLATFORM}" in
neon-captest-prefetch)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }}
;;
rds-aurora)
CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CLICKBENCH_10M_CONNSTR }}
;;
rds-postgres)
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CLICKBENCH_10M_CONNSTR }}
;;
*)
echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'"
exit 1
;;
esac
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
psql ${CONNSTR} -c "SELECT version();"
- name: Set database options
if: matrix.platform == 'neon-captest-prefetch'
run: |
DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()")
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32"
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
- name: ClickBench benchmark
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance/test_perf_olap.py
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_clickbench
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
- name: Create Allure report
if: success() || failure()
uses: ./.github/actions/allure-report
with:
action: generate
build_type: ${{ env.BUILD_TYPE }}
- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: "Periodic OLAP perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
tpch-compare:
# TCP-H DB for rds-aurora and rds-Postgres deployed to the same clusters
# we use for performance testing in pgbench-compare & clickbench-compare.
# Run this job only when clickbench-compare is finished to avoid the intersection.
# We might change it after https://github.com/neondatabase/neon/issues/2900.
#
# *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB)
if: success() || failure()
needs: [ clickbench-compare ]
strategy:
fail-fast: false
matrix:
# neon-captest-prefetch: We have pre-created projects with prefetch enabled
# rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
# rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
platform: [ neon-captest-prefetch, rds-postgres, rds-aurora ]
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 14
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
PLATFORM: ${{ matrix.platform }}
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
timeout-minutes: 360 # 6h
steps:
- uses: actions/checkout@v3
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-release-artifact
path: /tmp/neon/
prefix: latest
- name: Add Postgres binaries to PATH
run: |
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
- name: Set up Connection String
id: set-up-connstr
run: |
case "${PLATFORM}" in
neon-captest-prefetch)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_TPCH_S10_CONNSTR }}
;;
rds-aurora)
CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_TPCH_S10_CONNSTR }}
;;
rds-postgres)
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_TPCH_S10_CONNSTR }}
;;
*)
echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'"
exit 1
;;
esac
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
psql ${CONNSTR} -c "SELECT version();"
- name: Set database options
if: matrix.platform == 'neon-captest-prefetch'
run: |
DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()")
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32"
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
- name: Run TPC-H benchmark
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance/test_perf_olap.py
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_tpch
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
- name: Create Allure report
if: success() || failure()
uses: ./.github/actions/allure-report
with:
action: generate
build_type: ${{ env.BUILD_TYPE }}
- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: "Periodic TPC-H perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
user-examples-compare:
if: success() || failure()
needs: [ tpch-compare ]
strategy:
fail-fast: false
matrix:
# neon-captest-prefetch: We have pre-created projects with prefetch enabled
# rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
# rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
platform: [ neon-captest-prefetch, rds-postgres, rds-aurora ]
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 14
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
PLATFORM: ${{ matrix.platform }}
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
timeout-minutes: 360 # 6h
steps:
- uses: actions/checkout@v3
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-release-artifact
path: /tmp/neon/
prefix: latest
- name: Add Postgres binaries to PATH
run: |
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
- name: Set up Connection String
id: set-up-connstr
run: |
case "${PLATFORM}" in
neon-captest-prefetch)
CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }}
;;
rds-aurora)
CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_AURORA_CONNSTR }}
;;
rds-postgres)
CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_POSTGRES_CONNSTR }}
;;
*)
echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'"
exit 1
;;
esac
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
psql ${CONNSTR} -c "SELECT version();"
- name: Set database options
if: matrix.platform == 'neon-captest-prefetch'
run: |
DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()")
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32"
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
- name: Run user examples
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance/test_perf_olap.py
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_user_examples
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
- name: Create Allure report
if: success() || failure()
uses: ./.github/actions/allure-report
with:
action: generate
build_type: ${{ env.BUILD_TYPE }}
- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: "Periodic TPC-H perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

View File

@@ -1,4 +1,4 @@
name: Build and Test
name: Test and Deploy
on:
push:
@@ -7,10 +7,6 @@ on:
- release
pull_request:
defaults:
run:
shell: bash -euxo pipefail {0}
concurrency:
# Allow only one workflow per any non-`main` branch.
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }}
@@ -19,12 +15,10 @@ concurrency:
env:
RUST_BACKTRACE: 1
COPT: '-Werror'
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
jobs:
tag:
runs-on: [ self-hosted, gen3, small ]
runs-on: [ self-hosted, dev, x64 ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
outputs:
build-tag: ${{steps.build-tag.outputs.tag}}
@@ -51,91 +45,8 @@ jobs:
shell: bash
id: build-tag
check-codestyle-python:
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: false
fetch-depth: 1
- name: Cache poetry deps
id: cache_poetry
uses: actions/cache@v3
with:
path: ~/.cache/pypoetry/virtualenvs
key: v1-codestyle-python-deps-${{ hashFiles('poetry.lock') }}
- name: Install Python deps
run: ./scripts/pysync
- name: Run isort to ensure code format
run: poetry run isort --diff --check .
- name: Run black to ensure code format
run: poetry run black --diff --check .
- name: Run flake8 to ensure code format
run: poetry run flake8 .
- name: Run mypy to check types
run: poetry run mypy .
check-codestyle-rust:
runs-on: [ self-hosted, gen3, large ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 1
# Disabled for now
# - name: Restore cargo deps cache
# id: cache_cargo
# uses: actions/cache@v3
# with:
# path: |
# !~/.cargo/registry/src
# ~/.cargo/git/
# target/
# key: v1-${{ runner.os }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
# Some of our rust modules use FFI and need those to be checked
- name: Get postgres headers
run: make postgres-headers -j$(nproc)
- name: Run cargo clippy
run: ./run_clippy.sh
# Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
- name: Check formatting
if: ${{ !cancelled() }}
run: cargo fmt --all -- --check
# https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
- name: Check rust dependencies
if: ${{ !cancelled() }}
run: |
cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date
cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack
# https://github.com/EmbarkStudios/cargo-deny
- name: Check rust licenses/bans/advisories/sources
if: ${{ !cancelled() }}
run: cargo deny check
build-neon:
runs-on: [ self-hosted, gen3, large ]
runs-on: [ self-hosted, dev, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
@@ -143,6 +54,7 @@ jobs:
fail-fast: false
matrix:
build_type: [ debug, release ]
env:
BUILD_TYPE: ${{ matrix.build_type }}
GIT_VERSION: ${{ github.sha }}
@@ -167,10 +79,12 @@ jobs:
- name: Set pg 14 revision for caching
id: pg_v14_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
shell: bash -euxo pipefail {0}
- name: Set pg 15 revision for caching
id: pg_v15_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
shell: bash -euxo pipefail {0}
# Set some environment variables used by all the steps.
#
@@ -184,37 +98,37 @@ jobs:
# corresponding Cargo.toml files for their descriptions.
- name: Set env variables
run: |
CARGO_FEATURES="--features testing"
if [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
CARGO_FLAGS="--locked $CARGO_FEATURES"
CARGO_FEATURES="--features testing"
CARGO_FLAGS="--locked --timings $CARGO_FEATURES"
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=""
CARGO_FLAGS="--locked --release $CARGO_FEATURES"
CARGO_FEATURES="--features testing,profiling"
CARGO_FLAGS="--locked --timings --release $CARGO_FEATURES"
fi
echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV
echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV
echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV
shell: bash -euxo pipefail {0}
# Disabled for now
# Don't include the ~/.cargo/registry/src directory. It contains just
# uncompressed versions of the crates in ~/.cargo/registry/cache
# directory, and it's faster to let 'cargo' to rebuild it from the
# compressed crates.
# - name: Cache cargo deps
# id: cache_cargo
# uses: actions/cache@v3
# with:
# path: |
# ~/.cargo/registry/
# !~/.cargo/registry/src
# ~/.cargo/git/
# target/
# # Fall back to older versions of the key, if no cache for current Cargo.lock was found
# key: |
# v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
# v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-
- name: Cache cargo deps
id: cache_cargo
uses: actions/cache@v3
with:
path: |
~/.cargo/registry/
!~/.cargo/registry/src
~/.cargo/git/
target/
# Fall back to older versions of the key, if no cache for current Cargo.lock was found
key: |
v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-
- name: Cache postgres v14 build
id: cache_pg_14
@@ -233,21 +147,26 @@ jobs:
- name: Build postgres v14
if: steps.cache_pg_14.outputs.cache-hit != 'true'
run: mold -run make postgres-v14 -j$(nproc)
shell: bash -euxo pipefail {0}
- name: Build postgres v15
if: steps.cache_pg_15.outputs.cache-hit != 'true'
run: mold -run make postgres-v15 -j$(nproc)
shell: bash -euxo pipefail {0}
- name: Build neon extensions
run: mold -run make neon-pg-ext -j$(nproc)
shell: bash -euxo pipefail {0}
- name: Run cargo build
run: |
${cov_prefix} mold -run cargo build $CARGO_FLAGS --bins --tests
shell: bash -euxo pipefail {0}
- name: Run cargo test
run: |
${cov_prefix} cargo test $CARGO_FLAGS
shell: bash -euxo pipefail {0}
- name: Install rust binaries
run: |
@@ -288,9 +207,11 @@ jobs:
echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list
done
fi
shell: bash -euxo pipefail {0}
- name: Install postgres binaries
run: cp -a pg_install /tmp/neon/pg_install
shell: bash -euxo pipefail {0}
- name: Upload Neon artifact
uses: ./.github/actions/upload
@@ -298,13 +219,24 @@ jobs:
name: neon-${{ runner.os }}-${{ matrix.build_type }}-artifact
path: /tmp/neon
- name: Prepare cargo build timing stats for storing
run: |
mkdir -p "/tmp/neon/cargo-timings/$BUILD_TYPE/"
cp -r ./target/cargo-timings/* "/tmp/neon/cargo-timings/$BUILD_TYPE/"
shell: bash -euxo pipefail {0}
- name: Upload cargo build stats
uses: ./.github/actions/upload
with:
name: neon-${{ runner.os }}-${{ matrix.build_type }}-build-stats
path: /tmp/neon/cargo-timings/
# XXX: keep this after the binaries.list is formed, so the coverage can properly work later
- name: Merge and upload coverage data
if: matrix.build_type == 'debug'
uses: ./.github/actions/save-coverage-data
regress-tests:
runs-on: [ self-hosted, gen3, large ]
runs-on: [ self-hosted, dev, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
@@ -318,7 +250,7 @@ jobs:
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 1
fetch-depth: 2
- name: Pytest regression tests
uses: ./.github/actions/run-python-test-set
@@ -337,7 +269,7 @@ jobs:
uses: ./.github/actions/save-coverage-data
benchmarks:
runs-on: [ self-hosted, gen3, small ]
runs-on: [ self-hosted, dev, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
@@ -352,7 +284,7 @@ jobs:
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 1
fetch-depth: 2
- name: Pytest benchmarks
uses: ./.github/actions/run-python-test-set
@@ -368,12 +300,12 @@ jobs:
# while coverage is currently collected for the debug ones
merge-allure-report:
runs-on: [ self-hosted, gen3, small ]
runs-on: [ self-hosted, dev, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
needs: [ regress-tests, benchmarks ]
if: ${{ !cancelled() }}
if: success() || failure()
strategy:
fail-fast: false
matrix:
@@ -398,6 +330,7 @@ jobs:
SHA: ${{ github.event.pull_request.head.sha || github.sha }}
REPORT_URL: ${{ steps.create-allure-report.outputs.report-url }}
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
shell: bash -euxo pipefail {0}
run: |
curl --fail --output suites.json ${REPORT_URL%/index.html}/data/suites.json
./scripts/pysync
@@ -405,7 +338,7 @@ jobs:
DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json
coverage-report:
runs-on: [ self-hosted, gen3, small ]
runs-on: [ self-hosted, dev, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
@@ -421,17 +354,16 @@ jobs:
submodules: true
fetch-depth: 1
# Disabled for now
# - name: Restore cargo deps cache
# id: cache_cargo
# uses: actions/cache@v3
# with:
# path: |
# ~/.cargo/registry/
# !~/.cargo/registry/src
# ~/.cargo/git/
# target/
# key: v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
- name: Restore cargo deps cache
id: cache_cargo
uses: actions/cache@v3
with:
path: |
~/.cargo/registry/
!~/.cargo/registry/src
~/.cargo/git/
target/
key: v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
- name: Get Neon artifact
uses: ./.github/actions/download
@@ -447,6 +379,7 @@ jobs:
- name: Merge coverage data
run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge
shell: bash -euxo pipefail {0}
- name: Build and upload coverage report
run: |
@@ -479,9 +412,10 @@ jobs:
\"description\": \"Coverage report is ready\",
\"target_url\": \"$REPORT_URL\"
}"
shell: bash -euxo pipefail {0}
trigger-e2e-tests:
runs-on: [ self-hosted, gen3, small ]
runs-on: [ self-hosted, dev, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
options: --init
@@ -526,13 +460,9 @@ jobs:
}"
neon-image:
runs-on: [ self-hosted, gen3, large ]
runs-on: [ self-hosted, dev, x64 ]
needs: [ tag ]
# https://github.com/GoogleContainerTools/kaniko/issues/2005
container: gcr.io/kaniko-project/executor:v1.7.0-debug
defaults:
run:
shell: sh -eu {0}
container: gcr.io/kaniko-project/executor:v1.9.0-debug
steps:
- name: Checkout
@@ -545,19 +475,12 @@ jobs:
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build neon
run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
- name: Cleanup ECR folder
run: rm -rf ~/.ecr
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
compute-tools-image:
runs-on: [ self-hosted, gen3, large ]
runs-on: [ self-hosted, dev, x64 ]
needs: [ tag ]
container: gcr.io/kaniko-project/executor:v1.7.0-debug
defaults:
run:
shell: sh -eu {0}
container: gcr.io/kaniko-project/executor:v1.9.0-debug
steps:
- name: Checkout
@@ -567,23 +490,12 @@ jobs:
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build compute tools
run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
- name: Cleanup ECR folder
run: rm -rf ~/.ecr
compute-node-image:
runs-on: [ self-hosted, gen3, large ]
container: gcr.io/kaniko-project/executor:v1.7.0-debug
compute-node-image-v14:
runs-on: [ self-hosted, dev, x64 ]
container: gcr.io/kaniko-project/executor:v1.9.0-debug
needs: [ tag ]
strategy:
fail-fast: false
matrix:
version: [ v14, v15 ]
defaults:
run:
shell: sh -eu {0}
steps:
- name: Checkout
uses: actions/checkout@v1 # v3 won't work with kaniko
@@ -594,59 +506,29 @@ jobs:
- name: Configure ECR login
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build compute node with extensions
run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --build-arg PG_VERSION=${{ matrix.version }} --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
- name: Cleanup ECR folder
run: rm -rf ~/.ecr
vm-compute-node-image:
runs-on: [ self-hosted, gen3, large ]
needs: [ tag, compute-node-image ]
strategy:
fail-fast: false
matrix:
version: [ v14, v15 ]
defaults:
run:
shell: sh -eu {0}
env:
VM_INFORMANT_VERSION: 0.1.1
- name: Kaniko build compute node with extensions v14
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}}
compute-node-image-v15:
runs-on: [ self-hosted, dev, x64 ]
container: gcr.io/kaniko-project/executor:v1.9.0-debug
needs: [ tag ]
steps:
- name: Downloading latest vm-builder
run: |
curl -L https://github.com/neondatabase/neonvm/releases/latest/download/vm-builder -o vm-builder
chmod +x vm-builder
- name: Checkout
uses: actions/checkout@v1 # v3 won't work with kaniko
with:
submodules: true
fetch-depth: 0
- name: Pulling compute-node image
run: |
docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
- name: Configure ECR login
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Downloading VM informant version ${{ env.VM_INFORMANT_VERSION }}
run: |
curl -fL https://github.com/neondatabase/autoscaling/releases/download/${{ env.VM_INFORMANT_VERSION }}/vm-informant -o vm-informant
chmod +x vm-informant
- name: Adding VM informant to compute-node image
run: |
ID=$(docker create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}})
docker cp vm-informant $ID:/bin/vm-informant
docker commit $ID temp-vm-compute-node
docker rm -f $ID
- name: Build vm image
run: |
# note: as of 2023-01-12, vm-builder requires a trailing ":latest" for local images
./vm-builder -src=temp-vm-compute-node:latest -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
- name: Pushing vm-compute-node image
run: |
docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
- name: Kaniko build compute node with extensions v15
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}}
test-images:
needs: [ tag, neon-image, compute-node-image, compute-tools-image ]
runs-on: [ self-hosted, gen3, small ]
needs: [ tag, neon-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ]
runs-on: [ self-hosted, dev, x64 ]
steps:
- name: Checkout
@@ -688,39 +570,20 @@ jobs:
docker compose -f ./docker-compose/docker-compose.yml down
promote-images:
runs-on: [ self-hosted, gen3, small ]
needs: [ tag, test-images, vm-compute-node-image ]
container: golang:1.19-bullseye
runs-on: [ self-hosted, dev, x64 ]
needs: [ tag, test-images ]
if: github.event_name != 'workflow_dispatch'
container: amazon/aws-cli
strategy:
fail-fast: false
matrix:
name: [ neon, compute-node-v14, compute-node-v15, compute-tools ]
steps:
- name: Install Crane & ECR helper
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
- name: Promote image to latest
run: |
go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0
go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
- name: Configure ECR login
run: |
mkdir /github/home/.docker/
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
- name: Add latest tag to images
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
run: |
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
- name: Cleanup ECR folder
run: rm -rf ~/.ecr
export MANIFEST=$(aws ecr batch-get-image --repository-name ${{ matrix.name }} --image-ids imageTag=${{needs.tag.outputs.build-tag}} --query 'images[].imageManifest' --output text)
aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST"
push-docker-hub:
runs-on: [ self-hosted, dev, x64 ]
@@ -747,15 +610,9 @@ jobs:
- name: Pull compute node v14 image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} compute-node-v14
- name: Pull vm compute node v14 image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14
- name: Pull compute node v15 image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} compute-node-v15
- name: Pull vm compute node v15 image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15
- name: Pull rust image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust
@@ -767,9 +624,7 @@ jobs:
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
- name: Configure Docker Hub login
run: |
@@ -786,15 +641,9 @@ jobs:
- name: Push compute node v14 image to Docker Hub
run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}}
- name: Push vm compute node v14 image to Docker Hub
run: crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
- name: Push compute node v15 image to Docker Hub
run: crane push compute-node-v15 neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}}
- name: Push vm compute node v15 image to Docker Hub
run: crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}
- name: Push rust image to Docker Hub
run: crane push rust neondatabase/rust:pinned
@@ -806,21 +655,132 @@ jobs:
crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
- name: Cleanup ECR folder
run: rm -rf ~/.ecr
calculate-deploy-targets:
runs-on: [ self-hosted, Linux, k8s-runner ]
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
outputs:
matrix-include: ${{ steps.set-matrix.outputs.include }}
steps:
- id: set-matrix
run: |
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA", "console_api_key_secret": "NEON_STAGING_API_KEY"}'
NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA", "console_api_key_secret": "NEON_CAPTEST_API_KEY"}'
echo "include=[$STAGING, $NEON_STRESS]" >> $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}'
echo "include=[$PRODUCTION]" >> $GITHUB_OUTPUT
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
fi
deploy:
runs-on: [ self-hosted, Linux, k8s-runner ]
#container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
defaults:
run:
shell: bash
strategy:
matrix:
include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Setup python
uses: actions/setup-python@v4
with:
python-version: '3.10'
- name: Setup ansible
run: |
export PATH="/root/.local/bin:$PATH"
pip install --progress-bar off --user ansible boto3 toml
- name: Redeploy
run: |
export DOCKER_TAG=${{needs.tag.outputs.build-tag}}
cd "$(pwd)/.github/ansible"
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
./get_binaries.sh
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
RELEASE=true ./get_binaries.sh
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
fi
eval $(ssh-agent)
echo "${{ secrets.TELEPORT_SSH_KEY }}" | tr -d '\n'| base64 --decode >ssh-key
echo "${{ secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
chmod 0600 ssh-key
ssh-add ssh-key
rm -f ssh-key ssh-key-cert.pub
ansible-galaxy collection install sivel.toiletwater
ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets[matrix.console_api_key_secret] }}
rm -f neon_install.tar.gz .neon_current_version
deploy-new:
runs-on: [ self-hosted, dev, x64 ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
if: |
(github.ref_name == 'main') &&
github.event_name != 'workflow_dispatch'
defaults:
run:
shell: bash
strategy:
matrix:
target_region: [ us-east-2 ]
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Redeploy
run: |
export DOCKER_TAG=${{needs.tag.outputs.build-tag}}
cd "$(pwd)/.github/ansible"
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
./get_binaries.sh
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
RELEASE=true ./get_binaries.sh
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
fi
ansible-galaxy collection install sivel.toiletwater
ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
rm -f neon_install.tar.gz .neon_current_version
deploy-pr-test-new:
runs-on: [ self-hosted, gen3, small ]
runs-on: [ self-hosted, dev, x64 ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
needs: [ push-docker-hub, tag, regress-tests ]
if: |
contains(github.event.pull_request.labels.*.name, 'deploy-test-storage') &&
contains(github.event.pull_request.labels.*.name, 'deploy-test-storage') &&
github.event_name != 'workflow_dispatch'
defaults:
run:
@@ -843,46 +803,181 @@ jobs:
./get_binaries.sh
ansible-galaxy collection install sivel.toiletwater
ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
rm -f neon_install.tar.gz .neon_current_version
- name: Cleanup ansible folder
run: rm -rf ~/.ansible
deploy:
runs-on: [ self-hosted, gen3, small ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
deploy-prod-new:
runs-on: prod
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
needs: [ push-docker-hub, tag, regress-tests ]
if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
if: |
(github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
defaults:
run:
shell: bash
strategy:
matrix:
target_region: [ us-east-2, eu-central-1, ap-southeast-1 ]
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: false
submodules: true
fetch-depth: 0
- name: Trigger deploy workflow
env:
GH_TOKEN: ${{ github.token }}
- name: Redeploy
run: |
export DOCKER_TAG=${{needs.tag.outputs.build-tag}}
cd "$(pwd)/.github/ansible"
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
gh workflow run deploy-dev.yml --ref main -f branch=${{ github.sha }} -f dockerTag=${{needs.tag.outputs.build-tag}}
./get_binaries.sh
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
gh workflow run deploy-prod.yml --ref release -f branch=${{ github.sha }} -f dockerTag=${{needs.tag.outputs.build-tag}} -f disclamerAcknowledged=true
RELEASE=true ./get_binaries.sh
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
fi
ansible-galaxy collection install sivel.toiletwater
ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_PRODUCTION_API_KEY}}
rm -f neon_install.tar.gz .neon_current_version
deploy-proxy:
runs-on: [ self-hosted, dev, x64 ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
defaults:
run:
shell: bash
strategy:
matrix:
include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
env:
KUBECONFIG: .kubeconfig
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Add curl
run: apt update && apt install curl -y
- name: Store kubeconfig file
run: |
echo "${{ secrets[matrix.kubeconfig_secret] }}" | base64 --decode > ${KUBECONFIG}
chmod 0600 ${KUBECONFIG}
- name: Setup helm v3
run: |
curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
helm repo add neondatabase https://neondatabase.github.io/helm-charts
- name: Re-deploy proxy
run: |
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
deploy-proxy-new:
runs-on: [ self-hosted, dev, x64 ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
needs: [ push-docker-hub, tag, regress-tests ]
if: |
(github.ref_name == 'main') &&
github.event_name != 'workflow_dispatch'
defaults:
run:
shell: bash
strategy:
matrix:
include:
- target_region: us-east-2
target_cluster: dev-us-east-2-beta
deploy_link_proxy: true
- target_region: eu-west-1
target_cluster: dev-eu-west-1-zeta
deploy_link_proxy: false
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Configure environment
run: |
helm repo add neondatabase https://neondatabase.github.io/helm-charts
aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}
- name: Re-deploy scram proxy
run: |
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
- name: Re-deploy link proxy
if: matrix.deploy_link_proxy
run: |
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
deploy-proxy-prod-new:
runs-on: prod
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
needs: [ push-docker-hub, tag, regress-tests ]
if: |
(github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
defaults:
run:
shell: bash
strategy:
matrix:
include:
- target_region: us-east-2
target_cluster: prod-us-east-2-delta
- target_region: eu-central-1
target_cluster: prod-eu-central-1-gamma
- target_region: ap-southeast-1
target_cluster: prod-ap-southeast-1-epsilon
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Configure environment
run: |
helm repo add neondatabase https://neondatabase.github.io/helm-charts
aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}
- name: Re-deploy proxy
run: |
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
promote-compatibility-data:
runs-on: [ self-hosted, gen3, small ]
runs-on: [ self-hosted, dev, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
needs: [ push-docker-hub, tag, regress-tests ]
needs: [ deploy, deploy-proxy ]
if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch'
steps:
- name: Promote compatibility snapshot for the release
shell: bash -euxo pipefail {0}
env:
BUCKET: neon-github-public-dev
PREFIX: artifacts/latest

166
.github/workflows/codestyle.yml vendored Normal file
View File

@@ -0,0 +1,166 @@
name: Check code style and build
on:
push:
branches:
- main
pull_request:
defaults:
run:
shell: bash -euxo pipefail {0}
concurrency:
# Allow only one workflow per any non-`main` branch.
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }}
cancel-in-progress: true
env:
RUST_BACKTRACE: 1
COPT: '-Werror'
jobs:
check-codestyle-rust:
strategy:
fail-fast: false
matrix:
# XXX: both OSes have rustup
# * https://github.com/actions/runner-images/blob/main/images/macos/macos-12-Readme.md#rust-tools
# * https://github.com/actions/runner-images/blob/main/images/linux/Ubuntu2204-Readme.md#rust-tools
# this is all we need to install our toolchain later via rust-toolchain.toml
# so don't install any toolchain explicitly.
os: [ubuntu-latest, macos-latest]
timeout-minutes: 90
name: check codestyle rust and postgres
runs-on: ${{ matrix.os }}
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 2
- name: Check formatting
run: cargo fmt --all -- --check
- name: Install Ubuntu postgres dependencies
if: matrix.os == 'ubuntu-latest'
run: |
sudo apt update
sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev libssl-dev protobuf-compiler
- name: Install macOS postgres dependencies
if: matrix.os == 'macos-latest'
run: brew install flex bison openssl protobuf
- name: Set pg 14 revision for caching
id: pg_v14_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
shell: bash -euxo pipefail {0}
- name: Set pg 15 revision for caching
id: pg_v15_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
shell: bash -euxo pipefail {0}
- name: Cache postgres v14 build
id: cache_pg_14
uses: actions/cache@v3
with:
path: pg_install/v14
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v15 build
id: cache_pg_15
uses: actions/cache@v3
with:
path: pg_install/v15
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Set extra env for macOS
if: matrix.os == 'macos-latest'
run: |
echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
- name: Build postgres v14
if: steps.cache_pg_14.outputs.cache-hit != 'true'
run: make postgres-v14
shell: bash -euxo pipefail {0}
- name: Build postgres v15
if: steps.cache_pg_15.outputs.cache-hit != 'true'
run: make postgres-v15
shell: bash -euxo pipefail {0}
- name: Build neon extensions
run: make neon-pg-ext
- name: Cache cargo deps
id: cache_cargo
uses: actions/cache@v3
with:
path: |
~/.cargo/registry
!~/.cargo/registry/src
~/.cargo/git
target
key: v6-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust
- name: Run cargo clippy
run: ./run_clippy.sh
- name: Ensure all project builds
run: cargo build --locked --all --all-targets
check-rust-dependencies:
runs-on: [ self-hosted, dev, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: false
fetch-depth: 1
# https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
- name: Check every project module is covered by Hakari
run: |
cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date
cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack
shell: bash -euxo pipefail {0}
check-codestyle-python:
runs-on: [ self-hosted, Linux, k8s-runner ]
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: false
fetch-depth: 1
- name: Cache poetry deps
id: cache_poetry
uses: actions/cache@v3
with:
path: ~/.cache/pypoetry/virtualenvs
key: v1-codestyle-python-deps-${{ hashFiles('poetry.lock') }}
- name: Install Python deps
run: ./scripts/pysync
- name: Run isort to ensure code format
run: poetry run isort --diff --check .
- name: Run black to ensure code format
run: poetry run black --diff --check .
- name: Run flake8 to ensure code format
run: poetry run flake8 .
- name: Run mypy to check types
run: poetry run mypy .

View File

@@ -1,179 +0,0 @@
name: Neon Deploy dev
on:
workflow_dispatch:
inputs:
dockerTag:
description: 'Docker tag to deploy'
required: true
type: string
branch:
description: 'Branch or commit used for deploy scripts and configs'
required: true
type: string
default: 'main'
deployStorage:
description: 'Deploy storage'
required: true
type: boolean
default: true
deployProxy:
description: 'Deploy proxy'
required: true
type: boolean
default: true
deployStorageBroker:
description: 'Deploy storage-broker'
required: true
type: boolean
default: true
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
concurrency:
group: deploy-dev
cancel-in-progress: false
jobs:
deploy-storage-new:
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
options: --user root --privileged
if: inputs.deployStorage
defaults:
run:
shell: bash
strategy:
matrix:
target_region: [ eu-west-1, us-east-2 ]
environment:
name: dev-${{ matrix.target_region }}
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
ref: ${{ inputs.branch }}
- name: Redeploy
run: |
export DOCKER_TAG=${{ inputs.dockerTag }}
cd "$(pwd)/.github/ansible"
./get_binaries.sh
ansible-galaxy collection install sivel.toiletwater
ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
rm -f neon_install.tar.gz .neon_current_version
- name: Cleanup ansible folder
run: rm -rf ~/.ansible
deploy-proxy-new:
runs-on: [ self-hosted, gen3, small ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
if: inputs.deployProxy
defaults:
run:
shell: bash
strategy:
matrix:
include:
- target_region: us-east-2
target_cluster: dev-us-east-2-beta
deploy_link_proxy: true
deploy_legacy_scram_proxy: true
- target_region: eu-west-1
target_cluster: dev-eu-west-1-zeta
deploy_link_proxy: false
deploy_legacy_scram_proxy: false
environment:
name: dev-${{ matrix.target_region }}
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
ref: ${{ inputs.branch }}
- name: Configure AWS Credentials
uses: aws-actions/configure-aws-credentials@v1-node16
with:
role-to-assume: arn:aws:iam::369495373322:role/github-runner
aws-region: eu-central-1
role-skip-session-tagging: true
role-duration-seconds: 1800
- name: Configure environment
run: |
helm repo add neondatabase https://neondatabase.github.io/helm-charts
aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}
- name: Re-deploy scram proxy
run: |
DOCKER_TAG=${{ inputs.dockerTag }}
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
- name: Re-deploy link proxy
if: matrix.deploy_link_proxy
run: |
DOCKER_TAG=${{ inputs.dockerTag }}
helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
- name: Re-deploy legacy scram proxy
if: matrix.deploy_legacy_scram_proxy
run: |
DOCKER_TAG=${{ inputs.dockerTag }}
helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
- name: Cleanup helm folder
run: rm -rf ~/.cache
deploy-storage-broker-new:
runs-on: [ self-hosted, gen3, small ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
if: inputs.deployStorageBroker
defaults:
run:
shell: bash
strategy:
matrix:
include:
- target_region: us-east-2
target_cluster: dev-us-east-2-beta
- target_region: eu-west-1
target_cluster: dev-eu-west-1-zeta
environment:
name: dev-${{ matrix.target_region }}
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
ref: ${{ inputs.branch }}
- name: Configure AWS Credentials
uses: aws-actions/configure-aws-credentials@v1-node16
with:
role-to-assume: arn:aws:iam::369495373322:role/github-runner
aws-region: eu-central-1
role-skip-session-tagging: true
role-duration-seconds: 1800
- name: Configure environment
run: |
helm repo add neondatabase https://neondatabase.github.io/helm-charts
aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}
- name: Deploy storage-broker
run:
helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
- name: Cleanup helm folder
run: rm -rf ~/.cache

View File

@@ -1,240 +0,0 @@
name: Neon Deploy prod
on:
workflow_dispatch:
inputs:
dockerTag:
description: 'Docker tag to deploy'
required: true
type: string
branch:
description: 'Branch or commit used for deploy scripts and configs'
required: true
type: string
default: 'release'
deployStorage:
description: 'Deploy storage'
required: true
type: boolean
default: true
deployProxy:
description: 'Deploy proxy'
required: true
type: boolean
default: true
deployStorageBroker:
description: 'Deploy storage-broker'
required: true
type: boolean
default: true
disclamerAcknowledged:
description: 'I confirm that there is an emergency and I can not use regular release workflow'
required: true
type: boolean
default: false
concurrency:
group: deploy-prod
cancel-in-progress: false
jobs:
deploy-prod-new:
runs-on: prod
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
if: inputs.deployStorage && inputs.disclamerAcknowledged
defaults:
run:
shell: bash
strategy:
matrix:
target_region: [ us-east-2, us-west-2, eu-central-1, ap-southeast-1 ]
environment:
name: prod-${{ matrix.target_region }}
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
ref: ${{ inputs.branch }}
- name: Redeploy
run: |
export DOCKER_TAG=${{ inputs.dockerTag }}
cd "$(pwd)/.github/ansible"
./get_binaries.sh
ansible-galaxy collection install sivel.toiletwater
ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
rm -f neon_install.tar.gz .neon_current_version
deploy-proxy-prod-new:
runs-on: prod
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
if: inputs.deployProxy && inputs.disclamerAcknowledged
defaults:
run:
shell: bash
strategy:
matrix:
include:
- target_region: us-east-2
target_cluster: prod-us-east-2-delta
deploy_link_proxy: true
deploy_legacy_scram_proxy: false
- target_region: us-west-2
target_cluster: prod-us-west-2-eta
deploy_link_proxy: false
deploy_legacy_scram_proxy: true
- target_region: eu-central-1
target_cluster: prod-eu-central-1-gamma
deploy_link_proxy: false
deploy_legacy_scram_proxy: false
- target_region: ap-southeast-1
target_cluster: prod-ap-southeast-1-epsilon
deploy_link_proxy: false
deploy_legacy_scram_proxy: false
environment:
name: prod-${{ matrix.target_region }}
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
ref: ${{ inputs.branch }}
- name: Configure environment
run: |
helm repo add neondatabase https://neondatabase.github.io/helm-charts
aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}
- name: Re-deploy scram proxy
run: |
DOCKER_TAG=${{ inputs.dockerTag }}
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
- name: Re-deploy link proxy
if: matrix.deploy_link_proxy
run: |
DOCKER_TAG=${{ inputs.dockerTag }}
helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
- name: Re-deploy legacy scram proxy
if: matrix.deploy_legacy_scram_proxy
run: |
DOCKER_TAG=${{ inputs.dockerTag }}
helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
deploy-storage-broker-prod-new:
runs-on: prod
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
if: inputs.deployStorageBroker && inputs.disclamerAcknowledged
defaults:
run:
shell: bash
strategy:
matrix:
include:
- target_region: us-east-2
target_cluster: prod-us-east-2-delta
- target_region: us-west-2
target_cluster: prod-us-west-2-eta
- target_region: eu-central-1
target_cluster: prod-eu-central-1-gamma
- target_region: ap-southeast-1
target_cluster: prod-ap-southeast-1-epsilon
environment:
name: prod-${{ matrix.target_region }}
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
ref: ${{ inputs.branch }}
- name: Configure environment
run: |
helm repo add neondatabase https://neondatabase.github.io/helm-charts
aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}
- name: Deploy storage-broker
run:
helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
# Deploy to old account below
deploy:
runs-on: prod
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
if: inputs.deployStorage && inputs.disclamerAcknowledged
defaults:
run:
shell: bash
environment:
name: prod-old
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
ref: ${{ inputs.branch }}
- name: Redeploy
run: |
export DOCKER_TAG=${{ inputs.dockerTag }}
cd "$(pwd)/.github/ansible"
./get_binaries.sh
eval $(ssh-agent)
echo "${{ secrets.TELEPORT_SSH_KEY }}" | tr -d '\n'| base64 --decode >ssh-key
echo "${{ secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
chmod 0600 ssh-key
ssh-add ssh-key
rm -f ssh-key ssh-key-cert.pub
ANSIBLE_CONFIG=./ansible.cfg ansible-galaxy collection install sivel.toiletwater
ANSIBLE_CONFIG=./ansible.cfg ansible-playbook deploy.yaml -i production.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
rm -f neon_install.tar.gz .neon_current_version
# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ansible/collections': Permission denied
- name: Cleanup ansible folder
run: rm -rf ~/.ansible
deploy-storage-broker:
name: deploy storage broker on old staging and old prod
runs-on: [ self-hosted, gen3, small ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
if: inputs.deployStorageBroker && inputs.disclamerAcknowledged
defaults:
run:
shell: bash
environment:
name: prod-old
env:
KUBECONFIG: .kubeconfig
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
ref: ${{ inputs.branch }}
- name: Store kubeconfig file
run: |
echo "${{ secrets.PRODUCTION_KUBECONFIG_DATA }}" | base64 --decode > ${KUBECONFIG}
chmod 0600 ${KUBECONFIG}
- name: Add neon helm chart
run: helm repo add neondatabase https://neondatabase.github.io/helm-charts
- name: Deploy storage-broker
run:
helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/production.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
- name: Cleanup helm folder
run: rm -rf ~/.cache

View File

@@ -1,154 +0,0 @@
name: Check neon with extra platform builds
on:
push:
branches:
- main
pull_request:
defaults:
run:
shell: bash -euxo pipefail {0}
concurrency:
# Allow only one workflow per any non-`main` branch.
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }}
cancel-in-progress: true
env:
RUST_BACKTRACE: 1
COPT: '-Werror'
jobs:
check-macos-build:
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos')
timeout-minutes: 90
runs-on: macos-latest
env:
# Use release build only, to have less debug info around
# Hence keeping target/ (and general cache size) smaller
BUILD_TYPE: release
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 1
- name: Install macOS postgres dependencies
run: brew install flex bison openssl protobuf
- name: Set pg 14 revision for caching
id: pg_v14_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
- name: Set pg 15 revision for caching
id: pg_v15_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
- name: Cache postgres v14 build
id: cache_pg_14
uses: actions/cache@v3
with:
path: pg_install/v14
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v15 build
id: cache_pg_15
uses: actions/cache@v3
with:
path: pg_install/v15
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Set extra env for macOS
run: |
echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
- name: Cache cargo deps
uses: actions/cache@v3
with:
path: |
~/.cargo/registry
!~/.cargo/registry/src
~/.cargo/git
target
key: v1-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust
- name: Build postgres v14
if: steps.cache_pg_14.outputs.cache-hit != 'true'
run: make postgres-v14 -j$(nproc)
- name: Build postgres v15
if: steps.cache_pg_15.outputs.cache-hit != 'true'
run: make postgres-v15 -j$(nproc)
- name: Build neon extensions
run: make neon-pg-ext -j$(nproc)
- name: Run cargo build
run: cargo build --all --release
- name: Check that no warnings are produced
run: ./run_clippy.sh
gather-rust-build-stats:
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats')
runs-on: [ self-hosted, gen3, large ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
env:
BUILD_TYPE: release
# remove the cachepot wrapper and build without crate caches
RUSTC_WRAPPER: ""
# build with incremental compilation produce partial results
# so do not attempt to cache this build, also disable the incremental compilation
CARGO_INCREMENTAL: 0
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 1
# Some of our rust modules use FFI and need those to be checked
- name: Get postgres headers
run: make postgres-headers -j$(nproc)
- name: Produce the build stats
run: cargo build --all --release --timings
- name: Upload the build stats
id: upload-stats
env:
BUCKET: neon-github-public-dev
SHA: ${{ github.event.pull_request.head.sha || github.sha }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
run: |
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/build-stats/${SHA}/${GITHUB_RUN_ID}/cargo-timing.html
aws s3 cp --only-show-errors ./target/cargo-timings/cargo-timing.html "s3://${BUCKET}/build-stats/${SHA}/${GITHUB_RUN_ID}/"
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
- name: Publish build stats report
uses: actions/github-script@v6
env:
REPORT_URL: ${{ steps.upload-stats.outputs.report-url }}
SHA: ${{ github.event.pull_request.head.sha || github.sha }}
with:
script: |
const { REPORT_URL, SHA } = process.env
await github.rest.repos.createCommitStatus({
owner: context.repo.owner,
repo: context.repo.repo,
sha: `${SHA}`,
state: 'success',
target_url: `${REPORT_URL}`,
context: `Build stats (release)`,
})

View File

@@ -23,7 +23,6 @@ jobs:
runs-on: [ ubuntu-latest ]
env:
DEFAULT_PG_VERSION: 14
TEST_OUTPUT: /tmp/test_output
steps:
@@ -52,8 +51,8 @@ jobs:
id: create-neon-project
uses: ./.github/actions/neon-project-create
with:
environment: staging
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
- name: Run pytest
env:
@@ -64,7 +63,7 @@ jobs:
run: |
# Test framework expects we have psql binary;
# but since we don't really need it in this test, let's mock it
mkdir -p "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin" && touch "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin/psql";
mkdir -p "$POSTGRES_DISTRIB_DIR/v14/bin" && touch "$POSTGRES_DISTRIB_DIR/v14/bin/psql";
./scripts/pytest \
--junitxml=$TEST_OUTPUT/junit.xml \
--tb=short \
@@ -76,6 +75,7 @@ jobs:
if: ${{ always() }}
uses: ./.github/actions/neon-project-delete
with:
environment: staging
project_id: ${{ steps.create-neon-project.outputs.project_id }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}

View File

@@ -1,33 +0,0 @@
name: Create Release Branch
on:
schedule:
- cron: '0 10 * * 2'
jobs:
create_release_branch:
runs-on: [ubuntu-latest]
steps:
- name: Check out code
uses: actions/checkout@v3
with:
ref: main
- name: Get current date
id: date
run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT
- name: Create release branch
run: git checkout -b releases/${{ steps.date.outputs.date }}
- name: Push new branch
run: git push origin releases/${{ steps.date.outputs.date }}
- name: Create pull request into release
uses: thomaseizinger/create-pull-request@e3972219c86a56550fb70708d96800d8e24ba862 # 1.3.0
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
head: releases/${{ steps.date.outputs.date }}
base: release
title: Release ${{ steps.date.outputs.date }}

2624
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -1,3 +1,14 @@
# 'named-profiles' feature was stabilized in cargo 1.57. This line makes the
# build work with older cargo versions.
#
# We have this because as of this writing, the latest cargo Debian package
# that's available is 1.56. (Confusingly, the Debian package version number
# is 0.57, whereas 'cargo --version' says 1.56.)
#
# See https://tracker.debian.org/pkg/cargo for the current status of the
# package. When that gets updated, we can remove this.
cargo-features = ["named-profiles"]
[workspace]
members = [
"compute_tools",
@@ -7,155 +18,9 @@ members = [
"safekeeper",
"storage_broker",
"workspace_hack",
"trace",
"libs/*",
]
[workspace.package]
edition = "2021"
license = "Apache-2.0"
## All dependency versions, used in the project
[workspace.dependencies]
anyhow = { version = "1.0", features = ["backtrace"] }
async-stream = "0.3"
async-trait = "0.1"
atty = "0.2.14"
aws-config = { version = "0.51.0", default-features = false, features=["rustls"] }
aws-sdk-s3 = "0.21.0"
aws-smithy-http = "0.51.0"
aws-types = "0.51.0"
base64 = "0.13.0"
bincode = "1.3"
bindgen = "0.61"
bstr = "1.0"
byteorder = "1.4"
bytes = "1.0"
chrono = { version = "0.4", default-features = false, features = ["clock"] }
clap = { version = "4.0", features = ["derive"] }
close_fds = "0.3.2"
comfy-table = "6.1"
const_format = "0.2"
crc32c = "0.6"
crossbeam-utils = "0.8.5"
enum-map = "2.4.2"
enumset = "1.0.12"
fail = "0.5.0"
fs2 = "0.4.3"
futures = "0.3"
futures-core = "0.3"
futures-util = "0.3"
git-version = "0.3"
hashbrown = "0.13"
hashlink = "0.8.1"
hex = "0.4"
hex-literal = "0.3"
hmac = "0.12.1"
hostname = "0.3.1"
humantime = "2.1"
humantime-serde = "1.1.1"
hyper = "0.14"
hyper-tungstenite = "0.9"
itertools = "0.10"
jsonwebtoken = "8"
libc = "0.2"
md5 = "0.7.0"
memoffset = "0.8"
nix = "0.26"
notify = "5.0.0"
num-traits = "0.2.15"
once_cell = "1.13"
opentelemetry = "0.18.0"
opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.10.0"
tracing-opentelemetry = "0.18.0"
parking_lot = "0.12"
pin-project-lite = "0.2"
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
prost = "0.11"
rand = "0.8"
regex = "1.4"
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
routerify = "3"
rpds = "0.12.0"
rustls = "0.20"
rustls-pemfile = "1"
rustls-split = "0.3"
scopeguard = "1.1"
sentry = { version = "0.29", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
serde_with = "2.0"
sha2 = "0.10.2"
signal-hook = "0.3"
socket2 = "0.4.4"
strum = "0.24"
strum_macros = "0.24"
svg_fmt = "0.4.1"
tar = "0.4"
thiserror = "1.0"
tls-listener = { version = "0.6", features = ["rustls", "hyper-h1"] }
tokio = { version = "1.17", features = ["macros"] }
tokio-postgres-rustls = "0.9.0"
tokio-rustls = "0.23"
tokio-stream = "0.1"
tokio-util = { version = "0.7", features = ["io"] }
toml = "0.5"
toml_edit = { version = "0.17", features = ["easy"] }
tonic = {version = "0.8", features = ["tls", "tls-roots"]}
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
url = "2.2"
uuid = { version = "1.2", features = ["v4", "serde"] }
walkdir = "2.3.2"
webpki-roots = "0.22.5"
x509-parser = "0.14"
## TODO replace this with tracing
env_logger = "0.10"
log = "0.4"
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }
## Other git libraries
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
## Local libraries
consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
metrics = { version = "0.1", path = "./libs/metrics/" }
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
pq_proto = { version = "0.1", path = "./libs/pq_proto/" }
remote_storage = { version = "0.1", path = "./libs/remote_storage/" }
safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" }
storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy.
tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
utils = { version = "0.1", path = "./libs/utils/" }
## Common library dependency
workspace_hack = { version = "0.1", path = "./workspace_hack/" }
## Build dependencies
criterion = "0.4"
rcgen = "0.10"
rstest = "0.16"
tempfile = "3.2"
tonic-build = "0.8"
# This is only needed for proxy's tests.
# TODO: we should probably fork `tokio-postgres-rustls` instead.
[patch.crates-io]
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
################# Binary contents sections
[profile.release]
# This is useful for profiling and, to some extent, debug.
# Besides, debug info should not affect the performance.
@@ -216,3 +81,9 @@ inherits = "release"
debug = false # true = 2 = all symbols, 1 = line only
opt-level = "z"
lto = true
# This is only needed for proxy's tests.
# TODO: we should probably fork `tokio-postgres-rustls` instead.
[patch.crates-io]
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }

View File

@@ -79,7 +79,7 @@ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/
RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \
&& /usr/local/bin/pageserver -D /data/.neon/ --init \
-c "id=1234" \
-c "broker_endpoint='http://storage_broker:50051'" \
-c "broker_endpoints=['http://etcd:2379']" \
-c "pg_distrib_dir='/usr/local/'" \
-c "listen_pg_addr='0.0.0.0:6400'" \
-c "listen_http_addr='0.0.0.0:9898'"

View File

@@ -1,5 +1,8 @@
ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
ARG IMAGE=rust
#
# This file is identical to the Dockerfile.compute-node-v15 file
# except for the version of Postgres that is built.
#
ARG TAG=pinned
#########################################################################################
@@ -10,8 +13,7 @@ ARG TAG=pinned
FROM debian:bullseye-slim AS build-deps
RUN apt update && \
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
libicu-dev
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev
#########################################################################################
#
@@ -20,21 +22,14 @@ RUN apt update && \
#
#########################################################################################
FROM build-deps AS pg-build
ARG PG_VERSION
COPY vendor/postgres-${PG_VERSION} postgres
COPY vendor/postgres-v14 postgres
RUN cd postgres && \
./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp --with-icu && \
./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
# Install headers
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \
# Enable some of contrib extensions
echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install
#########################################################################################
#
@@ -45,34 +40,22 @@ RUN cd postgres && \
FROM build-deps AS postgis-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install -y cmake gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \
libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \
libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \
protobuf-c-compiler xsltproc
RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz && \
tar zxvf SFCGAL-v1.3.10.tar.gz && \
cd SFCGAL-v1.3.10 && cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \
DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
make clean && cp -R /sfcgal/* /
apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
tar xvzf postgis-3.3.1.tar.gz && \
cd postgis-3.3.1 && \
./autogen.sh && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
./configure && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
cd extensions/postgis && \
make clean && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_raster.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_sfcgal.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control
#########################################################################################
#
@@ -144,27 +127,6 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control
#########################################################################################
#
# Layer "unit-pg-build"
# compile unit extension
#
#########################################################################################
FROM build-deps AS unit-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz && \
tar xvzf 7.7.tar.gz && \
cd postgresql-unit-7.7 && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
# unit extension's "create extension" script relies on absolute install path to fill some reference tables.
# We move the extension from '/usr/local/pgsql/' to '/usr/local/' after it is build. So we need to adjust the path.
# This one-liner removes pgsql/ part of the path.
# NOTE: Other extensions that rely on MODULEDIR variable after building phase will need the same fix.
find /usr/local/pgsql/share/extension/ -name "unit*.sql" -print0 | xargs -0 sed -i "s|pgsql/||g" && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/unit.control
#########################################################################################
#
# Layer "neon-pg-ext-build"
@@ -173,11 +135,9 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz &
#########################################################################################
FROM build-deps AS neon-pg-ext-build
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=postgis-build /sfcgal/* /
COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=h3-pg-build /h3/usr /
COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY pgxn/ pgxn/
RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -190,7 +150,7 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
# Compile and run the Neon-specific `compute_ctl` binary
#
#########################################################################################
FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools
FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools
USER nonroot
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
COPY --chown=nonroot . .
@@ -210,6 +170,9 @@ RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgto
# Remove headers that we won't need anymore - we've completed installation of all extensions
RUN rm -r /usr/local/pgsql/include
# Remove now-useless PGXS src infrastructure
RUN rm -r /usr/local/pgsql/lib/pgxs/src
# Remove static postgresql libraries - all compilation is finished, so we
# can now remove these files - they must be included in other binaries by now
# if they were to be used by other libraries.
@@ -235,24 +198,17 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
# Install:
# libreadline8 for psql
# libicu67, locales for collations (including ICU)
# libossp-uuid16 for extension ossp-uuid
# libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
# libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS
RUN apt update && \
apt install --no-install-recommends -y \
locales \
libicu67 \
libreadline8 \
libossp-uuid16 \
libgeos-c1v5 \
libgdal28 \
libproj19 \
libprotobuf-c1 \
libsfcgal1 \
gdb && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
libprotobuf-c1 && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
ENV LANG en_US.utf8
USER postgres
ENTRYPOINT ["/usr/local/bin/compute_ctl"]

214
Dockerfile.compute-node-v15 Normal file
View File

@@ -0,0 +1,214 @@
#
# This file is identical to the Dockerfile.compute-node-v14 file
# except for the version of Postgres that is built.
#
ARG TAG=pinned
#########################################################################################
#
# Layer "build-deps"
#
#########################################################################################
FROM debian:bullseye-slim AS build-deps
RUN apt update && \
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev
#########################################################################################
#
# Layer "pg-build"
# Build Postgres from the neon postgres repository.
#
#########################################################################################
FROM build-deps AS pg-build
COPY vendor/postgres-v15 postgres
RUN cd postgres && \
./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
# Install headers
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install
#########################################################################################
#
# Layer "postgis-build"
# Build PostGIS from the upstream PostGIS mirror.
#
#########################################################################################
FROM build-deps AS postgis-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
tar xvzf postgis-3.3.1.tar.gz && \
cd postgis-3.3.1 && \
./autogen.sh && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
./configure && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
cd extensions/postgis && \
make clean && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_raster.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control
#########################################################################################
#
# Layer "plv8-build"
# Build plv8
#
#########################################################################################
FROM build-deps AS plv8-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils
# https://github.com/plv8/plv8/issues/475:
# v8 uses gold for linking and sets `--thread-count=4` which breaks
# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607)
# Install newer gold version manually as debian-testing binutils version updates
# libc version, which in turn breaks other extension built against non-testing libc.
RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \
tar xvzf binutils-2.38.tar.gz && \
cd binutils-2.38 && \
cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \
cd ../bfd && ./configure && make bfdver.h && \
cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \
cp /usr/local/bin/ld.gold /usr/bin/gold
# Sed is used to patch for https://github.com/plv8/plv8/issues/503
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
tar xvzf v3.1.4.tar.gz && \
cd plv8-3.1.4 && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \
make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
rm -rf /plv8-* && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control
#########################################################################################
#
# Layer "h3-pg-build"
# Build h3_pg
#
#########################################################################################
FROM build-deps AS h3-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# packaged cmake is too old
RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \
-q -O /tmp/cmake-install.sh \
&& chmod u+x /tmp/cmake-install.sh \
&& /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
&& rm /tmp/cmake-install.sh
RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \
tar xvzf h3.tgz && \
cd h3-4.0.1 && \
mkdir build && \
cd build && \
cmake .. -DCMAKE_BUILD_TYPE=Release && \
make -j $(getconf _NPROCESSORS_ONLN) && \
DESTDIR=/h3 make install && \
cp -R /h3/usr / && \
rm -rf build
RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3-pg.tgz && \
tar xvzf h3-pg.tgz && \
cd h3-pg-4.0.1 && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control
#########################################################################################
#
# Layer "neon-pg-ext-build"
# compile neon extensions
#
#########################################################################################
FROM build-deps AS neon-pg-ext-build
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=h3-pg-build /h3/usr /
COPY pgxn/ pgxn/
RUN make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/neon \
-s install
#########################################################################################
#
# Compile and run the Neon-specific `compute_ctl` binary
#
#########################################################################################
FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools
USER nonroot
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
COPY --chown=nonroot . .
RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
#########################################################################################
#
# Clean up postgres folder before inclusion
#
#########################################################################################
FROM neon-pg-ext-build AS postgres-cleanup-layer
COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql
# Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise)
RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp
# Remove headers that we won't need anymore - we've completed installation of all extensions
RUN rm -r /usr/local/pgsql/include
# Remove now-useless PGXS src infrastructure
RUN rm -r /usr/local/pgsql/lib/pgxs/src
# Remove static postgresql libraries - all compilation is finished, so we
# can now remove these files - they must be included in other binaries by now
# if they were to be used by other libraries.
RUN rm /usr/local/pgsql/lib/lib*.a
#########################################################################################
#
# Final layer
# Put it all together into the final image
#
#########################################################################################
FROM debian:bullseye-slim
# Add user postgres
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
echo "postgres:test_console_pass" | chpasswd && \
mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
chown -R postgres:postgres /var/db/postgres && \
chmod 0750 /var/db/postgres/compute && \
echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
# Install:
# libreadline8 for psql
# libossp-uuid16 for extension ossp-uuid
# libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS
RUN apt update && \
apt install --no-install-recommends -y \
libreadline8 \
libossp-uuid16 \
libgeos-c1v5 \
libgdal28 \
libproj19 \
libprotobuf-c1 && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
USER postgres
ENTRYPOINT ["/usr/local/bin/compute_ctl"]

203
Makefile
View File

@@ -61,115 +61,146 @@ all: neon postgres neon-pg-ext
#
# The 'postgres_ffi' depends on the Postgres headers.
.PHONY: neon
neon: postgres-headers
neon: postgres-v14-headers postgres-v15-headers
+@echo "Compiling Neon"
$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)
### PostgreSQL parts
# Some rules are duplicated for Postgres v14 and 15. We may want to refactor
# The rules are duplicated for Postgres v14 and 15. We may want to refactor
# to avoid the duplication in the future, but it's tolerable for now.
#
$(POSTGRES_INSTALL_DIR)/build/%/config.status:
+@echo "Configuring Postgres $* build"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/$*
(cd $(POSTGRES_INSTALL_DIR)/build/$* && \
env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \
$(POSTGRES_INSTALL_DIR)/build/v14/config.status:
+@echo "Configuring Postgres v14 build"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/v14
(cd $(POSTGRES_INSTALL_DIR)/build/v14 && \
env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-v14/configure \
CFLAGS='$(PG_CFLAGS)' \
$(PG_CONFIGURE_OPTS) \
--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$* > configure.log)
--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v14 > configure.log)
$(POSTGRES_INSTALL_DIR)/build/v15/config.status:
+@echo "Configuring Postgres v15 build"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/v15
(cd $(POSTGRES_INSTALL_DIR)/build/v15 && \
env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-v15/configure \
CFLAGS='$(PG_CFLAGS)' \
$(PG_CONFIGURE_OPTS) \
--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v15 > configure.log)
# nicer alias to run 'configure'
# Note: I've been unable to use templates for this part of our configuration.
# I'm not sure why it wouldn't work, but this is the only place (apart from
# the "build-all-versions" entry points) where direct mention of PostgreSQL
# versions is used.
.PHONY: postgres-configure-v15
postgres-configure-v15: $(POSTGRES_INSTALL_DIR)/build/v15/config.status
.PHONY: postgres-configure-v14
postgres-configure-v14: $(POSTGRES_INSTALL_DIR)/build/v14/config.status
.PHONY: postgres-v14-configure
postgres-v14-configure: $(POSTGRES_INSTALL_DIR)/build/v14/config.status
.PHONY: postgres-v15-configure
postgres-v15-configure: $(POSTGRES_INSTALL_DIR)/build/v15/config.status
# Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)/<version>/include
.PHONY: postgres-headers-%
postgres-headers-%: postgres-configure-%
+@echo "Installing PostgreSQL $* headers"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/include MAKELEVEL=0 install
.PHONY: postgres-v14-headers
postgres-v14-headers: postgres-v14-configure
+@echo "Installing PostgreSQL v14 headers"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/include MAKELEVEL=0 install
.PHONY: postgres-v15-headers
postgres-v15-headers: postgres-v15-configure
+@echo "Installing PostgreSQL v15 headers"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/include MAKELEVEL=0 install
# Compile and install PostgreSQL
.PHONY: postgres-%
postgres-%: postgres-configure-% \
postgres-headers-% # to prevent `make install` conflicts with neon's `postgres-headers`
+@echo "Compiling PostgreSQL $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 install
+@echo "Compiling libpq $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/interfaces/libpq install
+@echo "Compiling pg_prewarm $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_prewarm install
+@echo "Compiling pg_buffercache $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install
+@echo "Compiling pageinspect $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
.PHONY: postgres-v14
postgres-v14: postgres-v14-configure \
postgres-v14-headers # to prevent `make install` conflicts with neon's `postgres-headers`
+@echo "Compiling PostgreSQL v14"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14 MAKELEVEL=0 install
+@echo "Compiling libpq v14"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/interfaces/libpq install
+@echo "Compiling pg_prewarm v14"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_prewarm install
+@echo "Compiling pg_buffercache v14"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_buffercache install
+@echo "Compiling pageinspect v14"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pageinspect install
.PHONY: postgres-clean-%
postgres-clean-%:
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 clean
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache clean
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect clean
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/interfaces/libpq clean
.PHONY: neon-pg-ext-%
neon-pg-ext-%: postgres-%
+@echo "Compiling neon $*"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-$*
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install
+@echo "Compiling neon_walredo $*"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$*
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install
+@echo "Compiling neon_test_utils $*"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$*
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install
.PHONY: neon-pg-ext-clean-%
neon-pg-ext-clean-%:
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_walredo-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile clean
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_test_utils-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean
.PHONY: neon-pg-ext
neon-pg-ext: \
neon-pg-ext-v14 \
neon-pg-ext-v15
.PHONY: neon-pg-ext-clean
neon-pg-ext-clean: \
neon-pg-ext-clean-v14 \
neon-pg-ext-clean-v15
.PHONY: postgres-v15
postgres-v15: postgres-v15-configure \
postgres-v15-headers # to prevent `make install` conflicts with neon's `postgres-headers`
+@echo "Compiling PostgreSQL v15"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15 MAKELEVEL=0 install
+@echo "Compiling libpq v15"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/interfaces/libpq install
+@echo "Compiling pg_prewarm v15"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_prewarm install
+@echo "Compiling pg_buffercache v15"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_buffercache install
+@echo "Compiling pageinspect v15"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pageinspect install
# shorthand to build all Postgres versions
.PHONY: postgres
postgres: \
postgres-v14 \
postgres-v15
postgres: postgres-v14 postgres-v15
.PHONY: postgres-headers
postgres-headers: \
postgres-headers-v14 \
postgres-headers-v15
.PHONY: postgres-v14-clean
postgres-v14-clean:
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14 MAKELEVEL=0 clean
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_buffercache clean
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pageinspect clean
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/interfaces/libpq clean
.PHONY: postgres-clean
postgres-clean: \
postgres-clean-v14 \
postgres-clean-v15
.PHONY: postgres-v15-clean
postgres-v15-clean:
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15 MAKELEVEL=0 clean
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_buffercache clean
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pageinspect clean
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/interfaces/libpq clean
neon-pg-ext-v14: postgres-v14
+@echo "Compiling neon v14"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-v14
(cd $(POSTGRES_INSTALL_DIR)/build/neon-v14 && \
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install)
+@echo "Compiling neon_walredo v14"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14
(cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14 && \
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install)
+@echo "Compiling neon_test_utils" v14
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14
(cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 && \
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install)
neon-pg-ext-v15: postgres-v15
+@echo "Compiling neon v15"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-v15
(cd $(POSTGRES_INSTALL_DIR)/build/neon-v15 && \
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install)
+@echo "Compiling neon_walredo v15"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15
(cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15 && \
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install)
+@echo "Compiling neon_test_utils" v15
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15
(cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 && \
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install)
.PHONY: neon-pg-ext-clean
$(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon clean
$(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils clean
neon-pg-ext: neon-pg-ext-v14 neon-pg-ext-v15
postgres-headers: postgres-v14-headers postgres-v15-headers
postgres-clean: postgres-v14-clean postgres-v15-clean
# This doesn't remove the effects of 'configure'.
.PHONY: clean
clean: postgres-clean neon-pg-ext-clean
clean:
cd $(POSTGRES_INSTALL_DIR)/build/v14 && $(MAKE) clean
cd $(POSTGRES_INSTALL_DIR)/build/v15 && $(MAKE) clean
$(CARGO_CMD_PREFIX) cargo clean
cd pgxn/neon && $(MAKE) clean
cd pgxn/neon_test_utils && $(MAKE) clean
# This removes everything
.PHONY: distclean

View File

@@ -2,20 +2,29 @@
Neon is a serverless open-source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes.
The project used to be called "Zenith". Many of the commands and code comments
still refer to "zenith", but we are in the process of renaming things.
## Quick start
Try the [Neon Free Tier](https://neon.tech/docs/introduction/technical-preview-free-tier/) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions.
[Join the waitlist](https://neon.tech/) for our free tier to receive your serverless postgres instance. Then connect to it with your preferred postgres client (psql, dbeaver, etc) or use the online SQL editor.
Alternatively, compile and run the project [locally](#running-local-installation).
## Architecture overview
A Neon installation consists of compute nodes and the Neon storage engine. Compute nodes are stateless PostgreSQL nodes backed by the Neon storage engine.
A Neon installation consists of compute nodes and a Neon storage engine.
Compute nodes are stateless PostgreSQL nodes backed by the Neon storage engine.
The Neon storage engine consists of two major components:
- Pageserver. Scalable storage backend for the compute nodes.
- Safekeepers. The safekeepers form a redundant WAL service that received WAL from the compute node, and stores it durably until it has been processed by the pageserver and uploaded to cloud storage.
- WAL service. The service receives WAL from the compute node and ensures that it is stored durably.
See developer documentation in [/docs/SUMMARY.md](/docs/SUMMARY.md) for more information.
Pageserver consists of:
- Repository - Neon storage implementation.
- WAL receiver - service that receives WAL from WAL service and stores it in the repository.
- Page service - service that communicates with compute nodes and responds with pages from the repository.
- WAL redo - service that builds pages from base images and WAL records on Page service request
## Running local installation
@@ -26,13 +35,12 @@ See developer documentation in [/docs/SUMMARY.md](/docs/SUMMARY.md) for more inf
* On Ubuntu or Debian, this set of packages should be sufficient to build the code:
```bash
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler
libssl-dev clang pkg-config libpq-dev etcd cmake postgresql-client protobuf-compiler
```
* On Fedora, these packages are needed:
```bash
dnf install flex bison readline-devel zlib-devel openssl-devel \
libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
protobuf-devel
libseccomp-devel perl clang cmake etcd postgresql postgresql-contrib protobuf-compiler
```
2. [Install Rust](https://www.rust-lang.org/tools/install)
@@ -45,7 +53,7 @@ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
1. Install XCode and dependencies
```
xcode-select --install
brew install protobuf openssl flex bison
brew install protobuf etcd openssl flex bison
```
2. [Install Rust](https://www.rust-lang.org/tools/install)
@@ -108,7 +116,7 @@ make -j`sysctl -n hw.logicalcpu`
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively.
To run the integration tests or Python scripts (not required to use the code), install
Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory.
Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (requires [poetry](https://python-poetry.org/)) in the project directory.
#### Running neon database
@@ -118,21 +126,18 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r
# Later that would be responsibility of a package install script
> ./target/debug/neon_local init
Starting pageserver at '127.0.0.1:64000' in '.neon'.
pageserver started, pid: 2545906
Successfully initialized timeline de200bd42b49cc1814412c7e592dd6e9
Stopped pageserver 1 process with pid 2545906
# start pageserver, safekeeper, and broker for their intercommunication
# start pageserver and safekeeper
> ./target/debug/neon_local start
Starting neon broker at 127.0.0.1:50051
storage_broker started, pid: 2918372
Starting etcd broker using "/usr/bin/etcd"
etcd started, pid: 2545996
Starting pageserver at '127.0.0.1:64000' in '.neon'.
pageserver started, pid: 2918386
pageserver started, pid: 2546005
Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'.
safekeeper 1 started, pid: 2918437
# create initial tenant and use it as a default for every future neon_local invocation
> ./target/debug/neon_local tenant create --set-default
tenant 9ef87a5bf0d92544f6fafeeb3239695c successfully created on the pageserver
Created an initial timeline 'de200bd42b49cc1814412c7e592dd6e9' at Lsn 0/16B5A50 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c
Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one
safekeeper 1 started, pid: 2546041
# start postgres compute node
> ./target/debug/neon_local pg start main
@@ -224,20 +229,12 @@ CARGO_BUILD_FLAGS="--features=testing" make
## Documentation
[/docs/](/docs/) Contains a top-level overview of all available markdown documentation.
Now we use README files to cover design ideas and overall architecture for each module and `rustdoc` style documentation comments. See also [/docs/](/docs/) a top-level overview of all available markdown documentation.
- [/docs/sourcetree.md](/docs/sourcetree.md) contains overview of source tree layout.
To view your `rustdoc` documentation in a browser, try running `cargo doc --no-deps --open`
See also README files in some source directories, and `rustdoc` style documentation comments.
Other resources:
- [SELECT 'Hello, World'](https://neon.tech/blog/hello-world/): Blog post by Nikita Shamgunov on the high level architecture
- [Architecture decisions in Neon](https://neon.tech/blog/architecture-decisions-in-neon/): Blog post by Heikki Linnakangas
- [Neon: Serverless PostgreSQL!](https://www.youtube.com/watch?v=rES0yzeERns): Presentation on storage system by Heikki Linnakangas in the CMU Database Group seminar series
### Postgres-specific terms
Due to Neon's very close relation with PostgreSQL internals, numerous specific terms are used.

View File

@@ -1,28 +1,23 @@
[package]
name = "compute_tools"
version = "0.1.0"
edition.workspace = true
license.workspace = true
edition = "2021"
[dependencies]
anyhow.workspace = true
chrono.workspace = true
clap.workspace = true
futures.workspace = true
hyper = { workspace = true, features = ["full"] }
notify.workspace = true
opentelemetry.workspace = true
postgres.workspace = true
regex.workspace = true
serde.workspace = true
serde_json.workspace = true
tar.workspace = true
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
tokio-postgres.workspace = true
tracing.workspace = true
tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true
tracing-utils.workspace = true
url.workspace = true
workspace_hack.workspace = true
anyhow = "1.0"
chrono = "0.4"
clap = "4.0"
env_logger = "0.9"
futures = "0.3.13"
hyper = { version = "0.14", features = ["full"] }
log = { version = "0.4", features = ["std", "serde"] }
notify = "5.0.0"
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
regex = "1"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
tar = "0.4"
tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
url = "2.2.2"
workspace_hack = { version = "0.1", path = "../workspace_hack" }

View File

@@ -19,10 +19,6 @@ Also `compute_ctl` spawns two separate service threads:
- `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
last activity requests.
If the `vm-informant` binary is present at `/bin/vm-informant`, it will also be started. For VM
compute nodes, `vm-informant` communicates with the VM autoscaling system. It coordinates
downscaling and (eventually) will request immediate upscaling under resource pressure.
Usage example:
```sh
compute_ctl -D /var/db/postgres/compute \

View File

@@ -18,10 +18,6 @@
//! - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
//! last activity requests.
//!
//! If the `vm-informant` binary is present at `/bin/vm-informant`, it will also be started. For VM
//! compute nodes, `vm-informant` communicates with the VM autoscaling system. It coordinates
//! downscaling and (eventually) will request immediate upscaling under resource pressure.
//!
//! Usage example:
//! ```sh
//! compute_ctl -D /var/db/postgres/compute \
@@ -40,11 +36,10 @@ use std::{thread, time::Duration};
use anyhow::{Context, Result};
use chrono::Utc;
use clap::Arg;
use tracing::{error, info};
use log::{error, info};
use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus};
use compute_tools::http::api::launch_http_server;
use compute_tools::informant::spawn_vm_informant_if_present;
use compute_tools::logger::*;
use compute_tools::monitor::launch_monitor;
use compute_tools::params::*;
@@ -53,7 +48,8 @@ use compute_tools::spec::*;
use url::Url;
fn main() -> Result<()> {
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
// TODO: re-use `utils::logging` later
init_logger(DEFAULT_LOG_LEVEL)?;
let matches = cli().get_matches();
@@ -84,29 +80,6 @@ fn main() -> Result<()> {
}
};
// Extract OpenTelemetry context for the startup actions from the spec, and
// attach it to the current tracing context.
//
// This is used to propagate the context for the 'start_compute' operation
// from the neon control plane. This allows linking together the wider
// 'start_compute' operation that creates the compute container, with the
// startup actions here within the container.
//
// Switch to the startup context here, and exit it once the startup has
// completed and Postgres is up and running.
//
// NOTE: This is supposed to only cover the *startup* actions. Once
// postgres is configured and up-and-running, we exit this span. Any other
// actions that are performed on incoming HTTP requests, for example, are
// performed in separate spans.
let startup_context_guard = if let Some(ref carrier) = spec.startup_tracing_context {
use opentelemetry::propagation::TextMapPropagator;
use opentelemetry::sdk::propagation::TraceContextPropagator;
Some(TraceContextPropagator::new().extract(carrier).attach())
} else {
None
};
let pageserver_connstr = spec
.cluster
.settings
@@ -132,7 +105,7 @@ fn main() -> Result<()> {
tenant,
timeline,
pageserver_connstr,
metrics: ComputeMetrics::default(),
metrics: ComputeMetrics::new(),
state: RwLock::new(ComputeState::new()),
};
let compute = Arc::new(compute_state);
@@ -141,55 +114,30 @@ fn main() -> Result<()> {
// requests, while configuration is still in progress.
let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");
let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
// Also spawn the thread responsible for handling the VM informant -- if it's present
let _vm_informant_handle = spawn_vm_informant_if_present().expect("cannot launch VM informant");
// Start Postgres
let mut delay_exit = false;
let mut exit_code = None;
let pg = match compute.start_compute() {
Ok(pg) => Some(pg),
Err(err) => {
error!("could not start the compute node: {:?}", err);
// Run compute (Postgres) and hang waiting on it.
match compute.prepare_and_run() {
Ok(ec) => {
let code = ec.code().unwrap_or(1);
info!("Postgres exited with code {}, shutting down", code);
exit(code)
}
Err(error) => {
error!("could not start the compute node: {:?}", error);
let mut state = compute.state.write().unwrap();
state.error = Some(format!("{:?}", err));
state.error = Some(format!("{:?}", error));
state.status = ComputeStatus::Failed;
drop(state);
delay_exit = true;
None
// Keep serving HTTP requests, so the cloud control plane was able to
// get the actual error.
info!("giving control plane 30s to collect the error before shutdown");
thread::sleep(Duration::from_secs(30));
info!("shutting down");
Err(error)
}
};
// Wait for the child Postgres process forever. In this state Ctrl+C will
// propagate to Postgres and it will be shut down as well.
if let Some(mut pg) = pg {
// Startup is finished, exit the startup tracing span
drop(startup_context_guard);
let ecode = pg
.wait()
.expect("failed to start waiting on Postgres process");
info!("Postgres exited with code {}, shutting down", ecode);
exit_code = ecode.code()
}
if let Err(err) = compute.check_for_core_dumps() {
error!("error while checking for core dumps: {err:?}");
}
// If launch failed, keep serving HTTP requests for a while, so the cloud
// control plane can get the actual error.
if delay_exit {
info!("giving control plane 30s to collect the error before shutdown");
thread::sleep(Duration::from_secs(30));
info!("shutting down");
}
// Shutdown trace pipeline gracefully, so that it has a chance to send any
// pending traces before we exit.
tracing_utils::shutdown_tracing();
exit(exit_code.unwrap_or(1))
}
fn cli() -> clap::Command {

View File

@@ -1,12 +1,11 @@
use anyhow::{anyhow, Result};
use log::error;
use postgres::Client;
use tokio_postgres::NoTls;
use tracing::{error, instrument};
use crate::compute::ComputeNode;
#[instrument(skip_all)]
pub fn create_writability_check_data(client: &mut Client) -> Result<()> {
pub fn create_writablity_check_data(client: &mut Client) -> Result<()> {
let query = "
CREATE TABLE IF NOT EXISTS health_check (
id serial primary key,
@@ -22,7 +21,6 @@ pub fn create_writability_check_data(client: &mut Client) -> Result<()> {
Ok(())
}
#[instrument(skip_all)]
pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
let (client, connection) = tokio_postgres::connect(compute.connstr.as_str(), NoTls).await?;
if client.is_closed() {

View File

@@ -17,17 +17,17 @@
use std::fs;
use std::os::unix::fs::PermissionsExt;
use std::path::Path;
use std::process::{Command, Stdio};
use std::process::{Command, ExitStatus, Stdio};
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::RwLock;
use anyhow::{Context, Result};
use chrono::{DateTime, Utc};
use log::info;
use postgres::{Client, NoTls};
use serde::{Serialize, Serializer};
use tracing::{info, instrument, warn};
use crate::checker::create_writability_check_data;
use crate::checker::create_writablity_check_data;
use crate::config;
use crate::pg_helpers::*;
use crate::spec::*;
@@ -91,7 +91,7 @@ pub enum ComputeStatus {
Failed,
}
#[derive(Default, Serialize)]
#[derive(Serialize)]
pub struct ComputeMetrics {
pub sync_safekeepers_ms: AtomicU64,
pub basebackup_ms: AtomicU64,
@@ -99,6 +99,23 @@ pub struct ComputeMetrics {
pub total_startup_ms: AtomicU64,
}
impl ComputeMetrics {
pub fn new() -> Self {
Self {
sync_safekeepers_ms: AtomicU64::new(0),
basebackup_ms: AtomicU64::new(0),
config_ms: AtomicU64::new(0),
total_startup_ms: AtomicU64::new(0),
}
}
}
impl Default for ComputeMetrics {
fn default() -> Self {
Self::new()
}
}
impl ComputeNode {
pub fn set_status(&self, status: ComputeStatus) {
self.state.write().unwrap().status = status;
@@ -121,7 +138,6 @@ impl ComputeNode {
// Get basebackup from the libpq connection to pageserver using `connstr` and
// unarchive it to `pgdata` directory overriding all its previous content.
#[instrument(skip(self))]
fn get_basebackup(&self, lsn: &str) -> Result<()> {
let start_time = Utc::now();
@@ -155,12 +171,11 @@ impl ComputeNode {
// Run `postgres` in a special mode with `--sync-safekeepers` argument
// and return the reported LSN back to the caller.
#[instrument(skip(self))]
fn sync_safekeepers(&self) -> Result<String> {
let start_time = Utc::now();
let sync_handle = Command::new(&self.pgbin)
.args(["--sync-safekeepers"])
.args(&["--sync-safekeepers"])
.env("PGDATA", &self.pgdata) // we cannot use -D in this mode
.stdout(Stdio::piped())
.spawn()
@@ -198,7 +213,6 @@ impl ComputeNode {
/// Do all the preparations like PGDATA directory creation, configuration,
/// safekeepers sync, basebackup, etc.
#[instrument(skip(self))]
pub fn prepare_pgdata(&self) -> Result<()> {
let spec = &self.spec;
let pgdata_path = Path::new(&self.pgdata);
@@ -232,27 +246,23 @@ impl ComputeNode {
/// Start Postgres as a child process and manage DBs/roles.
/// After that this will hang waiting on the postmaster process to exit.
#[instrument(skip(self))]
pub fn start_postgres(&self) -> Result<std::process::Child> {
pub fn run(&self) -> Result<ExitStatus> {
let start_time = Utc::now();
let pgdata_path = Path::new(&self.pgdata);
// Run postgres as a child process.
let mut pg = Command::new(&self.pgbin)
.args(["-D", &self.pgdata])
.args(&["-D", &self.pgdata])
.spawn()
.expect("cannot start postgres process");
wait_for_postgres(&mut pg, pgdata_path)?;
Ok(pg)
}
#[instrument(skip(self))]
pub fn apply_config(&self) -> Result<()> {
// If connection fails,
// it may be the old node with `zenith_admin` superuser.
//
// In this case we need to connect with old `zenith_admin` name
// In this case we need to connect with old `zenith_admin`name
// and create new user. We cannot simply rename connected user,
// but we can create a new one and grant it all privileges.
let mut client = match Client::connect(self.connstr.as_str(), NoTls) {
@@ -278,43 +288,16 @@ impl ComputeNode {
Ok(client) => client,
};
// Proceed with post-startup configuration. Note, that order of operations is important.
handle_roles(&self.spec, &mut client)?;
handle_databases(&self.spec, &mut client)?;
handle_role_deletions(self, &mut client)?;
handle_grants(self, &mut client)?;
create_writability_check_data(&mut client)?;
create_writablity_check_data(&mut client)?;
// 'Close' connection
drop(client);
info!(
"finished configuration of compute for project {}",
self.spec.cluster.cluster_id
);
Ok(())
}
#[instrument(skip(self))]
pub fn start_compute(&self) -> Result<std::process::Child> {
info!(
"starting compute for project {}, operation {}, tenant {}, timeline {}",
self.spec.cluster.cluster_id,
self.spec.operation_uuid.as_ref().unwrap(),
self.tenant,
self.timeline,
);
self.prepare_pgdata()?;
let start_time = Utc::now();
let pg = self.start_postgres()?;
self.apply_config()?;
let startup_end_time = Utc::now();
self.metrics.config_ms.store(
startup_end_time
.signed_duration_since(start_time)
@@ -334,70 +317,30 @@ impl ComputeNode {
self.set_status(ComputeStatus::Running);
Ok(pg)
info!(
"finished configuration of compute for project {}",
self.spec.cluster.cluster_id
);
// Wait for child Postgres process basically forever. In this state Ctrl+C
// will propagate to Postgres and it will be shut down as well.
let ecode = pg
.wait()
.expect("failed to start waiting on Postgres process");
Ok(ecode)
}
// Look for core dumps and collect backtraces.
//
// EKS worker nodes have following core dump settings:
// /proc/sys/kernel/core_pattern -> core
// /proc/sys/kernel/core_uses_pid -> 1
// ulimint -c -> unlimited
// which results in core dumps being written to postgres data directory as core.<pid>.
//
// Use that as a default location and pattern, except macos where core dumps are written
// to /cores/ directory by default.
pub fn check_for_core_dumps(&self) -> Result<()> {
let core_dump_dir = match std::env::consts::OS {
"macos" => Path::new("/cores/"),
_ => Path::new(&self.pgdata),
};
pub fn prepare_and_run(&self) -> Result<ExitStatus> {
info!(
"starting compute for project {}, operation {}, tenant {}, timeline {}",
self.spec.cluster.cluster_id,
self.spec.operation_uuid.as_ref().unwrap(),
self.tenant,
self.timeline,
);
// Collect core dump paths if any
info!("checking for core dumps in {}", core_dump_dir.display());
let files = fs::read_dir(core_dump_dir)?;
let cores = files.filter_map(|entry| {
let entry = entry.ok()?;
let _ = entry.file_name().to_str()?.strip_prefix("core.")?;
Some(entry.path())
});
// Print backtrace for each core dump
for core_path in cores {
warn!(
"core dump found: {}, collecting backtrace",
core_path.display()
);
// Try first with gdb
let backtrace = Command::new("gdb")
.args(["--batch", "-q", "-ex", "bt", &self.pgbin])
.arg(&core_path)
.output();
// Try lldb if no gdb is found -- that is handy for local testing on macOS
let backtrace = match backtrace {
Err(ref e) if e.kind() == std::io::ErrorKind::NotFound => {
warn!("cannot find gdb, trying lldb");
Command::new("lldb")
.arg("-c")
.arg(&core_path)
.args(["--batch", "-o", "bt all", "-o", "quit"])
.output()
}
_ => backtrace,
}?;
warn!(
"core dump backtrace: {}",
String::from_utf8_lossy(&backtrace.stdout)
);
warn!(
"debugger stderr: {}",
String::from_utf8_lossy(&backtrace.stderr)
);
}
Ok(())
self.prepare_pgdata()?;
self.run()
}
}

View File

@@ -6,20 +6,32 @@ use std::thread;
use anyhow::Result;
use hyper::service::{make_service_fn, service_fn};
use hyper::{Body, Method, Request, Response, Server, StatusCode};
use log::{error, info};
use serde_json;
use tracing::{error, info};
use tracing_utils::http::OtelName;
use crate::compute::ComputeNode;
use crate::compute::{ComputeNode, ComputeStatus};
// Service function to handle all available routes.
async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body> {
//
// NOTE: The URI path is currently included in traces. That's OK because
// it doesn't contain any variable parts or sensitive information. But
// please keep that in mind if you change the routing here.
//
async fn routes(req: Request<Body>, compute: Arc<ComputeNode>) -> Response<Body> {
match (req.method(), req.uri().path()) {
// Timestamp of the last Postgres activity in the plain text.
// DEPRECATED in favour of /status
(&Method::GET, "/last_activity") => {
info!("serving /last_active GET request");
let state = compute.state.read().unwrap();
// Use RFC3339 format for consistency.
Response::new(Body::from(state.last_active.to_rfc3339()))
}
// Has compute setup process finished? -> true/false.
// DEPRECATED in favour of /status
(&Method::GET, "/ready") => {
info!("serving /ready GET request");
let status = compute.get_status();
Response::new(Body::from(format!("{}", status == ComputeStatus::Running)))
}
// Serialized compute state.
(&Method::GET, "/status") => {
info!("serving /status GET request");
@@ -34,9 +46,19 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
Response::new(Body::from(serde_json::to_string(&compute.metrics).unwrap()))
}
// DEPRECATED, use POST instead
(&Method::GET, "/check_writability") => {
info!("serving /check_writability GET request");
let res = crate::checker::check_writability(&compute).await;
match res {
Ok(_) => Response::new(Body::from("true")),
Err(e) => Response::new(Body::from(e.to_string())),
}
}
(&Method::POST, "/check_writability") => {
info!("serving /check_writability POST request");
let res = crate::checker::check_writability(compute).await;
let res = crate::checker::check_writability(&compute).await;
match res {
Ok(_) => Response::new(Body::from("true")),
Err(e) => Response::new(Body::from(e.to_string())),
@@ -62,19 +84,7 @@ async fn serve(state: Arc<ComputeNode>) {
async move {
Ok::<_, Infallible>(service_fn(move |req: Request<Body>| {
let state = state.clone();
async move {
Ok::<_, Infallible>(
// NOTE: We include the URI path in the string. It
// doesn't contain any variable parts or sensitive
// information in this API.
tracing_utils::http::tracing_handler(
req,
|req| routes(req, &state),
OtelName::UriPath,
)
.await,
)
}
async move { Ok::<_, Infallible>(routes(req, state).await) }
}))
}
});

View File

@@ -37,7 +37,58 @@ paths:
schema:
$ref: "#/components/schemas/ComputeMetrics"
/ready:
get:
deprecated: true
tags:
- "info"
summary: Check whether compute startup process finished successfully
description: ""
operationId: computeIsReady
responses:
"200":
description: Compute is ready ('true') or not ('false')
content:
text/plain:
schema:
type: string
example: "true"
/last_activity:
get:
deprecated: true
tags:
- "info"
summary: Get timestamp of the last compute activity
description: ""
operationId: getLastComputeActivityTS
responses:
"200":
description: Timestamp of the last compute activity
content:
text/plain:
schema:
type: string
example: "2022-10-12T07:20:50.52Z"
/check_writability:
get:
deprecated: true
tags:
- "check"
summary: Check that we can write new data on this compute
description: ""
operationId: checkComputeWritabilityDeprecated
responses:
"200":
description: Check result
content:
text/plain:
schema:
type: string
description: Error text or 'true' if check passed
example: "true"
post:
tags:
- "check"

View File

@@ -1,50 +0,0 @@
use std::path::Path;
use std::process;
use std::thread;
use std::time::Duration;
use tracing::{info, warn};
use anyhow::{Context, Result};
const VM_INFORMANT_PATH: &str = "/bin/vm-informant";
const RESTART_INFORMANT_AFTER_MILLIS: u64 = 5000;
/// Launch a thread to start the VM informant if it's present (and restart, on failure)
pub fn spawn_vm_informant_if_present() -> Result<Option<thread::JoinHandle<()>>> {
let exists = Path::new(VM_INFORMANT_PATH)
.try_exists()
.context("could not check if path exists")?;
if !exists {
return Ok(None);
}
Ok(Some(
thread::Builder::new()
.name("run-vm-informant".into())
.spawn(move || run_informant())?,
))
}
fn run_informant() -> ! {
let restart_wait = Duration::from_millis(RESTART_INFORMANT_AFTER_MILLIS);
info!("starting VM informant");
loop {
let mut cmd = process::Command::new(VM_INFORMANT_PATH);
// Block on subprocess:
let result = cmd.status();
match result {
Err(e) => warn!("failed to run VM informant at {VM_INFORMANT_PATH:?}: {e}"),
Ok(status) if !status.success() => {
warn!("{VM_INFORMANT_PATH} exited with code {status:?}, retrying")
}
Ok(_) => info!("{VM_INFORMANT_PATH} ended gracefully (unexpectedly). Retrying"),
}
// Wait before retrying
thread::sleep(restart_wait);
}
}

View File

@@ -8,7 +8,6 @@ pub mod http;
#[macro_use]
pub mod logger;
pub mod compute;
pub mod informant;
pub mod monitor;
pub mod params;
pub mod pg_helpers;

View File

@@ -1,37 +1,43 @@
use tracing_opentelemetry::OpenTelemetryLayer;
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::prelude::*;
use std::io::Write;
/// Initialize logging to stderr, and OpenTelemetry tracing and exporter.
///
/// Logging is configured using either `default_log_level` or
use anyhow::Result;
use chrono::Utc;
use env_logger::{Builder, Env};
macro_rules! info_println {
($($tts:tt)*) => {
if log_enabled!(Level::Info) {
println!($($tts)*);
}
}
}
macro_rules! info_print {
($($tts:tt)*) => {
if log_enabled!(Level::Info) {
print!($($tts)*);
}
}
}
/// Initialize `env_logger` using either `default_level` or
/// `RUST_LOG` environment variable as default log level.
///
/// OpenTelemetry is configured with OTLP/HTTP exporter. It picks up
/// configuration from environment variables. For example, to change the destination,
/// set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`. See
/// `tracing-utils` package description.
///
pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
// Initialize Logging
let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
.unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_log_level));
pub fn init_logger(default_level: &str) -> Result<()> {
let env = Env::default().filter_or("RUST_LOG", default_level);
let fmt_layer = tracing_subscriber::fmt::layer()
.with_target(false)
.with_writer(std::io::stderr);
// Initialize OpenTelemetry
let otlp_layer =
tracing_utils::init_tracing_without_runtime("compute_ctl").map(OpenTelemetryLayer::new);
// Put it all together
tracing_subscriber::registry()
.with(env_filter)
.with(otlp_layer)
.with(fmt_layer)
Builder::from_env(env)
.format(|buf, record| {
let thread_handle = std::thread::current();
writeln!(
buf,
"{} [{}] {}: {}",
Utc::now().format("%Y-%m-%d %H:%M:%S%.3f %Z"),
thread_handle.name().unwrap_or("main"),
record.level(),
record.args()
)
})
.init();
tracing::info!("logging and tracing started");
Ok(())
}

View File

@@ -3,8 +3,8 @@ use std::{thread, time};
use anyhow::Result;
use chrono::{DateTime, Utc};
use log::{debug, info};
use postgres::{Client, NoTls};
use tracing::{debug, info};
use crate::compute::ComputeNode;
@@ -52,16 +52,10 @@ fn watch_compute_activity(compute: &ComputeNode) {
let mut idle_backs: Vec<DateTime<Utc>> = vec![];
for b in backs.into_iter() {
let state: String = match b.try_get("state") {
Ok(state) => state,
Err(_) => continue,
};
let state: String = b.get("state");
let change: String = b.get("state_change");
if state == "idle" {
let change: String = match b.try_get("state_change") {
Ok(state_change) => state_change,
Err(_) => continue,
};
let change = DateTime::parse_from_rfc3339(&change);
match change {
Ok(t) => idle_backs.push(t.with_timezone(&Utc)),
@@ -80,8 +74,10 @@ fn watch_compute_activity(compute: &ComputeNode) {
}
}
// Get idle backend `state_change` with the max timestamp.
if let Some(last) = idle_backs.iter().max() {
// Sort idle backend `state_change` timestamps. The last one corresponds
// to the last activity.
idle_backs.sort();
if let Some(last) = idle_backs.last() {
last_active = *last;
}
}

View File

@@ -1,9 +1,3 @@
pub const DEFAULT_LOG_LEVEL: &str = "info";
// From Postgres docs:
// To ease transition from the md5 method to the newer SCRAM method, if md5 is specified
// as a method in pg_hba.conf but the user's password on the server is encrypted for SCRAM
// (see below), then SCRAM-based authentication will automatically be chosen instead.
// https://www.postgresql.org/docs/15/auth-password.html
//
// So it's safe to set md5 here, as `control-plane` anyway uses SCRAM for all roles.
pub const DEFAULT_CONNSTRING: &str = "host=localhost user=postgres";
pub const PG_HBA_ALL_MD5: &str = "host\tall\t\tall\t\t0.0.0.0/0\t\tmd5";

View File

@@ -11,7 +11,6 @@ use anyhow::{bail, Result};
use notify::{RecursiveMode, Watcher};
use postgres::{Client, Transaction};
use serde::Deserialize;
use tracing::{debug, instrument};
const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds
@@ -120,9 +119,16 @@ pub trait GenericOptionsSearch {
impl GenericOptionsSearch for GenericOptions {
/// Lookup option by name
fn find(&self, name: &str) -> Option<String> {
let ops = self.as_ref()?;
let op = ops.iter().find(|s| s.name == name)?;
op.value.clone()
match &self {
Some(ops) => {
let op = ops.iter().find(|s| s.name == name);
match op {
Some(op) => op.value.clone(),
None => None,
}
}
None => None,
}
}
}
@@ -130,8 +136,8 @@ impl Role {
/// Serialize a list of role parameters into a Postgres-acceptable
/// string of arguments.
pub fn to_pg_options(&self) -> String {
// XXX: consider putting LOGIN as a default option somewhere higher, e.g. in control-plane.
// For now, we do not use generic `options` for roles. Once used, add
// XXX: consider putting LOGIN as a default option somewhere higher, e.g. in Rails.
// For now we do not use generic `options` for roles. Once used, add
// `self.options.as_pg_options()` somewhere here.
let mut params: String = "LOGIN".to_string();
@@ -155,14 +161,6 @@ impl Role {
}
impl Database {
pub fn new(name: PgIdent, owner: PgIdent) -> Self {
Self {
name,
owner,
options: None,
}
}
/// Serialize a list of database parameters into a Postgres-acceptable
/// string of arguments.
/// NB: `TEMPLATE` is actually also an identifier, but so far we only need
@@ -221,7 +219,11 @@ pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
&[],
)?
.iter()
.map(|row| Database::new(row.get("datname"), row.get("owner")))
.map(|row| Database {
name: row.get("datname"),
owner: row.get("owner"),
options: None,
})
.collect();
Ok(postgres_dbs)
@@ -230,7 +232,6 @@ pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
/// Wait for Postgres to become ready to accept connections. It's ready to
/// accept connections when the state-field in `pgdata/postmaster.pid` says
/// 'ready'.
#[instrument(skip(pg))]
pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> {
let pid_path = pgdata.join("postmaster.pid");
@@ -289,18 +290,18 @@ pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> {
}
let res = rx.recv_timeout(Duration::from_millis(100));
debug!("woken up by notify: {res:?}");
log::debug!("woken up by notify: {res:?}");
// If there are multiple events in the channel already, we only need to be
// check once. Swallow the extra events before we go ahead to check the
// pid file.
while let Ok(res) = rx.try_recv() {
debug!("swallowing extra event: {res:?}");
log::debug!("swallowing extra event: {res:?}");
}
// Check that we can open pid file first.
if let Ok(file) = File::open(&pid_path) {
if !postmaster_pid_seen {
debug!("postmaster.pid appeared");
log::debug!("postmaster.pid appeared");
watcher
.unwatch(pgdata)
.expect("Failed to remove pgdata dir watch");
@@ -316,7 +317,7 @@ pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> {
// Pid file could be there and we could read it, but it could be empty, for example.
if let Some(Ok(line)) = last_line {
let status = line.trim();
debug!("last line of postmaster.pid: {status:?}");
log::debug!("last line of postmaster.pid: {status:?}");
// Now Postgres is ready to accept connections
if status == "ready" {
@@ -332,7 +333,7 @@ pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> {
}
}
tracing::info!("PostgreSQL is now running, continuing to configure it");
log::info!("PostgreSQL is now running, continuing to configure it");
Ok(())
}

View File

@@ -1,12 +1,11 @@
use std::collections::HashMap;
use std::path::Path;
use std::str::FromStr;
use anyhow::Result;
use log::{info, log_enabled, warn, Level};
use postgres::config::Config;
use postgres::{Client, NoTls};
use serde::Deserialize;
use tracing::{info, info_span, instrument, span_enabled, warn, Level};
use crate::compute::ComputeNode;
use crate::config;
@@ -23,8 +22,6 @@ pub struct ComputeSpec {
/// Expected cluster state at the end of transition process.
pub cluster: Cluster,
pub delta_operations: Option<Vec<DeltaOp>>,
pub startup_tracing_context: Option<HashMap<String, String>>,
}
/// Cluster state seen from the perspective of the external tools
@@ -82,25 +79,23 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
/// Given a cluster spec json and open transaction it handles roles creation,
/// deletion and update.
#[instrument(skip_all)]
pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
let mut xact = client.transaction()?;
let existing_roles: Vec<Role> = get_existing_roles(&mut xact)?;
// Print a list of existing Postgres roles (only in debug mode)
if span_enabled!(Level::INFO) {
info!("postgres roles:");
for r in &existing_roles {
info!(
" - {}:{}",
r.name,
if r.encrypted_password.is_some() {
"[FILTERED]"
} else {
"(null)"
}
);
}
info!("postgres roles:");
for r in &existing_roles {
info_println!(
"{} - {}:{}",
" ".repeat(27 + 5),
r.name,
if r.encrypted_password.is_some() {
"[FILTERED]"
} else {
"(null)"
}
);
}
// Process delta operations first
@@ -141,80 +136,58 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
info!("cluster spec roles:");
for role in &spec.cluster.roles {
let name = &role.name;
// XXX: with a limited number of roles it is fine, but consider making it a HashMap
let pg_role = existing_roles.iter().find(|r| r.name == *name);
enum RoleAction {
None,
Update,
Create,
}
let action = if let Some(r) = pg_role {
if (r.encrypted_password.is_none() && role.encrypted_password.is_some())
|| (r.encrypted_password.is_some() && role.encrypted_password.is_none())
{
RoleAction::Update
} else if let Some(pg_pwd) = &r.encrypted_password {
// Check whether password changed or not (trim 'md5' prefix first if any)
//
// This is a backward compatibility hack, which comes from the times when we were using
// md5 for everyone and hashes were stored in the console db without md5 prefix. So when
// role comes from the control-plane (json spec) `Role.encrypted_password` doesn't have md5 prefix,
// but when role comes from Postgres (`get_existing_roles` / `existing_roles`) it has this prefix.
// Here is the only place so far where we compare hashes, so it seems to be the best candidate
// to place this compatibility layer.
let pg_pwd = if let Some(stripped) = pg_pwd.strip_prefix("md5") {
stripped
} else {
pg_pwd
};
if pg_pwd != *role.encrypted_password.as_ref().unwrap() {
RoleAction::Update
} else {
RoleAction::None
}
} else {
RoleAction::None
}
} else {
RoleAction::Create
};
match action {
RoleAction::None => {}
RoleAction::Update => {
let mut query: String = format!("ALTER ROLE {} ", name.pg_quote());
query.push_str(&role.to_pg_options());
xact.execute(query.as_str(), &[])?;
}
RoleAction::Create => {
let mut query: String = format!("CREATE ROLE {} ", name.pg_quote());
info!("role create query: '{}'", &query);
query.push_str(&role.to_pg_options());
xact.execute(query.as_str(), &[])?;
let grant_query = format!(
"GRANT pg_read_all_data, pg_write_all_data TO {}",
name.pg_quote()
);
xact.execute(grant_query.as_str(), &[])?;
info!("role grant query: '{}'", &grant_query);
}
}
if span_enabled!(Level::INFO) {
let pwd = if role.encrypted_password.is_some() {
info_print!(
"{} - {}:{}",
" ".repeat(27 + 5),
name,
if role.encrypted_password.is_some() {
"[FILTERED]"
} else {
"(null)"
};
let action_str = match action {
RoleAction::None => "",
RoleAction::Create => " -> create",
RoleAction::Update => " -> update",
};
info!(" - {}:{}{}", name, pwd, action_str);
}
);
// XXX: with a limited number of roles it is fine, but consider making it a HashMap
let pg_role = existing_roles.iter().find(|r| r.name == *name);
if let Some(r) = pg_role {
let mut update_role = false;
if (r.encrypted_password.is_none() && role.encrypted_password.is_some())
|| (r.encrypted_password.is_some() && role.encrypted_password.is_none())
{
update_role = true;
} else if let Some(pg_pwd) = &r.encrypted_password {
// Check whether password changed or not (trim 'md5:' prefix first)
update_role = pg_pwd[3..] != *role.encrypted_password.as_ref().unwrap();
}
if update_role {
let mut query: String = format!("ALTER ROLE {} ", name.pg_quote());
info_print!(" -> update");
query.push_str(&role.to_pg_options());
xact.execute(query.as_str(), &[])?;
}
} else {
info!("role name: '{}'", &name);
let mut query: String = format!("CREATE ROLE {} ", name.pg_quote());
info!("role create query: '{}'", &query);
info_print!(" -> create");
query.push_str(&role.to_pg_options());
xact.execute(query.as_str(), &[])?;
let grant_query = format!(
"GRANT pg_read_all_data, pg_write_all_data TO {}",
name.pg_quote()
);
xact.execute(grant_query.as_str(), &[])?;
info!("role grant query: '{}'", &grant_query);
}
info_print!("\n");
}
xact.commit()?;
@@ -223,32 +196,23 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
}
/// Reassign all dependent objects and delete requested roles.
#[instrument(skip_all)]
pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<()> {
if let Some(ops) = &node.spec.delta_operations {
// First, reassign all dependent objects to db owners.
let spec = &node.spec;
// First, reassign all dependent objects to db owners.
if let Some(ops) = &spec.delta_operations {
info!("reassigning dependent objects of to-be-deleted roles");
// Fetch existing roles. We could've exported and used `existing_roles` from
// `handle_roles()`, but we only make this list there before creating new roles.
// Which is probably fine as we never create to-be-deleted roles, but that'd
// just look a bit untidy. Anyway, the entire `pg_roles` should be in shared
// buffers already, so this shouldn't be a big deal.
let mut xact = client.transaction()?;
let existing_roles: Vec<Role> = get_existing_roles(&mut xact)?;
xact.commit()?;
for op in ops {
// Check that role is still present in Postgres, as this could be a
// restart with the same spec after role deletion.
if op.action == "delete_role" && existing_roles.iter().any(|r| r.name == op.name) {
if op.action == "delete_role" {
reassign_owned_objects(node, &op.name)?;
}
}
}
// Second, proceed with role deletions.
// Second, proceed with role deletions.
let mut xact = client.transaction()?;
if let Some(ops) = &spec.delta_operations {
info!("processing role deletions");
let mut xact = client.transaction()?;
for op in ops {
// We do not check either role exists or not,
// Postgres will take care of it for us
@@ -259,7 +223,6 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<
xact.execute(query.as_str(), &[])?;
}
}
xact.commit()?;
}
Ok(())
@@ -300,16 +263,13 @@ fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()>
/// like `CREATE DATABASE` and `DROP DATABASE` do not support it. Statement-level
/// atomicity should be enough here due to the order of operations and various checks,
/// which together provide us idempotency.
#[instrument(skip_all)]
pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
let existing_dbs: Vec<Database> = get_existing_dbs(client)?;
// Print a list of existing Postgres databases (only in debug mode)
if span_enabled!(Level::INFO) {
info!("postgres databases:");
for r in &existing_dbs {
info!(" {}:{}", r.name, r.owner);
}
info!("postgres databases:");
for r in &existing_dbs {
info_println!("{} - {}:{}", " ".repeat(27 + 5), r.name, r.owner);
}
// Process delta operations first
@@ -352,15 +312,12 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
for db in &spec.cluster.databases {
let name = &db.name;
info_print!("{} - {}:{}", " ".repeat(27 + 5), db.name, db.owner);
// XXX: with a limited number of databases it is fine, but consider making it a HashMap
let pg_db = existing_dbs.iter().find(|r| r.name == *name);
enum DatabaseAction {
None,
Update,
Create,
}
let action = if let Some(r) = pg_db {
if let Some(r) = pg_db {
// XXX: db owner name is returned as quoted string from Postgres,
// when quoting is needed.
let new_owner = if r.owner.starts_with('"') {
@@ -370,42 +327,24 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
};
if new_owner != r.owner {
// Update the owner
DatabaseAction::Update
} else {
DatabaseAction::None
}
} else {
DatabaseAction::Create
};
match action {
DatabaseAction::None => {}
DatabaseAction::Update => {
let query: String = format!(
"ALTER DATABASE {} OWNER TO {}",
name.pg_quote(),
db.owner.pg_quote()
);
let _guard = info_span!("executing", query).entered();
client.execute(query.as_str(), &[])?;
}
DatabaseAction::Create => {
let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote());
query.push_str(&db.to_pg_options());
let _guard = info_span!("executing", query).entered();
client.execute(query.as_str(), &[])?;
}
};
info_print!(" -> update");
if span_enabled!(Level::INFO) {
let action_str = match action {
DatabaseAction::None => "",
DatabaseAction::Create => " -> create",
DatabaseAction::Update => " -> update",
};
info!(" - {}:{}{}", db.name, db.owner, action_str);
client.execute(query.as_str(), &[])?;
}
} else {
let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote());
info_print!(" -> create");
query.push_str(&db.to_pg_options());
client.execute(query.as_str(), &[])?;
}
info_print!("\n");
}
Ok(())
@@ -413,7 +352,6 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
/// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
/// to allow users creating trusted extensions and re-creating `public` schema, for example.
#[instrument(skip_all)]
pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
let spec = &node.spec;

View File

@@ -38,33 +38,4 @@ mod pg_helpers_tests {
assert_eq!(ident.pg_quote(), "\"\"\"name\"\";\\n select 1;\"");
}
#[test]
fn generic_options_search() {
let generic_options: GenericOptions = Some(vec![
GenericOption {
name: "present_value".into(),
value: Some("value".into()),
vartype: "string".into(),
},
GenericOption {
name: "missed_value".into(),
value: None,
vartype: "int".into(),
},
]);
assert_eq!(generic_options.find("present_value"), Some("value".into()));
assert_eq!(generic_options.find("missed_value"), None);
assert_eq!(generic_options.find("invalid_value"), None);
let empty_generic_options: GenericOptions = Some(vec![]);
assert_eq!(empty_generic_options.find("present_value"), None);
assert_eq!(empty_generic_options.find("missed_value"), None);
assert_eq!(empty_generic_options.find("invalid_value"), None);
let none_generic_options: GenericOptions = None;
assert_eq!(none_generic_options.find("present_value"), None);
assert_eq!(none_generic_options.find("missed_value"), None);
assert_eq!(none_generic_options.find("invalid_value"), None);
}
}

View File

@@ -1,31 +1,29 @@
[package]
name = "control_plane"
version = "0.1.0"
edition.workspace = true
license.workspace = true
edition = "2021"
[dependencies]
anyhow.workspace = true
clap.workspace = true
comfy-table.workspace = true
git-version.workspace = true
nix.workspace = true
once_cell.workspace = true
postgres.workspace = true
regex.workspace = true
reqwest = { workspace = true, features = ["blocking", "json"] }
serde.workspace = true
serde_with.workspace = true
tar.workspace = true
thiserror.workspace = true
toml.workspace = true
url.workspace = true
anyhow = "1.0"
clap = "4.0"
comfy-table = "6.1"
git-version = "0.3.5"
nix = "0.25"
once_cell = "1.13.0"
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
regex = "1"
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
serde = { version = "1.0", features = ["derive"] }
serde_with = "2.0"
tar = "0.4.38"
thiserror = "1"
toml = "0.5"
url = "2.2.2"
# Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api
# instead, so that recompile times are better.
pageserver_api.workspace = true
safekeeper_api.workspace = true
postgres_connection.workspace = true
storage_broker.workspace = true
utils.workspace = true
workspace_hack.workspace = true
pageserver_api = { path = "../libs/pageserver_api" }
postgres_connection = { path = "../libs/postgres_connection" }
safekeeper_api = { path = "../libs/safekeeper_api" }
utils = { path = "../libs/utils" }
workspace_hack = { version = "0.1", path = "../workspace_hack" }

View File

@@ -10,5 +10,5 @@ id = 1
pg_port = 5454
http_port = 7676
[broker]
listen_addr = '127.0.0.1:50051'
[etcd_broker]
broker_endpoints = ['http://127.0.0.1:2379']

View File

@@ -14,19 +14,17 @@
use std::ffi::OsStr;
use std::io::Write;
use std::os::unix::prelude::AsRawFd;
use std::os::unix::process::CommandExt;
use std::path::{Path, PathBuf};
use std::path::Path;
use std::process::{Child, Command};
use std::time::Duration;
use std::{fs, io, thread};
use anyhow::Context;
use anyhow::{anyhow, bail, Context, Result};
use nix::errno::Errno;
use nix::fcntl::{FcntlArg, FdFlag};
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
use utils::pid_file::{self, PidFileRead};
use utils::lock_file;
// These constants control the loop used to poll for process start / stop.
//
@@ -51,21 +49,21 @@ pub enum InitialPidFile<'t> {
}
/// Start a background child process using the parameters given.
pub fn start_process<F, AI, A, EI>(
pub fn start_process<
F,
S: AsRef<OsStr>,
EI: IntoIterator<Item = (String, String)>, // Not generic AsRef<OsStr>, otherwise empty `envs` prevents type inference
>(
process_name: &str,
datadir: &Path,
command: &Path,
args: AI,
args: &[S],
envs: EI,
initial_pid_file: InitialPidFile,
process_status_check: F,
) -> anyhow::Result<Child>
where
F: Fn() -> anyhow::Result<bool>,
AI: IntoIterator<Item = A>,
A: AsRef<OsStr>,
// Not generic AsRef<OsStr>, otherwise empty `envs` prevents type inference
EI: IntoIterator<Item = (String, String)>,
{
let log_path = datadir.join(format!("{process_name}.log"));
let process_log_file = fs::OpenOptions::new()
@@ -88,14 +86,6 @@ where
let filled_cmd = fill_aws_secrets_vars(fill_rust_env_vars(background_command));
filled_cmd.envs(envs);
let pid_file_to_check = match initial_pid_file {
InitialPidFile::Create(path) => {
pre_exec_create_pidfile(filled_cmd, path);
path
}
InitialPidFile::Expect(path) => path,
};
let mut spawned_process = filled_cmd.spawn().with_context(|| {
format!("Could not spawn {process_name}, see console output and log files for details.")
})?;
@@ -105,8 +95,29 @@ where
.with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?,
);
let pid_file_to_check = match initial_pid_file {
InitialPidFile::Create(target_pid_file_path) => {
match lock_file::create_lock_file(target_pid_file_path, pid.to_string()) {
lock_file::LockCreationResult::Created { .. } => {
// We use "lock" file here only to create the pid file. The lock on the pidfile will be dropped as soon
// as this CLI invocation exits, so it's a bit useless, but doesn't any harm either.
}
lock_file::LockCreationResult::AlreadyLocked { .. } => {
anyhow::bail!("Cannot write pid file for {process_name} at path {target_pid_file_path:?}: file is already locked by another process")
}
lock_file::LockCreationResult::CreationFailed(e) => {
return Err(e.context(format!(
"Failed to create pid file for {process_name} at path {target_pid_file_path:?}"
)))
}
}
None
}
InitialPidFile::Expect(pid_file_path) => Some(pid_file_path),
};
for retries in 0..RETRIES {
match process_started(pid, Some(pid_file_to_check), &process_status_check) {
match process_started(pid, pid_file_to_check, &process_status_check) {
Ok(true) => {
println!("\n{process_name} started, pid: {pid}");
return Ok(spawned_process);
@@ -138,27 +149,12 @@ where
/// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> anyhow::Result<()> {
let pid = match pid_file::read(pid_file)
.with_context(|| format!("read pid_file {pid_file:?}"))?
{
PidFileRead::NotExist => {
println!("{process_name} is already stopped: no pid file present at {pid_file:?}");
return Ok(());
}
PidFileRead::NotHeldByAnyProcess(_) => {
// Don't try to kill according to file contents beacuse the pid might have been re-used by another process.
// Don't delete the file either, it can race with new pid file creation.
// Read `pid_file` module comment for details.
println!(
"No process is holding the pidfile. The process must have already exited. Leave in place to avoid race conditions: {pid_file:?}"
);
return Ok(());
}
PidFileRead::LockedByOtherProcess(pid) => pid,
};
// XXX the pid could become invalid (and recycled) at any time before the kill() below.
if !pid_file.exists() {
println!("{process_name} is already stopped: no pid file {pid_file:?} is present");
return Ok(());
}
let pid = read_pidfile(pid_file)?;
// send signal
let sig = if immediate {
print!("Stopping {process_name} with pid {pid} immediately..");
Signal::SIGQUIT
@@ -170,9 +166,8 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any
match kill(pid, sig) {
Ok(()) => (),
Err(Errno::ESRCH) => {
// Again, don't delete the pid file. The unlink can race with a new pid file being created.
println!(
"{process_name} with pid {pid} does not exist, but a pid file {pid_file:?} was found. Likely the pid got recycled. Lucky we didn't harm anyone."
"{process_name} with pid {pid} does not exist, but a pid file {pid_file:?} was found"
);
return Ok(());
}
@@ -184,6 +179,11 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any
match process_has_stopped(pid) {
Ok(true) => {
println!("\n{process_name} stopped");
if let Err(e) = fs::remove_file(pid_file) {
if e.kind() != io::ErrorKind::NotFound {
eprintln!("Failed to remove pid file {pid_file:?} after stopping the process: {e:#}");
}
}
return Ok(());
}
Ok(false) => {
@@ -209,14 +209,7 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any
}
fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
// If RUST_BACKTRACE is set, pass it through. But if it's not set, default
// to RUST_BACKTRACE=1.
let backtrace_setting = std::env::var_os("RUST_BACKTRACE");
let backtrace_setting = backtrace_setting
.as_deref()
.unwrap_or_else(|| OsStr::new("1"));
let mut filled_cmd = cmd.env_clear().env("RUST_BACKTRACE", backtrace_setting);
let mut filled_cmd = cmd.env_clear().env("RUST_BACKTRACE", "1");
// Pass through these environment variables to the command
for var in ["LLVM_PROFILE_FILE", "FAILPOINTS", "RUST_LOG"] {
@@ -241,69 +234,6 @@ fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
cmd
}
/// Add a `pre_exec` to the cmd that, inbetween fork() and exec(),
/// 1. Claims a pidfile with a fcntl lock on it and
/// 2. Sets up the pidfile's file descriptor so that it (and the lock)
/// will remain held until the cmd exits.
fn pre_exec_create_pidfile<P>(cmd: &mut Command, path: P) -> &mut Command
where
P: Into<PathBuf>,
{
let path: PathBuf = path.into();
// SAFETY
// pre_exec is marked unsafe because it runs between fork and exec.
// Why is that dangerous in various ways?
// Long answer: https://github.com/rust-lang/rust/issues/39575
// Short answer: in a multi-threaded program, other threads may have
// been inside of critical sections at the time of fork. In the
// original process, that was allright, assuming they protected
// the critical sections appropriately, e.g., through locks.
// Fork adds another process to the mix that
// 1. Has a single thread T
// 2. In an exact copy of the address space at the time of fork.
// A variety of problems scan occur now:
// 1. T tries to grab a lock that was locked at the time of fork.
// It will wait forever since in its address space, the lock
// is in state 'taken' but the thread that would unlock it is
// not there.
// 2. A rust object that represented some external resource in the
// parent now got implicitly copied by the the fork, even though
// the object's type is not `Copy`. The parent program may use
// non-copyability as way to enforce unique ownership of an
// external resource in the typesystem. The fork breaks that
// assumption, as now both parent and child process have an
// owned instance of the object that represents the same
// underlying resource.
// While these seem like niche problems, (1) in particular is
// highly relevant. For example, `malloc()` may grab a mutex internally,
// and so, if we forked while another thread was mallocing' and our
// pre_exec closure allocates as well, it will block on the malloc
// mutex forever
//
// The proper solution is to only use C library functions that are marked
// "async-signal-safe": https://man7.org/linux/man-pages/man7/signal-safety.7.html
//
// With this specific pre_exec() closure, the non-error path doesn't allocate.
// The error path uses `anyhow`, and hence does allocate.
// We take our chances there, hoping that any potential disaster is constrained
// to the child process (e.g., malloc has no state ourside of the child process).
// Last, `expect` prints to stderr, and stdio is not async-signal-safe.
// Again, we take our chances, making the same assumptions as for malloc.
unsafe {
cmd.pre_exec(move || {
let file = pid_file::claim_for_current_process(&path).expect("claim pid file");
// Remove the FD_CLOEXEC flag on the pidfile descriptor so that the pidfile
// remains locked after exec.
nix::fcntl::fcntl(file.as_raw_fd(), FcntlArg::F_SETFD(FdFlag::empty()))
.expect("remove FD_CLOEXEC");
// Don't run drop(file), it would close the file before we actually exec.
std::mem::forget(file);
Ok(())
});
}
cmd
}
fn process_started<F>(
pid: Pid,
pid_file_to_check: Option<&Path>,
@@ -314,11 +244,14 @@ where
{
match status_check() {
Ok(true) => match pid_file_to_check {
Some(pid_file_path) => match pid_file::read(pid_file_path)? {
PidFileRead::NotExist => Ok(false),
PidFileRead::LockedByOtherProcess(pid_in_file) => Ok(pid_in_file == pid),
PidFileRead::NotHeldByAnyProcess(_) => Ok(false),
},
Some(pid_file_path) => {
if pid_file_path.exists() {
let pid_in_file = read_pidfile(pid_file_path)?;
Ok(pid_in_file == pid)
} else {
Ok(false)
}
}
None => Ok(true),
},
Ok(false) => Ok(false),
@@ -326,6 +259,21 @@ where
}
}
/// Read a PID file
///
/// We expect a file that contains a single integer.
fn read_pidfile(pidfile: &Path) -> Result<Pid> {
let pid_str = fs::read_to_string(pidfile)
.with_context(|| format!("failed to read pidfile {pidfile:?}"))?;
let pid: i32 = pid_str
.parse()
.map_err(|_| anyhow!("failed to parse pidfile {pidfile:?}"))?;
if pid < 1 {
bail!("pidfile {pidfile:?} contained bad value '{pid}'");
}
Ok(Pid::from_raw(pid))
}
fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
match kill(pid, None) {
// Process exists, keep waiting

View File

@@ -8,10 +8,10 @@
use anyhow::{anyhow, bail, Context, Result};
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
use control_plane::compute::ComputeControlPlane;
use control_plane::local_env::LocalEnv;
use control_plane::local_env::{EtcdBroker, LocalEnv};
use control_plane::pageserver::PageServerNode;
use control_plane::safekeeper::SafekeeperNode;
use control_plane::{broker, local_env};
use control_plane::{etcd, local_env};
use pageserver_api::models::TimelineInfo;
use pageserver_api::{
DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR,
@@ -22,10 +22,9 @@ use safekeeper_api::{
DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
};
use std::collections::{BTreeSet, HashMap};
use std::path::PathBuf;
use std::path::{Path, PathBuf};
use std::process::exit;
use std::str::FromStr;
use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
use utils::{
auth::{Claims, Scope},
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
@@ -42,12 +41,13 @@ project_git_version!(GIT_VERSION);
const DEFAULT_PG_VERSION: &str = "14";
fn default_conf() -> String {
fn default_conf(etcd_binary_path: &Path) -> String {
format!(
r#"
# Default built-in configuration, defined in main.rs
[broker]
listen_addr = '{DEFAULT_BROKER_ADDR}'
[etcd_broker]
broker_endpoints = ['http://localhost:2379']
etcd_binary_path = '{etcd_binary_path}'
[pageserver]
id = {DEFAULT_PAGESERVER_ID}
@@ -60,6 +60,7 @@ id = {DEFAULT_SAFEKEEPER_ID}
pg_port = {DEFAULT_SAFEKEEPER_PG_PORT}
http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT}
"#,
etcd_binary_path = etcd_binary_path.display(),
pageserver_auth_type = AuthType::Trust,
)
}
@@ -263,7 +264,7 @@ fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::R
} else if let Some(default_id) = env.default_tenant_id {
Ok(default_id)
} else {
anyhow::bail!("No tenant id. Use --tenant-id, or set a default tenant");
bail!("No tenant id. Use --tenant-id, or set 'default_tenant_id' in the config file");
}
}
@@ -284,6 +285,8 @@ fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId
}
fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
let initial_timeline_id_arg = parse_timeline_id(init_match)?;
// Create config file
let toml_file: String = if let Some(config_path) = init_match.get_one::<PathBuf>("config") {
// load and parse the file
@@ -295,7 +298,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
})?
} else {
// Built-in default config
default_conf()
default_conf(&EtcdBroker::locate_etcd()?)
};
let pg_version = init_match
@@ -307,16 +310,30 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
env.init(pg_version)
.context("Failed to initialize neon repository")?;
let initial_tenant_id = env
.default_tenant_id
.expect("default_tenant_id should be generated by the `env.init()` call above");
// Initialize pageserver, create initial tenant and timeline.
let pageserver = PageServerNode::from_env(&env);
pageserver
.initialize(&pageserver_config_overrides(init_match))
let initial_timeline_id = pageserver
.initialize(
Some(initial_tenant_id),
initial_timeline_id_arg,
&pageserver_config_overrides(init_match),
pg_version,
)
.unwrap_or_else(|e| {
eprintln!("pageserver init failed: {e:?}");
eprintln!("pageserver init failed: {e}");
exit(1);
});
env.register_branch_mapping(
DEFAULT_BRANCH_NAME.to_owned(),
initial_tenant_id,
initial_timeline_id,
)?;
Ok(env)
}
@@ -325,7 +342,7 @@ fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
.get_many::<String>("pageserver-config-override")
.into_iter()
.flatten()
.map(String::as_str)
.map(|s| s.as_str())
.collect()
}
@@ -372,17 +389,6 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
println!(
"Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {new_tenant_id}",
);
if create_match.get_flag("set-default") {
println!("Setting tenant {new_tenant_id} as a default one");
env.default_tenant_id = Some(new_tenant_id);
}
}
Some(("set-default", set_default_match)) => {
let tenant_id =
parse_tenant_id(set_default_match)?.context("No tenant id specified")?;
println!("Setting tenant {tenant_id} as a default one");
env.default_tenant_id = Some(tenant_id);
}
Some(("config", create_match)) => {
let tenant_id = get_tenant_id(create_match, env)?;
@@ -544,7 +550,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
table.load_preset(comfy_table::presets::NOTHING);
table.set_header([
table.set_header(&[
"NODE",
"ADDRESS",
"TIMELINE",
@@ -579,7 +585,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
.map(|name| name.as_str())
.unwrap_or("?");
table.add_row([
table.add_row(&[
node_name.as_str(),
&node.address.to_string(),
&node.timeline_id.to_string(),
@@ -742,7 +748,7 @@ fn get_safekeeper(env: &local_env::LocalEnv, id: NodeId) -> Result<SafekeeperNod
if let Some(node) = env.safekeepers.iter().find(|node| node.id == id) {
Ok(SafekeeperNode::from_env(env, node))
} else {
bail!("could not find safekeeper {id}")
bail!("could not find safekeeper '{}'", id)
}
}
@@ -801,22 +807,22 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
}
fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> {
etcd::start_etcd_process(env)?;
let pageserver = PageServerNode::from_env(env);
// Postgres nodes are not started automatically
broker::start_broker_process(env)?;
let pageserver = PageServerNode::from_env(env);
if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) {
eprintln!("pageserver {} start failed: {:#}", env.pageserver.id, e);
try_stop_all(env, true);
eprintln!("pageserver start failed: {e}");
try_stop_etcd_process(env);
exit(1);
}
for node in env.safekeepers.iter() {
let safekeeper = SafekeeperNode::from_env(env, node);
if let Err(e) = safekeeper.start() {
eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
try_stop_all(env, false);
eprintln!("safekeeper '{}' start failed: {e}", safekeeper.id);
try_stop_etcd_process(env);
exit(1);
}
}
@@ -827,41 +833,35 @@ fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<
let immediate =
sub_match.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
try_stop_all(env, immediate);
Ok(())
}
fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
let pageserver = PageServerNode::from_env(env);
// Stop all compute nodes
match ComputeControlPlane::load(env.clone()) {
Ok(cplane) => {
for (_k, node) in cplane.nodes {
if let Err(e) = node.stop(false) {
eprintln!("postgres stop failed: {e:#}");
}
}
}
Err(e) => {
eprintln!("postgres stop failed, could not restore control plane data from env: {e:#}")
let cplane = ComputeControlPlane::load(env.clone())?;
for (_k, node) in cplane.nodes {
if let Err(e) = node.stop(false) {
eprintln!("postgres stop failed: {}", e);
}
}
if let Err(e) = pageserver.stop(immediate) {
eprintln!("pageserver {} stop failed: {:#}", env.pageserver.id, e);
eprintln!("pageserver stop failed: {}", e);
}
for node in env.safekeepers.iter() {
let safekeeper = SafekeeperNode::from_env(env, node);
if let Err(e) = safekeeper.stop(immediate) {
eprintln!("safekeeper {} stop failed: {:#}", safekeeper.id, e);
eprintln!("safekeeper '{}' stop failed: {}", safekeeper.id, e);
}
}
if let Err(e) = broker::stop_broker_process(env) {
eprintln!("neon broker stop failed: {e:#}");
try_stop_etcd_process(env);
Ok(())
}
fn try_stop_etcd_process(env: &local_env::LocalEnv) {
if let Err(e) = etcd::stop_etcd_process(env) {
eprintln!("etcd stop failed: {e}");
}
}
@@ -901,7 +901,6 @@ fn cli() -> Command {
let stop_mode_arg = Arg::new("stop-mode")
.short('m')
.value_parser(["fast", "immediate"])
.default_value("fast")
.help("If 'immediate', don't flush repository data at shutdown")
.required(false)
.value_name("stop-mode");
@@ -923,8 +922,9 @@ fn cli() -> Command {
.version(GIT_VERSION)
.subcommand(
Command::new("init")
.about("Initialize a new Neon repository, preparing configs for services to start with")
.about("Initialize a new Neon repository")
.arg(pageserver_config_args.clone())
.arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
.arg(
Arg::new("config")
.long("config")
@@ -986,14 +986,11 @@ fn cli() -> Command {
.arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))
.arg(pg_version_arg.clone())
.arg(Arg::new("set-default").long("set-default").action(ArgAction::SetTrue).required(false)
.help("Use this tenant in future CLI commands where tenant_id is needed, but not specified"))
)
.subcommand(Command::new("set-default").arg(tenant_id_arg.clone().required(true))
.about("Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified"))
.subcommand(Command::new("config")
.arg(tenant_id_arg.clone())
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))
)
)
.subcommand(
Command::new("pageserver")

View File

@@ -1,48 +0,0 @@
use anyhow::Context;
use std::path::PathBuf;
use crate::{background_process, local_env};
pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
let broker = &env.broker;
let listen_addr = &broker.listen_addr;
print!("Starting neon broker at {}", listen_addr);
let args = [format!("--listen-addr={listen_addr}")];
let client = reqwest::blocking::Client::new();
background_process::start_process(
"storage_broker",
&env.base_data_dir,
&env.storage_broker_bin(),
args,
[],
background_process::InitialPidFile::Create(&storage_broker_pid_file_path(env)),
|| {
let url = broker.client_url();
let status_url = url.join("status").with_context(|| {
format!("Failed to append /status path to broker endpoint {url}",)
})?;
let request = client
.get(status_url)
.build()
.with_context(|| format!("Failed to construct request to broker endpoint {url}"))?;
match client.execute(request) {
Ok(resp) => Ok(resp.status().is_success()),
Err(_) => Ok(false),
}
},
)
.context("Failed to spawn storage_broker subprocess")?;
Ok(())
}
pub fn stop_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
background_process::stop_process(true, "storage_broker", &storage_broker_pid_file_path(env))
}
fn storage_broker_pid_file_path(env: &local_env::LocalEnv) -> PathBuf {
env.base_data_dir.join("storage_broker.pid")
}

View File

@@ -44,7 +44,7 @@ impl ComputeControlPlane {
let mut nodes = BTreeMap::default();
let pgdatadirspath = &env.pg_data_dirs_path();
for tenant_dir in fs::read_dir(pgdatadirspath)
for tenant_dir in fs::read_dir(&pgdatadirspath)
.with_context(|| format!("failed to list {}", pgdatadirspath.display()))?
{
let tenant_dir = tenant_dir?;
@@ -67,8 +67,8 @@ impl ComputeControlPlane {
fn get_port(&mut self) -> u16 {
1 + self
.nodes
.values()
.map(|node| node.address.port())
.iter()
.map(|(_name, node)| node.address.port())
.max()
.unwrap_or(self.base_port)
}
@@ -183,7 +183,7 @@ impl PostgresNode {
fn sync_safekeepers(&self, auth_token: &Option<String>, pg_version: u32) -> Result<Lsn> {
let pg_path = self.env.pg_bin_dir(pg_version)?.join("postgres");
let mut cmd = Command::new(pg_path);
let mut cmd = Command::new(&pg_path);
cmd.arg("--sync-safekeepers")
.env_clear()
@@ -201,7 +201,7 @@ impl PostgresNode {
.stderr(Stdio::piped());
if let Some(token) = auth_token {
cmd.env("NEON_AUTH_TOKEN", token);
cmd.env("ZENITH_AUTH_TOKEN", token);
}
let sync_handle = cmd
@@ -261,7 +261,7 @@ impl PostgresNode {
}
fn create_pgdata(&self) -> Result<()> {
fs::create_dir_all(self.pgdata()).with_context(|| {
fs::create_dir_all(&self.pgdata()).with_context(|| {
format!(
"could not create data directory {}",
self.pgdata().display()
@@ -304,17 +304,17 @@ impl PostgresNode {
// Set up authentication
//
// $NEON_AUTH_TOKEN will be replaced with value from environment
// $ZENITH_AUTH_TOKEN will be replaced with value from environment
// variable during compute pg startup. It is done this way because
// otherwise user will be able to retrieve the value using SHOW
// command or pg_settings
let password = if let AuthType::NeonJWT = auth_type {
"$NEON_AUTH_TOKEN"
"$ZENITH_AUTH_TOKEN"
} else {
""
};
// NOTE avoiding spaces in connection string, because it is less error prone if we forward it somewhere.
// Also note that not all parameters are supported here. Because in compute we substitute $NEON_AUTH_TOKEN
// Also note that not all parameters are supported here. Because in compute we substitute $ZENITH_AUTH_TOKEN
// We parse this string and build it back with token from env var, and for simplicity rebuild
// uses only needed variables namely host, port, user, password.
format!("postgresql://no_user:{password}@{host}:{port}")
@@ -323,7 +323,7 @@ impl PostgresNode {
conf.append_line("");
conf.append("neon.pageserver_connstring", &pageserver_connstr);
if let AuthType::NeonJWT = auth_type {
conf.append("neon.safekeeper_token_env", "$NEON_AUTH_TOKEN");
conf.append("neon.safekeeper_token_env", "$ZENITH_AUTH_TOKEN");
}
conf.append("neon.tenant_id", &self.tenant_id.to_string());
conf.append("neon.timeline_id", &self.timeline_id.to_string());
@@ -448,7 +448,7 @@ impl PostgresNode {
self.env.pg_lib_dir(self.pg_version)?.to_str().unwrap(),
);
if let Some(token) = auth_token {
cmd.env("NEON_AUTH_TOKEN", token);
cmd.env("ZENITH_AUTH_TOKEN", token);
}
let pg_ctl = cmd.output().context("pg_ctl failed")?;
@@ -478,7 +478,7 @@ impl PostgresNode {
postgresql_conf_path.to_str().unwrap()
)
})?;
fs::remove_dir_all(self.pgdata())?;
fs::remove_dir_all(&self.pgdata())?;
self.create_pgdata()?;
// 2. Bring back config files
@@ -514,7 +514,7 @@ impl PostgresNode {
"Destroying postgres data directory '{}'",
self.pgdata().to_str().unwrap()
);
fs::remove_dir_all(self.pgdata())?;
fs::remove_dir_all(&self.pgdata())?;
} else {
self.pg_ctl(&["stop"], &None)?;
}

78
control_plane/src/etcd.rs Normal file
View File

@@ -0,0 +1,78 @@
use std::{fs, path::PathBuf};
use anyhow::Context;
use crate::{background_process, local_env};
pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
let etcd_broker = &env.etcd_broker;
print!(
"Starting etcd broker using {:?}",
etcd_broker.etcd_binary_path
);
let etcd_data_dir = env.base_data_dir.join("etcd");
fs::create_dir_all(&etcd_data_dir)
.with_context(|| format!("Failed to create etcd data dir {etcd_data_dir:?}"))?;
let client_urls = etcd_broker.comma_separated_endpoints();
let args = [
format!("--data-dir={}", etcd_data_dir.display()),
format!("--listen-client-urls={client_urls}"),
format!("--advertise-client-urls={client_urls}"),
// Set --quota-backend-bytes to keep the etcd virtual memory
// size smaller. Our test etcd clusters are very small.
// See https://github.com/etcd-io/etcd/issues/7910
"--quota-backend-bytes=100000000".to_string(),
// etcd doesn't compact (vacuum) with default settings,
// enable it to prevent space exhaustion.
"--auto-compaction-mode=revision".to_string(),
"--auto-compaction-retention=1".to_string(),
];
let pid_file_path = etcd_pid_file_path(env);
let client = reqwest::blocking::Client::new();
background_process::start_process(
"etcd",
&etcd_data_dir,
&etcd_broker.etcd_binary_path,
&args,
[],
background_process::InitialPidFile::Create(&pid_file_path),
|| {
for broker_endpoint in &etcd_broker.broker_endpoints {
let request = broker_endpoint
.join("health")
.with_context(|| {
format!(
"Failed to append /health path to broker endopint {}",
broker_endpoint
)
})
.and_then(|url| {
client.get(&url.to_string()).build().with_context(|| {
format!("Failed to construct request to etcd endpoint {url}")
})
})?;
if client.execute(request).is_ok() {
return Ok(true);
}
}
Ok(false)
},
)
.context("Failed to spawn etcd subprocess")?;
Ok(())
}
pub fn stop_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
background_process::stop_process(true, "etcd", &etcd_pid_file_path(env))
}
fn etcd_pid_file_path(env: &local_env::LocalEnv) -> PathBuf {
env.base_data_dir.join("etcd.pid")
}

View File

@@ -8,8 +8,8 @@
//
mod background_process;
pub mod broker;
pub mod compute;
pub mod etcd;
pub mod local_env;
pub mod pageserver;
pub mod postgresql_conf;

View File

@@ -4,16 +4,12 @@
//! script which will use local paths.
use anyhow::{bail, ensure, Context};
use reqwest::Url;
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use std::collections::HashMap;
use std::env;
use std::fs;
use std::net::IpAddr;
use std::net::Ipv4Addr;
use std::net::SocketAddr;
use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
use utils::{
@@ -66,7 +62,7 @@ pub struct LocalEnv {
#[serde(default)]
pub private_key_path: PathBuf,
pub broker: NeonBroker,
pub etcd_broker: EtcdBroker,
pub pageserver: PageServerConf,
@@ -82,26 +78,67 @@ pub struct LocalEnv {
branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
}
/// Broker config for cluster internal communication.
/// Etcd broker config for cluster internal communication.
#[serde_as]
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
#[serde(default)]
pub struct NeonBroker {
/// Broker listen address for storage nodes coordination, e.g. '127.0.0.1:50051'.
pub listen_addr: SocketAddr,
pub struct EtcdBroker {
/// A prefix to all to any key when pushing/polling etcd from a node.
#[serde(default)]
pub broker_etcd_prefix: Option<String>,
/// Broker (etcd) endpoints for storage nodes coordination, e.g. 'http://127.0.0.1:2379'.
#[serde(default)]
#[serde_as(as = "Vec<DisplayFromStr>")]
pub broker_endpoints: Vec<Url>,
/// Etcd binary path to use.
#[serde(default)]
pub etcd_binary_path: PathBuf,
}
// Dummy Default impl to satisfy Deserialize derive.
impl Default for NeonBroker {
fn default() -> Self {
NeonBroker {
listen_addr: SocketAddr::new(IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)), 0),
}
impl EtcdBroker {
pub fn locate_etcd() -> anyhow::Result<PathBuf> {
let which_output = Command::new("which")
.arg("etcd")
.output()
.context("Failed to run 'which etcd' command")?;
let stdout = String::from_utf8_lossy(&which_output.stdout);
ensure!(
which_output.status.success(),
"'which etcd' invocation failed. Status: {}, stdout: {stdout}, stderr: {}",
which_output.status,
String::from_utf8_lossy(&which_output.stderr)
);
let etcd_path = PathBuf::from(stdout.trim());
ensure!(
etcd_path.is_file(),
"'which etcd' invocation was successful, but the path it returned is not a file or does not exist: {}",
etcd_path.display()
);
Ok(etcd_path)
}
}
impl NeonBroker {
pub fn client_url(&self) -> Url {
Url::parse(&format!("http://{}", self.listen_addr)).expect("failed to construct url")
pub fn comma_separated_endpoints(&self) -> String {
self.broker_endpoints
.iter()
.map(|url| {
// URL by default adds a '/' path at the end, which is not what etcd CLI wants.
let url_string = url.as_str();
if url_string.ends_with('/') {
&url_string[0..url_string.len() - 1]
} else {
url_string
}
})
.fold(String::new(), |mut comma_separated_urls, url| {
if !comma_separated_urls.is_empty() {
comma_separated_urls.push(',');
}
comma_separated_urls.push_str(url);
comma_separated_urls
})
}
}
@@ -197,10 +234,6 @@ impl LocalEnv {
self.neon_distrib_dir.join("safekeeper")
}
pub fn storage_broker_bin(&self) -> PathBuf {
self.neon_distrib_dir.join("storage_broker")
}
pub fn pg_data_dirs_path(&self) -> PathBuf {
self.base_data_dir.join("pgdatadirs").join("tenants")
}
@@ -296,6 +329,11 @@ impl LocalEnv {
env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
}
// If no initial tenant ID was given, generate it.
if env.default_tenant_id.is_none() {
env.default_tenant_id = Some(TenantId::generate());
}
env.base_data_dir = base_path();
Ok(env)
@@ -399,7 +437,7 @@ impl LocalEnv {
}
}
fs::create_dir(base_path)?;
fs::create_dir(&base_path)?;
// generate keys for jwt
// openssl genrsa -out private_key.pem 2048
@@ -408,7 +446,7 @@ impl LocalEnv {
private_key_path = base_path.join("auth_private_key.pem");
let keygen_output = Command::new("openssl")
.arg("genrsa")
.args(["-out", private_key_path.to_str().unwrap()])
.args(&["-out", private_key_path.to_str().unwrap()])
.arg("2048")
.stdout(Stdio::null())
.output()
@@ -425,10 +463,10 @@ impl LocalEnv {
// openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem
let keygen_output = Command::new("openssl")
.arg("rsa")
.args(["-in", private_key_path.to_str().unwrap()])
.args(&["-in", private_key_path.to_str().unwrap()])
.arg("-pubout")
.args(["-outform", "PEM"])
.args(["-out", public_key_path.to_str().unwrap()])
.args(&["-outform", "PEM"])
.args(&["-out", public_key_path.to_str().unwrap()])
.stdout(Stdio::null())
.output()
.context("failed to generate auth private key")?;
@@ -473,8 +511,8 @@ mod tests {
"failed to parse simple config {simple_conf_toml}, reason: {simple_conf_parse_result:?}"
);
let string_to_replace = "listen_addr = '127.0.0.1:50051'";
let spoiled_url_str = "listen_addr = '!@$XOXO%^&'";
let string_to_replace = "broker_endpoints = ['http://127.0.0.1:2379']";
let spoiled_url_str = "broker_endpoints = ['!@$XOXO%^&']";
let spoiled_url_toml = simple_conf_toml.replace(string_to_replace, spoiled_url_str);
assert!(
spoiled_url_toml.contains(spoiled_url_str),

View File

@@ -1,10 +1,9 @@
use std::borrow::Cow;
use std::collections::HashMap;
use std::fs::File;
use std::fs::{self, File};
use std::io::{BufReader, Write};
use std::num::NonZeroU64;
use std::path::PathBuf;
use std::process::{Child, Command};
use std::path::{Path, PathBuf};
use std::process::Child;
use std::{io, result};
use anyhow::{bail, Context};
@@ -97,8 +96,13 @@ impl PageServerNode {
}
}
// pageserver conf overrides defined by neon_local configuration.
fn neon_local_overrides(&self) -> Vec<String> {
pub fn initialize(
&self,
create_tenant: Option<TenantId>,
initial_timeline_id: Option<TimelineId>,
config_overrides: &[&str],
pg_version: u32,
) -> anyhow::Result<TimelineId> {
let id = format!("id={}", self.env.pageserver.id);
// FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
let pg_distrib_dir_param = format!(
@@ -113,32 +117,98 @@ impl PageServerNode {
);
let listen_pg_addr_param =
format!("listen_pg_addr='{}'", self.env.pageserver.listen_pg_addr);
let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
let broker_endpoints_param = format!(
"broker_endpoints=[{}]",
self.env
.etcd_broker
.broker_endpoints
.iter()
.map(|url| format!("'{url}'"))
.collect::<Vec<_>>()
.join(",")
);
let broker_etcd_prefix_param = self
.env
.etcd_broker
.broker_etcd_prefix
.as_ref()
.map(|prefix| format!("broker_etcd_prefix='{prefix}'"));
let mut overrides = vec![
id,
pg_distrib_dir_param,
authg_type_param,
listen_http_addr_param,
listen_pg_addr_param,
broker_endpoint_param,
];
let mut init_config_overrides = config_overrides.to_vec();
init_config_overrides.push(&id);
init_config_overrides.push(&pg_distrib_dir_param);
init_config_overrides.push(&authg_type_param);
init_config_overrides.push(&listen_http_addr_param);
init_config_overrides.push(&listen_pg_addr_param);
init_config_overrides.push(&broker_endpoints_param);
if let Some(broker_etcd_prefix_param) = broker_etcd_prefix_param.as_deref() {
init_config_overrides.push(broker_etcd_prefix_param);
}
if self.env.pageserver.auth_type != AuthType::Trust {
overrides.push("auth_validation_public_key_path='auth_public_key.pem'".to_owned());
init_config_overrides.push("auth_validation_public_key_path='auth_public_key.pem'");
}
overrides
let mut pageserver_process = self
.start_node(&init_config_overrides, &self.env.base_data_dir, true)
.with_context(|| {
format!(
"Failed to start a process for pageserver {}",
self.env.pageserver.id,
)
})?;
let init_result = self
.try_init_timeline(create_tenant, initial_timeline_id, pg_version)
.context("Failed to create initial tenant and timeline for pageserver");
match &init_result {
Ok(initial_timeline_id) => {
println!("Successfully initialized timeline {initial_timeline_id}")
}
Err(e) => eprintln!("{e:#}"),
}
match pageserver_process.kill() {
Err(e) => {
eprintln!(
"Failed to stop pageserver {} process with pid {}: {e:#}",
self.env.pageserver.id,
pageserver_process.id(),
)
}
Ok(()) => {
println!(
"Stopped pageserver {} process with pid {}",
self.env.pageserver.id,
pageserver_process.id(),
);
// cleanup after pageserver startup, since we do not call regular `stop_process` during init
let pid_file = self.pid_file();
if let Err(e) = fs::remove_file(&pid_file) {
if e.kind() != io::ErrorKind::NotFound {
eprintln!("Failed to remove pid file {pid_file:?} after stopping the process: {e:#}");
}
}
}
}
init_result
}
/// Initializes a pageserver node by creating its config with the overrides provided.
pub fn initialize(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
// First, run `pageserver --init` and wait for it to write a config into FS and exit.
self.pageserver_init(config_overrides).with_context(|| {
format!(
"Failed to run init for pageserver node {}",
self.env.pageserver.id,
)
})
fn try_init_timeline(
&self,
new_tenant_id: Option<TenantId>,
new_timeline_id: Option<TimelineId>,
pg_version: u32,
) -> anyhow::Result<TimelineId> {
let initial_tenant_id = self.tenant_create(new_tenant_id, HashMap::new())?;
let initial_timeline_info = self.timeline_create(
initial_tenant_id,
new_timeline_id,
None,
None,
Some(pg_version),
)?;
Ok(initial_timeline_info.timeline_id)
}
pub fn repo_path(&self) -> PathBuf {
@@ -153,73 +223,52 @@ impl PageServerNode {
}
pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
self.start_node(config_overrides, false)
self.start_node(config_overrides, &self.repo_path(), false)
}
fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
let datadir = self.repo_path();
let node_id = self.env.pageserver.id;
println!(
"Initializing pageserver node {} at '{}' in {:?}",
node_id,
self.pg_connection_config.raw_address(),
datadir
);
io::stdout().flush()?;
let datadir_path_str = datadir.to_str().with_context(|| {
format!("Cannot start pageserver node {node_id} in path that has no string representation: {datadir:?}")
})?;
let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str);
args.push(Cow::Borrowed("--init"));
let init_output = Command::new(self.env.pageserver_bin())
.args(args.iter().map(Cow::as_ref))
.envs(self.pageserver_env_variables()?)
.output()
.with_context(|| format!("Failed to run pageserver init for node {node_id}"))?;
anyhow::ensure!(
init_output.status.success(),
"Pageserver init for node {} did not finish successfully, stdout: {}, stderr: {}",
node_id,
String::from_utf8_lossy(&init_output.stdout),
String::from_utf8_lossy(&init_output.stderr),
);
Ok(())
}
fn start_node(&self, config_overrides: &[&str], update_config: bool) -> anyhow::Result<Child> {
let mut overrides = self.neon_local_overrides();
overrides.extend(config_overrides.iter().map(|&c| c.to_owned()));
let datadir = self.repo_path();
fn start_node(
&self,
config_overrides: &[&str],
datadir: &Path,
update_config: bool,
) -> anyhow::Result<Child> {
print!(
"Starting pageserver node {} at '{}' in {:?}",
self.env.pageserver.id,
"Starting pageserver at '{}' in '{}'",
self.pg_connection_config.raw_address(),
datadir
datadir.display()
);
io::stdout().flush()?;
let datadir_path_str = datadir.to_str().with_context(|| {
format!(
"Cannot start pageserver node {} in path that has no string representation: {:?}",
self.env.pageserver.id, datadir,
)
})?;
let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str);
let mut args = vec![
"-D",
datadir.to_str().with_context(|| {
format!("Datadir path {datadir:?} cannot be represented as a unicode string")
})?,
];
if update_config {
args.push(Cow::Borrowed("--update-config"));
args.push("--update-config");
}
for config_override in config_overrides {
args.extend(["-c", config_override]);
}
let envs = if self.env.pageserver.auth_type != AuthType::Trust {
// Generate a token to connect from the pageserver to a safekeeper
let token = self
.env
.generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?;
vec![("ZENITH_AUTH_TOKEN".to_owned(), token)]
} else {
vec![]
};
background_process::start_process(
"pageserver",
&datadir,
datadir,
&self.env.pageserver_bin(),
args.iter().map(Cow::as_ref),
self.pageserver_env_variables()?,
&args,
envs,
background_process::InitialPidFile::Expect(&self.pid_file()),
|| match self.check_status() {
Ok(()) => Ok(true),
@@ -229,35 +278,6 @@ impl PageServerNode {
)
}
fn pageserver_basic_args<'a>(
&self,
config_overrides: &'a [&'a str],
datadir_path_str: &'a str,
) -> Vec<Cow<'a, str>> {
let mut args = vec![Cow::Borrowed("-D"), Cow::Borrowed(datadir_path_str)];
let mut overrides = self.neon_local_overrides();
overrides.extend(config_overrides.iter().map(|&c| c.to_owned()));
for config_override in overrides {
args.push(Cow::Borrowed("-c"));
args.push(Cow::Owned(config_override));
}
args
}
fn pageserver_env_variables(&self) -> anyhow::Result<Vec<(String, String)>> {
Ok(if self.env.pageserver.auth_type != AuthType::Trust {
// Generate a token to connect from the pageserver to a safekeeper
let token = self
.env
.generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?;
vec![("NEON_AUTH_TOKEN".to_owned(), token)]
} else {
Vec::new()
})
}
///
/// Stop the server.
///

View File

@@ -131,8 +131,13 @@ impl SafekeeperNode {
args.push("--no-sync");
}
let broker_endpoint = format!("{}", self.env.broker.client_url());
args.extend(["--broker-endpoint", &broker_endpoint]);
let comma_separated_endpoints = self.env.etcd_broker.comma_separated_endpoints();
if !comma_separated_endpoints.is_empty() {
args.extend(["--broker-endpoints", &comma_separated_endpoints]);
}
if let Some(prefix) = self.env.etcd_broker.broker_etcd_prefix.as_deref() {
args.extend(["--broker-etcd-prefix", prefix]);
}
let mut backup_threads = String::new();
if let Some(threads) = self.conf.backup_threads {

View File

@@ -1,90 +0,0 @@
# This file was auto-generated using `cargo deny init`.
# cargo-deny is a cargo plugin that lets you lint your project's
# dependency graph to ensure all your dependencies conform
# to your expectations and requirements.
# Root options
targets = []
all-features = false
no-default-features = false
feature-depth = 1
# This section is considered when running `cargo deny check advisories`
# More documentation for the advisories section can be found here:
# https://embarkstudios.github.io/cargo-deny/checks/advisories/cfg.html
[advisories]
db-urls = ["https://github.com/rustsec/advisory-db"]
vulnerability = "deny"
unmaintained = "warn"
yanked = "warn"
notice = "warn"
ignore = []
# This section is considered when running `cargo deny check licenses`
# More documentation for the licenses section can be found here:
# https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html
[licenses]
unlicensed = "deny"
allow = [
"Apache-2.0",
"Artistic-2.0",
"BSD-2-Clause",
"BSD-3-Clause",
"ISC",
"MIT",
"MPL-2.0",
"OpenSSL",
"Unicode-DFS-2016",
]
deny = []
copyleft = "warn"
allow-osi-fsf-free = "neither"
default = "deny"
confidence-threshold = 0.8
exceptions = [
# Zlib license has some restrictions if we decide to change sth
{ allow = ["Zlib"], name = "const_format_proc_macros", version = "*" },
{ allow = ["Zlib"], name = "const_format", version = "*" },
]
[[licenses.clarify]]
name = "ring"
version = "*"
expression = "MIT AND ISC AND OpenSSL"
license-files = [
{ path = "LICENSE", hash = 0xbd0eed23 }
]
[licenses.private]
ignore = true
registries = []
# This section is considered when running `cargo deny check bans`.
# More documentation about the 'bans' section can be found here:
# https://embarkstudios.github.io/cargo-deny/checks/bans/cfg.html
[bans]
multiple-versions = "warn"
wildcards = "allow"
highlight = "all"
workspace-default-features = "allow"
external-default-features = "allow"
allow = []
deny = []
skip = []
skip-tree = []
# This section is considered when running `cargo deny check sources`.
# More documentation about the 'sources' section can be found here:
# https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html
[sources]
unknown-registry = "warn"
unknown-git = "warn"
allow-registry = ["https://github.com/rust-lang/crates.io-index"]
allow-git = []
[sources.allow-org]
github = [
"neondatabase",
]
gitlab = []
bitbucket = []

View File

@@ -1,6 +1,29 @@
version: '3'
services:
etcd:
restart: always
image: quay.io/coreos/etcd:v3.5.4
ports:
- 2379:2379
- 2380:2380
environment:
# This signifficantly speeds up etcd and we anyway don't data persistency there.
ETCD_UNSAFE_NO_FSYNC: "1"
command:
- "etcd"
- "--auto-compaction-mode=revision"
- "--auto-compaction-retention=1"
- "--name=etcd-cluster"
- "--initial-cluster-state=new"
- "--initial-cluster-token=etcd-cluster-1"
- "--initial-cluster=etcd-cluster=http://etcd:2380"
- "--initial-advertise-peer-urls=http://etcd:2380"
- "--advertise-client-urls=http://etcd:2379"
- "--listen-client-urls=http://0.0.0.0:2379"
- "--listen-peer-urls=http://0.0.0.0:2380"
- "--quota-backend-bytes=134217728" # 128 MB
minio:
restart: always
image: quay.io/minio/minio:RELEASE.2022-10-20T00-55-09Z
@@ -33,7 +56,7 @@ services:
restart: always
image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
environment:
- BROKER_ENDPOINT='http://storage_broker:50051'
- BROKER_ENDPOINT='http://etcd:2379'
- AWS_ACCESS_KEY_ID=minio
- AWS_SECRET_ACCESS_KEY=password
#- RUST_BACKTRACE=1
@@ -45,7 +68,7 @@ services:
- "-c"
command:
- "/usr/local/bin/pageserver -D /data/.neon/
-c \"broker_endpoint=$$BROKER_ENDPOINT\"
-c \"broker_endpoints=[$$BROKER_ENDPOINT]\"
-c \"listen_pg_addr='0.0.0.0:6400'\"
-c \"listen_http_addr='0.0.0.0:9898'\"
-c \"remote_storage={endpoint='http://minio:9000',
@@ -53,7 +76,7 @@ services:
bucket_region='eu-north-1',
prefix_in_bucket='/pageserver/'}\""
depends_on:
- storage_broker
- etcd
- minio_create_buckets
safekeeper1:
@@ -62,7 +85,7 @@ services:
environment:
- SAFEKEEPER_ADVERTISE_URL=safekeeper1:5454
- SAFEKEEPER_ID=1
- BROKER_ENDPOINT=http://storage_broker:50051
- BROKER_ENDPOINT=http://etcd:2379
- AWS_ACCESS_KEY_ID=minio
- AWS_SECRET_ACCESS_KEY=password
#- RUST_BACKTRACE=1
@@ -76,14 +99,14 @@ services:
- "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL
--listen-http='0.0.0.0:7676'
--id=$$SAFEKEEPER_ID
--broker-endpoint=$$BROKER_ENDPOINT
--broker-endpoints=$$BROKER_ENDPOINT
-D /data
--remote-storage=\"{endpoint='http://minio:9000',
bucket_name='neon',
bucket_region='eu-north-1',
prefix_in_bucket='/safekeeper/'}\""
depends_on:
- storage_broker
- etcd
- minio_create_buckets
safekeeper2:
@@ -92,7 +115,7 @@ services:
environment:
- SAFEKEEPER_ADVERTISE_URL=safekeeper2:5454
- SAFEKEEPER_ID=2
- BROKER_ENDPOINT=http://storage_broker:50051
- BROKER_ENDPOINT=http://etcd:2379
- AWS_ACCESS_KEY_ID=minio
- AWS_SECRET_ACCESS_KEY=password
#- RUST_BACKTRACE=1
@@ -106,14 +129,14 @@ services:
- "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL
--listen-http='0.0.0.0:7676'
--id=$$SAFEKEEPER_ID
--broker-endpoint=$$BROKER_ENDPOINT
--broker-endpoints=$$BROKER_ENDPOINT
-D /data
--remote-storage=\"{endpoint='http://minio:9000',
bucket_name='neon',
bucket_region='eu-north-1',
prefix_in_bucket='/safekeeper/'}\""
depends_on:
- storage_broker
- etcd
- minio_create_buckets
safekeeper3:
@@ -122,7 +145,7 @@ services:
environment:
- SAFEKEEPER_ADVERTISE_URL=safekeeper3:5454
- SAFEKEEPER_ID=3
- BROKER_ENDPOINT=http://storage_broker:50051
- BROKER_ENDPOINT=http://etcd:2379
- AWS_ACCESS_KEY_ID=minio
- AWS_SECRET_ACCESS_KEY=password
#- RUST_BACKTRACE=1
@@ -136,25 +159,16 @@ services:
- "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL
--listen-http='0.0.0.0:7676'
--id=$$SAFEKEEPER_ID
--broker-endpoint=$$BROKER_ENDPOINT
--broker-endpoints=$$BROKER_ENDPOINT
-D /data
--remote-storage=\"{endpoint='http://minio:9000',
bucket_name='neon',
bucket_region='eu-north-1',
prefix_in_bucket='/safekeeper/'}\""
depends_on:
- storage_broker
- etcd
- minio_create_buckets
storage_broker:
restart: always
image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
ports:
- 50051:50051
command:
- "storage_broker"
- "--listen-addr=0.0.0.0:50051"
compute:
restart: always
build:

View File

@@ -2,7 +2,7 @@
### Overview
We use JWT tokens in communication between almost all components (compute, pageserver, safekeeper, CLI) regardless of the protocol used (HTTP/PostgreSQL).
storage_broker currently has no authentication.
Etcd currently has no authentication.
Authentication is optional and is disabled by default for easier debugging.
It is used in some tests, though.
Note that we do not cover authentication with `pg.neon.tech` here.
@@ -65,7 +65,7 @@ There is no administrative API except those provided by PostgreSQL.
#### Outgoing connections
Compute connects to Pageserver for getting pages.
The connection string is configured by the `neon.pageserver_connstring` PostgreSQL GUC, e.g. `postgresql://no_user:$NEON_AUTH_TOKEN@localhost:15028`.
The connection string is configured by the `neon.pageserver_connstring` PostgreSQL GUC, e.g. `postgresql://no_user:$ZENITH_AUTH_TOKEN@localhost:15028`.
The environment variable inside the connection string is substituted with
the JWT token.
@@ -77,14 +77,14 @@ If the GUC is unset, no token is passed.
Note that both tokens can be (and typically are) the same;
the scope is the tenant and the token is usually passed through the
`$NEON_AUTH_TOKEN` environment variable.
`$ZENITH_AUTH_TOKEN` environment variable.
### Pageserver
#### Overview
Pageserver keeps track of multiple tenants, each having multiple timelines.
For each timeline, it connects to the corresponding Safekeeper.
Information about "corresponding Safekeeper" is published by Safekeepers
in the storage_broker, but they do not publish access tokens, otherwise what is
in the Etcd, but they do not publish access tokens, otherwise what is
the point of authentication.
Pageserver keeps a connection to some set of Safekeepers, which
@@ -114,7 +114,7 @@ either of three values:
Pageserver makes a connection to a Safekeeper for each active timeline.
As Pageserver may want to access any timeline it has on the disk,
it is given a blanket JWT token to access any data on any Safekeeper.
This token is passed through an environment variable called `NEON_AUTH_TOKEN`
This token is passed through an environment variable called `ZENITH_AUTH_TOKEN`
(non-configurable as of writing this text).
A better way _may be_ to store JWT token for each timeline next to it,

View File

@@ -1,115 +0,0 @@
### Overview
Pageserver and proxy periodically collect consumption metrics and push them to a HTTP endpoint.
This doc describes current implementation details.
For design details see [the RFC](./rfcs/021-metering.md) and [the discussion on Github](https://github.com/neondatabase/neon/pull/2884).
- The metrics are collected in a separate thread, and the collection interval and endpoint are configurable.
- Metrics are cached, so that we don't send unchanged metrics on every iteration.
- Metrics are sent in batches of 1000 (see CHUNK_SIZE const) metrics max with no particular grouping guarantees.
batch format is
```json
{ "events" : [metric1, metric2, ...]]}
```
See metric format examples below.
- All metrics values are in bytes, unless otherwise specified.
- Currently no retries are implemented.
### Pageserver metrics
#### Configuration
The endpoint and the collection interval are specified in the pageserver config file (or can be passed as command line arguments):
`metric_collection_endpoint` defaults to None, which means that metric collection is disabled by default.
`metric_collection_interval` defaults to 10min
#### Metrics
Currently, the following metrics are collected:
- `written_size`
Amount of WAL produced , by a timeline, i.e. last_record_lsn
This is an absolute, per-timeline metric.
- `resident_size`
Size of all the layer files in the tenant's directory on disk on the pageserver.
This is an absolute, per-tenant metric.
- `remote_storage_size`
Size of the remote storage (S3) directory.
This is an absolute, per-tenant metric.
- `timeline_logical_size`
Logical size of the data in the timeline
This is an absolute, per-timeline metric.
- `synthetic_storage_size`
Size of all tenant's branches including WAL
This is the same metric that `tenant/{tenant_id}/size` endpoint returns.
This is an absolute, per-tenant metric.
Synthetic storage size is calculated in a separate thread, so it might be slightly outdated.
#### Format example
```json
{
"metric": "remote_storage_size",
"type": "absolute",
"time": "2022-12-28T11:07:19.317310284Z",
"idempotency_key": "2022-12-28 11:07:19.317310324 UTC-1-4019",
"value": 12345454,
"tenant_id": "5d07d9ce9237c4cd845ea7918c0afa7d",
"timeline_id": "a03ebb4f5922a1c56ff7485cc8854143",
}
```
`idempotency_key` is a unique key for each metric, so that we can deduplicate metrics.
It is a combination of the time, node_id and a random number.
### Proxy consumption metrics
#### Configuration
The endpoint and the collection interval can be passed as command line arguments for proxy:
`metric_collection_endpoint` no default, which means that metric collection is disabled by default.
`metric_collection_interval` no default
#### Metrics
Currently, only one proxy metric is collected:
- `proxy_io_bytes_per_client`
Outbound traffic per client.
This is an incremental, per-endpoint metric.
#### Format example
```json
{
"metric": "proxy_io_bytes_per_client",
"type": "incremental",
"start_time": "2022-12-28T11:07:19.317310284Z",
"stop_time": "2022-12-28T11:07:19.317310284Z",
"idempotency_key": "2022-12-28 11:07:19.317310324 UTC-1-4019",
"value": 12345454,
"endpoint_id": "5d07d9ce9237c4cd845ea7918c0afa7d",
}
```
The metric is incremental, so the value is the difference between the current and the previous value.
If there is no previous value, the value, the value is the current value and the `start_time` equals `stop_time`.
### TODO
- [ ] Handle errors better: currently if one tenant fails to gather metrics, the whole iteration fails and metrics are not sent for any tenant.
- [ ] Add retries
- [ ] Tune the interval

View File

@@ -23,9 +23,9 @@ We build all images after a successful `release` tests run and push automaticall
You can see a [docker compose](https://docs.docker.com/compose/) example to create a neon cluster in [/docker-compose/docker-compose.yml](/docker-compose/docker-compose.yml). It creates the following conatainers.
- etcd x 1
- pageserver x 1
- safekeeper x 3
- storage_broker x 1
- compute x 1
- MinIO x 1 # This is Amazon S3 compatible object storage
@@ -41,7 +41,7 @@ $ cd docker-compose/docker-compose.yml
$ docker-compose down # remove the conainers if exists
$ PG_VERSION=15 TAG=2221 docker-compose up --build -d # You can specify the postgres and image version
Creating network "dockercompose_default" with the default driver
Creating docker-compose_storage_broker_1 ... done
Creating dockercompose_etcd3_1 ...
(...omit...)
```

View File

@@ -1,186 +0,0 @@
# Consumption tracking
# Goals
This proposal is made with two mostly but not entirely overlapping goals:
* Collect info that is needed for consumption-based billing
* Cross-check AWS bills
# Metrics
There are six metrics to collect:
* CPU time. Wall clock seconds * the current number of cores. We have a fixed ratio of memory to cores, so the current memory size is the function of the number of cores. Measured per each `endpoint`.
* Traffic. In/out traffic on the proxy. Measured per each `endpoint`.
* Written size. Amount of data we write. That is different from both traffic and storage size, as only during the writing we
a) occupy some disk bandwidth on safekeepers
b) necessarily cross AZ boundaries delivering WAL to all safekeepers
Each timeline/branch has at most one writer, so the data is collected per branch.
* Synthetic storage size. That is what is exposed now with pageserver's `/v1/tenant/{}/size`. Looks like now it is per-tenant. (Side note: can we make it per branch to show as branch physical size in UI?)
* Real storage size. That is the size of the tenant directory on the pageservers disk. Per-tenant.
* S3 storage size. That is the size of the tenant data on S3. Per-tenant.
That info should be enough to build an internal model that predicts AWS price (hence tracking `written data` and `real storage size`). As for the billing model we probably can get away with mentioning only `CPU time`, `synthetic storage size`, and `traffic` consumption.
# Services participating in metrics collection
## Proxy
For actual implementation details check `/docs/consumption_metrics.md`
Proxy is the only place that knows about traffic flow, so it tracks it and reports it with quite a small interval, let's say 1 minute. A small interval is needed here since the proxy is stateless, and any restart will reset accumulated consumption. Also proxy should report deltas since the last report, not an absolute value of the counter. Such kind of events is easier to integrate over a period of time to get the amount of traffic during some time interval.
Example event:
```json
{
"metric": "proxy_io_bytes_per_client",
"type": "incremental",
"start_time": "2022-12-28T11:07:19.317310284Z",
"stop_time": "2022-12-28T11:07:19.317310284Z",
"idempotency_key": "2022-12-28 11:07:19.317310324 UTC-1-4019",
"value": 12345454,
"endpoint_id": "5d07d9ce9237c4cd845ea7918c0afa7d",
}
```
Since we report deltas over some period of time, it makes sense to include `event_start_time`/`event_stop_time` where `event_start_time` is the time of the previous report. That will allow us to identify metering gaps better (e.g., failed send/delivery).
When there is no active connection proxy can avoid reporting anything. Also, deltas are additive, so several console instances serving the same user and endpoint can report traffic without coordination.
## Console
The console knows about start/stop events, so it knows the amount of CPU time allocated to each endpoint. It also knows about operation successes and failures and can avoid billing clients after unsuccessful 'suspend' events. The console doesn't know the current compute size within the allowed limits on the endpoint. So with CPU time, we do the following:
* While we don't yet have the autoscaling console can report `cpu time` as the number of seconds since the last `start_compute` event.
* When we have autoscaling, `autoscaler-agent` can report `cpu time`*`compute_units_count` in the same increments as the proxy reports traffic.
Example event:
```json
{
"metric": "effective_compute_seconds",
"type": "increment",
"endpoint_id": "blazing-warrior-34",
"event_start_time": ...,
"event_stop_time": ...,
"value": 12345454,
}
```
I'd also suggest reporting one value, `cpu time`*`compute_units_count`, instead of two separate fields as it makes event schema simpler (it is possible to treat it the same way as traffic) and preserves additivity.
## Pageserver
For actual implementation details check `/docs/consumption_metrics.md`
Pageserver knows / has access to / can calculate the rest of the metrics:
* Written size -- that is basically `last_received_lsn`,
* Synthetic storage size -- there is a way to calculate it, albeit a costly one,
* Real storage size -- there is a way to calculate it using a layer map or filesystem,
* S3 storage size -- can calculate it by S3 API calls
Some of those metrics are expensive to calculate, so the reporting period here is driven mainly by implementation details. We can set it to, for example, once per hour. Not a big deal since the pageserver is stateful, and all metrics can be reported as an absolute value, not increments. At the same time, a smaller reporting period improves UX, so it would be good to have something more real-time.
`written size` is primarily a safekeeper-related metric, but since it is available on both pageserver and safekeeper, we can avoid reporting anything from the safekeeper.
Example event:
```json
{
"metric": "remote_storage_size",
"type": "absolute",
"time": "2022-12-28T11:07:19.317310284Z",
"idempotency_key": "2022-12-28 11:07:19.317310324 UTC-1-4019",
"value": 12345454,
"tenant_id": "5d07d9ce9237c4cd845ea7918c0afa7d",
"timeline_id": "a03ebb4f5922a1c56ff7485cc8854143",
}
```
# Data collection
## Push vs. pull
We already have pull-based Prometheus metrics, so it is tempting to use them here too. However, in our setup, it is hard to tell when some metric changes. For example, garbage collection will constantly free some disk space over a week, even if the project is down for that week. We could also iterate through all existing tenants/branches/endpoints, but that means some amount of code to do that properly and most likely we will end up with some per-metric hacks in the collector to cut out some of the tenants that are surely not changing that metric.
With the push model, it is easier to publish data only about actively changing metrics -- pageserver knows when it performs s3 offloads, garbage collection and starts/stops consuming data from the safekeeper; proxy knows about connected clients; console / autoscaler-agent knows about active cpu time.
Hence, let's go with a push-based model.
## Common bus vs. proxying through the console
We can implement such push systems in a few ways:
a. Each component pushes its metrics to the "common bus", namely segment, Kafka, or something similar. That approach scales well, but it would be harder to test it locally, will introduce new dependencies, we will have to distribute secrets for that connection to all of the components, etc. We would also have to loop back some of the events and their aggregates to the console, as we want to show some that metrics to the user in real-time.
b. Each component can call HTTP `POST` with its events to the console, and the console can forward it to the segment for later integration with metronome / orb / onebill / etc. With that approach, only the console has to speak with segment. Also since that data passes through the console, the console can save the latest metrics values, so there is no need for constant feedback of that events back from the segment.
# Implementation
Each (proxy|pageserver|autoscaler-agent) sends consumption events to the single endpoint in the console:
```json
POST /usage_events HTTP/1.1
Content-Type: application/json
[
{
"metric": "remote_storage_size",
"type": "absolute",
"time": "2022-12-28T11:07:19.317310284Z",
"idempotency_key": "2022-12-28 11:07:19.317310324 UTC-1-4019",
"value": 12345454,
"tenant_id": "5d07d9ce9237c4cd845ea7918c0afa7d",
"timeline_id": "a03ebb4f5922a1c56ff7485cc8854143",
},
...
]
```
![data flow](./images/metering.jpg)
Events could be either:
* `incremental` -- change in consumption since the previous event or service restart. That is `effective_cpu_seconds`, `traffic_in_bytes`, and `traffic_out_bytes`.
* `absolute` -- that is the current value of a metric. All of the size-related metrics are absolute.
Each service can post events at its own pace and bundle together data from different tenants/endpoints.
The console algorithm upon receive of events could be the following:
1. Create and send a segment event with the same content (possibly enriching it with tenant/timeline data for endpoint-based events).
2. Update the latest state of per-tenant and per-endpoint metrics in the database.
3. Check whether any of that metrics is above the allowed threshold and stop the project if necessary.
Since all the data comes in batches, we can do the batch update to reduce the number of queries in the database. Proxy traffic is probably the most frequent metric, so with batching, we will have extra `number_of_proxies` requests to the database each minute. This is most likely fine for now but will generate many dead tuples in the console database. If that is the case, we can change step 2 to the following:
2.1. Check if there $tenant_$metric / $endpoint_$metric key in Redis
2.2. If no stored value is found and the metric is incremental, then fetch the current value from DWH (which keeps aggregated value for all the events) and publish it.
2.3. Publish a new value (absolute metric) or add an increment to the stored value (incremental metric)
## Consumption watchdog
Since all the data goes through the console, we don't have to run any background thread/coroutines to check whether consumption is within the allowed limits. We only change consumption with `POST /usage_events`, so limit checks could be applied in the same handler.
## Extensibility
If we need to add a new metric (e.g. s3 traffic or something else), the console code should, by default, process it and publish segment event, even if the metric name is unknown to the console.
## Naming & schema
Each metric name should end up with units -- now `_seconds` and `_bytes`, and segment event should always have `tenant_id` and `timeline_id`/`endpoint_id` where applicable.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 232 KiB

View File

@@ -10,6 +10,7 @@ the values in the config file, if any are specified for the same key and get int
```toml
# Initial configuration file created by 'pageserver --init'
listen_pg_addr = '127.0.0.1:64000'
listen_http_addr = '127.0.0.1:9898'
@@ -24,12 +25,13 @@ max_file_descriptors = '100'
# initial superuser role name to use when creating a new tenant
initial_superuser_name = 'cloud_admin'
broker_endpoint = 'http://127.0.0.1:50051'
broker_etcd_prefix = 'neon'
broker_endpoints = ['some://etcd']
# [remote_storage]
```
The config above shows default values for all basic pageserver settings, besides `broker_endpoint`: that one has to be set by the user,
The config above shows default values for all basic pageserver settings, besides `broker_endpoints`: that one has to be set by the user,
see the corresponding section below.
Pageserver uses default values for all files that are missing in the config, so it's not a hard error to leave the config blank.
Yet, it validates the config values it can (e.g. postgres install dir) and errors if the validation fails, refusing to start.
@@ -48,10 +50,16 @@ Example: `${PAGESERVER_BIN} -c "checkpoint_timeout = '10 m'" -c "remote_storage=
Note that TOML distinguishes between strings and integers, the former require single or double quotes around them.
#### broker_endpoint
#### broker_endpoints
A storage broker endpoint to connect and pull the information from. Default is
`'http://127.0.0.1:50051'`.
A list of endpoints (etcd currently) to connect and pull the information from.
Mandatory, does not have a default, since requires etcd to be started as a separate process,
and its connection url should be specified separately.
#### broker_etcd_prefix
A prefix to add for every etcd key used, to separate one group of related instances from another, in the same cluster.
Default is `neon`.
#### checkpoint_distance

View File

@@ -18,6 +18,10 @@ Intended to be used in integration tests and in CLI tools for local installation
Documentation of the Neon features and concepts.
Now it is mostly dev documentation.
`/monitoring`:
TODO
`/pageserver`:
Neon storage service.
@@ -41,9 +45,9 @@ and create new databases and accounts (control plane API in our case).
Integration tests, written in Python using the `pytest` framework.
`/vendor/postgres-v14` and `/vendor/postgres-v15`:
`/vendor/postgres-v14`:
PostgreSQL source tree per version, with the modifications needed for Neon.
PostgreSQL source tree, with the modifications needed for Neon.
`/pgxn/neon`:
@@ -94,13 +98,6 @@ cargo hakari manage-deps
If you don't have hakari installed (`error: no such subcommand: hakari`), install it by running `cargo install cargo-hakari`.
### Checking Rust 3rd-parties
[Cargo deny](https://embarkstudios.github.io/cargo-deny/index.html) is a cargo plugin that lets us lint project's dependency graph to ensure all dependencies conform to requirements. It detects security issues, matches licenses, and ensures crates only come from trusted sources.
```bash
cargo deny check
```
## Using Python
Note that Debian/Ubuntu Python packages are stale, as it commonly happens,
so manual installation of dependencies is not recommended.

View File

@@ -1,16 +0,0 @@
[package]
name = "consumption_metrics"
version = "0.1.0"
edition = "2021"
license = "Apache-2.0"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
anyhow = "1.0.68"
chrono = { version = "0.4", default-features = false, features = ["clock", "serde"] }
rand = "0.8.3"
serde = "1.0.152"
serde_with = "2.1.0"
utils = { version = "0.1.0", path = "../utils" }
workspace_hack = { version = "0.1.0", path = "../../workspace_hack" }

View File

@@ -1,50 +0,0 @@
//!
//! Shared code for consumption metics collection
//!
use chrono::{DateTime, Utc};
use rand::Rng;
use serde::Serialize;
#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
#[serde(tag = "type")]
pub enum EventType {
#[serde(rename = "absolute")]
Absolute { time: DateTime<Utc> },
#[serde(rename = "incremental")]
Incremental {
start_time: DateTime<Utc>,
stop_time: DateTime<Utc>,
},
}
#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
pub struct Event<Extra> {
#[serde(flatten)]
#[serde(rename = "type")]
pub kind: EventType,
pub metric: &'static str,
pub idempotency_key: String,
pub value: u64,
#[serde(flatten)]
pub extra: Extra,
}
pub fn idempotency_key(node_id: String) -> String {
format!(
"{}-{}-{:04}",
Utc::now(),
node_id,
rand::thread_rng().gen_range(0..=9999)
)
}
pub const CHUNK_SIZE: usize = 1000;
// Just a wrapper around a slice of events
// to serialize it as `{"events" : [ ] }
#[derive(serde::Serialize)]
pub struct EventChunk<'a, T> {
pub events: &'a [T],
}

View File

@@ -0,0 +1,18 @@
[package]
name = "etcd_broker"
version = "0.1.0"
edition = "2021"
[dependencies]
etcd-client = "0.9.0"
regex = "1.4.5"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
serde_with = "2.0"
once_cell = "1.13.0"
utils = { path = "../utils" }
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
tokio = "1"
tracing = "0.1"
thiserror = "1"

Some files were not shown because too many files have changed in this diff Show More