mirror of
https://github.com/neondatabase/neon.git
synced 2026-02-03 18:50:38 +00:00
Compare commits
4 Commits
release-co
...
diko/baseb
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0a57a85fc0 | ||
|
|
385324ee8a | ||
|
|
8a68d463f6 | ||
|
|
3046c307da |
@@ -55,9 +55,16 @@ impl FeatureResolverBackgroundLoop {
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let feature_store = FeatureStore::new_with_flags(resp.flags);
|
||||
this.feature_store.store(Arc::new(feature_store));
|
||||
tracing::info!("Feature flag updated");
|
||||
let project_id = this.posthog_client.config.project_id.parse::<u64>().ok();
|
||||
match FeatureStore::new_with_flags(resp.flags, project_id) {
|
||||
Ok(feature_store) => {
|
||||
this.feature_store.store(Arc::new(feature_store));
|
||||
tracing::info!("Feature flag updated");
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!("Cannot process feature flag spec: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
tracing::info!("PostHog feature resolver stopped");
|
||||
}
|
||||
|
||||
@@ -39,6 +39,9 @@ pub struct LocalEvaluationResponse {
|
||||
|
||||
#[derive(Deserialize)]
|
||||
pub struct LocalEvaluationFlag {
|
||||
#[allow(dead_code)]
|
||||
id: u64,
|
||||
team_id: u64,
|
||||
key: String,
|
||||
filters: LocalEvaluationFlagFilters,
|
||||
active: bool,
|
||||
@@ -107,17 +110,32 @@ impl FeatureStore {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_with_flags(flags: Vec<LocalEvaluationFlag>) -> Self {
|
||||
pub fn new_with_flags(
|
||||
flags: Vec<LocalEvaluationFlag>,
|
||||
project_id: Option<u64>,
|
||||
) -> Result<Self, &'static str> {
|
||||
let mut store = Self::new();
|
||||
store.set_flags(flags);
|
||||
store
|
||||
store.set_flags(flags, project_id)?;
|
||||
Ok(store)
|
||||
}
|
||||
|
||||
pub fn set_flags(&mut self, flags: Vec<LocalEvaluationFlag>) {
|
||||
pub fn set_flags(
|
||||
&mut self,
|
||||
flags: Vec<LocalEvaluationFlag>,
|
||||
project_id: Option<u64>,
|
||||
) -> Result<(), &'static str> {
|
||||
self.flags.clear();
|
||||
for flag in flags {
|
||||
if let Some(project_id) = project_id {
|
||||
if flag.team_id != project_id {
|
||||
return Err(
|
||||
"Retrieved a spec with different project id, wrong config? Discarding the feature flags.",
|
||||
);
|
||||
}
|
||||
}
|
||||
self.flags.insert(flag.key.clone(), flag);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Generate a consistent hash for a user ID (e.g., tenant ID).
|
||||
@@ -534,6 +552,13 @@ impl PostHogClient {
|
||||
})
|
||||
}
|
||||
|
||||
/// Check if the server API key is a feature flag secure API key. This key can only be
|
||||
/// used to fetch the feature flag specs and can only be used on a undocumented API
|
||||
/// endpoint.
|
||||
fn is_feature_flag_secure_api_key(&self) -> bool {
|
||||
self.config.server_api_key.starts_with("phs_")
|
||||
}
|
||||
|
||||
/// Fetch the feature flag specs from the server.
|
||||
///
|
||||
/// This is unfortunately an undocumented API at:
|
||||
@@ -547,10 +572,22 @@ impl PostHogClient {
|
||||
) -> anyhow::Result<LocalEvaluationResponse> {
|
||||
// BASE_URL/api/projects/:project_id/feature_flags/local_evaluation
|
||||
// with bearer token of self.server_api_key
|
||||
let url = format!(
|
||||
"{}/api/projects/{}/feature_flags/local_evaluation",
|
||||
self.config.private_api_url, self.config.project_id
|
||||
);
|
||||
// OR
|
||||
// BASE_URL/api/feature_flag/local_evaluation/
|
||||
// with bearer token of feature flag specific self.server_api_key
|
||||
let url = if self.is_feature_flag_secure_api_key() {
|
||||
// The new feature local evaluation secure API token
|
||||
format!(
|
||||
"{}/api/feature_flag/local_evaluation",
|
||||
self.config.private_api_url
|
||||
)
|
||||
} else {
|
||||
// The old personal API token
|
||||
format!(
|
||||
"{}/api/projects/{}/feature_flags/local_evaluation",
|
||||
self.config.private_api_url, self.config.project_id
|
||||
)
|
||||
};
|
||||
let response = self
|
||||
.client
|
||||
.get(url)
|
||||
@@ -803,7 +840,7 @@ mod tests {
|
||||
fn evaluate_multivariate() {
|
||||
let mut store = FeatureStore::new();
|
||||
let response: LocalEvaluationResponse = serde_json::from_str(data()).unwrap();
|
||||
store.set_flags(response.flags);
|
||||
store.set_flags(response.flags, None).unwrap();
|
||||
|
||||
// This lacks the required properties and cannot be evaluated.
|
||||
let variant =
|
||||
@@ -873,7 +910,7 @@ mod tests {
|
||||
|
||||
let mut store = FeatureStore::new();
|
||||
let response: LocalEvaluationResponse = serde_json::from_str(data()).unwrap();
|
||||
store.set_flags(response.flags);
|
||||
store.set_flags(response.flags, None).unwrap();
|
||||
|
||||
// This lacks the required properties and cannot be evaluated.
|
||||
let variant = store.evaluate_boolean_inner("boolean-flag", 1.00, &HashMap::new());
|
||||
@@ -929,7 +966,7 @@ mod tests {
|
||||
|
||||
let mut store = FeatureStore::new();
|
||||
let response: LocalEvaluationResponse = serde_json::from_str(data()).unwrap();
|
||||
store.set_flags(response.flags);
|
||||
store.set_flags(response.flags, None).unwrap();
|
||||
|
||||
// This lacks the required properties and cannot be evaluated.
|
||||
let variant =
|
||||
|
||||
@@ -824,6 +824,7 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
timestamp: SystemTime,
|
||||
done_if_after: SystemTime,
|
||||
cancel: &CancellationToken,
|
||||
_complexity_limit: Option<NonZeroU32>,
|
||||
) -> Result<(), TimeTravelError> {
|
||||
let msg = "PLEASE NOTE: Azure Blob storage time-travel recovery may not work as expected "
|
||||
.to_string()
|
||||
|
||||
@@ -440,6 +440,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
timestamp: SystemTime,
|
||||
done_if_after: SystemTime,
|
||||
cancel: &CancellationToken,
|
||||
complexity_limit: Option<NonZeroU32>,
|
||||
) -> Result<(), TimeTravelError>;
|
||||
}
|
||||
|
||||
@@ -651,22 +652,23 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
timestamp: SystemTime,
|
||||
done_if_after: SystemTime,
|
||||
cancel: &CancellationToken,
|
||||
complexity_limit: Option<NonZeroU32>,
|
||||
) -> Result<(), TimeTravelError> {
|
||||
match self {
|
||||
Self::LocalFs(s) => {
|
||||
s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
|
||||
s.time_travel_recover(prefix, timestamp, done_if_after, cancel, complexity_limit)
|
||||
.await
|
||||
}
|
||||
Self::AwsS3(s) => {
|
||||
s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
|
||||
s.time_travel_recover(prefix, timestamp, done_if_after, cancel, complexity_limit)
|
||||
.await
|
||||
}
|
||||
Self::AzureBlob(s) => {
|
||||
s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
|
||||
s.time_travel_recover(prefix, timestamp, done_if_after, cancel, complexity_limit)
|
||||
.await
|
||||
}
|
||||
Self::Unreliable(s) => {
|
||||
s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
|
||||
s.time_travel_recover(prefix, timestamp, done_if_after, cancel, complexity_limit)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -610,6 +610,7 @@ impl RemoteStorage for LocalFs {
|
||||
_timestamp: SystemTime,
|
||||
_done_if_after: SystemTime,
|
||||
_cancel: &CancellationToken,
|
||||
_complexity_limit: Option<NonZeroU32>,
|
||||
) -> Result<(), TimeTravelError> {
|
||||
Err(TimeTravelError::Unimplemented)
|
||||
}
|
||||
|
||||
@@ -981,22 +981,16 @@ impl RemoteStorage for S3Bucket {
|
||||
timestamp: SystemTime,
|
||||
done_if_after: SystemTime,
|
||||
cancel: &CancellationToken,
|
||||
complexity_limit: Option<NonZeroU32>,
|
||||
) -> Result<(), TimeTravelError> {
|
||||
let kind = RequestKind::TimeTravel;
|
||||
let permit = self.permit(kind, cancel).await?;
|
||||
|
||||
tracing::trace!("Target time: {timestamp:?}, done_if_after {done_if_after:?}");
|
||||
|
||||
// Limit the number of versions deletions, mostly so that we don't
|
||||
// keep requesting forever if the list is too long, as we'd put the
|
||||
// list in RAM.
|
||||
// Building a list of 100k entries that reaches the limit roughly takes
|
||||
// 40 seconds, and roughly corresponds to tenants of 2 TiB physical size.
|
||||
const COMPLEXITY_LIMIT: Option<NonZeroU32> = NonZeroU32::new(100_000);
|
||||
|
||||
let mode = ListingMode::NoDelimiter;
|
||||
let version_listing = self
|
||||
.list_versions_with_permit(&permit, prefix, mode, COMPLEXITY_LIMIT, cancel)
|
||||
.list_versions_with_permit(&permit, prefix, mode, complexity_limit, cancel)
|
||||
.await
|
||||
.map_err(|err| match err {
|
||||
DownloadError::Other(e) => TimeTravelError::Other(e),
|
||||
|
||||
@@ -240,11 +240,12 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
timestamp: SystemTime,
|
||||
done_if_after: SystemTime,
|
||||
cancel: &CancellationToken,
|
||||
complexity_limit: Option<NonZeroU32>,
|
||||
) -> Result<(), TimeTravelError> {
|
||||
self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned())))
|
||||
.map_err(TimeTravelError::Other)?;
|
||||
self.inner
|
||||
.time_travel_recover(prefix, timestamp, done_if_after, cancel)
|
||||
.time_travel_recover(prefix, timestamp, done_if_after, cancel, complexity_limit)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -157,7 +157,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
||||
// No changes after recovery to t2 (no-op)
|
||||
let t_final = time_point().await;
|
||||
ctx.client
|
||||
.time_travel_recover(None, t2, t_final, &cancel)
|
||||
.time_travel_recover(None, t2, t_final, &cancel, None)
|
||||
.await?;
|
||||
let t2_files_recovered = list_files(&ctx.client, &cancel).await?;
|
||||
println!("after recovery to t2: {t2_files_recovered:?}");
|
||||
@@ -173,7 +173,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
||||
// after recovery to t1: path1 is back, path2 has the old content
|
||||
let t_final = time_point().await;
|
||||
ctx.client
|
||||
.time_travel_recover(None, t1, t_final, &cancel)
|
||||
.time_travel_recover(None, t1, t_final, &cancel, None)
|
||||
.await?;
|
||||
let t1_files_recovered = list_files(&ctx.client, &cancel).await?;
|
||||
println!("after recovery to t1: {t1_files_recovered:?}");
|
||||
@@ -189,7 +189,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
||||
// after recovery to t0: everything is gone except for path1
|
||||
let t_final = time_point().await;
|
||||
ctx.client
|
||||
.time_travel_recover(None, t0, t_final, &cancel)
|
||||
.time_travel_recover(None, t0, t_final, &cancel, None)
|
||||
.await?;
|
||||
let t0_files_recovered = list_files(&ctx.client, &cancel).await?;
|
||||
println!("after recovery to t0: {t0_files_recovered:?}");
|
||||
|
||||
@@ -176,9 +176,11 @@ async fn main() -> anyhow::Result<()> {
|
||||
let config = RemoteStorageConfig::from_toml_str(&cmd.config_toml_str)?;
|
||||
let storage = remote_storage::GenericRemoteStorage::from_config(&config).await;
|
||||
let cancel = CancellationToken::new();
|
||||
// Complexity limit: as we are running this command locally, we should have a lot of memory available, and we do not
|
||||
// need to limit the number of versions we are going to delete.
|
||||
storage
|
||||
.unwrap()
|
||||
.time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel)
|
||||
.time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel, None)
|
||||
.await?;
|
||||
}
|
||||
Commands::Key(dkc) => dkc.execute(),
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
|
||||
use anyhow::Context;
|
||||
use async_compression::tokio::write::GzipEncoder;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use metrics::core::{AtomicU64, GenericCounter};
|
||||
@@ -167,14 +168,17 @@ impl BasebackupCache {
|
||||
.join(Self::entry_filename(tenant_id, timeline_id, lsn))
|
||||
}
|
||||
|
||||
fn tmp_dir(&self) -> Utf8PathBuf {
|
||||
self.data_dir.join("tmp")
|
||||
}
|
||||
|
||||
fn entry_tmp_path(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Utf8PathBuf {
|
||||
self.data_dir
|
||||
.join("tmp")
|
||||
self.tmp_dir()
|
||||
.join(Self::entry_filename(tenant_id, timeline_id, lsn))
|
||||
}
|
||||
|
||||
@@ -194,15 +198,18 @@ impl BasebackupCache {
|
||||
Some((tenant_id, timeline_id, lsn))
|
||||
}
|
||||
|
||||
async fn cleanup(&self) -> anyhow::Result<()> {
|
||||
// Cleanup tmp directory.
|
||||
let tmp_dir = self.data_dir.join("tmp");
|
||||
let mut tmp_dir = tokio::fs::read_dir(&tmp_dir).await?;
|
||||
while let Some(dir_entry) = tmp_dir.next_entry().await? {
|
||||
if let Err(e) = tokio::fs::remove_file(dir_entry.path()).await {
|
||||
tracing::warn!("Failed to remove basebackup cache tmp file: {:#}", e);
|
||||
}
|
||||
// Recreate the tmp directory to clear all files in it.
|
||||
async fn clean_tmp_dir(&self) -> anyhow::Result<()> {
|
||||
let tmp_dir = self.tmp_dir();
|
||||
if tmp_dir.exists() {
|
||||
tokio::fs::remove_dir_all(&tmp_dir).await?;
|
||||
}
|
||||
tokio::fs::create_dir_all(&tmp_dir).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn cleanup(&self) -> anyhow::Result<()> {
|
||||
self.clean_tmp_dir().await?;
|
||||
|
||||
// Remove outdated entries.
|
||||
let entries_old = self.entries.lock().unwrap().clone();
|
||||
@@ -241,16 +248,14 @@ impl BasebackupCache {
|
||||
}
|
||||
|
||||
async fn on_startup(&self) -> anyhow::Result<()> {
|
||||
// Create data_dir and tmp directory if they do not exist.
|
||||
tokio::fs::create_dir_all(&self.data_dir.join("tmp"))
|
||||
// Create data_dir if it does not exist.
|
||||
tokio::fs::create_dir_all(&self.data_dir)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
anyhow::anyhow!(
|
||||
"Failed to create basebackup cache data_dir {:?}: {:?}",
|
||||
self.data_dir,
|
||||
e
|
||||
)
|
||||
})?;
|
||||
.context("Failed to create basebackup cache data directory")?;
|
||||
|
||||
self.clean_tmp_dir()
|
||||
.await
|
||||
.context("Failed to clean tmp directory")?;
|
||||
|
||||
// Read existing entries from the data_dir and add them to in-memory state.
|
||||
let mut entries = HashMap::new();
|
||||
@@ -408,6 +413,19 @@ impl BasebackupCache {
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
|
||||
let feature_flag = tenant
|
||||
.feature_resolver
|
||||
.evaluate_boolean("enable-basebackup-cache", tenant_shard_id.tenant_id);
|
||||
|
||||
if feature_flag.is_err() {
|
||||
tracing::info!(
|
||||
tenant_id = %tenant_shard_id.tenant_id,
|
||||
"Basebackup cache is disabled for tenant by feature flag, skipping basebackup",
|
||||
);
|
||||
self.prepare_skip_count.inc();
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let tenant_state = tenant.current_state();
|
||||
if tenant_state != TenantState::Active {
|
||||
anyhow::bail!(
|
||||
@@ -451,6 +469,11 @@ impl BasebackupCache {
|
||||
}
|
||||
|
||||
// Move the tmp file to the final location atomically.
|
||||
// The tmp file is fsynced, so it's guaranteed that we will not have a partial file
|
||||
// in the main directory.
|
||||
// It's not necessary to fsync the inode after renaming, because the worst case is that
|
||||
// the rename operation will be rolled back on the disk failure, the entry will disappear
|
||||
// from the main directory, and the entry access will cause a cache miss.
|
||||
let entry_path = self.entry_path(tenant_shard_id.tenant_id, timeline_id, req_lsn);
|
||||
tokio::fs::rename(&entry_tmp_path, &entry_path).await?;
|
||||
|
||||
@@ -468,16 +491,17 @@ impl BasebackupCache {
|
||||
}
|
||||
|
||||
/// Prepares a basebackup in a temporary file.
|
||||
/// Guarantees that the tmp file is fsynced before returning.
|
||||
async fn prepare_basebackup_tmp(
|
||||
&self,
|
||||
emptry_tmp_path: &Utf8Path,
|
||||
entry_tmp_path: &Utf8Path,
|
||||
timeline: &Arc<Timeline>,
|
||||
req_lsn: Lsn,
|
||||
) -> anyhow::Result<()> {
|
||||
let ctx = RequestContext::new(TaskKind::BasebackupCache, DownloadBehavior::Download);
|
||||
let ctx = ctx.with_scope_timeline(timeline);
|
||||
|
||||
let file = tokio::fs::File::create(emptry_tmp_path).await?;
|
||||
let file = tokio::fs::File::create(entry_tmp_path).await?;
|
||||
let mut writer = BufWriter::new(file);
|
||||
|
||||
let mut encoder = GzipEncoder::with_quality(
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
//! Helper functions to upload files to remote storage with a RemoteStorage
|
||||
|
||||
use std::io::{ErrorKind, SeekFrom};
|
||||
use std::num::NonZeroU32;
|
||||
use std::time::SystemTime;
|
||||
|
||||
use anyhow::{Context, bail};
|
||||
@@ -228,11 +229,25 @@ pub(crate) async fn time_travel_recover_tenant(
|
||||
let timelines_path = super::remote_timelines_path(tenant_shard_id);
|
||||
prefixes.push(timelines_path);
|
||||
}
|
||||
|
||||
// Limit the number of versions deletions, mostly so that we don't
|
||||
// keep requesting forever if the list is too long, as we'd put the
|
||||
// list in RAM.
|
||||
// Building a list of 100k entries that reaches the limit roughly takes
|
||||
// 40 seconds, and roughly corresponds to tenants of 2 TiB physical size.
|
||||
const COMPLEXITY_LIMIT: Option<NonZeroU32> = NonZeroU32::new(100_000);
|
||||
|
||||
for prefix in &prefixes {
|
||||
backoff::retry(
|
||||
|| async {
|
||||
storage
|
||||
.time_travel_recover(Some(prefix), timestamp, done_if_after, cancel)
|
||||
.time_travel_recover(
|
||||
Some(prefix),
|
||||
timestamp,
|
||||
done_if_after,
|
||||
cancel,
|
||||
COMPLEXITY_LIMIT,
|
||||
)
|
||||
.await
|
||||
},
|
||||
|e| !matches!(e, TimeTravelError::Other(_)),
|
||||
|
||||
Reference in New Issue
Block a user