diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 08b74a2656..172b904331 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -180,7 +180,8 @@ jobs: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init - timeout-minutes: 360 # 6h + # Increase timeout to 8h, default timeout is 6h + timeout-minutes: 480 steps: - uses: actions/checkout@v3 @@ -321,8 +322,6 @@ jobs: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init - timeout-minutes: 360 # 6h - steps: - uses: actions/checkout@v3 @@ -414,8 +413,6 @@ jobs: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init - timeout-minutes: 360 # 6h - steps: - uses: actions/checkout@v3 @@ -501,8 +498,6 @@ jobs: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init - timeout-minutes: 360 # 6h - steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 5f82ab7aca..8215c02291 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -659,6 +659,7 @@ jobs: --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} + --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} @@ -716,6 +717,7 @@ jobs: --context . --build-arg GIT_VERSION=${{ github.sha }} --build-arg PG_VERSION=${{ matrix.version }} + --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} @@ -736,7 +738,7 @@ jobs: run: shell: sh -eu {0} env: - VM_BUILDER_VERSION: v0.8.0 + VM_BUILDER_VERSION: v0.11.0 steps: - name: Checkout @@ -914,7 +916,7 @@ jobs: exit 1 fi - - name: Create tag "release-${{ needs.tag.outputs.build-tag }}" + - name: Create git tag if: github.ref_name == 'release' uses: actions/github-script@v6 with: @@ -924,7 +926,7 @@ jobs: github.rest.git.createRef({ owner: context.repo.owner, repo: context.repo.repo, - ref: "refs/tags/release-${{ needs.tag.outputs.build-tag }}", + ref: "refs/tags/${{ needs.tag.outputs.build-tag }}", sha: context.sha, }) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index ae330d8a20..d69b2cf174 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -2,6 +2,7 @@ ARG PG_VERSION ARG REPOSITORY=neondatabase ARG IMAGE=rust ARG TAG=pinned +ARG BUILD_TAG ######################################################################################### # @@ -480,6 +481,40 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_1.tar. 
make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/rdkit.control +######################################################################################### +# +# Layer "pg-uuidv7-pg-build" +# compile pg_uuidv7 extension +# +######################################################################################### +FROM build-deps AS pg-uuidv7-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +ENV PATH "/usr/local/pgsql/bin/:$PATH" +RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \ + echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \ + mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xvzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_uuidv7.control + +######################################################################################### +# +# Layer "pg-roaringbitmap-pg-build" +# compile pg_roaringbitmap extension +# +######################################################################################### +FROM build-deps AS pg-roaringbitmap-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +ENV PATH "/usr/local/pgsql/bin/:$PATH" +RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \ + echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \ + mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xvzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control + ######################################################################################### # # Layer "rust extensions" @@ -613,6 +648,8 @@ COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ @@ -634,6 +671,9 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ # ######################################################################################### FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools +ARG BUILD_TAG +ENV BUILD_TAG=$BUILD_TAG + USER nonroot # Copy entire project to get Cargo.* files with proper dependencies for the whole project COPY --chown=nonroot . . diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index e86fb40ca4..3066e3f7ca 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -3,6 +3,7 @@ ARG REPOSITORY=neondatabase ARG IMAGE=rust ARG TAG=pinned +ARG BUILD_TAG FROM $REPOSITORY/$IMAGE:$TAG AS rust-build WORKDIR /home/nonroot @@ -16,6 +17,8 @@ ENV CACHEPOT_S3_KEY_PREFIX=cachepot ARG CACHEPOT_BUCKET=neon-github-dev #ARG AWS_ACCESS_KEY_ID #ARG AWS_SECRET_ACCESS_KEY +ARG BUILD_TAG +ENV BUILD_TAG=$BUILD_TAG COPY . . 
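Note on the BUILD_TAG plumbing above: the workflow passes `--build-arg BUILD_TAG=...`, both Dockerfiles re-export it via `ENV BUILD_TAG=$BUILD_TAG`, and the binary picks the value up at compile time with `option_env!` (shown in the compute_ctl.rs hunk below). A minimal standalone sketch of that compile-time capture, with a hypothetical `main` rather than the actual compute_ctl code:

const BUILD_TAG_DEFAULT: &str = "local";

fn main() {
    // `option_env!` is expanded by the compiler, so the tag baked into the binary
    // is whatever BUILD_TAG was set to when `cargo build` ran inside the image,
    // not whatever the environment holds at run time.
    let build_tag = option_env!("BUILD_TAG").unwrap_or(BUILD_TAG_DEFAULT);
    println!("build_tag: {build_tag}");
}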
diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index c6cfde1d1a..90b39e9dd9 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -54,9 +54,15 @@ use compute_tools::monitor::launch_monitor; use compute_tools::params::*; use compute_tools::spec::*; +const BUILD_TAG_DEFAULT: &str = "local"; + fn main() -> Result<()> { init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; + let build_tag = option_env!("BUILD_TAG").unwrap_or(BUILD_TAG_DEFAULT); + + info!("build_tag: {build_tag}"); + let matches = cli().get_matches(); let http_port = *matches diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 94cebf93de..6b549e198b 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -133,6 +133,84 @@ impl TryFrom for ParsedSpec { } } +/// Create special neon_superuser role, that's a slightly nerfed version of a real superuser +/// that we give to customers +fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> { + let roles = spec + .cluster + .roles + .iter() + .map(|r| format!("'{}'", escape_literal(&r.name))) + .collect::>(); + + let dbs = spec + .cluster + .databases + .iter() + .map(|db| format!("'{}'", escape_literal(&db.name))) + .collect::>(); + + let roles_decl = if roles.is_empty() { + String::from("roles text[] := NULL;") + } else { + format!( + r#" + roles text[] := ARRAY(SELECT rolname + FROM pg_catalog.pg_roles + WHERE rolname IN ({}));"#, + roles.join(", ") + ) + }; + + let database_decl = if dbs.is_empty() { + String::from("dbs text[] := NULL;") + } else { + format!( + r#" + dbs text[] := ARRAY(SELECT datname + FROM pg_catalog.pg_database + WHERE datname IN ({}));"#, + dbs.join(", ") + ) + }; + + // ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on all databases + // (see https://www.postgresql.org/docs/current/ddl-priv.html) + let query = format!( + r#" + DO $$ + DECLARE + r text; + {} + {} + BEGIN + IF NOT EXISTS ( + SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser') + THEN + CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN IN ROLE pg_read_all_data, pg_write_all_data; + IF array_length(roles, 1) IS NOT NULL THEN + EXECUTE format('GRANT neon_superuser TO %s', + array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', ')); + FOREACH r IN ARRAY roles LOOP + EXECUTE format('ALTER ROLE %s CREATEROLE CREATEDB', quote_ident(r)); + END LOOP; + END IF; + IF array_length(dbs, 1) IS NOT NULL THEN + EXECUTE format('GRANT ALL PRIVILEGES ON DATABASE %s TO neon_superuser', + array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(dbs) as x), ', ')); + END IF; + END IF; + END + $$;"#, + roles_decl, database_decl, + ); + info!("Neon superuser created:\n{}", &query); + client + .simple_query(&query) + .map_err(|e| anyhow::anyhow!(e).context(query))?; + Ok(()) +} + impl ComputeNode { pub fn set_status(&self, status: ComputeStatus) { let mut state = self.state.lock().unwrap(); @@ -347,6 +425,8 @@ impl ComputeNode { .map_err(|_| anyhow::anyhow!("invalid connstr"))?; let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?; + // Disable forwarding so that users don't get a cloud_admin role + client.simple_query("SET neon.forward_ddl = false")?; client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; client.simple_query("GRANT zenith_admin TO cloud_admin")?; drop(client); @@ -357,14 +437,16 @@ impl ComputeNode { Ok(client) => client, }; - // Proceed with post-startup configuration. 
Note, that order of operations is important. // Disable DDL forwarding because control plane already knows about these roles/databases. client.simple_query("SET neon.forward_ddl = false")?; + + // Proceed with post-startup configuration. Note, that order of operations is important. let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec; + create_neon_superuser(spec, &mut client)?; handle_roles(spec, &mut client)?; handle_databases(spec, &mut client)?; handle_role_deletions(spec, self.connstr.as_str(), &mut client)?; - handle_grants(spec, self.connstr.as_str(), &mut client)?; + handle_grants(spec, self.connstr.as_str())?; handle_extensions(spec, &mut client)?; // 'Close' connection @@ -402,7 +484,7 @@ impl ComputeNode { handle_roles(&spec, &mut client)?; handle_databases(&spec, &mut client)?; handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?; - handle_grants(&spec, self.connstr.as_str(), &mut client)?; + handle_grants(&spec, self.connstr.as_str())?; handle_extensions(&spec, &mut client)?; } diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index d5c845e9ea..b76ed1fd85 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -17,7 +17,7 @@ use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role}; const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds /// Escape a string for including it in a SQL literal -fn escape_literal(s: &str) -> String { +pub fn escape_literal(s: &str) -> String { s.replace('\'', "''").replace('\\', "\\\\") } diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index a2a19ae0da..520696da00 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -269,17 +269,13 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { xact.execute(query.as_str(), &[])?; } RoleAction::Create => { - let mut query: String = format!("CREATE ROLE {} ", name.pg_quote()); + let mut query: String = format!( + "CREATE ROLE {} CREATEROLE CREATEDB IN ROLE neon_superuser", + name.pg_quote() + ); info!("role create query: '{}'", &query); query.push_str(&role.to_pg_options()); xact.execute(query.as_str(), &[])?; - - let grant_query = format!( - "GRANT pg_read_all_data, pg_write_all_data TO {}", - name.pg_quote() - ); - xact.execute(grant_query.as_str(), &[])?; - info!("role grant query: '{}'", &grant_query); } } @@ -476,6 +472,11 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { query.push_str(&db.to_pg_options()); let _guard = info_span!("executing", query).entered(); client.execute(query.as_str(), &[])?; + let grant_query: String = format!( + "GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser", + name.pg_quote() + ); + client.execute(grant_query.as_str(), &[])?; } }; @@ -495,35 +496,9 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { /// Grant CREATE ON DATABASE to the database owner and do some other alters and grants /// to allow users creating trusted extensions and re-creating `public` schema, for example. #[instrument(skip_all)] -pub fn handle_grants(spec: &ComputeSpec, connstr: &str, client: &mut Client) -> Result<()> { +pub fn handle_grants(spec: &ComputeSpec, connstr: &str) -> Result<()> { info!("cluster spec grants:"); - // We now have a separate `web_access` role to connect to the database - // via the web interface and proxy link auth. And also we grant a - // read / write all data privilege to every role. 
So also grant - // create to everyone. - // XXX: later we should stop messing with Postgres ACL in such horrible - // ways. - let roles = spec - .cluster - .roles - .iter() - .map(|r| r.name.pg_quote()) - .collect::>(); - - for db in &spec.cluster.databases { - let dbname = &db.name; - - let query: String = format!( - "GRANT CREATE ON DATABASE {} TO {}", - dbname.pg_quote(), - roles.join(", ") - ); - info!("grant query {}", &query); - - client.execute(query.as_str(), &[])?; - } - // Do some per-database access adjustments. We'd better do this at db creation time, // but CREATE DATABASE isn't transactional. So we cannot create db + do some grants // atomically. diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index d3131ac476..52683ff1c3 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -67,6 +67,7 @@ pub struct EndpointConf { pg_port: u16, http_port: u16, pg_version: u32, + skip_pg_catalog_updates: bool, } // @@ -135,6 +136,7 @@ impl ComputeControlPlane { mode, tenant_id, pg_version, + skip_pg_catalog_updates: false, }); ep.create_endpoint_dir()?; @@ -148,6 +150,7 @@ impl ComputeControlPlane { http_port, pg_port, pg_version, + skip_pg_catalog_updates: false, })?, )?; std::fs::write( @@ -183,6 +186,9 @@ pub struct Endpoint { // the endpoint runs in. pub env: LocalEnv, pageserver: Arc, + + // Optimizations + skip_pg_catalog_updates: bool, } impl Endpoint { @@ -216,6 +222,7 @@ impl Endpoint { mode: conf.mode, tenant_id: conf.tenant_id, pg_version: conf.pg_version, + skip_pg_catalog_updates: conf.skip_pg_catalog_updates, }) } @@ -450,7 +457,7 @@ impl Endpoint { // Create spec file let spec = ComputeSpec { - skip_pg_catalog_updates: false, + skip_pg_catalog_updates: self.skip_pg_catalog_updates, format_version: 1.0, operation_uuid: None, cluster: Cluster { diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 6011713c8f..f24488b19d 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -23,6 +23,7 @@ use prometheus::{Registry, Result}; pub mod launch_timestamp; mod wrappers; pub use wrappers::{CountedReader, CountedWriter}; +pub mod metric_vec_duration; pub type UIntGauge = GenericGauge; pub type UIntGaugeVec = GenericGaugeVec; diff --git a/libs/metrics/src/metric_vec_duration.rs b/libs/metrics/src/metric_vec_duration.rs new file mode 100644 index 0000000000..840f60f19b --- /dev/null +++ b/libs/metrics/src/metric_vec_duration.rs @@ -0,0 +1,23 @@ +//! Helpers for observing duration on HistogramVec / CounterVec / GaugeVec / MetricVec. + +use std::{future::Future, time::Instant}; + +pub trait DurationResultObserver { + fn observe_result(&self, res: &Result, duration: std::time::Duration); +} + +pub async fn observe_async_block_duration_by_result< + T, + E, + F: Future>, + O: DurationResultObserver, +>( + observer: &O, + block: F, +) -> Result { + let start = Instant::now(); + let result = block.await; + let duration = start.elapsed(); + observer.observe_result(&result, duration); + result +} diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index ac1f8a357e..0e9c237e1e 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -70,6 +70,14 @@ impl RemotePath { pub fn join(&self, segment: &Path) -> Self { Self(self.0.join(segment)) } + + pub fn get_path(&self) -> &PathBuf { + &self.0 + } + + pub fn extension(&self) -> Option<&str> { + self.0.extension()?.to_str() + } } /// Storage (potentially remote) API to manage its state. 
@@ -86,6 +94,19 @@ pub trait RemoteStorage: Send + Sync + 'static { prefix: Option<&RemotePath>, ) -> Result<Vec<RemotePath>, DownloadError>; + /// Lists all files in a directory "recursively" + /// (not really recursively, because AWS has a flat namespace) + /// Note: This is subtly different from list_prefixes, + /// because it is for listing files instead of listing + /// names sharing common prefixes. + /// For example, + /// list_files("foo/bar") = ["foo/bar/cat123.txt", + /// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"] + /// whereas, + /// list_prefixes("foo/bar/") = ["cat", "dog"] + /// See `test_real_s3.rs` for more details. + async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>>; + /// Streams the local file contents into remote into the remote storage entry. async fn upload( &self, from: impl io::AsyncRead + Unpin + Send + Sync + 'static, @@ -174,6 +195,14 @@ impl GenericRemoteStorage { } } + pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> { + match self { + Self::LocalFs(s) => s.list_files(folder).await, + Self::AwsS3(s) => s.list_files(folder).await, + Self::Unreliable(s) => s.list_files(folder).await, + } + } + pub async fn upload( &self, from: impl io::AsyncRead + Unpin + Send + Sync + 'static, diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 59304c2481..ca5fbd5de5 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -48,6 +48,14 @@ impl LocalFs { Ok(Self { storage_root }) } + // mirrors S3Bucket::s3_object_to_relative_path + fn local_file_to_relative_path(&self, key: PathBuf) -> RemotePath { + let relative_path = key + .strip_prefix(&self.storage_root) + .expect("relative path must contain storage_root as prefix"); + RemotePath(relative_path.into()) + } + async fn read_storage_metadata( &self, file_path: &Path, @@ -132,6 +140,34 @@ impl RemoteStorage for LocalFs { Ok(prefixes) } + // recursively lists all files in a directory, + // mirroring the `list_files` for `s3_bucket` + async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> { + let full_path = match folder { + Some(folder) => folder.with_base(&self.storage_root), + None => self.storage_root.clone(), + }; + let mut files = vec![]; + let mut directory_queue = vec![full_path.clone()]; + + while !directory_queue.is_empty() { + let cur_folder = directory_queue + .pop() + .expect("queue cannot be empty: we just checked"); + let mut entries = fs::read_dir(cur_folder.clone()).await?; + while let Some(entry) = entries.next_entry().await? 
{ + let file_name: PathBuf = entry.file_name().into(); + let full_file_name = cur_folder.clone().join(&file_name); + let file_remote_path = self.local_file_to_relative_path(full_file_name.clone()); + files.push(file_remote_path.clone()); + if full_file_name.is_dir() { + directory_queue.push(full_file_name); + } + } + } + Ok(files) + } + async fn upload( &self, data: impl io::AsyncRead + Unpin + Send + Sync + 'static, diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index dafb6dcb45..43d818dfb9 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -347,6 +347,51 @@ impl RemoteStorage for S3Bucket { Ok(document_keys) } + /// See the doc for `RemoteStorage::list_files` + async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> { + let folder_name = folder + .map(|p| self.relative_path_to_s3_object(p)) + .or_else(|| self.prefix_in_bucket.clone()); + + // AWS may need to break the response into several parts + let mut continuation_token = None; + let mut all_files = vec![]; + loop { + let _guard = self + .concurrency_limiter + .acquire() + .await + .context("Concurrency limiter semaphore got closed during S3 list_files")?; + metrics::inc_list_objects(); + + let response = self + .client + .list_objects_v2() + .bucket(self.bucket_name.clone()) + .set_prefix(folder_name.clone()) + .set_continuation_token(continuation_token) + .set_max_keys(self.max_keys_per_list_response) + .send() + .await + .map_err(|e| { + metrics::inc_list_objects_fail(); + e + }) + .context("Failed to list files in S3 bucket")?; + + for object in response.contents().unwrap_or_default() { + let object_path = object.key().expect("response does not contain a key"); + let remote_path = self.s3_object_to_relative_path(object_path); + all_files.push(remote_path); + } + match response.next_continuation_token { + Some(new_token) => continuation_token = Some(new_token), + None => break, + } + } + Ok(all_files) + } + async fn upload( &self, from: impl io::AsyncRead + Unpin + Send + Sync + 'static, diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index 741c18bf6f..c46ca14ace 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -83,6 +83,11 @@ impl RemoteStorage for UnreliableWrapper { self.inner.list_prefixes(prefix).await } + async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> { + self.attempt(RemoteOp::ListPrefixes(folder.cloned()))?; + self.inner.list_files(folder).await + } + async fn upload( &self, data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static, diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index 5f52b0754c..6fe65a0362 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -88,6 +88,58 @@ async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3WithTestBlobs) -> any Ok(()) } +/// Tests that the S3 client can list all files in a folder, even if the response comes paginated and requires multiple S3 queries. +/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. Test will skip real code and pass if env vars not set. +/// See `s3_pagination_should_work` for more information. 
+/// +/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_s3_data`] +/// Then it performs the following queries: +/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt` +/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt` +#[test_context(MaybeEnabledS3WithSimpleTestBlobs)] +#[tokio::test] +async fn s3_list_files_works(ctx: &mut MaybeEnabledS3WithSimpleTestBlobs) -> anyhow::Result<()> { + let ctx = match ctx { + MaybeEnabledS3WithSimpleTestBlobs::Enabled(ctx) => ctx, + MaybeEnabledS3WithSimpleTestBlobs::Disabled => return Ok(()), + MaybeEnabledS3WithSimpleTestBlobs::UploadsFailed(e, _) => { + anyhow::bail!("S3 init failed: {e:?}") + } + }; + let test_client = Arc::clone(&ctx.enabled.client); + let base_prefix = + RemotePath::new(Path::new("folder1")).context("common_prefix construction")?; + let root_files = test_client + .list_files(None) + .await + .context("client list root files failure")? + .into_iter() + .collect::<HashSet<_>>(); + assert_eq!( + root_files, + ctx.remote_blobs.clone(), + "remote storage list_files on root mismatches with the uploads." + ); + let nested_remote_files = test_client + .list_files(Some(&base_prefix)) + .await + .context("client list nested files failure")? + .into_iter() + .collect::<HashSet<_>>(); + let trim_remote_blobs: HashSet<_> = ctx + .remote_blobs + .iter() + .map(|x| x.get_path().to_str().expect("must be valid name")) + .filter(|x| x.starts_with("folder1")) + .map(|x| RemotePath::new(Path::new(x)).expect("must be valid name")) + .collect(); + assert_eq!( + nested_remote_files, trim_remote_blobs, + "remote storage list_files on subdirectory mismatches with the uploads." + ); + Ok(()) +} + #[test_context(MaybeEnabledS3)] #[tokio::test] async fn s3_delete_non_exising_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> { @@ -248,6 +300,66 @@ impl AsyncTestContext for MaybeEnabledS3WithTestBlobs { } } +// NOTE: the setups for the list_prefixes test and the list_files test are very similar +// However, they are not identical. The list_prefixes function is concerned with listing prefixes, +// whereas the list_files function is concerned with listing files. 
+// See `RemoteStorage::list_files` documentation for more details +enum MaybeEnabledS3WithSimpleTestBlobs { + Enabled(S3WithSimpleTestBlobs), + Disabled, + UploadsFailed(anyhow::Error, S3WithSimpleTestBlobs), +} +struct S3WithSimpleTestBlobs { + enabled: EnabledS3, + remote_blobs: HashSet, +} + +#[async_trait::async_trait] +impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs { + async fn setup() -> Self { + ensure_logging_ready(); + if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() { + info!( + "`{}` env variable is not set, skipping the test", + ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME + ); + return Self::Disabled; + } + + let max_keys_in_list_response = 10; + let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap()); + + let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await; + + match upload_simple_s3_data(&enabled.client, upload_tasks_count).await { + ControlFlow::Continue(uploads) => { + info!("Remote objects created successfully"); + + Self::Enabled(S3WithSimpleTestBlobs { + enabled, + remote_blobs: uploads, + }) + } + ControlFlow::Break(uploads) => Self::UploadsFailed( + anyhow::anyhow!("One or multiple blobs failed to upload to S3"), + S3WithSimpleTestBlobs { + enabled, + remote_blobs: uploads, + }, + ), + } + } + + async fn teardown(self) { + match self { + Self::Disabled => {} + Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => { + cleanup(&ctx.enabled.client, ctx.remote_blobs).await; + } + } + } +} + fn create_s3_client( max_keys_per_list_response: Option, ) -> anyhow::Result> { @@ -258,7 +370,7 @@ fn create_s3_client( let random_prefix_part = std::time::SystemTime::now() .duration_since(UNIX_EPOCH) .context("random s3 test prefix part calculation")? - .as_millis(); + .as_nanos(); let remote_storage_config = RemoteStorageConfig { max_concurrent_syncs: NonZeroUsize::new(100).unwrap(), max_sync_errors: NonZeroU32::new(5).unwrap(), @@ -364,3 +476,52 @@ async fn cleanup(client: &Arc, objects_to_delete: HashSet< } } } + +// Uploads files `folder{j}/blob{i}.txt`. See test description for more details. 
+async fn upload_simple_s3_data( + client: &Arc, + upload_tasks_count: usize, +) -> ControlFlow, HashSet> { + info!("Creating {upload_tasks_count} S3 files"); + let mut upload_tasks = JoinSet::new(); + for i in 1..upload_tasks_count + 1 { + let task_client = Arc::clone(client); + upload_tasks.spawn(async move { + let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i)); + let blob_path = RemotePath::new(&blob_path) + .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?; + debug!("Creating remote item {i} at path {blob_path:?}"); + + let data = format!("remote blob data {i}").into_bytes(); + let data_len = data.len(); + task_client + .upload(std::io::Cursor::new(data), data_len, &blob_path, None) + .await?; + + Ok::<_, anyhow::Error>(blob_path) + }); + } + + let mut upload_tasks_failed = false; + let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count); + while let Some(task_run_result) = upload_tasks.join_next().await { + match task_run_result + .context("task join failed") + .and_then(|task_result| task_result.context("upload task failed")) + { + Ok(upload_path) => { + uploaded_blobs.insert(upload_path); + } + Err(e) => { + error!("Upload task failed: {e:?}"); + upload_tasks_failed = true; + } + } + } + + if upload_tasks_failed { + ControlFlow::Break(uploaded_blobs) + } else { + ControlFlow::Continue(uploaded_blobs) + } +} diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index f9c06453df..527e486fd0 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -1,5 +1,6 @@ use hyper::{header, Body, Response, StatusCode}; use serde::{Deserialize, Serialize}; +use std::error::Error as StdError; use thiserror::Error; use tracing::error; @@ -15,7 +16,7 @@ pub enum ApiError { Unauthorized(String), #[error("NotFound: {0}")] - NotFound(anyhow::Error), + NotFound(Box), #[error("Conflict: {0}")] Conflict(String), diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 17e6e3fb2a..2046d27b1e 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -96,12 +96,12 @@ pub mod defaults { #background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}' -# [tenant_config] +[tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} #compaction_target_size = {DEFAULT_COMPACTION_TARGET_SIZE} # in bytes #compaction_period = '{DEFAULT_COMPACTION_PERIOD}' -#compaction_threshold = '{DEFAULT_COMPACTION_THRESHOLD}' +#compaction_threshold = {DEFAULT_COMPACTION_THRESHOLD} #gc_period = '{DEFAULT_GC_PERIOD}' #gc_horizon = {DEFAULT_GC_HORIZON} @@ -111,7 +111,8 @@ pub mod defaults { #min_resident_size_override = .. 
# in bytes #evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}' #gc_feedback = false -# [remote_storage] + +[remote_storage] "### ); diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 0a55741f84..8d3bb5552b 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -142,7 +142,7 @@ impl From for ApiError { impl From for ApiError { fn from(tse: TenantStateError) -> ApiError { match tse { - TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)), + TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()), _ => ApiError::InternalServerError(anyhow::Error::new(tse)), } } @@ -151,7 +151,7 @@ impl From for ApiError { impl From for ApiError { fn from(tse: GetTenantError) -> ApiError { match tse { - GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)), + GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()), e @ GetTenantError::NotActive(_) => { // Why is this not `ApiError::NotFound`? // Because we must be careful to never return 404 for a tenant if it does @@ -169,7 +169,7 @@ impl From for ApiError { fn from(e: SetNewTenantConfigError) -> ApiError { match e { SetNewTenantConfigError::GetTenant(tid) => { - ApiError::NotFound(anyhow!("tenant {}", tid)) + ApiError::NotFound(anyhow!("tenant {}", tid).into()) } e @ SetNewTenantConfigError::Persist(_) => { ApiError::InternalServerError(anyhow::Error::new(e)) @@ -182,7 +182,7 @@ impl From for ApiError { fn from(value: crate::tenant::DeleteTimelineError) -> Self { use crate::tenant::DeleteTimelineError::*; match value { - NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found")), + NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()), HasChildren(children) => ApiError::PreconditionFailed( format!("Cannot delete timeline which has child timelines: {children:?}") .into_boxed_str(), @@ -397,7 +397,7 @@ async fn timeline_detail_handler( let timeline = tenant .get_timeline(timeline_id, false) - .map_err(ApiError::NotFound)?; + .map_err(|e| ApiError::NotFound(e.into()))?; let timeline_info = build_timeline_info( &timeline, @@ -1061,7 +1061,7 @@ async fn timeline_download_remote_layers_handler_get( let info = timeline .get_download_all_remote_layers_task_info() .context("task never started since last pageserver process start") - .map_err(ApiError::NotFound)?; + .map_err(|e| ApiError::NotFound(e.into()))?; json_response(StatusCode::OK, info) } @@ -1072,7 +1072,7 @@ async fn active_timeline_of_active_tenant( let tenant = mgr::get_tenant(tenant_id, true).await?; tenant .get_timeline(timeline_id, true) - .map_err(ApiError::NotFound) + .map_err(|e| ApiError::NotFound(e.into())) } async fn always_panic_handler( diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 43d06db6d8..00745143bd 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,3 +1,4 @@ +use metrics::metric_vec_duration::DurationResultObserver; use metrics::{ register_counter_vec, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec, @@ -424,6 +425,27 @@ pub static SMGR_QUERY_TIME: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub struct BasebackupQueryTime(HistogramVec); +pub static BASEBACKUP_QUERY_TIME: Lazy = Lazy::new(|| { + BasebackupQueryTime({ + register_histogram_vec!( + 
"pageserver_basebackup_query_seconds", + "Histogram of basebackup queries durations, by result type", + &["result"], + CRITICAL_OP_BUCKETS.into(), + ) + .expect("failed to define a metric") + }) +}); + +impl DurationResultObserver for BasebackupQueryTime { + fn observe_result(&self, res: &Result, duration: std::time::Duration) { + let label_value = if res.is_ok() { "ok" } else { "error" }; + let metric = self.0.get_metric_with_label_values(&[label_value]).unwrap(); + metric.observe(duration.as_secs_f64()); + } +} + pub static LIVE_CONNECTIONS_COUNT: Lazy = Lazy::new(|| { register_int_gauge_vec!( "pageserver_live_connections", @@ -823,11 +845,6 @@ impl TimelineMetrics { let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id); - // TODO(chi): remove this once we remove Lazy for all metrics. Otherwise this will not appear in the exporter - // and integration test will error. - MATERIALIZED_PAGE_CACHE_HIT_DIRECT.get(); - MATERIALIZED_PAGE_CACHE_HIT.get(); - TimelineMetrics { tenant_id, timeline_id, @@ -1302,4 +1319,8 @@ pub fn preinitialize_metrics() { // Same as above for this metric, but, it's a Vec-type metric for which we don't know all the labels. BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT.reset(); + + // Python tests need these. + MATERIALIZED_PAGE_CACHE_HIT_DIRECT.get(); + MATERIALIZED_PAGE_CACHE_HIT.get(); } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 9e9285a009..d32518b513 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -390,7 +390,9 @@ impl PageServerHandler { }; // Check that the timeline exists - let timeline = tenant.get_timeline(timeline_id, true)?; + let timeline = tenant + .get_timeline(timeline_id, true) + .map_err(|e| anyhow::anyhow!(e))?; // switch client to COPYBOTH pgb.write_message_noflush(&BeMessage::CopyBothResponse)?; @@ -911,10 +913,24 @@ where None }; - // Check that the timeline exists - self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false, ctx) - .await?; - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + metrics::metric_vec_duration::observe_async_block_duration_by_result( + &*crate::metrics::BASEBACKUP_QUERY_TIME, + async move { + self.handle_basebackup_request( + pgb, + tenant_id, + timeline_id, + lsn, + None, + false, + ctx, + ) + .await?; + pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + anyhow::Ok(()) + }, + ) + .await?; } // return pair of prev_lsn and last_lsn else if query_string.starts_with("get_last_record_rlsn ") { @@ -1230,6 +1246,6 @@ async fn get_active_tenant_timeline( .map_err(GetActiveTimelineError::Tenant)?; let timeline = tenant .get_timeline(timeline_id, true) - .map_err(GetActiveTimelineError::Timeline)?; + .map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?; Ok(timeline) } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 7fdd047c96..0e8d6b1287 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -421,6 +421,21 @@ remote: } } +#[derive(Debug, thiserror::Error, PartialEq, Eq)] +pub enum GetTimelineError { + #[error("Timeline {tenant_id}/{timeline_id} is not active, state: {state:?}")] + NotActive { + tenant_id: TenantId, + timeline_id: TimelineId, + state: TimelineState, + }, + #[error("Timeline {tenant_id}/{timeline_id} was not found")] + NotFound { + tenant_id: TenantId, + timeline_id: TimelineId, + }, +} + #[derive(Debug, thiserror::Error)] pub enum DeleteTimelineError { 
#[error("NotFound")] @@ -946,6 +961,117 @@ impl Tenant { tenant } + pub fn scan_and_sort_timelines_dir( + self: Arc, + ) -> anyhow::Result> { + let timelines_dir = self.conf.timelines_path(&self.tenant_id); + let mut timelines_to_load: HashMap = HashMap::new(); + + for entry in + std::fs::read_dir(&timelines_dir).context("list timelines directory for tenant")? + { + let entry = entry.context("read timeline dir entry")?; + let timeline_dir = entry.path(); + + if crate::is_temporary(&timeline_dir) { + info!( + "Found temporary timeline directory, removing: {}", + timeline_dir.display() + ); + if let Err(e) = std::fs::remove_dir_all(&timeline_dir) { + error!( + "Failed to remove temporary directory '{}': {:?}", + timeline_dir.display(), + e + ); + } + } else if is_uninit_mark(&timeline_dir) { + if !timeline_dir.exists() { + warn!( + "Timeline dir entry become invalid: {}", + timeline_dir.display() + ); + continue; + } + let timeline_uninit_mark_file = &timeline_dir; + info!( + "Found an uninit mark file {}, removing the timeline and its uninit mark", + timeline_uninit_mark_file.display() + ); + let timeline_id = timeline_uninit_mark_file + .file_stem() + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::() + .with_context(|| { + format!( + "Could not parse timeline id out of the timeline uninit mark name {}", + timeline_uninit_mark_file.display() + ) + })?; + let timeline_dir = self.conf.timeline_path(&timeline_id, &self.tenant_id); + if let Err(e) = + remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file) + { + error!("Failed to clean up uninit marked timeline: {e:?}"); + } + } else { + if !timeline_dir.exists() { + warn!( + "Timeline dir entry become invalid: {}", + timeline_dir.display() + ); + continue; + } + let timeline_id = timeline_dir + .file_name() + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::() + .with_context(|| { + format!( + "Could not parse timeline id out of the timeline dir name {}", + timeline_dir.display() + ) + })?; + let timeline_uninit_mark_file = self + .conf + .timeline_uninit_mark_file_path(self.tenant_id, timeline_id); + if timeline_uninit_mark_file.exists() { + info!( + %timeline_id, + "Found an uninit mark file, removing the timeline and its uninit mark", + ); + if let Err(e) = + remove_timeline_and_uninit_mark(&timeline_dir, &timeline_uninit_mark_file) + { + error!("Failed to clean up uninit marked timeline: {e:?}"); + } + continue; + } + + let file_name = entry.file_name(); + if let Ok(timeline_id) = + file_name.to_str().unwrap_or_default().parse::() + { + let metadata = load_metadata(self.conf, timeline_id, self.tenant_id) + .context("failed to load metadata")?; + timelines_to_load.insert(timeline_id, metadata); + } else { + // A file or directory that doesn't look like a timeline ID + warn!( + "unexpected file or directory in timelines directory: {}", + file_name.to_string_lossy() + ); + } + } + } + + // Sort the array of timeline IDs into tree-order, so that parent comes before + // all its children. + tree_sort_timelines(timelines_to_load) + } + /// /// Background task to load in-memory data structures for this tenant, from /// files on disk. Used at pageserver startup. @@ -962,110 +1088,16 @@ impl Tenant { utils::failpoint_sleep_millis_async!("before-loading-tenant"); - // TODO split this into two functions, scan and actual load - // Load in-memory state to reflect the local files on disk // // Scan the directory, peek into the metadata file of each timeline, and // collect a list of timelines and their ancestors. 
- let tenant_id = self.tenant_id; - let conf = self.conf; let span = info_span!("blocking"); + let cloned = Arc::clone(self); let sorted_timelines: Vec<(_, _)> = tokio::task::spawn_blocking(move || { let _g = span.entered(); - let mut timelines_to_load: HashMap = HashMap::new(); - let timelines_dir = conf.timelines_path(&tenant_id); - - for entry in - std::fs::read_dir(&timelines_dir).context("list timelines directory for tenant")? - { - let entry = entry.context("read timeline dir entry")?; - let timeline_dir = entry.path(); - - if crate::is_temporary(&timeline_dir) { - info!( - "Found temporary timeline directory, removing: {}", - timeline_dir.display() - ); - if let Err(e) = std::fs::remove_dir_all(&timeline_dir) { - error!( - "Failed to remove temporary directory '{}': {:?}", - timeline_dir.display(), - e - ); - } - } else if is_uninit_mark(&timeline_dir) { - let timeline_uninit_mark_file = &timeline_dir; - info!( - "Found an uninit mark file {}, removing the timeline and its uninit mark", - timeline_uninit_mark_file.display() - ); - let timeline_id = timeline_uninit_mark_file - .file_stem() - .and_then(OsStr::to_str) - .unwrap_or_default() - .parse::() - .with_context(|| { - format!( - "Could not parse timeline id out of the timeline uninit mark name {}", - timeline_uninit_mark_file.display() - ) - })?; - let timeline_dir = conf.timeline_path(&timeline_id, &tenant_id); - if let Err(e) = - remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file) - { - error!("Failed to clean up uninit marked timeline: {e:?}"); - } - } else { - let timeline_id = timeline_dir - .file_name() - .and_then(OsStr::to_str) - .unwrap_or_default() - .parse::() - .with_context(|| { - format!( - "Could not parse timeline id out of the timeline dir name {}", - timeline_dir.display() - ) - })?; - let timeline_uninit_mark_file = - conf.timeline_uninit_mark_file_path(tenant_id, timeline_id); - if timeline_uninit_mark_file.exists() { - info!( - %timeline_id, - "Found an uninit mark file, removing the timeline and its uninit mark", - ); - if let Err(e) = remove_timeline_and_uninit_mark( - &timeline_dir, - &timeline_uninit_mark_file, - ) { - error!("Failed to clean up uninit marked timeline: {e:?}"); - } - continue; - } - - let file_name = entry.file_name(); - if let Ok(timeline_id) = - file_name.to_str().unwrap_or_default().parse::() - { - let metadata = load_metadata(conf, timeline_id, tenant_id) - .context("failed to load metadata")?; - timelines_to_load.insert(timeline_id, metadata); - } else { - // A file or directory that doesn't look like a timeline ID - warn!( - "unexpected file or directory in timelines directory: {}", - file_name.to_string_lossy() - ); - } - } - } - - // Sort the array of timeline IDs into tree-order, so that parent comes before - // all its children. 
- tree_sort_timelines(timelines_to_load) + cloned.scan_and_sort_timelines_dir() }) .await .context("load spawn_blocking") @@ -1213,19 +1245,21 @@ impl Tenant { &self, timeline_id: TimelineId, active_only: bool, - ) -> anyhow::Result> { + ) -> Result, GetTimelineError> { let timelines_accessor = self.timelines.lock().unwrap(); - let timeline = timelines_accessor.get(&timeline_id).with_context(|| { - format!("Timeline {}/{} was not found", self.tenant_id, timeline_id) - })?; + let timeline = timelines_accessor + .get(&timeline_id) + .ok_or(GetTimelineError::NotFound { + tenant_id: self.tenant_id, + timeline_id, + })?; if active_only && !timeline.is_active() { - anyhow::bail!( - "Timeline {}/{} is not active, state: {:?}", - self.tenant_id, + Err(GetTimelineError::NotActive { + tenant_id: self.tenant_id, timeline_id, - timeline.current_state() - ) + state: timeline.current_state(), + }) } else { Ok(Arc::clone(timeline)) } @@ -3375,9 +3409,8 @@ where #[cfg(test)] pub mod harness { use bytes::{Bytes, BytesMut}; - use once_cell::sync::Lazy; use once_cell::sync::OnceCell; - use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}; + use std::sync::Arc; use std::{fs, path::PathBuf}; use utils::logging; use utils::lsn::Lsn; @@ -3410,8 +3443,6 @@ pub mod harness { buf.freeze() } - static LOCK: Lazy> = Lazy::new(|| RwLock::new(())); - impl From for TenantConfOpt { fn from(tenant_conf: TenantConf) -> Self { Self { @@ -3438,33 +3469,16 @@ pub mod harness { } } - pub struct TenantHarness<'a> { + pub struct TenantHarness { pub conf: &'static PageServerConf, pub tenant_conf: TenantConf, pub tenant_id: TenantId, - - pub lock_guard: ( - Option>, - Option>, - ), } static LOG_HANDLE: OnceCell<()> = OnceCell::new(); - impl<'a> TenantHarness<'a> { + impl TenantHarness { pub fn create(test_name: &'static str) -> anyhow::Result { - Self::create_internal(test_name, false) - } - pub fn create_exclusive(test_name: &'static str) -> anyhow::Result { - Self::create_internal(test_name, true) - } - fn create_internal(test_name: &'static str, exclusive: bool) -> anyhow::Result { - let lock_guard = if exclusive { - (None, Some(LOCK.write().unwrap())) - } else { - (Some(LOCK.read().unwrap()), None) - }; - LOG_HANDLE.get_or_init(|| { logging::init( logging::LogFormat::Test, @@ -3500,7 +3514,6 @@ pub mod harness { conf, tenant_conf, tenant_id, - lock_guard, }) } @@ -3525,26 +3538,12 @@ pub mod harness { self.tenant_id, None, )); - // populate tenant with locally available timelines - let mut timelines_to_load = HashMap::new(); - for timeline_dir_entry in fs::read_dir(self.conf.timelines_path(&self.tenant_id)) - .expect("should be able to read timelines dir") - { - let timeline_dir_entry = timeline_dir_entry?; - let timeline_id: TimelineId = timeline_dir_entry - .path() - .file_name() - .unwrap() - .to_string_lossy() - .parse()?; - - let timeline_metadata = load_metadata(self.conf, timeline_id, self.tenant_id)?; - timelines_to_load.insert(timeline_id, timeline_metadata); - } tenant .load(None, ctx) .instrument(info_span!("try_load", tenant_id=%self.tenant_id)) .await?; + + // TODO reuse Tenant::activate (needs broker) tenant.state.send_replace(TenantState::Active); for timeline in tenant.timelines.lock().unwrap().values() { timeline.set_state(TimelineState::Active); @@ -4070,9 +4069,13 @@ mod tests { std::fs::write(metadata_path, metadata_bytes)?; let err = harness.try_load(&ctx).await.err().expect("should fail"); - assert!(err - .to_string() - .starts_with("Failed to parse metadata bytes from path")); + // get all the 
stack with all .context, not only the last one + let message = format!("{err:#}"); + let expected = "Failed to parse metadata bytes from path"; + assert!( + message.contains(expected), + "message '{message}' expected to contain {expected}" + ); let mut found_error_message = false; let mut err_source = err.source(); @@ -4506,6 +4509,44 @@ mod tests { assert!(expect_initdb_optimization); assert!(initdb_optimization_count > 0); } + Ok(()) + } + + #[tokio::test] + async fn test_uninit_mark_crash() -> anyhow::Result<()> { + let name = "test_uninit_mark_crash"; + let harness = TenantHarness::create(name)?; + { + let (tenant, ctx) = harness.load().await; + let tline = + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + // Keeps uninit mark in place + std::mem::forget(tline); + } + + let (tenant, _) = harness.load().await; + match tenant.get_timeline(TIMELINE_ID, false) { + Ok(_) => panic!("timeline should've been removed during load"), + Err(e) => { + assert_eq!( + e, + GetTimelineError::NotFound { + tenant_id: tenant.tenant_id, + timeline_id: TIMELINE_ID, + } + ) + } + } + + assert!(!harness + .conf + .timeline_path(&TIMELINE_ID, &tenant.tenant_id) + .exists()); + + assert!(!harness + .conf + .timeline_uninit_mark_file_path(tenant.tenant_id, TIMELINE_ID) + .exists()); Ok(()) } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 7e123c3fbd..09b825d2e9 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -675,7 +675,7 @@ pub async fn immediate_gc( .get(&tenant_id) .map(Arc::clone) .with_context(|| format!("tenant {tenant_id}")) - .map_err(ApiError::NotFound)?; + .map_err(|e| ApiError::NotFound(e.into()))?; let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon()); // Use tenant's pitr setting @@ -724,11 +724,11 @@ pub async fn immediate_compact( .get(&tenant_id) .map(Arc::clone) .with_context(|| format!("tenant {tenant_id}")) - .map_err(ApiError::NotFound)?; + .map_err(|e| ApiError::NotFound(e.into()))?; let timeline = tenant .get_timeline(timeline_id, true) - .map_err(ApiError::NotFound)?; + .map_err(|e| ApiError::NotFound(e.into()))?; // Run in task_mgr to avoid race with tenant_detach operation let ctx = ctx.detached_child(TaskKind::Compaction, DownloadBehavior::Download); diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 8db2bc4eb2..7808b64d35 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1367,7 +1367,7 @@ mod tests { struct TestSetup { runtime: &'static tokio::runtime::Runtime, entered_runtime: EnterGuard<'static>, - harness: TenantHarness<'static>, + harness: TenantHarness, tenant: Arc, tenant_ctx: RequestContext, remote_fs_dir: PathBuf, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index de786da322..060000a01a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -129,7 +129,7 @@ pub struct Timeline { pub pg_version: u32, - pub(crate) layers: tokio::sync::RwLock>, + pub(crate) layers: Arc>>, /// Set of key ranges which should be covered by image layers to /// allow GC to remove old layers. This set is created by GC and its cutoff LSN is also stored. 
@@ -1418,7 +1418,7 @@ impl Timeline { timeline_id, tenant_id, pg_version, - layers: tokio::sync::RwLock::new(LayerMap::default()), + layers: Arc::new(tokio::sync::RwLock::new(LayerMap::default())), wanted_image_layers: Mutex::new(None), walredo_mgr, @@ -3370,14 +3370,14 @@ struct CompactLevel0Phase1StatsBuilder { version: Option, tenant_id: Option, timeline_id: Option, - first_read_lock_acquisition_micros: DurationRecorder, - get_level0_deltas_plus_drop_lock_micros: DurationRecorder, - level0_deltas_count: Option, - time_spent_between_locks: DurationRecorder, - second_read_lock_acquisition_micros: DurationRecorder, - second_read_lock_held_micros: DurationRecorder, - sort_holes_micros: DurationRecorder, + read_lock_acquisition_micros: DurationRecorder, + read_lock_held_spawn_blocking_startup_micros: DurationRecorder, + read_lock_held_prerequisites_micros: DurationRecorder, + read_lock_held_compute_holes_micros: DurationRecorder, + read_lock_drop_micros: DurationRecorder, + prepare_iterators_micros: DurationRecorder, write_layer_files_micros: DurationRecorder, + level0_deltas_count: Option, new_deltas_count: Option, new_deltas_size: Option, } @@ -3390,14 +3390,14 @@ struct CompactLevel0Phase1Stats { tenant_id: TenantId, #[serde_as(as = "serde_with::DisplayFromStr")] timeline_id: TimelineId, - first_read_lock_acquisition_micros: RecordedDuration, - get_level0_deltas_plus_drop_lock_micros: RecordedDuration, - level0_deltas_count: usize, - time_spent_between_locks: RecordedDuration, - second_read_lock_acquisition_micros: RecordedDuration, - second_read_lock_held_micros: RecordedDuration, - sort_holes_micros: RecordedDuration, + read_lock_acquisition_micros: RecordedDuration, + read_lock_held_spawn_blocking_startup_micros: RecordedDuration, + read_lock_held_prerequisites_micros: RecordedDuration, + read_lock_held_compute_holes_micros: RecordedDuration, + read_lock_drop_micros: RecordedDuration, + prepare_iterators_micros: RecordedDuration, write_layer_files_micros: RecordedDuration, + level0_deltas_count: usize, new_deltas_count: usize, new_deltas_size: u64, } @@ -3406,54 +3406,51 @@ impl TryFrom for CompactLevel0Phase1Stats { type Error = anyhow::Error; fn try_from(value: CompactLevel0Phase1StatsBuilder) -> Result { - let CompactLevel0Phase1StatsBuilder { - version, - tenant_id, - timeline_id, - first_read_lock_acquisition_micros, - get_level0_deltas_plus_drop_lock_micros, - level0_deltas_count, - time_spent_between_locks, - second_read_lock_acquisition_micros, - second_read_lock_held_micros, - sort_holes_micros, - write_layer_files_micros, - new_deltas_count, - new_deltas_size, - } = value; - Ok(CompactLevel0Phase1Stats { - version: version.ok_or_else(|| anyhow::anyhow!("version not set"))?, - tenant_id: tenant_id.ok_or_else(|| anyhow::anyhow!("tenant_id not set"))?, - timeline_id: timeline_id.ok_or_else(|| anyhow::anyhow!("timeline_id not set"))?, - first_read_lock_acquisition_micros: first_read_lock_acquisition_micros + Ok(Self { + version: value.version.ok_or_else(|| anyhow!("version not set"))?, + tenant_id: value + .tenant_id + .ok_or_else(|| anyhow!("tenant_id not set"))?, + timeline_id: value + .timeline_id + .ok_or_else(|| anyhow!("timeline_id not set"))?, + read_lock_acquisition_micros: value + .read_lock_acquisition_micros .into_recorded() - .ok_or_else(|| anyhow::anyhow!("first_read_lock_acquisition_micros not set"))?, - get_level0_deltas_plus_drop_lock_micros: get_level0_deltas_plus_drop_lock_micros + .ok_or_else(|| anyhow!("read_lock_acquisition_micros not set"))?, + 
read_lock_held_spawn_blocking_startup_micros: value + .read_lock_held_spawn_blocking_startup_micros .into_recorded() - .ok_or_else(|| { - anyhow::anyhow!("get_level0_deltas_plus_drop_lock_micros not set") - })?, - level0_deltas_count: level0_deltas_count - .ok_or_else(|| anyhow::anyhow!("level0_deltas_count not set"))?, - time_spent_between_locks: time_spent_between_locks + .ok_or_else(|| anyhow!("read_lock_held_spawn_blocking_startup_micros not set"))?, + read_lock_held_prerequisites_micros: value + .read_lock_held_prerequisites_micros .into_recorded() - .ok_or_else(|| anyhow::anyhow!("time_spent_between_locks not set"))?, - second_read_lock_acquisition_micros: second_read_lock_acquisition_micros + .ok_or_else(|| anyhow!("read_lock_held_prerequisites_micros not set"))?, + read_lock_held_compute_holes_micros: value + .read_lock_held_compute_holes_micros .into_recorded() - .ok_or_else(|| anyhow::anyhow!("second_read_lock_acquisition_micros not set"))?, - second_read_lock_held_micros: second_read_lock_held_micros + .ok_or_else(|| anyhow!("read_lock_held_compute_holes_micros not set"))?, + read_lock_drop_micros: value + .read_lock_drop_micros .into_recorded() - .ok_or_else(|| anyhow::anyhow!("second_read_lock_held_micros not set"))?, - sort_holes_micros: sort_holes_micros + .ok_or_else(|| anyhow!("read_lock_drop_micros not set"))?, + prepare_iterators_micros: value + .prepare_iterators_micros .into_recorded() - .ok_or_else(|| anyhow::anyhow!("sort_holes_micros not set"))?, - write_layer_files_micros: write_layer_files_micros + .ok_or_else(|| anyhow!("prepare_iterators_micros not set"))?, + write_layer_files_micros: value + .write_layer_files_micros .into_recorded() - .ok_or_else(|| anyhow::anyhow!("write_layer_files_micros not set"))?, - new_deltas_count: new_deltas_count - .ok_or_else(|| anyhow::anyhow!("new_deltas_count not set"))?, - new_deltas_size: new_deltas_size - .ok_or_else(|| anyhow::anyhow!("new_deltas_size not set"))?, + .ok_or_else(|| anyhow!("write_layer_files_micros not set"))?, + level0_deltas_count: value + .level0_deltas_count + .ok_or_else(|| anyhow!("level0_deltas_count not set"))?, + new_deltas_count: value + .new_deltas_count + .ok_or_else(|| anyhow!("new_deltas_count not set"))?, + new_deltas_size: value + .new_deltas_size + .ok_or_else(|| anyhow!("new_deltas_size not set"))?, }) } } @@ -3464,30 +3461,18 @@ impl Timeline { /// This method takes the `_layer_removal_cs` guard to highlight it required downloads are /// returned as an error. If the `layer_removal_cs` boundary is changed not to be taken in the /// start of level0 files compaction, the on-demand download should be revisited as well. 
- async fn compact_level0_phase1( - &self, + fn compact_level0_phase1( + self: Arc, _layer_removal_cs: Arc>, + layers: tokio::sync::OwnedRwLockReadGuard>, + mut stats: CompactLevel0Phase1StatsBuilder, target_file_size: u64, ctx: &RequestContext, ) -> Result { - let mut stats = CompactLevel0Phase1StatsBuilder { - version: Some(1), - tenant_id: Some(self.tenant_id), - timeline_id: Some(self.timeline_id), - ..Default::default() - }; - - let begin = tokio::time::Instant::now(); - let layers = self.layers.read().await; - let now = tokio::time::Instant::now(); - stats.first_read_lock_acquisition_micros = - DurationRecorder::Recorded(RecordedDuration(now - begin), now); + stats.read_lock_held_spawn_blocking_startup_micros = + stats.read_lock_acquisition_micros.till_now(); // set by caller let mut level0_deltas = layers.get_level0_deltas()?; - drop(layers); stats.level0_deltas_count = Some(level0_deltas.len()); - stats.get_level0_deltas_plus_drop_lock_micros = - stats.first_read_lock_acquisition_micros.till_now(); - // Only compact if enough layers have accumulated. let threshold = self.get_compaction_threshold(); if level0_deltas.is_empty() || level0_deltas.len() < threshold { @@ -3565,6 +3550,53 @@ impl Timeline { // we don't accidentally use it later in the function. drop(level0_deltas); + stats.read_lock_held_prerequisites_micros = stats + .read_lock_held_spawn_blocking_startup_micros + .till_now(); + + // Determine N largest holes where N is number of compacted layers. + let max_holes = deltas_to_compact.len(); + let last_record_lsn = self.get_last_record_lsn(); + let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128; + let min_hole_coverage_size = 3; // TODO: something more flexible? + + // min-heap (reserve space for one more element added before eviction) + let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); + let mut prev: Option = None; + for (next_key, _next_lsn, _size) in itertools::process_results( + deltas_to_compact.iter().map(|l| l.key_iter(ctx)), + |iter_iter| iter_iter.kmerge_by(|a, b| a.0 <= b.0), + )? { + if let Some(prev_key) = prev { + // just first fast filter + if next_key.to_i128() - prev_key.to_i128() >= min_hole_range { + let key_range = prev_key..next_key; + // Measuring hole by just subtraction of i128 representation of key range boundaries + // has not so much sense, because largest holes will corresponds field1/field2 changes. + // But we are mostly interested to eliminate holes which cause generation of excessive image layers. + // That is why it is better to measure size of hole as number of covering image layers. + let coverage_size = layers.image_coverage(&key_range, last_record_lsn)?.len(); + if coverage_size >= min_hole_coverage_size { + heap.push(Hole { + key_range, + coverage_size, + }); + if heap.len() > max_holes { + heap.pop(); // remove smallest hole + } + } + } + } + prev = Some(next_key.next()); + } + stats.read_lock_held_compute_holes_micros = + stats.read_lock_held_prerequisites_micros.till_now(); + drop(layers); + stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now(); + let mut holes = heap.into_vec(); + holes.sort_unstable_by_key(|hole| hole.key_range.start); + let mut next_hole = 0; // index of next hole in holes vector + // This iterator walks through all key-value pairs from all the layers // we're compacting, in key, LSN order. 
let all_values_iter = itertools::process_results( @@ -3604,50 +3636,7 @@ impl Timeline { }, )?; - // Determine N largest holes where N is number of compacted layers. - let max_holes = deltas_to_compact.len(); - let last_record_lsn = self.get_last_record_lsn(); - stats.time_spent_between_locks = stats.get_level0_deltas_plus_drop_lock_micros.till_now(); - let layers = self.layers.read().await; // Is'n it better to hold original layers lock till here? - stats.second_read_lock_acquisition_micros = stats.time_spent_between_locks.till_now(); - let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128; - let min_hole_coverage_size = 3; // TODO: something more flexible? - - // min-heap (reserve space for one more element added before eviction) - let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); - let mut prev: Option = None; - for (next_key, _next_lsn, _size) in itertools::process_results( - deltas_to_compact.iter().map(|l| l.key_iter(ctx)), - |iter_iter| iter_iter.kmerge_by(|a, b| a.0 <= b.0), - )? { - if let Some(prev_key) = prev { - // just first fast filter - if next_key.to_i128() - prev_key.to_i128() >= min_hole_range { - let key_range = prev_key..next_key; - // Measuring hole by just subtraction of i128 representation of key range boundaries - // has not so much sense, because largest holes will corresponds field1/field2 changes. - // But we are mostly interested to eliminate holes which cause generation of excessive image layers. - // That is why it is better to measure size of hole as number of covering image layers. - let coverage_size = layers.image_coverage(&key_range, last_record_lsn)?.len(); - if coverage_size >= min_hole_coverage_size { - heap.push(Hole { - key_range, - coverage_size, - }); - if heap.len() > max_holes { - heap.pop(); // remove smallest hole - } - } - } - } - prev = Some(next_key.next()); - } - drop(layers); - stats.second_read_lock_held_micros = stats.second_read_lock_acquisition_micros.till_now(); - let mut holes = heap.into_vec(); - holes.sort_unstable_by_key(|hole| hole.key_range.start); - let mut next_hole = 0; // index of next hole in holes vector - stats.sort_holes_micros = stats.second_read_lock_held_micros.till_now(); + stats.prepare_iterators_micros = stats.read_lock_drop_micros.till_now(); // Merge the contents of all the input delta layers into a new set // of delta layers, based on the current partitioning. 
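The hunks above restructure phase 1 of level-0 compaction so that it runs on a blocking thread: the caller takes the layer map read lock as an owned guard via Arc::clone(&self.layers).read_owned().await, pre-fills the CompactLevel0Phase1StatsBuilder, and hands both into compact_level0_phase1 inside tokio::task::spawn_blocking. The sketch below is a minimal, self-contained illustration of that pattern, not the pageserver code itself; the Vec<u64> and the summing closure are stand-ins for the real layer map and compaction work. The point it shows is that tokio's OwnedRwLockReadGuard is 'static, so it can be moved into the blocking closure and keeps the read lock held while the blocking work runs.

use std::sync::Arc;

#[tokio::main]
async fn main() {
    let layers = Arc::new(tokio::sync::RwLock::new(vec![1u64, 2, 3]));

    // `read_owned` consumes an Arc clone and returns an OwnedRwLockReadGuard,
    // which owns its handle on the lock and can therefore be moved into
    // spawn_blocking; a plain `read()` guard borrows the lock and cannot.
    let guard = Arc::clone(&layers).read_owned().await;

    let sum = tokio::task::spawn_blocking(move || {
        // CPU-heavy work runs here while the read lock is still held.
        guard.iter().sum::<u64>()
    })
    .await
    .expect("spawn_blocking task panicked");

    assert_eq!(sum, 6);
}

With a single lock acquisition there is no second_read_lock_* phase any more, which is why the stats builder now records read_lock_acquisition (set by the caller), the time the lock is held across spawn_blocking startup, prerequisites, and hole computation, and finally read_lock_drop.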
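Within that same locked section, the hole-detection loop that moved earlier in the function keeps only the N largest holes: each candidate is pushed into a min-heap and, once the heap grows past N entries, the smallest hole is popped again. The following is a minimal sketch of that top-N-via-capped-min-heap pattern; it is a self-contained generic version in which plain u64 sizes stand in for the Hole struct and its coverage-size ordering, and std::cmp::Reverse turns Rust's max-heap BinaryHeap into a min-heap.

use std::cmp::Reverse;
use std::collections::BinaryHeap;

/// Keep only the `n` largest values seen, using a min-heap capped at n + 1:
/// push each candidate, then pop the current minimum if the heap grew past `n`.
fn n_largest(values: impl IntoIterator<Item = u64>, n: usize) -> Vec<u64> {
    // Reserve one extra slot so a candidate can be pushed before the smallest
    // element is evicted, mirroring the `max_holes + 1` capacity above.
    let mut heap: BinaryHeap<Reverse<u64>> = BinaryHeap::with_capacity(n + 1);
    for v in values {
        heap.push(Reverse(v));
        if heap.len() > n {
            heap.pop(); // drop the smallest of the retained values
        }
    }
    let mut out: Vec<u64> = heap.into_iter().map(|Reverse(v)| v).collect();
    out.sort_unstable();
    out
}

fn main() {
    assert_eq!(n_largest([5, 1, 9, 3, 7], 3), vec![5, 7, 9]);
}

In the pageserver code the min-heap behaviour presumably comes from Hole's Ord implementation (not shown in these hunks) rather than from Reverse, but the push-then-evict shape is the same, and the retained holes are then sorted by key range start before being consumed during layer writing.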
@@ -3807,7 +3796,7 @@ impl Timeline { layer_paths.pop().unwrap(); } - stats.write_layer_files_micros = stats.sort_holes_micros.till_now(); + stats.write_layer_files_micros = stats.prepare_iterators_micros.till_now(); stats.new_deltas_count = Some(new_layers.len()); stats.new_deltas_size = Some(new_layers.iter().map(|l| l.desc.file_size).sum()); @@ -3846,9 +3835,36 @@ impl Timeline { let CompactLevel0Phase1Result { new_layers, deltas_to_compact, - } = self - .compact_level0_phase1(layer_removal_cs.clone(), target_file_size, ctx) - .await?; + } = { + let phase1_span = info_span!("compact_level0_phase1"); + let myself = Arc::clone(self); + let ctx = ctx.attached_child(); // technically, the spawn_blocking can outlive this future + let mut stats = CompactLevel0Phase1StatsBuilder { + version: Some(2), + tenant_id: Some(self.tenant_id), + timeline_id: Some(self.timeline_id), + ..Default::default() + }; + + let begin = tokio::time::Instant::now(); + let phase1_layers_locked = Arc::clone(&self.layers).read_owned().await; + let now = tokio::time::Instant::now(); + stats.read_lock_acquisition_micros = + DurationRecorder::Recorded(RecordedDuration(now - begin), now); + let layer_removal_cs = layer_removal_cs.clone(); + tokio::task::spawn_blocking(move || { + let _entered = phase1_span.enter(); + myself.compact_level0_phase1( + layer_removal_cs, + phase1_layers_locked, + stats, + target_file_size, + &ctx, + ) + }) + .await + .context("spawn_blocking")?? + }; if new_layers.is_empty() && deltas_to_compact.is_empty() { // nothing to do @@ -3953,7 +3969,7 @@ impl Timeline { /// for example. The caller should hold `Tenant::gc_cs` lock to ensure /// that. /// - #[instrument(skip_all, fields(timline_id=%self.timeline_id))] + #[instrument(skip_all, fields(timeline_id=%self.timeline_id))] pub(super) async fn update_gc_info( &self, retain_lsns: Vec, diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 83dfc5f598..fa23ae765d 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -1321,7 +1321,7 @@ mod tests { const DUMMY_SAFEKEEPER_HOST: &str = "safekeeper_connstr"; - async fn dummy_state(harness: &TenantHarness<'_>) -> ConnectionManagerState { + async fn dummy_state(harness: &TenantHarness) -> ConnectionManagerState { let (tenant, ctx) = harness.load().await; let timeline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x8), crate::DEFAULT_PG_VERSION, &ctx) diff --git a/rust-toolchain.toml b/rust-toolchain.toml index c39ba4f417..6abb435018 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.68.2" +channel = "1.70.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
# https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 52c3e8d4be..30036cc7f2 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -266,7 +266,7 @@ impl From for ApiError { fn from(te: TimelineError) -> ApiError { match te { TimelineError::NotFound(ttid) => { - ApiError::NotFound(anyhow!("timeline {} not found", ttid)) + ApiError::NotFound(anyhow!("timeline {} not found", ttid).into()) } _ => ApiError::InternalServerError(anyhow!("{}", te)), } diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index d55d159037..7ee3c33f92 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -62,6 +62,7 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = ( "pageserver_getpage_reconstruct_seconds_bucket", "pageserver_getpage_reconstruct_seconds_count", "pageserver_getpage_reconstruct_seconds_sum", + *[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]], ) PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 64c71d2a59..e56bf78019 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2415,6 +2415,17 @@ class Endpoint(PgProtocol): return self + def respec(self, **kwargs): + """Update the endpoint.json file used by control_plane.""" + # Read config + config_path = os.path.join(self.endpoint_path(), "endpoint.json") + with open(config_path, "r") as f: + data_dict = json.load(f) + + # Write it back updated + with open(config_path, "w") as file: + json.dump(dict(data_dict, **kwargs), file, indent=4) + def stop(self) -> "Endpoint": """ Stop the Postgres instance if it's running. diff --git a/test_runner/performance/test_startup.py b/test_runner/performance/test_startup.py index 9c45088d62..8babbbe132 100644 --- a/test_runner/performance/test_startup.py +++ b/test_runner/performance/test_startup.py @@ -32,13 +32,18 @@ def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenc env.neon_cli.create_branch("test_startup") + endpoint = None + # We do two iterations so we can see if the second startup is faster. It should # be because the compute node should already be configured with roles, databases, # extensions, etc from the first run. for i in range(2): # Start with zenbenchmark.record_duration(f"{i}_start_and_select"): - endpoint = env.endpoints.create_start("test_startup") + if endpoint: + endpoint.start() + else: + endpoint = env.endpoints.create_start("test_startup") endpoint.safe_psql("select 1;") # Get metrics @@ -57,6 +62,9 @@ def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenc # Stop so we can restart endpoint.stop() + # Imitate optimizations that console would do for the second start + endpoint.respec(skip_pg_catalog_updates=True) + # This test sometimes runs for longer than the global 5 minute timeout. 
@pytest.mark.timeout(600) diff --git a/test_runner/pg_clients/python/pg8000/requirements.txt b/test_runner/pg_clients/python/pg8000/requirements.txt index 7bba8da06d..a8407c3cb0 100644 --- a/test_runner/pg_clients/python/pg8000/requirements.txt +++ b/test_runner/pg_clients/python/pg8000/requirements.txt @@ -1,2 +1,2 @@ -pg8000==1.29.4 +pg8000==1.29.8 scramp>=1.4.3 diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock index 30deb3ff20..bdbbe0ad69 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock @@ -396,9 +396,9 @@ checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" [[package]] name = "openssl" -version = "0.10.52" +version = "0.10.55" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01b8574602df80f7b85fdfc5392fa884a4e3b3f4f35402c070ab34c3d3f78d56" +checksum = "345df152bc43501c5eb9e4654ff05f794effb78d4efe3d53abc158baddc0703d" dependencies = [ "bitflags", "cfg-if", @@ -428,9 +428,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.87" +version = "0.9.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e17f59264b2809d77ae94f0e1ebabc434773f370d6ca667bd223ea10e06cc7e" +checksum = "374533b0e45f3a7ced10fcaeccca020e66656bc03dac384f852e4e5a7a8104a6" dependencies = [ "cc", "libc", diff --git a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile index 43fc6f6c92..35ae25a470 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile +++ b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile @@ -1,4 +1,4 @@ -FROM rust:1.69 +FROM rust:1.70 WORKDIR /source COPY . . 
diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved index cc12acda4c..9f13106011 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved @@ -5,8 +5,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/vapor/postgres-nio.git", "state" : { - "revision" : "dbf9c2eb596df39cba8ff3f74d74b2e6a31bd937", - "version" : "1.14.1" + "revision" : "061a0836d7c1887e04a975d1d2eaa2ef5fd7dfab", + "version" : "1.16.0" } }, { @@ -59,8 +59,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-nio.git", "state" : { - "revision" : "d1690f85419fdac8d54e350fb6d2ab9fd95afd75", - "version" : "2.51.1" + "revision" : "6213ba7a06febe8fef60563a4a7d26a4085783cf", + "version" : "2.54.0" } }, { diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift index ac32b982e2..a80590daa2 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift @@ -4,7 +4,7 @@ import PackageDescription let package = Package( name: "PostgresNIOExample", dependencies: [ - .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.14.1") + .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.16.0") ], targets: [ .executableTarget( diff --git a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json index e4dfd1dd9d..4cedf56acd 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json +++ b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json @@ -5,23 +5,7 @@ "packages": { "": { "dependencies": { - "postgresql-client": "2.5.5" - } - }, - "node_modules/debug": { - "version": "4.3.4", - "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.4.tgz", - "integrity": "sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ==", - "dependencies": { - "ms": "2.1.2" - }, - "engines": { - "node": ">=6.0" - }, - "peerDependenciesMeta": { - "supports-color": { - "optional": true - } + "postgresql-client": "2.5.9" } }, "node_modules/doublylinked": { @@ -41,11 +25,6 @@ "putil-promisify": "^1.8.6" } }, - "node_modules/ms": { - "version": "2.1.2", - "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz", - "integrity": "sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==" - }, "node_modules/obuf": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/obuf/-/obuf-1.1.2.tgz", @@ -63,30 +42,28 @@ } }, "node_modules/postgresql-client": { - "version": "2.5.5", - "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.5.5.tgz", - "integrity": "sha512-2Mu3i+6NQ9cnkoZNd0XeSZo9WoUpuWf4ZSiCCoDWSj82T93py2/SKXZ1aUaP8mVaU0oKpyyGe0IwLYZ1VHShnA==", + "version": "2.5.9", + "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.5.9.tgz", + "integrity": "sha512-s+kgTN6TfWLzehEyxw4Im4odnxVRCbZ0DEJzWS6SLowPAmB2m1/DOiOvZC0+ZVoi5AfbGE6SBqFxKguSyVAXZg==", "dependencies": { - "debug": "^4.3.4", "doublylinked": "^2.5.2", "lightning-pool": "^4.2.1", "postgres-bytea": "^3.0.0", - "power-tasks": "^1.6.4", + "power-tasks": "^1.7.0", "putil-merge": "^3.10.3", "putil-promisify": "^1.10.0", "putil-varhelpers": "^1.6.5" 
}, "engines": { - "node": ">=14.0", + "node": ">=16.0", "npm": ">=7.0.0" } }, "node_modules/power-tasks": { - "version": "1.6.4", - "resolved": "https://registry.npmjs.org/power-tasks/-/power-tasks-1.6.4.tgz", - "integrity": "sha512-LX8GGgEIP1N7jsZqlqZ275e6f1Ehq97APCEGj8uVO0NoEoB+77QUX12BFv3LmlNKfq4fIuNSPiHhyHFjqn2gfA==", + "version": "1.7.0", + "resolved": "https://registry.npmjs.org/power-tasks/-/power-tasks-1.7.0.tgz", + "integrity": "sha512-rndZXCDxhuIDjPUJJvQwBDHaYagCkjvbPF/NA+omh/Ef4rAI9KtnvdA0k98dyiGpn1zXOpc6c2c0JWzg/xAhJg==", "dependencies": { - "debug": "^4.3.4", "doublylinked": "^2.5.2", "strict-typed-events": "^2.3.1" }, @@ -132,9 +109,9 @@ } }, "node_modules/ts-gems": { - "version": "2.3.0", - "resolved": "https://registry.npmjs.org/ts-gems/-/ts-gems-2.3.0.tgz", - "integrity": "sha512-bUvrwrzlct7vfaNvtgMhynDf6lAki/kTtrNsIGhX6l7GJGK3s6b8Ro7dazOLXabV0m2jyShBzDQ8X1+h/C2Cug==" + "version": "2.4.0", + "resolved": "https://registry.npmjs.org/ts-gems/-/ts-gems-2.4.0.tgz", + "integrity": "sha512-SdugYAXoWvbqrxLodIObzxhEKacDxh5LfAJIiIkiH7q5thvuuCzdmkdTVQYf7uEDrEpPhfx4tokDMamdO3be9A==" } } } diff --git a/test_runner/pg_clients/typescript/postgresql-client/package.json b/test_runner/pg_clients/typescript/postgresql-client/package.json index 9eaa13437a..12703ce89f 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/package.json +++ b/test_runner/pg_clients/typescript/postgresql-client/package.json @@ -1,6 +1,6 @@ { "type": "module", "dependencies": { - "postgresql-client": "2.5.5" + "postgresql-client": "2.5.9" } } diff --git a/test_runner/pg_clients/typescript/serverless-driver/Dockerfile b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile index a5ad832a5c..07e98c586b 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/Dockerfile +++ b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile @@ -1,4 +1,4 @@ -FROM node:18 +FROM node:20 WORKDIR /source COPY . . 
diff --git a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json index 0fb84cf5b7..72cc452817 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json +++ b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json @@ -5,16 +5,16 @@ "packages": { "": { "dependencies": { - "@neondatabase/serverless": "0.4.3", + "@neondatabase/serverless": "0.4.18", "ws": "8.13.0" } }, "node_modules/@neondatabase/serverless": { - "version": "0.4.3", - "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.4.3.tgz", - "integrity": "sha512-U8tpuF5f0R5WRsciR7iaJ5S2h54DWa6Z6CEW+J4KgwyvRN3q3qDz0MibdfFXU0WqnRoi/9RSf/2XN4TfeaOCbQ==", + "version": "0.4.18", + "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.4.18.tgz", + "integrity": "sha512-2TZnIyRGC/+0fjZ8TKCzaSTPUD94PM7NBGuantGZbUrbWyqBwGnUoRtdZAQ95qBKVHqORLVfymlv2NE+HQMFeA==", "dependencies": { - "@types/pg": "^8.6.6" + "@types/pg": "8.6.6" } }, "node_modules/@types/node": { diff --git a/test_runner/pg_clients/typescript/serverless-driver/package.json b/test_runner/pg_clients/typescript/serverless-driver/package.json index 71ba181afc..840c7a5c4c 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/package.json +++ b/test_runner/pg_clients/typescript/serverless-driver/package.json @@ -1,7 +1,7 @@ { "type": "module", "dependencies": { - "@neondatabase/serverless": "0.4.3", + "@neondatabase/serverless": "0.4.18", "ws": "8.13.0" } } diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 61f86dc3ce..51e7b01eba 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -2,6 +2,7 @@ import copy import os import shutil import subprocess +import tempfile from pathlib import Path from typing import Any, Optional @@ -448,7 +449,7 @@ def dump_differs(first: Path, second: Path, output: Path) -> bool: """ with output.open("w") as stdout: - rv = subprocess.run( + res = subprocess.run( [ "diff", "--unified", # Make diff output more readable @@ -460,4 +461,53 @@ def dump_differs(first: Path, second: Path, output: Path) -> bool: stdout=stdout, ) - return rv.returncode != 0 + differs = res.returncode != 0 + + # TODO: Remove after https://github.com/neondatabase/neon/pull/4425 is merged, and a couple of releases are made + if differs: + with tempfile.NamedTemporaryFile(mode="w") as tmp: + tmp.write(PR4425_ALLOWED_DIFF) + tmp.flush() + + allowed = subprocess.run( + [ + "diff", + "--unified", # Make diff output more readable + r"--ignore-matching-lines=^---", # Ignore diff headers + r"--ignore-matching-lines=^\+\+\+", # Ignore diff headers + "--ignore-matching-lines=^@@", # Ignore diff blocks location + "--ignore-matching-lines=^ *$", # Ignore lines with only spaces + "--ignore-matching-lines=^ --.*", # Ignore the " --" lines for compatibility with PG14 + "--ignore-blank-lines", + str(output), + str(tmp.name), + ], + ) + + differs = allowed.returncode != 0 + + return differs + + +PR4425_ALLOWED_DIFF = """ +--- /tmp/test_output/test_backward_compatibility[release-pg15]/compatibility_snapshot/dump.sql 2023-06-08 18:12:45.000000000 +0000 ++++ /tmp/test_output/test_backward_compatibility[release-pg15]/dump.sql 2023-06-13 07:25:35.211733653 +0000 +@@ -13,12 +13,20 @@ + + CREATE ROLE cloud_admin; + ALTER ROLE cloud_admin WITH SUPERUSER INHERIT CREATEROLE CREATEDB LOGIN REPLICATION BYPASSRLS; ++CREATE 
ROLE neon_superuser; ++ALTER ROLE neon_superuser WITH NOSUPERUSER INHERIT CREATEROLE CREATEDB NOLOGIN NOREPLICATION NOBYPASSRLS; + + -- + -- User Configurations + -- + + ++-- ++-- Role memberships ++-- ++ ++GRANT pg_read_all_data TO neon_superuser GRANTED BY cloud_admin; ++GRANT pg_write_all_data TO neon_superuser GRANTED BY cloud_admin; +""" diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 12e034cea2..6b003ce356 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -1,3 +1,5 @@ +import time + import pytest from fixtures.neon_fixtures import NeonEnv @@ -10,9 +12,10 @@ def test_hot_standby(neon_simple_env: NeonEnv): branch_name="main", endpoint_id="primary", ) as primary: + time.sleep(1) with env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") as secondary: primary_lsn = None - cought_up = False + caught_up = False queries = [ "SHOW neon.timeline_id", "SHOW neon.tenant_id", @@ -56,7 +59,7 @@ def test_hot_standby(neon_simple_env: NeonEnv): res = s_cur.fetchone() assert res is not None - while not cought_up: + while not caught_up: with s_con.cursor() as secondary_cursor: secondary_cursor.execute("SELECT pg_last_wal_replay_lsn()") res = secondary_cursor.fetchone() @@ -66,7 +69,7 @@ def test_hot_standby(neon_simple_env: NeonEnv): # due to e.g. autovacuum, but that shouldn't impact the content # of the tables, so we check whether we've replayed up to at # least after the commit of the `test` table. - cought_up = secondary_lsn >= primary_lsn + caught_up = secondary_lsn >= primary_lsn # Explicit commit to flush any transient transaction-level state. s_con.commit() diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index a0f9f854ed..25c6634108 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -16,6 +16,7 @@ from fixtures.pg_version import PgVersion, xfail_on_postgres from fixtures.types import Lsn, TenantId, TimelineId +@pytest.mark.xfail def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path): env = neon_simple_env (tenant_id, _) = env.neon_cli.create_tenant() @@ -44,12 +45,16 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path): # we've disabled the autovacuum and checkpoint # so background processes should not change the size. # If this test will flake we should probably loosen the check - assert size == initial_size, "starting idle compute should not change the tenant size" + assert ( + size == initial_size + ), f"starting idle compute should not change the tenant size (Currently {size}, expected {initial_size})" # the size should be the same, until we increase the size over the # gc_horizon size, inputs = http_client.tenant_size_and_modelinputs(tenant_id) - assert size == initial_size, "tenant_size should not be affected by shutdown of compute" + assert ( + size == initial_size + ), f"tenant_size should not be affected by shutdown of compute (Currently {size}, expected {initial_size})" expected_inputs = { "segments": [ @@ -318,6 +323,7 @@ def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Pa size_debug_file.write(size_debug) +@pytest.mark.xfail def test_single_branch_get_tenant_size_grows( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, pg_version: PgVersion ): @@ -333,13 +339,13 @@ def test_single_branch_get_tenant_size_grows( # inserts is larger than gc_horizon. 
for example 0x20000 here hid the fact # that there next_gc_cutoff could be smaller than initdb_lsn, which will # obviously lead to issues when calculating the size. - gc_horizon = 0x38000 + gc_horizon = 0x3BA00 # it's a bit of a hack, but different versions of postgres have different # amount of WAL generated for the same amount of data. so we need to # adjust the gc_horizon accordingly. if pg_version == PgVersion.V14: - gc_horizon = 0x40000 + gc_horizon = 0x4A000 neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}" @@ -360,11 +366,11 @@ def test_single_branch_get_tenant_size_grows( if current_lsn - initdb_lsn >= gc_horizon: assert ( size >= prev_size - ), "tenant_size may grow or not grow, because we only add gc_horizon amount of WAL to initial snapshot size" + ), f"tenant_size may grow or not grow, because we only add gc_horizon amount of WAL to initial snapshot size (Currently at: {current_lsn}, Init at: {initdb_lsn})" else: assert ( size > prev_size - ), "tenant_size should grow, because we continue to add WAL to initial snapshot size" + ), f"tenant_size should grow, because we continue to add WAL to initial snapshot size (Currently at: {current_lsn}, Init at: {initdb_lsn})" def get_current_consistent_size( env: NeonEnv,