mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-22 04:42:56 +00:00
Compare commits
8 Commits
vk/compile
...
access_sta
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
83294d771b | ||
|
|
b3ef6c7bf5 | ||
|
|
f88ff9f3c6 | ||
|
|
593c4244fd | ||
|
|
d7aa36c4c0 | ||
|
|
df127ef209 | ||
|
|
72a73d2c82 | ||
|
|
172239c7ee |
9
.github/workflows/benchmarking.yml
vendored
9
.github/workflows/benchmarking.yml
vendored
@@ -180,8 +180,7 @@ jobs:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
|
||||
# Increase timeout to 8h, default timeout is 6h
|
||||
timeout-minutes: 480
|
||||
timeout-minutes: 360 # 6h
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
@@ -322,6 +321,8 @@ jobs:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
|
||||
timeout-minutes: 360 # 6h
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
@@ -413,6 +414,8 @@ jobs:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
|
||||
timeout-minutes: 360 # 6h
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
@@ -498,6 +501,8 @@ jobs:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
|
||||
timeout-minutes: 360 # 6h
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
|
||||
6
.github/workflows/build_and_test.yml
vendored
6
.github/workflows/build_and_test.yml
vendored
@@ -659,7 +659,6 @@ jobs:
|
||||
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
|
||||
--context .
|
||||
--build-arg GIT_VERSION=${{ github.sha }}
|
||||
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
--dockerfile Dockerfile.compute-tools
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
|
||||
@@ -717,7 +716,6 @@ jobs:
|
||||
--context .
|
||||
--build-arg GIT_VERSION=${{ github.sha }}
|
||||
--build-arg PG_VERSION=${{ matrix.version }}
|
||||
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
--dockerfile Dockerfile.compute-node
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
@@ -916,7 +914,7 @@ jobs:
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Create git tag
|
||||
- name: Create tag "release-${{ needs.tag.outputs.build-tag }}"
|
||||
if: github.ref_name == 'release'
|
||||
uses: actions/github-script@v6
|
||||
with:
|
||||
@@ -926,7 +924,7 @@ jobs:
|
||||
github.rest.git.createRef({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
ref: "refs/tags/${{ needs.tag.outputs.build-tag }}",
|
||||
ref: "refs/tags/release-${{ needs.tag.outputs.build-tag }}",
|
||||
sha: context.sha,
|
||||
})
|
||||
|
||||
|
||||
8
Cargo.lock
generated
8
Cargo.lock
generated
@@ -2349,9 +2349,9 @@ checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"
|
||||
|
||||
[[package]]
|
||||
name = "openssl"
|
||||
version = "0.10.55"
|
||||
version = "0.10.52"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "345df152bc43501c5eb9e4654ff05f794effb78d4efe3d53abc158baddc0703d"
|
||||
checksum = "01b8574602df80f7b85fdfc5392fa884a4e3b3f4f35402c070ab34c3d3f78d56"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"cfg-if",
|
||||
@@ -2381,9 +2381,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
|
||||
|
||||
[[package]]
|
||||
name = "openssl-sys"
|
||||
version = "0.9.90"
|
||||
version = "0.9.87"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "374533b0e45f3a7ced10fcaeccca020e66656bc03dac384f852e4e5a7a8104a6"
|
||||
checksum = "8e17f59264b2809d77ae94f0e1ebabc434773f370d6ca667bd223ea10e06cc7e"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
|
||||
@@ -2,7 +2,6 @@ ARG PG_VERSION
|
||||
ARG REPOSITORY=neondatabase
|
||||
ARG IMAGE=rust
|
||||
ARG TAG=pinned
|
||||
ARG BUILD_TAG
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -10,10 +9,10 @@ ARG BUILD_TAG
|
||||
#
|
||||
#########################################################################################
|
||||
FROM debian:bullseye-slim AS build-deps
|
||||
RUN apt update && \
|
||||
RUN apt update && \
|
||||
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
|
||||
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
|
||||
libicu-dev libxslt1-dev liblz4-dev libzstd-dev python3 python3-dev python3-pip libpython3-dev
|
||||
libicu-dev libxslt1-dev liblz4-dev libzstd-dev
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -26,7 +25,7 @@ ARG PG_VERSION
|
||||
COPY vendor/postgres-${PG_VERSION} postgres
|
||||
RUN cd postgres && \
|
||||
export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp \
|
||||
--with-icu --with-libxml --with-libxslt --with-lz4 --with-python" && \
|
||||
--with-icu --with-libxml --with-libxslt --with-lz4" && \
|
||||
if [ "${PG_VERSION}" != "v14" ]; then \
|
||||
# zstd is available only from PG15
|
||||
export CONFIGURE_CMD="${CONFIGURE_CMD} --with-zstd"; \
|
||||
@@ -41,13 +40,9 @@ RUN cd postgres && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/hstore_plpython3u.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/jsonb_plpython3u.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/ltree_plpythonu.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/moddatetime.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpython3u.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_stat_statements.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
|
||||
@@ -485,23 +480,6 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_1.tar.
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/rdkit.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-uuidv7-pg-build"
|
||||
# compile pg_uuidv7 extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-uuidv7-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \
|
||||
echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xvzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_uuidv7.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "rust extensions"
|
||||
@@ -635,7 +613,6 @@ COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY pgxn/ pgxn/
|
||||
|
||||
RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
@@ -657,9 +634,6 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
#
|
||||
#########################################################################################
|
||||
FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools
|
||||
ARG BUILD_TAG
|
||||
ENV BUILD_TAG=$BUILD_TAG
|
||||
|
||||
USER nonroot
|
||||
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
|
||||
COPY --chown=nonroot . .
|
||||
@@ -713,7 +687,6 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
|
||||
# libxml2, libxslt1.1 for xml2
|
||||
# libzstd1 for zstd
|
||||
# libboost*, libfreetype6, and zlib1g for rdkit
|
||||
# python3* for plpython3u
|
||||
RUN apt update && \
|
||||
apt install --no-install-recommends -y \
|
||||
gdb \
|
||||
@@ -737,8 +710,6 @@ RUN apt update && \
|
||||
libcurl4-openssl-dev \
|
||||
locales \
|
||||
procps \
|
||||
python3 \
|
||||
python3-pip \
|
||||
zlib1g && \
|
||||
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
|
||||
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
ARG REPOSITORY=neondatabase
|
||||
ARG IMAGE=rust
|
||||
ARG TAG=pinned
|
||||
ARG BUILD_TAG
|
||||
|
||||
FROM $REPOSITORY/$IMAGE:$TAG AS rust-build
|
||||
WORKDIR /home/nonroot
|
||||
@@ -17,8 +16,6 @@ ENV CACHEPOT_S3_KEY_PREFIX=cachepot
|
||||
ARG CACHEPOT_BUCKET=neon-github-dev
|
||||
#ARG AWS_ACCESS_KEY_ID
|
||||
#ARG AWS_SECRET_ACCESS_KEY
|
||||
ARG BUILD_TAG
|
||||
ENV BUILD_TAG=$BUILD_TAG
|
||||
|
||||
COPY . .
|
||||
|
||||
|
||||
@@ -54,15 +54,9 @@ use compute_tools::monitor::launch_monitor;
|
||||
use compute_tools::params::*;
|
||||
use compute_tools::spec::*;
|
||||
|
||||
const BUILD_TAG_DEFAULT: &str = "local";
|
||||
|
||||
fn main() -> Result<()> {
|
||||
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
|
||||
|
||||
let build_tag = option_env!("BUILD_TAG").unwrap_or(BUILD_TAG_DEFAULT);
|
||||
|
||||
info!("build_tag: {build_tag}");
|
||||
|
||||
let matches = cli().get_matches();
|
||||
|
||||
let http_port = *matches
|
||||
|
||||
@@ -133,84 +133,6 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
|
||||
}
|
||||
}
|
||||
|
||||
/// Create special neon_superuser role, that's a slightly nerfed version of a real superuser
|
||||
/// that we give to customers
|
||||
fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
let roles = spec
|
||||
.cluster
|
||||
.roles
|
||||
.iter()
|
||||
.map(|r| format!("'{}'", escape_literal(&r.name)))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let dbs = spec
|
||||
.cluster
|
||||
.databases
|
||||
.iter()
|
||||
.map(|db| format!("'{}'", escape_literal(&db.name)))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let roles_decl = if roles.is_empty() {
|
||||
String::from("roles text[] := NULL;")
|
||||
} else {
|
||||
format!(
|
||||
r#"
|
||||
roles text[] := ARRAY(SELECT rolname
|
||||
FROM pg_catalog.pg_roles
|
||||
WHERE rolname IN ({}));"#,
|
||||
roles.join(", ")
|
||||
)
|
||||
};
|
||||
|
||||
let database_decl = if dbs.is_empty() {
|
||||
String::from("dbs text[] := NULL;")
|
||||
} else {
|
||||
format!(
|
||||
r#"
|
||||
dbs text[] := ARRAY(SELECT datname
|
||||
FROM pg_catalog.pg_database
|
||||
WHERE datname IN ({}));"#,
|
||||
dbs.join(", ")
|
||||
)
|
||||
};
|
||||
|
||||
// ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on all databases
|
||||
// (see https://www.postgresql.org/docs/current/ddl-priv.html)
|
||||
let query = format!(
|
||||
r#"
|
||||
DO $$
|
||||
DECLARE
|
||||
r text;
|
||||
{}
|
||||
{}
|
||||
BEGIN
|
||||
IF NOT EXISTS (
|
||||
SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
|
||||
THEN
|
||||
CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN IN ROLE pg_read_all_data, pg_write_all_data;
|
||||
IF array_length(roles, 1) IS NOT NULL THEN
|
||||
EXECUTE format('GRANT neon_superuser TO %s',
|
||||
array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', '));
|
||||
FOREACH r IN ARRAY roles LOOP
|
||||
EXECUTE format('ALTER ROLE %s CREATEROLE CREATEDB', quote_ident(r));
|
||||
END LOOP;
|
||||
END IF;
|
||||
IF array_length(dbs, 1) IS NOT NULL THEN
|
||||
EXECUTE format('GRANT ALL PRIVILEGES ON DATABASE %s TO neon_superuser',
|
||||
array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(dbs) as x), ', '));
|
||||
END IF;
|
||||
END IF;
|
||||
END
|
||||
$$;"#,
|
||||
roles_decl, database_decl,
|
||||
);
|
||||
info!("Neon superuser created:\n{}", &query);
|
||||
client
|
||||
.simple_query(&query)
|
||||
.map_err(|e| anyhow::anyhow!(e).context(query))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl ComputeNode {
|
||||
pub fn set_status(&self, status: ComputeStatus) {
|
||||
let mut state = self.state.lock().unwrap();
|
||||
@@ -425,8 +347,6 @@ impl ComputeNode {
|
||||
.map_err(|_| anyhow::anyhow!("invalid connstr"))?;
|
||||
|
||||
let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?;
|
||||
// Disable forwarding so that users don't get a cloud_admin role
|
||||
client.simple_query("SET neon.forward_ddl = false")?;
|
||||
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
|
||||
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
|
||||
drop(client);
|
||||
@@ -437,16 +357,14 @@ impl ComputeNode {
|
||||
Ok(client) => client,
|
||||
};
|
||||
|
||||
// Proceed with post-startup configuration. Note, that order of operations is important.
|
||||
// Disable DDL forwarding because control plane already knows about these roles/databases.
|
||||
client.simple_query("SET neon.forward_ddl = false")?;
|
||||
|
||||
// Proceed with post-startup configuration. Note, that order of operations is important.
|
||||
let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
|
||||
create_neon_superuser(spec, &mut client)?;
|
||||
handle_roles(spec, &mut client)?;
|
||||
handle_databases(spec, &mut client)?;
|
||||
handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
|
||||
handle_grants(spec, self.connstr.as_str())?;
|
||||
handle_grants(spec, self.connstr.as_str(), &mut client)?;
|
||||
handle_extensions(spec, &mut client)?;
|
||||
|
||||
// 'Close' connection
|
||||
@@ -484,7 +402,7 @@ impl ComputeNode {
|
||||
handle_roles(&spec, &mut client)?;
|
||||
handle_databases(&spec, &mut client)?;
|
||||
handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
|
||||
handle_grants(&spec, self.connstr.as_str())?;
|
||||
handle_grants(&spec, self.connstr.as_str(), &mut client)?;
|
||||
handle_extensions(&spec, &mut client)?;
|
||||
}
|
||||
|
||||
|
||||
@@ -17,7 +17,7 @@ use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
|
||||
const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds
|
||||
|
||||
/// Escape a string for including it in a SQL literal
|
||||
pub fn escape_literal(s: &str) -> String {
|
||||
fn escape_literal(s: &str) -> String {
|
||||
s.replace('\'', "''").replace('\\', "\\\\")
|
||||
}
|
||||
|
||||
|
||||
@@ -269,13 +269,17 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
xact.execute(query.as_str(), &[])?;
|
||||
}
|
||||
RoleAction::Create => {
|
||||
let mut query: String = format!(
|
||||
"CREATE ROLE {} CREATEROLE CREATEDB IN ROLE neon_superuser",
|
||||
name.pg_quote()
|
||||
);
|
||||
let mut query: String = format!("CREATE ROLE {} ", name.pg_quote());
|
||||
info!("role create query: '{}'", &query);
|
||||
query.push_str(&role.to_pg_options());
|
||||
xact.execute(query.as_str(), &[])?;
|
||||
|
||||
let grant_query = format!(
|
||||
"GRANT pg_read_all_data, pg_write_all_data TO {}",
|
||||
name.pg_quote()
|
||||
);
|
||||
xact.execute(grant_query.as_str(), &[])?;
|
||||
info!("role grant query: '{}'", &grant_query);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -472,11 +476,6 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
query.push_str(&db.to_pg_options());
|
||||
let _guard = info_span!("executing", query).entered();
|
||||
client.execute(query.as_str(), &[])?;
|
||||
let grant_query: String = format!(
|
||||
"GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser",
|
||||
name.pg_quote()
|
||||
);
|
||||
client.execute(grant_query.as_str(), &[])?;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -496,9 +495,35 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
/// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
|
||||
/// to allow users creating trusted extensions and re-creating `public` schema, for example.
|
||||
#[instrument(skip_all)]
|
||||
pub fn handle_grants(spec: &ComputeSpec, connstr: &str) -> Result<()> {
|
||||
pub fn handle_grants(spec: &ComputeSpec, connstr: &str, client: &mut Client) -> Result<()> {
|
||||
info!("cluster spec grants:");
|
||||
|
||||
// We now have a separate `web_access` role to connect to the database
|
||||
// via the web interface and proxy link auth. And also we grant a
|
||||
// read / write all data privilege to every role. So also grant
|
||||
// create to everyone.
|
||||
// XXX: later we should stop messing with Postgres ACL in such horrible
|
||||
// ways.
|
||||
let roles = spec
|
||||
.cluster
|
||||
.roles
|
||||
.iter()
|
||||
.map(|r| r.name.pg_quote())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
for db in &spec.cluster.databases {
|
||||
let dbname = &db.name;
|
||||
|
||||
let query: String = format!(
|
||||
"GRANT CREATE ON DATABASE {} TO {}",
|
||||
dbname.pg_quote(),
|
||||
roles.join(", ")
|
||||
);
|
||||
info!("grant query {}", &query);
|
||||
|
||||
client.execute(query.as_str(), &[])?;
|
||||
}
|
||||
|
||||
// Do some per-database access adjustments. We'd better do this at db creation time,
|
||||
// but CREATE DATABASE isn't transactional. So we cannot create db + do some grants
|
||||
// atomically.
|
||||
|
||||
@@ -67,7 +67,6 @@ pub struct EndpointConf {
|
||||
pg_port: u16,
|
||||
http_port: u16,
|
||||
pg_version: u32,
|
||||
skip_pg_catalog_updates: bool,
|
||||
}
|
||||
|
||||
//
|
||||
@@ -136,7 +135,6 @@ impl ComputeControlPlane {
|
||||
mode,
|
||||
tenant_id,
|
||||
pg_version,
|
||||
skip_pg_catalog_updates: false,
|
||||
});
|
||||
|
||||
ep.create_endpoint_dir()?;
|
||||
@@ -150,7 +148,6 @@ impl ComputeControlPlane {
|
||||
http_port,
|
||||
pg_port,
|
||||
pg_version,
|
||||
skip_pg_catalog_updates: false,
|
||||
})?,
|
||||
)?;
|
||||
std::fs::write(
|
||||
@@ -186,9 +183,6 @@ pub struct Endpoint {
|
||||
// the endpoint runs in.
|
||||
pub env: LocalEnv,
|
||||
pageserver: Arc<PageServerNode>,
|
||||
|
||||
// Optimizations
|
||||
skip_pg_catalog_updates: bool,
|
||||
}
|
||||
|
||||
impl Endpoint {
|
||||
@@ -222,7 +216,6 @@ impl Endpoint {
|
||||
mode: conf.mode,
|
||||
tenant_id: conf.tenant_id,
|
||||
pg_version: conf.pg_version,
|
||||
skip_pg_catalog_updates: conf.skip_pg_catalog_updates,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -457,7 +450,7 @@ impl Endpoint {
|
||||
|
||||
// Create spec file
|
||||
let spec = ComputeSpec {
|
||||
skip_pg_catalog_updates: self.skip_pg_catalog_updates,
|
||||
skip_pg_catalog_updates: false,
|
||||
format_version: 1.0,
|
||||
operation_uuid: None,
|
||||
cluster: Cluster {
|
||||
|
||||
@@ -23,7 +23,6 @@ use prometheus::{Registry, Result};
|
||||
pub mod launch_timestamp;
|
||||
mod wrappers;
|
||||
pub use wrappers::{CountedReader, CountedWriter};
|
||||
pub mod metric_vec_duration;
|
||||
|
||||
pub type UIntGauge = GenericGauge<AtomicU64>;
|
||||
pub type UIntGaugeVec = GenericGaugeVec<AtomicU64>;
|
||||
|
||||
@@ -1,23 +0,0 @@
|
||||
//! Helpers for observing duration on HistogramVec / CounterVec / GaugeVec / MetricVec<T>.
|
||||
|
||||
use std::{future::Future, time::Instant};
|
||||
|
||||
pub trait DurationResultObserver {
|
||||
fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration);
|
||||
}
|
||||
|
||||
pub async fn observe_async_block_duration_by_result<
|
||||
T,
|
||||
E,
|
||||
F: Future<Output = Result<T, E>>,
|
||||
O: DurationResultObserver,
|
||||
>(
|
||||
observer: &O,
|
||||
block: F,
|
||||
) -> Result<T, E> {
|
||||
let start = Instant::now();
|
||||
let result = block.await;
|
||||
let duration = start.elapsed();
|
||||
observer.observe_result(&result, duration);
|
||||
result
|
||||
}
|
||||
@@ -70,14 +70,6 @@ impl RemotePath {
|
||||
pub fn join(&self, segment: &Path) -> Self {
|
||||
Self(self.0.join(segment))
|
||||
}
|
||||
|
||||
pub fn get_path(&self) -> &PathBuf {
|
||||
&self.0
|
||||
}
|
||||
|
||||
pub fn extension(&self) -> Option<&str> {
|
||||
self.0.extension()?.to_str()
|
||||
}
|
||||
}
|
||||
|
||||
/// Storage (potentially remote) API to manage its state.
|
||||
@@ -94,19 +86,6 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
prefix: Option<&RemotePath>,
|
||||
) -> Result<Vec<RemotePath>, DownloadError>;
|
||||
|
||||
/// Lists all files in directory "recursively"
|
||||
/// (not really recursively, because AWS has a flat namespace)
|
||||
/// Note: This is subtely different than list_prefixes,
|
||||
/// because it is for listing files instead of listing
|
||||
/// names sharing common prefixes.
|
||||
/// For example,
|
||||
/// list_files("foo/bar") = ["foo/bar/cat123.txt",
|
||||
/// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"]
|
||||
/// whereas,
|
||||
/// list_prefixes("foo/bar/") = ["cat", "dog"]
|
||||
/// See `test_real_s3.rs` for more details.
|
||||
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>>;
|
||||
|
||||
/// Streams the local file contents into remote into the remote storage entry.
|
||||
async fn upload(
|
||||
&self,
|
||||
@@ -195,14 +174,6 @@ impl GenericRemoteStorage {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.list_files(folder).await,
|
||||
Self::AwsS3(s) => s.list_files(folder).await,
|
||||
Self::Unreliable(s) => s.list_files(folder).await,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn upload(
|
||||
&self,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
|
||||
@@ -48,14 +48,6 @@ impl LocalFs {
|
||||
Ok(Self { storage_root })
|
||||
}
|
||||
|
||||
// mirrors S3Bucket::s3_object_to_relative_path
|
||||
fn local_file_to_relative_path(&self, key: PathBuf) -> RemotePath {
|
||||
let relative_path = key
|
||||
.strip_prefix(&self.storage_root)
|
||||
.expect("relative path must contain storage_root as prefix");
|
||||
RemotePath(relative_path.into())
|
||||
}
|
||||
|
||||
async fn read_storage_metadata(
|
||||
&self,
|
||||
file_path: &Path,
|
||||
@@ -140,34 +132,6 @@ impl RemoteStorage for LocalFs {
|
||||
Ok(prefixes)
|
||||
}
|
||||
|
||||
// recursively lists all files in a directory,
|
||||
// mirroring the `list_files` for `s3_bucket`
|
||||
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
let full_path = match folder {
|
||||
Some(folder) => folder.with_base(&self.storage_root),
|
||||
None => self.storage_root.clone(),
|
||||
};
|
||||
let mut files = vec![];
|
||||
let mut directory_queue = vec![full_path.clone()];
|
||||
|
||||
while !directory_queue.is_empty() {
|
||||
let cur_folder = directory_queue
|
||||
.pop()
|
||||
.expect("queue cannot be empty: we just checked");
|
||||
let mut entries = fs::read_dir(cur_folder.clone()).await?;
|
||||
while let Some(entry) = entries.next_entry().await? {
|
||||
let file_name: PathBuf = entry.file_name().into();
|
||||
let full_file_name = cur_folder.clone().join(&file_name);
|
||||
let file_remote_path = self.local_file_to_relative_path(full_file_name.clone());
|
||||
files.push(file_remote_path.clone());
|
||||
if full_file_name.is_dir() {
|
||||
directory_queue.push(full_file_name);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(files)
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
data: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
|
||||
@@ -347,51 +347,6 @@ impl RemoteStorage for S3Bucket {
|
||||
Ok(document_keys)
|
||||
}
|
||||
|
||||
/// See the doc for `RemoteStorage::list_files`
|
||||
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
let folder_name = folder
|
||||
.map(|p| self.relative_path_to_s3_object(p))
|
||||
.or_else(|| self.prefix_in_bucket.clone());
|
||||
|
||||
// AWS may need to break the response into several parts
|
||||
let mut continuation_token = None;
|
||||
let mut all_files = vec![];
|
||||
loop {
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.context("Concurrency limiter semaphore got closed during S3 list_files")?;
|
||||
metrics::inc_list_objects();
|
||||
|
||||
let response = self
|
||||
.client
|
||||
.list_objects_v2()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.set_prefix(folder_name.clone())
|
||||
.set_continuation_token(continuation_token)
|
||||
.set_max_keys(self.max_keys_per_list_response)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
metrics::inc_list_objects_fail();
|
||||
e
|
||||
})
|
||||
.context("Failed to list files in S3 bucket")?;
|
||||
|
||||
for object in response.contents().unwrap_or_default() {
|
||||
let object_path = object.key().expect("response does not contain a key");
|
||||
let remote_path = self.s3_object_to_relative_path(object_path);
|
||||
all_files.push(remote_path);
|
||||
}
|
||||
match response.next_continuation_token {
|
||||
Some(new_token) => continuation_token = Some(new_token),
|
||||
None => break,
|
||||
}
|
||||
}
|
||||
Ok(all_files)
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
|
||||
@@ -83,11 +83,6 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
self.inner.list_prefixes(prefix).await
|
||||
}
|
||||
|
||||
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
self.attempt(RemoteOp::ListPrefixes(folder.cloned()))?;
|
||||
self.inner.list_files(folder).await
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
|
||||
@@ -88,58 +88,6 @@ async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3WithTestBlobs) -> any
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Tests that S3 client can list all files in a folder, even if the response comes paginated and requirees multiple S3 queries.
|
||||
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. Test will skip real code and pass if env vars not set.
|
||||
/// See `s3_pagination_should_work` for more information.
|
||||
///
|
||||
/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_s3_data`]
|
||||
/// Then performs the following queries:
|
||||
/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
|
||||
/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
|
||||
#[test_context(MaybeEnabledS3WithSimpleTestBlobs)]
|
||||
#[tokio::test]
|
||||
async fn s3_list_files_works(ctx: &mut MaybeEnabledS3WithSimpleTestBlobs) -> anyhow::Result<()> {
|
||||
let ctx = match ctx {
|
||||
MaybeEnabledS3WithSimpleTestBlobs::Enabled(ctx) => ctx,
|
||||
MaybeEnabledS3WithSimpleTestBlobs::Disabled => return Ok(()),
|
||||
MaybeEnabledS3WithSimpleTestBlobs::UploadsFailed(e, _) => {
|
||||
anyhow::bail!("S3 init failed: {e:?}")
|
||||
}
|
||||
};
|
||||
let test_client = Arc::clone(&ctx.enabled.client);
|
||||
let base_prefix =
|
||||
RemotePath::new(Path::new("folder1")).context("common_prefix construction")?;
|
||||
let root_files = test_client
|
||||
.list_files(None)
|
||||
.await
|
||||
.context("client list root files failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
assert_eq!(
|
||||
root_files,
|
||||
ctx.remote_blobs.clone(),
|
||||
"remote storage list_files on root mismatches with the uploads."
|
||||
);
|
||||
let nested_remote_files = test_client
|
||||
.list_files(Some(&base_prefix))
|
||||
.await
|
||||
.context("client list nested files failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
let trim_remote_blobs: HashSet<_> = ctx
|
||||
.remote_blobs
|
||||
.iter()
|
||||
.map(|x| x.get_path().to_str().expect("must be valid name"))
|
||||
.filter(|x| x.starts_with("folder1"))
|
||||
.map(|x| RemotePath::new(Path::new(x)).expect("must be valid name"))
|
||||
.collect();
|
||||
assert_eq!(
|
||||
nested_remote_files, trim_remote_blobs,
|
||||
"remote storage list_files on subdirrectory mismatches with the uploads."
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test_context(MaybeEnabledS3)]
|
||||
#[tokio::test]
|
||||
async fn s3_delete_non_exising_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
|
||||
@@ -300,66 +248,6 @@ impl AsyncTestContext for MaybeEnabledS3WithTestBlobs {
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE: the setups for the list_prefixes test and the list_files test are very similar
|
||||
// However, they are not idential. The list_prefixes function is concerned with listing prefixes,
|
||||
// whereas the list_files function is concerned with listing files.
|
||||
// See `RemoteStorage::list_files` documentation for more details
|
||||
enum MaybeEnabledS3WithSimpleTestBlobs {
|
||||
Enabled(S3WithSimpleTestBlobs),
|
||||
Disabled,
|
||||
UploadsFailed(anyhow::Error, S3WithSimpleTestBlobs),
|
||||
}
|
||||
struct S3WithSimpleTestBlobs {
|
||||
enabled: EnabledS3,
|
||||
remote_blobs: HashSet<RemotePath>,
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs {
|
||||
async fn setup() -> Self {
|
||||
ensure_logging_ready();
|
||||
if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
|
||||
info!(
|
||||
"`{}` env variable is not set, skipping the test",
|
||||
ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME
|
||||
);
|
||||
return Self::Disabled;
|
||||
}
|
||||
|
||||
let max_keys_in_list_response = 10;
|
||||
let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());
|
||||
|
||||
let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await;
|
||||
|
||||
match upload_simple_s3_data(&enabled.client, upload_tasks_count).await {
|
||||
ControlFlow::Continue(uploads) => {
|
||||
info!("Remote objects created successfully");
|
||||
|
||||
Self::Enabled(S3WithSimpleTestBlobs {
|
||||
enabled,
|
||||
remote_blobs: uploads,
|
||||
})
|
||||
}
|
||||
ControlFlow::Break(uploads) => Self::UploadsFailed(
|
||||
anyhow::anyhow!("One or multiple blobs failed to upload to S3"),
|
||||
S3WithSimpleTestBlobs {
|
||||
enabled,
|
||||
remote_blobs: uploads,
|
||||
},
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
async fn teardown(self) {
|
||||
match self {
|
||||
Self::Disabled => {}
|
||||
Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => {
|
||||
cleanup(&ctx.enabled.client, ctx.remote_blobs).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn create_s3_client(
|
||||
max_keys_per_list_response: Option<i32>,
|
||||
) -> anyhow::Result<Arc<GenericRemoteStorage>> {
|
||||
@@ -370,7 +258,7 @@ fn create_s3_client(
|
||||
let random_prefix_part = std::time::SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.context("random s3 test prefix part calculation")?
|
||||
.as_nanos();
|
||||
.as_millis();
|
||||
let remote_storage_config = RemoteStorageConfig {
|
||||
max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
|
||||
max_sync_errors: NonZeroU32::new(5).unwrap(),
|
||||
@@ -476,52 +364,3 @@ async fn cleanup(client: &Arc<GenericRemoteStorage>, objects_to_delete: HashSet<
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Uploads files `folder{j}/blob{i}.txt`. See test description for more details.
|
||||
async fn upload_simple_s3_data(
|
||||
client: &Arc<GenericRemoteStorage>,
|
||||
upload_tasks_count: usize,
|
||||
) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
|
||||
info!("Creating {upload_tasks_count} S3 files");
|
||||
let mut upload_tasks = JoinSet::new();
|
||||
for i in 1..upload_tasks_count + 1 {
|
||||
let task_client = Arc::clone(client);
|
||||
upload_tasks.spawn(async move {
|
||||
let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
|
||||
let blob_path = RemotePath::new(&blob_path)
|
||||
.with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
|
||||
debug!("Creating remote item {i} at path {blob_path:?}");
|
||||
|
||||
let data = format!("remote blob data {i}").into_bytes();
|
||||
let data_len = data.len();
|
||||
task_client
|
||||
.upload(std::io::Cursor::new(data), data_len, &blob_path, None)
|
||||
.await?;
|
||||
|
||||
Ok::<_, anyhow::Error>(blob_path)
|
||||
});
|
||||
}
|
||||
|
||||
let mut upload_tasks_failed = false;
|
||||
let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
|
||||
while let Some(task_run_result) = upload_tasks.join_next().await {
|
||||
match task_run_result
|
||||
.context("task join failed")
|
||||
.and_then(|task_result| task_result.context("upload task failed"))
|
||||
{
|
||||
Ok(upload_path) => {
|
||||
uploaded_blobs.insert(upload_path);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Upload task failed: {e:?}");
|
||||
upload_tasks_failed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if upload_tasks_failed {
|
||||
ControlFlow::Break(uploaded_blobs)
|
||||
} else {
|
||||
ControlFlow::Continue(uploaded_blobs)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use hyper::{header, Body, Response, StatusCode};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::error::Error as StdError;
|
||||
use thiserror::Error;
|
||||
use tracing::error;
|
||||
|
||||
@@ -16,7 +15,7 @@ pub enum ApiError {
|
||||
Unauthorized(String),
|
||||
|
||||
#[error("NotFound: {0}")]
|
||||
NotFound(Box<dyn StdError + Send + Sync + 'static>),
|
||||
NotFound(anyhow::Error),
|
||||
|
||||
#[error("Conflict: {0}")]
|
||||
Conflict(String),
|
||||
|
||||
@@ -495,50 +495,50 @@ fn start_pageserver(
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
|
||||
let background_jobs_barrier = background_jobs_barrier;
|
||||
let metrics_ctx = RequestContext::todo_child(
|
||||
TaskKind::MetricsCollection,
|
||||
// This task itself shouldn't download anything.
|
||||
// The actual size calculation does need downloads, and
|
||||
// creates a child context with the right DownloadBehavior.
|
||||
DownloadBehavior::Error,
|
||||
);
|
||||
task_mgr::spawn(
|
||||
crate::BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::MetricsCollection,
|
||||
None,
|
||||
None,
|
||||
"consumption metrics collection",
|
||||
true,
|
||||
async move {
|
||||
// first wait until background jobs are cleared to launch.
|
||||
//
|
||||
// this is because we only process active tenants and timelines, and the
|
||||
// Timeline::get_current_logical_size will spawn the logical size calculation,
|
||||
// which will not be rate-limited.
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
|
||||
let background_jobs_barrier = background_jobs_barrier;
|
||||
let metrics_ctx = RequestContext::todo_child(
|
||||
TaskKind::MetricsCollection,
|
||||
// This task itself shouldn't download anything.
|
||||
// The actual size calculation does need downloads, and
|
||||
// creates a child context with the right DownloadBehavior.
|
||||
DownloadBehavior::Error,
|
||||
);
|
||||
task_mgr::spawn(
|
||||
MGMT_REQUEST_RUNTIME.handle(),
|
||||
TaskKind::MetricsCollection,
|
||||
None,
|
||||
None,
|
||||
"consumption metrics collection",
|
||||
true,
|
||||
async move {
|
||||
// first wait until background jobs are cleared to launch.
|
||||
//
|
||||
// this is because we only process active tenants and timelines, and the
|
||||
// Timeline::get_current_logical_size will spawn the logical size calculation,
|
||||
// which will not be rate-limited.
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return Ok(()); },
|
||||
_ = background_jobs_barrier.wait() => {}
|
||||
};
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return Ok(()); },
|
||||
_ = background_jobs_barrier.wait() => {}
|
||||
};
|
||||
|
||||
pageserver::consumption_metrics::collect_metrics(
|
||||
metric_collection_endpoint,
|
||||
conf.metric_collection_interval,
|
||||
conf.cached_metric_collection_interval,
|
||||
conf.synthetic_size_calculation_interval,
|
||||
conf.id,
|
||||
metrics_ctx,
|
||||
)
|
||||
.instrument(info_span!("metrics_collection"))
|
||||
.await?;
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
pageserver::consumption_metrics::collect_metrics(
|
||||
metric_collection_endpoint,
|
||||
conf.metric_collection_interval,
|
||||
conf.cached_metric_collection_interval,
|
||||
conf.synthetic_size_calculation_interval,
|
||||
conf.id,
|
||||
metrics_ctx,
|
||||
)
|
||||
.instrument(info_span!("metrics_collection"))
|
||||
.await?;
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Spawn a task to listen for libpq connections. It will spawn further tasks
|
||||
|
||||
@@ -96,12 +96,12 @@ pub mod defaults {
|
||||
|
||||
#background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}'
|
||||
|
||||
[tenant_config]
|
||||
# [tenant_config]
|
||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
|
||||
#compaction_target_size = {DEFAULT_COMPACTION_TARGET_SIZE} # in bytes
|
||||
#compaction_period = '{DEFAULT_COMPACTION_PERIOD}'
|
||||
#compaction_threshold = {DEFAULT_COMPACTION_THRESHOLD}
|
||||
#compaction_threshold = '{DEFAULT_COMPACTION_THRESHOLD}'
|
||||
|
||||
#gc_period = '{DEFAULT_GC_PERIOD}'
|
||||
#gc_horizon = {DEFAULT_GC_HORIZON}
|
||||
@@ -111,8 +111,7 @@ pub mod defaults {
|
||||
#min_resident_size_override = .. # in bytes
|
||||
#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
|
||||
#gc_feedback = false
|
||||
|
||||
[remote_storage]
|
||||
# [remote_storage]
|
||||
|
||||
"###
|
||||
);
|
||||
|
||||
@@ -142,7 +142,7 @@ impl From<TenantMapInsertError> for ApiError {
|
||||
impl From<TenantStateError> for ApiError {
|
||||
fn from(tse: TenantStateError) -> ApiError {
|
||||
match tse {
|
||||
TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
|
||||
TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)),
|
||||
_ => ApiError::InternalServerError(anyhow::Error::new(tse)),
|
||||
}
|
||||
}
|
||||
@@ -151,7 +151,7 @@ impl From<TenantStateError> for ApiError {
|
||||
impl From<GetTenantError> for ApiError {
|
||||
fn from(tse: GetTenantError) -> ApiError {
|
||||
match tse {
|
||||
GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
|
||||
GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)),
|
||||
e @ GetTenantError::NotActive(_) => {
|
||||
// Why is this not `ApiError::NotFound`?
|
||||
// Because we must be careful to never return 404 for a tenant if it does
|
||||
@@ -169,7 +169,7 @@ impl From<SetNewTenantConfigError> for ApiError {
|
||||
fn from(e: SetNewTenantConfigError) -> ApiError {
|
||||
match e {
|
||||
SetNewTenantConfigError::GetTenant(tid) => {
|
||||
ApiError::NotFound(anyhow!("tenant {}", tid).into())
|
||||
ApiError::NotFound(anyhow!("tenant {}", tid))
|
||||
}
|
||||
e @ SetNewTenantConfigError::Persist(_) => {
|
||||
ApiError::InternalServerError(anyhow::Error::new(e))
|
||||
@@ -182,7 +182,7 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
|
||||
fn from(value: crate::tenant::DeleteTimelineError) -> Self {
|
||||
use crate::tenant::DeleteTimelineError::*;
|
||||
match value {
|
||||
NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()),
|
||||
NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found")),
|
||||
HasChildren(children) => ApiError::PreconditionFailed(
|
||||
format!("Cannot delete timeline which has child timelines: {children:?}")
|
||||
.into_boxed_str(),
|
||||
@@ -397,7 +397,7 @@ async fn timeline_detail_handler(
|
||||
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, false)
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?;
|
||||
.map_err(ApiError::NotFound)?;
|
||||
|
||||
let timeline_info = build_timeline_info(
|
||||
&timeline,
|
||||
@@ -1061,7 +1061,7 @@ async fn timeline_download_remote_layers_handler_get(
|
||||
let info = timeline
|
||||
.get_download_all_remote_layers_task_info()
|
||||
.context("task never started since last pageserver process start")
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?;
|
||||
.map_err(ApiError::NotFound)?;
|
||||
json_response(StatusCode::OK, info)
|
||||
}
|
||||
|
||||
@@ -1072,7 +1072,7 @@ async fn active_timeline_of_active_tenant(
|
||||
let tenant = mgr::get_tenant(tenant_id, true).await?;
|
||||
tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(|e| ApiError::NotFound(e.into()))
|
||||
.map_err(ApiError::NotFound)
|
||||
}
|
||||
|
||||
async fn always_panic_handler(
|
||||
@@ -1128,6 +1128,8 @@ async fn disk_usage_eviction_run(
|
||||
freed_bytes: 0,
|
||||
};
|
||||
|
||||
use crate::task_mgr::MGMT_REQUEST_RUNTIME;
|
||||
|
||||
let (tx, rx) = tokio::sync::oneshot::channel();
|
||||
|
||||
let state = get_state(&r);
|
||||
@@ -1145,7 +1147,7 @@ async fn disk_usage_eviction_run(
|
||||
let _g = cancel.drop_guard();
|
||||
|
||||
crate::task_mgr::spawn(
|
||||
crate::task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
MGMT_REQUEST_RUNTIME.handle(),
|
||||
TaskKind::DiskUsageEviction,
|
||||
None,
|
||||
None,
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
use metrics::metric_vec_duration::DurationResultObserver;
|
||||
use metrics::{
|
||||
register_counter_vec, register_histogram, register_histogram_vec, register_int_counter,
|
||||
register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec,
|
||||
@@ -425,27 +424,6 @@ pub static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub struct BasebackupQueryTime(HistogramVec);
|
||||
pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
|
||||
BasebackupQueryTime({
|
||||
register_histogram_vec!(
|
||||
"pageserver_basebackup_query_seconds",
|
||||
"Histogram of basebackup queries durations, by result type",
|
||||
&["result"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
})
|
||||
});
|
||||
|
||||
impl DurationResultObserver for BasebackupQueryTime {
|
||||
fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration) {
|
||||
let label_value = if res.is_ok() { "ok" } else { "error" };
|
||||
let metric = self.0.get_metric_with_label_values(&[label_value]).unwrap();
|
||||
metric.observe(duration.as_secs_f64());
|
||||
}
|
||||
}
|
||||
|
||||
pub static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"pageserver_live_connections",
|
||||
@@ -845,6 +823,11 @@ impl TimelineMetrics {
|
||||
let evictions_with_low_residence_duration =
|
||||
evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);
|
||||
|
||||
// TODO(chi): remove this once we remove Lazy for all metrics. Otherwise this will not appear in the exporter
|
||||
// and integration test will error.
|
||||
MATERIALIZED_PAGE_CACHE_HIT_DIRECT.get();
|
||||
MATERIALIZED_PAGE_CACHE_HIT.get();
|
||||
|
||||
TimelineMetrics {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
@@ -1319,8 +1302,4 @@ pub fn preinitialize_metrics() {
|
||||
|
||||
// Same as above for this metric, but, it's a Vec-type metric for which we don't know all the labels.
|
||||
BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT.reset();
|
||||
|
||||
// Python tests need these.
|
||||
MATERIALIZED_PAGE_CACHE_HIT_DIRECT.get();
|
||||
MATERIALIZED_PAGE_CACHE_HIT.get();
|
||||
}
|
||||
|
||||
@@ -390,9 +390,7 @@ impl PageServerHandler {
|
||||
};
|
||||
|
||||
// Check that the timeline exists
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(|e| anyhow::anyhow!(e))?;
|
||||
let timeline = tenant.get_timeline(timeline_id, true)?;
|
||||
|
||||
// switch client to COPYBOTH
|
||||
pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
|
||||
@@ -913,24 +911,10 @@ where
|
||||
None
|
||||
};
|
||||
|
||||
metrics::metric_vec_duration::observe_async_block_duration_by_result(
|
||||
&*crate::metrics::BASEBACKUP_QUERY_TIME,
|
||||
async move {
|
||||
self.handle_basebackup_request(
|
||||
pgb,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
None,
|
||||
false,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
anyhow::Ok(())
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
// Check that the timeline exists
|
||||
self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false, ctx)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
}
|
||||
// return pair of prev_lsn and last_lsn
|
||||
else if query_string.starts_with("get_last_record_rlsn ") {
|
||||
@@ -1246,6 +1230,6 @@ async fn get_active_tenant_timeline(
|
||||
.map_err(GetActiveTimelineError::Tenant)?;
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
|
||||
.map_err(GetActiveTimelineError::Timeline)?;
|
||||
Ok(timeline)
|
||||
}
|
||||
|
||||
@@ -421,21 +421,6 @@ remote:
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error, PartialEq, Eq)]
|
||||
pub enum GetTimelineError {
|
||||
#[error("Timeline {tenant_id}/{timeline_id} is not active, state: {state:?}")]
|
||||
NotActive {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
state: TimelineState,
|
||||
},
|
||||
#[error("Timeline {tenant_id}/{timeline_id} was not found")]
|
||||
NotFound {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum DeleteTimelineError {
|
||||
#[error("NotFound")]
|
||||
@@ -961,117 +946,6 @@ impl Tenant {
|
||||
tenant
|
||||
}
|
||||
|
||||
pub fn scan_and_sort_timelines_dir(
|
||||
self: Arc<Tenant>,
|
||||
) -> anyhow::Result<Vec<(TimelineId, TimelineMetadata)>> {
|
||||
let timelines_dir = self.conf.timelines_path(&self.tenant_id);
|
||||
let mut timelines_to_load: HashMap<TimelineId, TimelineMetadata> = HashMap::new();
|
||||
|
||||
for entry in
|
||||
std::fs::read_dir(&timelines_dir).context("list timelines directory for tenant")?
|
||||
{
|
||||
let entry = entry.context("read timeline dir entry")?;
|
||||
let timeline_dir = entry.path();
|
||||
|
||||
if crate::is_temporary(&timeline_dir) {
|
||||
info!(
|
||||
"Found temporary timeline directory, removing: {}",
|
||||
timeline_dir.display()
|
||||
);
|
||||
if let Err(e) = std::fs::remove_dir_all(&timeline_dir) {
|
||||
error!(
|
||||
"Failed to remove temporary directory '{}': {:?}",
|
||||
timeline_dir.display(),
|
||||
e
|
||||
);
|
||||
}
|
||||
} else if is_uninit_mark(&timeline_dir) {
|
||||
if !timeline_dir.exists() {
|
||||
warn!(
|
||||
"Timeline dir entry become invalid: {}",
|
||||
timeline_dir.display()
|
||||
);
|
||||
continue;
|
||||
}
|
||||
let timeline_uninit_mark_file = &timeline_dir;
|
||||
info!(
|
||||
"Found an uninit mark file {}, removing the timeline and its uninit mark",
|
||||
timeline_uninit_mark_file.display()
|
||||
);
|
||||
let timeline_id = timeline_uninit_mark_file
|
||||
.file_stem()
|
||||
.and_then(OsStr::to_str)
|
||||
.unwrap_or_default()
|
||||
.parse::<TimelineId>()
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Could not parse timeline id out of the timeline uninit mark name {}",
|
||||
timeline_uninit_mark_file.display()
|
||||
)
|
||||
})?;
|
||||
let timeline_dir = self.conf.timeline_path(&timeline_id, &self.tenant_id);
|
||||
if let Err(e) =
|
||||
remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file)
|
||||
{
|
||||
error!("Failed to clean up uninit marked timeline: {e:?}");
|
||||
}
|
||||
} else {
|
||||
if !timeline_dir.exists() {
|
||||
warn!(
|
||||
"Timeline dir entry become invalid: {}",
|
||||
timeline_dir.display()
|
||||
);
|
||||
continue;
|
||||
}
|
||||
let timeline_id = timeline_dir
|
||||
.file_name()
|
||||
.and_then(OsStr::to_str)
|
||||
.unwrap_or_default()
|
||||
.parse::<TimelineId>()
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Could not parse timeline id out of the timeline dir name {}",
|
||||
timeline_dir.display()
|
||||
)
|
||||
})?;
|
||||
let timeline_uninit_mark_file = self
|
||||
.conf
|
||||
.timeline_uninit_mark_file_path(self.tenant_id, timeline_id);
|
||||
if timeline_uninit_mark_file.exists() {
|
||||
info!(
|
||||
%timeline_id,
|
||||
"Found an uninit mark file, removing the timeline and its uninit mark",
|
||||
);
|
||||
if let Err(e) =
|
||||
remove_timeline_and_uninit_mark(&timeline_dir, &timeline_uninit_mark_file)
|
||||
{
|
||||
error!("Failed to clean up uninit marked timeline: {e:?}");
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
let file_name = entry.file_name();
|
||||
if let Ok(timeline_id) =
|
||||
file_name.to_str().unwrap_or_default().parse::<TimelineId>()
|
||||
{
|
||||
let metadata = load_metadata(self.conf, timeline_id, self.tenant_id)
|
||||
.context("failed to load metadata")?;
|
||||
timelines_to_load.insert(timeline_id, metadata);
|
||||
} else {
|
||||
// A file or directory that doesn't look like a timeline ID
|
||||
warn!(
|
||||
"unexpected file or directory in timelines directory: {}",
|
||||
file_name.to_string_lossy()
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Sort the array of timeline IDs into tree-order, so that parent comes before
|
||||
// all its children.
|
||||
tree_sort_timelines(timelines_to_load)
|
||||
}
|
||||
|
||||
///
|
||||
/// Background task to load in-memory data structures for this tenant, from
|
||||
/// files on disk. Used at pageserver startup.
|
||||
@@ -1088,16 +962,110 @@ impl Tenant {
|
||||
|
||||
utils::failpoint_sleep_millis_async!("before-loading-tenant");
|
||||
|
||||
// TODO split this into two functions, scan and actual load
|
||||
|
||||
// Load in-memory state to reflect the local files on disk
|
||||
//
|
||||
// Scan the directory, peek into the metadata file of each timeline, and
|
||||
// collect a list of timelines and their ancestors.
|
||||
let tenant_id = self.tenant_id;
|
||||
let conf = self.conf;
|
||||
let span = info_span!("blocking");
|
||||
let cloned = Arc::clone(self);
|
||||
|
||||
let sorted_timelines: Vec<(_, _)> = tokio::task::spawn_blocking(move || {
|
||||
let _g = span.entered();
|
||||
cloned.scan_and_sort_timelines_dir()
|
||||
let mut timelines_to_load: HashMap<TimelineId, TimelineMetadata> = HashMap::new();
|
||||
let timelines_dir = conf.timelines_path(&tenant_id);
|
||||
|
||||
for entry in
|
||||
std::fs::read_dir(&timelines_dir).context("list timelines directory for tenant")?
|
||||
{
|
||||
let entry = entry.context("read timeline dir entry")?;
|
||||
let timeline_dir = entry.path();
|
||||
|
||||
if crate::is_temporary(&timeline_dir) {
|
||||
info!(
|
||||
"Found temporary timeline directory, removing: {}",
|
||||
timeline_dir.display()
|
||||
);
|
||||
if let Err(e) = std::fs::remove_dir_all(&timeline_dir) {
|
||||
error!(
|
||||
"Failed to remove temporary directory '{}': {:?}",
|
||||
timeline_dir.display(),
|
||||
e
|
||||
);
|
||||
}
|
||||
} else if is_uninit_mark(&timeline_dir) {
|
||||
let timeline_uninit_mark_file = &timeline_dir;
|
||||
info!(
|
||||
"Found an uninit mark file {}, removing the timeline and its uninit mark",
|
||||
timeline_uninit_mark_file.display()
|
||||
);
|
||||
let timeline_id = timeline_uninit_mark_file
|
||||
.file_stem()
|
||||
.and_then(OsStr::to_str)
|
||||
.unwrap_or_default()
|
||||
.parse::<TimelineId>()
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Could not parse timeline id out of the timeline uninit mark name {}",
|
||||
timeline_uninit_mark_file.display()
|
||||
)
|
||||
})?;
|
||||
let timeline_dir = conf.timeline_path(&timeline_id, &tenant_id);
|
||||
if let Err(e) =
|
||||
remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file)
|
||||
{
|
||||
error!("Failed to clean up uninit marked timeline: {e:?}");
|
||||
}
|
||||
} else {
|
||||
let timeline_id = timeline_dir
|
||||
.file_name()
|
||||
.and_then(OsStr::to_str)
|
||||
.unwrap_or_default()
|
||||
.parse::<TimelineId>()
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Could not parse timeline id out of the timeline dir name {}",
|
||||
timeline_dir.display()
|
||||
)
|
||||
})?;
|
||||
let timeline_uninit_mark_file =
|
||||
conf.timeline_uninit_mark_file_path(tenant_id, timeline_id);
|
||||
if timeline_uninit_mark_file.exists() {
|
||||
info!(
|
||||
%timeline_id,
|
||||
"Found an uninit mark file, removing the timeline and its uninit mark",
|
||||
);
|
||||
if let Err(e) = remove_timeline_and_uninit_mark(
|
||||
&timeline_dir,
|
||||
&timeline_uninit_mark_file,
|
||||
) {
|
||||
error!("Failed to clean up uninit marked timeline: {e:?}");
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
let file_name = entry.file_name();
|
||||
if let Ok(timeline_id) =
|
||||
file_name.to_str().unwrap_or_default().parse::<TimelineId>()
|
||||
{
|
||||
let metadata = load_metadata(conf, timeline_id, tenant_id)
|
||||
.context("failed to load metadata")?;
|
||||
timelines_to_load.insert(timeline_id, metadata);
|
||||
} else {
|
||||
// A file or directory that doesn't look like a timeline ID
|
||||
warn!(
|
||||
"unexpected file or directory in timelines directory: {}",
|
||||
file_name.to_string_lossy()
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Sort the array of timeline IDs into tree-order, so that parent comes before
|
||||
// all its children.
|
||||
tree_sort_timelines(timelines_to_load)
|
||||
})
|
||||
.await
|
||||
.context("load spawn_blocking")
|
||||
@@ -1245,21 +1213,19 @@ impl Tenant {
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
active_only: bool,
|
||||
) -> Result<Arc<Timeline>, GetTimelineError> {
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
let timelines_accessor = self.timelines.lock().unwrap();
|
||||
let timeline = timelines_accessor
|
||||
.get(&timeline_id)
|
||||
.ok_or(GetTimelineError::NotFound {
|
||||
tenant_id: self.tenant_id,
|
||||
timeline_id,
|
||||
})?;
|
||||
let timeline = timelines_accessor.get(&timeline_id).with_context(|| {
|
||||
format!("Timeline {}/{} was not found", self.tenant_id, timeline_id)
|
||||
})?;
|
||||
|
||||
if active_only && !timeline.is_active() {
|
||||
Err(GetTimelineError::NotActive {
|
||||
tenant_id: self.tenant_id,
|
||||
anyhow::bail!(
|
||||
"Timeline {}/{} is not active, state: {:?}",
|
||||
self.tenant_id,
|
||||
timeline_id,
|
||||
state: timeline.current_state(),
|
||||
})
|
||||
timeline.current_state()
|
||||
)
|
||||
} else {
|
||||
Ok(Arc::clone(timeline))
|
||||
}
|
||||
@@ -3409,8 +3375,9 @@ where
|
||||
#[cfg(test)]
|
||||
pub mod harness {
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use once_cell::sync::Lazy;
|
||||
use once_cell::sync::OnceCell;
|
||||
use std::sync::Arc;
|
||||
use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard};
|
||||
use std::{fs, path::PathBuf};
|
||||
use utils::logging;
|
||||
use utils::lsn::Lsn;
|
||||
@@ -3443,6 +3410,8 @@ pub mod harness {
|
||||
buf.freeze()
|
||||
}
|
||||
|
||||
static LOCK: Lazy<RwLock<()>> = Lazy::new(|| RwLock::new(()));
|
||||
|
||||
impl From<TenantConf> for TenantConfOpt {
|
||||
fn from(tenant_conf: TenantConf) -> Self {
|
||||
Self {
|
||||
@@ -3469,16 +3438,33 @@ pub mod harness {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TenantHarness {
|
||||
pub struct TenantHarness<'a> {
|
||||
pub conf: &'static PageServerConf,
|
||||
pub tenant_conf: TenantConf,
|
||||
pub tenant_id: TenantId,
|
||||
|
||||
pub lock_guard: (
|
||||
Option<RwLockReadGuard<'a, ()>>,
|
||||
Option<RwLockWriteGuard<'a, ()>>,
|
||||
),
|
||||
}
|
||||
|
||||
static LOG_HANDLE: OnceCell<()> = OnceCell::new();
|
||||
|
||||
impl TenantHarness {
|
||||
impl<'a> TenantHarness<'a> {
|
||||
pub fn create(test_name: &'static str) -> anyhow::Result<Self> {
|
||||
Self::create_internal(test_name, false)
|
||||
}
|
||||
pub fn create_exclusive(test_name: &'static str) -> anyhow::Result<Self> {
|
||||
Self::create_internal(test_name, true)
|
||||
}
|
||||
fn create_internal(test_name: &'static str, exclusive: bool) -> anyhow::Result<Self> {
|
||||
let lock_guard = if exclusive {
|
||||
(None, Some(LOCK.write().unwrap()))
|
||||
} else {
|
||||
(Some(LOCK.read().unwrap()), None)
|
||||
};
|
||||
|
||||
LOG_HANDLE.get_or_init(|| {
|
||||
logging::init(
|
||||
logging::LogFormat::Test,
|
||||
@@ -3514,6 +3500,7 @@ pub mod harness {
|
||||
conf,
|
||||
tenant_conf,
|
||||
tenant_id,
|
||||
lock_guard,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -3538,12 +3525,26 @@ pub mod harness {
|
||||
self.tenant_id,
|
||||
None,
|
||||
));
|
||||
// populate tenant with locally available timelines
|
||||
let mut timelines_to_load = HashMap::new();
|
||||
for timeline_dir_entry in fs::read_dir(self.conf.timelines_path(&self.tenant_id))
|
||||
.expect("should be able to read timelines dir")
|
||||
{
|
||||
let timeline_dir_entry = timeline_dir_entry?;
|
||||
let timeline_id: TimelineId = timeline_dir_entry
|
||||
.path()
|
||||
.file_name()
|
||||
.unwrap()
|
||||
.to_string_lossy()
|
||||
.parse()?;
|
||||
|
||||
let timeline_metadata = load_metadata(self.conf, timeline_id, self.tenant_id)?;
|
||||
timelines_to_load.insert(timeline_id, timeline_metadata);
|
||||
}
|
||||
tenant
|
||||
.load(None, ctx)
|
||||
.instrument(info_span!("try_load", tenant_id=%self.tenant_id))
|
||||
.await?;
|
||||
|
||||
// TODO reuse Tenant::activate (needs broker)
|
||||
tenant.state.send_replace(TenantState::Active);
|
||||
for timeline in tenant.timelines.lock().unwrap().values() {
|
||||
timeline.set_state(TimelineState::Active);
|
||||
@@ -4069,13 +4070,9 @@ mod tests {
|
||||
std::fs::write(metadata_path, metadata_bytes)?;
|
||||
|
||||
let err = harness.try_load(&ctx).await.err().expect("should fail");
|
||||
// get all the stack with all .context, not tonly the last one
|
||||
let message = format!("{err:#}");
|
||||
let expected = "Failed to parse metadata bytes from path";
|
||||
assert!(
|
||||
message.contains(expected),
|
||||
"message '{message}' expected to contain {expected}"
|
||||
);
|
||||
assert!(err
|
||||
.to_string()
|
||||
.starts_with("Failed to parse metadata bytes from path"));
|
||||
|
||||
let mut found_error_message = false;
|
||||
let mut err_source = err.source();
|
||||
@@ -4509,44 +4506,6 @@ mod tests {
|
||||
assert!(expect_initdb_optimization);
|
||||
assert!(initdb_optimization_count > 0);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_uninit_mark_crash() -> anyhow::Result<()> {
|
||||
let name = "test_uninit_mark_crash";
|
||||
let harness = TenantHarness::create(name)?;
|
||||
{
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline =
|
||||
tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
|
||||
// Keeps uninit mark in place
|
||||
std::mem::forget(tline);
|
||||
}
|
||||
|
||||
let (tenant, _) = harness.load().await;
|
||||
match tenant.get_timeline(TIMELINE_ID, false) {
|
||||
Ok(_) => panic!("timeline should've been removed during load"),
|
||||
Err(e) => {
|
||||
assert_eq!(
|
||||
e,
|
||||
GetTimelineError::NotFound {
|
||||
tenant_id: tenant.tenant_id,
|
||||
timeline_id: TIMELINE_ID,
|
||||
}
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
assert!(!harness
|
||||
.conf
|
||||
.timeline_path(&TIMELINE_ID, &tenant.tenant_id)
|
||||
.exists());
|
||||
|
||||
assert!(!harness
|
||||
.conf
|
||||
.timeline_uninit_mark_file_path(tenant.tenant_id, TIMELINE_ID)
|
||||
.exists());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -675,7 +675,7 @@ pub async fn immediate_gc(
|
||||
.get(&tenant_id)
|
||||
.map(Arc::clone)
|
||||
.with_context(|| format!("tenant {tenant_id}"))
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?;
|
||||
.map_err(ApiError::NotFound)?;
|
||||
|
||||
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
|
||||
// Use tenant's pitr setting
|
||||
@@ -724,11 +724,11 @@ pub async fn immediate_compact(
|
||||
.get(&tenant_id)
|
||||
.map(Arc::clone)
|
||||
.with_context(|| format!("tenant {tenant_id}"))
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?;
|
||||
.map_err(ApiError::NotFound)?;
|
||||
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?;
|
||||
.map_err(ApiError::NotFound)?;
|
||||
|
||||
// Run in task_mgr to avoid race with tenant_detach operation
|
||||
let ctx = ctx.detached_child(TaskKind::Compaction, DownloadBehavior::Download);
|
||||
|
||||
@@ -1367,7 +1367,7 @@ mod tests {
|
||||
struct TestSetup {
|
||||
runtime: &'static tokio::runtime::Runtime,
|
||||
entered_runtime: EnterGuard<'static>,
|
||||
harness: TenantHarness,
|
||||
harness: TenantHarness<'static>,
|
||||
tenant: Arc<Tenant>,
|
||||
tenant_ctx: RequestContext,
|
||||
remote_fs_dir: PathBuf,
|
||||
|
||||
@@ -12,7 +12,7 @@ use crate::context::RequestContext;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::{Context, Result};
|
||||
use anyhow::Result;
|
||||
use bytes::Bytes;
|
||||
use enum_map::EnumMap;
|
||||
use enumset::EnumSet;
|
||||
@@ -343,8 +343,7 @@ impl LayerAccessStats {
|
||||
/// All layers should implement a minimal `std::fmt::Debug` without tenant or
|
||||
/// timeline names, because those are known in the context of which the layers
|
||||
/// are used in (timeline).
|
||||
#[async_trait::async_trait]
|
||||
pub trait Layer: std::fmt::Debug + Send + Sync + 'static {
|
||||
pub trait Layer: std::fmt::Debug + Send + Sync {
|
||||
/// Range of keys that this layer covers
|
||||
fn get_key_range(&self) -> Range<Key>;
|
||||
|
||||
@@ -374,42 +373,13 @@ pub trait Layer: std::fmt::Debug + Send + Sync + 'static {
|
||||
/// is available. If this returns ValueReconstructResult::Continue, look up
|
||||
/// the predecessor layer and call again with the same 'reconstruct_data' to
|
||||
/// collect more data.
|
||||
fn get_value_reconstruct_data_blocking(
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_data: ValueReconstructState,
|
||||
ctx: RequestContext,
|
||||
) -> Result<(ValueReconstructState, ValueReconstructResult)>;
|
||||
|
||||
/// CANCEL SAFETY: if the returned future is dropped,
|
||||
/// the wrapped closure still run to completion and the return value discarded.
|
||||
/// For the case of get_value_reconstruct_data, we expect the closure to not
|
||||
/// have any side effects, as it only attempts to read a layer (and stuff like
|
||||
/// page cache isn't considered a real side effect).
|
||||
/// But, ...
|
||||
/// TRACING:
|
||||
/// If the returned future is cancelled, the spawn_blocking span can outlive
|
||||
/// the caller's span.
|
||||
/// So, technically, we should be using `parent: None` and `follows_from: current`
|
||||
/// instead. However, in practice, the advantage of maintaining the span stack
|
||||
/// in logs outweighs the disadvantage of having a dangling span in a case that
|
||||
/// is not expected to happen because in pageserver we generally don't drop pending futures.
|
||||
async fn get_value_reconstruct_data(
|
||||
self: Arc<Self>,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_data: ValueReconstructState,
|
||||
ctx: RequestContext,
|
||||
) -> Result<(ValueReconstructState, ValueReconstructResult)> {
|
||||
let span = tracing::info_span!("get_value_reconstruct_data_spawn_blocking");
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let _enter = span.enter();
|
||||
self.get_value_reconstruct_data_blocking(key, lsn_range, reconstruct_data, ctx)
|
||||
})
|
||||
.await
|
||||
.context("spawn_blocking")?
|
||||
}
|
||||
reconstruct_data: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<ValueReconstructResult>;
|
||||
|
||||
/// A short ID string that uniquely identifies the given layer within a [`LayerMap`].
|
||||
fn short_id(&self) -> String;
|
||||
@@ -529,7 +499,6 @@ impl LayerDescriptor {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for LayerDescriptor {
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
self.key.clone()
|
||||
@@ -543,13 +512,13 @@ impl Layer for LayerDescriptor {
|
||||
self.is_incremental
|
||||
}
|
||||
|
||||
fn get_value_reconstruct_data_blocking(
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
_key: Key,
|
||||
_lsn_range: Range<Lsn>,
|
||||
_reconstruct_data: ValueReconstructState,
|
||||
_ctx: RequestContext,
|
||||
) -> Result<(ValueReconstructState, ValueReconstructResult)> {
|
||||
_reconstruct_data: &mut ValueReconstructState,
|
||||
_ctx: &RequestContext,
|
||||
) -> Result<ValueReconstructResult> {
|
||||
todo!("This method shouldn't be part of the Layer trait")
|
||||
}
|
||||
|
||||
|
||||
@@ -218,7 +218,6 @@ impl std::fmt::Debug for DeltaLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for DeltaLayer {
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
@@ -295,13 +294,13 @@ impl Layer for DeltaLayer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_value_reconstruct_data_blocking(
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
mut reconstruct_state: ValueReconstructState,
|
||||
ctx: RequestContext,
|
||||
) -> anyhow::Result<(ValueReconstructState, ValueReconstructResult)> {
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
ensure!(lsn_range.start >= self.desc.lsn_range.start);
|
||||
let mut need_image = true;
|
||||
|
||||
@@ -309,7 +308,7 @@ impl Layer for DeltaLayer {
|
||||
|
||||
{
|
||||
// Open the file and lock the metadata in memory
|
||||
let inner = self.load(LayerAccessKind::GetValueReconstructData, &ctx)?;
|
||||
let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
|
||||
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
let file = &inner.file;
|
||||
@@ -375,9 +374,9 @@ impl Layer for DeltaLayer {
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know.
|
||||
if need_image {
|
||||
Ok((reconstruct_state, ValueReconstructResult::Continue))
|
||||
Ok(ValueReconstructResult::Continue)
|
||||
} else {
|
||||
Ok((reconstruct_state, ValueReconstructResult::Complete))
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -149,7 +149,6 @@ impl std::fmt::Debug for ImageLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for ImageLayer {
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
@@ -182,18 +181,18 @@ impl Layer for ImageLayer {
|
||||
}
|
||||
|
||||
/// Look up given page in the file
|
||||
fn get_value_reconstruct_data_blocking(
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
mut reconstruct_state: ValueReconstructState,
|
||||
ctx: RequestContext,
|
||||
) -> anyhow::Result<(ValueReconstructState, ValueReconstructResult)> {
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
assert!(self.desc.key_range.contains(&key));
|
||||
assert!(lsn_range.start >= self.lsn);
|
||||
assert!(lsn_range.end >= self.lsn);
|
||||
|
||||
let inner = self.load(LayerAccessKind::GetValueReconstructData, &ctx)?;
|
||||
let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
|
||||
|
||||
let file = inner.file.as_ref().unwrap();
|
||||
let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);
|
||||
@@ -211,9 +210,9 @@ impl Layer for ImageLayer {
|
||||
let value = Bytes::from(blob);
|
||||
|
||||
reconstruct_state.img = Some((self.lsn, value));
|
||||
Ok((reconstruct_state, ValueReconstructResult::Complete))
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
} else {
|
||||
Ok((reconstruct_state, ValueReconstructResult::Missing))
|
||||
Ok(ValueReconstructResult::Missing)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -110,7 +110,6 @@ impl InMemoryLayer {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for InMemoryLayer {
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
Key::MIN..Key::MAX
|
||||
@@ -191,13 +190,13 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
/// Look up given value in the layer.
|
||||
fn get_value_reconstruct_data_blocking(
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
mut reconstruct_state: ValueReconstructState,
|
||||
_ctx: RequestContext,
|
||||
) -> anyhow::Result<(ValueReconstructState, ValueReconstructResult)> {
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
_ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
ensure!(lsn_range.start >= self.start_lsn);
|
||||
let mut need_image = true;
|
||||
|
||||
@@ -214,7 +213,7 @@ impl Layer for InMemoryLayer {
|
||||
match value {
|
||||
Value::Image(img) => {
|
||||
reconstruct_state.img = Some((*entry_lsn, img));
|
||||
return Ok((reconstruct_state, ValueReconstructResult::Complete));
|
||||
return Ok(ValueReconstructResult::Complete);
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let will_init = rec.will_init();
|
||||
@@ -234,9 +233,9 @@ impl Layer for InMemoryLayer {
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know.
|
||||
if need_image {
|
||||
Ok((reconstruct_state, ValueReconstructResult::Continue))
|
||||
Ok(ValueReconstructResult::Continue)
|
||||
} else {
|
||||
Ok((reconstruct_state, ValueReconstructResult::Complete))
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@ use crate::context::RequestContext;
|
||||
use crate::repository::Key;
|
||||
use crate::tenant::layer_map::BatchedUpdates;
|
||||
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructState};
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use anyhow::{bail, Result};
|
||||
use pageserver_api::models::HistoricLayerInfo;
|
||||
use std::ops::Range;
|
||||
@@ -21,7 +21,7 @@ use utils::{
|
||||
use super::filename::{DeltaFileName, ImageFileName};
|
||||
use super::{
|
||||
DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter,
|
||||
LayerResidenceStatus, PersistentLayer, PersistentLayerDesc, ValueReconstructResult,
|
||||
LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
|
||||
};
|
||||
|
||||
/// RemoteLayer is a not yet downloaded [`ImageLayer`] or
|
||||
@@ -63,15 +63,14 @@ impl std::fmt::Debug for RemoteLayer {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for RemoteLayer {
|
||||
fn get_value_reconstruct_data_blocking(
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
_key: Key,
|
||||
_lsn_range: Range<Lsn>,
|
||||
_reconstruct_state: ValueReconstructState,
|
||||
_ctx: RequestContext,
|
||||
) -> Result<(ValueReconstructState, ValueReconstructResult)> {
|
||||
_reconstruct_state: &mut ValueReconstructState,
|
||||
_ctx: &RequestContext,
|
||||
) -> Result<ValueReconstructResult> {
|
||||
bail!(
|
||||
"layer {} needs to be downloaded",
|
||||
self.filename().file_name()
|
||||
|
||||
@@ -129,7 +129,7 @@ pub struct Timeline {
|
||||
|
||||
pub pg_version: u32,
|
||||
|
||||
pub(crate) layers: Arc<tokio::sync::RwLock<LayerMap<dyn PersistentLayer>>>,
|
||||
pub(crate) layers: tokio::sync::RwLock<LayerMap<dyn PersistentLayer>>,
|
||||
|
||||
/// Set of key ranges which should be covered by image layers to
|
||||
/// allow GC to remove old layers. This set is created by GC and its cutoff LSN is also stored.
|
||||
@@ -555,14 +555,13 @@ impl Timeline {
|
||||
None => None,
|
||||
};
|
||||
|
||||
let reconstruct_state = ValueReconstructState {
|
||||
let mut reconstruct_state = ValueReconstructState {
|
||||
records: Vec::new(),
|
||||
img: cached_page_img,
|
||||
};
|
||||
|
||||
let timer = self.metrics.get_reconstruct_data_time_histo.start_timer();
|
||||
let reconstruct_state = self
|
||||
.get_reconstruct_data(key, lsn, reconstruct_state, ctx)
|
||||
self.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
|
||||
.await?;
|
||||
timer.stop_and_record();
|
||||
|
||||
@@ -1419,7 +1418,7 @@ impl Timeline {
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
pg_version,
|
||||
layers: Arc::new(tokio::sync::RwLock::new(LayerMap::default())),
|
||||
layers: tokio::sync::RwLock::new(LayerMap::default()),
|
||||
wanted_image_layers: Mutex::new(None),
|
||||
|
||||
walredo_mgr,
|
||||
@@ -2353,9 +2352,9 @@ impl Timeline {
|
||||
&self,
|
||||
key: Key,
|
||||
request_lsn: Lsn,
|
||||
mut reconstruct_state: ValueReconstructState,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<ValueReconstructState, PageReconstructError> {
|
||||
) -> Result<(), PageReconstructError> {
|
||||
// Start from the current timeline.
|
||||
let mut timeline_owned;
|
||||
let mut timeline = self;
|
||||
@@ -2385,12 +2384,12 @@ impl Timeline {
|
||||
// The function should have updated 'state'
|
||||
//info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn);
|
||||
match result {
|
||||
ValueReconstructResult::Complete => return Ok(reconstruct_state),
|
||||
ValueReconstructResult::Complete => return Ok(()),
|
||||
ValueReconstructResult::Continue => {
|
||||
// If we reached an earlier cached page image, we're done.
|
||||
if cont_lsn == cached_lsn + 1 {
|
||||
MATERIALIZED_PAGE_CACHE_HIT.inc_by(1);
|
||||
return Ok(reconstruct_state);
|
||||
return Ok(());
|
||||
}
|
||||
if prev_lsn <= cont_lsn {
|
||||
// Didn't make any progress in last iteration. Error out to avoid
|
||||
@@ -2494,19 +2493,13 @@ impl Timeline {
|
||||
// Get all the data needed to reconstruct the page version from this layer.
|
||||
// But if we have an older cached page image, no need to go past that.
|
||||
let lsn_floor = max(cached_lsn + 1, start_lsn);
|
||||
result = match Arc::clone(open_layer)
|
||||
.get_value_reconstruct_data(
|
||||
key,
|
||||
lsn_floor..cont_lsn,
|
||||
reconstruct_state,
|
||||
ctx.attached_child(),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok((new_reconstruct_state, result)) => {
|
||||
reconstruct_state = new_reconstruct_state;
|
||||
result
|
||||
}
|
||||
result = match open_layer.get_value_reconstruct_data(
|
||||
key,
|
||||
lsn_floor..cont_lsn,
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
) {
|
||||
Ok(result) => result,
|
||||
Err(e) => return Err(PageReconstructError::from(e)),
|
||||
};
|
||||
cont_lsn = lsn_floor;
|
||||
@@ -2527,19 +2520,13 @@ impl Timeline {
|
||||
if cont_lsn > start_lsn {
|
||||
//info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
|
||||
let lsn_floor = max(cached_lsn + 1, start_lsn);
|
||||
result = match Arc::clone(frozen_layer)
|
||||
.get_value_reconstruct_data(
|
||||
key,
|
||||
lsn_floor..cont_lsn,
|
||||
reconstruct_state,
|
||||
ctx.attached_child(),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok((new_reconstruct_state, result)) => {
|
||||
reconstruct_state = new_reconstruct_state;
|
||||
result
|
||||
}
|
||||
result = match frozen_layer.get_value_reconstruct_data(
|
||||
key,
|
||||
lsn_floor..cont_lsn,
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
) {
|
||||
Ok(result) => result,
|
||||
Err(e) => return Err(PageReconstructError::from(e)),
|
||||
};
|
||||
cont_lsn = lsn_floor;
|
||||
@@ -2568,19 +2555,13 @@ impl Timeline {
|
||||
// Get all the data needed to reconstruct the page version from this layer.
|
||||
// But if we have an older cached page image, no need to go past that.
|
||||
let lsn_floor = max(cached_lsn + 1, lsn_floor);
|
||||
result = match Arc::clone(&layer)
|
||||
.get_value_reconstruct_data(
|
||||
key,
|
||||
lsn_floor..cont_lsn,
|
||||
reconstruct_state,
|
||||
ctx.attached_child(),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok((new_reconstruct_state, result)) => {
|
||||
reconstruct_state = new_reconstruct_state;
|
||||
result
|
||||
}
|
||||
result = match layer.get_value_reconstruct_data(
|
||||
key,
|
||||
lsn_floor..cont_lsn,
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
) {
|
||||
Ok(result) => result,
|
||||
Err(e) => return Err(PageReconstructError::from(e)),
|
||||
};
|
||||
cont_lsn = lsn_floor;
|
||||
@@ -3389,14 +3370,14 @@ struct CompactLevel0Phase1StatsBuilder {
|
||||
version: Option<u64>,
|
||||
tenant_id: Option<TenantId>,
|
||||
timeline_id: Option<TimelineId>,
|
||||
read_lock_acquisition_micros: DurationRecorder,
|
||||
read_lock_held_spawn_blocking_startup_micros: DurationRecorder,
|
||||
read_lock_held_prerequisites_micros: DurationRecorder,
|
||||
read_lock_held_compute_holes_micros: DurationRecorder,
|
||||
read_lock_drop_micros: DurationRecorder,
|
||||
prepare_iterators_micros: DurationRecorder,
|
||||
write_layer_files_micros: DurationRecorder,
|
||||
first_read_lock_acquisition_micros: DurationRecorder,
|
||||
get_level0_deltas_plus_drop_lock_micros: DurationRecorder,
|
||||
level0_deltas_count: Option<usize>,
|
||||
time_spent_between_locks: DurationRecorder,
|
||||
second_read_lock_acquisition_micros: DurationRecorder,
|
||||
second_read_lock_held_micros: DurationRecorder,
|
||||
sort_holes_micros: DurationRecorder,
|
||||
write_layer_files_micros: DurationRecorder,
|
||||
new_deltas_count: Option<usize>,
|
||||
new_deltas_size: Option<u64>,
|
||||
}
|
||||
@@ -3409,14 +3390,14 @@ struct CompactLevel0Phase1Stats {
|
||||
tenant_id: TenantId,
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
timeline_id: TimelineId,
|
||||
read_lock_acquisition_micros: RecordedDuration,
|
||||
read_lock_held_spawn_blocking_startup_micros: RecordedDuration,
|
||||
read_lock_held_prerequisites_micros: RecordedDuration,
|
||||
read_lock_held_compute_holes_micros: RecordedDuration,
|
||||
read_lock_drop_micros: RecordedDuration,
|
||||
prepare_iterators_micros: RecordedDuration,
|
||||
write_layer_files_micros: RecordedDuration,
|
||||
first_read_lock_acquisition_micros: RecordedDuration,
|
||||
get_level0_deltas_plus_drop_lock_micros: RecordedDuration,
|
||||
level0_deltas_count: usize,
|
||||
time_spent_between_locks: RecordedDuration,
|
||||
second_read_lock_acquisition_micros: RecordedDuration,
|
||||
second_read_lock_held_micros: RecordedDuration,
|
||||
sort_holes_micros: RecordedDuration,
|
||||
write_layer_files_micros: RecordedDuration,
|
||||
new_deltas_count: usize,
|
||||
new_deltas_size: u64,
|
||||
}
|
||||
@@ -3425,51 +3406,54 @@ impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
|
||||
type Error = anyhow::Error;
|
||||
|
||||
fn try_from(value: CompactLevel0Phase1StatsBuilder) -> Result<Self, Self::Error> {
|
||||
Ok(Self {
|
||||
version: value.version.ok_or_else(|| anyhow!("version not set"))?,
|
||||
tenant_id: value
|
||||
.tenant_id
|
||||
.ok_or_else(|| anyhow!("tenant_id not set"))?,
|
||||
timeline_id: value
|
||||
.timeline_id
|
||||
.ok_or_else(|| anyhow!("timeline_id not set"))?,
|
||||
read_lock_acquisition_micros: value
|
||||
.read_lock_acquisition_micros
|
||||
let CompactLevel0Phase1StatsBuilder {
|
||||
version,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
first_read_lock_acquisition_micros,
|
||||
get_level0_deltas_plus_drop_lock_micros,
|
||||
level0_deltas_count,
|
||||
time_spent_between_locks,
|
||||
second_read_lock_acquisition_micros,
|
||||
second_read_lock_held_micros,
|
||||
sort_holes_micros,
|
||||
write_layer_files_micros,
|
||||
new_deltas_count,
|
||||
new_deltas_size,
|
||||
} = value;
|
||||
Ok(CompactLevel0Phase1Stats {
|
||||
version: version.ok_or_else(|| anyhow::anyhow!("version not set"))?,
|
||||
tenant_id: tenant_id.ok_or_else(|| anyhow::anyhow!("tenant_id not set"))?,
|
||||
timeline_id: timeline_id.ok_or_else(|| anyhow::anyhow!("timeline_id not set"))?,
|
||||
first_read_lock_acquisition_micros: first_read_lock_acquisition_micros
|
||||
.into_recorded()
|
||||
.ok_or_else(|| anyhow!("read_lock_acquisition_micros not set"))?,
|
||||
read_lock_held_spawn_blocking_startup_micros: value
|
||||
.read_lock_held_spawn_blocking_startup_micros
|
||||
.ok_or_else(|| anyhow::anyhow!("first_read_lock_acquisition_micros not set"))?,
|
||||
get_level0_deltas_plus_drop_lock_micros: get_level0_deltas_plus_drop_lock_micros
|
||||
.into_recorded()
|
||||
.ok_or_else(|| anyhow!("read_lock_held_spawn_blocking_startup_micros not set"))?,
|
||||
read_lock_held_prerequisites_micros: value
|
||||
.read_lock_held_prerequisites_micros
|
||||
.ok_or_else(|| {
|
||||
anyhow::anyhow!("get_level0_deltas_plus_drop_lock_micros not set")
|
||||
})?,
|
||||
level0_deltas_count: level0_deltas_count
|
||||
.ok_or_else(|| anyhow::anyhow!("level0_deltas_count not set"))?,
|
||||
time_spent_between_locks: time_spent_between_locks
|
||||
.into_recorded()
|
||||
.ok_or_else(|| anyhow!("read_lock_held_prerequisites_micros not set"))?,
|
||||
read_lock_held_compute_holes_micros: value
|
||||
.read_lock_held_compute_holes_micros
|
||||
.ok_or_else(|| anyhow::anyhow!("time_spent_between_locks not set"))?,
|
||||
second_read_lock_acquisition_micros: second_read_lock_acquisition_micros
|
||||
.into_recorded()
|
||||
.ok_or_else(|| anyhow!("read_lock_held_compute_holes_micros not set"))?,
|
||||
read_lock_drop_micros: value
|
||||
.read_lock_drop_micros
|
||||
.ok_or_else(|| anyhow::anyhow!("second_read_lock_acquisition_micros not set"))?,
|
||||
second_read_lock_held_micros: second_read_lock_held_micros
|
||||
.into_recorded()
|
||||
.ok_or_else(|| anyhow!("read_lock_drop_micros not set"))?,
|
||||
prepare_iterators_micros: value
|
||||
.prepare_iterators_micros
|
||||
.ok_or_else(|| anyhow::anyhow!("second_read_lock_held_micros not set"))?,
|
||||
sort_holes_micros: sort_holes_micros
|
||||
.into_recorded()
|
||||
.ok_or_else(|| anyhow!("prepare_iterators_micros not set"))?,
|
||||
write_layer_files_micros: value
|
||||
.write_layer_files_micros
|
||||
.ok_or_else(|| anyhow::anyhow!("sort_holes_micros not set"))?,
|
||||
write_layer_files_micros: write_layer_files_micros
|
||||
.into_recorded()
|
||||
.ok_or_else(|| anyhow!("write_layer_files_micros not set"))?,
|
||||
level0_deltas_count: value
|
||||
.level0_deltas_count
|
||||
.ok_or_else(|| anyhow!("level0_deltas_count not set"))?,
|
||||
new_deltas_count: value
|
||||
.new_deltas_count
|
||||
.ok_or_else(|| anyhow!("new_deltas_count not set"))?,
|
||||
new_deltas_size: value
|
||||
.new_deltas_size
|
||||
.ok_or_else(|| anyhow!("new_deltas_size not set"))?,
|
||||
.ok_or_else(|| anyhow::anyhow!("write_layer_files_micros not set"))?,
|
||||
new_deltas_count: new_deltas_count
|
||||
.ok_or_else(|| anyhow::anyhow!("new_deltas_count not set"))?,
|
||||
new_deltas_size: new_deltas_size
|
||||
.ok_or_else(|| anyhow::anyhow!("new_deltas_size not set"))?,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -3480,18 +3464,30 @@ impl Timeline {
|
||||
/// This method takes the `_layer_removal_cs` guard to highlight it required downloads are
|
||||
/// returned as an error. If the `layer_removal_cs` boundary is changed not to be taken in the
|
||||
/// start of level0 files compaction, the on-demand download should be revisited as well.
|
||||
fn compact_level0_phase1(
|
||||
self: Arc<Self>,
|
||||
async fn compact_level0_phase1(
|
||||
&self,
|
||||
_layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
layers: tokio::sync::OwnedRwLockReadGuard<LayerMap<dyn PersistentLayer>>,
|
||||
mut stats: CompactLevel0Phase1StatsBuilder,
|
||||
target_file_size: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<CompactLevel0Phase1Result, CompactionError> {
|
||||
stats.read_lock_held_spawn_blocking_startup_micros =
|
||||
stats.read_lock_acquisition_micros.till_now(); // set by caller
|
||||
let mut stats = CompactLevel0Phase1StatsBuilder {
|
||||
version: Some(1),
|
||||
tenant_id: Some(self.tenant_id),
|
||||
timeline_id: Some(self.timeline_id),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let begin = tokio::time::Instant::now();
|
||||
let layers = self.layers.read().await;
|
||||
let now = tokio::time::Instant::now();
|
||||
stats.first_read_lock_acquisition_micros =
|
||||
DurationRecorder::Recorded(RecordedDuration(now - begin), now);
|
||||
let mut level0_deltas = layers.get_level0_deltas()?;
|
||||
drop(layers);
|
||||
stats.level0_deltas_count = Some(level0_deltas.len());
|
||||
stats.get_level0_deltas_plus_drop_lock_micros =
|
||||
stats.first_read_lock_acquisition_micros.till_now();
|
||||
|
||||
// Only compact if enough layers have accumulated.
|
||||
let threshold = self.get_compaction_threshold();
|
||||
if level0_deltas.is_empty() || level0_deltas.len() < threshold {
|
||||
@@ -3569,53 +3565,6 @@ impl Timeline {
|
||||
// we don't accidentally use it later in the function.
|
||||
drop(level0_deltas);
|
||||
|
||||
stats.read_lock_held_prerequisites_micros = stats
|
||||
.read_lock_held_spawn_blocking_startup_micros
|
||||
.till_now();
|
||||
|
||||
// Determine N largest holes where N is number of compacted layers.
|
||||
let max_holes = deltas_to_compact.len();
|
||||
let last_record_lsn = self.get_last_record_lsn();
|
||||
let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128;
|
||||
let min_hole_coverage_size = 3; // TODO: something more flexible?
|
||||
|
||||
// min-heap (reserve space for one more element added before eviction)
|
||||
let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
|
||||
let mut prev: Option<Key> = None;
|
||||
for (next_key, _next_lsn, _size) in itertools::process_results(
|
||||
deltas_to_compact.iter().map(|l| l.key_iter(ctx)),
|
||||
|iter_iter| iter_iter.kmerge_by(|a, b| a.0 <= b.0),
|
||||
)? {
|
||||
if let Some(prev_key) = prev {
|
||||
// just first fast filter
|
||||
if next_key.to_i128() - prev_key.to_i128() >= min_hole_range {
|
||||
let key_range = prev_key..next_key;
|
||||
// Measuring hole by just subtraction of i128 representation of key range boundaries
|
||||
// has not so much sense, because largest holes will corresponds field1/field2 changes.
|
||||
// But we are mostly interested to eliminate holes which cause generation of excessive image layers.
|
||||
// That is why it is better to measure size of hole as number of covering image layers.
|
||||
let coverage_size = layers.image_coverage(&key_range, last_record_lsn)?.len();
|
||||
if coverage_size >= min_hole_coverage_size {
|
||||
heap.push(Hole {
|
||||
key_range,
|
||||
coverage_size,
|
||||
});
|
||||
if heap.len() > max_holes {
|
||||
heap.pop(); // remove smallest hole
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
prev = Some(next_key.next());
|
||||
}
|
||||
stats.read_lock_held_compute_holes_micros =
|
||||
stats.read_lock_held_prerequisites_micros.till_now();
|
||||
drop(layers);
|
||||
stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now();
|
||||
let mut holes = heap.into_vec();
|
||||
holes.sort_unstable_by_key(|hole| hole.key_range.start);
|
||||
let mut next_hole = 0; // index of next hole in holes vector
|
||||
|
||||
// This iterator walks through all key-value pairs from all the layers
|
||||
// we're compacting, in key, LSN order.
|
||||
let all_values_iter = itertools::process_results(
|
||||
@@ -3655,7 +3604,50 @@ impl Timeline {
|
||||
},
|
||||
)?;
|
||||
|
||||
stats.prepare_iterators_micros = stats.read_lock_drop_micros.till_now();
|
||||
// Determine N largest holes where N is number of compacted layers.
|
||||
let max_holes = deltas_to_compact.len();
|
||||
let last_record_lsn = self.get_last_record_lsn();
|
||||
stats.time_spent_between_locks = stats.get_level0_deltas_plus_drop_lock_micros.till_now();
|
||||
let layers = self.layers.read().await; // Is'n it better to hold original layers lock till here?
|
||||
stats.second_read_lock_acquisition_micros = stats.time_spent_between_locks.till_now();
|
||||
let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128;
|
||||
let min_hole_coverage_size = 3; // TODO: something more flexible?
|
||||
|
||||
// min-heap (reserve space for one more element added before eviction)
|
||||
let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
|
||||
let mut prev: Option<Key> = None;
|
||||
for (next_key, _next_lsn, _size) in itertools::process_results(
|
||||
deltas_to_compact.iter().map(|l| l.key_iter(ctx)),
|
||||
|iter_iter| iter_iter.kmerge_by(|a, b| a.0 <= b.0),
|
||||
)? {
|
||||
if let Some(prev_key) = prev {
|
||||
// just first fast filter
|
||||
if next_key.to_i128() - prev_key.to_i128() >= min_hole_range {
|
||||
let key_range = prev_key..next_key;
|
||||
// Measuring hole by just subtraction of i128 representation of key range boundaries
|
||||
// has not so much sense, because largest holes will corresponds field1/field2 changes.
|
||||
// But we are mostly interested to eliminate holes which cause generation of excessive image layers.
|
||||
// That is why it is better to measure size of hole as number of covering image layers.
|
||||
let coverage_size = layers.image_coverage(&key_range, last_record_lsn)?.len();
|
||||
if coverage_size >= min_hole_coverage_size {
|
||||
heap.push(Hole {
|
||||
key_range,
|
||||
coverage_size,
|
||||
});
|
||||
if heap.len() > max_holes {
|
||||
heap.pop(); // remove smallest hole
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
prev = Some(next_key.next());
|
||||
}
|
||||
drop(layers);
|
||||
stats.second_read_lock_held_micros = stats.second_read_lock_acquisition_micros.till_now();
|
||||
let mut holes = heap.into_vec();
|
||||
holes.sort_unstable_by_key(|hole| hole.key_range.start);
|
||||
let mut next_hole = 0; // index of next hole in holes vector
|
||||
stats.sort_holes_micros = stats.second_read_lock_held_micros.till_now();
|
||||
|
||||
// Merge the contents of all the input delta layers into a new set
|
||||
// of delta layers, based on the current partitioning.
|
||||
@@ -3815,7 +3807,7 @@ impl Timeline {
|
||||
layer_paths.pop().unwrap();
|
||||
}
|
||||
|
||||
stats.write_layer_files_micros = stats.prepare_iterators_micros.till_now();
|
||||
stats.write_layer_files_micros = stats.sort_holes_micros.till_now();
|
||||
stats.new_deltas_count = Some(new_layers.len());
|
||||
stats.new_deltas_size = Some(new_layers.iter().map(|l| l.desc.file_size).sum());
|
||||
|
||||
@@ -3854,36 +3846,9 @@ impl Timeline {
|
||||
let CompactLevel0Phase1Result {
|
||||
new_layers,
|
||||
deltas_to_compact,
|
||||
} = {
|
||||
let phase1_span = info_span!("compact_level0_phase1");
|
||||
let myself = Arc::clone(self);
|
||||
let ctx = ctx.attached_child(); // technically, the spawn_blocking can outlive this future
|
||||
let mut stats = CompactLevel0Phase1StatsBuilder {
|
||||
version: Some(2),
|
||||
tenant_id: Some(self.tenant_id),
|
||||
timeline_id: Some(self.timeline_id),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let begin = tokio::time::Instant::now();
|
||||
let phase1_layers_locked = Arc::clone(&self.layers).read_owned().await;
|
||||
let now = tokio::time::Instant::now();
|
||||
stats.read_lock_acquisition_micros =
|
||||
DurationRecorder::Recorded(RecordedDuration(now - begin), now);
|
||||
let layer_removal_cs = layer_removal_cs.clone();
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let _entered = phase1_span.enter();
|
||||
myself.compact_level0_phase1(
|
||||
layer_removal_cs,
|
||||
phase1_layers_locked,
|
||||
stats,
|
||||
target_file_size,
|
||||
&ctx,
|
||||
)
|
||||
})
|
||||
.await
|
||||
.context("spawn_blocking")??
|
||||
};
|
||||
} = self
|
||||
.compact_level0_phase1(layer_removal_cs.clone(), target_file_size, ctx)
|
||||
.await?;
|
||||
|
||||
if new_layers.is_empty() && deltas_to_compact.is_empty() {
|
||||
// nothing to do
|
||||
@@ -3988,7 +3953,7 @@ impl Timeline {
|
||||
/// for example. The caller should hold `Tenant::gc_cs` lock to ensure
|
||||
/// that.
|
||||
///
|
||||
#[instrument(skip_all, fields(timeline_id=%self.timeline_id))]
|
||||
#[instrument(skip_all, fields(timline_id=%self.timeline_id))]
|
||||
pub(super) async fn update_gc_info(
|
||||
&self,
|
||||
retain_lsns: Vec<Lsn>,
|
||||
|
||||
@@ -1321,7 +1321,7 @@ mod tests {
|
||||
|
||||
const DUMMY_SAFEKEEPER_HOST: &str = "safekeeper_connstr";
|
||||
|
||||
async fn dummy_state(harness: &TenantHarness) -> ConnectionManagerState {
|
||||
async fn dummy_state(harness: &TenantHarness<'_>) -> ConnectionManagerState {
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x8), crate::DEFAULT_PG_VERSION, &ctx)
|
||||
|
||||
@@ -8,6 +8,7 @@ OBJS = \
|
||||
libpagestore.o \
|
||||
libpqwalproposer.o \
|
||||
neon.o \
|
||||
access_stat.o \
|
||||
pagestore_smgr.o \
|
||||
relsize_cache.o \
|
||||
walproposer.o \
|
||||
|
||||
274
pgxn/neon/access_stat.c
Normal file
274
pgxn/neon/access_stat.c
Normal file
@@ -0,0 +1,274 @@
|
||||
|
||||
/*
|
||||
* We want this statistic to rpresent current access patern mthis is why when
|
||||
* (n_seq_accesses + n_rnd_accesses) > MAX_ACCESS_COUNTER then we divide both counters by two,
|
||||
* so decreasng weight of historical data
|
||||
*/
|
||||
#include "postgres.h"
|
||||
#include "funcapi.h"
|
||||
#include "miscadmin.h"
|
||||
#include "common/hashfn.h"
|
||||
#include "pagestore_client.h"
|
||||
#include "storage/relfilenode.h"
|
||||
#include "utils/guc.h"
|
||||
|
||||
/* Structure used to predict sequential access */
|
||||
|
||||
typedef struct AccessStatEntry {
|
||||
RelFileNode relnode;
|
||||
BlockNumber blkno; /* last accessed black number */
|
||||
uint32 n_seq_accesses; /* number of sequential accesses (when block N+1 is accessed after block N) */
|
||||
uint32 n_rnd_accesses; /* number of random accesses */
|
||||
uint32 hash;
|
||||
uint32 status;
|
||||
uint64 access_count; /* total number of relation accesses since backend start */
|
||||
dlist_node lru_node; /* LRU list node */
|
||||
} AccessStatEntry;
|
||||
|
||||
#define SH_PREFIX as
|
||||
#define SH_ELEMENT_TYPE AccessStatEntry
|
||||
#define SH_KEY_TYPE RelFileNode
|
||||
#define SH_KEY relnode
|
||||
#define SH_STORE_HASH
|
||||
#define SH_GET_HASH(tb, a) ((a)->hash)
|
||||
#define SH_HASH_KEY(tb, key) hash_bytes( \
|
||||
((const unsigned char *) &(key)), \
|
||||
sizeof(RelFileNode) \
|
||||
)
|
||||
|
||||
#define SH_EQUAL(tb, a, b) RelFileNodeEquals((a), (b))
|
||||
#define SH_SCOPE static inline
|
||||
#define SH_DEFINE
|
||||
#define SH_DECLARE
|
||||
#include "lib/simplehash.h"
|
||||
|
||||
static as_hash *hash;
|
||||
static dlist_head lru;
|
||||
static int max_access_stat_size;
|
||||
static int max_access_stat_count;
|
||||
static double min_seq_access_ratio;
|
||||
static int min_seq_access_count;
|
||||
|
||||
void access_stat_init(void)
|
||||
{
|
||||
MemoryContext memctx = AllocSetContextCreate(TopMemoryContext,
|
||||
"NeonSMGR/access_stat",
|
||||
ALLOCSET_DEFAULT_SIZES);
|
||||
DefineCustomIntVariable("neon.max_access_stat_size",
|
||||
"Maximal size of Neon relation access statistic hash",
|
||||
NULL,
|
||||
&max_access_stat_size,
|
||||
1024,
|
||||
0,
|
||||
INT_MAX,
|
||||
PGC_USERSET,
|
||||
0,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL);
|
||||
DefineCustomIntVariable("neon.max_access_stat_count",
|
||||
"Maximal value of relation access counter after which counters are divided by 2",
|
||||
NULL,
|
||||
&max_access_stat_count,
|
||||
1024,
|
||||
0,
|
||||
INT_MAX,
|
||||
PGC_USERSET,
|
||||
0,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL);
|
||||
DefineCustomRealVariable("neon.min_seq_access_ratio",
|
||||
"Minimal seq/(rnd+seq) ratio to determine sequential access",
|
||||
NULL,
|
||||
&min_seq_access_ratio,
|
||||
0.9,
|
||||
0,
|
||||
INT_MAX,
|
||||
PGC_USERSET,
|
||||
0,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL);
|
||||
DefineCustomIntVariable("neon.min_seq_access_count",
|
||||
"Minimal access count to determine sequetial access",
|
||||
NULL,
|
||||
&min_seq_access_count,
|
||||
10,
|
||||
0,
|
||||
INT_MAX,
|
||||
PGC_USERSET,
|
||||
0,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL);
|
||||
hash = as_create(memctx, max_access_stat_size, NULL);
|
||||
dlist_init(&lru);
|
||||
}
|
||||
|
||||
|
||||
bool is_sequential_access(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno)
|
||||
{
|
||||
bool is_seq_access = false;
|
||||
if (forkNum == MAIN_FORKNUM /* prefetch makes sense only for main fork */
|
||||
&& max_access_stat_size != 0)
|
||||
{
|
||||
AccessStatEntry* entry = as_lookup(hash, rnode);
|
||||
if (entry == NULL)
|
||||
{
|
||||
bool found;
|
||||
/* New item */
|
||||
while (hash->members >= max_access_stat_size)
|
||||
{
|
||||
/* Hash overflow: find candidate for replacement */
|
||||
AccessStatEntry* victim = dlist_container(AccessStatEntry, lru_node, dlist_pop_head_node(&lru));
|
||||
as_delete_item(hash, victim);
|
||||
}
|
||||
entry = as_insert(hash, rnode, &found);
|
||||
Assert(!found);
|
||||
/* Set both counter to zero because we don't know whethr first access is sequential or random */
|
||||
entry->n_seq_accesses = 0;
|
||||
entry->n_rnd_accesses = 0;
|
||||
entry->access_count = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
uint32 access_count = entry->n_seq_accesses + entry->n_rnd_accesses;
|
||||
/*
|
||||
* We want this function to represent most recent access pattern,
|
||||
* so when number of accesses exceed threashold value `max_access_stat_count`
|
||||
* we divide bother coutners by two devaluing old data
|
||||
*/
|
||||
if (access_count >= max_access_stat_count)
|
||||
{
|
||||
entry->n_seq_accesses >>= 1;
|
||||
entry->n_rnd_accesses >>= 1;
|
||||
}
|
||||
if (entry->blkno+1 == blkno)
|
||||
entry->n_seq_accesses += 1;
|
||||
else
|
||||
entry->n_rnd_accesses += 1;
|
||||
entry->access_count += 1;
|
||||
access_count = entry->n_seq_accesses + entry->n_rnd_accesses;
|
||||
|
||||
is_seq_access = access_count >= min_seq_access_count
|
||||
&& (double)entry->n_seq_accesses / access_count >= min_seq_access_ratio;
|
||||
|
||||
|
||||
/* Remove entry from LRU list tobe able to insert it to the end of this list */
|
||||
dlist_delete(&entry->lru_node);
|
||||
}
|
||||
/* Place entry to the tail of LRU list */
|
||||
dlist_push_tail(&lru, &entry->lru_node);
|
||||
entry->blkno = blkno;
|
||||
}
|
||||
return is_seq_access;
|
||||
}
|
||||
|
||||
/*
|
||||
* Get relation access pattern
|
||||
*/
|
||||
PG_FUNCTION_INFO_V1(get_relation_access_statistics);
|
||||
|
||||
|
||||
typedef struct
|
||||
{
|
||||
TupleDesc tupdesc;
|
||||
dlist_node* curr;
|
||||
} AccessStatContext;
|
||||
|
||||
#define NUM_ACCESS_STAT_COLUMNS 6
|
||||
|
||||
Datum
|
||||
get_relation_access_statistics(PG_FUNCTION_ARGS)
|
||||
{
|
||||
FuncCallContext *funcctx;
|
||||
Datum result;
|
||||
MemoryContext oldcontext;
|
||||
AccessStatContext *fctx; /* User function context. */
|
||||
TupleDesc tupledesc;
|
||||
TupleDesc expected_tupledesc;
|
||||
HeapTuple tuple;
|
||||
|
||||
if (SRF_IS_FIRSTCALL())
|
||||
{
|
||||
funcctx = SRF_FIRSTCALL_INIT();
|
||||
|
||||
/* Switch context when allocating stuff to be used in later calls */
|
||||
oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
|
||||
|
||||
/* Create a user function context for cross-call persistence */
|
||||
fctx = (AccessStatContext *) palloc(sizeof(AccessStatContext));
|
||||
|
||||
/*
|
||||
* To smoothly support upgrades from version 1.0 of this extension
|
||||
* transparently handle the (non-)existence of the pinning_backends
|
||||
* column. We unfortunately have to get the result type for that... -
|
||||
* we can't use the result type determined by the function definition
|
||||
* without potentially crashing when somebody uses the old (or even
|
||||
* wrong) function definition though.
|
||||
*/
|
||||
if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
|
||||
elog(ERROR, "return type must be a row type");
|
||||
|
||||
if (expected_tupledesc->natts != NUM_ACCESS_STAT_COLUMNS)
|
||||
elog(ERROR, "incorrect number of output arguments");
|
||||
|
||||
/* Construct a tuple descriptor for the result rows. */
|
||||
tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
|
||||
TupleDescInitEntry(tupledesc, (AttrNumber) 1, "relfilenode",
|
||||
OIDOID, -1, 0);
|
||||
TupleDescInitEntry(tupledesc, (AttrNumber) 2, "reltablespace",
|
||||
OIDOID, -1, 0);
|
||||
TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reldatabase",
|
||||
OIDOID, -1, 0);
|
||||
TupleDescInitEntry(tupledesc, (AttrNumber) 4, "seqaccess",
|
||||
INT4OID, -1, 0);
|
||||
TupleDescInitEntry(tupledesc, (AttrNumber) 5, "rndaccess",
|
||||
INT4OID, -1, 0);
|
||||
TupleDescInitEntry(tupledesc, (AttrNumber) 6, "accesscnt",
|
||||
INT8OID, -1, 0);
|
||||
|
||||
fctx->tupdesc = BlessTupleDesc(tupledesc);
|
||||
fctx->curr = dlist_is_empty(&lru) ? NULL : dlist_tail_node(&lru);
|
||||
|
||||
|
||||
/* Set max calls and remember the user function context. */
|
||||
funcctx->max_calls = hash->members;
|
||||
funcctx->user_fctx = fctx;
|
||||
|
||||
/* Return to original context when allocating transient memory */
|
||||
MemoryContextSwitchTo(oldcontext);
|
||||
}
|
||||
|
||||
funcctx = SRF_PERCALL_SETUP();
|
||||
|
||||
/* Get the saved state */
|
||||
fctx = funcctx->user_fctx;
|
||||
if (fctx->curr)
|
||||
{
|
||||
AccessStatEntry* entry = dlist_container(AccessStatEntry, lru_node, fctx->curr);
|
||||
Datum values[NUM_ACCESS_STAT_COLUMNS];
|
||||
bool nulls[NUM_ACCESS_STAT_COLUMNS] = {
|
||||
false, false, false, false, false, false
|
||||
};
|
||||
|
||||
values[0] = ObjectIdGetDatum(entry->relnode.relNode);
|
||||
values[1] = ObjectIdGetDatum(entry->relnode.spcNode);
|
||||
values[2] = ObjectIdGetDatum(entry->relnode.dbNode);
|
||||
values[3] = Int32GetDatum(entry->n_seq_accesses);
|
||||
values[4] = Int32GetDatum(entry->n_rnd_accesses);
|
||||
values[5] = Int64GetDatum(entry->access_count);
|
||||
|
||||
/* Build and return the tuple. */
|
||||
tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
|
||||
result = HeapTupleGetDatum(tuple);
|
||||
|
||||
fctx->curr = dlist_has_prev(&lru, fctx->curr) ? dlist_prev_node(&lru, fctx->curr) : NULL;
|
||||
|
||||
SRF_RETURN_NEXT(funcctx, result);
|
||||
}
|
||||
else
|
||||
SRF_RETURN_DONE(funcctx);
|
||||
}
|
||||
|
||||
@@ -58,6 +58,7 @@ char *neon_auth_token;
|
||||
int n_unflushed_requests = 0;
|
||||
int flush_every_n_requests = 8;
|
||||
int readahead_buffer_size = 128;
|
||||
int readahead_distance = 10;
|
||||
|
||||
bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
|
||||
|
||||
@@ -452,6 +453,18 @@ pg_init_libpagestore(void)
|
||||
PGC_USERSET,
|
||||
0, /* no flags required */
|
||||
NULL, (GucIntAssignHook) &readahead_buffer_resize, NULL);
|
||||
DefineCustomIntVariable("neon.readahead_distance",
|
||||
"Number of read-ahead blocks",
|
||||
NULL,
|
||||
&readahead_distance,
|
||||
10,
|
||||
0,
|
||||
INT_MAX,
|
||||
PGC_USERSET,
|
||||
0,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL);
|
||||
|
||||
relsize_hash_init();
|
||||
|
||||
@@ -476,4 +489,5 @@ pg_init_libpagestore(void)
|
||||
redo_read_buffer_filter = neon_redo_read_buffer_filter;
|
||||
}
|
||||
lfc_init();
|
||||
access_stat_init();
|
||||
}
|
||||
|
||||
@@ -27,8 +27,18 @@ RETURNS SETOF RECORD
|
||||
AS 'MODULE_PATHNAME', 'local_cache_pages'
|
||||
LANGUAGE C PARALLEL SAFE;
|
||||
|
||||
CREATE FUNCTION get_relation_access_statistics()
|
||||
RETURNS SETOF RECORD
|
||||
AS 'MODULE_PATHNAME', 'get_relation_access_statistics'
|
||||
LANGUAGE C PARALLEL SAFE;
|
||||
|
||||
-- Create a view for convenient access.
|
||||
CREATE VIEW local_cache AS
|
||||
SELECT P.* FROM local_cache_pages() AS P
|
||||
SELECT relname,P.* FROM local_cache_pages() AS P
|
||||
(pageoffs int8, relfilenode oid, reltablespace oid, reldatabase oid,
|
||||
relforknumber int2, relblocknumber int8, accesscount int4);
|
||||
relforknumber int2, relblocknumber int8, accesscount int4) JOIN pg_class pc ON (P.relfilenode = pc.relfilenode);
|
||||
|
||||
CREATE VIEW relation_access_statistics AS
|
||||
SELECT relname,P.* FROM get_relation_access_statistics() AS P
|
||||
(relfilenode oid, reltablespace oid, reldatabase oid,
|
||||
seqaccess int4, rndaccess int4, access_count int8) JOIN pg_class pc ON (P.relfilenode = pc.relfilenode);
|
||||
|
||||
@@ -157,6 +157,7 @@ extern page_server_api * page_server;
|
||||
extern char *page_server_connstring;
|
||||
extern int flush_every_n_requests;
|
||||
extern int readahead_buffer_size;
|
||||
extern int readahead_distance;
|
||||
extern bool seqscan_prefetch_enabled;
|
||||
extern int seqscan_prefetch_distance;
|
||||
extern char *neon_timeline;
|
||||
@@ -210,5 +211,8 @@ extern bool lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumbe
|
||||
extern void lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno);
|
||||
extern void lfc_init(void);
|
||||
|
||||
/* Access statistic */
|
||||
extern void access_stat_init(void);
|
||||
extern bool is_sequential_access(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1881,6 +1881,7 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
if (RecoveryInProgress() && !(MyBackendType == B_STARTUP))
|
||||
XLogWaitForReplayOf(request_lsn);
|
||||
|
||||
|
||||
/*
|
||||
* Try to find prefetched page in the list of received pages.
|
||||
*/
|
||||
@@ -2003,6 +2004,10 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
/* If it is expected to be sequential access then initiate prefetch of next block */
|
||||
if (is_sequential_access(reln->smgr_rnode.node, forkNum, blkno))
|
||||
neon_prefetch(reln, forkNum, blkno + readahead_distance);
|
||||
|
||||
/* Try to read from local file cache */
|
||||
if (lfc_read(reln->smgr_rnode.node, forkNum, blkno, buffer))
|
||||
{
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[toolchain]
|
||||
channel = "1.70.0"
|
||||
channel = "1.68.2"
|
||||
profile = "default"
|
||||
# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
|
||||
# https://rust-lang.github.io/rustup/concepts/profiles.html
|
||||
|
||||
@@ -266,7 +266,7 @@ impl From<TimelineError> for ApiError {
|
||||
fn from(te: TimelineError) -> ApiError {
|
||||
match te {
|
||||
TimelineError::NotFound(ttid) => {
|
||||
ApiError::NotFound(anyhow!("timeline {} not found", ttid).into())
|
||||
ApiError::NotFound(anyhow!("timeline {} not found", ttid))
|
||||
}
|
||||
_ => ApiError::InternalServerError(anyhow!("{}", te)),
|
||||
}
|
||||
|
||||
@@ -62,7 +62,6 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
|
||||
"pageserver_getpage_reconstruct_seconds_bucket",
|
||||
"pageserver_getpage_reconstruct_seconds_count",
|
||||
"pageserver_getpage_reconstruct_seconds_sum",
|
||||
*[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]],
|
||||
)
|
||||
|
||||
PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
|
||||
|
||||
@@ -2415,17 +2415,6 @@ class Endpoint(PgProtocol):
|
||||
|
||||
return self
|
||||
|
||||
def respec(self, **kwargs):
|
||||
"""Update the endpoint.json file used by control_plane."""
|
||||
# Read config
|
||||
config_path = os.path.join(self.endpoint_path(), "endpoint.json")
|
||||
with open(config_path, "r") as f:
|
||||
data_dict = json.load(f)
|
||||
|
||||
# Write it back updated
|
||||
with open(config_path, "w") as file:
|
||||
json.dump(dict(data_dict, **kwargs), file, indent=4)
|
||||
|
||||
def stop(self) -> "Endpoint":
|
||||
"""
|
||||
Stop the Postgres instance if it's running.
|
||||
|
||||
@@ -32,18 +32,13 @@ def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenc
|
||||
|
||||
env.neon_cli.create_branch("test_startup")
|
||||
|
||||
endpoint = None
|
||||
|
||||
# We do two iterations so we can see if the second startup is faster. It should
|
||||
# be because the compute node should already be configured with roles, databases,
|
||||
# extensions, etc from the first run.
|
||||
for i in range(2):
|
||||
# Start
|
||||
with zenbenchmark.record_duration(f"{i}_start_and_select"):
|
||||
if endpoint:
|
||||
endpoint.start()
|
||||
else:
|
||||
endpoint = env.endpoints.create_start("test_startup")
|
||||
endpoint = env.endpoints.create_start("test_startup")
|
||||
endpoint.safe_psql("select 1;")
|
||||
|
||||
# Get metrics
|
||||
@@ -62,9 +57,6 @@ def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenc
|
||||
# Stop so we can restart
|
||||
endpoint.stop()
|
||||
|
||||
# Imitate optimizations that console would do for the second start
|
||||
endpoint.respec(skip_pg_catalog_updates=True)
|
||||
|
||||
|
||||
# This test sometimes runs for longer than the global 5 minute timeout.
|
||||
@pytest.mark.timeout(600)
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
pg8000==1.29.8
|
||||
pg8000==1.29.4
|
||||
scramp>=1.4.3
|
||||
|
||||
@@ -396,9 +396,9 @@ checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
|
||||
|
||||
[[package]]
|
||||
name = "openssl"
|
||||
version = "0.10.55"
|
||||
version = "0.10.52"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "345df152bc43501c5eb9e4654ff05f794effb78d4efe3d53abc158baddc0703d"
|
||||
checksum = "01b8574602df80f7b85fdfc5392fa884a4e3b3f4f35402c070ab34c3d3f78d56"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"cfg-if",
|
||||
@@ -428,9 +428,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
|
||||
|
||||
[[package]]
|
||||
name = "openssl-sys"
|
||||
version = "0.9.90"
|
||||
version = "0.9.87"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "374533b0e45f3a7ced10fcaeccca020e66656bc03dac384f852e4e5a7a8104a6"
|
||||
checksum = "8e17f59264b2809d77ae94f0e1ebabc434773f370d6ca667bd223ea10e06cc7e"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
FROM rust:1.70
|
||||
FROM rust:1.69
|
||||
WORKDIR /source
|
||||
|
||||
COPY . .
|
||||
|
||||
@@ -5,8 +5,8 @@
|
||||
"kind" : "remoteSourceControl",
|
||||
"location" : "https://github.com/vapor/postgres-nio.git",
|
||||
"state" : {
|
||||
"revision" : "061a0836d7c1887e04a975d1d2eaa2ef5fd7dfab",
|
||||
"version" : "1.16.0"
|
||||
"revision" : "dbf9c2eb596df39cba8ff3f74d74b2e6a31bd937",
|
||||
"version" : "1.14.1"
|
||||
}
|
||||
},
|
||||
{
|
||||
@@ -59,8 +59,8 @@
|
||||
"kind" : "remoteSourceControl",
|
||||
"location" : "https://github.com/apple/swift-nio.git",
|
||||
"state" : {
|
||||
"revision" : "6213ba7a06febe8fef60563a4a7d26a4085783cf",
|
||||
"version" : "2.54.0"
|
||||
"revision" : "d1690f85419fdac8d54e350fb6d2ab9fd95afd75",
|
||||
"version" : "2.51.1"
|
||||
}
|
||||
},
|
||||
{
|
||||
|
||||
@@ -4,7 +4,7 @@ import PackageDescription
|
||||
let package = Package(
|
||||
name: "PostgresNIOExample",
|
||||
dependencies: [
|
||||
.package(url: "https://github.com/vapor/postgres-nio.git", from: "1.16.0")
|
||||
.package(url: "https://github.com/vapor/postgres-nio.git", from: "1.14.1")
|
||||
],
|
||||
targets: [
|
||||
.executableTarget(
|
||||
|
||||
@@ -5,7 +5,23 @@
|
||||
"packages": {
|
||||
"": {
|
||||
"dependencies": {
|
||||
"postgresql-client": "2.5.9"
|
||||
"postgresql-client": "2.5.5"
|
||||
}
|
||||
},
|
||||
"node_modules/debug": {
|
||||
"version": "4.3.4",
|
||||
"resolved": "https://registry.npmjs.org/debug/-/debug-4.3.4.tgz",
|
||||
"integrity": "sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ==",
|
||||
"dependencies": {
|
||||
"ms": "2.1.2"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=6.0"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"supports-color": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/doublylinked": {
|
||||
@@ -25,6 +41,11 @@
|
||||
"putil-promisify": "^1.8.6"
|
||||
}
|
||||
},
|
||||
"node_modules/ms": {
|
||||
"version": "2.1.2",
|
||||
"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz",
|
||||
"integrity": "sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w=="
|
||||
},
|
||||
"node_modules/obuf": {
|
||||
"version": "1.1.2",
|
||||
"resolved": "https://registry.npmjs.org/obuf/-/obuf-1.1.2.tgz",
|
||||
@@ -42,28 +63,30 @@
|
||||
}
|
||||
},
|
||||
"node_modules/postgresql-client": {
|
||||
"version": "2.5.9",
|
||||
"resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.5.9.tgz",
|
||||
"integrity": "sha512-s+kgTN6TfWLzehEyxw4Im4odnxVRCbZ0DEJzWS6SLowPAmB2m1/DOiOvZC0+ZVoi5AfbGE6SBqFxKguSyVAXZg==",
|
||||
"version": "2.5.5",
|
||||
"resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.5.5.tgz",
|
||||
"integrity": "sha512-2Mu3i+6NQ9cnkoZNd0XeSZo9WoUpuWf4ZSiCCoDWSj82T93py2/SKXZ1aUaP8mVaU0oKpyyGe0IwLYZ1VHShnA==",
|
||||
"dependencies": {
|
||||
"debug": "^4.3.4",
|
||||
"doublylinked": "^2.5.2",
|
||||
"lightning-pool": "^4.2.1",
|
||||
"postgres-bytea": "^3.0.0",
|
||||
"power-tasks": "^1.7.0",
|
||||
"power-tasks": "^1.6.4",
|
||||
"putil-merge": "^3.10.3",
|
||||
"putil-promisify": "^1.10.0",
|
||||
"putil-varhelpers": "^1.6.5"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=16.0",
|
||||
"node": ">=14.0",
|
||||
"npm": ">=7.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/power-tasks": {
|
||||
"version": "1.7.0",
|
||||
"resolved": "https://registry.npmjs.org/power-tasks/-/power-tasks-1.7.0.tgz",
|
||||
"integrity": "sha512-rndZXCDxhuIDjPUJJvQwBDHaYagCkjvbPF/NA+omh/Ef4rAI9KtnvdA0k98dyiGpn1zXOpc6c2c0JWzg/xAhJg==",
|
||||
"version": "1.6.4",
|
||||
"resolved": "https://registry.npmjs.org/power-tasks/-/power-tasks-1.6.4.tgz",
|
||||
"integrity": "sha512-LX8GGgEIP1N7jsZqlqZ275e6f1Ehq97APCEGj8uVO0NoEoB+77QUX12BFv3LmlNKfq4fIuNSPiHhyHFjqn2gfA==",
|
||||
"dependencies": {
|
||||
"debug": "^4.3.4",
|
||||
"doublylinked": "^2.5.2",
|
||||
"strict-typed-events": "^2.3.1"
|
||||
},
|
||||
@@ -109,9 +132,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/ts-gems": {
|
||||
"version": "2.4.0",
|
||||
"resolved": "https://registry.npmjs.org/ts-gems/-/ts-gems-2.4.0.tgz",
|
||||
"integrity": "sha512-SdugYAXoWvbqrxLodIObzxhEKacDxh5LfAJIiIkiH7q5thvuuCzdmkdTVQYf7uEDrEpPhfx4tokDMamdO3be9A=="
|
||||
"version": "2.3.0",
|
||||
"resolved": "https://registry.npmjs.org/ts-gems/-/ts-gems-2.3.0.tgz",
|
||||
"integrity": "sha512-bUvrwrzlct7vfaNvtgMhynDf6lAki/kTtrNsIGhX6l7GJGK3s6b8Ro7dazOLXabV0m2jyShBzDQ8X1+h/C2Cug=="
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"type": "module",
|
||||
"dependencies": {
|
||||
"postgresql-client": "2.5.9"
|
||||
"postgresql-client": "2.5.5"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
FROM node:20
|
||||
FROM node:18
|
||||
WORKDIR /source
|
||||
|
||||
COPY . .
|
||||
|
||||
@@ -5,16 +5,16 @@
|
||||
"packages": {
|
||||
"": {
|
||||
"dependencies": {
|
||||
"@neondatabase/serverless": "0.4.18",
|
||||
"@neondatabase/serverless": "0.4.3",
|
||||
"ws": "8.13.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@neondatabase/serverless": {
|
||||
"version": "0.4.18",
|
||||
"resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.4.18.tgz",
|
||||
"integrity": "sha512-2TZnIyRGC/+0fjZ8TKCzaSTPUD94PM7NBGuantGZbUrbWyqBwGnUoRtdZAQ95qBKVHqORLVfymlv2NE+HQMFeA==",
|
||||
"version": "0.4.3",
|
||||
"resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.4.3.tgz",
|
||||
"integrity": "sha512-U8tpuF5f0R5WRsciR7iaJ5S2h54DWa6Z6CEW+J4KgwyvRN3q3qDz0MibdfFXU0WqnRoi/9RSf/2XN4TfeaOCbQ==",
|
||||
"dependencies": {
|
||||
"@types/pg": "8.6.6"
|
||||
"@types/pg": "^8.6.6"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/node": {
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"type": "module",
|
||||
"dependencies": {
|
||||
"@neondatabase/serverless": "0.4.18",
|
||||
"@neondatabase/serverless": "0.4.3",
|
||||
"ws": "8.13.0"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,7 +2,6 @@ import copy
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
|
||||
@@ -449,7 +448,7 @@ def dump_differs(first: Path, second: Path, output: Path) -> bool:
|
||||
"""
|
||||
|
||||
with output.open("w") as stdout:
|
||||
res = subprocess.run(
|
||||
rv = subprocess.run(
|
||||
[
|
||||
"diff",
|
||||
"--unified", # Make diff output more readable
|
||||
@@ -461,53 +460,4 @@ def dump_differs(first: Path, second: Path, output: Path) -> bool:
|
||||
stdout=stdout,
|
||||
)
|
||||
|
||||
differs = res.returncode != 0
|
||||
|
||||
# TODO: Remove after https://github.com/neondatabase/neon/pull/4425 is merged, and a couple of releases are made
|
||||
if differs:
|
||||
with tempfile.NamedTemporaryFile(mode="w") as tmp:
|
||||
tmp.write(PR4425_ALLOWED_DIFF)
|
||||
tmp.flush()
|
||||
|
||||
allowed = subprocess.run(
|
||||
[
|
||||
"diff",
|
||||
"--unified", # Make diff output more readable
|
||||
r"--ignore-matching-lines=^---", # Ignore diff headers
|
||||
r"--ignore-matching-lines=^\+\+\+", # Ignore diff headers
|
||||
"--ignore-matching-lines=^@@", # Ignore diff blocks location
|
||||
"--ignore-matching-lines=^ *$", # Ignore lines with only spaces
|
||||
"--ignore-matching-lines=^ --.*", # Ignore the " --" lines for compatibility with PG14
|
||||
"--ignore-blank-lines",
|
||||
str(output),
|
||||
str(tmp.name),
|
||||
],
|
||||
)
|
||||
|
||||
differs = allowed.returncode != 0
|
||||
|
||||
return differs
|
||||
|
||||
|
||||
PR4425_ALLOWED_DIFF = """
|
||||
--- /tmp/test_output/test_backward_compatibility[release-pg15]/compatibility_snapshot/dump.sql 2023-06-08 18:12:45.000000000 +0000
|
||||
+++ /tmp/test_output/test_backward_compatibility[release-pg15]/dump.sql 2023-06-13 07:25:35.211733653 +0000
|
||||
@@ -13,12 +13,20 @@
|
||||
|
||||
CREATE ROLE cloud_admin;
|
||||
ALTER ROLE cloud_admin WITH SUPERUSER INHERIT CREATEROLE CREATEDB LOGIN REPLICATION BYPASSRLS;
|
||||
+CREATE ROLE neon_superuser;
|
||||
+ALTER ROLE neon_superuser WITH NOSUPERUSER INHERIT CREATEROLE CREATEDB NOLOGIN NOREPLICATION NOBYPASSRLS;
|
||||
|
||||
--
|
||||
-- User Configurations
|
||||
--
|
||||
|
||||
|
||||
+--
|
||||
+-- Role memberships
|
||||
+--
|
||||
+
|
||||
+GRANT pg_read_all_data TO neon_superuser GRANTED BY cloud_admin;
|
||||
+GRANT pg_write_all_data TO neon_superuser GRANTED BY cloud_admin;
|
||||
"""
|
||||
return rv.returncode != 0
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
import time
|
||||
|
||||
import pytest
|
||||
from fixtures.neon_fixtures import NeonEnv
|
||||
|
||||
@@ -12,10 +10,9 @@ def test_hot_standby(neon_simple_env: NeonEnv):
|
||||
branch_name="main",
|
||||
endpoint_id="primary",
|
||||
) as primary:
|
||||
time.sleep(1)
|
||||
with env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") as secondary:
|
||||
primary_lsn = None
|
||||
caught_up = False
|
||||
cought_up = False
|
||||
queries = [
|
||||
"SHOW neon.timeline_id",
|
||||
"SHOW neon.tenant_id",
|
||||
@@ -59,7 +56,7 @@ def test_hot_standby(neon_simple_env: NeonEnv):
|
||||
res = s_cur.fetchone()
|
||||
assert res is not None
|
||||
|
||||
while not caught_up:
|
||||
while not cought_up:
|
||||
with s_con.cursor() as secondary_cursor:
|
||||
secondary_cursor.execute("SELECT pg_last_wal_replay_lsn()")
|
||||
res = secondary_cursor.fetchone()
|
||||
@@ -69,7 +66,7 @@ def test_hot_standby(neon_simple_env: NeonEnv):
|
||||
# due to e.g. autovacuum, but that shouldn't impact the content
|
||||
# of the tables, so we check whether we've replayed up to at
|
||||
# least after the commit of the `test` table.
|
||||
caught_up = secondary_lsn >= primary_lsn
|
||||
cought_up = secondary_lsn >= primary_lsn
|
||||
|
||||
# Explicit commit to flush any transient transaction-level state.
|
||||
s_con.commit()
|
||||
|
||||
@@ -16,7 +16,6 @@ from fixtures.pg_version import PgVersion, xfail_on_postgres
|
||||
from fixtures.types import Lsn, TenantId, TimelineId
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path):
|
||||
env = neon_simple_env
|
||||
(tenant_id, _) = env.neon_cli.create_tenant()
|
||||
@@ -45,16 +44,12 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path):
|
||||
# we've disabled the autovacuum and checkpoint
|
||||
# so background processes should not change the size.
|
||||
# If this test will flake we should probably loosen the check
|
||||
assert (
|
||||
size == initial_size
|
||||
), f"starting idle compute should not change the tenant size (Currently {size}, expected {initial_size})"
|
||||
assert size == initial_size, "starting idle compute should not change the tenant size"
|
||||
|
||||
# the size should be the same, until we increase the size over the
|
||||
# gc_horizon
|
||||
size, inputs = http_client.tenant_size_and_modelinputs(tenant_id)
|
||||
assert (
|
||||
size == initial_size
|
||||
), f"tenant_size should not be affected by shutdown of compute (Currently {size}, expected {initial_size})"
|
||||
assert size == initial_size, "tenant_size should not be affected by shutdown of compute"
|
||||
|
||||
expected_inputs = {
|
||||
"segments": [
|
||||
@@ -323,7 +318,6 @@ def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Pa
|
||||
size_debug_file.write(size_debug)
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_single_branch_get_tenant_size_grows(
|
||||
neon_env_builder: NeonEnvBuilder, test_output_dir: Path, pg_version: PgVersion
|
||||
):
|
||||
@@ -339,13 +333,13 @@ def test_single_branch_get_tenant_size_grows(
|
||||
# inserts is larger than gc_horizon. for example 0x20000 here hid the fact
|
||||
# that there next_gc_cutoff could be smaller than initdb_lsn, which will
|
||||
# obviously lead to issues when calculating the size.
|
||||
gc_horizon = 0x3BA00
|
||||
gc_horizon = 0x38000
|
||||
|
||||
# it's a bit of a hack, but different versions of postgres have different
|
||||
# amount of WAL generated for the same amount of data. so we need to
|
||||
# adjust the gc_horizon accordingly.
|
||||
if pg_version == PgVersion.V14:
|
||||
gc_horizon = 0x4A000
|
||||
gc_horizon = 0x40000
|
||||
|
||||
neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}"
|
||||
|
||||
@@ -366,11 +360,11 @@ def test_single_branch_get_tenant_size_grows(
|
||||
if current_lsn - initdb_lsn >= gc_horizon:
|
||||
assert (
|
||||
size >= prev_size
|
||||
), f"tenant_size may grow or not grow, because we only add gc_horizon amount of WAL to initial snapshot size (Currently at: {current_lsn}, Init at: {initdb_lsn})"
|
||||
), "tenant_size may grow or not grow, because we only add gc_horizon amount of WAL to initial snapshot size"
|
||||
else:
|
||||
assert (
|
||||
size > prev_size
|
||||
), f"tenant_size should grow, because we continue to add WAL to initial snapshot size (Currently at: {current_lsn}, Init at: {initdb_lsn})"
|
||||
), "tenant_size should grow, because we continue to add WAL to initial snapshot size"
|
||||
|
||||
def get_current_consistent_size(
|
||||
env: NeonEnv,
|
||||
|
||||
Reference in New Issue
Block a user