Compare commits


36 Commits

Author SHA1 Message Date
Christian Schwarz
2d37857351 pq bench: avoid repeated conversion to_i128 2023-11-02 17:43:59 +00:00
Christian Schwarz
ddfce0cfa5 per-second RPS 2023-11-02 17:11:37 +00:00
Christian Schwarz
d52a622115 pq bench: proper shutdown 2023-11-02 17:07:00 +00:00
Christian Schwarz
a066eecda0 http bench: slightly improved stats 2023-11-02 17:06:36 +00:00
Christian Schwarz
94e94af6c7 WIP: libpq-based client
depends on https://github.com/neondatabase/rust-postgres/pull/25
2023-11-02 16:28:16 +01:00
Christian Schwarz
df7346eaff Revert "CP tokio_epoll_uring for read path"
This reverts commit 82d9c68667.
2023-11-02 11:32:48 +01:00
Christian Schwarz
76efb1b79b Revert "CP: use hacked-together open_at for async VirtualFile open calls instead of spawn_blocking"
This reverts commit 55cdf6c7ff.
2023-11-02 11:32:41 +01:00
Christian Schwarz
2f656c6691 rename getpage_bench to getpage_bench_http 2023-11-02 10:59:54 +01:00
Christian Schwarz
bb5b5cbdac WIP: benchmark that does random getpage requests against the keyspace
backup of pageserver.toml

    id =1
    pg_distrib_dir ='/home/admin/neon-main/pg_install'
    http_auth_type ='Trust'
    pg_auth_type ='Trust'
    listen_http_addr ='127.0.0.1:9898'
    listen_pg_addr ='127.0.0.1:64000'
    broker_endpoint ='http://127.0.0.1:50051/'
    #control_plane_api ='http://127.0.0.1:1234/'

    # Initial configuration file created by 'pageserver --init'
    #listen_pg_addr = '127.0.0.1:64000'
    #listen_http_addr = '127.0.0.1:9898'

    #wait_lsn_timeout = '60 s'
    #wal_redo_timeout = '60 s'

    #max_file_descriptors = 10000
    #page_cache_size = 160000

    # initial superuser role name to use when creating a new tenant
    #initial_superuser_name = 'cloud_admin'

    #broker_endpoint = 'http://127.0.0.1:50051'

    #log_format = 'plain'

    #concurrent_tenant_size_logical_size_queries = '1'

    #metric_collection_interval = '10 min'
    #cached_metric_collection_interval = '0s'
    #synthetic_size_calculation_interval = '10 min'

    #disk_usage_based_eviction = { max_usage_pct = .., min_avail_bytes = .., period = "10s"}

    #background_task_maximum_delay = '10s'

    [tenant_config]
    #checkpoint_distance = 268435456 # in bytes
    #checkpoint_timeout = 10 m
    #compaction_target_size = 134217728 # in bytes
    #compaction_period = '20 s'
    #compaction_threshold = 10

    #gc_period = '1 hr'
    #gc_horizon = 67108864
    #image_creation_threshold = 3
    #pitr_interval = '7 days'

    #min_resident_size_override = .. # in bytes
    #evictions_low_residence_duration_metric_threshold = '24 hour'
    #gc_feedback = false

    # make it deterministic
    gc_period = '0s'
    checkpoint_timeout = '3650 day'
    compaction_period = '20 s'
    compaction_threshold = 10
    compaction_target_size = 134217728
    checkpoint_distance = 268435456
    image_creation_threshold = 3

    [remote_storage]
    local_path = '/home/admin/neon-main/bench_repo_dir/repo/remote_storage_local_fs'
2023-10-26 17:53:03 +00:00
Christian Schwarz
55cdf6c7ff CP: use hacked-together open_at for async VirtualFile open calls instead of spawn_blocking
This makes Delta/Image ::load fns fully tokio-epoll-uring
2023-10-26 17:40:33 +00:00
Christian Schwarz
82d9c68667 CP tokio_epoll_uring for read path 2023-10-26 17:22:23 +00:00
Christian Schwarz
bc91c40f56 Revert "revert recent VirtualFile asyncification changes (#5291)"
This reverts commit ab1f37e908.
2023-10-26 17:22:10 +00:00
Christian Schwarz
c5f58ef3f7 API to duplicate a tenant 2023-10-26 16:30:11 +00:00
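The endpoint added by this commit, exercised as a hedged sketch (the request shape is an assumption based on the `tenant_duplicate_handler` and router diffs further down this page):

```rust
use hyper::{Body, Client, Method, Request};

// POST /v1/tenant/:tenant_id/duplicate with a TenantCreateRequest-style
// body naming the new tenant; the handler replies 201 Created.
async fn duplicate_tenant(
    ps_endpoint: &str,
    src_tenant_id: &str,
    new_tenant_id: &str,
) -> anyhow::Result<()> {
    let req = Request::builder()
        .method(Method::POST)
        .uri(format!("{ps_endpoint}/v1/tenant/{src_tenant_id}/duplicate"))
        .header("Content-Type", "application/json")
        .body(Body::from(format!(
            r#"{{"new_tenant_id": "{new_tenant_id}"}}"#
        )))?;
    let resp = Client::new().request(req).await?;
    anyhow::ensure!(resp.status() == 201, "unexpected status: {}", resp.status());
    Ok(())
}
```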
Christian Schwarz
bb8531d920 Revert "WIP cleanup unused RemoteStorage fields + half-baked copy_file impl"
This reverts commit 7553bbe3f5.
2023-10-26 15:44:26 +00:00
Christian Schwarz
7553bbe3f5 WIP cleanup unused RemoteStorage fields + half-baked copy_file impl 2023-10-26 15:44:03 +00:00
dependabot[bot]
378daa358b build(deps): bump werkzeug from 2.2.3 to 3.0.1 (#5665) 2023-10-25 22:50:35 +00:00
Alexander Bayandin
85f4514e7d Get env var for real Azure tests from GitHub (#5662)
## Problem

We'll need to switch `REMOTE_STORAGE_AZURE_REGION` from the current
`eastus2` region to something `eu-central-1`-like. This may require
changing `AZURE_STORAGE_ACCESS_KEY`.
To make it possible to switch from one place (and not break a lot of
builds on CI), move `REMOTE_STORAGE_AZURE_CONTAINER` and
`REMOTE_STORAGE_AZURE_REGION` to GitHub Variables.

See https://github.com/neondatabase/neon/settings/variables/actions

## Summary of changes
- Get values for `REMOTE_STORAGE_AZURE_CONTAINER` &
`REMOTE_STORAGE_AZURE_REGION` from GitHub Variables
2023-10-25 22:54:23 +01:00
Joonas Koivunen
f70019797c refactor(rtc): schedule compaction update (#5649)
a single operation instead of N upload schedulings and one deletion
scheduling, with the write(layer_map) lock being released in between. A
dedicated compaction update is a much better place to evolve this
operation in the future than the more general file-based operations. A
caller-side sketch follows this entry.

builds upon #5645. solves the problem that correctness w.r.t. other
`index_part.json`-changing operations was difficult to see.

Co-authored-by: Shany Pozin <shany@neon.tech>
2023-10-25 22:25:43 +01:00
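The caller-side shape of this, as a hedged sketch (method and type names are taken from the `RemoteTimelineClient` diff further down this page; the surrounding compaction glue is assumed):

```rust
// Hypothetical end-of-compaction glue, not the actual pageserver code.
fn on_compaction_finished(
    client: &std::sync::Arc<RemoteTimelineClient>,
    compacted_from: Vec<LayerFileName>,                    // the L0s
    compacted_to: Vec<(LayerFileName, LayerFileMetadata)>, // the new L1s
) -> anyhow::Result<()> {
    // One call schedules the L1 uploads, the index_part.json update that
    // unlinks the L0s, and the deletion of the unlinked L0s, all while the
    // upload queue lock is held once.
    client.schedule_compaction_update(&compacted_from, &compacted_to)
}
```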
Joonas Koivunen
325258413a fix: trampling on global physical size metric (#5663)
All timelines being loaded (attached, or loaded from disk) overwrite the
global gauge for physical size. The `_set` method cannot be used safely
on a shared gauge, so remove it and just "add" the physical size.
2023-10-25 19:29:12 +01:00
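To illustrate why `set` is the wrong primitive here, a minimal sketch with a hand-rolled atomic gauge standing in for the real metrics library:

```rust
use std::sync::atomic::{AtomicI64, Ordering};

// Toy stand-in for the shared global gauge.
static GLOBAL_PHYSICAL_SIZE: AtomicI64 = AtomicI64::new(0);

// Wrong: every timeline that finishes loading overwrites the shared gauge
// with its own size, so the metric reflects only the last loader.
fn on_timeline_loaded_set(timeline_size: i64) {
    GLOBAL_PHYSICAL_SIZE.store(timeline_size, Ordering::Relaxed);
}

// Right: each timeline contributes a delta, so the gauge stays the sum over
// all timelines (with a matching subtraction on eviction/removal).
fn on_timeline_loaded_add(timeline_size: i64) {
    GLOBAL_PHYSICAL_SIZE.fetch_add(timeline_size, Ordering::Relaxed);
}
```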
Konstantin Knizhnik
4ddbc0e46d Ignore missed AUX_FILES_KEY when generating image layer (#5660)
## Problem

Logical replication requires the new AUX_FILES_KEY, which is definitely
absent in existing databases.
We do not have a function to check whether a key exists in our KV
storage, so I have to handle the error in the `list_aux_files` method.
But this key is also included in the keyspace range and accessed by the
`create_image_layer` method.

## Summary of changes

Check whether AUX_FILES_KEY exists before including it in the keyspace.

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
Co-authored-by: Shany Pozin <shany@neon.tech>
Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
2023-10-25 18:35:23 +01:00
Arpad Müller
a673e4e7a9 Optionally return json from get_lsn_by_timestamp (#5608)
This does two things: first, a minor refactor to not use HTTP/1.x-style
header names and to not panic if certain requests have no
"Accept" header. As a second thing, it addresses the third bullet point
from #3689:

> Change `get_lsn_by_timestamp` API method to return LSN even if we only
found commit before the specified timestamp.

This is done by adding a version parameter to the `get_lsn_by_timestamp`
API call and making its behaviour depend on the version number.

Part of #3414 (but doesn't address it in its entirety).

---------

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
2023-10-25 18:46:34 +02:00
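A hedged sketch of calling the versioned endpoint with the `hyper` client used elsewhere on this page (the URL path and parameter names are assumptions based on the handler and OpenAPI diffs below):

```rust
use hyper::{Client, Uri};

async fn lsn_by_timestamp_v2(
    ps_endpoint: &str,
    tenant_id: &str,
    timeline_id: &str,
    timestamp_rfc3339: &str,
) -> anyhow::Result<serde_json::Value> {
    let client = Client::new();
    let uri: Uri = format!(
        "{ps_endpoint}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp_rfc3339}&version=2"
    )
    .parse()?;
    let resp = client.get(uri).await?;
    let body = hyper::body::to_bytes(resp).await?;
    // With version >= 2 the body is {"lsn": "...", "kind": "present|past|future|nodata"}
    // instead of the bare string the unversioned endpoint returns.
    Ok(serde_json::from_slice(&body)?)
}
```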
bojanserafimov
c155cc0c3f Fix test instructions readme (#5644) 2023-10-25 11:53:04 -04:00
Conrad Ludgate
32126d705b proxy refactor serverless (#4685)
## Problem

Our serverless backend was a bit jumbled. As a comment indicated, we
were handling SQL-over-HTTP in our `websocket.rs` file.

I've extracted out the `sql_over_http` and `websocket` files from the
`http` module and put them into a new module called `serverless`.

## Summary of changes

```sh
mkdir proxy/src/serverless
mv proxy/src/http/{conn_pool,sql_over_http,websocket}.rs proxy/src/serverless/
mv proxy/src/http/server.rs proxy/src/http/health_server.rs
mv proxy/src/metrics proxy/src/usage_metrics.rs
```

I have also extracted the hyper server and handler from websocket.rs
into `serverless.rs`
2023-10-25 15:43:03 +01:00
John Spray
5683ae9eab pageserver: suppress some of the most common spurious warnings (#5658)
Two of the most common spurious log messages:
- broker connections terminate & we log at error severity. Unfortunately,
tonic gives us an "Unknown" error, so to suppress these we're doing
string matching (sketched after this entry). It's hacky but worthwhile
for operations.
- the first iteration of tenant background tasks tends to over-run its
schedule and emit a warning. Ultimately we should fix these to run on
time, but for now we are not benefiting from polluting our logs with the
warnings.
2023-10-25 14:55:37 +01:00
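The string-matching hack might look roughly like this sketch (the matched substrings are assumptions, not the actual patterns used):

```rust
use tracing::{error, info};

// tonic reports benign broker disconnects as status "Unknown", so severity
// is decided by inspecting the rendered error text.
fn log_broker_error(err: &tonic::Status) {
    let msg = err.to_string();
    let looks_benign = msg.contains("connection reset")
        || msg.contains("error reading a body from connection");
    if looks_benign {
        info!("broker connection ended: {msg}");
    } else {
        error!("broker connection failed: {msg}");
    }
}
```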
Alexander Bayandin
4778b6a12e Switch to querying new tests results DB (#5616)
## Problem

We started to store test results in a new format in
https://github.com/neondatabase/neon/pull/4549.
This PR switches the scripts to query this DB.

(we can completely remove the old DB/ingestion scripts in a couple of
weeks after this PR is merged)

## Summary of changes
- `scripts/benchmark_durations.py` queries the new database
- `scripts/flaky_tests.py` queries the new database
2023-10-25 14:25:13 +01:00
John Spray
8b8be7bed4 tests: don't fail tests on torn log lines (#5655)
## Problem

Tests that force-kill and restart a service can generate torn log lines
that might match WARN|ERROR, but not match the allow expression that a
test has loaded, e.g.
https://neon-github-public-dev.s3.amazonaws.com/reports/pr-5651/6638398772/index.html#suites/7538959189f4501983ddd9e167836c8b/d272ba8a73e6945c

## Summary of changes

Ignore log lines which match a regex for torn log lines on restart: they
contain two timestamps, and the second part is an "INFO version"... message
(illustrated after this entry).
2023-10-25 13:29:30 +01:00
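The idea behind the torn-line matcher, as an illustrative Rust sketch (the real allow-listing lives in the Python test harness, and the exact pattern here is an assumption):

```rust
use regex::Regex;

// A torn line is two log records glued together by a restart: a second
// timestamp appears mid-line, followed by the startup "INFO version" message.
fn is_torn_restart_line(line: &str) -> bool {
    let ts = r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}";
    let pattern = format!("{ts}.*{ts}\\S* INFO version");
    Regex::new(&pattern).unwrap().is_match(line)
}
```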
Conrad Ludgate
a461c459d8 fix http pool test (#5653)
## Problem

We defer returning connections to the connection pool. It's
possible for our test to be faster than the return of a connection,
which then gets a different process ID because it opens a new
connection.

## Summary of changes

1. Delay the tests just a little (20ms) to give more chance for
connections to return.
2. Correlate connection IDs with the connection logs a bit more
2023-10-25 13:20:45 +01:00
Joonas Koivunen
4ae2d1390d refactor(remote_timeline_client): Split deletion into unlinking + deletion (#5645)
Quest: #4745. Prerequisite for #4938. Original
https://github.com/neondatabase/neon/pull/4938#issuecomment-1777150665.

The new Layer implementation has so far been using
`RemoteTimelineClient::schedule_layer_file_deletion` from `Layer::drop`,
but it was noticed that this could mean the L0s that compaction wanted
to remove could linger in the index part for a longer time, or be left
there. The solution is to split
`RemoteTimelineClient::schedule_layer_file_deletion` into two parts:
- unlinking from index_part.json, to be called from the end of compaction
and gc (see the sketch after this entry)
- scheduling of the actual deletions, to be called from `Layer::drop`

The added methods are unused for now.
2023-10-25 15:01:19 +03:00
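Sketch of the intended call order (method names are from the `RemoteTimelineClient` diff further down; the glue code is hypothetical):

```rust
// End of compaction/gc: drop the L0s from index_part.json immediately, so
// they do not linger in the index until the last Layer reference goes away.
// The files are now dangling in remote storage, known only to the Layer
// objects that still reference them.
fn end_of_compaction(
    client: &std::sync::Arc<RemoteTimelineClient>,
    l0_names: Vec<LayerFileName>,
) -> anyhow::Result<()> {
    client.schedule_unlinking_of_layers_from_index_part(l0_names)
}

// Later, from Layer::drop (PR #4938): schedule the actual remote deletion
// of the files that were unlinked above.
fn on_layer_drop(
    client: &std::sync::Arc<RemoteTimelineClient>,
    unlinked: Vec<(LayerFileName, Generation)>,
) -> anyhow::Result<()> {
    client.schedule_deletion_of_unlinked(unlinked)
}
```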
Joonas Koivunen
c5949e1fd6 misc smaller improvements (#5527)
- finally add an `#[instrument]` to Timeline::create_image_layers,
making it easier to see in the logs that something is happening because
we are creating image layers (see the sketch after this entry)
- format some macro context code
- add a warning not to create new validation functions à la "parse, don't
validate"

Split off from #5198.
2023-10-25 14:59:43 +03:00
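For the first bullet, a minimal illustration of what `#[instrument]` buys (the signature is a simplified stand-in, not the real method):

```rust
use tracing::instrument;

// Every event emitted inside runs within a span carrying the arguments, so
// the logs show *which* timeline is busy creating image layers.
#[instrument(skip(keyspace))]
fn create_image_layers_sketch(timeline_id: &str, keyspace: &[u8]) {
    tracing::info!("creating image layers");
}
```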
John Spray
127837abb0 tests: de-flake test_eviction_across_generations (#5650)
## Problem

There was an edge case where initial logical size calculation can be
downloading a layer that wasn't hit by the test's `SELECT`, and it's
on-disk but still marked as remote in the pageserver's internal state,
so evicting it fails.


https://neon-github-public-dev.s3.amazonaws.com/reports/pr-5648/6630099807/index.html#categories/dee044ec96f666edb90a77c01099a941/e38e97a2735ffa8c/

## Summary of changes

Use the pageserver API to learn about layers, instead of inspecting the
local disk, so that we will always agree with the pageserver about which
layers are local.
2023-10-25 10:55:45 +01:00
Conrad Ludgate
b2c96047d0 move wake compute after the auth quirks logic (#5642)
## Problem

https://github.com/neondatabase/neon/issues/5568#issuecomment-1777015606

## Summary of changes

Make `auth_quirks_creds` return the authentication information, and
move the wake_compute loop to afterwards, inside `auth_quirks`
2023-10-25 08:30:47 +01:00
Em Sharnoff
44202eeb3b Bump vm-builder v0.18.1 -> v0.18.2 (#5646)
Only applicable change was neondatabase/autoscaling#571, removing the
postgres_exporter flags `--auto-discover-databases` and
`--exclude-databases=...`
2023-10-24 16:04:28 -07:00
Arpad Müller
4bef977c56 Use tuples instead of manual comparison chain (#5637)
Makes code a little bit simpler
2023-10-24 17:16:23 +00:00
John Spray
a0b862a8bd pageserver: schedule frozen layer uploads inside the layers lock (#5639)
## Problem

Compaction's source of truth for what layers exist is the LayerManager.
`flush_frozen_layer` updates the LayerManager before it has scheduled the
upload of the frozen layer.

Compaction can then "see" the new layer, decide to delete it, schedule
uploads of replacement layers, all before `flush_frozen_layer` wakes up
again and schedules the upload. When the upload is scheduled, the local
layer file may be gone, in which case we end up with no such layer in
remote storage, but an entry still added to IndexPart pointing to the
missing layer.

## Summary of changes

Schedule layer uploads inside the `self.layers` lock, so that whenever a
frozen layer is present in LayerManager, it is also present in
RemoteTimelineClient's metadata.

Closes: #5635
2023-10-24 13:57:01 +01:00
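A toy model of the invariant being established (types are stand-ins, not the pageserver's): the layer-map insert and the upload scheduling happen in one critical section, so no reader can observe one without the other.

```rust
use std::sync::Mutex;

#[derive(Default)]
struct State {
    layer_map: Vec<String>,    // what compaction reads
    upload_queue: Vec<String>, // what RemoteTimelineClient works through
}

struct TimelineSketch {
    layers: Mutex<State>,
}

impl TimelineSketch {
    fn flush_frozen_layer(&self, layer: String) {
        let mut s = self.layers.lock().unwrap();
        s.layer_map.push(layer.clone()); // visible to compaction...
        s.upload_queue.push(layer);      // ...only together with its upload
        // lock drops here: no window where the layer exists but is unscheduled
    }
}
```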
Conrad Ludgate
767ef29390 proxy: filter out more quota exceeded errors (#5640)
## Problem

Looking at the logs, I saw more retries being performed for other
quota-exceeded errors.

## Summary of changes

Filter out the whole family of quota-exceeded errors
2023-10-24 13:13:23 +01:00
Alexander Bayandin
a8a800af51 Run real Azure tests on CI (#5627)
## Problem
We do not run real Azure-related tests on CI 

## Summary of changes
- Set required env variables to run real Azure blob storage tests on CI
2023-10-24 12:12:11 +01:00
54 changed files with 2142 additions and 697 deletions

View File

@@ -5,4 +5,6 @@ self-hosted-runner:
- small
- us-east-2
config-variables:
- REMOTE_STORAGE_AZURE_CONTAINER
- REMOTE_STORAGE_AZURE_REGION
- SLACK_UPCOMING_RELEASE_CHANNEL_ID

View File

@@ -203,6 +203,10 @@ runs:
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
BASE_S3_URL: ${{ steps.generate-report.outputs.base-s3-url }}
run: |
if [ ! -d "${WORKDIR}/report/data/test-cases" ]; then
exit 0
fi
export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR_NEW}
./scripts/pysync

View File

@@ -338,6 +338,16 @@ jobs:
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
# Run separate tests for real Azure Blob Storage
# XXX: replace region with `eu-central-1`-like region
export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
- name: Install rust binaries
run: |
# Install target binaries
@@ -423,7 +433,7 @@ jobs:
rerun_flaky: true
pg_version: ${{ matrix.pg_version }}
env:
- TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
+ TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
- name: Merge and upload coverage data
@@ -458,7 +468,7 @@ jobs:
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
- TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}"
+ TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
# XXX: no coverage data handling here, since benchmarks are run on release builds,
# while coverage is currently collected for the debug ones
@@ -837,7 +847,7 @@ jobs:
run:
shell: sh -eu {0}
env:
- VM_BUILDER_VERSION: v0.18.1
+ VM_BUILDER_VERSION: v0.18.2
steps:
- name: Checkout

Cargo.lock generated
View File

@@ -2932,6 +2932,16 @@ dependencies = [
"winapi",
]
[[package]]
name = "nu-ansi-term"
version = "0.46.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
dependencies = [
"overload",
"winapi",
]
[[package]]
name = "num-bigint"
version = "0.4.3"
@@ -3198,6 +3208,12 @@ version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"
[[package]]
name = "overload"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
[[package]]
name = "pagectl"
version = "0.1.0"
@@ -3283,10 +3299,12 @@ dependencies = [
"tokio",
"tokio-io-timeout",
"tokio-postgres",
"tokio-stream",
"tokio-tar",
"tokio-util",
"toml_edit",
"tracing",
"tracing-subscriber",
"url",
"utils",
"walkdir",
@@ -3561,7 +3579,7 @@ dependencies = [
[[package]]
name = "postgres"
version = "0.19.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=problame/copy-both-duplex-public#5c462bd3500e657c014ef087e4eef2c1a8f0ebda"
dependencies = [
"bytes",
"fallible-iterator",
@@ -3574,7 +3592,7 @@ dependencies = [
[[package]]
name = "postgres-native-tls"
version = "0.5.0"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=problame/copy-both-duplex-public#5c462bd3500e657c014ef087e4eef2c1a8f0ebda"
dependencies = [
"native-tls",
"tokio",
@@ -3585,7 +3603,7 @@ dependencies = [
[[package]]
name = "postgres-protocol"
version = "0.6.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=problame/copy-both-duplex-public#5c462bd3500e657c014ef087e4eef2c1a8f0ebda"
dependencies = [
"base64 0.20.0",
"byteorder",
@@ -3603,7 +3621,7 @@ dependencies = [
[[package]]
name = "postgres-types"
version = "0.2.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=problame/copy-both-duplex-public#5c462bd3500e657c014ef087e4eef2c1a8f0ebda"
dependencies = [
"bytes",
"fallible-iterator",
@@ -5407,7 +5425,7 @@ dependencies = [
[[package]]
name = "tokio-postgres"
version = "0.7.7"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=problame/copy-both-duplex-public#5c462bd3500e657c014ef087e4eef2c1a8f0ebda"
dependencies = [
"async-trait",
"byteorder",
@@ -5764,6 +5782,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
dependencies = [
"matchers",
"nu-ansi-term",
"once_cell",
"regex",
"serde",

View File

@@ -161,11 +161,11 @@ env_logger = "0.10"
log = "0.4"
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
- postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
- postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
- postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
- postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
- tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
+ postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="problame/copy-both-duplex-public" }
+ postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="problame/copy-both-duplex-public" }
+ postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="problame/copy-both-duplex-public" }
+ postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="problame/copy-both-duplex-public" }
+ tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="problame/copy-both-duplex-public" }
## Other git libraries
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -202,7 +202,7 @@ tonic-build = "0.9"
# This is only needed for proxy's tests.
# TODO: we should probably fork `tokio-postgres-rustls` instead.
- tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
+ tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="problame/copy-both-duplex-public" }
################# Binary contents sections

View File

@@ -18,7 +18,7 @@ use utils::{
use crate::reltag::RelTag;
use anyhow::bail;
- use bytes::{BufMut, Bytes, BytesMut};
+ use bytes::{Buf, BufMut, Bytes, BytesMut};
/// The state of a tenant in this pageserver.
///
@@ -807,6 +807,36 @@ impl PagestreamBeMessage {
bytes.into()
}
pub fn deserialize(buf: Bytes) -> anyhow::Result<Self> {
let mut buf = buf.reader();
let msg_tag = buf.read_u8()?;
match msg_tag {
100 => todo!(),
101 => todo!(),
102 => {
let buf = buf.get_ref();
/* TODO use constant */
if buf.len() == 8192 {
Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
page: buf.clone(),
}))
} else {
anyhow::bail!("invalid page size: {}", buf.len());
}
}
103 => {
let buf = buf.get_ref();
let cstr = std::ffi::CStr::from_bytes_until_nul(&buf)?;
let rust_str = cstr.to_str()?;
Ok(PagestreamBeMessage::Error(PagestreamErrorResponse {
message: rust_str.to_owned(),
}))
}
104 => todo!(),
_ => bail!("unknown tag: {:?}", msg_tag),
}
}
}
#[cfg(test)]

View File

@@ -22,9 +22,9 @@ use postgres_ffi::Oid;
/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/relfilenode.h#L57).
///
// FIXME: should move 'forknum' as last field to keep this consistent with Postgres.
- // Then we could replace the custo Ord and PartialOrd implementations below with
- // deriving them.
- #[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize)]
+ // Then we could replace the custom Ord and PartialOrd implementations below with
+ // deriving them. This will require changes in walredoproc.c.
+ #[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize)]
pub struct RelTag {
pub forknum: u8,
pub spcnode: Oid,
@@ -40,21 +40,9 @@ impl PartialOrd for RelTag {
impl Ord for RelTag {
fn cmp(&self, other: &Self) -> Ordering {
- let mut cmp = self.spcnode.cmp(&other.spcnode);
- if cmp != Ordering::Equal {
-     return cmp;
- }
- cmp = self.dbnode.cmp(&other.dbnode);
- if cmp != Ordering::Equal {
-     return cmp;
- }
- cmp = self.relnode.cmp(&other.relnode);
- if cmp != Ordering::Equal {
-     return cmp;
- }
- cmp = self.forknum.cmp(&other.forknum);
- cmp
+ // Custom ordering where we put forknum to the end of the list
+ let other_tup = (other.spcnode, other.dbnode, other.relnode, other.forknum);
+ (self.spcnode, self.dbnode, self.relnode, self.forknum).cmp(&other_tup)
}
}

View File

@@ -112,7 +112,7 @@ impl RemotePath {
self.0.file_name()
}
- pub fn join(&self, segment: &Utf8Path) -> Self {
+ pub fn join<P: AsRef<Utf8Path>>(&self, segment: P) -> Self {
Self(self.0.join(segment))
}
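This generic bound lets callers pass a plain `&str` (e.g. an object name from a bucket listing, as the tenant-duplication code below does) without wrapping it in a `Utf8Path` first. A hedged usage sketch (`RemotePath::from_string` is an assumed constructor):

```rust
use camino::Utf8Path;

fn join_examples() -> anyhow::Result<()> {
    let base = RemotePath::from_string("tenant/timeline")?;
    let _index = base.join("index_part.json");     // &str, via AsRef<Utf8Path>
    let _nested = base.join(Utf8Path::new("a/b")); // &Utf8Path still works
    Ok(())
}
```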

View File

@@ -82,6 +82,8 @@ enum-map.workspace = true
enumset.workspace = true
strum.workspace = true
strum_macros.workspace = true
tokio-stream.workspace = true
tracing-subscriber = { version = "0.3.17", features = ["env-filter"] }
[dev-dependencies]
criterion.workspace = true

View File

@@ -0,0 +1,245 @@
use clap::Parser;
use hyper::client::conn::Parts;
use hyper::client::HttpConnector;
use hyper::{Body, Client, Uri};
use pageserver::{repository, tenant};
use rand::prelude::*;
use std::env::args;
use std::future::Future;
use std::str::FromStr;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::{Arc, Mutex};
use std::thread;
use tokio::sync::mpsc::{channel, Sender};
use tokio::sync::Mutex as AsyncMutex;
use tokio::task::JoinHandle;
struct Key(repository::Key);
impl std::str::FromStr for Key {
type Err = anyhow::Error;
fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
repository::Key::from_hex(s).map(Key)
}
}
struct KeyRange {
start: Key,
end: Key,
}
impl KeyRange {
fn len(&self) -> i128 {
self.end.0.to_i128() - self.start.0.to_i128()
}
}
#[derive(clap::Parser)]
struct Args {
#[clap(long, default_value = "http://localhost:9898")]
ps_endpoint: String,
// tenant_id: String,
// timeline_id: String,
num_tasks: usize,
num_requests: usize,
tenants: Option<Vec<String>>,
#[clap(long)]
pick_n_tenants: Option<usize>,
}
#[derive(Debug, Default)]
struct Stats {
completed_requests: AtomicU64,
}
impl Stats {
fn inc(&self) {
self.completed_requests.fetch_add(1, Ordering::Relaxed);
}
}
#[tokio::main]
async fn main() {
let args: &'static Args = Box::leak(Box::new(Args::parse()));
let client = Client::new();
let tenants = if let Some(tenants) = &args.tenants {
tenants.clone()
} else {
// let tenant_id = "b97965931096047b2d54958756baee7b";
// let timeline_id = "2868f84a8d166779e4c651b116c45059";
let resp = client
.get(Uri::try_from(&format!("{}/v1/tenant", args.ps_endpoint)).unwrap())
.await
.unwrap();
let body = hyper::body::to_bytes(resp).await.unwrap();
let tenants: serde_json::Value = serde_json::from_slice(&body).unwrap();
let mut out = Vec::new();
for t in tenants.as_array().unwrap() {
if let Some(limit) = args.pick_n_tenants {
if out.len() >= limit {
break;
}
}
out.push(t.get("id").unwrap().as_str().unwrap().to_owned());
}
if let Some(limit) = args.pick_n_tenants {
assert_eq!(out.len(), limit);
}
out
};
let mut tenant_timelines = Vec::new();
for tenant_id in tenants {
let resp = client
.get(
Uri::try_from(&format!(
"{}/v1/tenant/{}/timeline",
args.ps_endpoint, tenant_id
))
.unwrap(),
)
.await
.unwrap();
let body = hyper::body::to_bytes(resp).await.unwrap();
let timelines: serde_json::Value = serde_json::from_slice(&body).unwrap();
for t in timelines.as_array().unwrap() {
let timeline_id = t.get("timeline_id").unwrap().as_str().unwrap().to_owned();
tenant_timelines.push((tenant_id.clone(), timeline_id));
}
}
println!("tenant_timelines:\n{:?}", tenant_timelines);
let mut stats = Arc::new(Stats::default());
tokio::spawn({
let stats = Arc::clone(&stats);
async move {
loop {
let start = std::time::Instant::now();
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
let elapsed = start.elapsed();
println!(
"RPS: {:.0}",
completed_requests as f64 / elapsed.as_secs_f64()
);
}
}
});
let mut tasks = Vec::new();
for (tenant_id, timeline_id) in tenant_timelines {
let t = tokio::spawn(timeline(
args,
client.clone(),
tenant_id,
timeline_id,
Arc::clone(&stats),
));
tasks.push(t);
}
for t in tasks {
t.await.unwrap();
}
}
fn timeline(
args: &'static Args,
client: Client<HttpConnector, Body>,
tenant_id: String,
timeline_id: String,
stats: Arc<Stats>,
) -> impl Future<Output = ()> {
async move {
let mut resp = client
.get(
Uri::try_from(&format!(
"{}/v1/tenant/{}/timeline/{}/keyspace",
args.ps_endpoint, tenant_id, timeline_id
))
.unwrap(),
)
.await
.unwrap();
if !resp.status().is_success() {
panic!("Failed to get keyspace: {resp:?}");
}
let body = hyper::body::to_bytes(resp).await.unwrap();
let keyspace: serde_json::Value = serde_json::from_slice(&body).unwrap();
let lsn = Arc::new(keyspace["at_lsn"].as_str().unwrap().to_owned());
let ranges = keyspace["keys"]
.as_array()
.unwrap()
.iter()
.map(|r| {
let r = r.as_array().unwrap();
assert_eq!(r.len(), 2);
let start = Key::from_str(r[0].as_str().unwrap()).unwrap();
let end = Key::from_str(r[1].as_str().unwrap()).unwrap();
KeyRange { start, end }
})
.collect::<Vec<_>>();
// weighted ranges
let weights = ranges.iter().map(|r| r.len()).collect::<Vec<_>>();
let ranges = Arc::new(ranges);
let weights = Arc::new(weights);
let (tx, mut rx) = channel::<i32>(1000);
let tx = Arc::new(AsyncMutex::new(tx));
let mut tasks = Vec::<JoinHandle<()>>::new();
let start = std::time::Instant::now();
for i in 0..args.num_tasks {
let ranges = ranges.clone();
let weights = weights.clone();
let lsn = lsn.clone();
let client = client.clone();
let tenant_id = tenant_id.clone();
let timeline_id = timeline_id.clone();
let stats = Arc::clone(&stats);
let task = tokio::spawn(async move {
for i in 0..args.num_requests {
let key = {
let mut rng = rand::thread_rng();
let r = ranges.choose_weighted(&mut rng, |r| r.len()).unwrap();
let key = rng.gen_range((r.start.0.to_i128()..r.end.0.to_i128()));
key
};
let url = format!(
"{}/v1/tenant/{}/timeline/{}/getpage?key={:036x}&lsn={}",
args.ps_endpoint, tenant_id, timeline_id, key, lsn
);
let uri = url.parse::<Uri>().unwrap();
let resp = client.get(uri).await.unwrap();
stats.inc();
}
});
tasks.push(task);
}
drop(tx);
for task in tasks {
task.await.unwrap();
}
let elapsed = start.elapsed();
println!(
"RPS: {:.0}",
(args.num_requests * args.num_tasks) as f64 / elapsed.as_secs_f64()
);
}
}

View File

@@ -0,0 +1,373 @@
use anyhow::Context;
use clap::Parser;
use futures::{SinkExt, TryStreamExt};
use hyper::client::conn::Parts;
use hyper::client::HttpConnector;
use hyper::{Client, Uri};
use pageserver::page_cache::PAGE_SZ;
use pageserver::pgdatadir_mapping::{is_rel_block_key, key_to_rel_block};
use pageserver::{repository, tenant};
use pageserver_api::models::{
PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
};
use pageserver_api::reltag::RelTag;
use rand::prelude::*;
use scopeguard::defer;
use std::env::args;
use std::future::Future;
use std::str::FromStr;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::{Arc, Mutex};
use std::thread;
use tokio::sync::mpsc::{channel, Sender};
use tokio::sync::Mutex as AsyncMutex;
use tokio::task::JoinHandle;
use tokio_stream::{Stream, StreamExt};
use utils::completion;
use utils::lsn::Lsn;
struct Key(repository::Key);
impl std::str::FromStr for Key {
type Err = anyhow::Error;
fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
repository::Key::from_hex(s).map(Key)
}
}
struct KeyRange {
start: i128,
end: i128,
}
impl KeyRange {
fn len(&self) -> i128 {
self.end - self.start
}
}
struct RelTagBlockNo {
rel_tag: RelTag,
block_no: u32,
}
#[derive(clap::Parser)]
struct Args {
#[clap(long, default_value = "http://localhost:9898")]
ps_endpoint: String,
// tenant_id: String,
// timeline_id: String,
num_tasks: usize,
num_requests: usize,
tenants: Option<Vec<String>>,
#[clap(long)]
pick_n_tenants: Option<usize>,
}
#[derive(Debug, Default)]
struct Stats {
completed_requests: AtomicU64,
}
impl Stats {
fn inc(&self) {
self.completed_requests.fetch_add(1, Ordering::Relaxed);
}
}
#[tokio::main]
async fn main() {
let args: &'static Args = Box::leak(Box::new(Args::parse()));
// std::env::set_var("RUST_LOG", "info,tokio_postgres=trace");
// tracing_subscriber::fmt::init();
let client = Client::new();
let tenants = if let Some(tenants) = &args.tenants {
tenants.clone()
} else {
// let tenant_id = "b97965931096047b2d54958756baee7b";
// let timeline_id = "2868f84a8d166779e4c651b116c45059";
let resp = client
.get(Uri::try_from(&format!("{}/v1/tenant", args.ps_endpoint)).unwrap())
.await
.unwrap();
let body = hyper::body::to_bytes(resp).await.unwrap();
let tenants: serde_json::Value = serde_json::from_slice(&body).unwrap();
let mut out = Vec::new();
for t in tenants.as_array().unwrap() {
if let Some(limit) = args.pick_n_tenants {
if out.len() >= limit {
break;
}
}
out.push(t.get("id").unwrap().as_str().unwrap().to_owned());
}
if let Some(limit) = args.pick_n_tenants {
assert_eq!(out.len(), limit);
}
out
};
let mut tenant_timelines = Vec::new();
for tenant_id in tenants {
let resp = client
.get(
Uri::try_from(&format!(
"{}/v1/tenant/{}/timeline",
args.ps_endpoint, tenant_id
))
.unwrap(),
)
.await
.unwrap();
let body = hyper::body::to_bytes(resp).await.unwrap();
let timelines: serde_json::Value = serde_json::from_slice(&body).unwrap();
for t in timelines.as_array().unwrap() {
let timeline_id = t.get("timeline_id").unwrap().as_str().unwrap().to_owned();
tenant_timelines.push((tenant_id.clone(), timeline_id));
}
}
println!("tenant_timelines:\n{:?}", tenant_timelines);
let mut stats = Arc::new(Stats::default());
tokio::spawn({
let stats = Arc::clone(&stats);
async move {
loop {
let start = std::time::Instant::now();
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
let elapsed = start.elapsed();
println!(
"RPS: {:.0}",
completed_requests as f64 / elapsed.as_secs_f64()
);
}
}
});
let mut tasks = Vec::new();
for (tenant_id, timeline_id) in tenant_timelines {
let stats = Arc::clone(&stats);
let t = tokio::spawn(timeline(
args,
client.clone(),
tenant_id,
timeline_id,
stats,
));
tasks.push(t);
}
for t in tasks {
t.await.unwrap();
}
}
fn timeline(
args: &'static Args,
http_client: Client<HttpConnector, hyper::Body>,
tenant_id: String,
timeline_id: String,
stats: Arc<Stats>,
) -> impl Future<Output = ()> + Send + Sync {
async move {
let mut resp = http_client
.get(
Uri::try_from(&format!(
"{}/v1/tenant/{}/timeline/{}/keyspace",
args.ps_endpoint, tenant_id, timeline_id
))
.unwrap(),
)
.await
.unwrap();
if !resp.status().is_success() {
panic!("Failed to get keyspace: {resp:?}");
}
let body = hyper::body::to_bytes(resp).await.unwrap();
let keyspace: serde_json::Value = serde_json::from_slice(&body).unwrap();
let lsn: Lsn = keyspace["at_lsn"].as_str().unwrap().parse().unwrap();
let ranges = keyspace["keys"]
.as_array()
.unwrap()
.iter()
.filter_map(|r| {
let r = r.as_array().unwrap();
assert_eq!(r.len(), 2);
let start = Key::from_str(r[0].as_str().unwrap()).unwrap();
let end = Key::from_str(r[1].as_str().unwrap()).unwrap();
// filter out non-relblock keys
match (is_rel_block_key(start.0), is_rel_block_key(end.0)) {
(true, true) => Some(KeyRange {
start: start.0.to_i128(),
end: end.0.to_i128(),
}),
(true, false) | (false, true) => {
unimplemented!("split up range")
}
(false, false) => None,
}
})
.collect::<Vec<_>>();
// weighted ranges
let weights = ranges.iter().map(|r| r.len()).collect::<Vec<_>>();
let ranges = Arc::new(ranges);
let weights = Arc::new(weights);
let mut tasks = Vec::<JoinHandle<()>>::new();
let start = std::time::Instant::now();
for i in 0..args.num_tasks {
let ranges = ranges.clone();
let weights = weights.clone();
let client = http_client.clone();
let tenant_id = tenant_id.clone();
let timeline_id = timeline_id.clone();
let task = tokio::spawn({
let stats = Arc::clone(&stats);
async move {
let mut client =
getpage_client::Client::new(tenant_id.clone(), timeline_id.clone())
.await
.unwrap();
for i in 0..args.num_requests {
let key = {
let mut rng = rand::thread_rng();
let r = ranges.choose_weighted(&mut rng, |r| r.len()).unwrap();
let key: i128 = rng.gen_range((r.start..r.end));
let key = repository::Key::from_i128(key);
// XXX filter these out when we iterate the keyspace
assert!(
is_rel_block_key(key),
"we filter non-relblock keys out above"
);
let (rel_tag, block_no) =
key_to_rel_block(key).expect("we just checked");
RelTagBlockNo { rel_tag, block_no }
};
client
.getpage(key, lsn)
.await
.with_context(|| {
format!("getpage for tenant {} timeline {}", tenant_id, timeline_id)
})
.unwrap();
stats.inc();
}
client.shutdown().await;
}
});
tasks.push(task);
}
for task in tasks {
task.await.unwrap();
}
}
}
mod getpage_client {
use std::pin::Pin;
use futures::SinkExt;
use pageserver_api::models::{
PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest,
PagestreamGetPageResponse,
};
use tokio::task::JoinHandle;
use tokio_stream::StreamExt;
use tokio_util::sync::CancellationToken;
use utils::lsn::Lsn;
use crate::RelTagBlockNo;
pub(crate) struct Client {
copy_both: Pin<Box<tokio_postgres::CopyBothDuplex<bytes::Bytes>>>,
cancel_on_client_drop: Option<tokio_util::sync::DropGuard>,
conn_task: JoinHandle<()>,
}
impl Client {
pub fn new(
tenant_id: String,
timeline_id: String,
) -> impl std::future::Future<Output = anyhow::Result<Self>> + Send {
async move {
let (client, connection) =
tokio_postgres::connect("host=localhost port=64000", postgres::NoTls).await?;
let conn_task_cancel = CancellationToken::new();
let conn_task = tokio::spawn({
let conn_task_cancel = conn_task_cancel.clone();
async move {
tokio::select! {
_ = conn_task_cancel.cancelled() => {
return;
}
res = connection => {
res.unwrap();
}
}
}
});
let copy_both: tokio_postgres::CopyBothDuplex<bytes::Bytes> = client
.copy_both_simple(&format!("pagestream {tenant_id} {timeline_id}"))
.await?;
Ok(Self {
copy_both: Box::pin(copy_both),
conn_task,
cancel_on_client_drop: Some(conn_task_cancel.drop_guard()),
})
}
}
pub async fn shutdown(mut self) {
let _ = self.cancel_on_client_drop.take();
self.conn_task.await.unwrap();
}
pub async fn getpage(
&mut self,
key: RelTagBlockNo,
lsn: Lsn,
) -> anyhow::Result<PagestreamGetPageResponse> {
let req = PagestreamGetPageRequest {
latest: false,
rel: key.rel_tag,
blkno: key.block_no,
lsn,
};
let req = PagestreamFeMessage::GetPage(req);
let req: bytes::Bytes = req.serialize();
// let mut req = tokio_util::io::ReaderStream::new(&req);
let mut req = tokio_stream::once(Ok(req));
self.copy_both.send_all(&mut req).await?;
let next: Option<Result<bytes::Bytes, _>> = self.copy_both.next().await;
let next = next.unwrap().unwrap();
match PagestreamBeMessage::deserialize(next)? {
PagestreamBeMessage::Exists(_) => todo!(),
PagestreamBeMessage::Nblocks(_) => todo!(),
PagestreamBeMessage::GetPage(p) => Ok(p),
PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e),
PagestreamBeMessage::DbSize(_) => todo!(),
}
}
}
}

View File

@@ -392,13 +392,19 @@ paths:
type: string
format: date-time
description: A timestamp to get the LSN
- name: version
in: query
required: false
schema:
type: integer
description: The version of the endpoint to use
responses:
"200":
description: OK
content:
application/json:
schema:
- type: string
+ $ref: "#/components/schemas/LsnByTimestampResponse"
"400":
description: Error when no tenant id found in path, no timeline id or invalid timestamp
content:
@@ -1384,6 +1390,19 @@ components:
type: string
format: hex
LsnByTimestampResponse:
type: object
required:
- lsn
- kind
properties:
lsn:
type: string
format: hex
kind:
type: string
enum: [past, present, future, nodata]
Error:
type: object
required:

View File

@@ -8,7 +8,7 @@ use std::sync::Arc;
use anyhow::{anyhow, Context, Result};
use futures::TryFutureExt;
use humantime::format_rfc3339;
- use hyper::header::CONTENT_TYPE;
+ use hyper::header;
use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri};
use metrics::launch_timestamp::LaunchTimestamp;
@@ -17,6 +17,7 @@ use pageserver_api::models::{
TenantLoadRequest, TenantLocationConfigRequest,
};
use remote_storage::GenericRemoteStorage;
use serde_with::{serde_as, DisplayFromStr};
use tenant_size_model::{SizeResult, StorageModel};
use tokio_util::sync::CancellationToken;
use tracing::*;
@@ -484,6 +485,8 @@ async fn get_lsn_by_timestamp_handler(
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let version: Option<u8> = parse_query_param(&request, "version")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let timestamp_raw = must_get_query_param(&request, "timestamp")?;
let timestamp = humantime::parse_rfc3339(&timestamp_raw)
@@ -495,13 +498,32 @@ async fn get_lsn_by_timestamp_handler(
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
let result = timeline.find_lsn_for_timestamp(timestamp_pg, &ctx).await?;
- let result = match result {
-     LsnForTimestamp::Present(lsn) => format!("{lsn}"),
-     LsnForTimestamp::Future(_lsn) => "future".into(),
-     LsnForTimestamp::Past(_lsn) => "past".into(),
-     LsnForTimestamp::NoData(_lsn) => "nodata".into(),
- };
- json_response(StatusCode::OK, result)
+ if version.unwrap_or(0) > 1 {
+     #[serde_as]
+     #[derive(serde::Serialize)]
+     struct Result {
+         #[serde_as(as = "DisplayFromStr")]
+         lsn: Lsn,
+         kind: &'static str,
+     }
+     let (lsn, kind) = match result {
+         LsnForTimestamp::Present(lsn) => (lsn, "present"),
+         LsnForTimestamp::Future(lsn) => (lsn, "future"),
+         LsnForTimestamp::Past(lsn) => (lsn, "past"),
+         LsnForTimestamp::NoData(lsn) => (lsn, "nodata"),
+     };
+     json_response(StatusCode::OK, Result { lsn, kind })
+ } else {
+     // FIXME: this is a temporary crutch not to break backwards compatibility
+     // See https://github.com/neondatabase/neon/pull/5608
+     let result = match result {
+         LsnForTimestamp::Present(lsn) => format!("{lsn}"),
+         LsnForTimestamp::Future(_lsn) => "future".into(),
+         LsnForTimestamp::Past(_lsn) => "past".into(),
+         LsnForTimestamp::NoData(_lsn) => "nodata".into(),
+     };
+     json_response(StatusCode::OK, result)
+ }
}
async fn get_timestamp_of_lsn_handler(
@@ -659,6 +681,45 @@ async fn tenant_ignore_handler(
json_response(StatusCode::OK, ())
}
async fn tenant_duplicate_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let src_tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let request_data: TenantCreateRequest = json_request(&mut request).await?;
let new_tenant_id = request_data.new_tenant_id;
check_permission(&request, None)?;
let _timer = STORAGE_TIME_GLOBAL
.get_metric_with_label_values(&[StorageTimeOperation::DuplicateTenant.into()])
.expect("bug")
.start_timer();
let tenant_conf =
TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
let state = get_state(&request);
let generation = get_request_generation(state, request_data.generation)?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
mgr::duplicate_tenant(
state.conf,
tenant_conf,
src_tenant_id,
new_tenant_id,
generation,
state.tenant_resources(),
&ctx,
)
.instrument(info_span!("tenant_duplicate", %src_tenant_id, tenant_id = %new_tenant_id))
.await?;
json_response(StatusCode::CREATED, TenantCreateResponse(new_tenant_id))
}
async fn tenant_list_handler(
request: Request<Body>,
_cancel: CancellationToken,
@@ -767,6 +828,10 @@ async fn tenant_size_handler(
.map_err(ApiError::InternalServerError)?;
let mut sizes = None;
let accepts_html = headers
.get(header::ACCEPT)
.map(|v| v == "text/html")
.unwrap_or_default();
if !inputs_only.unwrap_or(false) {
let storage_model = inputs
.calculate_model()
@@ -774,11 +839,11 @@ async fn tenant_size_handler(
let size = storage_model.calculate();
// If request header expects html, return html
- if headers["Accept"] == "text/html" {
+ if accepts_html {
return synthetic_size_html_response(inputs, storage_model, size);
}
sizes = Some(size);
- } else if headers["Accept"] == "text/html" {
+ } else if accepts_html {
return Err(ApiError::BadRequest(anyhow!(
"inputs_only parameter is incompatible with html output request"
)));
@@ -929,7 +994,7 @@ fn synthetic_size_html_response(
pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>, ApiError> {
let response = Response::builder()
.status(status)
- .header(hyper::header::CONTENT_TYPE, "text/html")
+ .header(header::CONTENT_TYPE, "text/html")
.body(Body::from(data.as_bytes().to_vec()))
.map_err(|e| ApiError::InternalServerError(e.into()))?;
Ok(response)
@@ -1310,7 +1375,7 @@ async fn getpage_at_lsn_handler(
Result::<_, ApiError>::Ok(
Response::builder()
.status(StatusCode::OK)
- .header(CONTENT_TYPE, "application/octet-stream")
+ .header(header::CONTENT_TYPE, "application/octet-stream")
.body(hyper::Body::from(page))
.unwrap(),
)
@@ -1702,6 +1767,9 @@ pub fn make_router(
.post("/v1/tenant/:tenant_id/ignore", |r| {
api_handler(r, tenant_ignore_handler)
})
.post("/v1/tenant/:tenant_id/duplicate", |r| {
api_handler(r, tenant_duplicate_handler)
})
.get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
api_handler(r, timeline_detail_handler)
})

View File

@@ -149,6 +149,10 @@ fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool {
}
}
// FIXME: DO NOT ADD new query methods like this, which will have a next step of parsing timelineid
// from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once
// from the name.
pub fn is_uninit_mark(path: &Utf8Path) -> bool {
ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
}

View File

@@ -51,6 +51,9 @@ pub enum StorageTimeOperation {
#[strum(serialize = "create tenant")]
CreateTenant,
#[strum(serialize = "duplicate tenant")]
DuplicateTenant,
}
pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
@@ -1388,27 +1391,22 @@ impl TimelineMetrics {
}
}
- pub fn record_new_file_metrics(&self, sz: u64) {
+ pub(crate) fn record_new_file_metrics(&self, sz: u64) {
self.resident_physical_size_add(sz);
self.num_persistent_files_created.inc_by(1);
self.persistent_bytes_written.inc_by(sz);
}
- pub fn resident_physical_size_sub(&self, sz: u64) {
+ pub(crate) fn resident_physical_size_sub(&self, sz: u64) {
self.resident_physical_size_gauge.sub(sz);
crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(sz);
}
- pub fn resident_physical_size_add(&self, sz: u64) {
+ pub(crate) fn resident_physical_size_add(&self, sz: u64) {
self.resident_physical_size_gauge.add(sz);
crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.add(sz);
}
- pub fn resident_physical_size_set(&self, sz: u64) {
-     self.resident_physical_size_gauge.set(sz);
-     crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.set(sz);
- }
pub fn resident_physical_size_get(&self) -> u64 {
self.resident_physical_size_gauge.get()
}

View File

@@ -675,8 +675,9 @@ impl Timeline {
result.add_key(CONTROLFILE_KEY);
result.add_key(CHECKPOINT_KEY);
- result.add_key(AUX_FILES_KEY);
+ if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() {
+     result.add_key(AUX_FILES_KEY);
+ }
Ok(result.to_keyspace())
}
@@ -1693,6 +1694,7 @@ const AUX_FILES_KEY: Key = Key {
// Reverse mappings for a few Keys.
// These are needed by WAL redo manager.
/// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`.
pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
Ok(match key.field1 {
0x00 => (
@@ -1708,7 +1710,8 @@ pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
})
}
- fn is_rel_block_key(key: Key) -> bool {
+ /// See [[key_to_rel_block]].
+ pub fn is_rel_block_key(key: Key) -> bool {
key.field1 == 0x00 && key.field4 != 0
}

View File

@@ -1057,8 +1057,8 @@ impl Tenant {
TimelineId::try_from(timeline_uninit_mark_file.file_stem())
.with_context(|| {
format!(
"Could not parse timeline id out of the timeline uninit mark name {timeline_uninit_mark_file}",
)
"Could not parse timeline id out of the timeline uninit mark name {timeline_uninit_mark_file}",
)
})?;
let timeline_dir = self.conf.timeline_path(&self.tenant_id, &timeline_id);
if let Err(e) =

View File

@@ -4,8 +4,10 @@
use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
use rand::{distributions::Alphanumeric, Rng};
use std::collections::{hash_map, HashMap};
use std::str::FromStr;
use std::sync::Arc;
use tokio::fs;
use tokio::io::AsyncSeekExt;
use anyhow::Context;
use once_cell::sync::Lazy;
@@ -26,9 +28,11 @@ use crate::deletion_queue::DeletionQueueClient;
use crate::task_mgr::{self, TaskKind};
use crate::tenant::config::{AttachmentMode, LocationConf, LocationMode, TenantConfOpt};
use crate::tenant::delete::DeleteTenantFlow;
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
use crate::tenant::storage_layer::{DeltaLayer, ImageLayer, LayerFileName};
use crate::tenant::{
- create_tenant_files, AttachMarkerMode, AttachedTenantConf, CreateTenantFilesMode, Tenant,
- TenantState,
+ create_tenant_files, remote_timeline_client, AttachMarkerMode, AttachedTenantConf,
+ CreateTenantFilesMode, IndexPart, Tenant, TenantState,
};
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
@@ -695,6 +699,159 @@ pub(crate) async fn create_tenant(
}).await
}
pub(crate) async fn duplicate_tenant(
conf: &'static PageServerConf,
tenant_conf: TenantConfOpt,
src_tenant_id: TenantId,
new_tenant_id: TenantId,
generation: Generation,
resources: TenantSharedResources,
ctx: &RequestContext,
) -> Result<(), TenantMapInsertError> {
debug_assert_current_span_has_tenant_id();
// TODO: would be nice to use tenant_map_insert here, but, we're not ready to create a Tenant object yet
let tempdir = path_with_suffix_extension(
conf.tenants_path().join(&new_tenant_id.to_string()),
&format!("duplication.{TEMP_FILE_SUFFIX}"),
);
tokio::fs::remove_dir_all(&tempdir)
.await
.or_else(|e| match e.kind() {
std::io::ErrorKind::NotFound => Ok(()),
_ => Err(e),
})
.context("pre-run clean up tempdir")?;
tokio::fs::create_dir(&tempdir)
.await
.context("create tempdir")?;
// Copy the tenant's data in S3
let remote_storage = resources
.remote_storage
.as_ref()
.context("only works with remote storage")?;
let remote_src_timelines =
remote_timeline_client::list_remote_timelines(remote_storage, src_tenant_id)
.await
.context("list src timelines")?;
info!(?remote_src_timelines, "got src timelines");
for timeline_id in remote_src_timelines {
async {
let tempdir = tempdir.join(&timeline_id.to_string());
tokio::fs::create_dir(&tempdir)
.await
.context("create tempdir for timeline")?;
let remote_src_tl =
remote_timeline_client::remote_timeline_path(&src_tenant_id, &timeline_id);
let remote_dst_tl =
remote_timeline_client::remote_timeline_path(&new_tenant_id, &timeline_id);
let object_names = remote_storage
.list_prefixes(Some(&remote_src_tl))
.await
.context("list timeline remote prefix")?;
for name in object_names {
async {
let name = name.object_name().context(
"list_prefixes return values should always have object_name()=Some",
)?;
let remote_src_obj = remote_src_tl.join(name);
let remote_dst_obj = remote_dst_tl.join(name);
let tmp_obj_filepath = tempdir.join(name);
let mut tmp_obj_file = tokio::fs::OpenOptions::new()
.read(true)
.write(true)
.create_new(true)
.open(&tmp_obj_filepath)
.await
.context("create temp file")?;
let mut tmp_dl = remote_storage
.download(&remote_src_obj)
.await
.context("start download")?;
let tmp_obj_size =
tokio::io::copy(&mut tmp_dl.download_stream, &mut tmp_obj_file)
.await
.context("do the download")?;
if name == IndexPart::FILE_NAME {
// needs no patching
} else {
let name = LayerFileName::from_str(name).map_err(|e: String| {
anyhow::anyhow!("unknown key in timeline s3 prefix: {name:?}: {e}")
})?;
match name {
LayerFileName::Image(_) => {
ImageLayer::rewrite_tenant_timeline(
&tmp_obj_filepath,
new_tenant_id,
timeline_id, /* leave as is */
ctx,
)
.await
.context("rewrite tenant timeline")?;
}
LayerFileName::Delta(_) => {
DeltaLayer::rewrite_tenant_timeline(
&tmp_obj_filepath,
new_tenant_id,
timeline_id, /* leave as is */
ctx,
)
.await
.context("rewrite tenant timeline")?;
}
}
}
info!(?remote_dst_obj, "uploading");
tmp_obj_file
.seek(std::io::SeekFrom::Start(0))
.await
.context("seek tmp file to beginning for upload")?;
remote_storage
.upload(
tmp_obj_file,
tmp_obj_size as usize,
&remote_dst_obj,
tmp_dl.metadata,
)
.await
.context("upload modified")?;
tokio::fs::remove_file(tmp_obj_filepath)
.await
.context("remove temp file")?;
anyhow::Ok(())
}
.instrument(info_span!("copy object", object_name=?name))
.await
.context("copy object")?;
}
anyhow::Ok(())
}
.instrument(info_span!("copy_timeline", timeline_id=%timeline_id))
.await?;
}
tokio::fs::remove_dir_all(&tempdir)
.await
.context("post-run clean up tempdir")?;
attach_tenant(conf, new_tenant_id, generation, tenant_conf, resources, ctx).await
}
#[derive(Debug, thiserror::Error)]
pub(crate) enum SetNewTenantConfigError {
#[error(transparent)]

View File

@@ -627,7 +627,7 @@ impl RemoteTimelineClient {
///
/// Launch an upload operation in the background.
///
- pub fn schedule_layer_file_upload(
+ pub(crate) fn schedule_layer_file_upload(
self: &Arc<Self>,
layer_file_name: &LayerFileName,
layer_metadata: &LayerFileMetadata,
@@ -635,6 +635,17 @@ impl RemoteTimelineClient {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
self.schedule_layer_file_upload0(upload_queue, layer_file_name, layer_metadata);
self.launch_queued_tasks(upload_queue);
Ok(())
}
fn schedule_layer_file_upload0(
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
layer_file_name: &LayerFileName,
layer_metadata: &LayerFileMetadata,
) {
upload_queue
.latest_files
.insert(layer_file_name.clone(), layer_metadata.clone());
@@ -643,21 +654,15 @@ impl RemoteTimelineClient {
let op = UploadOp::UploadLayer(layer_file_name.clone(), layer_metadata.clone());
self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op);
info!("scheduled layer file upload {layer_file_name}");
// Launch the task immediately, if possible
self.launch_queued_tasks(upload_queue);
Ok(())
}
/// Launch a delete operation in the background.
///
- /// The operation does not modify local state but assumes the local files have already been
- /// deleted, and is used to mirror those changes to remote.
+ /// The operation does not modify local filesystem state.
///
/// Note: This schedules an index file upload before the deletions. The
- deletion won't actually be performed, until any previously scheduled
+ deletion won't actually be performed, until all previously scheduled
/// upload operations, and the index file upload, have completed
/// successfully.
pub fn schedule_layer_file_deletion(
@@ -667,61 +672,133 @@ impl RemoteTimelineClient {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
let with_generations =
self.schedule_unlinking_of_layers_from_index_part0(upload_queue, &names);
self.schedule_deletion_of_unlinked0(upload_queue, with_generations);
// Launch the tasks immediately, if possible
self.launch_queued_tasks(upload_queue);
Ok(())
}
/// Unlinks the layer files from `index_part.json` but does not yet schedule deletion for the
/// layer files, leaving them dangling.
///
/// The files will be leaked in remote storage unless [`Self::schedule_deletion_of_unlinked`]
/// is invoked on them.
#[allow(unused)] // will be used by PR#4938
pub(crate) fn schedule_unlinking_of_layers_from_index_part(
self: &Arc<Self>,
names: Vec<LayerFileName>,
) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
// just forget the return value; after uploading the next index_part.json, we can consider
// the layer files as "dangling". this is fine however.
self.schedule_unlinking_of_layers_from_index_part0(upload_queue, &names);
self.launch_queued_tasks(upload_queue);
Ok(())
}
/// Update the remote index file, removing the to-be-deleted files from the index,
/// allowing scheduling of actual deletions later.
fn schedule_unlinking_of_layers_from_index_part0(
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
names: &[LayerFileName],
) -> Vec<(LayerFileName, Generation)> {
// Deleting layers doesn't affect the values stored in TimelineMetadata,
// so we don't need update it. Just serialize it.
let metadata = upload_queue.latest_metadata.clone();
// Update the remote index file, removing the to-be-deleted files from the index,
// before deleting the actual files.
//
// Once we start removing files from upload_queue.latest_files, there's
// no going back! Otherwise, some of the files would already be removed
// from latest_files, but not yet scheduled for deletion. Use a closure
// to syntactically forbid ? or bail! calls here.
let no_bail_here = || {
// Decorate our list of names with each name's generation, dropping
// makes that are unexpectedly missing from our metadata.
let with_generations: Vec<_> = names
.into_iter()
.filter_map(|name| {
// Remove from latest_files, learning the file's remote generation in the process
let meta = upload_queue.latest_files.remove(&name);
// Decorate our list of names with each name's generation, dropping
// makes that are unexpectedly missing from our metadata.
let with_generations: Vec<_> = names
.iter()
.filter_map(|name| {
// Remove from latest_files, learning the file's remote generation in the process
let meta = upload_queue.latest_files.remove(name);
if let Some(meta) = meta {
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
Some((name, meta.generation))
} else {
// This can only happen if we forgot to schedule the file upload
// before scheduling the delete. Log it because it is a rare/strange
// situation, and in case something is misbehaving, we'd like to know which
// layers experienced this.
info!(
"Deleting layer {name} not found in latest_files list, never uploaded?"
);
None
}
})
.collect();
if let Some(meta) = meta {
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
Some((name.to_owned(), meta.generation))
} else {
// This can only happen if we forgot to schedule the file upload
// before scheduling the delete. Log it because it is a rare/strange
// situation, and in case something is misbehaving, we'd like to know which
// layers experienced this.
info!("Deleting layer {name} not found in latest_files list, never uploaded?");
None
}
})
.collect();
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
self.schedule_index_upload(upload_queue, metadata);
}
// after unlinking files from the upload_queue.latest_files we must always schedule an
// index_part update, because that needs to be uploaded before we can actually delete the
// files.
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
self.schedule_index_upload(upload_queue, metadata);
}
for (name, gen) in &with_generations {
info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
}
with_generations
}
// schedule the actual deletions
let op = UploadOp::Delete(Delete {
layers: with_generations,
});
self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op);
/// Schedules deletion for layer files which have previously been unlinked from the
/// `index_part.json` with [`Self::schedule_unlinking_of_layers_from_index_part`].
#[allow(unused)] // will be used by Layer::drop in PR#4938
pub(crate) fn schedule_deletion_of_unlinked(
self: &Arc<Self>,
layers: Vec<(LayerFileName, Generation)>,
) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
self.schedule_deletion_of_unlinked0(upload_queue, layers);
self.launch_queued_tasks(upload_queue);
Ok(())
}
fn schedule_deletion_of_unlinked0(
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
with_generations: Vec<(LayerFileName, Generation)>,
) {
for (name, gen) in &with_generations {
info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
}
// schedule the actual deletions
let op = UploadOp::Delete(Delete {
layers: with_generations,
});
self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op);
}
/// Schedules a compaction update to the remote `index_part.json`.
///
/// `compacted_from` represents the L0 names which have been `compacted_to` L1 layers.
pub(crate) fn schedule_compaction_update(
self: &Arc<Self>,
compacted_from: &[LayerFileName],
compacted_to: &[(LayerFileName, LayerFileMetadata)],
) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
for (name, m) in compacted_to {
self.schedule_layer_file_upload0(upload_queue, name, m);
}
let with_generations =
self.schedule_unlinking_of_layers_from_index_part0(upload_queue, compacted_from);
self.schedule_deletion_of_unlinked0(upload_queue, with_generations);
self.launch_queued_tasks(upload_queue);
// Launch the tasks immediately, if possible
self.launch_queued_tasks(upload_queue);
};
no_bail_here();
Ok(())
}
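In other words, a compaction update is an upload of the new L1 layers, an index upload that swaps the L0 references for L1 references, and finally the L0 deletions. A runnable toy of that ordering, with made-up layer names:

fn main() {
    let compacted_from = ["L0-000", "L0-001", "L0-002"]; // inputs, to be removed
    let compacted_to = ["L1-000"]; // merged output, to be uploaded

    // Ordering mirrors schedule_compaction_update: uploads first, then the
    // index update that unlinks the inputs, then their deletions.
    for l1 in compacted_to {
        println!("schedule upload of {l1}");
    }
    println!("schedule index upload unlinking {compacted_from:?}");
    for l0 in compacted_from {
        println!("schedule deletion of {l0}");
    }
}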

View File

@@ -60,6 +60,8 @@ pub(super) async fn upload_timeline_layer<'a>(
bail!("failpoint before-upload-layer")
});
pausable_failpoint!("before-upload-layer-pausable");
let storage_path = remote_path(conf, source_path, generation)?;
let source_file_res = fs::File::open(&source_path).await;
let source_file = match source_file_res {

View File

@@ -844,6 +844,49 @@ impl Drop for DeltaLayerWriter {
}
}
impl DeltaLayer {
/// Assume the file at `path` is corrupt if this function returns with an error.
pub(crate) async fn rewrite_tenant_timeline(
path: &Utf8Path,
new_tenant: TenantId,
new_timeline: TimelineId,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let file = VirtualFile::open_with_options(
path,
&*std::fs::OpenOptions::new().read(true).write(true),
)
.await
.with_context(|| format!("Failed to open file '{}'", path))?;
let file = FileBlockReader::new(file);
let summary_blk = file.read_blk(0, ctx).await?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
let mut file = file.file;
if actual_summary.magic != DELTA_FILE_MAGIC {
bail!("File '{}' is not a delta layer", path);
}
let new_summary = Summary {
tenant_id: new_tenant,
timeline_id: new_timeline,
..actual_summary
};
let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
Summary::ser_into(&new_summary, &mut buf)?;
if buf.spilled() {
// The code in ImageLayerWriterInner just warn!()s for this.
// It should probably error out as well.
anyhow::bail!(
"Used more than one page size for summary buffer: {}",
buf.len()
);
}
file.seek(SeekFrom::Start(0)).await?;
file.write_all(&buf).await?;
Ok(())
}
}
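`rewrite_tenant_timeline` patches only the `Summary` header at the start of the file, leaving the layer body untouched, which is what makes cheap tenant duplication possible. Below is a standalone sketch of the same in-place header-rewrite technique, using `std::fs` and a toy fixed-layout header instead of `VirtualFile` and `Summary`; the format, magic value, and field names are made up:

use std::fs::OpenOptions;
use std::io::{Read, Seek, SeekFrom, Write};

const MAGIC: u32 = 0x5A5A_5A5A;

fn rewrite_header(path: &str, new_owner: u64) -> std::io::Result<()> {
    let mut f = OpenOptions::new().read(true).write(true).open(path)?;

    // Read and validate the existing header (12 bytes: magic + owner id).
    let mut hdr = [0u8; 12];
    f.read_exact(&mut hdr)?;
    let magic = u32::from_le_bytes(hdr[0..4].try_into().unwrap());
    if magic != MAGIC {
        return Err(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            "not a layer file",
        ));
    }

    // Patch only the owner field; keep everything else as-is.
    hdr[4..12].copy_from_slice(&new_owner.to_le_bytes());

    // Overwrite the header in place; the file body is untouched.
    f.seek(SeekFrom::Start(0))?;
    f.write_all(&hdr)?;
    Ok(())
}

fn main() -> std::io::Result<()> {
    let path = "toy.layer";
    let mut f = std::fs::File::create(path)?;
    f.write_all(&MAGIC.to_le_bytes())?;
    f.write_all(&1u64.to_le_bytes())?;
    drop(f);
    rewrite_header(path, 42)
}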
impl DeltaLayerInner {
pub(super) async fn load(
path: &Utf8Path,

View File

@@ -436,6 +436,49 @@ impl ImageLayer {
}
}
impl ImageLayer {
/// Assume the file at `path` is corrupt if this function returns with an error.
pub(crate) async fn rewrite_tenant_timeline(
path: &Utf8Path,
new_tenant: TenantId,
new_timeline: TimelineId,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let file = VirtualFile::open_with_options(
path,
&*std::fs::OpenOptions::new().read(true).write(true),
)
.await
.with_context(|| format!("Failed to open file '{}'", path))?;
let file = FileBlockReader::new(file);
let summary_blk = file.read_blk(0, ctx).await?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
let mut file = file.file;
if actual_summary.magic != IMAGE_FILE_MAGIC {
bail!("File '{}' is not a delta layer", path);
}
let new_summary = Summary {
tenant_id: new_tenant,
timeline_id: new_timeline,
..actual_summary
};
let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
Summary::ser_into(&new_summary, &mut buf)?;
if buf.spilled() {
// The code in ImageLayerWriterInner just warn!()s for this.
// It should probably error out as well.
anyhow::bail!(
"Used more than one page size for summary buffer: {}",
buf.len()
);
}
file.seek(SeekFrom::Start(0)).await?;
file.write_all(&buf).await?;
Ok(())
}
}
impl ImageLayerInner {
pub(super) async fn load(
path: &Utf8Path,

View File

@@ -159,11 +159,8 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
// TODO: we shouldn't need to await to find tenant and this could be moved outside of
// loop, #3501. There are also additional "allowed_errors" in tests.
if first {
first = false;
if random_init_delay(period, &cancel).await.is_err() {
break;
}
if first && random_init_delay(period, &cancel).await.is_err() {
break;
}
let started_at = Instant::now();
@@ -183,7 +180,16 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
}
};
warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction);
if !first {
// The first iteration is typically much slower, because all tenants compete for the
// compaction semaphore to run, and because of concurrent startup work like initializing
// logical sizes. To avoid routinely spamming warnings, we suppress this log on the first iteration.
warn_when_period_overrun(
started_at.elapsed(),
period,
BackgroundLoopKind::Compaction,
);
}
// Sleep
if tokio::time::timeout(sleep_duration, cancel.cancelled())
@@ -192,6 +198,8 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
{
break;
}
first = false;
}
}
.await;
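The loop shape here: a randomized initial delay spreads tenants out at startup, and the period-overrun warning is suppressed on the first iteration because startup contention makes it routinely slow. A synchronous toy of that structure (the real code is async, cancellation-aware, and uses `random_init_delay`):

use std::time::{Duration, Instant};

fn main() {
    let period = Duration::from_millis(50);
    let mut first = true;
    for _ in 0..3 {
        if first {
            // stand-in for random_init_delay(period, &cancel).await
            std::thread::sleep(period / 2);
        }
        let started_at = Instant::now();
        std::thread::sleep(period * 2); // simulated work that overruns the period
        if !first {
            // stand-in for warn_when_period_overrun(...)
            let elapsed = started_at.elapsed();
            if elapsed > period {
                eprintln!("warning: iteration took {elapsed:?}, period is {period:?}");
            }
        }
        first = false; // cleared at the end, after the sleep in the real loop
    }
}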
@@ -223,11 +231,8 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
let period = tenant.get_gc_period();
if first {
first = false;
if random_init_delay(period, &cancel).await.is_err() {
break;
}
if first && random_init_delay(period, &cancel).await.is_err() {
break;
}
let started_at = Instant::now();
@@ -251,7 +256,12 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
}
};
warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);
if !first {
// The first iteration is typically much slower, because all tenants compete for the
// compaction semaphore to run, and because of concurrent startup work like initializing
// logical sizes. To avoid routinely spamming warnings, we suppress this log on the first iteration.
warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);
}
// Sleep
if tokio::time::timeout(sleep_duration, cancel.cancelled())
@@ -260,6 +270,8 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
{
break;
}
first = false;
}
}
.await;

View File

@@ -1871,7 +1871,7 @@ impl Timeline {
"loaded layer map with {} layers at {}, total physical size: {}",
num_layers, disk_consistent_lsn, total_physical_size
);
self.metrics.resident_physical_size_set(total_physical_size);
self.metrics.resident_physical_size_add(total_physical_size);
timer.stop_and_record();
Ok(())
@@ -2793,10 +2793,13 @@ impl Timeline {
)
};
let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1);
let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
// The new on-disk layers are now in the layer map. We can remove the
// in-memory layer from the map now. The flushed layer is stored in
// the mapping in `create_delta_layer`.
{
let metadata = {
let mut guard = self.layers.write().await;
if let Some(ref l) = delta_layer_to_add {
@@ -2812,8 +2815,17 @@ impl Timeline {
}
guard.finish_flush_l0_layer(delta_layer_to_add, &frozen_layer);
if disk_consistent_lsn != old_disk_consistent_lsn {
assert!(disk_consistent_lsn > old_disk_consistent_lsn);
self.disk_consistent_lsn.store(disk_consistent_lsn);
// Schedule remote uploads that will reflect our new disk_consistent_lsn
Some(self.schedule_uploads(disk_consistent_lsn, layer_paths_to_upload)?)
} else {
None
}
// release lock on 'layers'
}
};
// FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`,
// a compaction can delete the file and then it won't be available for uploads any more.
@@ -2829,28 +2841,22 @@ impl Timeline {
//
// TODO: This perhaps should be done in 'flush_frozen_layers', after flushing
// *all* the layers, to avoid fsyncing the file multiple times.
let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1);
let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
// If we were able to advance 'disk_consistent_lsn', save it in the metadata file.
// After crash, we will restart WAL streaming and processing from that point.
if disk_consistent_lsn != old_disk_consistent_lsn {
assert!(disk_consistent_lsn > old_disk_consistent_lsn);
self.update_metadata_file(disk_consistent_lsn, layer_paths_to_upload)
// If we updated our disk_consistent_lsn, persist the updated metadata to local disk.
if let Some(metadata) = metadata {
save_metadata(self.conf, &self.tenant_id, &self.timeline_id, &metadata)
.await
.context("update_metadata_file")?;
// Also update the in-memory copy
self.disk_consistent_lsn.store(disk_consistent_lsn);
.context("save_metadata")?;
}
Ok(())
}
/// Update metadata file
async fn update_metadata_file(
fn schedule_uploads(
&self,
disk_consistent_lsn: Lsn,
layer_paths_to_upload: HashMap<LayerFileName, LayerFileMetadata>,
) -> anyhow::Result<()> {
) -> anyhow::Result<TimelineMetadata> {
// We can only save a valid 'prev_record_lsn' value on disk if we
// flushed *all* in-memory changes to disk. We only track
// 'prev_record_lsn' in memory for the latest processed record, so we
@@ -2887,10 +2893,6 @@ impl Timeline {
x.unwrap()
));
save_metadata(self.conf, &self.tenant_id, &self.timeline_id, &metadata)
.await
.context("save_metadata")?;
if let Some(remote_client) = &self.remote_client {
for (path, layer_metadata) in layer_paths_to_upload {
remote_client.schedule_layer_file_upload(&path, &layer_metadata)?;
@@ -2898,6 +2900,20 @@ impl Timeline {
remote_client.schedule_index_upload_for_metadata_update(&metadata)?;
}
Ok(metadata)
}
async fn update_metadata_file(
&self,
disk_consistent_lsn: Lsn,
layer_paths_to_upload: HashMap<LayerFileName, LayerFileMetadata>,
) -> anyhow::Result<()> {
let metadata = self.schedule_uploads(disk_consistent_lsn, layer_paths_to_upload)?;
save_metadata(self.conf, &self.tenant_id, &self.timeline_id, &metadata)
.await
.context("save_metadata")?;
Ok(())
}
@@ -3063,6 +3079,7 @@ impl Timeline {
Ok(false)
}
#[tracing::instrument(skip_all, fields(%lsn, %force))]
async fn create_image_layers(
&self,
partitioning: &KeyPartitioning,
@@ -3853,22 +3870,21 @@ impl Timeline {
// now, we just skip the file to avoid unintentional modification to files on the disk and in the layer map.
let mut duplicated_layers = HashSet::new();
let mut uploaded_layers = Vec::with_capacity(new_layers.len());
let mut insert_layers = Vec::new();
let mut remove_layers = Vec::new();
for l in new_layers {
for l in &new_layers {
let new_delta_path = l.path();
let metadata = new_delta_path.metadata().with_context(|| {
format!("read file metadata for new created layer {new_delta_path}")
})?;
if let Some(remote_client) = &self.remote_client {
remote_client.schedule_layer_file_upload(
&l.filename(),
&LayerFileMetadata::new(metadata.len(), self.generation),
)?;
}
uploaded_layers.push((
l.filename(),
LayerFileMetadata::new(metadata.len(), self.generation),
));
// update metrics, including the timeline's physical size
self.metrics.record_new_file_metrics(metadata.len());
@@ -3881,7 +3897,7 @@ impl Timeline {
LayerResidenceStatus::Resident,
LayerResidenceEventReason::LayerCreate,
);
let l = l as Arc<dyn PersistentLayer>;
let l = l.to_owned() as Arc<dyn PersistentLayer>;
if guard.contains(&l) {
tracing::error!(layer=%l, "duplicated L1 layer");
duplicated_layers.insert(l.layer_desc().key());
@@ -3913,13 +3929,12 @@ impl Timeline {
&self.metrics,
)?;
drop_wlock(guard);
// Also schedule the deletions in remote storage
if let Some(remote_client) = &self.remote_client {
remote_client.schedule_layer_file_deletion(layer_names_to_delete)?;
if let Some(remote_client) = self.remote_client.as_ref() {
remote_client.schedule_compaction_update(&layer_names_to_delete, &uploaded_layers)?;
}
drop_wlock(guard);
Ok(())
}

View File

@@ -26,8 +26,7 @@ use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey;
use storage_broker::proto::SafekeeperTimelineInfo;
use storage_broker::proto::SubscribeSafekeeperInfoRequest;
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
use storage_broker::BrokerClientChannel;
use storage_broker::Streaming;
use storage_broker::{BrokerClientChannel, Code, Streaming};
use tokio::select;
use tracing::*;
@@ -137,8 +136,17 @@ pub(super) async fn connection_manager_loop_step(
broker_update = broker_subscription.message() => {
match broker_update {
Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update),
Err(e) => {
error!("broker subscription failed: {e}");
Err(status) => {
match status.code() {
Code::Unknown if status.message().contains("stream closed because of a broken pipe") => {
// tonic's error handling doesn't provide a clear code for disconnections: we get
// "h2 protocol error: error reading a body from connection: stream closed because of a broken pipe"
info!("broker disconnected: {status}");
},
_ => {
warn!("broker subscription failed: {status}");
}
}
return ControlFlow::Continue(());
}
Ok(None) => {

View File

@@ -18,7 +18,8 @@ use std::fs::{self, File, OpenOptions};
use std::io::{Error, ErrorKind, Seek, SeekFrom};
use std::os::unix::fs::FileExt;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::{RwLock, RwLockWriteGuard};
use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
use tokio::time::Instant;
///
/// A virtual file descriptor. You can use this just like std::fs::File, but internally
@@ -110,7 +111,7 @@ impl OpenFiles {
///
/// On return, we hold a lock on the slot, its 'tag' has been updated, and
/// recently_used has been set. It's all ready for reuse.
fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard<SlotInner>) {
async fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard<SlotInner>) {
//
// Run the clock algorithm to find a slot to replace.
//
@@ -142,7 +143,7 @@ impl OpenFiles {
}
retries += 1;
} else {
slot_guard = slot.inner.write().unwrap();
slot_guard = slot.inner.write().await;
index = next;
break;
}
@@ -153,7 +154,7 @@ impl OpenFiles {
// old file.
//
if let Some(old_file) = slot_guard.file.take() {
// the normal path of dropping VirtualFile uses "close", use "close-by-replace" here to
// the normal path of dropping VirtualFile uses `Close`, use `CloseByReplace` here to
// distinguish the two.
STORAGE_IO_TIME_METRIC
.get(StorageIoOperation::CloseByReplace)
@@ -208,6 +209,29 @@ impl CrashsafeOverwriteError {
}
}
/// Observe duration for the given storage I/O operation
///
/// Unlike `observe_closure_duration`, this supports async,
/// where "support" means that we measure wall clock time.
macro_rules! observe_duration {
($op:expr, $($body:tt)*) => {{
let instant = Instant::now();
let result = $($body)*;
let elapsed = instant.elapsed().as_secs_f64();
STORAGE_IO_TIME_METRIC
.get($op)
.observe(elapsed);
result
}}
}
macro_rules! with_file {
($this:expr, $op:expr, | $ident:ident | $($body:tt)*) => {{
let $ident = $this.lock_file().await?;
observe_duration!($op, $($body)*)
}};
}
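The doc comment on `lock_file` below explains why these are macros rather than async closures. As a minimal standalone illustration of the `observe_duration!` pattern, wall-clock timing around an arbitrary expression (the label and the `println!` sink are stand-ins for the real metrics registry):

use std::time::Instant;

macro_rules! observe_duration {
    ($label:expr, $($body:tt)*) => {{
        let start = Instant::now();
        let result = $($body)*;
        // stand-in for STORAGE_IO_TIME_METRIC.get(op).observe(elapsed)
        println!("{} took {:?}", $label, start.elapsed());
        result
    }};
}

fn main() {
    let v: Vec<u32> = observe_duration!("collect", (0..1_000_000).collect());
    println!("len = {}", v.len());
}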
impl VirtualFile {
/// Open a file in read-only mode. Like File::open.
pub async fn open(path: &Utf8Path) -> Result<VirtualFile, std::io::Error> {
@@ -244,11 +268,9 @@ impl VirtualFile {
tenant_id = "*".to_string();
timeline_id = "*".to_string();
}
let (handle, mut slot_guard) = get_open_files().find_victim_slot();
let (handle, mut slot_guard) = get_open_files().find_victim_slot().await;
let file = STORAGE_IO_TIME_METRIC
.get(StorageIoOperation::Open)
.observe_closure_duration(|| open_options.open(path))?;
let file = observe_duration!(StorageIoOperation::Open, open_options.open(path))?;
// Strip all options other than read and write.
//
@@ -331,22 +353,24 @@ impl VirtualFile {
/// Call File::sync_all() on the underlying File.
pub async fn sync_all(&self) -> Result<(), Error> {
self.with_file(StorageIoOperation::Fsync, |file| file.sync_all())
.await?
with_file!(self, StorageIoOperation::Fsync, |file| file
.as_ref()
.sync_all())
}
pub async fn metadata(&self) -> Result<fs::Metadata, Error> {
self.with_file(StorageIoOperation::Metadata, |file| file.metadata())
.await?
with_file!(self, StorageIoOperation::Metadata, |file| file
.as_ref()
.metadata())
}
/// Helper function that looks up the underlying File for this VirtualFile,
/// opening it and evicting some other File if necessary. It calls 'func'
/// with the physical File.
async fn with_file<F, R>(&self, op: StorageIoOperation, mut func: F) -> Result<R, Error>
where
F: FnMut(&File) -> R,
{
/// Helper function internal to `VirtualFile` that looks up the underlying File,
/// opening it and evicting some other File if necessary, and returns a guard
/// through which the physical `File` can be accessed.
///
/// Callers wrap it in the `with_file!` macro, because Rust doesn't support
/// async closures that take borrowed parameters with lifetimes.
async fn lock_file(&self) -> Result<FileGuard<'_>, Error> {
let open_files = get_open_files();
let mut handle_guard = {
@@ -356,27 +380,23 @@ impl VirtualFile {
// We only need to hold the handle lock while we read the current handle. If
// another thread closes the file and recycles the slot for a different file,
// we will notice that the handle we read is no longer valid and retry.
let mut handle = *self.handle.read().unwrap();
let mut handle = *self.handle.read().await;
loop {
// Check if the slot contains our File
{
let slot = &open_files.slots[handle.index];
let slot_guard = slot.inner.read().unwrap();
if slot_guard.tag == handle.tag {
if let Some(file) = &slot_guard.file {
// Found a cached file descriptor.
slot.recently_used.store(true, Ordering::Relaxed);
return Ok(STORAGE_IO_TIME_METRIC
.get(op)
.observe_closure_duration(|| func(file)));
}
let slot_guard = slot.inner.read().await;
if slot_guard.tag == handle.tag && slot_guard.file.is_some() {
// Found a cached file descriptor.
slot.recently_used.store(true, Ordering::Relaxed);
return Ok(FileGuard { slot_guard });
}
}
// The slot didn't contain our File. We will have to open it ourselves,
// but before that, grab a write lock on handle in the VirtualFile, so
// that no other thread will try to concurrently open the same file.
let handle_guard = self.handle.write().unwrap();
let handle_guard = self.handle.write().await;
// If another thread changed the handle while we were not holding the lock,
// then the handle might now be valid again. Loop back to retry.
@@ -390,17 +410,10 @@ impl VirtualFile {
// We need to open the file ourselves. The handle in the VirtualFile is
// now locked in write-mode. Find a free slot to put it in.
let (handle, mut slot_guard) = open_files.find_victim_slot();
let (handle, mut slot_guard) = open_files.find_victim_slot().await;
// Open the physical file
let file = STORAGE_IO_TIME_METRIC
.get(StorageIoOperation::Open)
.observe_closure_duration(|| self.open_options.open(&self.path))?;
// Perform the requested operation on it
let result = STORAGE_IO_TIME_METRIC
.get(op)
.observe_closure_duration(|| func(&file));
let file = observe_duration!(StorageIoOperation::Open, self.open_options.open(&self.path))?;
// Store the File in the slot and update the handle in the VirtualFile
// to point to it.
@@ -408,7 +421,9 @@ impl VirtualFile {
*handle_guard = handle;
Ok(result)
return Ok(FileGuard {
slot_guard: slot_guard.downgrade(),
});
}
pub fn remove(self) {
@@ -423,11 +438,9 @@ impl VirtualFile {
self.pos = offset;
}
SeekFrom::End(offset) => {
self.pos = self
.with_file(StorageIoOperation::Seek, |mut file| {
file.seek(SeekFrom::End(offset))
})
.await??
self.pos = with_file!(self, StorageIoOperation::Seek, |file| file
.as_ref()
.seek(SeekFrom::End(offset)))?
}
SeekFrom::Current(offset) => {
let pos = self.pos as i128 + offset as i128;
@@ -515,9 +528,9 @@ impl VirtualFile {
}
pub async fn read_at(&self, buf: &mut [u8], offset: u64) -> Result<usize, Error> {
let result = self
.with_file(StorageIoOperation::Read, |file| file.read_at(buf, offset))
.await?;
let result = with_file!(self, StorageIoOperation::Read, |file| file
.as_ref()
.read_at(buf, offset));
if let Ok(size) = result {
STORAGE_IO_SIZE
.with_label_values(&["read", &self.tenant_id, &self.timeline_id])
@@ -527,9 +540,9 @@ impl VirtualFile {
}
async fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> {
let result = self
.with_file(StorageIoOperation::Write, |file| file.write_at(buf, offset))
.await?;
let result = with_file!(self, StorageIoOperation::Write, |file| file
.as_ref()
.write_at(buf, offset));
if let Ok(size) = result {
STORAGE_IO_SIZE
.with_label_values(&["write", &self.tenant_id, &self.timeline_id])
@@ -539,6 +552,18 @@ impl VirtualFile {
}
}
struct FileGuard<'a> {
slot_guard: RwLockReadGuard<'a, SlotInner>,
}
impl<'a> AsRef<File> for FileGuard<'a> {
fn as_ref(&self) -> &File {
// This unwrap is safe because we only create `FileGuard`s
// if we know that the file is Some.
self.slot_guard.file.as_ref().unwrap()
}
}
#[cfg(test)]
impl VirtualFile {
pub(crate) async fn read_blk(
@@ -571,20 +596,39 @@ impl VirtualFile {
impl Drop for VirtualFile {
/// If a VirtualFile is dropped, close the underlying file if it was open.
fn drop(&mut self) {
let handle = self.handle.get_mut().unwrap();
let handle = self.handle.get_mut();
// We could check with a read-lock first, to avoid waiting on an
// unrelated I/O.
let slot = &get_open_files().slots[handle.index];
let mut slot_guard = slot.inner.write().unwrap();
if slot_guard.tag == handle.tag {
slot.recently_used.store(false, Ordering::Relaxed);
// there is also operation "close-by-replace" for closes done on eviction for
// comparison.
STORAGE_IO_TIME_METRIC
.get(StorageIoOperation::Close)
.observe_closure_duration(|| drop(slot_guard.file.take()));
fn clean_slot(slot: &Slot, mut slot_guard: RwLockWriteGuard<'_, SlotInner>, tag: u64) {
if slot_guard.tag == tag {
slot.recently_used.store(false, Ordering::Relaxed);
// there is also the `CloseByReplace` operation for closes done on eviction for
// comparison.
STORAGE_IO_TIME_METRIC
.get(StorageIoOperation::Close)
.observe_closure_duration(|| drop(slot_guard.file.take()));
}
}
// We don't have async drop so we cannot directly await the lock here.
// Instead, first do a best-effort attempt at closing the underlying
// file descriptor by using `try_write`, and if that fails, spawn
// a tokio task to do it asynchronously: we just want it to be
// cleaned up eventually.
// Most of the time, the `try_write` should succeed though,
// as we have `&mut self` access. In other words, if the slot
// is still occupied by our file, there should be no access from
// other I/O operations; the only other possible place to lock
// the slot is the clock algorithm looking for free slots.
let slot = &get_open_files().slots[handle.index];
if let Ok(slot_guard) = slot.inner.try_write() {
clean_slot(slot, slot_guard, handle.tag);
} else {
let tag = handle.tag;
tokio::spawn(async move {
let slot_guard = slot.inner.write().await;
clean_slot(slot, slot_guard, tag);
});
};
}
}
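Since Rust has no async `Drop`, the code above takes the write lock with `try_write` and, failing that, spawns a task to clean up eventually. A self-contained sketch of the same pattern (the toy `Slot` here just holds an `Option<String>`; the real slot holds an open `File` in a static table, and the spawn requires a running tokio runtime):

use std::sync::Arc;
use tokio::sync::RwLock;

struct Slot {
    inner: Arc<RwLock<Option<String>>>,
}

impl Drop for Slot {
    fn drop(&mut self) {
        if let Ok(mut guard) = self.inner.try_write() {
            // Fast path: nobody else holds the lock; clean up inline.
            guard.take();
        } else {
            // Slow path: defer the cleanup to a spawned task.
            let inner = Arc::clone(&self.inner);
            tokio::spawn(async move {
                inner.write().await.take();
            });
        }
    }
}

#[tokio::main]
async fn main() {
    let slot = Slot {
        inner: Arc::new(RwLock::new(Some("open file".to_string()))),
    };
    drop(slot); // takes the fast path here, since no one else holds the lock
}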

poetry.lock (generated)
View File

@@ -2447,20 +2447,20 @@ test = ["websockets"]
[[package]]
name = "werkzeug"
version = "2.2.3"
version = "3.0.1"
description = "The comprehensive WSGI web application library."
optional = false
python-versions = ">=3.7"
python-versions = ">=3.8"
files = [
{file = "Werkzeug-2.2.3-py3-none-any.whl", hash = "sha256:56433961bc1f12533306c624f3be5e744389ac61d722175d543e1751285da612"},
{file = "Werkzeug-2.2.3.tar.gz", hash = "sha256:2e1ccc9417d4da358b9de6f174e3ac094391ea1d4fbef2d667865d819dfd0afe"},
{file = "werkzeug-3.0.1-py3-none-any.whl", hash = "sha256:90a285dc0e42ad56b34e696398b8122ee4c681833fb35b8334a095d82c56da10"},
{file = "werkzeug-3.0.1.tar.gz", hash = "sha256:507e811ecea72b18a404947aded4b3390e1db8f826b494d76550ef45bb3b1dcc"},
]
[package.dependencies]
MarkupSafe = ">=2.1.1"
[package.extras]
watchdog = ["watchdog"]
watchdog = ["watchdog (>=2.3)"]
[[package]]
name = "wrapt"
@@ -2719,4 +2719,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "c5981d8d7c2deadd47c823bc35f86f830c8e320b653d2d3718bade1f4d2dabca"
content-hash = "74649cf47c52f21b01b096a42044750b1c9677576b405be0489c2909127a9bf1"

View File

@@ -3,7 +3,9 @@ mod hacks;
mod link;
pub use link::LinkAuthError;
use tokio_postgres::config::AuthKeys;
use crate::proxy::{handle_try_wake, retry_after};
use crate::{
auth::{self, ClientCredentials},
config::AuthenticationConfig,
@@ -16,8 +18,9 @@ use crate::{
};
use futures::TryFutureExt;
use std::borrow::Cow;
use std::ops::ControlFlow;
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::info;
use tracing::{error, info, warn};
/// A product of successful authentication.
pub struct AuthSuccess<T> {
@@ -117,22 +120,27 @@ impl<'a, T, E> BackendType<'a, Result<T, E>> {
}
}
pub enum ComputeCredentials {
Password(Vec<u8>),
AuthKeys(AuthKeys),
}
/// True to its name, this function encapsulates our current auth trade-offs.
/// Here, we choose the appropriate auth flow based on circumstances.
async fn auth_quirks(
async fn auth_quirks_creds(
api: &impl console::Api,
extra: &ConsoleReqExtra<'_>,
creds: &mut ClientCredentials<'_>,
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
allow_cleartext: bool,
config: &'static AuthenticationConfig,
) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
) -> auth::Result<AuthSuccess<ComputeCredentials>> {
// If there's no project so far, that entails that client doesn't
// support SNI or other means of passing the endpoint (project) name.
// We now expect to see a very specific payload in the place of password.
if creds.project.is_none() {
// Password will be checked by the compute node later.
return hacks::password_hack(api, extra, creds, client).await;
return hacks::password_hack(creds, client).await;
}
// Password hack should set the project name.
@@ -143,13 +151,55 @@ async fn auth_quirks(
// Currently, we use it for websocket connections (latency).
if allow_cleartext {
// Password will be checked by the compute node later.
return hacks::cleartext_hack(api, extra, creds, client).await;
return hacks::cleartext_hack(client).await;
}
// Finally, proceed with the main auth flow (SCRAM-based).
classic::authenticate(api, extra, creds, client, config).await
}
/// True to its name, this function encapsulates our current auth trade-offs.
/// Here, we choose the appropriate auth flow based on circumstances.
async fn auth_quirks(
api: &impl console::Api,
extra: &ConsoleReqExtra<'_>,
creds: &mut ClientCredentials<'_>,
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
allow_cleartext: bool,
config: &'static AuthenticationConfig,
) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
let auth_stuff = auth_quirks_creds(api, extra, creds, client, allow_cleartext, config).await?;
let mut num_retries = 0;
let mut node = loop {
let wake_res = api.wake_compute(extra, creds).await;
match handle_try_wake(wake_res, num_retries) {
Err(e) => {
error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
return Err(e.into());
}
Ok(ControlFlow::Continue(e)) => {
warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
}
Ok(ControlFlow::Break(n)) => break n,
}
let wait_duration = retry_after(num_retries);
num_retries += 1;
tokio::time::sleep(wait_duration).await;
};
match auth_stuff.value {
ComputeCredentials::Password(password) => node.config.password(password),
ComputeCredentials::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys),
};
Ok(AuthSuccess {
reported_auth_ok: auth_stuff.reported_auth_ok,
value: node,
})
}
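A minimal sketch of the retry loop's shape: bounded by `handle_try_wake`'s error path, with exponential backoff between attempts. The real `handle_try_wake`/`retry_after` live in `crate::proxy` and aren't shown in this diff, so the backoff policy and the stub below are assumptions for illustration:

use std::ops::ControlFlow;
use std::time::Duration;

// Assumed policy: 100ms, 200ms, 400ms, ...
fn retry_after(num_retries: u32) -> Duration {
    Duration::from_millis(100) * 2u32.pow(num_retries)
}

// Stub: pretend the compute node becomes reachable on the third attempt.
fn try_wake(attempt: u32) -> ControlFlow<&'static str, ()> {
    if attempt >= 2 {
        ControlFlow::Break("node")
    } else {
        ControlFlow::Continue(())
    }
}

fn main() {
    let mut num_retries = 0;
    let node = loop {
        match try_wake(num_retries) {
            ControlFlow::Break(n) => break n,
            ControlFlow::Continue(()) => {
                eprintln!("couldn't wake compute node, retry {num_retries}");
            }
        }
        let wait = retry_after(num_retries);
        num_retries += 1;
        std::thread::sleep(wait);
    };
    println!("connected to {node}");
}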
impl BackendType<'_, ClientCredentials<'_>> {
/// Get compute endpoint name from the credentials.
pub fn get_endpoint(&self) -> Option<String> {

View File

@@ -1,17 +1,14 @@
use std::ops::ControlFlow;
use super::AuthSuccess;
use super::{AuthSuccess, ComputeCredentials};
use crate::{
auth::{self, AuthFlow, ClientCredentials},
compute,
config::AuthenticationConfig,
console::{self, AuthInfo, CachedNodeInfo, ConsoleReqExtra},
proxy::{handle_try_wake, retry_after},
console::{self, AuthInfo, ConsoleReqExtra},
sasl, scram,
stream::PqStream,
};
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{error, info, warn};
use tracing::{info, warn};
pub(super) async fn authenticate(
api: &impl console::Api,
@@ -19,7 +16,7 @@ pub(super) async fn authenticate(
creds: &ClientCredentials<'_>,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
config: &'static AuthenticationConfig,
) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
) -> auth::Result<AuthSuccess<ComputeCredentials>> {
info!("fetching user's authentication info");
let info = api.get_auth_info(extra, creds).await?.unwrap_or_else(|| {
// If we don't have an authentication secret, we mock one to
@@ -66,38 +63,17 @@ pub(super) async fn authenticate(
}
};
Some(compute::ScramKeys {
compute::ScramKeys {
client_key: client_key.as_bytes(),
server_key: secret.server_key.as_bytes(),
})
}
}
};
let mut num_retries = 0;
let mut node = loop {
let wake_res = api.wake_compute(extra, creds).await;
match handle_try_wake(wake_res, num_retries) {
Err(e) => {
error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
return Err(e.into());
}
Ok(ControlFlow::Continue(e)) => {
warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
}
Ok(ControlFlow::Break(n)) => break n,
}
let wait_duration = retry_after(num_retries);
num_retries += 1;
tokio::time::sleep(wait_duration).await;
};
if let Some(keys) = scram_keys {
use tokio_postgres::config::AuthKeys;
node.config.auth_keys(AuthKeys::ScramSha256(keys));
}
Ok(AuthSuccess {
reported_auth_ok: false,
value: node,
value: ComputeCredentials::AuthKeys(tokio_postgres::config::AuthKeys::ScramSha256(
scram_keys,
)),
})
}

View File

@@ -1,10 +1,6 @@
use super::AuthSuccess;
use super::{AuthSuccess, ComputeCredentials};
use crate::{
auth::{self, AuthFlow, ClientCredentials},
console::{
self,
provider::{CachedNodeInfo, ConsoleReqExtra},
},
stream,
};
use tokio::io::{AsyncRead, AsyncWrite};
@@ -15,11 +11,8 @@ use tracing::{info, warn};
/// These properties are beneficial for serverless JS workers, so we
/// use this mechanism for websocket connections.
pub async fn cleartext_hack(
api: &impl console::Api,
extra: &ConsoleReqExtra<'_>,
creds: &mut ClientCredentials<'_>,
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
) -> auth::Result<AuthSuccess<ComputeCredentials>> {
warn!("cleartext auth flow override is enabled, proceeding");
let password = AuthFlow::new(client)
.begin(auth::CleartextPassword)
@@ -27,24 +20,19 @@ pub async fn cleartext_hack(
.authenticate()
.await?;
let mut node = api.wake_compute(extra, creds).await?;
node.config.password(password);
// Report tentative success; compute node will check the password anyway.
Ok(AuthSuccess {
reported_auth_ok: false,
value: node,
value: ComputeCredentials::Password(password),
})
}
/// Workaround for clients which don't provide an endpoint (project) name.
/// Very similar to [`cleartext_hack`], but there's a specific password format.
pub async fn password_hack(
api: &impl console::Api,
extra: &ConsoleReqExtra<'_>,
creds: &mut ClientCredentials<'_>,
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
) -> auth::Result<AuthSuccess<ComputeCredentials>> {
warn!("project not specified, resorting to the password hack auth flow");
let payload = AuthFlow::new(client)
.begin(auth::PasswordHack)
@@ -55,12 +43,9 @@ pub async fn password_hack(
info!(project = &payload.endpoint, "received missing parameter");
creds.project = Some(payload.endpoint);
let mut node = api.wake_compute(extra, creds).await?;
node.config.password(payload.password);
// Report tentative success; compute node will check the password anyway.
Ok(AuthSuccess {
reported_auth_ok: false,
value: node,
value: ComputeCredentials::Password(payload.password),
})
}

View File

@@ -4,10 +4,11 @@ use proxy::config::AuthenticationConfig;
use proxy::config::HttpConfig;
use proxy::console;
use proxy::http;
use proxy::metrics;
use proxy::usage_metrics;
use anyhow::bail;
use proxy::config::{self, ProxyConfig};
use proxy::serverless;
use std::pin::pin;
use std::{borrow::Cow, net::SocketAddr};
use tokio::net::TcpListener;
@@ -129,14 +130,16 @@ async fn main() -> anyhow::Result<()> {
cancellation_token.clone(),
));
if let Some(wss_address) = args.wss {
let wss_address: SocketAddr = wss_address.parse()?;
info!("Starting wss on {wss_address}");
let wss_listener = TcpListener::bind(wss_address).await?;
// TODO: rename the argument to something like serverless.
// It now covers more than just websockets, it also covers SQL over HTTP.
if let Some(serverless_address) = args.wss {
let serverless_address: SocketAddr = serverless_address.parse()?;
info!("Starting wss on {serverless_address}");
let serverless_listener = TcpListener::bind(serverless_address).await?;
client_tasks.spawn(http::websocket::task_main(
client_tasks.spawn(serverless::task_main(
config,
wss_listener,
serverless_listener,
cancellation_token.clone(),
));
}
@@ -144,11 +147,11 @@ async fn main() -> anyhow::Result<()> {
// maintenance tasks. these never return unless there's an error
let mut maintenance_tasks = JoinSet::new();
maintenance_tasks.spawn(proxy::handle_signals(cancellation_token));
maintenance_tasks.spawn(http::server::task_main(http_listener));
maintenance_tasks.spawn(http::health_server::task_main(http_listener));
maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener));
if let Some(metrics_config) = &config.metric_collection {
maintenance_tasks.spawn(metrics::task_main(metrics_config));
maintenance_tasks.spawn(usage_metrics::task_main(metrics_config));
}
let maintenance = loop {

View File

@@ -90,7 +90,11 @@ pub mod errors {
status: http::StatusCode::LOCKED,
ref text,
} => {
!text.contains("written data quota exceeded")
// written data quota exceeded
// data transfer quota exceeded
// compute time quota exceeded
// logical size quota exceeded
!text.contains("quota exceeded")
&& !text.contains("the limit for current plan reached")
}
// retry server errors
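The effect of the broadened predicate: any "... quota exceeded" message is now treated as non-retriable, not just the written-data one. A small runnable check of that logic, with the predicate extracted into a standalone function for illustration:

fn could_retry_locked(text: &str) -> bool {
    !text.contains("quota exceeded") && !text.contains("the limit for current plan reached")
}

fn main() {
    // All four quota variants from the comment above are non-retriable.
    for msg in [
        "written data quota exceeded",
        "data transfer quota exceeded",
        "compute time quota exceeded",
        "logical size quota exceeded",
        "the limit for current plan reached",
    ] {
        assert!(!could_retry_locked(msg));
    }
    // Other LOCKED errors remain retriable.
    assert!(could_retry_locked("some transient lock contention"));
    println!("predicate behaves as expected");
}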

View File

@@ -2,10 +2,7 @@
//! Other modules should use stuff from this module instead of
//! directly relying on deps like `reqwest` (think loose coupling).
pub mod conn_pool;
pub mod server;
pub mod sql_over_http;
pub mod websocket;
pub mod health_server;
use std::{sync::Arc, time::Duration};

View File

@@ -14,14 +14,15 @@ pub mod console;
pub mod error;
pub mod http;
pub mod logging;
pub mod metrics;
pub mod parse;
pub mod protocol2;
pub mod proxy;
pub mod sasl;
pub mod scram;
pub mod serverless;
pub mod stream;
pub mod url;
pub mod usage_metrics;
pub mod waiters;
/// Handle unix signals appropriately.

View File

@@ -8,9 +8,9 @@ use crate::{
config::{AuthenticationConfig, ProxyConfig, TlsConfig},
console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api},
http::StatusCode,
metrics::{Ids, USAGE_METRICS},
protocol2::WithClientIp,
stream::{PqStream, Stream},
usage_metrics::{Ids, USAGE_METRICS},
};
use anyhow::{bail, Context};
use async_trait::async_trait;

View File

@@ -1,235 +1,36 @@
use crate::{
cancellation::CancelMap,
config::ProxyConfig,
error::io_error,
protocol2::{ProxyProtocolAccept, WithClientIp},
proxy::{
handle_client, ClientMode, NUM_CLIENT_CONNECTION_CLOSED_COUNTER,
NUM_CLIENT_CONNECTION_OPENED_COUNTER,
},
};
//! Routers for our serverless APIs
//!
//! Handles both SQL over HTTP and SQL over Websockets.
mod conn_pool;
mod sql_over_http;
mod websocket;
use anyhow::bail;
use bytes::{Buf, Bytes};
use futures::{Sink, Stream, StreamExt};
use hyper::StatusCode;
pub use reqwest_middleware::{ClientWithMiddleware, Error};
pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
use crate::protocol2::{ProxyProtocolAccept, WithClientIp};
use crate::proxy::{NUM_CLIENT_CONNECTION_CLOSED_COUNTER, NUM_CLIENT_CONNECTION_OPENED_COUNTER};
use crate::{cancellation::CancelMap, config::ProxyConfig};
use futures::StreamExt;
use hyper::{
server::{
accept,
conn::{AddrIncoming, AddrStream},
},
upgrade::Upgraded,
Body, Method, Request, Response, StatusCode,
Body, Method, Request, Response,
};
use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream};
use pin_project_lite::pin_project;
use std::{
future::ready,
pin::Pin,
sync::Arc,
task::{ready, Context, Poll},
};
use std::task::Poll;
use std::{future::ready, sync::Arc};
use tls_listener::TlsListener;
use tokio::{
io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf},
net::TcpListener,
};
use tokio::net::TcpListener;
use tokio_util::sync::CancellationToken;
use tracing::{error, info, info_span, warn, Instrument};
use utils::http::{error::ApiError, json::json_response};
// TODO: use `std::sync::Exclusive` once it's stabilized.
// Tracking issue: https://github.com/rust-lang/rust/issues/98407.
use sync_wrapper::SyncWrapper;
use super::{conn_pool::GlobalConnPool, sql_over_http};
pin_project! {
/// This is a wrapper around a [`WebSocketStream`] that
/// implements [`AsyncRead`] and [`AsyncWrite`].
pub struct WebSocketRw {
#[pin]
stream: SyncWrapper<WebSocketStream<Upgraded>>,
bytes: Bytes,
}
}
impl WebSocketRw {
pub fn new(stream: WebSocketStream<Upgraded>) -> Self {
Self {
stream: stream.into(),
bytes: Bytes::new(),
}
}
}
impl AsyncWrite for WebSocketRw {
fn poll_write(
self: Pin<&mut Self>,
cx: &mut Context<'_>,
buf: &[u8],
) -> Poll<io::Result<usize>> {
let mut stream = self.project().stream.get_pin_mut();
ready!(stream.as_mut().poll_ready(cx).map_err(io_error))?;
match stream.as_mut().start_send(Message::Binary(buf.into())) {
Ok(()) => Poll::Ready(Ok(buf.len())),
Err(e) => Poll::Ready(Err(io_error(e))),
}
}
fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
let stream = self.project().stream.get_pin_mut();
stream.poll_flush(cx).map_err(io_error)
}
fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
let stream = self.project().stream.get_pin_mut();
stream.poll_close(cx).map_err(io_error)
}
}
impl AsyncRead for WebSocketRw {
fn poll_read(
mut self: Pin<&mut Self>,
cx: &mut Context<'_>,
buf: &mut ReadBuf<'_>,
) -> Poll<io::Result<()>> {
if buf.remaining() > 0 {
let bytes = ready!(self.as_mut().poll_fill_buf(cx))?;
let len = std::cmp::min(bytes.len(), buf.remaining());
buf.put_slice(&bytes[..len]);
self.consume(len);
}
Poll::Ready(Ok(()))
}
}
impl AsyncBufRead for WebSocketRw {
fn poll_fill_buf(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<&[u8]>> {
// Please refer to poll_fill_buf's documentation.
const EOF: Poll<io::Result<&[u8]>> = Poll::Ready(Ok(&[]));
let mut this = self.project();
loop {
if !this.bytes.chunk().is_empty() {
let chunk = (*this.bytes).chunk();
return Poll::Ready(Ok(chunk));
}
let res = ready!(this.stream.as_mut().get_pin_mut().poll_next(cx));
match res.transpose().map_err(io_error)? {
Some(message) => match message {
Message::Ping(_) => {}
Message::Pong(_) => {}
Message::Text(text) => {
// We expect to see only binary messages.
let error = "unexpected text message in the websocket";
warn!(length = text.len(), error);
return Poll::Ready(Err(io_error(error)));
}
Message::Frame(_) => {
// This case is impossible according to Frame's doc.
panic!("unexpected raw frame in the websocket");
}
Message::Binary(chunk) => {
assert!(this.bytes.is_empty());
*this.bytes = Bytes::from(chunk);
}
Message::Close(_) => return EOF,
},
None => return EOF,
}
}
}
fn consume(self: Pin<&mut Self>, amount: usize) {
self.project().bytes.advance(amount);
}
}
async fn serve_websocket(
websocket: HyperWebsocket,
config: &'static ProxyConfig,
cancel_map: &CancelMap,
session_id: uuid::Uuid,
hostname: Option<String>,
) -> anyhow::Result<()> {
let websocket = websocket.await?;
handle_client(
config,
cancel_map,
session_id,
WebSocketRw::new(websocket),
ClientMode::Websockets { hostname },
)
.await?;
Ok(())
}
async fn ws_handler(
mut request: Request<Body>,
config: &'static ProxyConfig,
conn_pool: Arc<GlobalConnPool>,
cancel_map: Arc<CancelMap>,
session_id: uuid::Uuid,
sni_hostname: Option<String>,
) -> Result<Response<Body>, ApiError> {
let host = request
.headers()
.get("host")
.and_then(|h| h.to_str().ok())
.and_then(|h| h.split(':').next())
.map(|s| s.to_string());
// Check if the request is a websocket upgrade request.
if hyper_tungstenite::is_upgrade_request(&request) {
info!(session_id = ?session_id, "performing websocket upgrade");
let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
.map_err(|e| ApiError::BadRequest(e.into()))?;
tokio::spawn(
async move {
if let Err(e) =
serve_websocket(websocket, config, &cancel_map, session_id, host).await
{
error!(session_id = ?session_id, "error in websocket connection: {e:#}");
}
}
.in_current_span(),
);
// Return the response so the spawned future can continue.
Ok(response)
// TODO: this deserves a refactor, as the function now also handles the HTTP JSON
// client besides websockets. Right now I don't want to blow up the sql-over-http
// patch with file renames, so do that as a follow-up instead.
} else if request.uri().path() == "/sql" && request.method() == Method::POST {
sql_over_http::handle(
request,
sni_hostname,
conn_pool,
session_id,
&config.http_config,
)
.await
} else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
Response::builder()
.header("Allow", "OPTIONS, POST")
.header("Access-Control-Allow-Origin", "*")
.header(
"Access-Control-Allow-Headers",
"Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In",
)
.header("Access-Control-Max-Age", "86400" /* 24 hours */)
.status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code
.body(Body::empty())
.map_err(|e| ApiError::InternalServerError(e.into()))
} else {
json_response(StatusCode::BAD_REQUEST, "query is not supported")
}
}
pub async fn task_main(
config: &'static ProxyConfig,
ws_listener: TcpListener,
@@ -239,7 +40,7 @@ pub async fn task_main(
info!("websocket server has shut down");
}
let conn_pool: Arc<GlobalConnPool> = GlobalConnPool::new(config);
let conn_pool = conn_pool::GlobalConnPool::new(config);
// shutdown the connection pool
tokio::spawn({
@@ -300,13 +101,15 @@ pub async fn task_main(
let cancel_map = Arc::new(CancelMap::default());
let session_id = uuid::Uuid::new_v4();
ws_handler(req, config, conn_pool, cancel_map, session_id, sni_name)
.instrument(info_span!(
"ws-client",
session = %session_id,
%peer_addr,
))
.await
request_handler(
req, config, conn_pool, cancel_map, session_id, sni_name,
)
.instrument(info_span!(
"serverless",
session = %session_id,
%peer_addr,
))
.await
}
},
)))
@@ -359,3 +162,65 @@ where
self.inner.call(req)
}
}
async fn request_handler(
mut request: Request<Body>,
config: &'static ProxyConfig,
conn_pool: Arc<conn_pool::GlobalConnPool>,
cancel_map: Arc<CancelMap>,
session_id: uuid::Uuid,
sni_hostname: Option<String>,
) -> Result<Response<Body>, ApiError> {
let host = request
.headers()
.get("host")
.and_then(|h| h.to_str().ok())
.and_then(|h| h.split(':').next())
.map(|s| s.to_string());
// Check if the request is a websocket upgrade request.
if hyper_tungstenite::is_upgrade_request(&request) {
info!(session_id = ?session_id, "performing websocket upgrade");
let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
.map_err(|e| ApiError::BadRequest(e.into()))?;
tokio::spawn(
async move {
if let Err(e) =
websocket::serve_websocket(websocket, config, &cancel_map, session_id, host)
.await
{
error!(session_id = ?session_id, "error in websocket connection: {e:#}");
}
}
.in_current_span(),
);
// Return the response so the spawned future can continue.
Ok(response)
} else if request.uri().path() == "/sql" && request.method() == Method::POST {
sql_over_http::handle(
request,
sni_hostname,
conn_pool,
session_id,
&config.http_config,
)
.await
} else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
Response::builder()
.header("Allow", "OPTIONS, POST")
.header("Access-Control-Allow-Origin", "*")
.header(
"Access-Control-Allow-Headers",
"Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In",
)
.header("Access-Control-Max-Age", "86400" /* 24 hours */)
.status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code
.body(Body::empty())
.map_err(|e| ApiError::InternalServerError(e.into()))
} else {
json_response(StatusCode::BAD_REQUEST, "query is not supported")
}
}
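For reference, a hypothetical client for the routing above: POST to `/sql` with the connection string in the `Neon-Connection-String` header (one of the headers allowed by the CORS response). The JSON body shape is an assumption for illustration; consult the `sql_over_http` module for the real one. Uses `reqwest` with the `json` feature and `serde_json`:

use serde_json::json;

#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    let resp = reqwest::Client::new()
        .post("https://proxy.example.com/sql")
        .header(
            "Neon-Connection-String",
            "postgres://user:pass@ep-example.neon.example/db",
        )
        // Assumed body shape: { "query": ..., "params": [...] }
        .json(&json!({ "query": "SELECT 1", "params": [] }))
        .send()
        .await?;
    println!("status: {}", resp.status());
    println!("body: {}", resp.text().await?);
    Ok(())
}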

View File

@@ -22,8 +22,8 @@ use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};
use crate::{
auth, console,
metrics::{Ids, MetricCounter, USAGE_METRICS},
proxy::{LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER, NUM_DB_CONNECTIONS_OPENED_COUNTER},
usage_metrics::{Ids, MetricCounter, USAGE_METRICS},
};
use crate::{compute, config};
@@ -191,22 +191,39 @@ impl GlobalConnPool {
// ok return cached connection if found and establish a new one otherwise
let new_client = if let Some(client) = client {
if client.inner.is_closed() {
info!("pool: cached connection '{conn_info}' is closed, opening a new one");
connect_to_compute(self.proxy_config, conn_info, session_id, latency_timer).await
let conn_id = uuid::Uuid::new_v4();
info!(%conn_id, "pool: cached connection '{conn_info}' is closed, opening a new one");
connect_to_compute(
self.proxy_config,
conn_info,
conn_id,
session_id,
latency_timer,
)
.await
} else {
info!("pool: reusing connection '{conn_info}'");
client.session.send(session_id)?;
latency_timer.pool_hit();
latency_timer.success();
return Ok(Client {
conn_id: client.conn_id,
inner: Some(client),
span: Span::current(),
pool,
});
}
} else {
info!("pool: opening a new connection '{conn_info}'");
connect_to_compute(self.proxy_config, conn_info, session_id, latency_timer).await
let conn_id = uuid::Uuid::new_v4();
info!(%conn_id, "pool: opening a new connection '{conn_info}'");
connect_to_compute(
self.proxy_config,
conn_info,
conn_id,
session_id,
latency_timer,
)
.await
};
match &new_client {
@@ -243,6 +260,7 @@ impl GlobalConnPool {
}
new_client.map(|inner| Client {
conn_id: inner.conn_id,
inner: Some(inner),
span: Span::current(),
pool,
@@ -250,16 +268,18 @@ impl GlobalConnPool {
}
fn put(&self, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> {
let conn_id = client.conn_id;
// We want to hold this open while we return. This ensures that the pool can't close
// while we are in the middle of returning the connection.
let closed = self.closed.read();
if *closed {
info!("pool: throwing away connection '{conn_info}' because pool is closed");
info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is closed");
return Ok(());
}
if client.inner.is_closed() {
info!("pool: throwing away connection '{conn_info}' because connection is closed");
info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed");
return Ok(());
}
@@ -291,9 +311,9 @@ impl GlobalConnPool {
// do logging outside of the mutex
if returned {
info!("pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
} else {
info!("pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
}
Ok(())
@@ -340,6 +360,7 @@ impl GlobalConnPool {
struct TokioMechanism<'a> {
conn_info: &'a ConnInfo,
session_id: uuid::Uuid,
conn_id: uuid::Uuid,
}
#[async_trait]
@@ -353,7 +374,14 @@ impl ConnectMechanism for TokioMechanism<'_> {
node_info: &console::CachedNodeInfo,
timeout: time::Duration,
) -> Result<Self::Connection, Self::ConnectError> {
connect_to_compute_once(node_info, self.conn_info, timeout, self.session_id).await
connect_to_compute_once(
node_info,
self.conn_info,
timeout,
self.conn_id,
self.session_id,
)
.await
}
fn update_connect_config(&self, _config: &mut compute::ConnCfg) {}
@@ -366,6 +394,7 @@ impl ConnectMechanism for TokioMechanism<'_> {
async fn connect_to_compute(
config: &config::ProxyConfig,
conn_info: &ConnInfo,
conn_id: uuid::Uuid,
session_id: uuid::Uuid,
latency_timer: LatencyTimer,
) -> anyhow::Result<ClientInner> {
@@ -401,6 +430,7 @@ async fn connect_to_compute(
crate::proxy::connect_to_compute(
&TokioMechanism {
conn_id,
conn_info,
session_id,
},
@@ -416,6 +446,7 @@ async fn connect_to_compute_once(
node_info: &console::CachedNodeInfo,
conn_info: &ConnInfo,
timeout: time::Duration,
conn_id: uuid::Uuid,
mut session: uuid::Uuid,
) -> Result<ClientInner, tokio_postgres::Error> {
let mut config = (*node_info.config).clone();
@@ -430,7 +461,6 @@ async fn connect_to_compute_once(
let (tx, mut rx) = tokio::sync::watch::channel(session);
let conn_id = uuid::Uuid::new_v4();
let span = info_span!(parent: None, "connection", %conn_id);
span.in_scope(|| {
info!(%conn_info, %session, "new connection");
@@ -484,6 +514,7 @@ async fn connect_to_compute_once(
inner: client,
session: tx,
ids,
conn_id,
})
}
@@ -491,6 +522,7 @@ struct ClientInner {
inner: tokio_postgres::Client,
session: tokio::sync::watch::Sender<uuid::Uuid>,
ids: Ids,
conn_id: uuid::Uuid,
}
impl Client {
@@ -500,12 +532,14 @@ impl Client {
}
pub struct Client {
conn_id: uuid::Uuid,
span: Span,
inner: Option<ClientInner>,
pool: Option<(ConnInfo, Arc<GlobalConnPool>)>,
}
pub struct Discard<'a> {
conn_id: uuid::Uuid,
pool: &'a mut Option<(ConnInfo, Arc<GlobalConnPool>)>,
}
@@ -514,6 +548,7 @@ impl Client {
let Self {
inner,
pool,
conn_id,
span: _,
} = self;
(
@@ -521,7 +556,10 @@ impl Client {
.as_mut()
.expect("client inner should not be removed")
.inner,
Discard { pool },
Discard {
pool,
conn_id: *conn_id,
},
)
}
@@ -537,13 +575,13 @@ impl Discard<'_> {
pub fn check_idle(&mut self, status: ReadyForQueryStatus) {
if status != ReadyForQueryStatus::Idle {
if let Some((conn_info, _)) = self.pool.take() {
info!("pool: throwing away connection '{conn_info}' because connection is not idle")
info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is not idle")
}
}
}
pub fn discard(&mut self) {
if let Some((conn_info, _)) = self.pool.take() {
info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state")
info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is potentially in a broken state")
}
}
}

View File

@@ -0,0 +1,146 @@
use crate::{
cancellation::CancelMap,
config::ProxyConfig,
error::io_error,
proxy::{handle_client, ClientMode},
};
use bytes::{Buf, Bytes};
use futures::{Sink, Stream};
use hyper::upgrade::Upgraded;
use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream};
use pin_project_lite::pin_project;
use std::{
pin::Pin,
task::{ready, Context, Poll},
};
use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf};
use tracing::warn;
// TODO: use `std::sync::Exclusive` once it's stabilized.
// Tracking issue: https://github.com/rust-lang/rust/issues/98407.
use sync_wrapper::SyncWrapper;
pin_project! {
/// This is a wrapper around a [`WebSocketStream`] that
/// implements [`AsyncRead`] and [`AsyncWrite`].
pub struct WebSocketRw {
#[pin]
stream: SyncWrapper<WebSocketStream<Upgraded>>,
bytes: Bytes,
}
}
impl WebSocketRw {
pub fn new(stream: WebSocketStream<Upgraded>) -> Self {
Self {
stream: stream.into(),
bytes: Bytes::new(),
}
}
}
impl AsyncWrite for WebSocketRw {
fn poll_write(
self: Pin<&mut Self>,
cx: &mut Context<'_>,
buf: &[u8],
) -> Poll<io::Result<usize>> {
let mut stream = self.project().stream.get_pin_mut();
ready!(stream.as_mut().poll_ready(cx).map_err(io_error))?;
match stream.as_mut().start_send(Message::Binary(buf.into())) {
Ok(()) => Poll::Ready(Ok(buf.len())),
Err(e) => Poll::Ready(Err(io_error(e))),
}
}
fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
let stream = self.project().stream.get_pin_mut();
stream.poll_flush(cx).map_err(io_error)
}
fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
let stream = self.project().stream.get_pin_mut();
stream.poll_close(cx).map_err(io_error)
}
}
impl AsyncRead for WebSocketRw {
fn poll_read(
mut self: Pin<&mut Self>,
cx: &mut Context<'_>,
buf: &mut ReadBuf<'_>,
) -> Poll<io::Result<()>> {
if buf.remaining() > 0 {
let bytes = ready!(self.as_mut().poll_fill_buf(cx))?;
let len = std::cmp::min(bytes.len(), buf.remaining());
buf.put_slice(&bytes[..len]);
self.consume(len);
}
Poll::Ready(Ok(()))
}
}
impl AsyncBufRead for WebSocketRw {
fn poll_fill_buf(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<&[u8]>> {
// Please refer to poll_fill_buf's documentation.
const EOF: Poll<io::Result<&[u8]>> = Poll::Ready(Ok(&[]));
let mut this = self.project();
loop {
if !this.bytes.chunk().is_empty() {
let chunk = (*this.bytes).chunk();
return Poll::Ready(Ok(chunk));
}
let res = ready!(this.stream.as_mut().get_pin_mut().poll_next(cx));
match res.transpose().map_err(io_error)? {
Some(message) => match message {
Message::Ping(_) => {}
Message::Pong(_) => {}
Message::Text(text) => {
// We expect to see only binary messages.
let error = "unexpected text message in the websocket";
warn!(length = text.len(), error);
return Poll::Ready(Err(io_error(error)));
}
Message::Frame(_) => {
// This case is impossible according to Frame's doc.
panic!("unexpected raw frame in the websocket");
}
Message::Binary(chunk) => {
assert!(this.bytes.is_empty());
*this.bytes = Bytes::from(chunk);
}
Message::Close(_) => return EOF,
},
None => return EOF,
}
}
}
fn consume(self: Pin<&mut Self>, amount: usize) {
self.project().bytes.advance(amount);
}
}
pub async fn serve_websocket(
websocket: HyperWebsocket,
config: &'static ProxyConfig,
cancel_map: &CancelMap,
session_id: uuid::Uuid,
hostname: Option<String>,
) -> anyhow::Result<()> {
let websocket = websocket.await?;
handle_client(
config,
cancel_map,
session_id,
WebSocketRw::new(websocket),
ClientMode::Websockets { hostname },
)
.await?;
Ok(())
}

View File

@@ -24,7 +24,7 @@ backoff = "^2.2.1"
pytest-lazy-fixture = "^0.6.3"
prometheus-client = "^0.14.1"
pytest-timeout = "^2.1.0"
Werkzeug = "^2.2.3"
Werkzeug = "^3.0.1"
pytest-order = "^1.1.0"
allure-pytest = "^2.13.2"
pytest-asyncio = "^0.21.0"

View File

@@ -15,28 +15,15 @@ The script fetches the durations of benchmarks from the database and stores them in a file
BENCHMARKS_DURATION_QUERY = """
SELECT
DISTINCT parent_suite, suite, test,
PERCENTILE_DISC(%s) WITHIN GROUP (ORDER BY duration_ms) as percentile_ms
FROM
(
SELECT
jsonb_array_elements(data -> 'children') ->> 'name' as parent_suite,
jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') ->> 'name' as suite,
jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'name' as test,
jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'status' as status,
to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'start')::bigint / 1000)::date as timestamp,
(jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'duration')::int as duration_ms
FROM
regress_test_results
WHERE
reference = 'refs/heads/main'
) data
DISTINCT parent_suite, suite, name,
PERCENTILE_DISC(%s) WITHIN GROUP (ORDER BY duration) as percentile_ms
FROM results
WHERE
timestamp > CURRENT_DATE - INTERVAL '%s' day
started_at > CURRENT_DATE - INTERVAL '%s' day
AND parent_suite = 'test_runner.performance'
AND status = 'passed'
GROUP BY
parent_suite, suite, test
parent_suite, suite, name
;
"""
@@ -44,68 +31,69 @@ BENCHMARKS_DURATION_QUERY = """
# the total duration varies from 8 to 40 minutes.
# We use some pre-collected durations as a fallback to have a better distribution.
FALLBACK_DURATION = {
"test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 57.0,
"test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 28.0,
"test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 71.0,
"test_runner/performance/test_branching.py::test_compare_child_and_root_pgbench_perf": 27.0,
"test_runner/performance/test_branching.py::test_compare_child_and_root_read_perf": 11.0,
"test_runner/performance/test_branching.py::test_compare_child_and_root_write_perf": 30.0,
"test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]": 40.0,
"test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]": 5.0,
"test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[1]": 3.0,
"test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[5]": 10.0,
"test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[10]": 19.0,
"test_runner/performance/test_bulk_update.py::test_bulk_update[10]": 66.0,
"test_runner/performance/test_bulk_update.py::test_bulk_update[50]": 30.0,
"test_runner/performance/test_bulk_update.py::test_bulk_update[100]": 60.0,
"test_runner/performance/test_compaction.py::test_compaction": 77.0,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[neon-5-10-100]": 11.0,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[vanilla-5-10-100]": 16.0,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[neon-5-10-100]": 11.0,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[vanilla-5-10-100]": 18.0,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[neon-5-10-100]": 11.0,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[vanilla-5-10-100]": 16.0,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-1]": 11.0,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-10]": 11.0,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-1]": 10.0,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-10]": 10.0,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[neon-5-10-100]": 11.0,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[vanilla-5-10-100]": 16.0,
"test_runner/performance/test_copy.py::test_copy[neon]": 12.0,
"test_runner/performance/test_copy.py::test_copy[vanilla]": 10.0,
"test_runner/performance/test_gc_feedback.py::test_gc_feedback": 284.0,
"test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 11.0,
"test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 7.0,
"test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 85.0,
"test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[vanilla-1]": 29.0,
"test_runner/performance/test_layer_map.py::test_layer_map": 44.0,
"test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[neon]": 16.0,
"test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[vanilla]": 67.0,
"test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[neon]": 67.0,
"test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[vanilla]": 80.0,
"test_runner/performance/test_perf_pgbench.py::test_pgbench[neon-45-10]": 102.0,
"test_runner/performance/test_perf_pgbench.py::test_pgbench[vanilla-45-10]": 99.0,
"test_runner/performance/test_random_writes.py::test_random_writes[neon]": 9.0,
"test_runner/performance/test_random_writes.py::test_random_writes[vanilla]": 2.0,
"test_runner/performance/test_seqscans.py::test_seqscans[neon-100000-100-0]": 4.0,
"test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-0]": 80.0,
"test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-4]": 68.0,
"test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.0,
"test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 11.0,
"test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 10.0,
"test_runner/performance/test_startup.py::test_startup_simple": 2.0,
"test_runner/performance/test_startup.py::test_startup": 539.0,
"test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 375.0,
"test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 370.0,
"test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[vanilla-10-5-5]": 94.0,
"test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_off-1000]": 164.0,
"test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_on-1000]": 274.0,
"test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[vanilla-1000]": 949.0,
"test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_off-45-100]": 142.0,
"test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_on-45-100]": 151.0,
"test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[vanilla-45-100]": 182.0,
"test_runner/performance/test_write_amplification.py::test_write_amplification[neon]": 13.0,
"test_runner/performance/test_write_amplification.py::test_write_amplification[vanilla]": 16.0,
"test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 62.144,
"test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 90.941,
"test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 26.053,
"test_runner/performance/test_branching.py::test_compare_child_and_root_pgbench_perf": 25.67,
"test_runner/performance/test_branching.py::test_compare_child_and_root_read_perf": 14.497,
"test_runner/performance/test_branching.py::test_compare_child_and_root_write_perf": 18.852,
"test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]": 26.572,
"test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]": 6.259,
"test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[10]": 21.206,
"test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[1]": 3.474,
"test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[5]": 11.262,
"test_runner/performance/test_bulk_update.py::test_bulk_update[100]": 94.225,
"test_runner/performance/test_bulk_update.py::test_bulk_update[10]": 68.159,
"test_runner/performance/test_bulk_update.py::test_bulk_update[50]": 76.719,
"test_runner/performance/test_compaction.py::test_compaction": 110.222,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[neon-5-10-100]": 10.743,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[vanilla-5-10-100]": 16.541,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[neon-5-10-100]": 11.109,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[vanilla-5-10-100]": 18.121,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[neon-5-10-100]": 11.3,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[vanilla-5-10-100]": 16.086,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-10]": 12.024,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-1]": 11.14,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-10]": 10.375,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-1]": 10.075,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[neon-5-10-100]": 11.147,
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[vanilla-5-10-100]": 16.321,
"test_runner/performance/test_copy.py::test_copy[neon]": 16.579,
"test_runner/performance/test_copy.py::test_copy[vanilla]": 10.094,
"test_runner/performance/test_gc_feedback.py::test_gc_feedback": 590.157,
"test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.102,
"test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 8.677,
"test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 31.079,
"test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[vanilla-1]": 38.119,
"test_runner/performance/test_layer_map.py::test_layer_map": 24.784,
"test_runner/performance/test_logical_replication.py::test_logical_replication": 117.707,
"test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[neon]": 21.194,
"test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[vanilla]": 59.068,
"test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[neon]": 73.235,
"test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[vanilla]": 82.586,
"test_runner/performance/test_perf_pgbench.py::test_pgbench[neon-45-10]": 106.536,
"test_runner/performance/test_perf_pgbench.py::test_pgbench[vanilla-45-10]": 98.753,
"test_runner/performance/test_random_writes.py::test_random_writes[neon]": 6.975,
"test_runner/performance/test_random_writes.py::test_random_writes[vanilla]": 3.69,
"test_runner/performance/test_seqscans.py::test_seqscans[neon-100000-100-0]": 3.529,
"test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-0]": 64.522,
"test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-4]": 40.964,
"test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.55,
"test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 12.189,
"test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 13.899,
"test_runner/performance/test_startup.py::test_startup": 890.114,
"test_runner/performance/test_startup.py::test_startup_simple": 2.51,
"test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 527.245,
"test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 583.46,
"test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[vanilla-10-5-5]": 113.653,
"test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_off-1000]": 233.728,
"test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_on-1000]": 419.093,
"test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[vanilla-1000]": 982.461,
"test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_off-45-100]": 116.522,
"test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_on-45-100]": 115.583,
"test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[vanilla-45-100]": 155.282,
"test_runner/performance/test_write_amplification.py::test_write_amplification[neon]": 26.704,
"test_runner/performance/test_write_amplification.py::test_write_amplification[vanilla]": 16.088,
}
@@ -130,7 +118,7 @@ def main(args: argparse.Namespace):
res = FALLBACK_DURATION
for row in rows:
pytest_name = f"{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{row['test']}"
pytest_name = f"{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{row['name']}"
duration = row["percentile_ms"] / 1000
logging.info(f"\t{pytest_name}: {duration}")
res[pytest_name] = duration

View File

@@ -9,28 +9,15 @@ from typing import DefaultDict, Dict
import psycopg2
import psycopg2.extras
# We call the test "flaky" if it failed at least once on the main branch in the last N=10 days.
FLAKY_TESTS_QUERY = """
SELECT
DISTINCT parent_suite, suite, REGEXP_REPLACE(test, '(release|debug)-pg(\\d+)-?', '') as deparametrized_test
FROM
(
SELECT
reference,
jsonb_array_elements(data -> 'children') ->> 'name' as parent_suite,
jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') ->> 'name' as suite,
jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'name' as test,
jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'status' as status,
jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'retriesStatusChange' as retries_status_change,
to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' ->> 'start')::bigint / 1000)::date as timestamp
FROM
regress_test_results
) data
DISTINCT parent_suite, suite, name
FROM results
WHERE
timestamp > CURRENT_DATE - INTERVAL '%s' day
started_at > CURRENT_DATE - INTERVAL '%s' day
AND (
(status IN ('failed', 'broken') AND reference = 'refs/heads/main')
OR retries_status_change::boolean
OR flaky
)
;
"""
@@ -63,12 +50,14 @@ def main(args: argparse.Namespace):
if row["parent_suite"] != "test_runner.regress":
continue
deparametrized_test = row["deparametrized_test"]
dash_if_needed = "" if deparametrized_test.endswith("[]") else "-"
parametrized_test = deparametrized_test.replace(
"[",
f"[{build_type}-pg{pg_version}{dash_if_needed}",
)
if row["name"].endswith("]"):
parametrized_test = row["name"].replace(
"[",
f"[{build_type}-pg{pg_version}-",
)
else:
parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}]"
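# For illustration (hypothetical build_type="release", pg_version=15):
#   "test_bulk_update[100]" -> "test_bulk_update[release-pg15-100]"
#   "test_layer_map"        -> "test_layer_map[release-pg15]"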
res[row["parent_suite"]][row["suite"]][parametrized_test] = True
logging.info(

View File

@@ -4,7 +4,7 @@ use std::task::{Context, Poll};
use std::time::Duration;
use tonic::codegen::StdError;
use tonic::transport::{ClientTlsConfig, Endpoint};
use tonic::{transport::Channel, Code, Status};
use tonic::{transport::Channel, Status};
use utils::id::{TenantId, TenantTimelineId, TimelineId};
use proto::{
@@ -23,6 +23,7 @@ pub mod proto {
pub mod metrics;
// Re-exports to avoid direct tonic dependency in user crates.
pub use tonic::Code;
pub use tonic::Request;
pub use tonic::Streaming;

View File

@@ -0,0 +1,43 @@
# Usage from top of repo:
# poetry run python3 test_runner/duplicate_tenant.py <initial_tenant_id> <ncopies> <nthreads>
# e.g.: poetry run python3 test_runner/duplicate_tenant.py b97965931096047b2d54958756baee7b 10 8
from queue import Queue
import sys
import threading
import requests
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.types import TenantId
initial_tenant = sys.argv[1]
ncopies = int(sys.argv[2])
numthreads = int(sys.argv[3])
# class DuckTypedNeonEnv:
# pass
# cli = NeonCli(DuckTypedNeonEnv())
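# Fill the queue with one item per copy to create, plus one None sentinel
# per worker thread so each worker knows when to exit.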
q = Queue()
for i in range(0, ncopies):
q.put(i)
for i in range(0, numthreads):
q.put(None)
def create():
while True:
if q.get() is None:
break
new_tenant = TenantId.generate()
res = requests.post(
f"http://localhost:9898/v1/tenant/{initial_tenant}/duplicate",
json={"new_tenant_id": str(new_tenant)},
)
res.raise_for_status()
threads = [threading.Thread(target=create) for _ in range(numthreads)]
for t in threads:
t.start()
for t in threads:
t.join()

View File

@@ -1719,6 +1719,11 @@ class NeonPageserver(PgProtocol):
break
if error_or_warn.search(line):
# Is this a torn log line? This happens when force-killing a process and restarting
# Example: "2023-10-25T09:38:31.752314Z WARN deletion executo2023-10-25T09:38:31.875947Z INFO version: git-env:0f9452f76e8ccdfc88291bccb3f53e3016f40192"
if re.match("\\d{4}-\\d{2}-\\d{2}T.+\\d{4}-\\d{2}-\\d{2}T.+INFO version.+", line):
continue
# It's an ERROR or WARN. Is it in the allow-list?
for a in self.allowed_errors:
if re.match(a, line):

View File

@@ -215,6 +215,25 @@ class PageserverHttpClient(requests.Session):
assert isinstance(new_tenant_id, str)
return TenantId(new_tenant_id)
def tenant_duplicate(
self, src_tenant_id: TenantId, new_tenant_id: TenantId, conf: Optional[Dict[str, Any]] = None
) -> TenantId:
if conf is not None:
assert "new_tenant_id" not in conf.keys()
res = self.post(
f"http://localhost:{self.port}/v1/tenant/{src_tenant_id}/duplicate",
json={
"new_tenant_id": str(new_tenant_id),
**(conf or {}),
},
)
self.verbose_error(res)
if res.status_code == 409:
raise Exception(f"could not duplicate tenant: tenant already exists for id {new_tenant_id}")
new_tenant_id = res.json()
assert isinstance(new_tenant_id, str)
return TenantId(new_tenant_id)
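# Example usage (see test_tenant_duplicate.py later in this diff):
#   ps_http.tenant_duplicate(env.initial_tenant, TenantId.generate())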
def tenant_attach(
self,
tenant_id: TenantId,
@@ -441,13 +460,13 @@ class PageserverHttpClient(requests.Session):
assert res_json is None
def timeline_get_lsn_by_timestamp(
self, tenant_id: TenantId, timeline_id: TimelineId, timestamp
self, tenant_id: TenantId, timeline_id: TimelineId, timestamp, version: int
):
log.info(
f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}"
)
res = self.get(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}",
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}&version={version}",
)
self.verbose_error(res)
res_json = res.json()
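As the tests below show, the response shape depends on the version argument; a sketch, where client is a PageserverHttpClient and ts stands in for an ISO-8601 timestamp string:

# version 1 (old behavior): a bare value -- "future", "past", or an LSN
result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id, ts, 1)
# version 2: a JSON object, e.g. result["kind"] == "future", or lsn = result["lsn"]
result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id, ts, 2)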

View File

@@ -4,8 +4,10 @@ First make a release build. The `-s` flag silences a lot of output, and makes it
easier to see if you have compile errors without scrolling up.
`BUILD_TYPE=release CARGO_BUILD_FLAGS="--features=testing" make -s -j8`
You may also need to run `./scripts/pysync`.
Then run the tests
`NEON_BIN=./target/release poetry run pytest test_runner/performance"`
`DEFAULT_PG_VERSION=15 NEON_BIN=./target/release poetry run pytest test_runner/performance`
Some handy pytest flags for local development:
- `-x` tells pytest to stop on first error

View File

@@ -8,6 +8,71 @@ from fixtures.types import Lsn
from fixtures.utils import query_scalar
#
# Test pageserver get_lsn_by_timestamp API
#
def test_lsn_mapping_old(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
new_timeline_id = env.neon_cli.create_branch("test_lsn_mapping")
endpoint_main = env.endpoints.create_start("test_lsn_mapping")
log.info("postgres is running on 'test_lsn_mapping' branch")
cur = endpoint_main.connect().cursor()
# Create table, and insert rows, each in a separate transaction
# Disable synchronous_commit to make this initialization go faster.
#
# Each row records the current insert LSN and the timestamp at which
# the row was inserted.
cur.execute("SET synchronous_commit=off")
cur.execute("CREATE TABLE foo (x integer)")
tbl = []
for i in range(1000):
cur.execute("INSERT INTO foo VALUES(%s)", (i,))
# Get the timestamp at UTC
after_timestamp = query_scalar(cur, "SELECT clock_timestamp()").replace(tzinfo=None)
tbl.append([i, after_timestamp])
# Execute one more transaction with synchronous_commit enabled, to flush
# all the previous transactions
cur.execute("SET synchronous_commit=on")
cur.execute("INSERT INTO foo VALUES (-1)")
# Wait until WAL is received by pageserver
wait_for_last_flush_lsn(env, endpoint_main, env.initial_tenant, new_timeline_id)
with env.pageserver.http_client() as client:
# Check edge cases: timestamp in the future
probe_timestamp = tbl[-1][1] + timedelta(hours=1)
result = client.timeline_get_lsn_by_timestamp(
env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 1
)
assert result == "future"
# timestamp too far in the past
probe_timestamp = tbl[0][1] - timedelta(hours=10)
result = client.timeline_get_lsn_by_timestamp(
env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 1
)
assert result == "past"
# Probe a bunch of timestamps in the valid range
for i in range(1, len(tbl), 100):
probe_timestamp = tbl[i][1]
lsn = client.timeline_get_lsn_by_timestamp(
env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 1
)
# Call get_lsn_by_timestamp to get the LSN
# Launch a new read-only node at that LSN, and check that only the rows
# that were supposed to be committed at that point in time are visible.
endpoint_here = env.endpoints.create_start(
branch_name="test_lsn_mapping", endpoint_id="ep-lsn_mapping_read", lsn=lsn
)
assert endpoint_here.safe_psql("SELECT max(x) FROM foo")[0][0] == i
endpoint_here.stop_and_destroy()
#
# Test pageserver get_lsn_by_timestamp API
#
@@ -45,23 +110,24 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
# Check edge cases: timestamp in the future
probe_timestamp = tbl[-1][1] + timedelta(hours=1)
result = client.timeline_get_lsn_by_timestamp(
env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z"
env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 2
)
assert result == "future"
assert result["kind"] == "future"
# timestamp too far in the past
probe_timestamp = tbl[0][1] - timedelta(hours=10)
result = client.timeline_get_lsn_by_timestamp(
env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z"
env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 2
)
assert result == "past"
assert result["kind"] == "past"
# Probe a bunch of timestamps in the valid range
for i in range(1, len(tbl), 100):
probe_timestamp = tbl[i][1]
lsn = client.timeline_get_lsn_by_timestamp(
env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z"
result = client.timeline_get_lsn_by_timestamp(
env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 2
)
lsn = result["lsn"]
# Call get_lsn_by_timestamp to get the LSN
# Launch a new read-only node at that LSN, and check that only the rows
# that were supposed to be committed at that point in time are visible.

View File

@@ -486,16 +486,20 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
def evict_all_layers(env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId):
timeline_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
initial_local_layers = sorted(
list(filter(lambda path: path.name != "metadata", timeline_path.glob("*")))
)
client = env.pageserver.http_client()
for layer in initial_local_layers:
if "ephemeral" in layer.name or "temp" in layer.name:
layer_map = client.layer_map_info(tenant_id, timeline_id)
for layer in layer_map.historic_layers:
if layer.remote:
log.info(
f"Skipping trying to evict remote layer {tenant_id}/{timeline_id} {layer.layer_file_name}"
)
continue
log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer.name}")
client.evict_layer(tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer.name)
log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer.layer_file_name}")
client.evict_layer(
tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer.layer_file_name
)
def test_eviction_across_generations(neon_env_builder: NeonEnvBuilder):

View File

@@ -1,5 +1,6 @@
import json
import subprocess
import time
from typing import Any, List, Optional, Tuple
import psycopg2
@@ -364,10 +365,14 @@ def test_sql_over_http_pool(static_proxy: NeonProxy):
pid1 = get_pid(200, "http")["rows"][0]["pid"]
time.sleep(0.02)
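# (the brief sleep above presumably lets the pooler return the connection to the idle pool)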
# query should be on the same connection
rows = get_pid(200, "http")["rows"]
assert rows == [{"pid": pid1}]
time.sleep(0.02)
# incorrect password should not work
res = get_pid(400, "foobar")
assert "password authentication failed for user" in res["message"]
@@ -378,10 +383,14 @@ def test_sql_over_http_pool(static_proxy: NeonProxy):
pid2 = get_pid(200, "http2")["rows"][0]["pid"]
assert pid1 != pid2
time.sleep(0.02)
# query should be on an existing connection
pid = get_pid(200, "http2")["rows"][0]["pid"]
assert pid in [pid1, pid2]
time.sleep(0.02)
# old password should not work
res = get_pid(400, "http")
assert "password authentication failed for user" in res["message"]
@@ -419,6 +428,7 @@ def test_sql_over_http_pool_idle(static_proxy: NeonProxy):
)
pid1 = query(200, GET_CONNECTION_PID_QUERY)["rows"][0]["pid"]
time.sleep(0.02)
query(200, "BEGIN")
pid2 = query(200, GET_CONNECTION_PID_QUERY)["rows"][0]["pid"]
assert pid1 != pid2

View File

@@ -757,12 +757,14 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv
create_thread.join()
# Regression test for a race condition where L0 layers are compacted before the upload,
# resulting in the uploading complaining about the file not being found
# https://github.com/neondatabase/neon/issues/4526
def test_compaction_delete_before_upload(
def test_compaction_waits_for_upload(
neon_env_builder: NeonEnvBuilder,
):
"""
Compaction waits for outstanding uploads to complete, so that it avoids deleting
layer files that have not yet been uploaded. This test forces a race between
upload and compaction.
"""
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
env = neon_env_builder.init_start(
@@ -792,50 +794,81 @@ def test_compaction_delete_before_upload(
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
# Now make the flushing hang and update one small piece of data
client.configure_failpoints(("flush-frozen-pausable", "pause"))
client.configure_failpoints(("before-upload-layer-pausable", "pause"))
endpoint.safe_psql("UPDATE foo SET x = 0 WHERE x = 1")
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
q: queue.Queue[Optional[PageserverApiException]] = queue.Queue()
barrier = threading.Barrier(2)
checkpoint_result: queue.Queue[Optional[PageserverApiException]] = queue.Queue()
compact_result: queue.Queue[Optional[PageserverApiException]] = queue.Queue()
compact_barrier = threading.Barrier(2)
def checkpoint_in_background():
barrier.wait()
try:
log.info("Checkpoint starting")
client.timeline_checkpoint(tenant_id, timeline_id)
q.put(None)
log.info("Checkpoint complete")
checkpoint_result.put(None)
except PageserverApiException as e:
q.put(e)
log.info("Checkpoint errored: {e}")
checkpoint_result.put(e)
create_thread = threading.Thread(target=checkpoint_in_background)
create_thread.start()
def compact_in_background():
compact_barrier.wait()
try:
log.info("Compaction starting")
client.timeline_compact(tenant_id, timeline_id)
log.info("Compaction complete")
compact_result.put(None)
except PageserverApiException as e:
log.info("Compaction errored: {e}")
compact_result.put(e)
checkpoint_thread = threading.Thread(target=checkpoint_in_background)
checkpoint_thread.start()
compact_thread = threading.Thread(target=compact_in_background)
compact_thread.start()
try:
barrier.wait()
# Start the checkpoint, see that it blocks
log.info("Waiting to see checkpoint hang...")
time.sleep(5)
assert checkpoint_result.empty()
time.sleep(4)
client.timeline_compact(tenant_id, timeline_id)
# Start the compaction, see that it finds work to do but blocks
compact_barrier.wait()
log.info("Waiting to see compaction hang...")
time.sleep(5)
assert compact_result.empty()
client.configure_failpoints(("flush-frozen-pausable", "off"))
# This is logged once compaction is started, but before we wait for operations to complete
assert env.pageserver.log_contains("compact_level0_phase1 stats available.")
conflict = q.get()
# Once we unblock uploads the compaction should complete successfully
log.info("Disabling failpoint")
client.configure_failpoints(("before-upload-layer-pausable", "off"))
log.info("Awaiting compaction result")
assert compact_result.get(timeout=10) is None
log.info("Awaiting checkpoint result")
assert checkpoint_result.get(timeout=10) is None
assert conflict is None
except Exception:
# Log the actual failure's backtrace here, before we proceed to join threads
log.exception("Failure, cleaning up...")
raise
finally:
create_thread.join()
compact_barrier.abort()
# Add a delay for the uploads to run into either the file not found or the
time.sleep(4)
checkpoint_thread.join()
compact_thread.join()
# Ensure that this actually terminates
wait_upload_queue_empty(client, tenant_id, timeline_id)
# For now we are hitting this message.
# Maybe in the future the underlying race condition will be fixed,
# but until then, ensure that this message is hit instead.
assert env.pageserver.log_contains(
# We should not have hit the error handling path in uploads where the remote file is gone
assert not env.pageserver.log_contains(
"File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more."
)

View File

@@ -0,0 +1,54 @@
import time
from fixtures.neon_fixtures import (
NeonEnvBuilder,
last_flush_lsn_upload,
)
from fixtures.remote_storage import (
RemoteStorageKind,
)
from fixtures.types import TenantId
from fixtures.log_helper import log
def test_tenant_duplicate(
neon_env_builder: NeonEnvBuilder,
):
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
env = neon_env_builder.init_start()
with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep_main:
ep_main.safe_psql("CREATE TABLE foo (i int);")
ep_main.safe_psql("INSERT INTO foo VALUES (1), (2), (3);")
last_flush_lsn = last_flush_lsn_upload(
env, ep_main, env.initial_tenant, env.initial_timeline
)
new_tenant_id = TenantId.generate()
# timeline id remains unchanged with tenant_duplicate
# TODO: implement a remapping scheme so timeline ids remain globally unique
new_timeline_id = env.initial_timeline
log.info(f"Duplicate tenant/timeline will be: {new_tenant_id}/{new_timeline_id}")
ps_http = env.pageserver.http_client()
ps_http.tenant_duplicate(env.initial_tenant, new_tenant_id)
ps_http.tenant_delete(env.initial_tenant)
env.neon_cli.map_branch("duplicate", new_tenant_id, new_timeline_id)
# start read-only replicate and validate
with env.endpoints.create_start(
"duplicate", tenant_id=new_tenant_id, lsn=last_flush_lsn
) as ep_dup:
with ep_dup.connect() as conn:
with conn.cursor() as cur:
cur.execute("SELECT * FROM foo ORDER BY i;")
assert cur.fetchall() == [(1,), (2,), (3,)]
# ensure restarting PS works
env.pageserver.stop()
env.pageserver.start()