mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-14 17:02:56 +00:00
initial pageserver SLO sloth rules
The generated files were deployed manually in https://github.com/neondatabase/cloud/pull/5812
This commit is contained in:
3
.gitmodules
vendored
3
.gitmodules
vendored
@@ -6,3 +6,6 @@
|
||||
path = vendor/postgres-v15
|
||||
url = https://github.com/neondatabase/postgres.git
|
||||
branch = REL_15_STABLE_neon
|
||||
[submodule "pageserver/slo/sloth.git"]
|
||||
path = pageserver/slo/sloth.git
|
||||
url = https://github.com/neondatabase/sloth.git
|
||||
|
||||
1
pageserver/slo/.gitignore
vendored
Normal file
1
pageserver/slo/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
*.sloth-output.yaml
|
||||
20
pageserver/slo/Makefile
Normal file
20
pageserver/slo/Makefile
Normal file
@@ -0,0 +1,20 @@
|
||||
|
||||
# Generates Prometheus rule files from the Sloth SLO specs in this directory.
#
# Usage:
#   make                                                  # regenerate all outputs
#   make sync-to-cloud-git CLOUD_GIT_CHECKOUT=/path/to/cloud
#
# The Sloth binary comes from the sloth.git submodule and must be built
# beforehand: `git submodule update --init`, then `make build` in sloth.git.

# Delete a half-written output when sloth fails, so a truncated file is never
# mistaken for an up-to-date target on the next run.
.DELETE_ON_ERROR:

BINARY := sloth.git/bin/sloth-linux-amd64

# Every generated rules file; shared by `all` and `sync-to-cloud-git` so the
# two targets cannot drift apart.
OUTPUTS := pageserver-slo.dev.sloth-output.yaml pageserver-slo.prod.sloth-output.yaml

.PHONY: all
all: $(OUTPUTS)

# <name>.sloth.yaml -> <name>.sloth-output.yaml
# --disable-promExpr-validation is needed because the specs use
# VictoriaMetrics-only functions that sloth's PromQL validation pass rejects.
%.sloth-output.yaml: %.sloth.yaml $(BINARY)
	$(BINARY) generate \
		--disable-promExpr-validation \
		--input $< \
		--out $@

# Location of a local checkout of the cloud repository. The default is a
# deliberate placeholder; override it on the command line or via the
# environment.
CLOUD_GIT_CHECKOUT ?= /nonexistent

.PHONY: sync-to-cloud-git
sync-to-cloud-git: $(OUTPUTS)
	@test -d "$(CLOUD_GIT_CHECKOUT)" || { echo "error: CLOUD_GIT_CHECKOUT='$(CLOUD_GIT_CHECKOUT)' is not a directory; run: make $@ CLOUD_GIT_CHECKOUT=/path/to/cloud" >&2; exit 1; }
	cp pageserver-slo.dev.sloth-output.yaml \
		$(CLOUD_GIT_CHECKOUT)/ops/infra/workloads/values/dev-eu-central-1-alpha/neon-vm/pageserver-slo.dev.sloth-output.yaml
	cp pageserver-slo.prod.sloth-output.yaml \
		$(CLOUD_GIT_CHECKOUT)/ops/infra/workloads/values/prod-eu-central-1-gamma/neon-vm/pageserver-slo.prod.sloth-output.yaml
|
||||
83
pageserver/slo/README.md
Normal file
83
pageserver/slo/README.md
Normal file
@@ -0,0 +1,83 @@
|
||||
|
||||
## Install Sloth
|
||||
|
||||
https://sloth.dev/introduction/install/
|
||||
|
||||
```sh
|
||||
wget https://github.com/slok/sloth/releases/download/v0.11.0/sloth-linux-amd64
|
||||
chmod +x ./sloth-linux-amd64
|
||||
```
|
||||
|
||||
## Background on Sloth
|
||||
|
||||
https://sloth.dev/introduction/
|
||||
https://sloth.dev/introduction/architecture/
|
||||
|
||||
|
||||
## Generate Prometheus Rules From Sloth Spec
|
||||
|
||||
```
|
||||
./sloth-linux-amd64 generate --input ./spec.sloth.yml --out generated.prometheus.rules.yml
|
||||
```
|
||||
|
||||
## Background reading:
|
||||
|
||||
SRE workbook chapter on "Implementing SLOs", section "Calculating the SLIs"
|
||||
|
||||
https://sre.google/workbook/implementing-slos/
|
||||
|
||||
Citation:
|
||||
|
||||
```
|
||||
Availability
|
||||
|
||||
sum(rate(http_requests_total{host="api", status!~"5.."}[7d]))
|
||||
/
|
||||
sum(rate(http_requests_total{host="api"}[7d]))
|
||||
|
||||
Latency
|
||||
|
||||
histogram_quantile(0.9, rate(http_request_duration_seconds_bucket[7d]))
|
||||
|
||||
histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[7d]))
|
||||
```
|
||||
|
||||
## Sloth Rule Syntax
|
||||
|
||||
It's under-documented.
|
||||
|
||||
Best to go to the Go types:
|
||||
https://pkg.go.dev/github.com/slok/sloth@v0.6.0/pkg/prometheus/api/v1#section-readme
|
||||
|
||||
For latency SLOs, pageserver, we want the "Raw SLI" type SLI, not the one that is based on events.
|
||||
Search for `error_ratio_query`; example: https://sloth.dev/examples/default/raw-sli/
|
||||
|
||||
Use victoriametrics `histogram_share` to compute the error ratio.
|
||||
It's the inverse of histogram_quantile.
|
||||
https://docs.victoriametrics.com/MetricsQL.html#histogram_share
|
||||
|
||||
`share_le_over_time` seems also useful
|
||||
https://docs.victoriametrics.com/MetricsQL.html#share_le_over_time
|
||||
|
||||
https://stackoverflow.com/questions/72559302/is-it-possible-to-calculate-ranks-of-metrics?rq=1
|
||||
|
||||
Problem with the VictoriaMetrics-only functions is that sloth has an internal validation pass:
|
||||
https://github.com/slok/sloth/issues/510
|
||||
Option to skip the check:
|
||||
https://github.com/slok/sloth/pull/511
|
||||
=>
|
||||
```
|
||||
git submodule update --init
|
||||
pushd sloth.git
|
||||
make build
|
||||
popd
|
||||
sloth.git/bin/sloth-linux-amd64 generate \
|
||||
--disable-promExpr-validation \
|
||||
--input ./spec.sloth.yml \
|
||||
--out generated.prometheus.rules.yml
|
||||
```
|
||||
|
||||
## Notes On How To Scale The Process To Multiple Teams / Automate Sloth In Neon
|
||||
|
||||
* SLO directory discovery: https://sloth.dev/usage/cli/
|
||||
* allows using directories instead of individual files as input
|
||||
4
pageserver/slo/pageserver-slo.dev.sloth.yaml
Normal file
4
pageserver/slo/pageserver-slo.dev.sloth.yaml
Normal file
@@ -0,0 +1,4 @@
|
||||
# Sloth SLO spec for the pageserver, dev environment.
# No SLOs are defined for dev; this file is the input for the Makefile's
# pageserver-slo.dev.sloth-output.yaml target.
version: "prometheus/v1"
service: "pageserver"
slos: []
|
||||
|
||||
63
pageserver/slo/pageserver-slo.prod.sloth.yaml
Normal file
63
pageserver/slo/pageserver-slo.prod.sloth.yaml
Normal file
@@ -0,0 +1,63 @@
|
||||
# Sloth SLO spec for the pageserver, prod environment.
# Input for the Makefile's pageserver-slo.prod.sloth-output.yaml target.
version: "prometheus/v1"
service: "pageserver"
#labels:
#  owner: "pageserver-team"
#  repo: "myorg/myservice"
#  tier: "2"
slos:
  # Reference example of an event-based availability SLO (kept for syntax).
  #
  # # We allow failing (5xx and 429) 1 request every 1000 requests (99.9%).
  # - name: "requests-availability"
  #   objective: 99.9
  #   description: "Common SLO based on availability for HTTP request responses."
  #   sli:
  #     events:
  #       error_query: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}]))
  #       total_query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}]))
  #   alerting:
  #     name: MyServiceHighErrorRate
  #     labels:
  #       category: "availability"
  #     annotations:
  #       # Overwrite default Sloth SLO alert summary on ticket and page alerts.
  #       summary: "High error rate on 'myservice' requests responses"
  #     page_alert:
  #       labels:
  #         severity: pageteam
  #         routing_key: myteam
  #     ticket_alert:
  #       labels:
  #         severity: "slack"
  #         slack_channel: "#alerts-myteam"

  - name: "basebackup_ms-latency"
    description: "basebackup_ms latency should be less than 0.2s for 99.9% of requests"
    objective: 99.9
    #labels:
    sli:
      raw:
        # Error ratio = share of basebackups slower than 200 ms.
        # histogram_share(200, ...) yields the share at or below 200 ms (the
        # good ratio), so it is subtracted from 1.
        # {{.window}} lets Sloth evaluate the ratio over each of its windows
        # instead of one fixed range.
        #
        # histogram_share is VictoriaMetrics-only and fails sloth's validation
        # pass; generate with --disable-promExpr-validation.
        #
        # Alternatives that were considered (they would need adapting, since
        # they yield a good fraction or a quantile rather than an error ratio):
        #   https://docs.victoriametrics.com/MetricsQL.html#share_le_over_time
        #   Prometheus-native, also noted as not passing the validation pass:
        #     histogram_fraction(0, 200, compute_basebackup_ms_bucket[{{.window}}])
        #   histogram_quantile(0.999, sum by (le) (rate(compute_basebackup_ms_bucket[{{.window}}])))
        error_ratio_query: |
          1 - histogram_share(200, sum by (le) (rate(compute_basebackup_ms_bucket[{{.window}}])))
    alerting:
      name: "basebackup_ms-latency"
      labels:
        category: "latency"
      annotations: {}
      # NOTE(review): routing_key and slack_channel below look like
      # placeholders from the Sloth example - confirm before relying on them.
      page_alert:
        labels:
          severity: "pageteam"
          routing_key: "myteam"
      ticket_alert:
        labels:
          severity: "slack"
          slack_channel: "#alerts-myteam"
|
||||
|
||||
|
||||
1
pageserver/slo/sloth.git
Submodule
1
pageserver/slo/sloth.git
Submodule
Submodule pageserver/slo/sloth.git added at fcbbc5e34f
Reference in New Issue
Block a user