initial pageserver SLO sloth rules

The generated files were deployed manually in https://github.com/neondatabase/cloud/pull/5812
2026-01-14 17:02:56 +00:00 · 2023-07-12 16:03:28 +02:00
parent 618d36ee6d
commit 1251ef6dd4
7 changed files with 175 additions and 0 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -6,3 +6,6 @@
 	path = vendor/postgres-v15
 	url = https://github.com/neondatabase/postgres.git
 	branch = REL_15_STABLE_neon
+[submodule "pageserver/slo/sloth.git"]
+	path = pageserver/slo/sloth.git
+	url = https://github.com/neondatabase/sloth.git
--- a/pageserver/slo/.gitignore
+++ b/pageserver/slo/.gitignore
@@ -0,0 +1 @@
+*.sloth-output.yaml
--- a/pageserver/slo/Makefile
+++ b/pageserver/slo/Makefile
@@ -0,0 +1,20 @@
+# Render each <name>.sloth.yaml spec into <name>.sloth-output.yaml using the sloth submodule build.
+BINARY := sloth.git/bin/sloth-linux-amd64
+OUTPUTS := pageserver-slo.dev.sloth-output.yaml pageserver-slo.prod.sloth-output.yaml
+# Placeholder default; pass CLOUD_GIT_CHECKOUT=<path to a cloud repo checkout> on the command line.
+CLOUD_GIT_CHECKOUT := /nonexistent
+
+# If sloth fails part-way, remove the output so a truncated file is not taken as up to date.
+.DELETE_ON_ERROR:
+
+.PHONY: all sync-to-cloud-git
+all: $(OUTPUTS)
+
+# Validation is disabled: the specs use VictoriaMetrics-only functions that sloth's check rejects.
+%.sloth-output.yaml: %.sloth.yaml $(BINARY)
+	$(BINARY) generate --disable-promExpr-validation --input $< --out $@
+
+# Copy the generated rule files into the dev and prod values directories of the cloud checkout.
+sync-to-cloud-git: $(OUTPUTS)
+	cp pageserver-slo.dev.sloth-output.yaml $(CLOUD_GIT_CHECKOUT)/ops/infra/workloads/values/dev-eu-central-1-alpha/neon-vm/pageserver-slo.dev.sloth-output.yaml
+	cp pageserver-slo.prod.sloth-output.yaml $(CLOUD_GIT_CHECKOUT)/ops/infra/workloads/values/prod-eu-central-1-gamma/neon-vm/pageserver-slo.prod.sloth-output.yaml
--- a/pageserver/slo/README.md
+++ b/pageserver/slo/README.md
@@ -0,0 +1,83 @@
+
+## Install Sloth
+
+https://sloth.dev/introduction/install/
+
+```sh
+wget https://github.com/slok/sloth/releases/download/v0.11.0/sloth-linux-amd64
+chmod +x ./sloth-linux-amd64
+```
+
+## Background on Sloth
+
+https://sloth.dev/introduction/
+https://sloth.dev/introduction/architecture/
+
+
+## Generate Prometheus Rules From Sloth Spec
+
+```
+./sloth-linux-amd64 generate --input ./spec.sloth.yml --out generated.prometheus.rules.yml
+```
+
+## Background reading:
+
+SRE workbook chapter on "Implementing SLOs", section "Calculating the SLIs"
+
+https://sre.google/workbook/implementing-slos/
+
+Citation:
+
+```
+Availability
+
+    sum(rate(http_requests_total{host="api", status!~"5.."}[7d]))
+    /
+    sum(rate(http_requests_total{host="api"}[7d])
+
+Latency
+
+    histogram_quantile(0.9, rate(http_request_duration_seconds_bucket[7d]))
+
+    histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[7d]))
+```
+
+## Sloth Rule Syntax
+
+It's under-documented.
+
+Best to go to the Go types:
+https://pkg.go.dev/github.com/slok/sloth@v0.6.0/pkg/prometheus/api/v1#section-readme
+
+For pageserver latency SLOs, we want the "Raw SLI" type of SLI, not the one that is based on events.
+Search for `error_ratio_query`; example: https://sloth.dev/examples/default/raw-sli/
+
+Use victoriametrics `histogram_share` to compute the error ratio.
+It's the inverse of histogram_quantile.
+https://docs.victoriametrics.com/MetricsQL.html#histogram_share
+
+`share_le_over_time` seems also useful
+https://docs.victoriametrics.com/MetricsQL.html#share_le_over_time
+
+https://stackoverflow.com/questions/72559302/is-it-possible-to-calculate-ranks-of-metrics?rq=1
+
+Problem with the VictoriaMetrics-only functions is that sloth has an internal validation pass:
+https://github.com/slok/sloth/issues/510
+Option to skip the check:
+https://github.com/slok/sloth/pull/511
+=>
+```
+git submodule update --init
+pushd sloth.git
+make build
+popd
+sloth.git/bin/sloth-linux-amd64 generate  \
+    --disable-promExpr-validation \
+    --input ./spec.sloth.yml \
+    --out generated.prometheus.rules.yml
+```
+
+## Notes On How To Scale The Process To Multiple Teams / Automate Sloth In Neon
+
+* SLO directory discovery: https://sloth.dev/usage/cli/
+  * allows using directories instead of individual files as input
--- a/pageserver/slo/pageserver-slo.dev.sloth.yaml
+++ b/pageserver/slo/pageserver-slo.dev.sloth.yaml
@@ -0,0 +1,4 @@
+version: "prometheus/v1"
+service: "pageserver"
+slos: []
+
--- a/pageserver/slo/pageserver-slo.prod.sloth.yaml
+++ b/pageserver/slo/pageserver-slo.prod.sloth.yaml
@@ -0,0 +1,63 @@
+version: "prometheus/v1"
+service: "pageserver"
+#labels:
+#  owner: "pageserver-team"
+#  repo: "myorg/myservice"
+#  tier: "2"
+slos:
+  # # We allow failing (5xx and 429) 1 request every 1000 requests (99.9%).
+  # - name: "requests-availability"
+  #   objective: 99.9
+  #   description: "Common SLO based on availability for HTTP request responses."
+  #   sli:
+  #     events:
+  #       error_query: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}]))
+  #       total_query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}]))
+  #   alerting:
+  #     name: MyServiceHighErrorRate
+  #     labels:
+  #       category: "availability"
+  #     annotations:
+  #       # Overwrite default Sloth SLO alert summary on ticket and page alerts.
+  #       summary: "High error rate on 'myservice' requests responses"
+  #     page_alert:
+  #       labels:
+  #         severity: pageteam
+  #         routing_key: myteam
+  #     ticket_alert:
+  #       labels:
+  #         severity: "slack"
+  #         slack_channel: "#alerts-myteam"
+
+  - name: "basebackup_ms-latency"
+    description: "basebackup_ms latency should be less than 0.2s for 99.9% of requests"
+    objective: 99.9
+    #labels:
+    sli:
+      raw:
+        # histogram_share is VictoriaMetrics-only (fails sloth's validation pass unless disabled).
+        # It yields the share of requests at or below 200 (the good ratio), so the error ratio
+        # is 1 minus it; {{.window}} lets sloth fill in each window instead of a fixed 5m.
+        error_ratio_query: |
+          1 - histogram_share(200, sum by (le) (rate(compute_basebackup_ms_bucket[{{.window}}])))
+        # Alternative: https://docs.victoriametrics.com/MetricsQL.html#share_le_over_time
+        # Prometheus has a native one, not supported by VictoriaMetrics validation pass
+        #error_ratio_query: |
+        #  histogram_fraction(0, 200, compute_basebackup_ms_bucket[{{.window}}])
+        # error_ratio_query: |
+        #  histogram_quantile(0.999, sum by (le) (rate(compute_basebackup_ms_bucket[{{.window}}])))
+    alerting:
+      name: "basebackup_ms-latency"
+      labels:
+        category: "latency"
+      annotations: {}
+      page_alert:
+        labels:
+          severity: "pageteam"
+          routing_key: "myteam"
+      ticket_alert:
+        labels:
+          severity: "slack"
+          slack_channel: "#alerts-myteam"
+
+
--- a/pageserver/slo/sloth.git
+++ b/pageserver/slo/sloth.git