diff --git a/.gitmodules b/.gitmodules index 081a404135..fea1ffc882 100644 --- a/.gitmodules +++ b/.gitmodules @@ -6,3 +6,6 @@ path = vendor/postgres-v15 url = https://github.com/neondatabase/postgres.git branch = REL_15_STABLE_neon +[submodule "pageserver/slo/sloth.git"] + path = pageserver/slo/sloth.git + url = https://github.com/neondatabase/sloth.git diff --git a/pageserver/slo/.gitignore b/pageserver/slo/.gitignore new file mode 100644 index 0000000000..75b60b4e94 --- /dev/null +++ b/pageserver/slo/.gitignore @@ -0,0 +1 @@ +*.sloth-output.yaml diff --git a/pageserver/slo/Makefile b/pageserver/slo/Makefile new file mode 100644 index 0000000000..0772422fc4 --- /dev/null +++ b/pageserver/slo/Makefile @@ -0,0 +1,34 @@ + +# Sloth binary, built from the sloth.git submodule in this directory. +BINARY := sloth.git/bin/sloth-linux-amd64 + +# Delete a half-written output when sloth fails, so it is never mistaken for an up-to-date target. +.DELETE_ON_ERROR: + +.PHONY: all +all: pageserver-slo.dev.sloth-output.yaml pageserver-slo.prod.sloth-output.yaml + +# Fresh checkout: fetch the submodule and build the binary (the manual steps listed in README.md). +# The rule has no prerequisites, so it never re-runs once the binary exists. +$(BINARY): + git submodule update --init sloth.git + $(MAKE) -C sloth.git build + +# Render <name>.sloth.yaml into <name>.sloth-output.yaml. +# Expression validation is disabled because the specs use VictoriaMetrics-only functions (see README.md). +%.sloth-output.yaml: %.sloth.yaml $(BINARY) + $(BINARY) generate \ + --disable-promExpr-validation \ + --input $< \ + --out $@ + +# Directory the sync target copies into; the default is a placeholder, override it on the command line. +CLOUD_GIT_CHECKOUT := /nonexistent + +# Copy the generated dev and prod rules into their directories under CLOUD_GIT_CHECKOUT. +.PHONY: sync-to-cloud-git +sync-to-cloud-git: pageserver-slo.dev.sloth-output.yaml pageserver-slo.prod.sloth-output.yaml + cp pageserver-slo.dev.sloth-output.yaml \ + $(CLOUD_GIT_CHECKOUT)/ops/infra/workloads/values/dev-eu-central-1-alpha/neon-vm/pageserver-slo.dev.sloth-output.yaml + cp pageserver-slo.prod.sloth-output.yaml \ + $(CLOUD_GIT_CHECKOUT)/ops/infra/workloads/values/prod-eu-central-1-gamma/neon-vm/pageserver-slo.prod.sloth-output.yaml diff --git a/pageserver/slo/README.md b/pageserver/slo/README.md new file mode 100644 index 0000000000..ed77999586 --- /dev/null +++ b/pageserver/slo/README.md @@ -0,0 +1,83 @@ + +## Install Sloth + +https://sloth.dev/introduction/install/ + +```sh +wget https://github.com/slok/sloth/releases/download/v0.11.0/sloth-linux-amd64 +chmod +x ./sloth-linux-amd64 +``` + +## Background on Sloth + +https://sloth.dev/introduction/ +https://sloth.dev/introduction/architecture/ + + +## Generate Prometheus Rules 
From Sloth Spec + +``` +./sloth-linux-amd64 generate --input ./spec.sloth.yml --out generated.prometheus.rules.yml +``` + +## Background reading: + +SRE workbook chapter on "Implementing SLOs", section "Calculating the SLIs" + +https://sre.google/workbook/implementing-slos/ + +Citation: + +``` +Availability + + sum(rate(http_requests_total{host="api", status!~"5.."}[7d])) + / + sum(rate(http_requests_total{host="api"}[7d]) + +Latency + + histogram_quantile(0.9, rate(http_request_duration_seconds_bucket[7d])) + + histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[7d])) +``` + +## Sloth Rule Syntax + +It's under-documented. + +Best to go to the Go types: +https://pkg.go.dev/github.com/slok/sloth@v0.6.0/pkg/prometheus/api/v1#section-readme + +For pageserver latency SLOs, we want the "Raw SLI" type of SLI, not the one that is based on events. +Search for `error_ratio_query`; example: https://sloth.dev/examples/default/raw-sli/ + +Use VictoriaMetrics `histogram_share` to compute the error ratio. +It's the inverse of `histogram_quantile`. 
+https://docs.victoriametrics.com/MetricsQL.html#histogram_share + +`share_le_over_time` also seems useful +https://docs.victoriametrics.com/MetricsQL.html#share_le_over_time + +https://stackoverflow.com/questions/72559302/is-it-possible-to-calculate-ranks-of-metrics?rq=1 + +The problem with the VictoriaMetrics-only functions is that sloth has an internal validation pass: +https://github.com/slok/sloth/issues/510 +Option to skip the check: +https://github.com/slok/sloth/pull/511 +=> +``` +git submodule update --init +pushd sloth.git +make build +popd +sloth.git/bin/sloth-linux-amd64 generate \ + --disable-promExpr-validation \ + --input ./spec.sloth.yml \ + --out generated.prometheus.rules.yml +``` + +## Notes On How To Scale The Process To Multiple Teams / Automate Sloth In Neon + +* SLO directory discovery: https://sloth.dev/usage/cli/ + * allows using directories instead of individual files as input diff --git a/pageserver/slo/pageserver-slo.dev.sloth.yaml b/pageserver/slo/pageserver-slo.dev.sloth.yaml new file mode 100644 index 0000000000..fdbce8e21b --- /dev/null +++ b/pageserver/slo/pageserver-slo.dev.sloth.yaml @@ -0,0 +1,4 @@ +version: "prometheus/v1" +service: "pageserver" +slos: [] + diff --git a/pageserver/slo/pageserver-slo.prod.sloth.yaml b/pageserver/slo/pageserver-slo.prod.sloth.yaml new file mode 100644 index 0000000000..46c67913de --- /dev/null +++ b/pageserver/slo/pageserver-slo.prod.sloth.yaml @@ -0,0 +1,63 @@ +version: "prometheus/v1" +service: "pageserver" +#labels: +# owner: "pageserver-team" +# repo: "myorg/myservice" +# tier: "2" +slos: + # # We allow failing (5xx and 429) 1 request every 1000 requests (99.9%). + # - name: "requests-availability" + # objective: 99.9 + # description: "Common SLO based on availability for HTTP request responses." 
+ # sli: + # events: + # error_query: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}])) + # total_query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}])) + # alerting: + # name: MyServiceHighErrorRate + # labels: + # category: "availability" + # annotations: + # # Overwrite default Sloth SLO alert summary on ticket and page alerts. + # summary: "High error rate on 'myservice' requests responses" + # page_alert: + # labels: + # severity: pageteam + # routing_key: myteam + # ticket_alert: + # labels: + # severity: "slack" + # slack_channel: "#alerts-myteam" + + - name: "basebackup_ms-latency" + description: "basebackup_ms latency should be less than 0.2s for 99.9% of requests" + objective: 99.9 + #labels: + sli: + raw: + # VictoriaMetrics-only (fails sloth's validation pass, hence --disable-promExpr-validation): histogram_share(200, ...) is the share of requests at or below 200 (ms) in the window, so 1 - share is the error ratio. + error_ratio_query: | + 1 - histogram_share(200, sum by (le) (increase(compute_basebackup_ms_bucket[{{.window}}]))) + # error_ratio_query: | + # https://docs.victoriametrics.com/MetricsQL.html#share_le_over_time + # + # Prometheus has a native one, not supported by VictoriaMetrics validation pass + #error_ratio_query: | + # histogram_fraction(0, 200, compute_basebackup_ms_bucket[{{.window}}]) + # error_ratio_query: | + # histogram_quantile(0.999, sum by (le) (rate(compute_basebackup_ms_bucket[{{.window}}]))) + alerting: + name: "basebackup_ms-latency" + labels: + category: "latency" + annotations: {} + page_alert: + labels: + severity: "pageteam" + routing_key: "myteam" + ticket_alert: + labels: + severity: "slack" + slack_channel: "#alerts-myteam" + + diff --git a/pageserver/slo/sloth.git b/pageserver/slo/sloth.git new file mode 160000 index 0000000000..fcbbc5e34f --- /dev/null +++ b/pageserver/slo/sloth.git @@ -0,0 +1 @@ +Subproject commit fcbbc5e34f116d22aea99b17c1343fc3b5c03690