chore(ci): collect monitor logs and traces on fuzz test failures (#7728)

* feat: fuzz tests monitoring

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: resource limit

Signed-off-by: WenyXu <wenymedia@gmail.com>

* refactor: minor

Signed-off-by: WenyXu <wenymedia@gmail.com>

* dump create table statement

Signed-off-by: WenyXu <wenymedia@gmail.com>

* modify standalone storage size

Signed-off-by: WenyXu <wenymedia@gmail.com>

---------

Signed-off-by: WenyXu <wenymedia@gmail.com>
This commit is contained in:
Weny Xu
2026-02-27 15:54:01 +08:00
committed by GitHub
parent 6b54fb6c21
commit e6abea1b3c
4 changed files with 185 additions and 1 deletions

View File

@@ -24,6 +24,8 @@ inputs:
description: "Etcd endpoints"
values-filename:
default: "with-minio.yaml"
base-values-filename:
default: "with-observability.yaml"
enable-region-failover:
default: false
@@ -49,6 +51,9 @@ runs:
- name: Install GreptimeDB cluster
shell: bash
run: |
BASE_VALUES="./.github/actions/setup-greptimedb-cluster/${{ inputs.base-values-filename }}"
SCENARIO_VALUES="./.github/actions/setup-greptimedb-cluster/${{ inputs.values-filename }}"
helm upgrade \
--install my-greptimedb \
--set 'meta.backendStorage.etcd.endpoints[0]=${{ inputs.etcd-endpoints }}' \
@@ -66,7 +71,8 @@ runs:
greptime/greptimedb-cluster \
--create-namespace \
-n my-greptimedb \
--values ./.github/actions/setup-greptimedb-cluster/${{ inputs.values-filename }} \
--values "${BASE_VALUES}" \
--values "${SCENARIO_VALUES}" \
--wait \
--wait-for-jobs
- name: Wait for GreptimeDB

View File

@@ -0,0 +1,28 @@
# Values that turn on trace export and the monitoring stack for the test cluster.
# NOTE(review): presumably the base values file passed first to `helm upgrade --values` — confirm the filename.
tracing:
enabled: true
# Traces go to the monitor standalone service (host `my-greptimedb-monitor-standalone.my-greptimedb`,
# port 4000, OTLP traces path).
endpoint: "http://my-greptimedb-monitor-standalone.my-greptimedb:4000/v1/otlp/v1/traces"
# Quoted so the ratio stays a string; "1.0" presumably means every trace is sampled — TODO confirm.
sampleRatio: "1.0"
monitoring:
enabled: true
standalone:
base:
main:
# CPU/memory requests and limits for the monitor standalone's main container.
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 500m
memory: 256Mi
# Size of the file-system storage used by the monitor standalone instance.
datanodeStorage:
fs:
storageSize: 5Gi
vector:
# CPU/memory requests and limits for the vector component.
resources:
requests:
cpu: "50m"
memory: "64Mi"
limits:
cpu: "500m"
memory: "256Mi"

View File

@@ -0,0 +1,114 @@
#!/usr/bin/env bash
#
# Settings for the fuzz-test monitor artifact collector.
#
# Every GT_* variable may be overridden from the environment; the values
# below are only applied when the variable is unset or empty.

# Strict mode: abort on errors, on unset variables and on failed pipelines.
set -euo pipefail

# Kubernetes namespace and GreptimeDB cluster name.
: "${GT_FUZZ_NS:=my-greptimedb}"
: "${GT_FUZZ_CLUSTER:=my-greptimedb}"
# Local port that is forwarded to the monitor's HTTP port.
: "${GT_MONITOR_HTTP_LOCAL_PORT:=14000}"
# Local directory that receives the collected artifacts.
: "${GT_MONITOR_ARTIFACT_DIR:=/tmp/fuzz-monitor-dumps}"
# Directory inside the monitor pod that the exports are written to.
: "${GT_MONITOR_SERVER_EXPORT_DIR:=/tmp/gt-monitor-dump}"

# Names of the monitor service and its pod, derived from the cluster name.
MONITOR_SERVICE="${GT_FUZZ_CLUSTER}-monitor-standalone"
MONITOR_POD="${MONITOR_SERVICE}-0"

# Per-concern log files, all kept next to the artifacts.
PORT_FORWARD_LOG="${GT_MONITOR_ARTIFACT_DIR}/port-forward.log"
SQL_LOG="${GT_MONITOR_ARTIFACT_DIR}/sql.log"
COPY_LOG="${GT_MONITOR_ARTIFACT_DIR}/copy.log"
STATE_LOG="${GT_MONITOR_ARTIFACT_DIR}/state.log"
# Print a prefixed message to stdout and append the same line to STATE_LOG.
# All arguments are joined into a single message.
log() {
  local message="$*"
  printf '[collect-fuzz-monitor-artifacts] %s\n' "${message}" | tee -a "${STATE_LOG}"
}
# Stop the background port-forward, if one was started.
# Safe to call when PORT_FORWARD_PID is unset or empty; never fails.
cleanup() {
  local pid="${PORT_FORWARD_PID:-}"
  if [ -z "${pid}" ]; then
    return 0
  fi
  # The process may already be gone; that is not an error.
  kill "${pid}" >/dev/null 2>&1 || true
}
# Run one SQL statement against the monitor instance through the forwarded
# HTTP port.
#
# Arguments:
#   $1 - database name
#   $2 - SQL text
# Side effects:
#   Appends the raw HTTP response to SQL_LOG.
# Returns:
#   0 on success; 1 if the request itself fails or the response contains an
#   "error" field.
exec_sql() {
  local db="$1"
  local sql="$2"
  local output

  log "execute sql on db=${db}: ${sql}"

  # Handle curl failures explicitly so the function also reports them when it
  # is called from a context where `set -e` is ignored (if / && / ||).
  if ! output="$(curl -sS -G "http://127.0.0.1:${GT_MONITOR_HTTP_LOCAL_PORT}/v1/sql" \
    --data-urlencode "db=${db}" \
    --data-urlencode "sql=${sql}")"; then
    log "sql request failed on db=${db}"
    return 1
  fi
  printf '%s\n' "${output}" >>"${SQL_LOG}"

  # Match in-process instead of `printf | grep -q`: with `pipefail`, grep -q
  # exiting early can SIGPIPE the writer on a large response, which makes the
  # pipeline "fail" and the error go unnoticed.
  if [[ "${output}" == *'"error"'* ]]; then
    log "sql failed: ${output}"
    return 1
  fi
  return 0
}
# Save the CREATE TABLE statement of a table in the `public` database to
# "${GT_MONITOR_ARTIFACT_DIR}/<table>.show_create_table.sql".
#
# Arguments:
#   $1 - table name
# Side effects:
#   Appends the raw HTTP response to SQL_LOG and writes the output file.
# Returns:
#   0 on success; 1 if the request fails, the response contains an "error"
#   field, or the statement cannot be extracted from the response.
export_show_create_table() {
  local table="$1"
  local output_file="${GT_MONITOR_ARTIFACT_DIR}/${table}.show_create_table.sql"
  local output

  log "export SHOW CREATE TABLE for ${table}"

  # Handle curl failures explicitly so they are reported even when the caller
  # runs this function in a context where `set -e` is ignored.
  if ! output="$(curl -sS -G "http://127.0.0.1:${GT_MONITOR_HTTP_LOCAL_PORT}/v1/sql" \
    --data-urlencode "db=public" \
    --data-urlencode "sql=SHOW CREATE TABLE ${table};")"; then
    log "show create table request failed for ${table}"
    return 1
  fi
  printf '%s\n' "${output}" >>"${SQL_LOG}"

  # In-process match avoids the `pipefail` + `grep -q` early-exit hazard.
  if [[ "${output}" == *'"error"'* ]]; then
    log "show create table failed for ${table}: ${output}"
    return 1
  fi

  # The statement is read from the second column of the first returned row.
  if ! python3 -c 'import json,sys; d=json.load(sys.stdin); print(d["output"][0]["records"]["rows"][0][1])' \
    <<<"${output}" >"${output_file}"; then
    log "could not extract CREATE TABLE statement for ${table}"
    # Do not leave an empty, misleading artifact behind.
    rm -f "${output_file}"
    return 1
  fi
  return 0
}
# Collect everything for one table: its CREATE TABLE statement and a parquet
# export that is written inside the monitor pod and then copied out.
#
# Arguments:
#   $1 - table name in the `public` database
collect_table_dump() {
  local table="$1"
  local remote_file="${GT_MONITOR_SERVER_EXPORT_DIR}/${table}.parquet"

  export_show_create_table "${table}"
  exec_sql "public" "COPY ${table} TO '${remote_file}' WITH (FORMAT='parquet');"
  log "copy ${table}.parquet from pod"
  kubectl cp "${GT_FUZZ_NS}/${MONITOR_POD}:${remote_file}" "${GT_MONITOR_ARTIFACT_DIR}/${table}.parquet" >>"${COPY_LOG}" 2>&1
}

# Start from an empty artifact directory. `:?` aborts instead of expanding to
# "/*" should the variable ever be empty.
mkdir -p "${GT_MONITOR_ARTIFACT_DIR}"
rm -rf "${GT_MONITOR_ARTIFACT_DIR:?}/"*

# Record the effective settings at the top of the state log.
{
  echo "namespace=${GT_FUZZ_NS}"
  echo "cluster=${GT_FUZZ_CLUSTER}"
  echo "service=${MONITOR_SERVICE}"
  echo "pod=${MONITOR_POD}"
  echo "http_port=${GT_MONITOR_HTTP_LOCAL_PORT}"
} >"${STATE_LOG}"

# Install the trap before launching the background process so it can never be
# left running; cleanup is a no-op while PORT_FORWARD_PID is still unset.
trap cleanup EXIT

log "start port-forward service/${MONITOR_SERVICE} in namespace ${GT_FUZZ_NS}"
kubectl port-forward "service/${MONITOR_SERVICE}" "${GT_MONITOR_HTTP_LOCAL_PORT}:4000" -n "${GT_FUZZ_NS}" >"${PORT_FORWARD_LOG}" 2>&1 &
PORT_FORWARD_PID="$!"
log "port-forward pid=${PORT_FORWARD_PID}"

log "wait for port-forward bootstrap"
for i in {1..30}; do
  if curl -s --fail "http://127.0.0.1:${GT_MONITOR_HTTP_LOCAL_PORT}/health" &>/dev/null; then
    log "port-forward is ready"
    break
  fi
  # Fail fast when kubectl has already exited instead of polling a dead
  # forward until the timeout.
  if ! kill -0 "${PORT_FORWARD_PID}" >/dev/null 2>&1; then
    log "port-forward exited unexpectedly; see ${PORT_FORWARD_LOG}"
    exit 1
  fi
  if [ "$i" -eq 30 ]; then
    log "Timed out waiting for port-forward to be ready."
    exit 1
  fi
  sleep 1
done

log "ensure export dir exists in pod ${MONITOR_POD}"
kubectl exec -n "${GT_FUZZ_NS}" "${MONITOR_POD}" -- mkdir -p "${GT_MONITOR_SERVER_EXPORT_DIR}" >>"${COPY_LOG}" 2>&1

# Collect each table independently so that a failure for one table (for
# example a table that does not exist) does not lose the other table's dump.
failed_count=0
failed_tables=""
for table in _gt_logs opentelemetry_traces; do
  # Run in a subshell with errexit re-enabled: the first failing command stops
  # this table only. Errexit is switched off around the subshell (rather than
  # using `if`/`||`) because bash ignores `set -e` inside those contexts.
  set +e
  (
    set -e
    collect_table_dump "${table}"
  )
  rc=$?
  set -e
  if [ "${rc}" -ne 0 ]; then
    log "failed to collect ${table} (exit code ${rc}); continuing"
    failed_count=$((failed_count + 1))
    failed_tables="${failed_tables} ${table}"
  fi
done

ls -la "${GT_MONITOR_ARTIFACT_DIR}" >>"${STATE_LOG}" 2>&1

# Still report failure to the caller when anything could not be collected.
if [ "${failed_count}" -gt 0 ]; then
  log "artifact collection incomplete; failed tables:${failed_tables}"
  exit 1
fi
log "artifacts collected under ${GT_MONITOR_ARTIFACT_DIR}"

View File

@@ -430,6 +430,24 @@ jobs:
name: fuzz-tests-kind-logs-${{ matrix.mode.name }}-${{ matrix.target }}
path: /tmp/kind
retention-days: 3
# Runs only when an earlier step in the job has failed: invokes the collector
# script, which writes its output under GT_MONITOR_ARTIFACT_DIR.
- name: Collect monitor dumps
if: failure()
shell: bash
env:
GT_FUZZ_NS: my-greptimedb
GT_FUZZ_CLUSTER: my-greptimedb
GT_MONITOR_HTTP_LOCAL_PORT: 14000
GT_MONITOR_ARTIFACT_DIR: /tmp/fuzz-monitor-dumps
run: |
bash .github/scripts/collect-fuzz-monitor-artifacts.sh
# Uploads the directory written by the step above (same path as
# GT_MONITOR_ARTIFACT_DIR). An empty directory only produces a warning.
- name: Upload monitor dumps
if: failure()
uses: actions/upload-artifact@v4
with:
name: fuzz-tests-monitor-dumps-${{ matrix.mode.name }}-${{ matrix.target }}
path: /tmp/fuzz-monitor-dumps
if-no-files-found: warn
retention-days: 3
- name: Delete cluster
if: success()
shell: bash
@@ -584,6 +602,24 @@ jobs:
name: fuzz-tests-kind-logs-${{ matrix.mode.name }}-${{ matrix.target }}
path: /tmp/kind
retention-days: 3
# Runs only when an earlier step in the job has failed: invokes the collector
# script, which writes its output under GT_MONITOR_ARTIFACT_DIR.
- name: Collect monitor dumps
if: failure()
shell: bash
env:
GT_FUZZ_NS: my-greptimedb
GT_FUZZ_CLUSTER: my-greptimedb
GT_MONITOR_HTTP_LOCAL_PORT: 14000
GT_MONITOR_ARTIFACT_DIR: /tmp/fuzz-monitor-dumps
run: |
bash .github/scripts/collect-fuzz-monitor-artifacts.sh
# Uploads the directory written by the step above (same path as
# GT_MONITOR_ARTIFACT_DIR). An empty directory only produces a warning.
- name: Upload monitor dumps
if: failure()
uses: actions/upload-artifact@v4
with:
name: fuzz-tests-monitor-dumps-${{ matrix.mode.name }}-${{ matrix.target }}
path: /tmp/fuzz-monitor-dumps
if-no-files-found: warn
retention-days: 3
- name: Delete cluster
if: success()
shell: bash