ci: notify jsonbench result (#8273)

Signed-off-by: luofucong <luofc@foxmail.com>
2026-07-03 20:40:37 +00:00 · 2026-06-10 16:51:37 +08:00
parent 05c4588f90
commit 962990009c
3 changed files with 490 additions and 1 deletions
--- a/.github/scripts/find-previous-workflow-artifact.sh
+++ b/.github/scripts/find-previous-workflow-artifact.sh
@@ -0,0 +1,184 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+# Print the command-line help text to stdout.
+usage() {
+  cat <<'USAGE'
+Find the most recent previous successful workflow run that has a non-expired artifact.
+
+Usage:
+  find-previous-workflow-artifact.sh --workflow-path PATH --artifact-name NAME [options]
+
+Options:
+  --repo OWNER/REPO          GitHub repository. Defaults to GITHUB_REPOSITORY.
+  --current-run-id ID        Current workflow run id to exclude. Defaults to GITHUB_RUN_ID.
+  --workflow-path PATH       Workflow path, for example .github/workflows/nightly-jsonbench.yaml.
+  --artifact-name NAME       Artifact name to find.
+  --status STATUS            Workflow run status filter. Defaults to success.
+  --per-page N               GitHub API page size. Defaults to 100.
+  --run-id-only              Print only the run id. This is the default.
+  --artifact-id-only         Print only the artifact id.
+  --json                     Print a JSON object with run_id and artifact_id.
+  --debug                    Print GitHub API requests and responses to stderr.
+  -h, --help                 Show this help.
+
+The script uses gh CLI and jq. Provide GH_TOKEN or authenticate gh before running it.
+USAGE
+}
+
+# Option defaults. The repository and the run id to exclude fall back to the
+# GITHUB_REPOSITORY / GITHUB_RUN_ID environment variables when they are set.
+repo=${GITHUB_REPOSITORY:-}
+current_run_id=${GITHUB_RUN_ID:-}
+workflow_path=
+artifact_name=
+status=success
+per_page=100
+# One of: run_id, artifact_id, json.
+output_format=run_id
+debug=false
+
+# Write its arguments as a single "[debug] ..." line to stderr, but only when
+# the debug flag is "true"; otherwise do nothing.
+debug_log() {
+  [[ "${debug}" == "true" ]] || return 0
+  printf '[debug] %s\n' "$*" >&2
+}
+
+# Replay the stderr captured in ${err_file} through debug_log, one line at a
+# time, then truncate the file so the next command starts with an empty
+# capture. Does nothing unless debug is "true" and the file is non-empty.
+log_stderr_file() {
+  if [[ "${debug}" != "true" || ! -s "${err_file}" ]]; then
+    return
+  fi
+
+  local line
+  # IFS= preserves leading/trailing whitespace; the `|| [[ -n ... ]]` clause
+  # also emits a final line that has no trailing newline.
+  while IFS= read -r line || [[ -n "${line}" ]]; do
+    debug_log "stderr: ${line}"
+  done < "${err_file}"
+  : > "${err_file}"
+}
+
+# Abort with a usage error (exit 2) when an option that takes a value is the
+# last argument. Without this check, reading "$2" under `set -u` stops the
+# script with an opaque "unbound variable" message.
+require_value() {
+  if [[ $# -lt 2 ]]; then
+    echo "Missing value for $1" >&2
+    usage >&2
+    exit 2
+  fi
+}
+
+# Parse command-line options. Value options consume two arguments, flags one.
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --repo)
+      require_value "$@"
+      repo="$2"
+      shift 2
+      ;;
+    --current-run-id)
+      require_value "$@"
+      current_run_id="$2"
+      shift 2
+      ;;
+    --workflow-path)
+      require_value "$@"
+      workflow_path="$2"
+      shift 2
+      ;;
+    --artifact-name)
+      require_value "$@"
+      artifact_name="$2"
+      shift 2
+      ;;
+    --status)
+      require_value "$@"
+      status="$2"
+      shift 2
+      ;;
+    --per-page)
+      require_value "$@"
+      per_page="$2"
+      shift 2
+      ;;
+    --run-id-only)
+      output_format="run_id"
+      shift
+      ;;
+    --artifact-id-only)
+      output_format="artifact_id"
+      shift
+      ;;
+    --json)
+      output_format="json"
+      shift
+      ;;
+    --debug)
+      debug="true"
+      shift
+      ;;
+    -h | --help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown argument: $1" >&2
+      usage >&2
+      exit 2
+      ;;
+  esac
+done
+
+# Validate the required inputs; each missing value is a usage error (exit 2).
+[[ -n "${repo}" ]] || {
+  echo "--repo is required when GITHUB_REPOSITORY is not set." >&2
+  exit 2
+}
+
+[[ -n "${workflow_path}" ]] || {
+  echo "--workflow-path is required." >&2
+  exit 2
+}
+
+[[ -n "${artifact_name}" ]] || {
+  echo "--artifact-name is required." >&2
+  exit 2
+}
+
+# Scratch file that captures stderr of the API calls; removed on any exit.
+err_file=$(mktemp)
+trap 'rm -f "${err_file}"' EXIT
+
+# List workflow runs with the requested status and keep the ids of those whose
+# workflow file path matches. The path is handed to jq with --arg rather than
+# spliced into the filter text, so quotes or backslashes in it cannot break or
+# alter the jq program.
+debug_log "request: gh api --method GET repos/${repo}/actions/runs -f status=${status} -f per_page=${per_page} --paginate"
+candidate_run_ids=$(
+  # Best effort: if the request fails, continue with whatever ids were produced
+  # (possibly none); stderr is kept for the debug log.
+  {
+    gh api --method GET "repos/${repo}/actions/runs" \
+      -f "status=${status}" \
+      -f "per_page=${per_page}" \
+      --paginate \
+      --jq '.workflow_runs[] | {id, path}' \
+      | jq -r --arg path "${workflow_path}" 'select(.path == $path) | .id'
+  } 2> "${err_file}" || true
+)
+log_stderr_file
+debug_log "response run ids: ${candidate_run_ids:-<none>}"
+
+# Walk the candidate runs in the order they were listed and stop at the first
+# one that has a non-expired artifact with the requested name.
+while read -r run_id; do
+  if [[ -z "${run_id}" ]] || [[ "${run_id}" == "${current_run_id}" ]]; then
+    debug_log "skip run id: ${run_id:-<empty>}"
+    continue
+  fi
+
+  artifacts_endpoint="repos/${repo}/actions/runs/${run_id}/artifacts"
+  debug_log "request: gh api ${artifacts_endpoint}"
+  # Best effort: a failed request leaves the response empty and the run is skipped.
+  artifacts_response=$(gh api "${artifacts_endpoint}" 2> "${err_file}" || true)
+  log_stderr_file
+  debug_log "response for run ${run_id}: ${artifacts_response}"
+
+  # Take the first matching artifact id; `|| true` tolerates jq failing on an
+  # empty or invalid response, or being cut short by head.
+  artifact_id=$(
+    jq -r --arg name "${artifact_name}" \
+      '.artifacts[]? | select(.name == $name and (.expired | not)) | .id' \
+      <<< "${artifacts_response}" \
+      | head -n 1 || true
+  )
+  debug_log "artifact id for run ${run_id}: ${artifact_id:-<none>}"
+
+  if [[ -n "${artifact_id}" ]]; then
+    case "${output_format}" in
+      run_id)
+        printf '%s\n' "${run_id}"
+        ;;
+      artifact_id)
+        printf '%s\n' "${artifact_id}"
+        ;;
+      json)
+        printf '{"run_id":"%s","artifact_id":"%s"}\n' "${run_id}" "${artifact_id}"
+        ;;
+    esac
+    exit 0
+  fi
+done <<< "${candidate_run_ids}"
+
+debug_log "no previous workflow run with artifact '${artifact_name}' found"
+
+# Only the JSON format prints something when nothing matched; the id-only
+# formats leave stdout empty.
+if [[ "${output_format}" == "json" ]]; then
+  printf '{"run_id":"","artifact_id":""}\n'
+fi
--- a/.github/scripts/jsonbench-summary.py
+++ b/.github/scripts/jsonbench-summary.py
@@ -0,0 +1,217 @@
+# Copyright 2023 Greptime Team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+import argparse
+import ast
+import json
+import pathlib
+import re
+
+
+def read_number(result_dir, patterns):
+    """Return the first number found in files matching ``patterns``, or None.
+
+    Patterns are tried in the given order; for each one, matching files under
+    ``result_dir`` (searched recursively) are visited in sorted path order.
+    The number is returned as the matched text, not converted to a numeric
+    type.
+    """
+    number_re = re.compile(r"\d+(?:\.\d+)?")
+    candidates = (
+        path
+        for pattern in patterns
+        for path in sorted(result_dir.rglob(pattern))
+    )
+    for path in candidates:
+        found = number_re.search(
+            path.read_text(encoding="utf-8", errors="replace")
+        )
+        if found is not None:
+            return found.group(0)
+    return None
+
+
+def format_gb(value):
+    """Render a byte count as decimal gigabytes (10**9 bytes), two decimals.
+
+    Returns "N/A" when ``value`` is None or ``float(value)`` raises ValueError.
+    """
+    if value is not None:
+        try:
+            return f"{float(value) / 1000 / 1000 / 1000:.2f} GB"
+        except ValueError:
+            pass
+    return "N/A"
+
+
+def format_dataset(choice):
+    """Map a dataset choice ("1".."4") to its size label.
+
+    Returns "N/A" for None and "choice <value>" for any unknown choice.
+    """
+    if choice is None:
+        return "N/A"
+
+    labels = {
+        "1": "1M",
+        "2": "10M",
+        "3": "100M",
+        "4": "1000M",
+    }
+    return labels[choice] if choice in labels else f"choice {choice}"
+
+
+def read_runtime_text(result_dir):
+    """Return the concatenated text that holds the query timings.
+
+    Uses every ``*.results_runtime`` file found recursively under
+    ``result_dir``; when there are none, falls back to every ``*.log`` file.
+    Files are joined with newlines in sorted path order.
+    """
+    sources = sorted(result_dir.rglob("*.results_runtime")) or sorted(
+        result_dir.rglob("*.log")
+    )
+    return "\n".join(
+        source.read_text(encoding="utf-8", errors="replace")
+        for source in sources
+    )
+
+
+def parse_query_rows(text):
+    """Extract the cold and hot timing of each query from benchmark output.
+
+    Every line starting with "Running query:" opens the next query (numbered
+    from 1). The first following line that is a bracketed literal list of
+    numbers supplies that query's timings: element 0 is recorded as "cold" and
+    element 1 as "hot". Further elements are ignored and ``None`` entries are
+    skipped.
+
+    Bracketed lines that do not evaluate to a list of numbers (for example a
+    log line such as ``['warn', 'retry']``) are ignored rather than raising,
+    and the search continues on the following lines.
+
+    Returns a list of ``(query_index, label, value)`` tuples, where ``value``
+    is a float.
+    """
+    rows = []
+    query_index = 0
+    has_timings = False
+
+    for line in text.splitlines():
+        stripped = line.strip()
+        if stripped.startswith("Running query:"):
+            query_index += 1
+            has_timings = False
+            continue
+
+        # Only look for timings after a query header and until they are found.
+        if query_index == 0 or has_timings:
+            continue
+        if not (stripped.startswith("[") and stripped.endswith("]")):
+            continue
+
+        try:
+            timings = ast.literal_eval(stripped)
+        except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
+            # literal_eval documents all of these for malformed input.
+            continue
+
+        if not isinstance(timings, list):
+            continue
+
+        # Only the first two entries are used; reject the line when either is
+        # neither a number nor None, so float() below cannot raise.
+        leading = timings[:2]
+        if not all(
+            value is None
+            or (isinstance(value, (int, float)) and not isinstance(value, bool))
+            for value in leading
+        ):
+            continue
+
+        has_timings = True
+        for label, value in zip(("cold", "hot"), leading):
+            if value is not None:
+                rows.append((query_index, label, float(value)))
+
+    return rows
+
+
+def query_rows_to_map(query_rows):
+    """Group ``(query_index, label, value)`` rows by query.
+
+    Returns ``{query_index: {label: value}}``; a later row with the same
+    index and label overwrites an earlier one.
+    """
+    grouped = {}
+    for index, label, value in query_rows:
+        if index not in grouped:
+            grouped[index] = {}
+        grouped[index][label] = value
+    return grouped
+
+
+def format_duration(value):
+    """Format a number with three decimals, or return "N/A" for None."""
+    return "N/A" if value is None else f"{value:.3f}"
+
+
+def format_delta(current, last):
+    """Return the signed percentage change of ``current`` relative to ``last``.
+
+    Yields "N/A" when ``current`` is None or ``last`` is None or zero, and "0"
+    when the change is within 0.1 percent. Otherwise the change is rounded to
+    one decimal with an explicit sign, and a trailing ".0" is dropped.
+    """
+    if current is None or last is None or last == 0:
+        return "N/A"
+
+    change = (current - last) / last * 100
+    if abs(change) <= 0.1:
+        return "0"
+
+    rendered = f"{change:+.1f}"
+    # "+.1f" always yields exactly one decimal digit, so only ".0" can be cut.
+    if rendered.endswith(".0"):
+        rendered = rendered[:-2]
+    return rendered
+
+
+def format_query_table(query_rows, previous_query_rows):
+    """Render a pipe-delimited text table of per-query timings.
+
+    Each query gets one row with its cold and hot value plus the percentage
+    change against the same query in ``previous_query_rows``. Columns are
+    padded to the widest cell; the first column is left-aligned and the
+    others right-aligned. A dashed separator follows the header row.
+    """
+    current = query_rows_to_map(query_rows)
+    previous = query_rows_to_map(previous_query_rows)
+
+    header = ("Query", "Cold (s)", "Cold Last (%)", "Hot (s)", "Hot Last (%)")
+    body = []
+    for query_index in sorted(current):
+        values = current[query_index]
+        baseline = previous.get(query_index, {})
+        body.append(
+            (
+                f"Q{query_index}",
+                format_duration(values.get("cold")),
+                format_delta(values.get("cold"), baseline.get("cold")),
+                format_duration(values.get("hot")),
+                format_delta(values.get("hot"), baseline.get("hot")),
+            )
+        )
+
+    # Widths are measured before the separator exists, so it never widens a column.
+    widths = [
+        max(len(row[column]) for row in [header, *body])
+        for column in range(len(header))
+    ]
+    separator = tuple("-" * width for width in widths)
+
+    lines = []
+    for query, cold, cold_delta, hot, hot_delta in [header, separator, *body]:
+        lines.append(
+            f"| {query:<{widths[0]}} | {cold:>{widths[1]}} | "
+            f"{cold_delta:>{widths[2]}} | {hot:>{widths[3]}} | "
+            f"{hot_delta:>{widths[4]}} |"
+        )
+    return "\n".join(lines)
+
+
+def build_payload(result_dir, previous_result_dir, result, run_url):
+    """Build the notification payload, a dict with a single "text" entry.
+
+    When ``result`` is not "success" the text is a failure notice pointing at
+    ``run_url``. Otherwise it holds a link to the run and a fenced block with
+    the dataset, data size, row count and the query timing table. Timings are
+    compared against ``previous_result_dir`` when that directory is given and
+    exists.
+    """
+    if result != "success":
+        return {"text": f"Nightly JSONBench failed, please check {run_url}."}
+
+    dataset = read_number(result_dir, ["*.dataset"])
+    data_size = read_number(result_dir, ["*.total_size", "*.data_size"])
+    count = read_number(result_dir, ["*.count"])
+
+    query_rows = parse_query_rows(read_runtime_text(result_dir))
+    has_previous = bool(previous_result_dir) and previous_result_dir.exists()
+    previous_query_rows = (
+        parse_query_rows(read_runtime_text(previous_result_dir))
+        if has_previous
+        else []
+    )
+
+    summary = "\n".join(
+        (
+            f"Dataset: {format_dataset(dataset)}",
+            f"Data size: {format_gb(data_size)}",
+            f"Count: {count or 'N/A'}",
+        )
+    )
+    table = format_query_table(query_rows, previous_query_rows)
+    message_lines = [
+        "Nightly JSONBench has completed successfully.",
+        f"<{run_url}|Workflow run>",
+        f"```{summary}\n\n{table}```",
+    ]
+    return {"text": "\n".join(message_lines)}
+
+
+def main():
+    """Parse the command-line arguments and print the payload as JSON."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--result-dir", required=True, type=pathlib.Path)
+    parser.add_argument("--previous-result-dir", type=pathlib.Path)
+    parser.add_argument("--result", required=True)
+    parser.add_argument("--run-url", required=True)
+    options = parser.parse_args()
+
+    payload = build_payload(
+        options.result_dir,
+        options.previous_result_dir,
+        options.result,
+        options.run_url,
+    )
+    print(json.dumps(payload))
+
+
+if __name__ == "__main__":
+    main()