mirror of
https://github.com/neondatabase/neon.git
synced 2025-12-22 21:59:59 +00:00
Performance test for pgvector HNSW index build and queries (#7873)
## Problem

We want to regularly verify the performance of pgvector HNSW parallel index builds and of parallel similarity search using HNSW indexes. The first release that considerably improved index-build parallelism was pgvector 0.7.0, and we want to make sure that we do not regress through our Neon compute VM settings (swap, memory overcommit, Postgres configuration, etc.).

## Summary of changes

Prepare a Neon project with 1 million OpenAI vector embeddings (vector size 1536). Run HNSW indexing operations in the regression test for the various distance metrics. Run similarity queries using pgbench with 100 concurrent clients. The relevant metrics have also been added to the Grafana dashboards pgbench and olap.

---------

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
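A note on preconditions (not part of the diff): since the parallel index-build improvements benchmarked here landed in pgvector 0.7.0, a quick sanity check on the target compute before running the suite might look like this sketch:

```sql
-- Illustrative sanity checks before benchmarking (not part of this PR):
-- confirm the pgvector version in use (parallel HNSW index builds need >= 0.7.0)
SELECT extversion FROM pg_extension WHERE extname = 'vector';
-- confirm the knobs that bound index-build parallelism and memory
SHOW max_parallel_maintenance_workers;
SHOW maintenance_work_mem;
```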
100  .github/workflows/benchmarking.yml  vendored
@@ -38,6 +38,11 @@ on:
         description: 'AWS-RDS and AWS-AURORA normally only run on Saturday. Set this to true to run them on every workflow_dispatch'
         required: false
         default: false
+      run_only_pgvector_tests:
+        type: boolean
+        description: 'Run pgvector tests but no other tests. If not set, all tests including pgvector tests will be run'
+        required: false
+        default: false
 
 defaults:
   run:
@@ -50,6 +55,7 @@ concurrency:
 
 jobs:
   bench:
+    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
     env:
       TEST_PG_BENCH_DURATIONS_MATRIX: "300"
       TEST_PG_BENCH_SCALES_MATRIX: "10,100"
@@ -120,6 +126,7 @@ jobs:
           SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
 
   generate-matrices:
+    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
     # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
     #
     # Available platforms:
@@ -197,6 +204,7 @@ jobs:
           echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
 
   pgbench-compare:
+    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
     needs: [ generate-matrices ]
 
     strategy:
@@ -343,6 +351,92 @@ jobs:
       env:
         SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
 
+  pgbench-pgvector:
+    env:
+      TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
+      TEST_PG_BENCH_SCALES_MATRIX: "1"
+      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+      DEFAULT_PG_VERSION: 16
+      TEST_OUTPUT: /tmp/test_output
+      BUILD_TYPE: remote
+      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
+      PLATFORM: "neon-captest-pgvector"
+
+    runs-on: [ self-hosted, us-east-2, x64 ]
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+      options: --init
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Download Neon artifact
+      uses: ./.github/actions/download
+      with:
+        name: neon-${{ runner.os }}-release-artifact
+        path: /tmp/neon/
+        prefix: latest
+
+    - name: Add Postgres binaries to PATH
+      run: |
+        ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
+        echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
+
+    - name: Set up Connection String
+      id: set-up-connstr
+      run: |
+        CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
+
+        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
+
+        QUERIES=("SELECT version()")
+        QUERIES+=("SHOW neon.tenant_id")
+        QUERIES+=("SHOW neon.timeline_id")
+
+        for q in "${QUERIES[@]}"; do
+          psql ${CONNSTR} -c "${q}"
+        done
+
+    - name: Benchmark pgvector hnsw indexing
+      uses: ./.github/actions/run-python-test-set
+      with:
+        build_type: ${{ env.BUILD_TYPE }}
+        test_selection: performance/test_perf_olap.py
+        run_in_parallel: false
+        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+        extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing
+      env:
+        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
+
+    - name: Benchmark pgvector hnsw queries
+      uses: ./.github/actions/run-python-test-set
+      with:
+        build_type: ${{ env.BUILD_TYPE }}
+        test_selection: performance
+        run_in_parallel: false
+        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+        extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_pgvector
+      env:
+        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
+        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+
+    - name: Create Allure report
+      if: ${{ !cancelled() }}
+      uses: ./.github/actions/allure-report-generate
+
+    - name: Post to a Slack channel
+      if: ${{ github.event.schedule && failure() }}
+      uses: slackapi/slack-github-action@v1
+      with:
+        channel-id: "C033QLM5P7D" # dev-staging-stream
+        slack-message: "Periodic perf testing neon-captest-pgvector: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+      env:
+        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
+
+
   clickbench-compare:
     # ClickBench DB for rds-aurora and rds-Postgres deployed to the same clusters
     # we use for performance testing in pgbench-compare.
@@ -351,7 +445,7 @@ jobs:
     #
     # *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows
     # *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB
-    if: ${{ !cancelled() }}
+    if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
     needs: [ generate-matrices, pgbench-compare ]
 
     strategy:
@@ -455,7 +549,7 @@ jobs:
     # We might change it after https://github.com/neondatabase/neon/issues/2900.
     #
     # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB)
-    if: ${{ !cancelled() }}
+    if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
    needs: [ generate-matrices, clickbench-compare ]
 
     strategy:
@@ -557,7 +651,7 @@ jobs:
           SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
 
   user-examples-compare:
-    if: ${{ !cancelled() }}
+    if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
     needs: [ generate-matrices, tpch-compare ]
 
     strategy:

pyproject.toml
@@ -54,6 +54,7 @@ build-backend = "poetry.core.masonry.api"
 exclude = [
     "^vendor/",
     "^target/",
+    "test_runner/performance/pgvector/loaddata.py",
 ]
 check_untyped_defs = true
 # Help mypy find imports when running against list of individual files.
47  test_runner/performance/pgvector/HNSW_build.sql  Normal file
@@ -0,0 +1,47 @@
+\set ECHO queries
+\timing
+
+-- prepare test table
+DROP TABLE IF EXISTS hnsw_test_table;
+CREATE TABLE hnsw_test_table AS TABLE documents WITH NO DATA;
+INSERT INTO hnsw_test_table SELECT * FROM documents;
+CREATE INDEX ON hnsw_test_table (_id); -- needed later for random tuple queries
+
+-- tune index build params
+SET max_parallel_maintenance_workers = 7;
+SET maintenance_work_mem = '8GB';
+
+-- create HNSW indexes for the supported distance metrics
+CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_cosine_ops);
+CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_ip_ops);
+CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_l1_ops);
+CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_hamming_ops);
+CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_jaccard_ops);
+
+-- note: in a second psql session we can monitor the progress of the index build phases using
+-- the following query:
+-- SELECT phase, round(100.0 * blocks_done / nullif(blocks_total, 0), 1) AS "%" FROM pg_stat_progress_create_index;
+
+-- show all indexes built on the table
+SELECT
+    idx.relname AS index_name,
+    tbl.relname AS table_name,
+    am.amname AS access_method,
+    a.attname AS column_name,
+    opc.opcname AS operator_class
+FROM
+    pg_index i
+JOIN
+    pg_class idx ON idx.oid = i.indexrelid
+JOIN
+    pg_class tbl ON tbl.oid = i.indrelid
+JOIN
+    pg_am am ON am.oid = idx.relam
+JOIN
+    pg_attribute a ON a.attrelid = tbl.oid AND a.attnum = ANY(i.indkey)
+JOIN
+    pg_opclass opc ON opc.oid = i.indclass[0]
+WHERE
+    tbl.relname = 'hnsw_test_table'
+    AND a.attname = 'embeddings';
+
+-- show table sizes
+\dt+
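Not part of the diff: the builds above rely on pgvector's default HNSW graph parameters. A sketch of the same build with those parameters pinned explicitly (m = 16 and ef_construction = 64 are pgvector's documented defaults):

```sql
-- Hypothetical variant of the cosine build above with the HNSW build
-- parameters spelled out; with these values it is equivalent to the
-- plain CREATE INDEX statement in HNSW_build.sql.
CREATE INDEX ON hnsw_test_table
    USING hnsw (embeddings vector_cosine_ops)
    WITH (m = 16, ef_construction = 64);
```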
52  test_runner/performance/pgvector/IVFFLAT_build.sql  Normal file
@@ -0,0 +1,52 @@
+\set ECHO queries
+\timing
+
+-- prepare test table
+DROP TABLE IF EXISTS ivfflat_test_table;
+CREATE TABLE ivfflat_test_table AS TABLE documents WITH NO DATA;
+INSERT INTO ivfflat_test_table SELECT * FROM documents;
+CREATE INDEX ON ivfflat_test_table (_id); -- needed later for random tuple queries
+
+-- tune index build params
+SET max_parallel_maintenance_workers = 7;
+SET maintenance_work_mem = '8GB';
+
+-- create ivfflat indexes for the supported distance metrics
+-- the formula for lists is (# rows / 1000), or sqrt(# rows) if # rows > 1 million
+-- we have 1 million embeddings of vector size 1536 in column embeddings of table documents,
+-- so we use 1000 lists
+CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings vector_l2_ops) WITH (lists = 1000);
+CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings vector_ip_ops) WITH (lists = 1000);
+CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings vector_cosine_ops) WITH (lists = 1000);
+CREATE INDEX ON ivfflat_test_table USING ivfflat ((embeddings::halfvec(1536)) halfvec_l2_ops) WITH (lists = 1000);
+CREATE INDEX ON ivfflat_test_table
+USING ivfflat ((binary_quantize(embeddings)::bit(1536)) bit_hamming_ops) WITH (lists = 1000);
+
+\d ivfflat_test_table
+
+
+-- show all indexes built on the table
+SELECT
+    idx.relname AS index_name,
+    tbl.relname AS table_name,
+    am.amname AS access_method,
+    a.attname AS column_name,
+    opc.opcname AS operator_class
+FROM
+    pg_index i
+JOIN
+    pg_class idx ON idx.oid = i.indexrelid
+JOIN
+    pg_class tbl ON tbl.oid = i.indrelid
+JOIN
+    pg_am am ON am.oid = idx.relam
+JOIN
+    pg_attribute a ON a.attrelid = tbl.oid AND a.attnum = ANY(i.indkey)
+JOIN
+    pg_opclass opc ON opc.oid = i.indclass[0]
+WHERE
+    tbl.relname = 'ivfflat_test_table'
+    AND a.attname = 'embeddings';
+
+-- show table sizes
+\dt+
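Not part of the diff: at query time, ivfflat recall is governed by how many of the 1000 lists are probed. A sketch using pgvector's ivfflat.probes setting (default 1); sqrt(lists) is a common starting point:

```sql
-- Hypothetical query-time tuning for the ivfflat indexes above:
-- probing more lists improves recall at the cost of latency.
SET ivfflat.probes = 32;  -- ~sqrt(1000)
SELECT _id, embeddings <=> (SELECT embeddings FROM ivfflat_test_table LIMIT 1) AS distance
FROM ivfflat_test_table
ORDER BY 2
LIMIT 30;
```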
38  test_runner/performance/pgvector/README.md  Normal file
@@ -0,0 +1,38 @@
+---
+dataset_info:
+  features:
+  - name: _id
+    dtype: string
+  - name: title
+    dtype: string
+  - name: text
+    dtype: string
+  - name: text-embedding-3-large-1536-embedding
+    sequence: float64
+  splits:
+  - name: train
+    num_bytes: 12679725776
+    num_examples: 1000000
+  download_size: 9551862565
+  dataset_size: 12679725776
+configs:
+- config_name: default
+  data_files:
+  - split: train
+    path: data/train-*
+license: mit
+task_categories:
+- feature-extraction
+language:
+- en
+size_categories:
+- 1M<n<10M
+---
+
+1M OpenAI Embeddings: text-embedding-3-large 1536 dimensions
+
+- Created: February 2024.
+- Text used for Embedding: title (string) + text (string)
+- Embedding Model: OpenAI text-embedding-3-large
+- This dataset was generated from the first 1M entries of https://huggingface.co/datasets/BeIR/dbpedia-entity, extracted by @KShivendu_ [here](https://huggingface.co/datasets/KShivendu/dbpedia-entities-openai-1M)
72  test_runner/performance/pgvector/loaddata.py  Normal file
@@ -0,0 +1,72 @@
+import sys
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import psycopg2
+from pgvector.psycopg2 import register_vector
+from psycopg2.extras import execute_values
+
+
+def print_usage():
+    print("Usage: loaddata.py <CONNSTR> <DATADIR>")
+
+
+def main(conn_str, directory_path):
+    # Connection to PostgreSQL
+    with psycopg2.connect(conn_str) as conn:
+        with conn.cursor() as cursor:
+            # Run SQL statements
+            cursor.execute("CREATE EXTENSION IF NOT EXISTS vector;")
+            register_vector(conn)
+            cursor.execute("DROP TABLE IF EXISTS documents;")
+            cursor.execute(
+                """
+                CREATE TABLE documents (
+                    _id TEXT PRIMARY KEY,
+                    title TEXT,
+                    text TEXT,
+                    embeddings vector(1536)  -- text-embedding-3-large-1536-embedding (OpenAI)
+                );
+                """
+            )
+            conn.commit()
+
+            # List and sort Parquet files
+            parquet_files = sorted(Path(directory_path).glob("*.parquet"))
+
+            for file in parquet_files:
+                print(f"Loading {file} into PostgreSQL")
+                df = pd.read_parquet(file)
+
+                print(df.head())
+
+                data_list = [
+                    (
+                        row["_id"],
+                        row["title"],
+                        row["text"],
+                        np.array(row["text-embedding-3-large-1536-embedding"]),
+                    )
+                    for index, row in df.iterrows()
+                ]
+                # Use execute_values to perform batch insertion
+                execute_values(
+                    cursor,
+                    "INSERT INTO documents (_id, title, text, embeddings) VALUES %s",
+                    data_list,
+                )
+                # Commit after we insert all embeddings
+                conn.commit()
+
+                print(f"Loaded {file} into PostgreSQL")
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        print_usage()
+        sys.exit(1)
+
+    conn_str = sys.argv[1]
+    directory_path = sys.argv[2]
+    main(conn_str, directory_path)
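Not part of the diff: once loaddata.py completes (usage: `python loaddata.py <CONNSTR> <DATADIR>`), a quick check that the load matches the dataset card above:

```sql
-- Illustrative post-load sanity checks for the documents table:
SELECT count(*) FROM documents;                          -- expect 1000000
SELECT vector_dims(embeddings) FROM documents LIMIT 1;   -- expect 1536
```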
10  test_runner/performance/pgvector/pgbench_custom_script_pgvector_hsnw_queries.sql  Normal file
@@ -0,0 +1,10 @@
+with x (x) as (
+    select "embeddings" as x
+    from hnsw_test_table
+    TABLESAMPLE SYSTEM (1)
+    LIMIT 1
+)
+SELECT title, "embeddings" <=> (select x from x) as distance
+FROM hnsw_test_table
+ORDER BY 2
+LIMIT 30;
13  test_runner/performance/pgvector/pgbench_hnsw_queries.sql  Normal file
@@ -0,0 +1,13 @@
+-- run with pooled connection
+-- pgbench -T 300 -c 100 -j20 -f pgbench_hnsw_queries.sql "postgresql://neondb_owner:<secret>@ep-floral-thunder-w1gzhaxi-pooler.eu-west-1.aws.neon.build/neondb?sslmode=require"
+
+with x (x) as (
+    select "embeddings" as x
+    from hnsw_test_table
+    TABLESAMPLE SYSTEM (1)
+    LIMIT 1
+)
+SELECT title, "embeddings" <=> (select x from x) as distance
+FROM hnsw_test_table
+ORDER BY 2
+LIMIT 30;
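Not part of the diff: each pgbench transaction samples one stored vector via TABLESAMPLE and runs a top-30 cosine (`<=>`) nearest-neighbor scan, so 100 clients exercise the HNSW index concurrently. If recall rather than raw latency ever needs tuning, pgvector's hnsw.ef_search setting (default 40) is the relevant knob; a sketch:

```sql
-- Hypothetical query-time tuning: widen the HNSW search beam.
-- Larger values improve recall but increase per-query latency.
SET hnsw.ef_search = 100;
```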
test_runner/performance/test_perf_olap.py
@@ -100,6 +100,25 @@ QUERIES: Tuple[LabelledQuery, ...] = (
 )
 # fmt: on
 
+# A list of pgvector HNSW index builds to run.
+# Please do not alter the label for the query, as it is used to identify it.
+#
+# Disable auto formatting for the list of queries so that it's easier to read
+# fmt: off
+PGVECTOR_QUERIES: Tuple[LabelledQuery, ...] = (
+    LabelledQuery("PGV0", r"DROP TABLE IF EXISTS hnsw_test_table;"),
+    LabelledQuery("PGV1", r"CREATE TABLE hnsw_test_table AS TABLE documents WITH NO DATA;"),
+    LabelledQuery("PGV2", r"INSERT INTO hnsw_test_table SELECT * FROM documents;"),
+    LabelledQuery("PGV3", r"CREATE INDEX ON hnsw_test_table (_id);"),
+    LabelledQuery("PGV4", r"CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_cosine_ops);"),
+    LabelledQuery("PGV5", r"CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_ip_ops);"),
+    LabelledQuery("PGV6", r"CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_l1_ops);"),
+    LabelledQuery("PGV7", r"CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_hamming_ops);"),
+    LabelledQuery("PGV8", r"CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_jaccard_ops);"),
+)
+# fmt: on
+
 
 EXPLAIN_STRING: str = "EXPLAIN (ANALYZE, VERBOSE, BUFFERS, COSTS, SETTINGS, FORMAT JSON)"
 
@@ -245,3 +264,18 @@ def test_clickbench_collect_pg_stat_statements(remote_compare: RemoteCompare):
     log.info("Collecting pg_stat_statements")
     query = LabelledQuery("Q_COLLECT_PG_STAT_STATEMENTS", r"SELECT * from pg_stat_statements;")
     run_psql(remote_compare, query, times=1, explain=False)
+
+
+@pytest.mark.parametrize("query", PGVECTOR_QUERIES)
+@pytest.mark.remote_cluster
+def test_pgvector_indexing(query: LabelledQuery, remote_compare: RemoteCompare):
+    """
+    A pgvector test that measures HNSW index build performance and parallelism.
+
+    The DB is prepared manually in advance.
+    See
+    - test_runner/performance/pgvector/README.md
+    - test_runner/performance/pgvector/loaddata.py
+    - test_runner/performance/pgvector/HNSW_build.sql
+    """
+    run_psql(remote_compare, query, times=1, explain=False)
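Not part of the diff: the pgvector index-build queries run with `explain=False`, since the index build time itself is the metric; for the OLAP queries in this file, EXPLAIN_STRING above wraps the query. Purely illustrative, a wrapped query would look like:

```sql
-- Illustrative only: what a query looks like when the runner wraps it
-- with EXPLAIN_STRING (explain=True); the pgvector tests use explain=False.
EXPLAIN (ANALYZE, VERBOSE, BUFFERS, COSTS, SETTINGS, FORMAT JSON)
SELECT count(*) FROM hnsw_test_table;
```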
test_runner/performance/test_pgbench.py
@@ -17,6 +17,7 @@ class PgBenchLoadType(enum.Enum):
     INIT = "init"
     SIMPLE_UPDATE = "simple-update"
     SELECT_ONLY = "select-only"
+    PGVECTOR_HNSW = "pgvector-hnsw"
 
 
 def utc_now_timestamp() -> int:
@@ -132,6 +133,26 @@ def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: P
             password=password,
         )
 
+    if workload_type == PgBenchLoadType.PGVECTOR_HNSW:
+        # Run the pgvector HNSW similarity-search workload with 100 clients
+        run_pgbench(
+            env,
+            "pgvector-hnsw",
+            [
+                "pgbench",
+                "-f",
+                "test_runner/performance/pgvector/pgbench_custom_script_pgvector_hsnw_queries.sql",
+                "-c100",
+                "-j20",
+                f"-T{duration}",
+                "-P2",
+                "--protocol=prepared",
+                "--progress-timestamp",
+                connstr,
+            ],
+            password=password,
+        )
+
     env.report_size()
 
@@ -201,3 +222,13 @@ def test_pgbench_remote_simple_update(remote_compare: PgCompare, scale: int, dur
 @pytest.mark.remote_cluster
 def test_pgbench_remote_select_only(remote_compare: PgCompare, scale: int, duration: int):
     run_test_pgbench(remote_compare, scale, duration, PgBenchLoadType.SELECT_ONLY)
+
+
+# The following test runs on an existing database that has the pgvector extension installed
+# and a table with 1 million embedding vectors loaded and indexed with HNSW.
+#
+# Run this pgbench test against an existing remote Postgres cluster with the necessary setup.
+@pytest.mark.parametrize("duration", get_durations_matrix())
+@pytest.mark.remote_cluster
+def test_pgbench_remote_pgvector(remote_compare: PgCompare, duration: int):
+    run_test_pgbench(remote_compare, 1, duration, PgBenchLoadType.PGVECTOR_HNSW)