Mirror of https://github.com/neondatabase/neon.git (synced 2025-12-22 21:59:59 +00:00)
# Performance test for pgvector HNSW index build and queries (#7873)
## Problem

We want to regularly verify the performance of pgvector HNSW parallel index builds and of parallel similarity search using HNSW indexes. The first release that considerably improved index-build parallelism was pgvector 0.7.0, and we want to make sure that we do not regress through our Neon compute VM settings (swap, memory overcommit, Postgres config, etc.).

## Summary of changes

Prepare a Neon project with 1 million OpenAI vector embeddings (vector size 1536). Run HNSW indexing operations in the regression test for the various distance metrics. Run similarity queries using pgbench with 100 concurrent clients. I have also added the relevant metrics to the Grafana dashboards "pgbench" and "olape".

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
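The query side of the benchmark is a top-k nearest-neighbour search: each pgbench transaction samples a stored embedding as a probe and fetches its 30 nearest neighbours by cosine distance. A minimal sketch of that pattern, mirroring the committed pgbench script below; the `hnsw.ef_search` setting is an assumption added here for illustration (pgvector's query-time knob, default 40) and is not part of the committed scripts:

```sql
-- Top-30 cosine-distance search; pgvector can satisfy the
-- ORDER BY ... <=> ... LIMIT via the HNSW index instead of a sequential scan.
SET hnsw.ef_search = 40;  -- assumed default; raise to trade latency for recall

WITH probe AS (
    SELECT embeddings FROM hnsw_test_table TABLESAMPLE SYSTEM (1) LIMIT 1
)
SELECT title, embeddings <=> (SELECT embeddings FROM probe) AS distance
FROM hnsw_test_table
ORDER BY 2
LIMIT 30;
```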
.github/workflows/benchmarking.yml (vendored, 100 lines changed)

```diff
@@ -38,6 +38,11 @@ on:
       description: 'AWS-RDS and AWS-AURORA normally only run on Saturday. Set this to true to run them on every workflow_dispatch'
       required: false
       default: false
+    run_only_pgvector_tests:
+      type: boolean
+      description: 'Run pgvector tests but no other tests. If not set, all tests including pgvector tests will be run'
+      required: false
+      default: false
 
 defaults:
   run:
@@ -50,6 +55,7 @@ concurrency:
 
 jobs:
   bench:
+    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
     env:
       TEST_PG_BENCH_DURATIONS_MATRIX: "300"
       TEST_PG_BENCH_SCALES_MATRIX: "10,100"
@@ -120,6 +126,7 @@ jobs:
           SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
 
   generate-matrices:
+    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
     # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
     #
     # Available platforms:
@@ -197,6 +204,7 @@ jobs:
           echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
 
   pgbench-compare:
+    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
     needs: [ generate-matrices ]
 
     strategy:
@@ -343,6 +351,92 @@ jobs:
         env:
           SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
 
+  pgbench-pgvector:
+    env:
+      TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
+      TEST_PG_BENCH_SCALES_MATRIX: "1"
+      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+      DEFAULT_PG_VERSION: 16
+      TEST_OUTPUT: /tmp/test_output
+      BUILD_TYPE: remote
+      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
+      PLATFORM: "neon-captest-pgvector"
+
+    runs-on: [ self-hosted, us-east-2, x64 ]
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+      options: --init
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Download Neon artifact
+        uses: ./.github/actions/download
+        with:
+          name: neon-${{ runner.os }}-release-artifact
+          path: /tmp/neon/
+          prefix: latest
+
+      - name: Add Postgres binaries to PATH
+        run: |
+          ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
+          echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
+
+      - name: Set up Connection String
+        id: set-up-connstr
+        run: |
+          CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
+
+          echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
+
+          QUERIES=("SELECT version()")
+          QUERIES+=("SHOW neon.tenant_id")
+          QUERIES+=("SHOW neon.timeline_id")
+
+          for q in "${QUERIES[@]}"; do
+            psql ${CONNSTR} -c "${q}"
+          done
+
+      - name: Benchmark pgvector hnsw indexing
+        uses: ./.github/actions/run-python-test-set
+        with:
+          build_type: ${{ env.BUILD_TYPE }}
+          test_selection: performance/test_perf_olap.py
+          run_in_parallel: false
+          save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+          extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing
+        env:
+          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+          BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
+
+      - name: Benchmark pgvector hnsw queries
+        uses: ./.github/actions/run-python-test-set
+        with:
+          build_type: ${{ env.BUILD_TYPE }}
+          test_selection: performance
+          run_in_parallel: false
+          save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+          extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_pgvector
+        env:
+          BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
+          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+
+      - name: Create Allure report
+        if: ${{ !cancelled() }}
+        uses: ./.github/actions/allure-report-generate
+
+      - name: Post to a Slack channel
+        if: ${{ github.event.schedule && failure() }}
+        uses: slackapi/slack-github-action@v1
+        with:
+          channel-id: "C033QLM5P7D" # dev-staging-stream
+          slack-message: "Periodic perf testing neon-captest-pgvector: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+        env:
+          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
+
+
   clickbench-compare:
     # ClichBench DB for rds-aurora and rds-Postgres deployed to the same clusters
     # we use for performance testing in pgbench-compare.
@@ -351,7 +445,7 @@ jobs:
     #
     # *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows
     # *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB
-    if: ${{ !cancelled() }}
+    if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
     needs: [ generate-matrices, pgbench-compare ]
 
     strategy:
@@ -455,7 +549,7 @@ jobs:
     # We might change it after https://github.com/neondatabase/neon/issues/2900.
     #
     # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB)
-    if: ${{ !cancelled() }}
+    if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
     needs: [ generate-matrices, clickbench-compare ]
 
     strategy:
@@ -557,7 +651,7 @@ jobs:
           SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
 
   user-examples-compare:
-    if: ${{ !cancelled() }}
+    if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
     needs: [ generate-matrices, tpch-compare ]
 
     strategy:
```
pyproject.toml

```diff
@@ -54,6 +54,7 @@ build-backend = "poetry.core.masonry.api"
 exclude = [
     "^vendor/",
     "^target/",
+    "test_runner/performance/pgvector/loaddata.py",
 ]
 check_untyped_defs = true
 # Help mypy find imports when running against list of individual files.
```
test_runner/performance/pgvector/HNSW_build.sql (new file, 47 lines)

```sql
\set ECHO queries
\timing

-- prepare test table
DROP TABLE IF EXISTS hnsw_test_table;
CREATE TABLE hnsw_test_table AS TABLE documents WITH NO DATA;
INSERT INTO hnsw_test_table SELECT * FROM documents;
CREATE INDEX ON hnsw_test_table (_id); -- needed later for random tuple queries

-- tune index build params
SET max_parallel_maintenance_workers = 7;
SET maintenance_work_mem = '8GB';

-- create HNSW index for the supported distance metrics
CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_cosine_ops);
CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_ip_ops);
CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_l1_ops);
CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_hamming_ops);
CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_jaccard_ops);

-- note: in a second psql session we can monitor the progress of the index build phases using
-- the following query:
-- SELECT phase, round(100.0 * blocks_done / nullif(blocks_total, 0), 1) AS "%" FROM pg_stat_progress_create_index;

-- show all indexes built on the table
SELECT
    idx.relname AS index_name,
    tbl.relname AS table_name,
    am.amname AS access_method,
    a.attname AS column_name,
    opc.opcname AS operator_class
FROM
    pg_index i
JOIN
    pg_class idx ON idx.oid = i.indexrelid
JOIN
    pg_class tbl ON tbl.oid = i.indrelid
JOIN
    pg_am am ON am.oid = idx.relam
JOIN
    pg_attribute a ON a.attrelid = tbl.oid AND a.attnum = ANY(i.indkey)
JOIN
    pg_opclass opc ON opc.oid = i.indclass[0]
WHERE
    tbl.relname = 'hnsw_test_table'
    AND a.attname = 'embeddings';

-- show table sizes
\dt+
```
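The two binary-quantize indexes above are expression indexes, so pgvector only uses them when a query orders by the same expression. A hedged sketch of the matching query shape, not part of this commit; `<~>` is pgvector 0.7.0's Hamming-distance operator for `bit`:

```sql
-- Hamming-distance top-30 search over the binary-quantized expression index.
-- The probe is quantized the same way so the ORDER BY expression matches
-- the index definition.
WITH probe AS (
    SELECT embeddings FROM hnsw_test_table TABLESAMPLE SYSTEM (1) LIMIT 1
)
SELECT _id,
       binary_quantize(embeddings)::bit(1536) <~>
           (SELECT binary_quantize(embeddings)::bit(1536) FROM probe) AS distance
FROM hnsw_test_table
ORDER BY 2
LIMIT 30;
```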
test_runner/performance/pgvector/IVFFLAT_build.sql (new file, 52 lines)

```sql
\set ECHO queries
\timing

-- prepare test table
DROP TABLE IF EXISTS ivfflat_test_table;
CREATE TABLE ivfflat_test_table AS TABLE documents WITH NO DATA;
INSERT INTO ivfflat_test_table SELECT * FROM documents;
CREATE INDEX ON ivfflat_test_table (_id); -- needed later for random tuple queries

-- tune index build params
SET max_parallel_maintenance_workers = 7;
SET maintenance_work_mem = '8GB';

-- create ivfflat index for the supported distance metrics
-- the formula for lists is # rows / 1000, or sqrt(# rows) if # rows > 1 million
-- we have 1 million embeddings of vector size 1536 in column embeddings of table documents
-- so we use 1000 lists
CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings vector_l2_ops) WITH (lists = 1000);
CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings vector_ip_ops) WITH (lists = 1000);
CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings vector_cosine_ops) WITH (lists = 1000);
CREATE INDEX ON ivfflat_test_table USING ivfflat ((embeddings::halfvec(1536)) halfvec_l2_ops) WITH (lists = 1000);
CREATE INDEX ON ivfflat_test_table
USING ivfflat ((binary_quantize(embeddings)::bit(1536)) bit_hamming_ops) WITH (lists = 1000);

\d ivfflat_test_table

-- show all indexes built on the table
SELECT
    idx.relname AS index_name,
    tbl.relname AS table_name,
    am.amname AS access_method,
    a.attname AS column_name,
    opc.opcname AS operator_class
FROM
    pg_index i
JOIN
    pg_class idx ON idx.oid = i.indexrelid
JOIN
    pg_class tbl ON tbl.oid = i.indrelid
JOIN
    pg_am am ON am.oid = idx.relam
JOIN
    pg_attribute a ON a.attrelid = tbl.oid AND a.attnum = ANY(i.indkey)
JOIN
    pg_opclass opc ON opc.oid = i.indclass[0]
WHERE
    tbl.relname = 'ivfflat_test_table'
    AND a.attname = 'embeddings';

-- show table sizes
\dt+
```
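At query time, ivfflat recall is governed by how many of the 1000 lists are searched; pgvector exposes this as `ivfflat.probes` (default 1). A hedged example of the knob, not part of the committed script; probing around `sqrt(lists)` is a common starting point:

```sql
-- Search 32 of the 1000 lists per query: slower than the default single probe,
-- but with considerably better recall. <-> is L2 distance (vector_l2_ops index).
SET ivfflat.probes = 32;

SELECT _id, embeddings <-> (SELECT embeddings FROM ivfflat_test_table LIMIT 1) AS distance
FROM ivfflat_test_table
ORDER BY 2
LIMIT 30;
```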
test_runner/performance/pgvector/README.md (new file, 38 lines)

```markdown
---
dataset_info:
  features:
  - name: _id
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  - name: text-embedding-3-large-1536-embedding
    sequence: float64
  splits:
  - name: train
    num_bytes: 12679725776
    num_examples: 1000000
  download_size: 9551862565
  dataset_size: 12679725776
configs:
- config_name: default
  data_files:
  - split: train
    path: data/train-*
license: mit
task_categories:
- feature-extraction
language:
- en
size_categories:
- 1M<n<10M
---

1M OpenAI Embeddings: text-embedding-3-large 1536 dimensions

- Created: February 2024.
- Text used for Embedding: title (string) + text (string)
- Embedding Model: OpenAI text-embedding-3-large
- This dataset was generated from the first 1M entries of https://huggingface.co/datasets/BeIR/dbpedia-entity, extracted by @KShivendu_ [here](https://huggingface.co/datasets/KShivendu/dbpedia-entities-openai-1M)
```
test_runner/performance/pgvector/loaddata.py (new file, 72 lines)

```python
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import psycopg2
from pgvector.psycopg2 import register_vector
from psycopg2.extras import execute_values


def print_usage():
    print("Usage: loaddata.py <CONNSTR> <DATADIR>")


def main(conn_str, directory_path):
    # Connection to PostgreSQL
    with psycopg2.connect(conn_str) as conn:
        with conn.cursor() as cursor:
            # Run SQL statements
            cursor.execute("CREATE EXTENSION IF NOT EXISTS vector;")
            register_vector(conn)
            cursor.execute("DROP TABLE IF EXISTS documents;")
            cursor.execute(
                """
                CREATE TABLE documents (
                    _id TEXT PRIMARY KEY,
                    title TEXT,
                    text TEXT,
                    embeddings vector(1536) -- text-embedding-3-large-1536-embedding (OpenAI)
                );
                """
            )
            conn.commit()

            # List and sort Parquet files
            parquet_files = sorted(Path(directory_path).glob("*.parquet"))

            for file in parquet_files:
                print(f"Loading {file} into PostgreSQL")
                df = pd.read_parquet(file)

                print(df.head())

                data_list = [
                    (
                        row["_id"],
                        row["title"],
                        row["text"],
                        np.array(row["text-embedding-3-large-1536-embedding"]),
                    )
                    for index, row in df.iterrows()
                ]
                # Use execute_values to perform batch insertion
                execute_values(
                    cursor,
                    "INSERT INTO documents (_id, title, text, embeddings) VALUES %s",
                    data_list,
                )
                # Commit after we insert all embeddings
                conn.commit()

                print(f"Loaded {file} into PostgreSQL")


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print_usage()
        sys.exit(1)

    conn_str = sys.argv[1]
    directory_path = sys.argv[2]
    main(conn_str, directory_path)
```
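After loaddata.py finishes, a few sanity checks confirm the table matches the dataset card above. These queries are illustrative and not part of the commit; `vector_dims()` is pgvector's dimension accessor:

```sql
SELECT count(*) FROM documents;                              -- expect 1000000
SELECT vector_dims(embeddings) FROM documents LIMIT 1;       -- expect 1536
SELECT pg_size_pretty(pg_total_relation_size('documents'));  -- rough on-disk footprint
```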
test_runner/performance/pgvector/pgbench_custom_script_pgvector_hsnw_queries.sql (new file, 10 lines)

```sql
with x (x) as (
    select "embeddings" as x
    from hnsw_test_table
    TABLESAMPLE SYSTEM (1)
    LIMIT 1
)
SELECT title, "embeddings" <=> (select x from x) as distance
FROM hnsw_test_table
ORDER BY 2
LIMIT 30;
```
test_runner/performance/pgvector/pgbench_hnsw_queries.sql (new file, 13 lines)

```sql
-- run with pooled connection
-- pgbench -T 300 -c 100 -j20 -f pgbench_hnsw_queries.sql "postgresql://neondb_owner:<secret>@ep-floral-thunder-w1gzhaxi-pooler.eu-west-1.aws.neon.build/neondb?sslmode=require"

with x (x) as (
    select "embeddings" as x
    from hnsw_test_table
    TABLESAMPLE SYSTEM (1)
    LIMIT 1
)
SELECT title, "embeddings" <=> (select x from x) as distance
FROM hnsw_test_table
ORDER BY 2
LIMIT 30;
```
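Because the benchmark is only meaningful if this query walks the HNSW index rather than scanning the heap, it is worth checking the plan once. A sketch of that check; the expected plan shape and the index name (the Postgres default for the first `embeddings` index) are illustrative, not output from this commit:

```sql
EXPLAIN (COSTS OFF)
WITH x (x) AS (
    SELECT "embeddings" AS x
    FROM hnsw_test_table
    TABLESAMPLE SYSTEM (1)
    LIMIT 1
)
SELECT title, "embeddings" <=> (SELECT x FROM x) AS distance
FROM hnsw_test_table
ORDER BY 2
LIMIT 30;
-- Expected shape:
--   Limit
--     -> Index Scan using hnsw_test_table_embeddings_idx on hnsw_test_table
```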
test_runner/performance/test_perf_olap.py

```diff
@@ -100,6 +100,25 @@ QUERIES: Tuple[LabelledQuery, ...] = (
 )
 # fmt: on
 
+# A list of pgvector HNSW index builds to run.
+# Please do not alter the label for the query, as it is used to identify it.
+#
+# Disable auto formatting for the list of queries so that it's easier to read
+# fmt: off
+PGVECTOR_QUERIES: Tuple[LabelledQuery, ...] = (
+    LabelledQuery("PGV0", r"DROP TABLE IF EXISTS hnsw_test_table;"),
+    LabelledQuery("PGV1", r"CREATE TABLE hnsw_test_table AS TABLE documents WITH NO DATA;"),
+    LabelledQuery("PGV2", r"INSERT INTO hnsw_test_table SELECT * FROM documents;"),
+    LabelledQuery("PGV3", r"CREATE INDEX ON hnsw_test_table (_id);"),
+    LabelledQuery("PGV4", r"CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_cosine_ops);"),
+    LabelledQuery("PGV5", r"CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_ip_ops);"),
+    LabelledQuery("PGV6", r"CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_l1_ops);"),
+    LabelledQuery("PGV7", r"CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_hamming_ops);"),
+    LabelledQuery("PGV8", r"CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_jaccard_ops);"),
+)
+# fmt: on
+
+
 EXPLAIN_STRING: str = "EXPLAIN (ANALYZE, VERBOSE, BUFFERS, COSTS, SETTINGS, FORMAT JSON)"
 
 
@@ -245,3 +264,18 @@ def test_clickbench_collect_pg_stat_statements(remote_compare: RemoteCompare):
     log.info("Collecting pg_stat_statements")
     query = LabelledQuery("Q_COLLECT_PG_STAT_STATEMENTS", r"SELECT * from pg_stat_statements;")
     run_psql(remote_compare, query, times=1, explain=False)
+
+
+@pytest.mark.parametrize("query", PGVECTOR_QUERIES)
+@pytest.mark.remote_cluster
+def test_pgvector_indexing(query: LabelledQuery, remote_compare: RemoteCompare):
+    """
+    A pgvector test that measures HNSW index build performance and parallelism.
+
+    The DB is prepared manually in advance.
+    See
+    - test_runner/performance/pgvector/README.md
+    - test_runner/performance/pgvector/loaddata.py
+    - test_runner/performance/pgvector/HNSW_build.sql
+    """
+    run_psql(remote_compare, query, times=1, explain=False)
```
test_runner/performance/test_perf_pgbench.py

```diff
@@ -17,6 +17,7 @@ class PgBenchLoadType(enum.Enum):
     INIT = "init"
     SIMPLE_UPDATE = "simple-update"
     SELECT_ONLY = "select-only"
+    PGVECTOR_HNSW = "pgvector-hnsw"
 
 
 def utc_now_timestamp() -> int:
@@ -132,6 +133,26 @@ def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: P
             password=password,
         )
 
+    if workload_type == PgBenchLoadType.PGVECTOR_HNSW:
+        # Run pgvector HNSW query workload
+        run_pgbench(
+            env,
+            "pgvector-hnsw",
+            [
+                "pgbench",
+                "-f",
+                "test_runner/performance/pgvector/pgbench_custom_script_pgvector_hsnw_queries.sql",
+                "-c100",
+                "-j20",
+                f"-T{duration}",
+                "-P2",
+                "--protocol=prepared",
+                "--progress-timestamp",
+                connstr,
+            ],
+            password=password,
+        )
+
     env.report_size()
 
 
@@ -201,3 +222,13 @@ def test_pgbench_remote_simple_update(remote_compare: PgCompare, scale: int, dur
 @pytest.mark.remote_cluster
 def test_pgbench_remote_select_only(remote_compare: PgCompare, scale: int, duration: int):
     run_test_pgbench(remote_compare, scale, duration, PgBenchLoadType.SELECT_ONLY)
+
+
+# The following test runs on an existing database that has the pgvector extension installed
+# and a table with 1 million embedding vectors loaded and indexed with HNSW.
+#
+# Run this pgbench test against an existing remote Postgres cluster with the necessary setup.
+@pytest.mark.parametrize("duration", get_durations_matrix())
+@pytest.mark.remote_cluster
+def test_pgbench_remote_pgvector(remote_compare: PgCompare, duration: int):
+    run_test_pgbench(remote_compare, 1, duration, PgBenchLoadType.PGVECTOR_HNSW)
```