mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-23 08:00:37 +00:00
Performance test for pgvector HNSW index build and queries (#7873)
## Problem

We want to regularly verify the performance of pgvector HNSW parallel index builds and parallel similarity search using HNSW indexes. The first release that considerably improved index-build parallelism was pgvector 0.7.0, and we want to make sure that we do not regress because of our Neon compute VM settings (swap, memory overcommit, pg conf, etc.).

## Summary of changes

Prepare a Neon project with 1 million OpenAI vector embeddings (vector size 1536). Run HNSW indexing operations in the regression test for the various distance metrics. Run similarity queries using pgbench with 100 concurrent clients. I have also added the relevant metrics to the Grafana dashboards pgbench and olape.

---------

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
This commit is contained in:
72
test_runner/performance/pgvector/loaddata.py
Normal file
72
test_runner/performance/pgvector/loaddata.py
Normal file
@@ -0,0 +1,72 @@
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import psycopg2
|
||||
from pgvector.psycopg2 import register_vector
|
||||
from psycopg2.extras import execute_values
|
||||
|
||||
|
||||
def print_usage():
    """Print the expected command-line invocation to standard output."""
    usage = "Usage: loaddata.py <CONNSTR> <DATADIR>"
    print(usage)
|
||||
|
||||
|
||||
def main(conn_str, directory_path):
    """Recreate the ``documents`` table and load every Parquet file into it.

    Connects to PostgreSQL with ``conn_str``, makes sure the ``vector``
    extension exists, drops and recreates the ``documents`` table, and then
    inserts the rows of each ``*.parquet`` file found directly inside
    ``directory_path`` (in sorted path order), committing once per file.

    Each Parquet file must provide the columns ``_id``, ``title``, ``text``
    and ``text-embedding-3-large-1536-embedding``.

    Args:
        conn_str: Connection string passed to ``psycopg2.connect``.
        directory_path: Directory that contains the Parquet files.
    """
    embedding_column = "text-embedding-3-large-1536-embedding"

    conn = psycopg2.connect(conn_str)
    try:
        # ``with conn`` only commits or rolls back the transaction; it does
        # not close the connection, hence the explicit close in ``finally``.
        with conn, conn.cursor() as cursor:
            # Run SQL statements
            cursor.execute("CREATE EXTENSION IF NOT EXISTS vector;")
            # The extension must exist before the vector type can be registered.
            register_vector(conn)
            cursor.execute("DROP TABLE IF EXISTS documents;")
            cursor.execute(
                """
                CREATE TABLE documents (
                    _id TEXT PRIMARY KEY,
                    title TEXT,
                    text TEXT,
                    embeddings vector(1536) -- text-embedding-3-large-1536-embedding (OpenAI)
                );
                """
            )
            conn.commit()

            # List and sort Parquet files
            parquet_files = sorted(Path(directory_path).glob("*.parquet"))
            if not parquet_files:
                # An empty table is almost certainly not what the caller
                # wanted, so make the situation visible instead of silent.
                print(
                    f"Warning: no *.parquet files found in {directory_path}",
                    file=sys.stderr,
                )

            for file in parquet_files:
                print(f"Loading {file} into PostgreSQL")
                df = pd.read_parquet(file)

                print(df.head())

                # Iterate the columns in lockstep instead of using
                # ``iterrows()``, which builds a Series object for every row.
                data_list = [
                    (doc_id, title, text, np.array(embedding))
                    for doc_id, title, text, embedding in zip(
                        df["_id"],
                        df["title"],
                        df["text"],
                        df[embedding_column],
                    )
                ]
                # Use execute_values to perform batch insertion
                execute_values(
                    cursor,
                    "INSERT INTO documents (_id, title, text, embeddings) VALUES %s",
                    data_list,
                )
                # Commit after we insert all embeddings of this file
                conn.commit()

                print(f"Loaded {file} into PostgreSQL")
    finally:
        conn.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Exactly two positional arguments are required after the script name:
    # the connection string and the directory holding the Parquet files.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print_usage()
        sys.exit(1)

    conn_str, directory_path = cli_args
    main(conn_str, directory_path)
|
||||
Reference in New Issue
Block a user