neon/test_runner/performance/pgvector/loaddata.py
Peter Bendel fabeff822f Performance test for pgvector HNSW index build and queries (#7873)
## Problem

We want to regularly verify the performance of pgvector HNSW parallel
index builds and of parallel similarity search using HNSW indexes.
pgvector 0.7.0 was the first release to considerably improve index-build
parallelism, and we want to make sure that our Neon compute VM settings
(swap, memory overcommit, Postgres configuration, etc.) do not cause a regression.
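
For context, parallel HNSW builds lean on Postgres' standard maintenance parallelism settings. Below is a minimal sketch of the knobs involved, assuming a `documents` table with an `embeddings` column like the loader below creates; the function name and the worker/memory values are illustrative, not the test's actual configuration:

```python
import psycopg2


def build_hnsw_with_parallel_workers(conn_str):
    with psycopg2.connect(conn_str) as conn:
        with conn.cursor() as cursor:
            # Parallel index builds use up to max_parallel_maintenance_workers
            # extra worker processes in addition to the leader.
            cursor.execute("SET max_parallel_maintenance_workers = 7;")
            # A larger maintenance_work_mem lets more of the HNSW graph be
            # built in memory during the index build.
            cursor.execute("SET maintenance_work_mem = '8GB';")
            cursor.execute(
                "CREATE INDEX ON documents USING hnsw (embeddings vector_cosine_ops);"
            )
        conn.commit()
```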

## Summary of changes

- Prepare a Neon project with 1 million OpenAI vector embeddings (vector size 1536).
- Run HNSW indexing operations in the regression test for the various distance metrics.
- Run similarity queries using pgbench with 100 concurrent clients (both the index builds and the queries are sketched below).
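
The sketch below shows roughly what those two steps look like. The opclasses and distance operators are pgvector's; the index names, the `m`/`ef_construction` values, and the example query are placeholders rather than the actual test workload:

```python
import numpy as np
import psycopg2
from pgvector.psycopg2 import register_vector

# pgvector HNSW opclasses and their corresponding distance operators
DISTANCE_METRICS = {
    "vector_cosine_ops": "<=>",  # cosine distance
    "vector_l2_ops": "<->",      # Euclidean (L2) distance
    "vector_ip_ops": "<#>",      # negative inner product
}


def build_indexes_and_query(conn_str, query_embedding):
    """query_embedding: a 1536-dimensional numpy array."""
    with psycopg2.connect(conn_str) as conn:
        register_vector(conn)
        with conn.cursor() as cursor:
            for opclass in DISTANCE_METRICS:
                # One HNSW index per distance metric; m / ef_construction
                # are illustrative values, not the test's settings.
                cursor.execute(
                    f"CREATE INDEX IF NOT EXISTS documents_{opclass}_idx "
                    f"ON documents USING hnsw (embeddings {opclass}) "
                    "WITH (m = 16, ef_construction = 64);"
                )
            conn.commit()
            # The kind of statement pgbench would run from 100 concurrent
            # clients: top-10 nearest neighbours by cosine distance.
            cursor.execute(
                "SELECT _id, title FROM documents ORDER BY embeddings <=> %s LIMIT 10;",
                (np.asarray(query_embedding),),
            )
            return cursor.fetchall()
```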

I have also added the relevant metrics to the Grafana dashboards pgbench
and olape.

---------

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
2024-05-28 11:05:33 +00:00


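"""Load OpenAI text-embedding-3-large (1536-dimensional) embeddings from Parquet
files into a Postgres `documents` table with a pgvector column.

Usage: loaddata.py <CONNSTR> <DATADIR>
"""
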
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import psycopg2
from pgvector.psycopg2 import register_vector
from psycopg2.extras import execute_values


def print_usage():
    print("Usage: loaddata.py <CONNSTR> <DATADIR>")


def main(conn_str, directory_path):
    # Connection to PostgreSQL
    with psycopg2.connect(conn_str) as conn:
        with conn.cursor() as cursor:
            # Run SQL statements
            cursor.execute("CREATE EXTENSION IF NOT EXISTS vector;")
            register_vector(conn)
            cursor.execute("DROP TABLE IF EXISTS documents;")
            cursor.execute(
                """
                CREATE TABLE documents (
                    _id TEXT PRIMARY KEY,
                    title TEXT,
                    text TEXT,
                    embeddings vector(1536) -- text-embedding-3-large-1536-embedding (OpenAI)
                );
                """
            )
            conn.commit()

            # List and sort Parquet files
            parquet_files = sorted(Path(directory_path).glob("*.parquet"))

            for file in parquet_files:
                print(f"Loading {file} into PostgreSQL")
                df = pd.read_parquet(file)
                print(df.head())
                data_list = [
                    (
                        row["_id"],
                        row["title"],
                        row["text"],
                        np.array(row["text-embedding-3-large-1536-embedding"]),
                    )
                    for index, row in df.iterrows()
                ]
                # Use execute_values to perform batch insertion
                execute_values(
                    cursor,
                    "INSERT INTO documents (_id, title, text, embeddings) VALUES %s",
                    data_list,
                )
                # Commit after we insert all embeddings
                conn.commit()
                print(f"Loaded {file} into PostgreSQL")


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print_usage()
        sys.exit(1)

    conn_str = sys.argv[1]
    directory_path = sys.argv[2]
    main(conn_str, directory_path)
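
Once the load completes, a quick sanity check along these lines confirms the table is ready for the index build. This is a minimal sketch: `documents` and `embeddings` come from the script above, `vector_dims()` is pgvector's dimension function, and the expected values follow from the 1M x 1536 dataset:

```python
import sys

import psycopg2


def verify(conn_str):
    with psycopg2.connect(conn_str) as conn:
        with conn.cursor() as cursor:
            # The full dataset should hold roughly 1 million rows
            cursor.execute("SELECT count(*) FROM documents;")
            print("rows:", cursor.fetchone()[0])
            # Each embedding should be 1536-dimensional
            cursor.execute("SELECT vector_dims(embeddings) FROM documents LIMIT 1;")
            print("dimensions:", cursor.fetchone()[0])


if __name__ == "__main__":
    verify(sys.argv[1])
```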