fix: hnsw default partitions (#1667)

PR fixes #1662

---------

Co-authored-by: Will Jones <willjones127@gmail.com>
This commit is contained in:
Gagan Bhullar
2024-09-25 10:16:03 -06:00
committed by GitHub
parent 2f2721e242
commit 8f0eb34109
2 changed files with 26 additions and 5 deletions

View File

@@ -214,6 +214,11 @@ pub(crate) fn suggested_num_partitions(rows: usize) -> u32 {
max(1, num_partitions)
}
/// Suggests a default number of partitions for an HNSW-based vector index.
///
/// The suggestion is one partition per `256 * 5_000_000` vector values
/// (`rows * dim`), rounded down, and never less than one.
///
/// * `rows` - number of rows in the table being indexed.
/// * `dim` - dimension of the vector column.
///
/// Returns a value in `1..=u32::MAX`; results that would not fit in a `u32`
/// are clamped to `u32::MAX` instead of being truncated.
pub(crate) fn suggested_num_partitions_for_hnsw(rows: usize, dim: u32) -> u32 {
    // One partition per this many (row x dimension) values.
    const VALUES_PER_PARTITION: u64 = 256 * 5_000_000;

    // Saturate instead of overflowing: a plain `*` panics in debug builds and
    // wraps in release builds once `rows * dim` exceeds `u64::MAX`.
    let total_values = (rows as u64).saturating_mul(u64::from(dim));

    // A bare `as u32` would silently drop the high bits of a large quotient.
    let num_partitions = u32::try_from(total_values / VALUES_PER_PARTITION).unwrap_or(u32::MAX);
    max(1, num_partitions)
}
pub(crate) fn suggested_num_sub_vectors(dim: u32) -> u32 {
if dim % 16 == 0 {
// Should be more aggressive than this default.

View File

@@ -54,7 +54,8 @@ use crate::embeddings::{EmbeddingDefinition, EmbeddingRegistry, MaybeEmbedded, M
use crate::error::{Error, Result};
use crate::index::scalar::FtsIndexBuilder;
use crate::index::vector::{
IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder, VectorIndex,
suggested_num_partitions_for_hnsw, IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder,
IvfPqIndexBuilder, VectorIndex,
};
use crate::index::IndexConfig;
use crate::index::IndexStatistics;
@@ -1440,11 +1441,19 @@ impl NativeTable {
});
}
let num_partitions = if let Some(n) = index.num_partitions {
let num_partitions: u32 = if let Some(n) = index.num_partitions {
n
} else {
suggested_num_partitions(self.count_rows(None).await?)
match field.data_type() {
arrow_schema::DataType::FixedSizeList(_, n) => Ok::<u32, Error>(
suggested_num_partitions_for_hnsw(self.count_rows(None).await?, *n as u32),
),
_ => Err(Error::Schema {
message: format!("Column '{}' is not a FixedSizeList", field.name()),
}),
}?
};
let num_sub_vectors: u32 = if let Some(n) = index.num_sub_vectors {
n
} else {
@@ -1503,10 +1512,17 @@ impl NativeTable {
});
}
let num_partitions = if let Some(n) = index.num_partitions {
let num_partitions: u32 = if let Some(n) = index.num_partitions {
n
} else {
suggested_num_partitions(self.count_rows(None).await?)
match field.data_type() {
arrow_schema::DataType::FixedSizeList(_, n) => Ok::<u32, Error>(
suggested_num_partitions_for_hnsw(self.count_rows(None).await?, *n as u32),
),
_ => Err(Error::Schema {
message: format!("Column '{}' is not a FixedSizeList", field.name()),
}),
}?
};
let mut dataset = self.dataset.get_mut().await?;