From dc5126d8d14e420e24c249275ae4e1e3d09d2208 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 21 Dec 2023 09:50:10 -0800 Subject: [PATCH] feat: add the ability to create scalar indices (#679) This is a pretty direct binding to the underlying lance capability --- node/src/index.ts | 56 ++++++++++++++++++++++- node/src/remote/index.ts | 4 ++ node/src/test/test.ts | 11 +++++ python/lancedb/remote/table.py | 6 +++ python/lancedb/table.py | 74 +++++++++++++++++++++++++++++++ python/tests/test_table.py | 27 +++++++++++ rust/ffi/node/src/index.rs | 1 + rust/ffi/node/src/index/scalar.rs | 43 ++++++++++++++++++ rust/ffi/node/src/lib.rs | 4 ++ rust/vectordb/src/table.rs | 11 +++++ 10 files changed, 236 insertions(+), 1 deletion(-) create mode 100644 rust/ffi/node/src/index/scalar.rs diff --git a/node/src/index.ts b/node/src/index.ts index 9cefbd1e..e2000334 100644 --- a/node/src/index.ts +++ b/node/src/index.ts @@ -24,7 +24,7 @@ import { isEmbeddingFunction } from './embedding/embedding_function' import { type Literal, toSQL } from './util' // eslint-disable-next-line @typescript-eslint/no-var-requires -const { databaseNew, databaseTableNames, databaseOpenTable, databaseDropTable, tableCreate, tableAdd, tableCreateVectorIndex, tableCountRows, tableDelete, tableUpdate, tableCleanupOldVersions, tableCompactFiles, tableListIndices, tableIndexStats } = require('../native.js') +const { databaseNew, databaseTableNames, databaseOpenTable, databaseDropTable, tableCreate, tableAdd, tableCreateScalarIndex, tableCreateVectorIndex, tableCountRows, tableDelete, tableUpdate, tableCleanupOldVersions, tableCompactFiles, tableListIndices, tableIndexStats } = require('../native.js') export { Query } export type { EmbeddingFunction } @@ -223,6 +223,56 @@ export interface Table { */ createIndex: (indexParams: VectorIndexParams) => Promise + /** + * Create a scalar index on this Table for the given column + * + * @param column The column to index + * @param replace If false, fail if an 
index already exists on the column + * + * Scalar indices, like vector indices, can be used to speed up scans. A scalar + * index can speed up scans that contain filter expressions on the indexed column. + * For example, the following scan will be faster if the column `my_col` has + * a scalar index: + * + * ```ts + * const con = await lancedb.connect('./.lancedb'); + * const table = await con.openTable('images'); + * const results = await table.where('my_col = 7').execute(); + * ``` + * + * Scalar indices can also speed up scans containing a vector search and a + * prefilter: + * + * ```ts + * const con = await lancedb.connect('./.lancedb'); + * const table = await con.openTable('images'); + * const results = await table.search([1.0, 2.0]).where('my_col != 7').prefilter(true).execute(); + * ``` + * + * Scalar indices can only speed up scans for basic filters using + * equality, comparison, range (e.g. `my_col BETWEEN 0 AND 100`), and set + * membership (e.g. `my_col IN (0, 1, 2)`) + * + * Scalar indices can be used if the filter contains multiple indexed columns and + * the filter criteria are AND'd or OR'd together + * (e.g. `my_col < 0 AND other_col > 100`) + * + * Scalar indices may be used if the filter contains non-indexed columns but, + * depending on the structure of the filter, they may not be usable. For example, + * if the column `not_indexed` does not have a scalar index then the filter + * `my_col = 0 OR not_indexed = 1` will not be able to use any scalar index on + * `my_col`. + * + * @example + * + * ```ts + * const con = await lancedb.connect('./.lancedb') + * const table = await con.openTable('images') + * await table.createScalarIndex('my_col', true) + * ``` + */ + createScalarIndex: (column: string, replace: boolean) => Promise<void> + /** * Returns the number of rows in this table. 
*/ @@ -537,6 +587,10 @@ export class LocalTable implements Table { return tableCreateVectorIndex.call(this._tbl, indexParams).then((newTable: any) => { this._tbl = newTable }) } + async createScalarIndex (column: string, replace: boolean): Promise<void> { + return tableCreateScalarIndex.call(this._tbl, column, replace) + } + /** * Returns the number of rows in this table. */ diff --git a/node/src/remote/index.ts b/node/src/remote/index.ts index f2cd35ea..7fdcefd8 100644 --- a/node/src/remote/index.ts +++ b/node/src/remote/index.ts @@ -283,6 +283,10 @@ export class RemoteTable implements Table { } } + async createScalarIndex (column: string, replace: boolean): Promise<void> { + throw new Error('Not implemented') + } + async countRows (): Promise { const result = await this._client.post(`/v1/table/${this._name}/describe/`) return result.data?.stats?.num_rows diff --git a/node/src/test/test.ts b/node/src/test/test.ts index 89a44c6c..cb3fc6f1 100644 --- a/node/src/test/test.ts +++ b/node/src/test/test.ts @@ -135,6 +135,17 @@ describe('LanceDB client', function () { assert.isTrue(results.length === 10) }) + it('should allow creation and use of scalar indices', async function () { + const uri = await createTestDB(16, 300) + const con = await lancedb.connect(uri) + const table = await con.openTable('vectors') + await table.createScalarIndex('id', true) + + // Prefiltering should still work the same + const results = await table.search(new Array(16).fill(0.1)).limit(10).filter('id >= 10').prefilter(true).execute() + assert.isTrue(results.length === 10) + }) + it('select only a subset of columns', async function () { const uri = await createTestDB() const con = await lancedb.connect(uri) diff --git a/python/lancedb/remote/table.py b/python/lancedb/remote/table.py index fbe24460..158728fb 100644 --- a/python/lancedb/remote/table.py +++ b/python/lancedb/remote/table.py @@ -64,6 +64,12 @@ class RemoteTable(Table): """to_pandas() is not supported on the LanceDB cloud""" return 
NotImplementedError("to_pandas() is not supported on the LanceDB cloud") + def create_scalar_index(self, *args, **kwargs): + """Creates a scalar index""" + return NotImplementedError( + "create_scalar_index() is not supported on the LanceDB cloud" + ) + def create_index( self, metric="L2", diff --git a/python/lancedb/table.py b/python/lancedb/table.py index 9a2bf395..a5746d47 100644 --- a/python/lancedb/table.py +++ b/python/lancedb/table.py @@ -221,6 +221,77 @@ class Table(ABC): """ raise NotImplementedError + @abstractmethod + def create_scalar_index( + self, + column: str, + *, + replace: bool = True, + ): + """Create a scalar index on a column. + + Scalar indices, like vector indices, can be used to speed up scans. A scalar + index can speed up scans that contain filter expressions on the indexed column. + For example, the following scan will be faster if the column ``my_col`` has + a scalar index: + + .. code-block:: python + + import lancedb + + db = lancedb.connect("/data/lance") + img_table = db.open_table("images") + my_df = img_table.search().where("my_col = 7", prefilter=True).to_pandas() + + Scalar indices can also speed up scans containing a vector search and a + prefilter: + + .. code-block:: python + + import lancedb + + db = lancedb.connect("/data/lance") + img_table = db.open_table("images") + img_table.search([1, 2, 3, 4], vector_column_name="vector") + .where("my_col != 7", prefilter=True) + .to_pandas() + + Scalar indices can only speed up scans for basic filters using + equality, comparison, range (e.g. ``my_col BETWEEN 0 AND 100``), and set + membership (e.g. ``my_col IN (0, 1, 2)``) + + Scalar indices can be used if the filter contains multiple indexed columns and + the filter criteria are AND'd or OR'd together + (e.g. ``my_col < 0 AND other_col > 100``) + + Scalar indices may be used if the filter contains non-indexed columns but, + depending on the structure of the filter, they may not be usable. 
For example, + if the column ``not_indexed`` does not have a scalar index then the filter + ``my_col = 0 OR not_indexed = 1`` will not be able to use any scalar index on + ``my_col``. + + **Experimental API** + + Parameters + ---------- + column : str + The column to be indexed. Must be a boolean, integer, float, + or string column. + replace : bool, default True + Replace the existing index if it exists. + + Examples + -------- + + .. code-block:: python + + import lancedb + + table = lancedb.connect("/data/lance").open_table("images") + table.create_scalar_index("category") + """ + raise NotImplementedError + @abstractmethod def add( self, @@ -578,6 +649,9 @@ class LanceTable(Table): self._reset_dataset() register_event("create_index") + def create_scalar_index(self, column: str, *, replace: bool = True): + self._dataset.create_scalar_index(column, index_type="BTREE", replace=replace) + def create_fts_index( self, field_names: Union[str, List[str]], *, replace: bool = False ): diff --git a/python/tests/test_table.py b/python/tests/test_table.py index 3ae193a9..c6d948cf 100644 --- a/python/tests/test_table.py +++ b/python/tests/test_table.py @@ -532,6 +532,33 @@ def test_multiple_vector_columns(db): assert result1["text"].iloc[0] != result2["text"].iloc[0] +def test_create_scalar_index(db): + vec_array = pa.array( + [[1, 1], [2, 2], [3, 3], [4, 4], [5, 5]], pa.list_(pa.float32(), 2) + ) + test_data = pa.Table.from_pydict( + {"x": ["c", "b", "a", "e", "b"], "y": [1, 2, 3, 4, 5], "vector": vec_array} + ) + table = LanceTable.create( + db, + "my_table", + data=test_data, + ) + table.create_scalar_index("x") + indices = table.to_lance().list_indices() + assert len(indices) == 1 + scalar_index = indices[0] + assert scalar_index["type"] == "Scalar" + + # Confirm that prefiltering still works with the scalar index column + results = table.search().where("x = 'c'").to_arrow() + assert results == test_data.slice(0, 1) + results = table.search([5, 5]).to_arrow() + assert 
results["_distance"][0].as_py() == 0 + results = table.search([5, 5]).where("x != 'b'").to_arrow() + assert results["_distance"][0].as_py() > 0 + + def test_empty_query(db): table = LanceTable.create( db, diff --git a/rust/ffi/node/src/index.rs b/rust/ffi/node/src/index.rs index ed07a8d5..3dcc9bea 100644 --- a/rust/ffi/node/src/index.rs +++ b/rust/ffi/node/src/index.rs @@ -12,4 +12,5 @@ // See the License for the specific language governing permissions and // limitations under the License. +pub mod scalar; pub mod vector; diff --git a/rust/ffi/node/src/index/scalar.rs b/rust/ffi/node/src/index/scalar.rs new file mode 100644 index 00000000..f940b62b --- /dev/null +++ b/rust/ffi/node/src/index/scalar.rs @@ -0,0 +1,43 @@ +// Copyright 2023 Lance Developers. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use neon::{ + context::{Context, FunctionContext}, + result::JsResult, + types::{JsBoolean, JsBox, JsPromise, JsString}, +}; + +use crate::{error::ResultExt, runtime, table::JsTable}; + +pub(crate) fn table_create_scalar_index(mut cx: FunctionContext) -> JsResult<JsPromise> { + let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?; + let column = cx.argument::<JsString>(0)?.value(&mut cx); + let replace = cx.argument::<JsBoolean>(1)?.value(&mut cx); + + let rt = runtime(&mut cx)?; + + let (deferred, promise) = cx.promise(); + let channel = cx.channel(); + let mut table = js_table.table.clone(); + + rt.spawn(async move { + let idx_result = table.create_scalar_index(&column, replace).await; + + deferred.settle_with(&channel, move |mut cx| { + idx_result.or_throw(&mut cx)?; + Ok(cx.undefined()) + }); + }); + Ok(promise) +} diff --git a/rust/ffi/node/src/lib.rs b/rust/ffi/node/src/lib.rs index bd358651..46539916 100644 --- a/rust/ffi/node/src/lib.rs +++ b/rust/ffi/node/src/lib.rs @@ -242,6 +242,10 @@ fn main(mut cx: ModuleContext) -> NeonResult<()> { cx.export_function("tableCompactFiles", JsTable::js_compact)?; cx.export_function("tableListIndices", JsTable::js_list_indices)?; cx.export_function("tableIndexStats", JsTable::js_index_stats)?; + cx.export_function( + "tableCreateScalarIndex", + index::scalar::table_create_scalar_index, + )?; cx.export_function( "tableCreateVectorIndex", index::vector::table_create_vector_index, diff --git a/rust/vectordb/src/table.rs b/rust/vectordb/src/table.rs index 9e216559..fba001a1 100644 --- a/rust/vectordb/src/table.rs +++ b/rust/vectordb/src/table.rs @@ -14,6 +14,7 @@ use chrono::Duration; use lance::dataset::builder::DatasetBuilder; +use lance::index::scalar::ScalarIndexParams; use lance_index::IndexType; use std::sync::Arc; @@ -262,6 +263,16 @@ impl Table { Ok(()) } + /// Create a scalar index on the table + pub async fn create_scalar_index(&mut self, column: &str, replace: bool) -> Result<()> { + let mut dataset = self.dataset.as_ref().clone(); + let 
params = ScalarIndexParams::default(); + dataset + .create_index(&[column], IndexType::Scalar, None, &params, replace) + .await?; + Ok(()) + } + pub async fn optimize_indices(&mut self) -> Result<()> { let mut dataset = self.dataset.as_ref().clone();