mirror of
https://github.com/lancedb/lancedb.git
synced 2026-01-06 20:02:58 +00:00
feat: add the ability to create scalar indices (#679)
This is a pretty direct binding to the underlying lance capability
This commit is contained in:
@@ -24,7 +24,7 @@ import { isEmbeddingFunction } from './embedding/embedding_function'
|
|||||||
import { type Literal, toSQL } from './util'
|
import { type Literal, toSQL } from './util'
|
||||||
|
|
||||||
// eslint-disable-next-line @typescript-eslint/no-var-requires
|
// eslint-disable-next-line @typescript-eslint/no-var-requires
|
||||||
const { databaseNew, databaseTableNames, databaseOpenTable, databaseDropTable, tableCreate, tableAdd, tableCreateVectorIndex, tableCountRows, tableDelete, tableUpdate, tableCleanupOldVersions, tableCompactFiles, tableListIndices, tableIndexStats } = require('../native.js')
|
const { databaseNew, databaseTableNames, databaseOpenTable, databaseDropTable, tableCreate, tableAdd, tableCreateScalarIndex, tableCreateVectorIndex, tableCountRows, tableDelete, tableUpdate, tableCleanupOldVersions, tableCompactFiles, tableListIndices, tableIndexStats } = require('../native.js')
|
||||||
|
|
||||||
export { Query }
|
export { Query }
|
||||||
export type { EmbeddingFunction }
|
export type { EmbeddingFunction }
|
||||||
@@ -223,6 +223,56 @@ export interface Table<T = number[]> {
|
|||||||
*/
|
*/
|
||||||
createIndex: (indexParams: VectorIndexParams) => Promise<any>
|
createIndex: (indexParams: VectorIndexParams) => Promise<any>
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a scalar index on this Table for the given column
|
||||||
|
*
|
||||||
|
* @param column The column to index
|
||||||
|
* @param replace If false, fail if an index already exists on the column
|
||||||
|
*
|
||||||
|
* Scalar indices, like vector indices, can be used to speed up scans. A scalar
|
||||||
|
* index can speed up scans that contain filter expressions on the indexed column.
|
||||||
|
* For example, the following scan will be faster if the column `my_col` has
|
||||||
|
* a scalar index:
|
||||||
|
*
|
||||||
|
* ```ts
|
||||||
|
* const con = await lancedb.connect('./.lancedb');
|
||||||
|
* const table = await con.openTable('images');
|
||||||
|
* const results = await table.where('my_col = 7').execute();
|
||||||
|
* ```
|
||||||
|
*
|
||||||
|
* Scalar indices can also speed up scans containing a vector search and a
|
||||||
|
* prefilter:
|
||||||
|
*
|
||||||
|
* ```ts
|
||||||
|
* const con = await lancedb.connect('./.lancedb');
|
||||||
|
* const table = await con.openTable('images');
|
||||||
|
* const results = await table.search([1.0, 2.0]).where('my_col != 7').prefilter(true).execute();
|
||||||
|
* ```
|
||||||
|
*
|
||||||
|
* Scalar indices can only speed up scans for basic filters using
|
||||||
|
* equality, comparison, range (e.g. `my_col BETWEEN 0 AND 100`), and set
|
||||||
|
* membership (e.g. `my_col IN (0, 1, 2)`)
|
||||||
|
*
|
||||||
|
* Scalar indices can be used if the filter contains multiple indexed columns and
|
||||||
|
* the filter criteria are AND'd or OR'd together
|
||||||
|
* (e.g. `my_col < 0 AND other_col > 100`)
|
||||||
|
*
|
||||||
|
* Scalar indices may be used if the filter contains non-indexed columns but,
|
||||||
|
* depending on the structure of the filter, they may not be usable. For example,
|
||||||
|
* if the column `not_indexed` does not have a scalar index then the filter
|
||||||
|
* `my_col = 0 OR not_indexed = 1` will not be able to use any scalar index on
|
||||||
|
* `my_col`.
|
||||||
|
*
|
||||||
|
* @example
|
||||||
|
*
|
||||||
|
* ```ts
|
||||||
|
* const con = await lancedb.connect('./.lancedb')
|
||||||
|
* const table = await con.openTable('images')
|
||||||
|
* await table.createScalarIndex('my_col')
|
||||||
|
* ```
|
||||||
|
*/
|
||||||
|
createScalarIndex: (column: string, replace: boolean) => Promise<void>
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the number of rows in this table.
|
* Returns the number of rows in this table.
|
||||||
*/
|
*/
|
||||||
@@ -537,6 +587,10 @@ export class LocalTable<T = number[]> implements Table<T> {
|
|||||||
return tableCreateVectorIndex.call(this._tbl, indexParams).then((newTable: any) => { this._tbl = newTable })
|
return tableCreateVectorIndex.call(this._tbl, indexParams).then((newTable: any) => { this._tbl = newTable })
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Create a scalar index on the given column of this local table.
 *
 * Forwards `column` and `replace` to the native `tableCreateScalarIndex`
 * binding, invoked with the underlying native table handle as `this`.
 *
 * @param column The column to index
 * @param replace If false, fail if an index already exists on the column
 */
async createScalarIndex (column: string, replace: boolean): Promise<void> {
  const nativeTable = this._tbl
  await tableCreateScalarIndex.call(nativeTable, column, replace)
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the number of rows in this table.
|
* Returns the number of rows in this table.
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -283,6 +283,10 @@ export class RemoteTable<T = number[]> implements Table<T> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Scalar index creation is not implemented for remote tables; the returned
 * promise always rejects with a 'Not implemented' error.
 *
 * @param column The column to index (unused)
 * @param replace Whether an existing index may be replaced (unused)
 */
async createScalarIndex (column: string, replace: boolean): Promise<void> {
  const notImplemented = new Error('Not implemented')
  throw notImplemented
}
|
||||||
|
|
||||||
async countRows (): Promise<number> {
|
async countRows (): Promise<number> {
|
||||||
const result = await this._client.post(`/v1/table/${this._name}/describe/`)
|
const result = await this._client.post(`/v1/table/${this._name}/describe/`)
|
||||||
return result.data?.stats?.num_rows
|
return result.data?.stats?.num_rows
|
||||||
|
|||||||
@@ -135,6 +135,17 @@ describe('LanceDB client', function () {
|
|||||||
assert.isTrue(results.length === 10)
|
assert.isTrue(results.length === 10)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
it('should allow creation and use of scalar indices', async function () {
|
||||||
|
const uri = await createTestDB(16, 300)
|
||||||
|
const con = await lancedb.connect(uri)
|
||||||
|
const table = await con.openTable('vectors')
|
||||||
|
await table.createScalarIndex('id', true)
|
||||||
|
|
||||||
|
// Prefiltering should still work the same
|
||||||
|
const results = await table.search(new Array(16).fill(0.1)).limit(10).filter('id >= 10').prefilter(true).execute()
|
||||||
|
assert.isTrue(results.length === 10)
|
||||||
|
})
|
||||||
|
|
||||||
it('select only a subset of columns', async function () {
|
it('select only a subset of columns', async function () {
|
||||||
const uri = await createTestDB()
|
const uri = await createTestDB()
|
||||||
const con = await lancedb.connect(uri)
|
const con = await lancedb.connect(uri)
|
||||||
|
|||||||
@@ -64,6 +64,12 @@ class RemoteTable(Table):
|
|||||||
"""to_pandas() is not supported on the LanceDB cloud"""
|
"""to_pandas() is not supported on the LanceDB cloud"""
|
||||||
return NotImplementedError("to_pandas() is not supported on the LanceDB cloud")
|
return NotImplementedError("to_pandas() is not supported on the LanceDB cloud")
|
||||||
|
|
||||||
|
def create_scalar_index(self, *args, **kwargs):
    """Creates a scalar index.

    Scalar indices are not supported on the LanceDB cloud, so this method
    always fails. Any positional or keyword arguments are accepted and
    ignored.

    Raises
    ------
    NotImplementedError
        Always. The exception is raised (not returned) so that callers
        cannot mistake the call for a successful index creation.
    """
    raise NotImplementedError(
        "create_scalar_index() is not supported on the LanceDB cloud"
    )
|
||||||
|
|
||||||
def create_index(
|
def create_index(
|
||||||
self,
|
self,
|
||||||
metric="L2",
|
metric="L2",
|
||||||
|
|||||||
@@ -221,6 +221,77 @@ class Table(ABC):
|
|||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def create_scalar_index(
|
||||||
|
self,
|
||||||
|
column: str,
|
||||||
|
*,
|
||||||
|
replace: bool = True,
|
||||||
|
):
|
||||||
|
"""Create a scalar index on a column.
|
||||||
|
|
||||||
|
Scalar indices, like vector indices, can be used to speed up scans. A scalar
|
||||||
|
index can speed up scans that contain filter expressions on the indexed column.
|
||||||
|
For example, the following scan will be faster if the column ``my_col`` has
|
||||||
|
a scalar index:
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
import lancedb
|
||||||
|
|
||||||
|
db = lancedb.connect("/data/lance")
|
||||||
|
img_table = db.open_table("images")
|
||||||
|
my_df = img_table.search().where("my_col = 7", prefilter=True).to_pandas()
|
||||||
|
|
||||||
|
Scalar indices can also speed up scans containing a vector search and a
|
||||||
|
prefilter:
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
import lancedb
|
||||||
|
|
||||||
|
db = lancedb.connect("/data/lance")
|
||||||
|
img_table = db.open_table("images")
|
||||||
|
img_table.search([1, 2, 3, 4], vector_column_name="vector")
|
||||||
|
.where("my_col != 7", prefilter=True)
|
||||||
|
.to_pandas()
|
||||||
|
|
||||||
|
Scalar indices can only speed up scans for basic filters using
|
||||||
|
equality, comparison, range (e.g. ``my_col BETWEEN 0 AND 100``), and set
|
||||||
|
membership (e.g. ``my_col IN (0, 1, 2)``)
|
||||||
|
|
||||||
|
Scalar indices can be used if the filter contains multiple indexed columns and
|
||||||
|
the filter criteria are AND'd or OR'd together
|
||||||
|
(e.g. ``my_col < 0 AND other_col > 100``)
|
||||||
|
|
||||||
|
Scalar indices may be used if the filter contains non-indexed columns but,
|
||||||
|
depending on the structure of the filter, they may not be usable. For example,
|
||||||
|
if the column ``not_indexed`` does not have a scalar index then the filter
|
||||||
|
``my_col = 0 OR not_indexed = 1`` will not be able to use any scalar index on
|
||||||
|
``my_col``.
|
||||||
|
|
||||||
|
**Experimental API**
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
column : str
|
||||||
|
The column to be indexed. Must be a boolean, integer, float,
|
||||||
|
or string column.
|
||||||
|
replace : bool, default True
|
||||||
|
Replace the existing index if it exists.
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
import lancedb
|
||||||
|
|
||||||
|
table = lancedb.connect("/data/lance").open_table("images")
|
||||||
|
table.create_scalar_index("category")
|
||||||
|
"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def add(
|
def add(
|
||||||
self,
|
self,
|
||||||
@@ -578,6 +649,9 @@ class LanceTable(Table):
|
|||||||
self._reset_dataset()
|
self._reset_dataset()
|
||||||
register_event("create_index")
|
register_event("create_index")
|
||||||
|
|
||||||
|
def create_scalar_index(self, column: str, *, replace: bool = True):
    """Create a ``BTREE`` scalar index on ``column`` of the underlying dataset.

    Parameters
    ----------
    column : str
        The column to be indexed.
    replace : bool, default True
        Forwarded unchanged to the dataset's ``create_scalar_index``.
    """
    dataset = self._dataset
    dataset.create_scalar_index(column, replace=replace, index_type="BTREE")
|
||||||
|
|
||||||
def create_fts_index(
|
def create_fts_index(
|
||||||
self, field_names: Union[str, List[str]], *, replace: bool = False
|
self, field_names: Union[str, List[str]], *, replace: bool = False
|
||||||
):
|
):
|
||||||
|
|||||||
@@ -532,6 +532,33 @@ def test_multiple_vector_columns(db):
|
|||||||
assert result1["text"].iloc[0] != result2["text"].iloc[0]
|
assert result1["text"].iloc[0] != result2["text"].iloc[0]
|
||||||
|
|
||||||
|
|
||||||
|
def test_create_scalar_index(db):
    """Build a scalar index on a string column and check searches still work."""
    vectors = pa.array(
        [[1, 1], [2, 2], [3, 3], [4, 4], [5, 5]], pa.list_(pa.float32(), 2)
    )
    columns = {
        "x": ["c", "b", "a", "e", "b"],
        "y": [1, 2, 3, 4, 5],
        "vector": vectors,
    }
    test_data = pa.Table.from_pydict(columns)
    table = LanceTable.create(db, "my_table", data=test_data)

    table.create_scalar_index("x")

    # Exactly one index must exist, and it must be reported as a scalar index.
    indices = table.to_lance().list_indices()
    assert len(indices) == 1
    assert indices[0]["type"] == "Scalar"

    # Confirm that prefiltering still works with the scalar index column
    only_c = table.search().where("x = 'c'").to_arrow()
    assert only_c == test_data.slice(0, 1)

    # Without a filter the query vector [5, 5] matches its own row exactly.
    nearest = table.search([5, 5]).to_arrow()
    assert nearest["_distance"][0].as_py() == 0

    # The [5, 5] row has x == 'b', so excluding 'b' forces a non-exact match.
    nearest_not_b = table.search([5, 5]).where("x != 'b'").to_arrow()
    assert nearest_not_b["_distance"][0].as_py() > 0
|
||||||
|
|
||||||
|
|
||||||
def test_empty_query(db):
|
def test_empty_query(db):
|
||||||
table = LanceTable.create(
|
table = LanceTable.create(
|
||||||
db,
|
db,
|
||||||
|
|||||||
@@ -12,4 +12,5 @@
|
|||||||
// See the License for the specific language governing permissions and
|
// See the License for the specific language governing permissions and
|
||||||
// limitations under the License.
|
// limitations under the License.
|
||||||
|
|
||||||
|
pub mod scalar;
|
||||||
pub mod vector;
|
pub mod vector;
|
||||||
|
|||||||
43
rust/ffi/node/src/index/scalar.rs
Normal file
43
rust/ffi/node/src/index/scalar.rs
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
// Copyright 2023 Lance Developers.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
use neon::{
|
||||||
|
context::{Context, FunctionContext},
|
||||||
|
result::JsResult,
|
||||||
|
types::{JsBoolean, JsBox, JsPromise, JsString},
|
||||||
|
};
|
||||||
|
|
||||||
|
use crate::{error::ResultExt, runtime, table::JsTable};
|
||||||
|
|
||||||
|
pub(crate) fn table_create_scalar_index(mut cx: FunctionContext) -> JsResult<JsPromise> {
|
||||||
|
let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
|
||||||
|
let column = cx.argument::<JsString>(0)?.value(&mut cx);
|
||||||
|
let replace = cx.argument::<JsBoolean>(1)?.value(&mut cx);
|
||||||
|
|
||||||
|
let rt = runtime(&mut cx)?;
|
||||||
|
|
||||||
|
let (deferred, promise) = cx.promise();
|
||||||
|
let channel = cx.channel();
|
||||||
|
let mut table = js_table.table.clone();
|
||||||
|
|
||||||
|
rt.spawn(async move {
|
||||||
|
let idx_result = table.create_scalar_index(&column, replace).await;
|
||||||
|
|
||||||
|
deferred.settle_with(&channel, move |mut cx| {
|
||||||
|
idx_result.or_throw(&mut cx)?;
|
||||||
|
Ok(cx.undefined())
|
||||||
|
});
|
||||||
|
});
|
||||||
|
Ok(promise)
|
||||||
|
}
|
||||||
@@ -242,6 +242,10 @@ fn main(mut cx: ModuleContext) -> NeonResult<()> {
|
|||||||
cx.export_function("tableCompactFiles", JsTable::js_compact)?;
|
cx.export_function("tableCompactFiles", JsTable::js_compact)?;
|
||||||
cx.export_function("tableListIndices", JsTable::js_list_indices)?;
|
cx.export_function("tableListIndices", JsTable::js_list_indices)?;
|
||||||
cx.export_function("tableIndexStats", JsTable::js_index_stats)?;
|
cx.export_function("tableIndexStats", JsTable::js_index_stats)?;
|
||||||
|
cx.export_function(
|
||||||
|
"tableCreateScalarIndex",
|
||||||
|
index::scalar::table_create_scalar_index,
|
||||||
|
)?;
|
||||||
cx.export_function(
|
cx.export_function(
|
||||||
"tableCreateVectorIndex",
|
"tableCreateVectorIndex",
|
||||||
index::vector::table_create_vector_index,
|
index::vector::table_create_vector_index,
|
||||||
|
|||||||
@@ -14,6 +14,7 @@
|
|||||||
|
|
||||||
use chrono::Duration;
|
use chrono::Duration;
|
||||||
use lance::dataset::builder::DatasetBuilder;
|
use lance::dataset::builder::DatasetBuilder;
|
||||||
|
use lance::index::scalar::ScalarIndexParams;
|
||||||
use lance_index::IndexType;
|
use lance_index::IndexType;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
@@ -262,6 +263,16 @@ impl Table {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Create a scalar index on the table
|
||||||
|
pub async fn create_scalar_index(&mut self, column: &str, replace: bool) -> Result<()> {
|
||||||
|
let mut dataset = self.dataset.as_ref().clone();
|
||||||
|
let params = ScalarIndexParams::default();
|
||||||
|
dataset
|
||||||
|
.create_index(&[column], IndexType::Scalar, None, ¶ms, replace)
|
||||||
|
.await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
pub async fn optimize_indices(&mut self) -> Result<()> {
|
pub async fn optimize_indices(&mut self) -> Result<()> {
|
||||||
let mut dataset = self.dataset.as_ref().clone();
|
let mut dataset = self.dataset.as_ref().clone();
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user