Compare commits

...

9 Commits

Author SHA1 Message Date
Chang She
2b26775ed1 python v0.1.4 2023-05-31 20:11:25 -07:00
Lei Xu
306ada5cb8 Support S3 and GCS from typescript SDK (#106) 2023-05-30 21:32:17 -07:00
gsilvestrin
d3aa8bfbc5 add embedding functions to the nodejs client (#95) 2023-05-26 18:09:20 -07:00
Chang She
04d97347d7 move tantivy-py installation to be separate from wheel (#97)
PyPI does not allow uploading packages that have a direct reference dependency.

For now we'll just ask the user to install tantivy separately.
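
For reference, the separate install pins tantivy-py to a specific commit; the same command is added to CI and the docs in this range: `pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985`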

---------

Co-authored-by: Chang She <chang@lancedb.com>
2023-05-25 17:57:26 -06:00
Chang She
22aa8a93c2 bump version for v0.1.3 2023-05-25 17:01:52 -06:00
Chang She
f485378ea4 Basic full text search capabilities (#62)
This is v1 of integrating a full-text search index into LanceDB.

# API
The query API is roughly the same as before, except that if the input is text
instead of a vector, we assume it's an FTS search.

## Example
If `table` is a LanceDB LanceTable, then:

Build index: `table.create_fts_index("text")`

Query: `df = table.search("puppy").limit(10).select(["text"]).to_df()`

# Implementation
Here we use the tantivy-py package to build the index. We use the
row IDs as the full-text-search index's doc IDs, then do a Take
operation to fetch the matching rows.
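
In code, the retrieval path is roughly (a minimal sketch using the `search_index` helper added in `python/lancedb/fts.py`; `index` and `table` are assumed to already exist):

```python
# search the tantivy index; the returned doc ids are lance row ids
row_ids, scores = search_index(index, "puppy", limit=10)
# fetch the matching rows from the underlying lance dataset with a Take
rows = table.to_lance().take(row_ids, columns=["text"])
```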

# Limitations

1. Incremental row appends are not supported yet; new data won't show up in
search results.
2. Local filesystem only.
3. Requires building tantivy explicitly.

---------

Co-authored-by: Chang She <chang@lancedb.com>
2023-05-24 22:25:31 -06:00
gsilvestrin
f923cfe47f add create index to nodejs client (#89) 2023-05-24 16:45:58 -06:00
gsilvestrin
06cb7b6458 add query params to nodejs client (#87) 2023-05-24 15:48:31 -06:00
gsilvestrin
bdef634954 bugfix: string columns should be converted to Utf8Array (#94) 2023-05-23 14:58:49 -07:00
29 changed files with 1333 additions and 190 deletions


@@ -31,6 +31,7 @@ jobs:
       - name: Install lancedb
         run: |
           pip install -e .
+          pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985
           pip install pytest
       - name: Run tests
         run: pytest -x -v --durations=30 tests
@@ -49,10 +50,11 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
-          python-version: "3.10"
+          python-version: "3.11"
       - name: Install lancedb
         run: |
           pip install -e .
+          pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985
           pip install pytest
       - name: Run tests
         run: pytest -x -v --durations=30 tests

Cargo.lock generated

@@ -1052,6 +1052,7 @@ dependencies = [
  "paste",
  "petgraph",
  "rand",
+ "regex",
  "uuid",
 ]
@@ -1645,9 +1646,9 @@ dependencies = [
 [[package]]
 name = "lance"
-version = "0.4.12"
+version = "0.4.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fc96cf89139af6f439a0e28ccd04ddf81be795b79fda3105b7a8952fadeb778e"
+checksum = "86dda8185bd1ffae7b910c1f68035af23be9b717c52e9cc4de176cd30b47f772"
 dependencies = [
  "accelerate-src",
  "arrow",
@@ -1684,6 +1685,7 @@ dependencies = [
  "rand",
  "reqwest",
  "shellexpand",
+ "snafu",
  "sqlparser-lance",
  "tokio",
  "url",
@@ -3359,8 +3361,11 @@ name = "vectordb"
 version = "0.0.1"
 dependencies = [
  "arrow-array",
+ "arrow-data",
  "arrow-schema",
  "lance",
+ "object_store",
+ "rand",
  "tempfile",
  "tokio",
 ]


@@ -19,6 +19,7 @@ nav:
   - Basics: basic.md
   - Embeddings: embedding.md
   - Indexing: ann_indexes.md
+  - Full-text search: fts.md
   - Integrations: integrations.md
   - Python API: python.md

docs/src/fts.md Normal file

@@ -0,0 +1,51 @@
# [EXPERIMENTAL] Full text search
LanceDB now provides experimental support for full text search.
This is currently Python only. We plan to push the integration down to Rust in the future
to make this available for JS as well.
## Installation
To use full text search, you must install the optional dependency tantivy-py:
```
# tantivy 0.19.2
pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985
```
## Quickstart
Assume:
1. `table` is a LanceDB Table
2. `text` is the name of the Table column that we want to index
To create the index:
```python
table.create_fts_index("text")
```
To search:
```python
df = table.search("puppy").limit(10).select(["text"]).to_df()
```
LanceDB automatically looks for an FTS index if the search input is a string.
## Multiple text columns
If you have multiple columns to index, pass them all as a list to `create_fts_index`:
```python
table.create_fts_index(["text1", "text2"])
```
Note that the search API call does not change; you can search over all indexed columns at once.
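For example, after indexing both `text1` and `text2`, a single query matches either column (mirroring the test added in `python/tests/test_fts.py`):
```python
df = table.search("puppy").limit(10).to_df()
```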
## Current limitations
1. We do not yet support incremental writes.
If you add data after FTS index creation, it won't be reflected
in search results until you do a full reindex.
2. We currently only support local filesystem paths for the FTS index.
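
Until incremental indexing lands, one workaround is to rebuild the index from scratch after adding data. A hypothetical sketch (`_get_fts_index_path` is a private helper and may change):
```python
import shutil

# drop the stale index directory, then rebuild it over the full table
shutil.rmtree(table._get_fts_index_path(), ignore_errors=True)
table.create_fts_index("text")
```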


@@ -45,5 +45,6 @@ We will be adding completed demo apps built using LanceDB.
 * [`Basic Operations`](basic.md) - basic functionality of LanceDB.
 * [`Embedding Functions`](embedding.md) - functions for working with embeddings.
 * [`Indexing`](ann_indexes.md) - create vector indexes to speed up queries.
+* [`Full text search`](fts.md) - [EXPERIMENTAL] full-text search API
 * [`Ecosystem Integrations`](integrations.md) - integrating LanceDB with python data tooling ecosystem.
 * [`API Reference`](python.md) - detailed documentation for the LanceDB Python SDK.


@@ -15,15 +15,16 @@
 import {
   Field,
   Float32,
-  List,
+  List, type ListBuilder,
   makeBuilder,
   RecordBatchFileWriter,
-  Table,
+  Table, Utf8,
   type Vector,
   vectorFromArray
 } from 'apache-arrow'
+import { type EmbeddingFunction } from './index'

-export function convertToTable (data: Array<Record<string, unknown>>): Table {
+export function convertToTable<T> (data: Array<Record<string, unknown>>, embeddings?: EmbeddingFunction<T>): Table {
   if (data.length === 0) {
     throw new Error('At least one record needs to be provided')
   }
@@ -33,11 +34,7 @@ export function convertToTable (data: Array<Record<string, unknown>>): Table {
   for (const columnsKey of columns) {
     if (columnsKey === 'vector') {
-      const children = new Field<Float32>('item', new Float32())
-      const list = new List(children)
-      const listBuilder = makeBuilder({
-        type: list
-      })
+      const listBuilder = newVectorListBuilder()
       const vectorSize = (data[0].vector as any[]).length
       for (const datum of data) {
         if ((datum[columnsKey] as any[]).length !== vectorSize) {
@@ -52,15 +49,37 @@ export function convertToTable (data: Array<Record<string, unknown>>): Table {
       for (const datum of data) {
         values.push(datum[columnsKey])
       }
+      if (columnsKey === embeddings?.sourceColumn) {
+        const vectors = embeddings.embed(values as T[])
+        const listBuilder = newVectorListBuilder()
+        vectors.map(v => listBuilder.append(v))
+        records.vector = listBuilder.finish().toVector()
+      }
+      if (typeof values[0] === 'string') {
+        // `vectorFromArray` converts strings into dictionary vectors, forcing it back to a string column
+        records[columnsKey] = vectorFromArray(values, new Utf8())
+      } else {
         records[columnsKey] = vectorFromArray(values)
+      }
     }
   }
   return new Table(records)
 }

-export async function fromRecordsToBuffer (data: Array<Record<string, unknown>>): Promise<Buffer> {
-  const table = convertToTable(data)
+// Creates a new Arrow ListBuilder that stores a Vector column
+function newVectorListBuilder (): ListBuilder<Float32, any> {
+  const children = new Field<Float32>('item', new Float32())
+  const list = new List(children)
+  return makeBuilder({
+    type: list
+  })
+}
+
+export async function fromRecordsToBuffer<T> (data: Array<Record<string, unknown>>, embeddings?: EmbeddingFunction<T>): Promise<Buffer> {
+  const table = convertToTable(data, embeddings)
   const writer = RecordBatchFileWriter.writeAll(table)
   return Buffer.from(await writer.toUint8Array())
 }


@@ -21,14 +21,15 @@ import {
 import { fromRecordsToBuffer } from './arrow'
 // eslint-disable-next-line @typescript-eslint/no-var-requires
-const { databaseNew, databaseTableNames, databaseOpenTable, tableCreate, tableSearch, tableAdd } = require('../native.js')
+const { databaseNew, databaseTableNames, databaseOpenTable, tableCreate, tableSearch, tableAdd, tableCreateVectorIndex } = require('../native.js')

 /**
  * Connect to a LanceDB instance at the given URI
  * @param uri The uri of the database.
  */
 export async function connect (uri: string): Promise<Connection> {
-  return new Connection(uri)
+  const db = await databaseNew(uri)
+  return new Connection(db, uri)
 }

 /**
@@ -38,9 +39,9 @@ export class Connection {
   private readonly _uri: string
   private readonly _db: any

-  constructor (uri: string) {
+  constructor (db: any, uri: string) {
     this._uri = uri
-    this._db = databaseNew(uri)
+    this._db = db
   }

   get uri (): string {
@@ -56,16 +57,49 @@ export class Connection {
   /**
    * Open a table in the database.
+   *
    * @param name The name of the table.
    */
-  async openTable (name: string): Promise<Table> {
+  async openTable (name: string): Promise<Table>
+  /**
+   * Open a table in the database.
+   *
+   * @param name The name of the table.
+   * @param embeddings An embedding function to use on this Table
+   */
+  async openTable<T> (name: string, embeddings: EmbeddingFunction<T>): Promise<Table<T>>
+  async openTable<T> (name: string, embeddings?: EmbeddingFunction<T>): Promise<Table<T>> {
     const tbl = await databaseOpenTable.call(this._db, name)
+    if (embeddings !== undefined) {
+      return new Table(tbl, name, embeddings)
+    } else {
       return new Table(tbl, name)
     }
+  }

-  async createTable (name: string, data: Array<Record<string, unknown>>): Promise<Table> {
-    await tableCreate.call(this._db, name, await fromRecordsToBuffer(data))
-    return await this.openTable(name)
+  /**
+   * Creates a new Table and initializes it with new data.
+   *
+   * @param name The name of the table.
+   * @param data Non-empty Array of Records to be inserted into the Table
+   */
+  async createTable (name: string, data: Array<Record<string, unknown>>): Promise<Table>
+  /**
+   * Creates a new Table and initializes it with new data.
+   *
+   * @param name The name of the table.
+   * @param data Non-empty Array of Records to be inserted into the Table
+   * @param embeddings An embedding function to use on this Table
+   */
+  async createTable<T> (name: string, data: Array<Record<string, unknown>>, embeddings: EmbeddingFunction<T>): Promise<Table<T>>
+  async createTable<T> (name: string, data: Array<Record<string, unknown>>, embeddings?: EmbeddingFunction<T>): Promise<Table<T>> {
+    const tbl = await tableCreate.call(this._db, name, await fromRecordsToBuffer(data, embeddings))
+    if (embeddings !== undefined) {
+      return new Table(tbl, name, embeddings)
+    } else {
+      return new Table(tbl, name)
+    }
   }

   async createTableArrow (name: string, table: ArrowTable): Promise<Table> {
@@ -75,16 +109,22 @@
   }
 }

-/**
- * A table in a LanceDB database.
- */
-export class Table {
+export class Table<T = number[]> {
   private readonly _tbl: any
   private readonly _name: string
+  private readonly _embeddings?: EmbeddingFunction<T>

-  constructor (tbl: any, name: string) {
+  constructor (tbl: any, name: string)
+  /**
+   * @param tbl
+   * @param name
+   * @param embeddings An embedding function to use when interacting with this table
+   */
+  constructor (tbl: any, name: string, embeddings: EmbeddingFunction<T>)
+  constructor (tbl: any, name: string, embeddings?: EmbeddingFunction<T>) {
     this._tbl = tbl
     this._name = name
+    this._embeddings = embeddings
   }

   get name (): string {
@@ -92,72 +132,173 @@
   }

   /**
-   * Create a search query to find the nearest neighbors of the given query vector.
-   * @param queryVector The query vector.
+   * Creates a search query to find the nearest neighbors of the given search term
+   * @param query The query search term
    */
-  search (queryVector: number[]): Query {
+  search (query: T): Query {
+    let queryVector: number[]
+    if (this._embeddings !== undefined) {
+      queryVector = this._embeddings.embed([query])[0]
+    } else {
+      queryVector = query as number[]
+    }
     return new Query(this._tbl, queryVector)
   }

   /**
-   * Insert records into this Table
-   * @param data Records to be inserted into the Table
+   * Insert records into this Table.
    *
-   * @param mode Append / Overwrite existing records. Default: Append
+   * @param data Records to be inserted into the Table
    * @return The number of rows added to the table
    */
   async add (data: Array<Record<string, unknown>>): Promise<number> {
-    return tableAdd.call(this._tbl, await fromRecordsToBuffer(data), WriteMode.Append.toString())
+    return tableAdd.call(this._tbl, await fromRecordsToBuffer(data, this._embeddings), WriteMode.Append.toString())
   }

+  /**
+   * Insert records into this Table, replacing its contents.
+   *
+   * @param data Records to be inserted into the Table
+   * @return The number of rows added to the table
+   */
   async overwrite (data: Array<Record<string, unknown>>): Promise<number> {
-    return tableAdd.call(this._tbl, await fromRecordsToBuffer(data), WriteMode.Overwrite.toString())
+    return tableAdd.call(this._tbl, await fromRecordsToBuffer(data, this._embeddings), WriteMode.Overwrite.toString())
   }
+
+  /**
+   * Create an ANN index on this Table's vector column.
+   *
+   * @param indexParams The parameters of this Index, @see VectorIndexParams.
+   */
+  async create_index (indexParams: VectorIndexParams): Promise<any> {
+    return tableCreateVectorIndex.call(this._tbl, indexParams)
+  }
 }

+interface IvfPQIndexConfig {
+  /**
+   * The column to be indexed
+   */
+  column?: string
+
+  /**
+   * A unique name for the index
+   */
+  index_name?: string
+
+  /**
+   * Metric type, L2 or Cosine
+   */
+  metric_type?: MetricType
+
+  /**
+   * The number of partitions of this index
+   */
+  num_partitions?: number
+
+  /**
+   * The max number of iterations for kmeans training.
+   */
+  max_iters?: number
+
+  /**
+   * Train as optimized product quantization.
+   */
+  use_opq?: boolean
+
+  /**
+   * Number of subvectors to build PQ code
+   */
+  num_sub_vectors?: number
+
+  /**
+   * The number of bits to present one PQ centroid.
+   */
+  num_bits?: number
+
+  /**
+   * Max number of iterations to train OPQ, if `use_opq` is true.
+   */
+  max_opq_iters?: number
+
+  type: 'ivf_pq'
+}
+
+export type VectorIndexParams = IvfPQIndexConfig
+
 /**
  * A builder for nearest neighbor queries for LanceDB.
  */
 export class Query {
   private readonly _tbl: any
-  private readonly _query_vector: number[]
+  private readonly _queryVector: number[]
   private _limit: number
-  private readonly _refine_factor?: number
+  private _refineFactor?: number
-  private readonly _nprobes: number
+  private _nprobes: number
   private readonly _columns?: string[]
   private _filter?: string
-  private readonly _metric = 'L2'
+  private _metricType?: MetricType

   constructor (tbl: any, queryVector: number[]) {
     this._tbl = tbl
-    this._query_vector = queryVector
+    this._queryVector = queryVector
     this._limit = 10
     this._nprobes = 20
-    this._refine_factor = undefined
+    this._refineFactor = undefined
     this._columns = undefined
     this._filter = undefined
+    this._metricType = undefined
   }

+  /**
+   * Sets the number of results that will be returned
+   * @param value number of results
+   */
   limit (value: number): Query {
     this._limit = value
     return this
   }

+  /**
+   * Refine the results by reading extra elements and re-ranking them in memory.
+   * @param value refine factor to use in this query.
+   */
+  refineFactor (value: number): Query {
+    this._refineFactor = value
+    return this
+  }
+
+  /**
+   * The number of probes used. A higher number makes search more accurate but also slower.
+   * @param value The number of probes used.
+   */
+  nprobes (value: number): Query {
+    this._nprobes = value
+    return this
+  }
+
+  /**
+   * A filter statement to be applied to this query.
+   * @param value A filter in the same format used by a sql WHERE clause.
+   */
   filter (value: string): Query {
     this._filter = value
     return this
   }

+  /**
+   * The MetricType used for this Query.
+   * @param value The metric to use. @see MetricType for the different options
+   */
+  metricType (value: MetricType): Query {
+    this._metricType = value
+    return this
+  }

   /**
    * Execute the query and return the results as an Array of Objects
    */
   async execute<T = Record<string, unknown>> (): Promise<T[]> {
-    let buffer
-    if (this._filter != null) {
-      buffer = await tableSearch.call(this._tbl, this._query_vector, this._limit, this._filter)
-    } else {
-      buffer = await tableSearch.call(this._tbl, this._query_vector, this._limit)
-    }
+    const buffer = await tableSearch.call(this._tbl, this)
     const data = tableFromIPC(buffer)
     return data.toArray().map((entry: Record<string, unknown>) => {
       const newObject: Record<string, unknown> = {}
@@ -177,3 +318,33 @@ export enum WriteMode {
   Overwrite = 'overwrite',
   Append = 'append'
 }
+
+/**
+ * An embedding function that automatically creates vector representation for a given column.
+ */
+export interface EmbeddingFunction<T> {
+  /**
+   * The name of the column that will be used as input for the Embedding Function.
+   */
+  sourceColumn: string
+
+  /**
+   * Creates a vector representation for the given values.
+   */
+  embed: (data: T[]) => number[][]
+}
+
+/**
+ * Distance metrics type.
+ */
+export enum MetricType {
+  /**
+   * Euclidean distance
+   */
+  L2 = 'l2',
+  /**
+   * Cosine distance
+   */
+  Cosine = 'cosine'
+}

node/src/test/io.ts Normal file

@@ -0,0 +1,52 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// IO tests

import { describe } from 'mocha'
import { assert } from 'chai'

import * as lancedb from '../index'

describe('LanceDB S3 client', function () {
  if (process.env.TEST_S3_BASE_URL != null) {
    const baseUri = process.env.TEST_S3_BASE_URL
    it('should have a valid url', async function () {
      const uri = `${baseUri}/valid_url`
      const table = await createTestDB(uri, 2, 20)
      const con = await lancedb.connect(uri)
      assert.equal(con.uri, uri)

      const results = await table.search([0.1, 0.3]).limit(5).execute()
      assert.equal(results.length, 5)
    })
  } else {
    describe.skip('Skip S3 test', function () {})
  }
})

async function createTestDB (uri: string, numDimensions: number = 2, numRows: number = 2): Promise<lancedb.Table> {
  const con = await lancedb.connect(uri)
  const data = []
  for (let i = 0; i < numRows; i++) {
    const vector = []
    for (let j = 0; j < numDimensions; j++) {
      vector.push(i + (j * 0.1))
    }
    data.push({ id: i + 1, name: `name_${i}`, price: i + 10, is_active: (i % 2 === 0), vector })
  }
  return await con.createTable('vectors', data)
}


@@ -17,6 +17,7 @@ import { assert } from 'chai'
 import { track } from 'temp'

 import * as lancedb from '../index'
+import { type EmbeddingFunction, MetricType, Query } from '../index'

 describe('LanceDB client', function () {
   describe('when creating a connection to lancedb', function () {
@@ -67,7 +68,7 @@ describe('LanceDB client', function () {
     const uri = await createTestDB()
     const con = await lancedb.connect(uri)
     const table = await con.openTable('vectors')
-    const results = await table.search([0.1, 0.3]).filter('id == 2').execute()
+    const results = await table.search([0.1, 0.1]).filter('id == 2').execute()
     assert.equal(results.length, 1)
     assert.equal(results[0].id, 2)
   })
@@ -96,8 +97,8 @@ describe('LanceDB client', function () {
     const con = await lancedb.connect(dir)

     const data = [
-      { id: 1, vector: [0.1, 0.2], price: 10 },
-      { id: 2, vector: [1.1, 1.2], price: 50 }
+      { id: 1, vector: [0.1, 0.2], price: 10, name: 'a' },
+      { id: 2, vector: [1.1, 1.2], price: 50, name: 'b' }
     ]

     const table = await con.createTable('vectors', data)
@@ -105,8 +106,8 @@ describe('LanceDB client', function () {
     assert.equal(results.length, 2)

     const dataAdd = [
-      { id: 3, vector: [2.1, 2.2], price: 10 },
-      { id: 4, vector: [3.1, 3.2], price: 50 }
+      { id: 3, vector: [2.1, 2.2], price: 10, name: 'c' },
+      { id: 4, vector: [3.1, 3.2], price: 50, name: 'd' }
     ]
     await table.add(dataAdd)
     const resultsAdd = await table.search([0.1, 0.3]).execute()
@@ -130,16 +131,76 @@ describe('LanceDB client', function () {
       assert.equal(resultsAdd.length, 2)
     })
   })
+
+  describe('when creating a vector index', function () {
+    it('overwrite all records in a table', async function () {
+      const uri = await createTestDB(32, 300)
+      const con = await lancedb.connect(uri)
+      const table = await con.openTable('vectors')
+      await table.create_index({ type: 'ivf_pq', column: 'vector', num_partitions: 2, max_iters: 2 })
+    }).timeout(10_000) // Timeout is high partially because GH macos runner is pretty slow
+  })
+
+  describe('when using a custom embedding function', function () {
+    class TextEmbedding implements EmbeddingFunction<string> {
+      sourceColumn: string
+
+      constructor (targetColumn: string) {
+        this.sourceColumn = targetColumn
+      }
+
+      _embedding_map = new Map<string, number[]>([
+        ['foo', [2.1, 2.2]],
+        ['bar', [3.1, 3.2]]
+      ])
+
+      embed (data: string[]): number[][] {
+        return data.map(datum => this._embedding_map.get(datum) ?? [0.0, 0.0])
+      }
+    }
+
+    it('should encode the original data into embeddings', async function () {
+      const dir = await track().mkdir('lancejs')
+      const con = await lancedb.connect(dir)
+      const embeddings = new TextEmbedding('name')
+      const data = [
+        { price: 10, name: 'foo' },
+        { price: 50, name: 'bar' }
+      ]
+      const table = await con.createTable('vectors', data, embeddings)
+      const results = await table.search('foo').execute()
+      assert.equal(results.length, 2)
+    })
+  })
 })
+
+describe('Query object', function () {
+  it('sets custom parameters', async function () {
+    const query = new Query(undefined, [0.1, 0.3])
+      .limit(1)
+      .metricType(MetricType.Cosine)
+      .refineFactor(100)
+      .nprobes(20) as Record<string, any>
+    assert.equal(query._limit, 1)
+    assert.equal(query._metricType, MetricType.Cosine)
+    assert.equal(query._refineFactor, 100)
+    assert.equal(query._nprobes, 20)
+  })
+})

-async function createTestDB (): Promise<string> {
+async function createTestDB (numDimensions: number = 2, numRows: number = 2): Promise<string> {
   const dir = await track().mkdir('lancejs')
   const con = await lancedb.connect(dir)
-  const data = [
-    { id: 1, vector: [0.1, 0.2], name: 'foo', price: 10, is_active: true },
-    { id: 2, vector: [1.1, 1.2], name: 'bar', price: 50, is_active: false }
-  ]
+  const data = []
+  for (let i = 0; i < numRows; i++) {
+    const vector = []
+    for (let j = 0; j < numDimensions; j++) {
+      vector.push(i + (j * 0.1))
+    }
+    data.push({ id: i + 1, name: `name_${i}`, price: i + 10, is_active: (i % 2 === 0), vector })
+  }
   await con.createTable('vectors', data)
   return dir

python/lancedb/fts.py Normal file

@@ -0,0 +1,128 @@
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Full text search index using tantivy-py"""
import os
from typing import List, Tuple
import pyarrow as pa
try:
import tantivy
except ImportError:
raise ImportError(
"Please install tantivy-py `pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985` to use the full text search feature."
)
from .table import LanceTable
def create_index(index_path: str, text_fields: List[str]) -> tantivy.Index:
"""
Create a new Index (not populated)
Parameters
----------
index_path : str
Path to the index directory
text_fields : List[str]
List of text fields to index
Returns
-------
index : tantivy.Index
The index object (not yet populated)
"""
# Declaring our schema.
schema_builder = tantivy.SchemaBuilder()
# special field that we'll populate with row_id
schema_builder.add_integer_field("doc_id", stored=True)
# data fields
for name in text_fields:
schema_builder.add_text_field(name, stored=True)
schema = schema_builder.build()
os.makedirs(index_path, exist_ok=True)
index = tantivy.Index(schema, path=index_path)
return index
def populate_index(index: tantivy.Index, table: LanceTable, fields: List[str]) -> int:
"""
Populate an index with data from a LanceTable
Parameters
----------
index : tantivy.Index
The index object
table : LanceTable
The table to index
fields : List[str]
List of fields to index
"""
# first check the fields exist and are string or large string type
for name in fields:
f = table.schema.field(name) # raises KeyError if not found
if not pa.types.is_string(f.type) and not pa.types.is_large_string(f.type):
raise TypeError(f"Field {name} is not a string type")
# create a tantivy writer
writer = index.writer()
# write data into index
dataset = table.to_lance()
row_id = 0
for b in dataset.to_batches(columns=fields):
for i in range(b.num_rows):
doc = tantivy.Document()
doc.add_integer("doc_id", row_id)
for name in fields:
doc.add_text(name, b[name][i].as_py())
writer.add_document(doc)
row_id += 1
# commit changes
writer.commit()
return row_id
def search_index(
index: tantivy.Index, query: str, limit: int = 10
) -> Tuple[Tuple[int], Tuple[float]]:
"""
Search an index for a query
Parameters
----------
index : tantivy.Index
The index object
query : str
The query string
limit : int
The maximum number of results to return
Returns
-------
ids_and_score: list[tuple[int], tuple[float]]
A tuple of two tuples, the first containing the document ids
and the second containing the scores
"""
searcher = index.searcher()
query = index.parse_query(query)
# get top results
results = searcher.search(query, limit)
return tuple(
zip(
*[
(searcher.doc(doc_address)["doc_id"][0], score)
for score, doc_address in results.hits
]
)
)


@@ -14,6 +14,7 @@ from __future__ import annotations
 import numpy as np
 import pandas as pd
+import pyarrow as pa

 from .common import VECTOR_COLUMN_NAME
@@ -131,7 +132,6 @@ class LanceQueryBuilder:
         vector and the returned vector.
         """
         ds = self._table.to_lance()
-        # TODO indexed search
         tbl = ds.to_table(
             columns=self._columns,
             filter=self._where,
@@ -145,3 +145,26 @@ class LanceQueryBuilder:
             },
         )
         return tbl.to_pandas()
+
+
+class LanceFtsQueryBuilder(LanceQueryBuilder):
+    def to_df(self) -> pd.DataFrame:
+        try:
+            import tantivy
+        except ImportError:
+            raise ImportError(
+                "Please install tantivy-py `pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985` to use the full text search feature."
+            )
+
+        from .fts import search_index
+
+        # get the index path
+        index_path = self._table._get_fts_index_path()
+        # open the index
+        index = tantivy.Index.open(index_path)
+        # get the scores and doc ids
+        row_ids, scores = search_index(index, self._query, self._limit)
+        scores = pa.array(scores)
+        output_tbl = self._table.to_lance().take(row_ids, columns=self._columns)
+        output_tbl = output_tbl.append_column("score", scores)
+        return output_tbl.to_pandas()


@@ -14,7 +14,9 @@
 from __future__ import annotations

 import os
+import shutil
 from functools import cached_property
+from typing import List, Union

 import lance
 import numpy as np
@@ -24,7 +26,8 @@ from lance import LanceDataset
 from lance.vector import vec_to_table

 from .common import DATA, VEC, VECTOR_COLUMN_NAME
-from .query import LanceQueryBuilder
+from .query import LanceFtsQueryBuilder, LanceQueryBuilder
+from .util import get_uri_scheme


 def _sanitize_data(data, schema):
@@ -130,6 +133,27 @@ class LanceTable:
         )
         self._reset_dataset()

+    def create_fts_index(self, field_names: Union[str, List[str]]):
+        """Create a full-text search index on the table.
+
+        Warning - this API is highly experimental and is highly likely to change
+        in the future.
+
+        Parameters
+        ----------
+        field_names: str or list of str
+            The name(s) of the field to index.
+        """
+        from .fts import create_index, populate_index
+
+        if isinstance(field_names, str):
+            field_names = [field_names]
+        index = create_index(self._get_fts_index_path(), field_names)
+        populate_index(index, self, field_names)
+
+    def _get_fts_index_path(self):
+        return os.path.join(self._dataset_uri, "_indices", "tantivy")
+
     @cached_property
     def _dataset(self) -> LanceDataset:
         return lance.dataset(self._dataset_uri, version=self._version)
@@ -158,7 +182,7 @@ class LanceTable:
         self._reset_dataset()
         return len(self)

-    def search(self, query: VEC) -> LanceQueryBuilder:
+    def search(self, query: Union[VEC, str]) -> LanceQueryBuilder:
         """Create a search query to find the nearest neighbors
         of the given query vector.
@@ -174,6 +198,10 @@ class LanceTable:
         and also the "score" column which is the distance between the query
         vector and the returned vector.
         """
+        if isinstance(query, str):
+            # fts
+            return LanceFtsQueryBuilder(self, query)
         if isinstance(query, list):
             query = np.array(query)
         if isinstance(query, np.ndarray):


@@ -1,7 +1,7 @@
 [project]
 name = "lancedb"
-version = "0.1.2"
-dependencies = ["pylance>=0.4.6", "ratelimiter", "retry", "tqdm"]
+version = "0.1.4"
+dependencies = ["pylance>=0.4.17", "ratelimiter", "retry", "tqdm"]
 description = "lancedb"
 authors = [
     { name = "LanceDB Devs", email = "dev@lancedb.com" },


@@ -14,7 +14,6 @@ import sys
 import numpy as np
 import pyarrow as pa
-
 from lancedb.embeddings import with_embeddings

python/tests/test_fts.py Normal file

@@ -0,0 +1,84 @@
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random

import lancedb.fts
import numpy as np
import pandas as pd
import pytest
import tantivy

import lancedb as ldb


@pytest.fixture
def table(tmp_path) -> ldb.table.LanceTable:
    db = ldb.connect(tmp_path)
    vectors = [np.random.randn(128) for _ in range(100)]
    nouns = ("puppy", "car", "rabbit", "girl", "monkey")
    verbs = ("runs", "hits", "jumps", "drives", "barfs")
    adv = ("crazily.", "dutifully.", "foolishly.", "merrily.", "occasionally.")
    adj = ("adorable", "clueless", "dirty", "odd", "stupid")
    text = [
        " ".join(
            [
                nouns[random.randrange(0, 5)],
                verbs[random.randrange(0, 5)],
                adv[random.randrange(0, 5)],
                adj[random.randrange(0, 5)],
            ]
        )
        for _ in range(100)
    ]
    table = db.create_table(
        "test", data=pd.DataFrame({"vector": vectors, "text": text, "text2": text})
    )
    return table


def test_create_index(tmp_path):
    index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
    assert isinstance(index, tantivy.Index)
    assert os.path.exists(str(tmp_path / "index"))


def test_populate_index(tmp_path, table):
    index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
    assert ldb.fts.populate_index(index, table, ["text"]) == len(table)


def test_search_index(tmp_path, table):
    index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
    ldb.fts.populate_index(index, table, ["text"])
    index.reload()
    results = ldb.fts.search_index(index, query="puppy", limit=10)
    assert len(results) == 2
    assert len(results[0]) == 10  # row_ids
    assert len(results[1]) == 10  # scores


def test_create_index_from_table(tmp_path, table):
    table.create_fts_index("text")
    df = table.search("puppy").limit(10).select(["text"]).to_df()
    assert len(df) == 10
    assert "text" in df.columns


def test_create_index_multiple_columns(tmp_path, table):
    table.create_fts_index(["text", "text2"])
    df = table.search("puppy").limit(10).to_df()
    assert len(df) == 10
    assert "text" in df.columns
    assert "text2" in df.columns


@@ -17,7 +17,6 @@ import pandas as pd
 import pandas.testing as tm
 import pyarrow as pa
 import pytest
-
 from lancedb.query import LanceQueryBuilder


@@ -16,7 +16,6 @@ from pathlib import Path
 import pandas as pd
 import pyarrow as pa
 import pytest
-
 from lancedb.table import LanceTable


@@ -15,7 +15,7 @@ arrow-ipc = "37.0"
 arrow-schema = "37.0"
 once_cell = "1"
 futures = "0.3"
-lance = "0.4.3"
+lance = "0.4.17"
 vectordb = { path = "../../vectordb" }
 tokio = { version = "1.23", features = ["rt-multi-thread"] }
 neon = {version = "0.10.1", default-features = false, features = ["channel-api", "napi-6", "promise-api", "task-api"] }


@@ -0,0 +1,15 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod vector;


@@ -0,0 +1,128 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::convert::TryFrom;

use lance::index::vector::ivf::IvfBuildParams;
use lance::index::vector::pq::PQBuildParams;
use lance::index::vector::MetricType;
use neon::context::FunctionContext;
use neon::prelude::*;
use vectordb::index::vector::{IvfPQIndexBuilder, VectorIndexBuilder};

use crate::{runtime, JsTable};

pub(crate) fn table_create_vector_index(mut cx: FunctionContext) -> JsResult<JsPromise> {
    let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
    let index_params = cx.argument::<JsObject>(0)?;
    let index_params_builder = get_index_params_builder(&mut cx, index_params).unwrap();

    let rt = runtime(&mut cx)?;
    let channel = cx.channel();
    let (deferred, promise) = cx.promise();
    let table = js_table.table.clone();

    rt.block_on(async move {
        let add_result = table
            .lock()
            .unwrap()
            .create_index(&index_params_builder)
            .await;

        deferred.settle_with(&channel, move |mut cx| {
            add_result
                .map(|_| cx.undefined())
                .or_else(|err| cx.throw_error(err.to_string()))
        });
    });
    Ok(promise)
}

fn get_index_params_builder(
    cx: &mut FunctionContext,
    obj: Handle<JsObject>,
) -> Result<impl VectorIndexBuilder, String> {
    let idx_type = obj
        .get::<JsString, _, _>(cx, "type")
        .map_err(|t| t.to_string())?
        .value(cx);
    match idx_type.as_str() {
        "ivf_pq" => {
            let mut index_builder: IvfPQIndexBuilder = IvfPQIndexBuilder::new();
            let mut pq_params = PQBuildParams::default();

            obj.get_opt::<JsString, _, _>(cx, "column")
                .map_err(|t| t.to_string())?
                .map(|s| index_builder.column(s.value(cx)));
            obj.get_opt::<JsString, _, _>(cx, "index_name")
                .map_err(|t| t.to_string())?
                .map(|s| index_builder.index_name(s.value(cx)));
            obj.get_opt::<JsString, _, _>(cx, "metric_type")
                .map_err(|t| t.to_string())?
                .map(|s| MetricType::try_from(s.value(cx).as_str()))
                .map(|mt| {
                    let metric_type = mt.unwrap();
                    index_builder.metric_type(metric_type);
                    pq_params.metric_type = metric_type;
                });

            let num_partitions = obj
                .get_opt::<JsNumber, _, _>(cx, "num_partitions")
                .map_err(|t| t.to_string())?
                .map(|s| s.value(cx) as usize);
            let max_iters = obj
                .get_opt::<JsNumber, _, _>(cx, "max_iters")
                .map_err(|t| t.to_string())?
                .map(|s| s.value(cx) as usize);
            num_partitions.map(|np| {
                let max_iters = max_iters.unwrap_or(50);
                let ivf_params = IvfBuildParams {
                    num_partitions: np,
                    max_iters,
                };
                index_builder.ivf_params(ivf_params)
            });

            obj.get_opt::<JsBoolean, _, _>(cx, "use_opq")
                .map_err(|t| t.to_string())?
                .map(|s| pq_params.use_opq = s.value(cx));
            obj.get_opt::<JsNumber, _, _>(cx, "num_sub_vectors")
                .map_err(|t| t.to_string())?
                .map(|s| pq_params.num_sub_vectors = s.value(cx) as usize);
            obj.get_opt::<JsNumber, _, _>(cx, "num_bits")
                .map_err(|t| t.to_string())?
                .map(|s| pq_params.num_bits = s.value(cx) as usize);
            obj.get_opt::<JsNumber, _, _>(cx, "max_iters")
                .map_err(|t| t.to_string())?
                .map(|s| pq_params.max_iters = s.value(cx) as usize);
            obj.get_opt::<JsNumber, _, _>(cx, "max_opq_iters")
                .map_err(|t| t.to_string())?
                .map(|s| pq_params.max_opq_iters = s.value(cx) as usize);

            Ok(index_builder)
        }
        t => Err(format!("{} is not a valid index type", t).to_string()),
    }
}


@@ -13,6 +13,7 @@
 // limitations under the License.

 use std::collections::HashMap;
+use std::convert::TryFrom;
 use std::ops::Deref;
 use std::sync::{Arc, Mutex};
@@ -21,6 +22,7 @@ use arrow_ipc::writer::FileWriter;
 use futures::{TryFutureExt, TryStreamExt};
 use lance::arrow::RecordBatchBuffer;
 use lance::dataset::WriteMode;
+use lance::index::vector::MetricType;
 use neon::prelude::*;
 use neon::types::buffer::TypedArray;
 use once_cell::sync::OnceCell;
@@ -34,17 +36,18 @@ use crate::arrow::arrow_buffer_to_record_batch;
 mod arrow;
 mod convert;
+mod index;

 struct JsDatabase {
     database: Arc<Database>,
 }

-impl Finalize for JsDatabase {}
-
 struct JsTable {
     table: Arc<Mutex<Table>>,
 }

+impl Finalize for JsDatabase {}
 impl Finalize for JsTable {}

 fn runtime<'a, C: Context<'a>>(cx: &mut C) -> NeonResult<&'static Runtime> {
@@ -53,23 +56,46 @@ fn runtime<'a, C: Context<'a>>(cx: &mut C) -> NeonResult<&'static Runtime> {
     RUNTIME.get_or_try_init(|| Runtime::new().or_else(|err| cx.throw_error(err.to_string())))
 }

-fn database_new(mut cx: FunctionContext) -> JsResult<JsBox<JsDatabase>> {
+fn database_new(mut cx: FunctionContext) -> JsResult<JsPromise> {
     let path = cx.argument::<JsString>(0)?.value(&mut cx);
+    let rt = runtime(&mut cx)?;
+    let channel = cx.channel();
+    let (deferred, promise) = cx.promise();
+
+    rt.spawn(async move {
+        let database = Database::connect(&path).await;
+        deferred.settle_with(&channel, move |mut cx| {
             let db = JsDatabase {
-                database: Arc::new(Database::connect(path).or_else(|err| cx.throw_error(err.to_string()))?),
+                database: Arc::new(database.or_else(|err| cx.throw_error(err.to_string()))?),
             };
             Ok(cx.boxed(db))
+        });
+    });
+    Ok(promise)
 }

-fn database_table_names(mut cx: FunctionContext) -> JsResult<JsArray> {
+fn database_table_names(mut cx: FunctionContext) -> JsResult<JsPromise> {
     let db = cx
         .this()
         .downcast_or_throw::<JsBox<JsDatabase>, _>(&mut cx)?;
-    let tables = db
-        .database
-        .table_names()
-        .or_else(|err| cx.throw_error(err.to_string()))?;
-    convert::vec_str_to_array(&tables, &mut cx)
+
+    let rt = runtime(&mut cx)?;
+    let (deferred, promise) = cx.promise();
+    let channel = cx.channel();
+    let database = db.database.clone();
+
+    rt.spawn(async move {
+        let tables_rst = database.table_names().await;
+        deferred.settle_with(&channel, move |mut cx| {
+            let tables = tables_rst.or_else(|err| cx.throw_error(err.to_string()))?;
+            let table_names = convert::vec_str_to_array(&tables, &mut cx);
+            table_names
+        });
+    });
+    Ok(promise)
 }

 fn database_open_table(mut cx: FunctionContext) -> JsResult<JsPromise> {
@@ -84,10 +110,12 @@ fn database_open_table(mut cx: FunctionContext) -> JsResult<JsPromise> {
     let (deferred, promise) = cx.promise();

     rt.spawn(async move {
-        let table_rst = database.open_table(table_name).await;
+        let table_rst = database.open_table(&table_name).await;
         deferred.settle_with(&channel, move |mut cx| {
-            let table = Arc::new(Mutex::new(table_rst.or_else(|err| cx.throw_error(err.to_string()))?));
+            let table = Arc::new(Mutex::new(
+                table_rst.or_else(|err| cx.throw_error(err.to_string()))?,
+            ));
             Ok(cx.boxed(JsTable { table }))
         });
     });
@@ -96,15 +124,32 @@ fn database_open_table(mut cx: FunctionContext) -> JsResult<JsPromise> {
 fn table_search(mut cx: FunctionContext) -> JsResult<JsPromise> {
     let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
-    let query_vector = cx.argument::<JsArray>(0)?; //. .as_value(&mut cx);
-    let limit = cx.argument::<JsNumber>(1)?.value(&mut cx);
-    let filter = cx.argument_opt(2).map(|f| f.downcast_or_throw::<JsString, _>(&mut cx).unwrap().value(&mut cx));
+    let query_obj = cx.argument::<JsObject>(0)?;
+
+    let limit = query_obj
+        .get::<JsNumber, _, _>(&mut cx, "_limit")?
+        .value(&mut cx);
+    let filter = query_obj
+        .get_opt::<JsString, _, _>(&mut cx, "_filter")?
+        .map(|s| s.value(&mut cx));
+    let refine_factor = query_obj
+        .get_opt::<JsNumber, _, _>(&mut cx, "_refineFactor")?
+        .map(|s| s.value(&mut cx))
+        .map(|i| i as u32);
+    let nprobes = query_obj
+        .get::<JsNumber, _, _>(&mut cx, "_nprobes")?
+        .value(&mut cx) as usize;
+    let metric_type = query_obj
+        .get_opt::<JsString, _, _>(&mut cx, "_metricType")?
+        .map(|s| s.value(&mut cx))
+        .map(|s| MetricType::try_from(s.as_str()).unwrap());

     let rt = runtime(&mut cx)?;
     let channel = cx.channel();
     let (deferred, promise) = cx.promise();
     let table = js_table.table.clone();
+    let query_vector = query_obj.get::<JsArray, _, _>(&mut cx, "_queryVector")?;
     let query = convert::js_array_to_vec(query_vector.deref(), &mut cx);

     rt.spawn(async move {
@@ -113,7 +158,10 @@ fn table_search(mut cx: FunctionContext) -> JsResult<JsPromise> {
             .unwrap()
             .search(Float32Array::from(query))
             .limit(limit as usize)
-            .filter(filter);
+            .refine_factor(refine_factor)
+            .nprobes(nprobes)
+            .filter(filter)
+            .metric_type(metric_type);
         let record_batch_stream = builder.execute();
         let results = record_batch_stream
             .and_then(|stream| stream.try_collect::<Vec<_>>().map_err(Error::from))
@@ -161,10 +209,12 @@
     rt.block_on(async move {
         let batch_reader: Box<dyn RecordBatchReader> = Box::new(RecordBatchBuffer::new(batches));
-        let table_rst = database.create_table(table_name, batch_reader).await;
+        let table_rst = database.create_table(&table_name, batch_reader).await;
         deferred.settle_with(&channel, move |mut cx| {
-            let table = Arc::new(Mutex::new(table_rst.or_else(|err| cx.throw_error(err.to_string()))?));
+            let table = Arc::new(Mutex::new(
+                table_rst.or_else(|err| cx.throw_error(err.to_string()))?,
+            ));
             Ok(cx.boxed(JsTable { table }))
         });
     });
@@ -178,9 +228,7 @@
         ("overwrite", WriteMode::Overwrite),
     ]);

-    let js_table = cx
-        .this()
-        .downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
+    let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
     let buffer = cx.argument::<JsBuffer>(0)?;
     let write_mode = cx.argument::<JsString>(1)?.value(&mut cx);
     let batches = arrow_buffer_to_record_batch(buffer.as_slice(&mut cx));
@@ -204,7 +252,6 @@
     Ok(promise)
 }
-
 #[neon::main]
 fn main(mut cx: ModuleContext) -> NeonResult<()> {
     cx.export_function("databaseNew", database_new)?;
@@ -213,5 +260,9 @@ fn main(mut cx: ModuleContext) -> NeonResult<()> {
     cx.export_function("tableSearch", table_search)?;
     cx.export_function("tableCreate", table_create)?;
     cx.export_function("tableAdd", table_add)?;
+    cx.export_function(
+        "tableCreateVectorIndex",
+        index::vector::table_create_vector_index,
+    )?;
     Ok(())
 }


@@ -10,9 +10,13 @@ repository = "https://github.com/lancedb/lancedb"
 [dependencies]
 arrow-array = "37.0"
+arrow-data = "37.0"
 arrow-schema = "37.0"
+object_store = "0.5.6"
-lance = "0.4.3"
+lance = "0.4.17"
 tokio = { version = "1.23", features = ["rt-multi-thread"] }

 [dev-dependencies]
 tempfile = "3.5.0"
+rand = { version = "0.8.3", features = ["small_rng"] }


@@ -12,16 +12,19 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-use arrow_array::RecordBatchReader;
 use std::fs::create_dir_all;
-use std::path::{Path, PathBuf};
-use std::sync::Arc;
+use std::path::Path;

+use arrow_array::RecordBatchReader;
+use lance::io::object_store::ObjectStore;

 use crate::error::Result;
 use crate::table::Table;

 pub struct Database {
-    pub(crate) path: Arc<PathBuf>,
+    object_store: ObjectStore,
+    pub(crate) uri: String,
 }

 const LANCE_EXTENSION: &str = "lance";
@@ -37,12 +40,17 @@ impl Database {
     /// # Returns
     ///
     /// * A [Database] object.
-    pub fn connect<P: AsRef<Path>>(path: P) -> Result<Database> {
-        if !path.as_ref().try_exists()? {
+    pub async fn connect(uri: &str) -> Result<Database> {
+        let object_store = ObjectStore::new(uri).await?;
+        if object_store.is_local() {
+            let path = Path::new(uri);
+            if !path.try_exists()? {
                 create_dir_all(&path)?;
             }
+        }
         Ok(Database {
-            path: Arc::new(path.as_ref().to_path_buf()),
+            uri: uri.to_string(),
+            object_store,
         })
     }
@@ -51,12 +59,13 @@ impl Database {
     /// # Returns
     ///
     /// * A [Vec<String>] with all table names.
-    pub fn table_names(&self) -> Result<Vec<String>> {
+    pub async fn table_names(&self) -> Result<Vec<String>> {
         let f = self
-            .path
-            .read_dir()?
-            .flatten()
-            .map(|dir_entry| dir_entry.path())
+            .object_store
+            .read_dir("/")
+            .await?
+            .iter()
+            .map(|fname| Path::new(fname))
             .filter(|path| {
                 let is_lance = path
                     .extension()
@@ -76,10 +85,10 @@ impl Database {
     pub async fn create_table(
         &self,
-        name: String,
+        name: &str,
         batches: Box<dyn RecordBatchReader>,
     ) -> Result<Table> {
-        Table::create(self.path.clone(), name, batches).await
+        Table::create(&self.uri, name, batches).await
     }

     /// Open a table in the database.
@@ -90,8 +99,8 @@ impl Database {
     /// # Returns
     ///
     /// * A [Table] object.
-    pub async fn open_table(&self, name: String) -> Result<Table> {
-        Table::open(self.path.clone(), name).await
+    pub async fn open_table(&self, name: &str) -> Result<Table> {
+        Table::open(&self.uri, name).await
     }
 }
@@ -105,10 +114,10 @@ mod tests {
     #[tokio::test]
     async fn test_connect() {
         let tmp_dir = tempdir().unwrap();
-        let path_buf = tmp_dir.into_path();
-        let db = Database::connect(&path_buf);
-        assert_eq!(db.unwrap().path.as_path(), path_buf.as_path())
+        let uri = tmp_dir.path().to_str().unwrap();
+        let db = Database::connect(uri).await.unwrap();
+        assert_eq!(db.uri, uri);
     }

     #[tokio::test]
@@ -118,10 +127,16 @@ mod tests {
         create_dir_all(tmp_dir.path().join("table2.lance")).unwrap();
         create_dir_all(tmp_dir.path().join("invalidlance")).unwrap();

-        let db = Database::connect(&tmp_dir.into_path()).unwrap();
-        let tables = db.table_names().unwrap();
+        let uri = tmp_dir.path().to_str().unwrap();
+        let db = Database::connect(uri).await.unwrap();
+        let tables = db.table_names().await.unwrap();
         assert_eq!(tables.len(), 2);
         assert!(tables.contains(&String::from("table1")));
         assert!(tables.contains(&String::from("table2")));
     }
+
+    #[tokio::test]
+    async fn test_connect_s3() {
+        // let db = Database::connect("s3://bucket/path/to/database").await.unwrap();
+    }
 }


@@ -41,3 +41,15 @@ impl From<lance::Error> for Error {
         Self::Lance(e.to_string())
     }
 }
+
+impl From<object_store::Error> for Error {
+    fn from(e: object_store::Error) -> Self {
+        Self::IO(e.to_string())
+    }
+}
+
+impl From<object_store::path::Error> for Error {
+    fn from(e: object_store::path::Error) -> Self {
+        Self::IO(e.to_string())
+    }
+}


@@ -0,0 +1,15 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod vector;

View File

@@ -0,0 +1,163 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use lance::index::vector::ivf::IvfBuildParams;
use lance::index::vector::pq::PQBuildParams;
use lance::index::vector::{MetricType, VectorIndexParams};
pub trait VectorIndexBuilder {
fn get_column(&self) -> Option<String>;
fn get_index_name(&self) -> Option<String>;
fn build(&self) -> VectorIndexParams;
}
pub struct IvfPQIndexBuilder {
column: Option<String>,
index_name: Option<String>,
metric_type: Option<MetricType>,
ivf_params: Option<IvfBuildParams>,
pq_params: Option<PQBuildParams>,
}
impl IvfPQIndexBuilder {
pub fn new() -> IvfPQIndexBuilder {
IvfPQIndexBuilder {
column: None,
index_name: None,
metric_type: None,
ivf_params: None,
pq_params: None,
}
}
}
impl IvfPQIndexBuilder {
pub fn column(&mut self, column: String) -> &mut IvfPQIndexBuilder {
self.column = Some(column);
self
}
pub fn index_name(&mut self, index_name: String) -> &mut IvfPQIndexBuilder {
self.index_name = Some(index_name);
self
}
pub fn metric_type(&mut self, metric_type: MetricType) -> &mut IvfPQIndexBuilder {
self.metric_type = Some(metric_type);
self
}
pub fn ivf_params(&mut self, ivf_params: IvfBuildParams) -> &mut IvfPQIndexBuilder {
self.ivf_params = Some(ivf_params);
self
}
pub fn pq_params(&mut self, pq_params: PQBuildParams) -> &mut IvfPQIndexBuilder {
self.pq_params = Some(pq_params);
self
}
}
impl VectorIndexBuilder for IvfPQIndexBuilder {
fn get_column(&self) -> Option<String> {
self.column.clone()
}
fn get_index_name(&self) -> Option<String> {
self.index_name.clone()
}
fn build(&self) -> VectorIndexParams {
let ivf_params = self.ivf_params.clone().unwrap_or(IvfBuildParams::default());
let pq_params = self.pq_params.clone().unwrap_or(PQBuildParams::default());
VectorIndexParams::with_ivf_pq_params(pq_params.metric_type, ivf_params, pq_params)
}
}
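A short sketch of the builder in isolation; the column and index names are placeholders:

let mut builder = IvfPQIndexBuilder::new();
builder
    .column("embeddings".to_string())
    .index_name("embeddings_idx".to_string())
    .metric_type(MetricType::L2)
    .ivf_params(IvfBuildParams::new(256))
    .pq_params(PQBuildParams::default());
let params: VectorIndexParams = builder.build();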
#[cfg(test)]
mod tests {
use lance::index::vector::ivf::IvfBuildParams;
use lance::index::vector::pq::PQBuildParams;
use lance::index::vector::{MetricType, StageParams};
use crate::index::vector::{IvfPQIndexBuilder, VectorIndexBuilder};
#[test]
fn test_builder_no_params() {
let index_builder = IvfPQIndexBuilder::new();
assert!(index_builder.get_column().is_none());
assert!(index_builder.get_index_name().is_none());
let index_params = index_builder.build();
assert_eq!(index_params.stages.len(), 2);
if let StageParams::Ivf(ivf_params) = index_params.stages.get(0).unwrap() {
let default = IvfBuildParams::default();
assert_eq!(ivf_params.num_partitions, default.num_partitions);
assert_eq!(ivf_params.max_iters, default.max_iters);
} else {
panic!("Expected first stage to be ivf")
}
if let StageParams::PQ(pq_params) = index_params.stages.get(1).unwrap() {
assert_eq!(pq_params.use_opq, false);
} else {
panic!("Expected second stage to be pq")
}
}
#[test]
fn test_builder_all_params() {
let mut index_builder = IvfPQIndexBuilder::new();
index_builder
.column("c".to_owned())
.metric_type(MetricType::Cosine)
.index_name("index".to_owned());
assert_eq!(index_builder.column.clone().unwrap(), "c");
assert_eq!(index_builder.metric_type.unwrap(), MetricType::Cosine);
assert_eq!(index_builder.index_name.clone().unwrap(), "index");
let ivf_params = IvfBuildParams::new(500);
let mut pq_params = PQBuildParams::default();
pq_params.use_opq = true;
pq_params.max_iters = 1;
pq_params.num_bits = 8;
pq_params.num_sub_vectors = 50;
pq_params.metric_type = MetricType::Cosine;
pq_params.max_opq_iters = 2;
index_builder.ivf_params(ivf_params);
index_builder.pq_params(pq_params);
let index_params = index_builder.build();
assert_eq!(index_params.stages.len(), 2);
if let StageParams::Ivf(ivf_params) = index_params.stages.get(0).unwrap() {
assert_eq!(ivf_params.num_partitions, 500);
} else {
assert!(false, "Expected first stage to be ivf")
}
if let StageParams::PQ(pq_params) = index_params.stages.get(1).unwrap() {
assert_eq!(pq_params.use_opq, true);
assert_eq!(pq_params.max_iters, 1);
assert_eq!(pq_params.num_bits, 8);
assert_eq!(pq_params.num_sub_vectors, 50);
assert_eq!(pq_params.metric_type, MetricType::Cosine);
assert_eq!(pq_params.max_opq_iters, 2);
} else {
assert!(false, "Expected second stage to be pq")
}
}
}

View File

@@ -14,5 +14,6 @@
pub mod database; pub mod database;
pub mod error; pub mod error;
pub mod index;
pub mod query; pub mod query;
pub mod table; pub mod table;

View File

@@ -29,7 +29,7 @@ pub struct Query {
pub filter: Option<String>, pub filter: Option<String>,
pub nprobes: usize, pub nprobes: usize,
pub refine_factor: Option<u32>, pub refine_factor: Option<u32>,
pub metric_type: MetricType, pub metric_type: Option<MetricType>,
pub use_index: bool, pub use_index: bool,
} }
@@ -51,9 +51,9 @@ impl Query {
limit: 10, limit: 10,
nprobes: 20, nprobes: 20,
refine_factor: None, refine_factor: None,
metric_type: MetricType::L2, metric_type: None,
use_index: false, use_index: false,
filter: None filter: None,
} }
} }
@@ -71,10 +71,10 @@ impl Query {
self.limit, self.limit,
)?; )?;
scanner.nprobs(self.nprobes); scanner.nprobs(self.nprobes);
scanner.distance_metric(self.metric_type);
scanner.use_index(self.use_index); scanner.use_index(self.use_index);
self.filter.as_ref().map(|f| scanner.filter(f)); self.filter.as_ref().map(|f| scanner.filter(f));
self.refine_factor.map(|rf| scanner.refine(rf)); self.refine_factor.map(|rf| scanner.refine(rf));
self.metric_type.map(|mt| scanner.distance_metric(mt));
Ok(scanner.try_into_stream().await?) Ok(scanner.try_into_stream().await?)
} }
@@ -123,7 +123,7 @@ impl Query {
/// # Arguments /// # Arguments
/// ///
/// * `metric_type` - The distance metric to use. By default [MetricType::L2] is used. /// * `metric_type` - The distance metric to use, or None to keep the default ([MetricType::L2]).
pub fn metric_type(mut self, metric_type: MetricType) -> Query { pub fn metric_type(mut self, metric_type: Option<MetricType>) -> Query {
self.metric_type = metric_type; self.metric_type = metric_type;
self self
} }
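With the setter now taking an Option, callers either pick a metric explicitly or pass None to keep the scanner default. A sketch against an open table:

let query = table
    .search(Float32Array::from_iter_values([0.1, 0.2]))
    .limit(10)
    .metric_type(Some(MetricType::Cosine)); // or None for the L2 default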
@@ -174,14 +174,14 @@ mod tests {
.limit(100) .limit(100)
.nprobes(1000) .nprobes(1000)
.use_index(true) .use_index(true)
.metric_type(MetricType::Cosine) .metric_type(Some(MetricType::Cosine))
.refine_factor(Some(999)); .refine_factor(Some(999));
assert_eq!(query.query_vector, new_vector); assert_eq!(query.query_vector, new_vector);
assert_eq!(query.limit, 100); assert_eq!(query.limit, 100);
assert_eq!(query.nprobes, 1000); assert_eq!(query.nprobes, 1000);
assert_eq!(query.use_index, true); assert_eq!(query.use_index, true);
assert_eq!(query.metric_type, MetricType::Cosine); assert_eq!(query.metric_type, Some(MetricType::Cosine));
assert_eq!(query.refine_factor, Some(999)); assert_eq!(query.refine_factor, Some(999));
} }

View File

@@ -12,26 +12,33 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
use std::path::PathBuf; use std::path::Path;
use std::sync::Arc; use std::sync::Arc;
use arrow_array::{Float32Array, RecordBatchReader}; use arrow_array::{Float32Array, RecordBatchReader};
use lance::dataset::{Dataset, WriteMode, WriteParams}; use lance::dataset::{Dataset, WriteMode, WriteParams};
use lance::index::IndexType;
use crate::error::{Error, Result}; use crate::error::{Error, Result};
use crate::index::vector::VectorIndexBuilder;
use crate::query::Query; use crate::query::Query;
pub const VECTOR_COLUMN_NAME: &str = "vector"; pub const VECTOR_COLUMN_NAME: &str = "vector";
pub const LANCE_FILE_EXTENSION: &str = "lance"; pub const LANCE_FILE_EXTENSION: &str = "lance";
/// A table in a LanceDB database. /// A table in a LanceDB database.
pub struct Table { pub struct Table {
name: String, name: String,
path: String, uri: String,
dataset: Arc<Dataset>, dataset: Arc<Dataset>,
} }
impl std::fmt::Display for Table {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "Table({})", self.name)
}
}
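The new Display impl makes tables printable directly:

println!("{}", table); // prints: Table(my_table)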
impl Table { impl Table {
/// Opens an existing Table /// Opens an existing Table
/// ///
@@ -43,18 +50,21 @@ impl Table {
/// # Returns /// # Returns
/// ///
/// * A [Table] object. /// * A [Table] object.
pub async fn open(base_path: Arc<PathBuf>, name: String) -> Result<Self> { pub async fn open(base_uri: &str, name: &str) -> Result<Self> {
let ds_path = base_path.join(format!("{}.{}", name, LANCE_FILE_EXTENSION)); let path = Path::new(base_uri);
let ds_uri = ds_path
let table_uri = path.join(format!("{}.{}", name, LANCE_FILE_EXTENSION));
let uri = table_uri
.as_path()
.to_str() .to_str()
.ok_or(Error::IO(format!("Unable to find table {}", name)))?; .ok_or(Error::IO(format!("Invalid table name: {}", name)))?;
let dataset = Dataset::open(ds_uri).await?;
let table = Table { let dataset = Dataset::open(&uri).await?;
name, Ok(Table {
path: ds_uri.to_string(), name: name.to_string(),
uri: uri.to_string(),
dataset: Arc::new(dataset), dataset: Arc::new(dataset),
}; })
Ok(table)
} }
/// Creates a new Table /// Creates a new Table
@@ -69,18 +79,44 @@ impl Table {
/// ///
/// * A [Table] object. /// * A [Table] object.
pub async fn create( pub async fn create(
base_path: Arc<PathBuf>, base_uri: &str,
name: String, name: &str,
mut batches: Box<dyn RecordBatchReader>, mut batches: Box<dyn RecordBatchReader>,
) -> Result<Self> { ) -> Result<Self> {
let ds_path = base_path.join(format!("{}.{}", name, LANCE_FILE_EXTENSION)); let base_path = Path::new(base_uri);
let path = ds_path let table_uri = base_path.join(format!("{}.{}", name, LANCE_FILE_EXTENSION));
let uri = table_uri
.as_path()
.to_str() .to_str()
.ok_or(Error::IO(format!("Unable to find table {}", name)))?; .ok_or(Error::IO(format!("Invalid table name: {}", name)))?
.to_string();
let dataset = let dataset =
Arc::new(Dataset::write(&mut batches, path, Some(WriteParams::default())).await?); Arc::new(Dataset::write(&mut batches, &uri, Some(WriteParams::default())).await?);
Ok(Table { name, path: path.to_string(), dataset }) Ok(Table {
name: name.to_string(),
uri,
dataset,
})
}
/// Create index on the table.
pub async fn create_index(&mut self, index_builder: &impl VectorIndexBuilder) -> Result<()> {
use lance::index::DatasetIndexExt;
let dataset = self
.dataset
.create_index(
&[index_builder
.get_column()
.unwrap_or(VECTOR_COLUMN_NAME.to_string())
.as_str()],
IndexType::Vector,
index_builder.get_index_name(),
&index_builder.build(),
)
.await?;
self.dataset = Arc::new(dataset);
Ok(())
} }
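Invoked through a table, the builder's column falls back to VECTOR_COLUMN_NAME ("vector") when unset. A sketch:

let mut builder = IvfPQIndexBuilder::new();
builder.ivf_params(IvfBuildParams::new(256));
table.create_index(&builder).await?; // indexes the default "vector" column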
/// Insert records into this Table /// Insert records into this Table
@@ -95,12 +131,12 @@ impl Table {
pub async fn add( pub async fn add(
&mut self, &mut self,
mut batches: Box<dyn RecordBatchReader>, mut batches: Box<dyn RecordBatchReader>,
write_mode: Option<WriteMode> write_mode: Option<WriteMode>,
) -> Result<usize> { ) -> Result<usize> {
let mut params = WriteParams::default(); let mut params = WriteParams::default();
params.mode = write_mode.unwrap_or(WriteMode::Append); params.mode = write_mode.unwrap_or(WriteMode::Append);
self.dataset = Arc::new(Dataset::write(&mut batches, self.path.as_str(), Some(params)).await?); self.dataset = Arc::new(Dataset::write(&mut batches, &self.uri, Some(params)).await?);
Ok(batches.count()) Ok(batches.count())
} }
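A sketch of both write modes; the readers are placeholders:

// Append is the default when no mode is given.
let appended = table.add(more_batches, None).await?;
// Overwrite replaces the table's contents.
table.add(replacement_batches, Some(WriteMode::Overwrite)).await?;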
@@ -125,56 +161,68 @@ impl Table {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use arrow_array::{Float32Array, Int32Array, RecordBatch, RecordBatchReader}; use std::sync::Arc;
use arrow_array::{
Array, FixedSizeListArray, Float32Array, Int32Array, RecordBatch, RecordBatchReader,
};
use arrow_data::ArrayDataBuilder;
use arrow_schema::{DataType, Field, Schema}; use arrow_schema::{DataType, Field, Schema};
use lance::arrow::RecordBatchBuffer; use lance::arrow::RecordBatchBuffer;
use lance::dataset::{Dataset, WriteMode}; use lance::dataset::{Dataset, WriteMode};
use std::sync::Arc; use lance::index::vector::ivf::IvfBuildParams;
use lance::index::vector::pq::PQBuildParams;
use rand::Rng;
use tempfile::tempdir; use tempfile::tempdir;
use crate::table::Table; use super::*;
use crate::index::vector::IvfPQIndexBuilder;
#[tokio::test] #[tokio::test]
async fn test_new_table_not_exists() { async fn test_new_table_not_exists() {
let tmp_dir = tempdir().unwrap(); let tmp_dir = tempdir().unwrap();
let path_buf = tmp_dir.into_path(); let uri = tmp_dir.path().to_str().unwrap();
let table = Table::open(Arc::new(path_buf), "test".to_string()).await; let table = Table::open(&uri, "test").await;
assert!(table.is_err()); assert!(table.is_err());
} }
#[tokio::test] #[tokio::test]
async fn test_open() { async fn test_open() {
let tmp_dir = tempdir().unwrap(); let tmp_dir = tempdir().unwrap();
let path_buf = tmp_dir.into_path(); let dataset_path = tmp_dir.path().join("test.lance");
let uri = tmp_dir.path().to_str().unwrap();
let mut batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches()); let mut batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
Dataset::write( Dataset::write(&mut batches, dataset_path.to_str().unwrap(), None)
&mut batches,
path_buf.join("test.lance").to_str().unwrap(),
None,
)
.await .await
.unwrap(); .unwrap();
let table = Table::open(Arc::new(path_buf), "test".to_string()) let table = Table::open(uri, "test").await.unwrap();
.await
.unwrap();
assert_eq!(table.name, "test") assert_eq!(table.name, "test")
} }
#[test]
fn test_object_store_path() {
use std::path::Path as StdPath;
let p = StdPath::new("s3://bucket/path/to/file");
let c = p.join("subfile");
assert_eq!(c.to_str().unwrap(), "s3://bucket/path/to/file/subfile");
}
#[tokio::test] #[tokio::test]
async fn test_add() { async fn test_add() {
let tmp_dir = tempdir().unwrap(); let tmp_dir = tempdir().unwrap();
let path_buf = tmp_dir.into_path(); let uri = tmp_dir.path().to_str().unwrap();
let batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches()); let batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
let schema = batches.schema().clone(); let schema = batches.schema().clone();
let mut table = Table::create(Arc::new(path_buf), "test".to_string(), batches).await.unwrap(); let mut table = Table::create(&uri, "test", batches).await.unwrap();
assert_eq!(table.count_rows().await.unwrap(), 10); assert_eq!(table.count_rows().await.unwrap(), 10);
let new_batches: Box<dyn RecordBatchReader> = Box::new(RecordBatchBuffer::new(vec![RecordBatch::try_new( let new_batches: Box<dyn RecordBatchReader> =
Box::new(RecordBatchBuffer::new(vec![RecordBatch::try_new(
schema, schema,
vec![Arc::new(Int32Array::from_iter_values(100..110))], vec![Arc::new(Int32Array::from_iter_values(100..110))],
) )
@@ -188,19 +236,24 @@ mod tests {
#[tokio::test] #[tokio::test]
async fn test_add_overwrite() { async fn test_add_overwrite() {
let tmp_dir = tempdir().unwrap(); let tmp_dir = tempdir().unwrap();
let path_buf = tmp_dir.into_path(); let uri = tmp_dir.path().to_str().unwrap();
let batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches()); let batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
let schema = batches.schema().clone(); let schema = batches.schema().clone();
let mut table = Table::create(Arc::new(path_buf), "test".to_string(), batches).await.unwrap(); let mut table = Table::create(uri, "test", batches).await.unwrap();
assert_eq!(table.count_rows().await.unwrap(), 10); assert_eq!(table.count_rows().await.unwrap(), 10);
let new_batches: Box<dyn RecordBatchReader> = Box::new(RecordBatchBuffer::new(vec![RecordBatch::try_new( let new_batches: Box<dyn RecordBatchReader> =
Box::new(RecordBatchBuffer::new(vec![RecordBatch::try_new(
schema, schema,
vec![Arc::new(Int32Array::from_iter_values(100..110))], vec![Arc::new(Int32Array::from_iter_values(100..110))],
).unwrap()])); )
.unwrap()]));
table.add(new_batches, Some(WriteMode::Overwrite)).await.unwrap(); table
.add(new_batches, Some(WriteMode::Overwrite))
.await
.unwrap();
assert_eq!(table.count_rows().await.unwrap(), 10); assert_eq!(table.count_rows().await.unwrap(), 10);
assert_eq!(table.name, "test"); assert_eq!(table.name, "test");
} }
@@ -208,20 +261,15 @@ mod tests {
#[tokio::test] #[tokio::test]
async fn test_search() { async fn test_search() {
let tmp_dir = tempdir().unwrap(); let tmp_dir = tempdir().unwrap();
let path_buf = tmp_dir.into_path(); let dataset_path = tmp_dir.path().join("test.lance");
let uri = tmp_dir.path().to_str().unwrap();
let mut batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches()); let mut batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
Dataset::write( Dataset::write(&mut batches, dataset_path.to_str().unwrap(), None)
&mut batches,
path_buf.join("test.lance").to_str().unwrap(),
None,
)
.await .await
.unwrap(); .unwrap();
let table = Table::open(Arc::new(path_buf), "test".to_string()) let table = Table::open(uri, "test").await.unwrap();
.await
.unwrap();
let vector = Float32Array::from_iter_values([0.1, 0.2]); let vector = Float32Array::from_iter_values([0.1, 0.2]);
let query = table.search(vector.clone()); let query = table.search(vector.clone());
@@ -236,4 +284,72 @@ mod tests {
) )
.unwrap()]) .unwrap()])
} }
#[tokio::test]
async fn test_create_index() {
use arrow_array::RecordBatch;
use arrow_schema::{DataType, Field, Schema as ArrowSchema};
use rand;
use std::iter::repeat_with;
use arrow_array::Float32Array;
let tmp_dir = tempdir().unwrap();
let uri = tmp_dir.path().to_str().unwrap();
let dimension = 16;
let schema = Arc::new(ArrowSchema::new(vec![Field::new(
"embeddings",
DataType::FixedSizeList(
Arc::new(Field::new("item", DataType::Float32, true)),
dimension,
),
false,
)]));
let mut rng = rand::thread_rng();
let float_arr = Float32Array::from(
repeat_with(|| rng.gen::<f32>())
.take(512 * dimension as usize)
.collect::<Vec<f32>>(),
);
let vectors = Arc::new(create_fixed_size_list(float_arr, dimension).unwrap());
let batches = RecordBatchBuffer::new(vec![RecordBatch::try_new(
schema.clone(),
vec![vectors.clone()],
)
.unwrap()]);
let reader: Box<dyn RecordBatchReader + Send> = Box::new(batches);
let mut table = Table::create(uri, "test", reader).await.unwrap();
let mut i = IvfPQIndexBuilder::new();
let index_builder = i
.column("embeddings".to_string())
.index_name("my_index".to_string())
.ivf_params(IvfBuildParams::new(256))
.pq_params(PQBuildParams::default());
table.create_index(index_builder).await.unwrap();
assert_eq!(table.dataset.load_indices().await.unwrap().len(), 1);
assert_eq!(table.count_rows().await.unwrap(), 512);
assert_eq!(table.name, "test");
}
fn create_fixed_size_list<T: Array>(values: T, list_size: i32) -> Result<FixedSizeListArray> {
let list_type = DataType::FixedSizeList(
Arc::new(Field::new("item", values.data_type().clone(), true)),
list_size,
);
let data = ArrayDataBuilder::new(list_type)
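// Row count is the flat value count divided by the fixed list width
// (e.g. 8192 values / 16 per list = 512 rows in test_create_index).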
.len(values.len() / list_size as usize)
.add_child_data(values.into_data())
.build()
.unwrap();
Ok(FixedSizeListArray::from(data))
}
} }